天天看點

資訊内容安全實驗——某東評論資料感情分析

文章目錄

  • 前言
  • 一、大緻思路
  • 二、源代碼
  • 總結

前言

本文使用的是基于情感詞典的情感分析

一、大緻思路

一.導入資料

1.導入資料。

二.資料預處理

(一)去重

(二)資料清洗

(三)分詞、詞性标注、去掉停用詞、詞雲圖

1.去重(利用python自帶的函數drop_duplicates去重)。

2.資料清洗(利用正規表達式删掉數字字母和一些品牌名)。

3.用jieba進行分詞,整理得到詞以及對應的詞性。

4.形成包含詞、詞性和詞長度的新架構。

5.去掉停用詞和标點符号。

6.重新統計分詞數量。

7.提取名詞并形成分詞後詞雲圖和分詞後名詞詞雲圖。

三.模型建構

(一)情感分析

1.導入知網的評價詞,積極指派1,消極指派-1。

2.增加新詞。

3.将知網的評價詞表和資料預處理得到的資料連接配接。

4.修正情感傾向,即判斷情感詞前是否有否定詞(準備一:加入修正權值;準備二:去掉權值等于0的)。

5.計算情感值,大于0的為pos,小于0的為neg。

6.合并到一張大表裡面。

7.生成積極和消極詞雲圖。

二、源代碼

import numpy as np
import pandas as pd
import re
import PIL
import jieba.posseg as pg
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

# Entry point: end-to-end, lexicon-based (HowNet) sentiment-analysis pipeline
# for JD product reviews:
#   1) load reviews           2) deduplicate            3) clean text
#   4) tokenise + POS-tag     5) drop punctuation/stop words
#   6) join with sentiment lexicon  7) correct polarity for negation words
#   8) score each review      9) render pos/neg word clouds.
# NOTE(review): the script requires local data assets (review_try.xlsx,
# stoplist.txt, the four HowNet word lists, not.csv, the two mask images)
# and writes several CSV/PNG files into the working directory.
if __name__ == '__main__':
    # Load raw reviews and eyeball basic stats.
    raw_data = pd.read_excel('review_try.xlsx')
    raw_data.info()
    for cate in ['review_nickname', 'review_time']:
        print(raw_data[cate].value_counts())
    # --- Deduplication: keep only the review-text column, drop exact duplicates ---
    review = raw_data.copy()
    review = review[['review_content']]
    print('去重之前:', review.shape[0])
    review = review.drop_duplicates()
    print('去重之後:', review.shape[0])
    # --- Data cleaning ---
    # Sample of reviews before cleaning.
    content = review['review_content']
    for i in range(1, 100):
        print(content[i])
        print('----------------------')
    # Strip ASCII letters/digits and brand words (JD, Huawei, Honor, phone,
    # Apple, Xiaomi) so they don't dominate the word clouds.
    # NOTE(review): the trailing '|' makes the pattern also match the empty
    # string; harmless with sub('') but sloppy — consider removing it.
    info = re.compile('[0-9a-zA-Z]|京東|華為|榮耀|手機|機|蘋果|小米|')
    content = content.apply(lambda x: info.sub('', x))
    # Sample of reviews after cleaning.
    for i in range(1, 100):
        print(content[i])
        print('---------------------')
    # Tokenise with jieba's POS tagger: each review becomes a list of
    # (word, pos_flag) tuples.  (The inner lambda variable `x` shadows the
    # outer one — works, but confusing to read.)
    seg_content = content.apply(lambda x: [(x.word, x.flag) for x in pg.cut(x)])
    print(seg_content.shape)
    print(len(seg_content))
    print('.............')
    for i in range(20):
        print(seg_content[i])
    print('.............')
    # Token count per review.
    n_word = seg_content.apply(lambda x:len(x))
    print(n_word)
    print(n_word.head(8))
    # Repeat each review id (original index + 1) once per token, so every
    # token can be mapped back to the review it came from.
    n_content = [[x+1]*y for x,y in zip(list(seg_content.index), list(n_word))]
    index_content_long = sum(n_content, [])
    print(len(index_content_long))
    # Flatten the per-review token lists into one long list.
    # NOTE(review): sum(lists, []) is O(n^2); itertools.chain would be linear.
    print(seg_content.head())
    seg_content_long = sum(seg_content, [])
    print(seg_content_long)
    # Long-format word and POS-tag columns.
    word_long = [x[0] for x in seg_content_long]
    nature_long = [x[1] for x in seg_content_long]
    print(len(word_long))
    print(len(nature_long))
    # One row per token: review id, word, POS tag.
    review_long = pd.DataFrame({'index_content': index_content_long, 'word': word_long, 'nature': nature_long})
    print(review_long.shape)
    print(review_long.head())
    # Drop punctuation: jieba tags non-words (punctuation etc.) as 'x'.
    print(review_long['nature'].unique())
    review_long_clean = review_long[review_long['nature'] != 'x']
    print(review_long_clean.shape)
    # Load stop-word list (one word per line).
    with open('stoplist.txt', 'r', encoding='utf-8') as f:
        stop_words = f.readlines()
    print(len(stop_words))
    print(stop_words[0:5])
    # Strip trailing newlines from the stop words.
    stop_words = [word.strip('\n') for word in stop_words]
    print(stop_words[0:5])
    # Vocabulary with stop words removed; keep only rows whose word survives.
    word_long_clean = list(set(word_long) - set(stop_words))
    print(len(word_long_clean))
    review_long_clean = review_long_clean[review_long_clean['word'].isin(word_long_clean)]
    print(review_long_clean.shape)
    # Re-count tokens per review, then give each token its 1-based position
    # within its review (index_word).
    n_word = review_long_clean.groupby('index_content').count()['word']
    print(n_word)
    index_word = [list(np.arange(1, x+1)) for x in n_word]
    index_word_long = sum(index_word, [])
    # NOTE(review): assigning a column on a filtered frame triggers
    # SettingWithCopyWarning; a .copy() after filtering would silence it.
    review_long_clean['index_word'] = index_word_long
    print(review_long_clean.head())
    review_long_clean.to_csv('1_review_long_clean.csv')
    # Extract nouns: any POS tag containing 'n' (n, nr, ns, ...).
    n_review_long_clean = review_long_clean[['n' in nat for nat in review_long_clean.nature]]
    print(n_review_long_clean.shape)
    print(n_review_long_clean.head())
    n_review_long_clean.nature.value_counts()
    n_review_long_clean.to_csv('n_review_long_clean.csv')
    # Word cloud over all cleaned tokens (star-shaped mask image).
    image1 = PIL.Image.open('星星.jpg')
    MASK = np.array(image1)
    wordcloud = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK)
    wordcloud.generate_from_frequencies(Counter(review_long_clean.word.values))
    wordcloud.to_file('分詞後的詞雲圖.png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud)
    #plt.axis('off')
    #plt.show()
    # Word cloud over nouns only (heart-shaped mask image).
    image2 = PIL.Image.open('愛心.jpg')
    MASK1 = np.array(image2)
    wordcloud1 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK1)
    wordcloud1.generate_from_frequencies(Counter(n_review_long_clean.word.values))
    wordcloud1.to_file('分詞後的詞雲圖(名詞).png')
    #plt.figure(figsize=(20, 10))
    #plt.imshow(wordcloud)
    #plt.axis('off')
    #plt.show()
    # --- Sentiment analysis ---
    # HowNet evaluation/emotion word lists, one word per line.
    # NOTE(review): sep='\n' is rejected by modern pandas (separator must be
    # a single character); header=None alone suffices for one-word-per-line.
    pos_comment = pd.read_csv('正面評價詞語(中文).txt', header=None, sep='\n', encoding='utf-8')
    neg_comment = pd.read_csv('負面評價詞語(中文).txt', header=None, sep='\n', encoding='utf-8')

    pos_emotion = pd.read_csv('正面情感詞語(中文).txt', header=None, sep='\n', encoding='utf-8')
    neg_emotion = pd.read_csv('負面情感詞語(中文).txt', header=None, sep='\n', encoding='utf-8')

    pos = pd.concat([pos_comment, pos_emotion], axis=0)
    neg = pd.concat([neg_comment, neg_emotion], axis=0)
    # Add domain-specific words missing from the lexicon.
    new_pos = pd.Series(['點贊'])
    new_neg = pd.Series(['歇菜'])
    positive = pd.concat([pos, new_pos], axis=0)
    negtive = pd.concat([neg, new_neg], axis=0)
    # Assign weight +1 to positive words, -1 to negative words.
    positive.columns = ['review']
    positive['weight'] = pd.Series([1]*len(positive))
    positive.head()
    negtive.columns = ['review']
    negtive['weight'] = pd.Series([-1]*len(negtive))
    negtive.head()
    pos_neg = pd.concat([positive,negtive], axis=0)
    print(pos_neg.shape)
    # Left-join tokens with the lexicon; tokens absent from the lexicon get
    # NaN weight, replaced by 0 below.
    data = review_long_clean.copy()
    review_mltype = pd.merge(data, pos_neg, how='left', left_on='word', right_on='review')
    review_mltype = review_mltype.drop(['review'], axis=1)
    review_mltype = review_mltype.replace(np.nan, 0)
    # --- Polarity correction for negation words ---
    notdic = pd.read_csv('not.csv')
    notdic['freq'] = [1]*len(notdic)
    # Prep 1: start amend_weight as a copy of weight; add a running row id.
    review_mltype['amend_weight'] = review_mltype['weight']
    review_mltype['id'] = np.arange(0, review_mltype.shape[0])
    # Prep 2: keep only the rows that actually carry sentiment weight.
    only_review_mltype = review_mltype[review_mltype['weight'] != 0]
    only_review_mltype.index = np.arange(0, only_review_mltype.shape[0])
    # Spot-check one review.
    i = 4
    review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
    print(review_i)
    # Flip a sentiment word's weight when one of the 1-2 preceding tokens is
    # a negation word.
    # NOTE(review): `x in notdic['term']` tests membership in the Series
    # INDEX, not its values, so negations are likely never detected — use
    # `in notdic['term'].values` (or `.isin` for the word_ind > 2 case).
    # NOTE(review): the chained assignments below raise
    # SettingWithCopyWarning; `.loc[row, 'amend_weight']` is the safe form.
    index = only_review_mltype['id']
    for i in range(0,only_review_mltype.shape[0]):
        review_i = review_mltype[review_mltype['index_content'] == only_review_mltype['index_content'][i]]
        review_i.index = np.arange(0, review_i.shape[0])
        word_ind = only_review_mltype['index_word'][i]
        # Sentiment word is the 2nd token: check the single preceding token.
        if word_ind == 2:
            na = sum({review_i['word'][word_ind-1] in notdic['term']})
            if na == 1:
                review_mltype['amend_weight'][index[i]] = -(review_mltype['weight'][index[i]])
        # Deeper in the review: check the two preceding tokens.
        elif word_ind > 2:
            na = sum([word in notdic['term'] for word in review_i['word'][[word_ind-1,word_ind-2]]])
            if na == 1:
                review_mltype['amend_weight'][index[i]] = -(review_mltype['weight'][index[i]])
    print(review_mltype.shape)
    # Rows whose weight was flipped (expression result unused — debug only).
    review_mltype[(review_mltype['weight']-review_mltype['amend_weight']) != 0]
    # --- Sentiment score per review: sum of corrected weights ---
    print(review_mltype.tail())
    emotion_value = review_mltype.groupby('index_content', as_index=False)['amend_weight'].sum()
    print(emotion_value.head())
    emotion_value.to_csv('1_emotion_value', index=True, header=True)
    # Keep only reviews with a non-zero score and label them pos/neg.
    content_emotion_value = emotion_value.copy()
    print(content_emotion_value.shape)
    content_emotion_value = content_emotion_value[content_emotion_value['amend_weight'] != 0]
    content_emotion_value['ml_type'] = ''
    content_emotion_value['ml_type'][content_emotion_value['amend_weight'] > 0] = 'pos'
    content_emotion_value['ml_type'][content_emotion_value['amend_weight'] < 0] = 'neg'
    print(content_emotion_value.shape)
    print(content_emotion_value.head())
    # Merge the per-review label back onto the token-level table.
    content_emotion_value = content_emotion_value.drop(['amend_weight'], axis=1)
    print(review_mltype.shape)
    review_mltype = pd.merge(review_mltype, content_emotion_value, how='left', left_on='index_content', right_on='index_content')
    review_mltype = review_mltype.drop(['id'], axis=1)
    print(review_mltype.shape)
    print(review_mltype.head())
    review_mltype.to_csv('1_review_mltype', index=True, header=True)
    # --- Word clouds for sentiment-bearing words only ---
    data = review_mltype.copy()
    data = data[data['amend_weight'] != 0]
    word_data_pos = data[data['ml_type'] == 'pos']
    word_data_neg = data[data['ml_type'] == 'neg']
    # Positive words (star-shaped mask).
    image3 = PIL.Image.open('星星.jpg')
    MASK3 = np.array(image3)
    wordcloud3 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white', mask=MASK3)
    wordcloud3.generate_from_frequencies(Counter(word_data_pos.word.values))
    wordcloud3.to_file('積極情感詞雲圖.png')
    # Negative words (heart-shaped mask).
    image4 = PIL.Image.open('愛心.jpg')
    MASK4 = np.array(image4)
    wordcloud4 = WordCloud(font_path='C:\\Windows\\Fonts\\msyh.ttc', max_words=100, background_color='white',mask=MASK4)
    wordcloud4.generate_from_frequencies(Counter(word_data_neg.word.values))
    wordcloud4.to_file('消極情感詞雲圖.png')
           

總結

大緻的代碼來自B站一位小姐姐的視訊。