#!/usr/bin/python
# coding=utf-8
# TF-IDF based keyword extraction for text documents
# http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting
import sys
import os
from config_ch import *
import chardet
import numpy as np
import pandas as pd
import xlrd
import copy
import glob
import jieba.posseg
import jieba.analyse
import io
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
"""
TF-IDF權重:
1、CountVectorizer 建構詞頻矩陣
2、TfidfTransformer 建構TF-IDF權值計算
3、文本的關鍵詞
4、對應的TF-IDF矩陣
"""
# Data reading
"""
Input: data_path, the folder containing the text files.
Output: data, a dict with keys 'id', 'title', 'abstract'.
"""
def dataRead(data_path):
    file_list = os.listdir(data_path)
    idList, titleList, abstractList = [], [], []  # three lists holding text ids, titles and full contents
    for file_name in file_list:
        file_path = os.path.join(data_path, file_name)
        if os.path.isfile(file_path):
            with io.open(file_path, 'rb') as fp:
                f = fp.read()
            encoding_type = chardet.detect(f)  # detect the file encoding
            if not encoding_type['encoding']:
                encoding_type['encoding'] = 'utf-8-sig'  # some files are detected as None; force a default
            file = f.decode(encoding_type['encoding'])
            idList.append(len(idList))  # assign ids only to files actually read, so ids stay aligned with titles/abstracts
            titleList.append(file[0:file.find('\n', 1) + 1])  # the first line is the title
            abstractList.append(file)
data = {"id": idList, "title": titleList, "abstract": abstractList}
return data
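# Shape of the returned dict for a folder with two review files
# (hypothetical contents, for illustration only):
#
#     {"id": [0, 1],
#      "title": ["first line of review 0\n", "first line of review 1\n"],
#      "abstract": ["full text of review 0", "full text of review 1"]}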
# Preprocessing
"""
Input: text, the raw text, and stopword, the stopword list.
Output: text_seg, the list of segmented words.
Preprocessing consists of jieba segmentation, stopword removal and POS filtering.
"""
def dataPrepos(text, stopword):
    text_seg = []
    seg = jieba.posseg.cut(text)  # word segmentation with POS tags
    for i in seg:
        if i.word not in stopword and i.flag in pos:  # drop stopwords and keep only whitelisted POS tags
            text_seg.append(i.word)
return text_seg
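# Sketch of what dataPrepos works with (actual output depends on the jieba
# dictionary, the stopword list and the POS whitelist `pos` from config_ch):
#
#     for pair in jieba.posseg.cut(u"這部電影的配樂非常出色"):
#         print(pair.word, pair.flag)   # each pair carries the word and its POS tag
#     # only words whose flag is in `pos` and that are not stopwords are kept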
# Keyword mapping
"""
Input: key, the extracted keywords, and mapword, the mapping sheet.
Output: key_left_mapped, a dict with the remaining keywords "left" and the mapped keywords "mapped".
Column 1 of the mapping sheet holds the atom words; columns 2 onwards hold their replacement words.
If a keyword is an atom word, the atom's replacement words are added to mappedList and the keyword is removed from leftList.
If a keyword itself appears among the replacement words, it is added to mappedList and removed from leftList.
"""
def keysMapping(key, mapword):  # keywords found in the atom column go to mappedList; leftList keeps only keywords not covered by the mapping sheet
    leftList, mappedList = copy.deepcopy(key), []  # initialise leftList and mappedList
    atom = mapword.col_values(0)
    for i in key:
        if i in atom:  # the keyword is an atom word: replace it with its replacement words
            mappedList.extend(mapword.row_values(atom.index(i))[1:])
            mappedList = list(filter(None, mappedList))  # drop empty "" cells
            leftList.remove(i)  # remove the keyword from leftList
        else:
            for n in range(len(atom)):
                row = mapword.row_values(n)[1:]
                if i in row:  # the keyword itself is a replacement word: keep it as mapped and remove it from leftList
                    mappedList.extend([i])
                    leftList.remove(i)
                    break
    mappedList = list(set(mappedList))  # drop duplicate words
    key_left_mapped = {"left": leftList, "mapped": mappedList}
return key_left_mapped
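# Sketch of the mapping logic with a hypothetical two-row mapping sheet
# (atom word in column 1, replacement words from column 2 onwards):
#
#     "畫面" -> "視覺效果"
#     "配樂" -> "音樂", "原聲"
#
#     keysMapping(["畫面", "音樂", "劇情"], mapword)
#     # -> {"left": ["劇情"], "mapped": ["視覺效果", "音樂"]}
#     # "畫面" is an atom word, so its replacement "視覺效果" goes to mapped;
#     # "音樂" already appears as a replacement word, so it moves to mapped as-is;
#     # "劇情" matches nothing and stays in left.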
# TF-IDF extraction of the topK keywords
"""
Input: data, the stopword list stopword, the mapping sheet mapword, and the accumulators mapped and keys_all.
When mode is 'tf', getKeywords is called once per text and the text id must be passed in.
When mode is 'tfidf', all texts are processed together in a single call, so id is unused; pass id = 0.
"""
def getKeywords(data, id, stopword, mapword, mapped, keys_all):
    # Pull id, title and abstract out of data and build three lists
    if mode == 'tfidf':
        idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    elif mode == 'tf':  # take only the id-th text
        idList, titleList, abstractList = [data['id'][id]], [data['title'][id]], [data['abstract'][id]]
    corpus = []  # all texts are collected into one list, one text per entry
result = pd.DataFrame({"id": [], "title": [], "key": [], "left": [], "mapped": []},
columns=['id', 'title', 'key', 'left', 'mapped'])
    # Preprocess each text and join the resulting words into one space-separated string per entry of corpus
    for index in range(len(idList)):
        text = '%s' % abstractList[index]
        text_seg = dataPrepos(text, stopword)
        text_seg = " ".join(text_seg)
        corpus.append(text_seg)
    if not any(corpus):
        return result  # no words survived preprocessing, nothing to extract
    # 1. Build the term-frequency matrix: convert the words of each text into counts
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)  # term-frequency matrix
    # 2. Compute the TF-IDF weight of every word
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    # 3. Get the vocabulary of the bag-of-words model
    try:
        word = vectorizer.get_feature_names_out()  # scikit-learn >= 1.0
    except AttributeError:
        word = vectorizer.get_feature_names()  # older scikit-learn
    # 4. Get the TF-IDF matrix
    weight = tfidf.toarray()
    # 5. Print the word weights
    # The following lists hold the text ids, titles, extracted keywords, mapped keywords and remaining keywords
ids, titles, keys, keys_mapped, keys_left = [], [], [], [], []
    for i in range(len(weight)):
        print("------- TF-IDF weights for text", i + 1, "-------")
        ids.append(idList[i])  # record the text id
        titles.append(titleList[i])  # record the text title
        df_word, df_weight = [], []  # words of the current text and their corresponding weights
        for j in range(len(word)):
            print(word[j], weight[i][j])
            if weight[i][j] > 0:  # keep only words with non-zero weight so words and weights stay aligned
                df_word.append(word[j])
                df_weight.append(weight[i][j])
        # Convert df_word and df_weight into DataFrames so they can be sorted together
        df_word = pd.DataFrame(df_word, columns=['word'])
        df_weight = pd.DataFrame(df_weight, columns=['weight'])
        word_weight = pd.concat([df_word, df_weight], axis=1)  # join words and weights column-wise
        word_weight = word_weight.sort_values(by="weight", ascending=False)  # sort by weight, descending
        keyword = np.array(word_weight['word'])  # take the word column as an array
        key = [keyword[x] for x in range(0, min(topK, len(keyword)))]  # take the topK words as keywords
        keys_all.extend(key)  # collect this text's keywords for the later high-frequency extraction
        # Keyword mapping
        key_left_mapped = keysMapping(key, mapword)
        # Join the lists into space-separated strings
        key = " ".join(key)
        key_left_split = " ".join(key_left_mapped["left"])
        key_mapped_split = " ".join(key_left_mapped["mapped"])
        mapped.extend(key_left_mapped["mapped"])  # merge every text's mapped keywords into mapped (duplicates kept)
        keys.append(key)
        keys_left.append(key_left_split)
        keys_mapped.append(key_mapped_split)
result = pd.DataFrame({"id": ids, "title": titles, "key": keys, "left": keys_left, "mapped": keys_mapped}, columns=['id', 'title', 'key', 'left', 'mapped'])
return result
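# Call pattern (mirrors main() below): in 'tfidf' mode all reviews of one film are
# scored in a single call, so IDF is computed across the reviews; in 'tf' mode each
# review is scored alone, so the IDF factor is constant and the ranking reduces to
# plain term frequency:
#
#     result = getKeywords(data, 0, stopword, mapword, mapped, keys_all)   # mode == 'tfidf'
#     result = getKeywords(data, i, stopword, mapword, mapped, keys_all)   # mode == 'tf', per text i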
# Extract the topN most frequent keywords
"""
Input: keys_all, the merged list of topK keywords extracted from every text.
Output: key_most, the topN most frequent keywords.
"""
def getKeymost(keys_all):
    counts = []
    keys_nodup = list(set(keys_all))  # keys_all with duplicates removed
    for item in keys_nodup:
        counts.append(keys_all.count(item))  # count how often each keyword appears
key_word = pd.DataFrame(keys_nodup, columns=['key'])
count_word = pd.DataFrame(counts, columns=['count'])
key_count = pd.concat([key_word, count_word], axis=1)
key_count = key_count.sort_values(by="count", ascending=False)
key_freq = np.array(key_count['key'])
key_most = [key_freq[x] for x in range(0, min(topN, len(key_word)))]
return key_most
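# getKeymost is a plain frequency count; a sketch of the same idea with the
# standard library, shown only for comparison (not used above):
#
#     from collections import Counter
#     key_most = [w for w, _ in Counter(keys_all).most_common(topN)]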
def main():
    # Remove results from previous runs
    for f in glob.glob(os.path.join('result', '*.xls')):
        os.remove(f)
    # Load the stopword list
    with io.open(stopword_path, 'r', encoding='UTF-8') as fp:
        stopword = [w.strip() for w in fp.readlines()]
    # Load the mapping sheet
    mapword = xlrd.open_workbook(map_path).sheet_by_index(0)
    # Load the user dictionary for jieba segmentation
    jieba.load_userdict(dict_path)
folderList = os.listdir(data_path)
    for folder in folderList:  # iterate over the film folders; each folder holds all reviews of one film
        folder_path = os.path.join(data_path, folder)
        # Read the data
        data = dataRead(folder_path)
        keys_all = []  # keywords extracted from all texts of this film
        mapped = []  # mapped keywords of all texts of this film, merged
        # Keyword extraction
        if mode == 'tfidf':
            result = getKeywords(data, 0, stopword, mapword, mapped, keys_all)
            result.to_csv("result/CHkeys_tfidf_" + folder + ".xls", index=False, encoding='utf-8-sig')
        elif mode == 'tf':
            for i in range(len(data['id'])):  # in 'tf' mode, getKeywords is called once per text
                result = getKeywords(data, i, stopword, mapword, mapped, keys_all)
                result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', header=False, index=False, encoding='utf-8-sig')
            mapped = list(set(mapped))  # drop duplicate mapped keywords
            mapped_result = pd.DataFrame([["", "", " ".join(mapped)]], columns=['', ' ', 'mapped'])  # two leading empty columns keep the original offset layout
            pd.DataFrame({"": [""]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # blank spacer row
            mapped_result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig')
            # Extract the most frequent keywords
            key_most = getKeymost(keys_all)
            key_most = pd.DataFrame([["", "", " ".join(key_most)]], columns=['', ' ', 'most mentioned'])  # same offset layout as above
            pd.DataFrame({"": [""]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # blank spacer row
            key_most.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig')
if __name__ == '__main__':
main()