
Python keyword extraction (jieba + sklearn)

#!/usr/bin/python
# coding=utf-8
# TF-IDF keyword extraction from text
# http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

import os
from config_ch import *  # expected to define data_path, stopword_path, map_path, dict_path, mode, pos, topK, topN (used as globals below)
import chardet
import numpy as np
import pandas as pd
import xlrd
import copy
import glob
import jieba.posseg
import jieba.analyse
import io
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

"""

TF-IDF權重:

1、CountVectorizer 建構詞頻矩陣

2、TfidfTransformer 建構TF-IDF權值計算

3、文本的關鍵詞

4、對應的TF-IDF矩陣

"""

# Data reading
"""
Takes the folder path data_path and returns data as a dict
with keys 'id', 'title' and 'abstract'.
"""
def dataRead(data_path):
    file_list = os.listdir(data_path)
    idList, titleList, abstractList = range(0, len(file_list)), [], []  # 3 lists: text ids, titles and bodies
    for file_name in file_list:
        file_path = os.path.join(data_path, file_name)
        if os.path.isfile(file_path):
            f = io.open(file_path, 'rb').read()
            encoding_type = chardet.detect(f)  # detect the file encoding
            if not encoding_type['encoding']:
                encoding_type['encoding'] = 'utf-8-sig'  # chardet may return None; fall back to utf-8-sig
            file = f.decode(encoding_type['encoding'])
            titleList.append(file[0:file.find('\n', 1) + 1])  # the first line of the text is its title
            abstractList.append(file)
    data = {"id": idList, "title": titleList, "abstract": abstractList}
    return data
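# Usage sketch (hypothetical layout: every file in the folder is one plain-text review):
#   data = dataRead('data/movie_01')
#   data['title'][0]     # first line of the first review
#   data['abstract'][0]  # full text of the first review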

# Preprocessing
"""
Takes a text and the stop-word list stopword, returns the token list text_seg.
Preprocessing = jieba segmentation, stop-word removal and POS filtering.
"""
def dataPrepos(text, stopword):
    text_seg = []
    seg = jieba.posseg.cut(text)  # segmentation with POS tags
    for i in seg:
        if i.word not in stopword and i.flag in pos:  # drop stop words and unwanted POS tags
            text_seg.append(i.word)
    return text_seg
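# Usage sketch (output depends on the POS whitelist pos from config_ch, e.g. nouns only):
#   dataPrepos(u'這部電影的配樂非常出色', stopword)  # -> roughly ['電影', '配樂']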

# Keyword mapping
"""
Takes the keyword list key and the mapping sheet mapword, returns key_left_mapped,
a dict holding the keywords left after mapping ("left") and the keywords produced
by the mapping ("mapped").
Column 1 of the mapping sheet is the atom word list; columns 2+ hold replacement words.
If a keyword is in the atom list, the replacement words of that atom are added to
mappedList and the keyword is removed from leftList;
if a keyword is itself a replacement word, the keyword is added to mappedList
and removed from leftList.
"""
def keysMapping(key, mapword):  # keywords found in the atom list go to mappedList; leftList keeps only the rest
    leftList, mappedList = copy.deepcopy(key), []  # initialize leftList and mappedList
    atom = mapword.col_values(0)
    for i in key:
        if i in atom:  # the keyword is an atom: substitute its replacement words
            mappedList.extend(mapword.row_values(atom.index(i))[1:])
            mappedList = list(filter(None, mappedList))  # drop "" strings
            leftList.pop(leftList.index(i))  # remove the keyword from leftList
        else:
            for n in range(len(atom)):
                row = mapword.row_values(n)[1:]
                if i in row:  # the keyword is itself a replacement word: keep it, remove it from leftList
                    mappedList.extend([i])
                    leftList.pop(leftList.index(i))
                    break
    mappedList = list(set(mappedList))  # de-duplicate
    key_left_mapped = {"left": leftList, "mapped": mappedList}
    return key_left_mapped
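# Mapping sketch with a hypothetical sheet whose first row is: 畫面 | 特效 | 視覺
# ('畫面' is the atom; '特效' and '視覺' are its replacement words):
#   keysMapping(['畫面', '劇情'], mapword)
#   # -> {'left': ['劇情'], 'mapped': ['特效', '視覺']} ('劇情' appears nowhere in the sheet)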

# TF-IDF extraction of the topK keywords
"""
Inputs: the data dict, a text id, the stop-word list stopword, the mapping sheet
mapword, and the accumulators mapped and keys_all.
In 'tf' mode, getKeywords is called once per text and the text id is required;
in 'tfidf' mode, all texts are handled in a single call and id is unused (pass id = 0).
"""
def getKeywords(data, id, stopword, mapword, mapped, keys_all):
    # pull id, title and abstract out of data into 3 lists
    if mode == 'tfidf':
        idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    elif mode == 'tf':  # take only the id-th text
        idList, titleList, abstractList = [data['id'][id]], [data['title'][id]], [data['abstract'][id]]
    corpus = []  # all texts go into this list, one text per row
    result = pd.DataFrame({"id": [], "title": [], "key": [], "left": [], "mapped": []},
                          columns=['id', 'title', 'key', 'left', 'mapped'])
    # preprocess each text and join its tokens into one space-separated corpus row
    for index in range(len(idList)):
        text = '%s' % abstractList[index]
        text_seg = dataPrepos(text, stopword)
        text_seg = " ".join(text_seg)
        corpus.append(text_seg)
    if corpus == ['']:
        return result  # empty text
    # 1. build the term-frequency matrix
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)  # term-frequency matrix
    # 2. compute the TF-IDF weight of every term
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    # 3. get the vocabulary of the bag-of-words model
    word = vectorizer.get_feature_names()  # on scikit-learn >= 1.2 use get_feature_names_out()
    # 4. get the TF-IDF matrix
    weight = tfidf.toarray()
    # 5. print the term weights
    # text ids, titles, extracted keywords, mapped keywords, keywords left after mapping
    ids, titles, keys, keys_mapped, keys_left = [], [], [], [], []
    for i in range(len(weight)):
        print(u"------- TF-IDF weights of text", i + 1, u"-------")
        ids.append(idList[i])  # record the text id
        titles.append(titleList[i])  # record the text title
        df_word, df_weight = [], []  # the vocabulary and the matching weights for the current text
        for j in range(len(word)):
            print(word[j], weight[i][j])
            if weight[i][j] == 0:
                df_word.append(' ')  # blank out zero-weight words so they cannot be picked as keywords
            else:
                df_word.append(word[j])
            df_weight.append(weight[i][j])  # always record the weight, keeping words and weights aligned
        # turn df_word and df_weight into DataFrames for sorting
        df_word = pd.DataFrame(df_word, columns=['word'])
        df_weight = pd.DataFrame(df_weight, columns=['weight'])
        word_weight = pd.concat([df_word, df_weight], axis=1)  # join the word and weight columns
        word_weight = word_weight.sort_values(by="weight", ascending=False)  # sort by weight, descending
        keyword = np.array(word_weight['word'])  # take the word column as an array
        key = [keyword[x] for x in range(0, min(topK, len(word)))]  # the topK words become the keywords
        keys_all.extend(key)  # collect every text's keywords for the frequency step later
        # keyword mapping
        key_left_mapped = keysMapping(key, mapword)
        # join the lists into space-separated strings
        key = " ".join(key)
        key_left_split = " ".join(key_left_mapped["left"])
        key_mapped_split = " ".join(key_left_mapped["mapped"])
        mapped.extend(key_left_mapped["mapped"])  # accumulate the mapped keywords of every text (duplicates kept)
        keys.append(key)
        keys_left.append(key_left_split)
        keys_mapped.append(key_mapped_split)
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys, "left": keys_left, "mapped": keys_mapped},
                          columns=['id', 'title', 'key', 'left', 'mapped'])
    return result
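# Usage sketch in 'tfidf' mode (one call scores all texts together; id is unused, pass 0):
#   mapped, keys_all = [], []
#   result = getKeywords(data, 0, stopword, mapword, mapped, keys_all)
#   result.columns  # -> id, title, key, left, mapped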

# Extract the topN most frequent keywords
"""
Input keys_all is the concatenation of the topK keyword lists of every text;
output key_most is the list of the topN most frequent keywords.
"""
def getKeymost(keys_all):
    counts = []
    keys_nodup = list(set(keys_all))  # keys_all without duplicates
    for item in keys_nodup:
        counts.append(keys_all.count(item))  # count the occurrences of each keyword
    key_word = pd.DataFrame(keys_nodup, columns=['key'])
    count_word = pd.DataFrame(counts, columns=['count'])
    key_count = pd.concat([key_word, count_word], axis=1)
    key_count = key_count.sort_values(by="count", ascending=False)
    key_freq = np.array(key_count['key'])
    key_most = [key_freq[x] for x in range(0, min(topN, len(key_word)))]
    return key_most
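# Equivalent frequency count with the standard library, shown for comparison:
#   from collections import Counter
#   key_most = [w for w, _ in Counter(keys_all).most_common(topN)]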

def main():
    # delete the results of previous runs
    for f in glob.glob(os.path.join('result', '*.xls')):
        os.remove(f)
    # load the stop-word list
    stopword = [w.strip() for w in io.open(stopword_path, 'r', encoding='UTF-8').readlines()]
    # load the mapping sheet
    mapword = xlrd.open_workbook(map_path).sheet_by_index(0)
    # load the user dictionary for jieba segmentation
    jieba.load_userdict(dict_path)
    folderList = os.listdir(data_path)
    for folder in folderList:  # each folder holds all the reviews of one movie
        folder_path = os.path.join(data_path, folder)
        # read the data
        data = dataRead(folder_path)
        keys_all = []  # keywords extracted from all texts
        mapped = []  # mapped keywords of all texts, merged
        # keyword extraction
        if mode == 'tfidf':
            result = getKeywords(data, 0, stopword, mapword, mapped, keys_all)
            result.to_csv("result/CHkeys_tfidf_" + folder + ".xls", index=False, encoding='utf-8-sig')
        elif mode == 'tf':
            for i in range(len(data['id'])):  # in 'tf' mode, getKeywords is called once per text
                result = getKeywords(data, i, stopword, mapword, mapped, keys_all)
                result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', header=False, index=False, encoding='utf-8-sig')
            mapped = list(set(mapped))  # de-duplicate
            # two padding columns keep 'mapped' aligned under the 'key' column of the table above
            mapped_result = pd.DataFrame([["", "", " ".join(mapped)]], columns=['', ' ', 'mapped'])
            pd.DataFrame({"": [""]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # blank spacer row
            mapped_result.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig')
            # extract the most frequent keywords
            key_most = getKeymost(keys_all)
            key_most = pd.DataFrame([["", "", " ".join(key_most)]], columns=['', ' ', 'most mentioned'])  # padded like the mapped row
            pd.DataFrame({"": [""]}).to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False)  # blank spacer row
            key_most.to_csv("result/CHkeys_tf_" + folder + ".xls", mode='a', index=False, encoding='utf-8-sig')

if __name__ == '__main__':
    main()
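For reference, here is a minimal, self-contained sketch of the core CountVectorizer + TfidfTransformer step used in getKeywords, run on a made-up two-review corpus (already segmented into space-separated tokens, as dataPrepos produces):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["畫面 精彩 配樂 出色",
          "劇情 拖沓 配樂 出色"]  # two pre-segmented toy reviews (made up for illustration)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # term-frequency matrix, shape (2, n_terms)
weight = TfidfTransformer().fit_transform(X).toarray()
for term, w in zip(vectorizer.get_feature_names(), weight[0]):
    print(term, round(w, 3))

Terms shared by both reviews (配樂, 出色) receive a lower IDF and therefore a lower weight than terms unique to one review, which is why the topK cut in getKeywords favours distinctive words.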