Step:
- 目标文章:鹿鼎記
- 實作功能:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 7 16:57:02 2018
@author: fslq
"""
#初步擷取文本
import os
import os.path
import codecs
import jieba
import numpy
import pandas
# Module-level accumulators shared by the functions below:
# ``fileContents`` collects the raw text of each file that is read,
# ``segments`` collects the words produced by segmentation.
fileContents, segments = [], []
def getContent(filespath,stonampath):# filespath: source folder path
    """Read every file under *filespath* and segment its text with jieba.

    Each file is decoded as UTF-8. Its full text is appended to the
    module-level ``fileContents`` list, and the words jieba yields for that
    text are appended to the module-level ``segments`` list.

    The previous version called ``jieba.cut`` a single time before looping
    over the collected texts, so the text fetched for each row was never the
    one being segmented. Here each file is segmented right after it is read.

    Args:
        filespath: Root directory, walked recursively with ``os.walk``.
        stonampath: Path passed to ``jieba.load_userdict`` before any text
            is segmented.

    Returns:
        None. Results are accumulated in ``fileContents`` and ``segments``.
    """
    # The user dictionary does not depend on the file being read: load it once.
    jieba.load_userdict(stonampath)
    for root, _dirs, files in os.walk(filespath):
        for name in files:
            # ``with`` closes the handle even if reading or decoding fails.
            with codecs.open(os.path.join(root, name), 'r', 'utf-8') as f:
                fileContent = f.read()
            fileContents.append(fileContent)
            # jieba.cut returns a one-shot iterator, so it must be created
            # per text; extend() drains it into the shared word list.
            segments.extend(jieba.cut(fileContent, cut_all=False))
def inputTxt(stonampath,stoworpath):
    """Write every segmented word that is NOT listed in *stonampath* to *stoworpath*.

    The words in the module-level ``segments`` list are counted, ordered by
    descending count, filtered against the ``stopword`` column of the table
    read from *stonampath*, and the remaining words are written as a
    one-column, tab-separated file with the header ``stopword``.

    Args:
        stonampath: CSV file (UTF-8) that must contain a ``stopword`` column;
            words found in that column are excluded from the output.
        stoworpath: Destination file for the resulting word list.

    Returns:
        None. The result is written to *stoworpath*.
    """
    segmentDataFrame = pandas.DataFrame({'stopword': segments})
    # Count occurrences per word. ``.agg({'name': func})`` on a single grouped
    # column is the dict-renaming form that pandas 1.0 removed; size() plus
    # reset_index(name=...) yields the same two columns on old and new pandas.
    segStat = (
        segmentDataFrame.groupby(by='stopword')
        .size()
        .reset_index(name='計數')
        .sort_values(by='計數', ascending=False)
    )
    stopwords = pandas.read_csv(stonampath, encoding='utf-8', index_col=False, engine='python')
    # Keep only the words that do not appear in the table's ``stopword`` column.
    fSegStat = segStat[~segStat.stopword.isin(stopwords.stopword)]
    fSegStat.stopword.to_csv(stoworpath, header=True, index=False, sep='\t')
# Produce the table of how often each character name appears
def funTxt(stoworpath,namnumpath):
    """Count the words in ``segments`` that are not stopwords and save the table.

    The words in the module-level ``segments`` list are counted and ordered
    by descending count. Words present in the ``stopword`` column of the
    table at *stoworpath* are dropped, and the remaining ``segment`` / ``計數``
    rows are written to *namnumpath* as a tab-separated file. The file is
    then rewritten without any line that is empty or contains a double
    quote, an ASCII comma, an ideographic space (U+3000) or an ASCII space.

    Args:
        stoworpath: Stopword table (UTF-8 CSV with a column named
            ``stopword``).
        namnumpath: Output path for the name-count table.

    Returns:
        None. The result is written to *namnumpath*.
    """
    # Keep only the character names
    segmentDataFrame = pandas.DataFrame({'segment': segments})
    # Count occurrences per word. ``.agg({'name': func})`` on a single grouped
    # column is the dict-renaming form that pandas 1.0 removed; size() plus
    # reset_index(name=...) yields the same two columns on old and new pandas.
    segStat = (
        segmentDataFrame.groupby(by='segment')
        .size()
        .reset_index(name='計數')
        .sort_values(by='計數', ascending=False)
    )
    stopwords = pandas.read_csv(stoworpath, encoding='utf-8', index_col=False, engine='python')
    fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
    # Written as UTF-8 explicitly because the file is re-read as UTF-8 below.
    fSegStat.to_csv(namnumpath, header=True, index=False, sep='\t', encoding='utf-8')

    # Drop unwanted rows from the file that was just written.
    with open(namnumpath, 'r', encoding='utf8') as r:
        lines = r.readlines()
    unwanted = ('"', ',', '\u3000', ' ')
    with open(namnumpath, 'w', encoding='utf8') as w:
        w.writelines(
            l for l in lines
            # ``==`` rather than ``is``: identity of equal strings is an
            # interpreter detail and must not decide which lines are kept.
            if l != '\n' and not any(ch in l for ch in unwanted)
        )
# Plotting
def _load_mask_image(mask_path):
    """Return the image at *mask_path* as an array for use as a WordCloud mask."""
    try:
        # Loader used by the original script; it was removed in SciPy 1.2.
        from scipy.misc import imread
    except ImportError:
        import matplotlib.pyplot as plt
        image = plt.imread(mask_path)
        # matplotlib returns PNG data as floats in [0, 1]; convert to 8-bit
        # values so the array has the integer form scipy's loader returned.
        if image.dtype.kind == 'f':
            image = (image * 255).round().astype(numpy.uint8)
        return image
    return imread(mask_path)


def draws(namnumpath,
          font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
          mask_path=r'C:\Users\fslq\Desktop\PythonFile\timg.png'):
    """Show a word cloud and a top-15 horizontal bar chart of the name counts.

    Args:
        namnumpath: Tab-separated UTF-8 file with ``segment`` and ``計數``
            columns. Its file name without extension is used as the title
            of the word-cloud figure.
        font_path: Font file used for the word cloud and for the figure
            text. Defaults to the path the script originally hard-coded.
        mask_path: Image file passed to ``WordCloud`` as ``mask``. Defaults
            to the path the script originally hard-coded.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    import ntpath

    import matplotlib.pyplot as plt
    from matplotlib.font_manager import FontProperties
    from wordcloud import WordCloud

    font = FontProperties(fname=font_path, size=12)
    self_img = _load_mask_image(mask_path)
    fSegStat = pandas.read_csv(namnumpath, encoding='utf-8', sep='\t', engine='python')

    wordcloud = WordCloud(
        font_path=font_path,
        background_color='white',
        mask=self_img,
        width=1500,
        height=1500,
    )
    # {segment: count} mapping taken from the ``計數`` column.
    words = fSegStat.set_index('segment').to_dict()
    wordcloud.fit_words(words['計數'])

    # The title used to be namnumpath[44:52], which equals the file stem only
    # for one specific directory prefix. ntpath splits on both '\\' and '/'.
    title = ntpath.splitext(ntpath.basename(namnumpath))[0]
    plt.title(title, fontproperties=font)
    plt.axis("off")
    plt.imshow(wordcloud)

    # Bar chart of the first 15 rows, reversed so the first row is drawn on top.
    loandata = fSegStat.head(15)
    fd = loandata.set_index('segment')
    axes = fd[::-1].plot(kind='barh', rot=0)
    axes.set_yticklabels(loandata.segment[::-1], fontproperties=font)
    plt.legend(prop=font)
    plt.show()
- 主程式:
# 1. The Deer and the Cauldron
if __name__ == '__main__':
    # Input folder holding the text to analyse; change it to analyse another work.
    filespath = r'C:\Users\fslq\Desktop\PythonFile\filespath'
    # Name list: passed to getContent and used by inputTxt as the exclusion table.
    stonampath = r'C:\Users\fslq\Desktop\PythonFile\stonampath\人物名稱-停用詞.txt'
    # Stopword table: written by inputTxt, read back by funTxt.
    stoworpath = r'C:\Users\fslq\Desktop\PythonFile\stoworpath\停用表.txt'
    # Name-count table: written by funTxt, plotted by draws.
    namnumpath = r'C:\Users\fslq\Desktop\PythonFile\namnumpath\分析金庸人物計數.txt'

    getContent(filespath, stonampath)
    inputTxt(stonampath, stoworpath)
    funTxt(stoworpath, namnumpath)
    draws(namnumpath)
- 執行結果:
Python爬取金庸人物 Python爬取金庸人物