天天看點

Python爬取金庸人物

Step:

  • 目標文章:鹿鼎記
  • 實作功能:以 jieba 對小說文本分詞,統計人物名稱出現次數,並繪製詞雲與前 15 名長條圖

# -*- coding: utf-8 -*-

"""
Created on Sat Jul  7 16:57:02 2018

@author: fslq
"""
#初步擷取文本
import os
import os.path
import codecs
import jieba
import numpy
import pandas
# Read the text content.
# Module-level accumulators shared by the functions below:
# fileContents collects the raw text of every file read by getContent().
fileContents=[]  
# segments collects the tokens produced by jieba in getContent(); inputTxt()
# and funTxt() read it to build their word counts.
segments=[]   
def getContent(filespath,stonampath):
    """Read every file under *filespath* and segment its text with jieba.

    The raw text of each file is appended to the module-level
    ``fileContents`` list and every token jieba yields is appended to the
    module-level ``segments`` list.

    Args:
        filespath: Root folder that is walked recursively; every file found
            is decoded as UTF-8.
        stonampath: Path handed to ``jieba.load_userdict`` so the custom
            words are known before segmentation starts.
    """
    # Load the user dictionary first so it applies to every file.
    jieba.load_userdict(stonampath)

    for root, _dirs, files in os.walk(filespath):
        for name in files:
            # The context manager closes the handle even if decoding fails.
            with codecs.open(os.path.join(root, name), 'r', 'utf-8') as f:
                text = f.read()
            fileContents.append(text)
            # jieba.cut returns a one-shot generator, so it has to be created
            # per file; sharing a single generator across files would only
            # ever segment one of them.
            segments.extend(jieba.cut(text, cut_all=False))

def inputTxt(stonampath,stoworpath):
    """Write the stop-word table: every segmented word not listed in *stonampath*.

    Counts the tokens in the module-level ``segments`` list, drops those that
    appear in the ``stopword`` column of the table at *stonampath*, and writes
    the remaining words (most frequent first) to *stoworpath* as a
    single-column, tab-separated file with the header ``stopword``.

    Args:
        stonampath: UTF-8 table with a ``stopword`` column; words listed here
            are kept OUT of the generated table.
        stoworpath: Destination path of the generated table.
    """
    segmentDataFrame = pandas.DataFrame({'stopword': segments})
    # Count occurrences per word. groupby().size() replaces the dict-renaming
    # form of SeriesGroupBy.agg, which pandas >= 1.0 rejects with
    # SpecificationError.
    segStat = (
        segmentDataFrame.groupby(by='stopword')
        .size()
        .reset_index(name='計數')
        .sort_values(by='計數', ascending=False)
    )
    stopwords = pandas.read_csv(
        stonampath, encoding='utf-8', index_col=False, engine='python'
    )
    # Keep only the words that are absent from the table read above.
    fSegStat = segStat[~segStat.stopword.isin(stopwords.stopword)]
    fSegStat.stopword.to_csv(
        stoworpath, header=True, index=False, sep='\t', encoding='utf-8'
    )
#得到人物出現次數資料
def funTxt(stoworpath,namnumpath):
    """Count how often each non-stop-word occurs and save the table.

    Counts the tokens in the module-level ``segments`` list, removes every
    token found in the ``stopword`` column of the table at *stoworpath*,
    writes the result to *namnumpath* as a tab-separated file with the
    columns ``segment`` and ``計數`` (most frequent first), then rewrites that
    file without the lines that contain unwanted characters.

    Args:
        stoworpath: UTF-8 stop-word table; its column must be named
            ``stopword``.
        namnumpath: Destination path of the per-word count table.
    """
    segmentDataFrame = pandas.DataFrame({'segment': segments})
    # Count occurrences per word. groupby().size() replaces the dict-renaming
    # form of SeriesGroupBy.agg, which pandas >= 1.0 rejects with
    # SpecificationError.
    segStat = (
        segmentDataFrame.groupby(by='segment')
        .size()
        .reset_index(name='計數')
        .sort_values(by='計數', ascending=False)
    )
    stopwords = pandas.read_csv(
        stoworpath, encoding='utf-8', index_col=False, engine='python'
    )
    fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
    fSegStat.to_csv(
        namnumpath, header=True, index=False, sep='\t', encoding='utf-8'
    )

    # Post-process the written file: drop blank lines and any line containing
    # a double quote, a comma, an ideographic space (U+3000) or a plain space.
    unwanted = ('"', ',', '\u3000', ' ')
    with open(namnumpath, 'r', encoding='utf8') as r:
        lines = r.readlines()
    with open(namnumpath, 'w', encoding='utf8') as w:
        w.writelines(
            line for line in lines
            # Compare by value: "is" on a str literal tests object identity
            # and is not guaranteed to match an equal string.
            if line != '\n' and not any(ch in line for ch in unwanted)
        )
#繪圖
def draws(namnumpath,
          font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
          mask_path=r'C:\Users\fslq\Desktop\PythonFile\timg.png',
          title=None):
    """Plot a word cloud and a top-15 bar chart from the count table.

    Args:
        namnumpath: Tab-separated UTF-8 table with the columns ``segment``
            and ``計數`` (as written by funTxt).
        font_path: Font file used for the word cloud and for the title,
            tick labels and legend.
        mask_path: Image used as the word-cloud mask.
        title: Title of the word-cloud figure. Defaults to the file name of
            *namnumpath* without its extension.
    """
    import ntpath
    from matplotlib.font_manager import FontProperties
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    font = FontProperties(fname=font_path, size=12)

    # scipy.misc.imread was removed in SciPy 1.2, so read the mask through
    # matplotlib. plt.imread returns PNG data as floats in [0, 1]; scale it
    # back to 0-255 unsigned bytes so pure white stays exactly 255.
    self_img = plt.imread(mask_path)
    if self_img.dtype.kind == 'f':
        self_img = (self_img * 255).round().astype(numpy.uint8)

    fSegStat = pandas.read_csv(
        namnumpath, encoding='utf-8', sep='\t', engine='python'
    )
    wordcloud = WordCloud(
        font_path=font_path,
        background_color='white',
        mask=self_img,
        width=1500,
        height=1500,
    )
    # {word: count} mapping taken from the 計數 column.
    words = fSegStat.set_index('segment').to_dict()
    wordcloud.fit_words(words['計數'])

    if title is None:
        # ntpath splits on both "\\" and "/", so the stem is found on any OS;
        # a fixed slice would only fit one specific directory layout.
        title = ntpath.splitext(ntpath.basename(namnumpath))[0]
    plt.title(title, fontproperties=font)
    plt.axis("off")
    plt.imshow(wordcloud)

    # Horizontal bar chart of the first 15 rows; reversed so the first row
    # ends up at the top of the chart.
    loandata = fSegStat.head(15)
    fd = loandata.set_index('segment')
    fd[::-1].plot(kind='barh', rot=0).set_yticklabels(
        loandata.segment[::-1], fontproperties=font
    )
    plt.legend(prop=font)
    plt.show()
     
           
  • 主程式:
# 1. The Deer and the Cauldron (鹿鼎記)
if __name__=='__main__':
    # Folder holding the novel's text files; change this to analyse another work.
    filespath=r'C:\Users\fslq\Desktop\PythonFile\filespath'

    # Input word list, the generated stop-word table and the final count table.
    stonampath=r'C:\Users\fslq\Desktop\PythonFile\stonampath\人物名稱-停用詞.txt'
    stoworpath=r'C:\Users\fslq\Desktop\PythonFile\stoworpath\停用表.txt'
    namnumpath=r'C:\Users\fslq\Desktop\PythonFile\namnumpath\分析金庸人物計數.txt'

    # Pipeline: segment -> build stop-word table -> count -> plot.
    getContent(filespath=filespath, stonampath=stonampath)
    inputTxt(stonampath=stonampath, stoworpath=stoworpath)
    funTxt(stoworpath=stoworpath, namnumpath=namnumpath)
    draws(namnumpath=namnumpath)
           
  • 執行結果:
    Python爬取金庸人物
    Python爬取金庸人物