天天看点

python 统计哈姆雷特词汇频率

使用基础语法实现:哈姆雷特词频统计

python 统计哈姆雷特词汇频率
'''
Text word frequency statistics.

Count the most frequent words in Hamlet, first with list.count (main)
and then with dict.get (main1).
'''

import wordcloud
import time


def file_change(path="../use_data/hamlet.txt", encoding=None):
    '''Read a text file and normalise it for word counting.

    The text is lower-cased and every punctuation character listed below is
    replaced by a single space, so the result can be tokenised with
    ``str.split()``.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding passed to ``open()``. ``None`` keeps
            ``open()``'s platform-dependent default.

    Returns:
        The normalised file contents as a single string.
    '''
    # The backslash is escaped explicitly: a bare "\|" is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    punctuation = '!"#$%^&*()_+-=~`:;{}[]\\|<>,.?/'
    # One translation pass instead of one str.replace() call per character.
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    with open(path, encoding=encoding) as source:
        return source.read().lower().translate(to_space)


def main():
    '''Print the ten most frequent words, counting with ``list.count``.

    Reads the normalised text from ``file_change()``, counts every word and
    prints up to ten ``word:count`` lines, most frequent first.
    '''
    hamlet_txt = file_change()
    # split() with no argument splits on any run of whitespace
    # (spaces, tabs, newlines); split(" ") would split on single spaces only.
    hamlet_list = hamlet_txt.split()
    dic = {}
    for word in hamlet_list:
        # list.count rescans the whole list for every word, so this loop is
        # quadratic in the number of words and slow on long texts.
        dic[word] = hamlet_list.count(word)
    # Sort the (word, count) pairs by count, highest first.
    ham_list = sorted(dic.items(), key=lambda s: s[1], reverse=True)
    # Slicing (rather than indexing 0..9) avoids an IndexError when the text
    # has fewer than ten distinct words.
    for m, n in ham_list[:10]:
        print("{}:{}".format(m, n))


def main1():
    '''Print the ten most frequent words, counting with ``dict.get``.

    Reads the normalised text from ``file_change()``, tallies each word in a
    single pass over the word list and prints up to ten aligned
    ``word  count`` lines, most frequent first.
    '''
    ham_ls = file_change()
    words = ham_ls.split()
    counts = {}
    for word in words:
        # get() returns 0 for a word not seen yet, so no membership test is
        # needed before incrementing.
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    items.sort(key=lambda s: s[1], reverse=True)
    # Slicing (rather than indexing 0..9) avoids an IndexError when the text
    # has fewer than ten distinct words.
    for m, n in items[:10]:
        print("{0:<10}{1:>5}".format(m, n))


if __name__ == '__main__':
    # Keep the timer and the report inside the guard so that importing this
    # module has no side effects; only a direct script run is timed.
    start = time.perf_counter()
    main()
    print("spend {} second".format(time.perf_counter() - start))

           
python 统计哈姆雷特词汇频率

在上面的程序中,main() 使用 list.count 统计词频,耗时约 13 秒。

改用 main1() 中的字典 get 方法统计,只需约 0.11 秒。

使用 wordcloud 库显示词频

''' 
Text word frequency statistics
use wordcloud

'''

import wordcloud
import time


def file_change(path="../use_data/hamlet.txt", encoding=None):
    '''Read a text file and normalise it for word counting.

    The text is lower-cased and every punctuation character listed below is
    replaced by a single space, so the result can be tokenised on whitespace.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding passed to ``open()``. ``None`` keeps
            ``open()``'s platform-dependent default.

    Returns:
        The normalised file contents as a single string.
    '''
    # The backslash is escaped explicitly: a bare "\|" is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    punctuation = '!"#$%^&*()_+-=~`:;{}[]\\|<>,.?/'
    # One translation pass instead of one str.replace() call per character.
    to_space = str.maketrans(punctuation, " " * len(punctuation))
    with open(path, encoding=encoding) as source:
        return source.read().lower().translate(to_space)

# Render the word frequencies as an image with the wordcloud library,
# timing the whole run.
start = time.perf_counter()

text = file_change()
# Create the WordCloud object: 500x500 canvas, at most 20 words,
# smallest font size 10.
words = wordcloud.WordCloud(
    width=500,
    height=500,
    max_words=20,
    min_font_size=10,
)
# Count the words in the text.
words.generate(text)
# Write the image, drawing more frequent words in a larger font.
words.to_file("../use_data/hamlet_words.jpg")

print("spend {} second".format(time.perf_counter() - start))