效果圖
jieba+wordcloud 詞雲分析 202302 QCon 議題 TOP 關鍵詞 步驟
(1)依賴:pip install jieba wordcloud
(2)源碼
import jieba
import wordcloud
def cut_word(input_path, out_path, top_num=30, exclude_words=None, user_words=None):
    """Segment a UTF-8 text file with jieba and save its most frequent words.

    The ranked words are printed to stdout and written to ``out_path`` as
    one ``word<TAB>count`` line per entry.

    Args:
        input_path: Path of the UTF-8 text file to analyse.
        out_path: Path of the UTF-8 file that receives the frequency table.
        top_num: Maximum number of entries to report. Fewer are reported
            when the text contains fewer distinct words.
        exclude_words: Words to leave out of the ranking (stop words).
        user_words: Custom terms passed to ``jieba.add_word`` before the
            text is segmented.

    Returns:
        ``out_path``, unchanged.
    """
    # Read through a context manager so the handle is always released.
    with open(input_path, 'r', encoding='utf-8') as src:
        txt = src.read()

    for user_word in user_words or ():
        jieba.add_word(user_word)

    # A set gives constant-time stop-word checks inside the counting loop.
    excluded = set(exclude_words or ())

    # Count word frequencies, skipping single-character tokens and stop words.
    count = {}
    for word in jieba.lcut(txt):
        if len(word) == 1 or word in excluded:
            continue
        count[word] = count.get(word, 0) + 1

    # Highest frequency first; the sort is stable, so ties keep the order in
    # which the words first appeared in the text.
    ranked = sorted(count.items(), key=lambda item: item[1], reverse=True)

    # Slicing (instead of indexing 0..top_num-1) avoids an IndexError when
    # the text has fewer than top_num distinct words.
    top = ranked[:max(top_num, 0)]

    for word, number in top:
        print("關鍵字:{:-<5}頻次:{}".format(word, number))

    with open(out_path, 'w', encoding='utf-8') as dst:
        dst.writelines('{}\t{}\n'.format(word, number) for word, number in top)
    return out_path
def _read_frequencies(text):
    """Parse ``word<TAB>count`` lines into a ``{word: count}`` dict.

    Returns None when any non-blank line does not have that shape, when a
    count is not a positive integer, or when no entries are found.
    """
    frequencies = {}
    for line in text.splitlines():
        if not line.strip():
            continue
        word, sep, number = line.rpartition('\t')
        if not sep or not word:
            return None
        try:
            value = int(number)
        except ValueError:
            return None
        if value <= 0:
            return None
        frequencies[word] = value
    return frequencies or None


def get_cloud(input_path, image_out_path, font_path=r'C:\Windows\Fonts\simhei.ttf'):
    """Render a word-cloud image from a text file.

    When the file is a ``word<TAB>count`` table, the counts are used as the
    word weights. Any other text is handed to ``WordCloud.generate`` as-is.

    Args:
        input_path: Path of the UTF-8 input file.
        image_out_path: Path of the image file to write.
        font_path: Font file used for rendering. The default is the
            Windows SimHei font; pass another path on other systems.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        text = f.read()

    wcloud = wordcloud.WordCloud(
        font_path=font_path,
        background_color='white',
        width=500,
        max_words=1000,
        height=400,
        margin=5,
    )

    # generate(text) re-tokenises the input and would ignore the counts in a
    # frequency table, so feed the parsed counts in directly when available.
    frequencies = _read_frequencies(text)
    if frequencies is not None:
        wcloud.generate_from_frequencies(frequencies)
    else:
        wcloud.generate(text)

    # Write the rendered cloud to the requested image path.
    wcloud.to_file(image_out_path)
    print("詞雲圖檔已儲存")
if __name__ == '__main__':
    # Generic words to leave out of the keyword ranking.
    stop_words = [
        '實踐', '技術', '基于', '應用', '建設', '實戰', '探索', '系統', '體系'
    ]
    # Domain terms that should be kept as single tokens during segmentation.
    custom_terms = ['雲原生', '研發效能', '分布式', '微服務']

    # Step 1: build the frequency table from the raw text.
    cut_word(
        './input.txt',
        out_path='./wordcloud.txt',
        top_num=200,
        exclude_words=stop_words,
        user_words=custom_terms,
    )
    # Step 2: render the word cloud from that table.
    get_cloud(input_path='./wordcloud.txt', image_out_path='./qcon.png')
參考
- https://www.cnblogs.com/yangyezhuang/p/16896980.html
- https://blog.csdn.net/zhangzeyuaaa/article/details/122192065
- https://baijiahao.baidu.com/s?id=1702691581630693235