為了讓結果更直覺,程式寫得比較簡單。
網頁位址:
https://search.51job.com/list/180200,000000,0000,00,9,99,%25E6%25AD%25A6%25E6%25B1%2589,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# -*- coding:utf-8 -*-
import requests
from lxml import etree
from wordcloud import WordCloud# 寫入csv
def write_csv(name, row):
    """Force UTF-8 as the default string encoding when running on Python 2.

    Despite its name, the visible body never touches a CSV file and never
    reads ``name`` or ``row``.
    NOTE(review): the CSV-writing code appears to be missing from this
    excerpt -- confirm against the original script.

    Args:
        name: Unused by the visible body.
        row: Unused by the visible body.

    Returns:
        None.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import sys`` being present.
    import sys

    # ``reload(sys)`` + ``sys.setdefaultencoding`` is a Python 2 workaround.
    # On Python 3 ``reload`` is not a builtin and ``setdefaultencoding`` does
    # not exist, so the calls are skipped there instead of raising.
    if sys.version_info[0] < 3:
        reload(sys)  # noqa: F821 - builtin on Python 2 only
        sys.setdefaultencoding("utf-8")
# Count word occurrence frequency.
def get_count(text):
    """Run ``jieba.cut`` over ``text``.

    The result is bound to a local and not used further, so the function
    returns ``None``.
    NOTE(review): the actual frequency counting appears to be missing from
    this excerpt -- confirm against the original script.
    """
    tokens = jieba.cut(text)
# jieba word segmentation.
def chinese_jieba(text):
    """Return the ``jieba.cut`` pieces of ``text`` joined by single spaces."""
    return " ".join(jieba.cut(text))
# Generate the word-cloud image.
def get_ciyun(text):
    """Segment ``text``, print the result, and call ``image.show()``.

    NOTE(review): ``path``, ``d`` and ``image`` are not defined in the
    visible code, and the step that builds the word cloud seems to be
    missing from this excerpt -- confirm against the original script.
    """
    segmented = chinese_jieba(text)
    print(segmented)
    # Disabled: optional mask image.
    # mask_pic = numpy.array(Image.open(os.path.join(cur_path, "bit.jpg")))
    # Settings listed by the original author: background colour, word count,
    # word font size, font file path (must sit in the same directory as the
    # .py file), words to exclude, mask layer.
    symbola_font = path.join(d, 'fonts', 'Symbola', 'Symbola.ttf')
    image.show()
# Fetch the data.
def get_data(url):
    """Return the text collected for ``url``.

    The visible body only prepares request headers and returns the initial
    empty string; neither ``url`` nor ``headers`` is used.
    NOTE(review): the HTTP request and parsing step appear to be missing
    from this excerpt -- confirm against the original script.

    Args:
        url: Address of the listing page (unused by the visible body).

    Returns:
        str: The accumulated result, which is ``''`` in the visible code.
    """
    result = ''
    # Browser-like request headers. The User-Agent is built from adjacent
    # literals so no stray backslash (an invalid ``\ `` escape left over
    # from a collapsed line continuation) ends up inside the header value.
    headers = {
        'Host': 'search.51job.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    return result
def main():
    """Fetch the listing text, show its word cloud, then run the count step."""
    # 51job search-result URL. The query parameter is ``degreefrom``; the
    # previous literal had ``&deg`` decoded into a degree sign, which
    # corrupted both the ``cotype`` and ``degreefrom`` parameters.
    url = (
        'https://search.51job.com/list/180200,000000,0000,00,9,99,'
        '%25E6%25AD%25A6%25E6%25B1%2589,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    text = get_data(url)
    # Data fetched; segment it and build the word cloud.
    get_ciyun(text)
    get_count(text)


# Run only when executed as a script. The call used to be fused onto the
# last statement of ``main``, which is a syntax error.
if __name__ == "__main__":
    main()
網頁:
結果
1 抓取結果
2 詞雲圖