最近聽到一首很喜歡的歌,許薇的《我以為》,歌曲下的評論也很有趣,於是想爬取該歌曲下的所有評論,並用詞雲工具展示。
我們使用 Chrome 開發者工具,發現歌曲的評論都藏在以 R_SO_4 開頭的 XHR 請求的回應中

接下來思路就很明确,拿到該檔案,解析該檔案的 json 資料,拿到全部評論。
我們可以看到該檔案有兩個用JS加密的參數 params 和 encSecKey ,關于這兩個加密參數,參考了知乎使用者的解答:
https://www.zhihu.com/question/36081767 。步驟:
1.導入必要的模組:
from Crypto.Cipher import AES
from wordcloud import WordCloud
#需加入下面兩句話,不然會報錯:matplotlib: RuntimeError: Python is not installed as a framework
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import base64
import requests
import json
import codecs
import time
import jieba
注:本人使用 macOS,在該環境下直接導入 matplotlib.pyplot 會報上述 RuntimeError,需在導入 pyplot 之前加入:
import matplotlib
matplotlib.use('TkAgg')
2.寫入請求頭:
# HTTP request headers sent with every comment request; they imitate a
# desktop Chrome browser that is viewing the page of song 28793052.
headers = {
    "Host": "music.163.com",
    "Origin": "https://music.163.com",
    "Referer": "https://music.163.com/song?id=28793052",
    # Adjacent string literals are concatenated into one User-Agent value.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
3.解析 params 和 encSecKey 這兩個參數:
# First parameter (shown for reference only; the real value is rebuilt for
# every page inside get_params):
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
# Second parameter. Not referenced by any other code visible in this file.
# NOTE(review): presumably the RSA public exponent that was used to produce
# encSecKey - confirm against the site's JavaScript.
second_param = "010001"
# Third parameter. Not referenced by any other code visible in this file.
# NOTE(review): presumably the RSA modulus that pairs with second_param - confirm.
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Fourth parameter: get_params uses it as the key of the first AES pass.
forth_param = "0CoJUm6Qyw8W8jud"
# 擷取參數
def get_params(page):  # page: 1-based page number
    """Build the encrypted ``params`` form field for one page of comments.

    The plain-text request body is encrypted twice with AES_encrypt: first
    with the module-level ``forth_param`` as key, then with a key made of
    sixteen ``'F'`` characters. Both passes share the same IV.

    Args:
        page: Page number to request. Page 1 asks for offset 0 with
            ``total`` set to "true"; any other page asks for offset
            ``(page - 1) * 20`` with ``total`` set to "false".

    Returns:
        The doubly encrypted text as returned by AES_encrypt.
    """
    aes_iv = "0102030405060708"
    if page == 1:
        plain = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        # Each page holds 20 comments, so the offset advances in steps of 20.
        plain = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (
            str((page - 1) * 20),
            'false',
        )
    inner = AES_encrypt(plain, forth_param, aes_iv)
    return AES_encrypt(inner, 16 * 'F', aes_iv)
# 擷取 encSecKey
def get_encSecKey():
    """Return the fixed, precomputed ``encSecKey`` form field.

    The value is a constant hexadecimal string; nothing is computed at
    call time.
    NOTE(review): presumably this is the encrypted form of the fixed second
    AES key (sixteen 'F' characters) used in get_params - confirm.
    """
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
# 加密過程(AES-CBC 加密後再做 Base64 編碼)
def AES_encrypt(text, key, iv):
    """Encrypt *text* with AES in CBC mode and return it Base64-encoded.

    The input is padded so that its length is a multiple of 16 bytes: ``pad``
    bytes, each with the value ``pad``, are appended (a full extra block of
    16 is added when the input is already aligned).

    Args:
        text: Plain text to encrypt, as ``str`` or bytes-like.
        key: AES key, as ``str`` or bytes-like.
        iv: Initialisation vector, as ``str`` or bytes-like.

    Returns:
        The ciphertext, Base64-encoded and decoded to a ``str``.
    """
    def _to_bytes(value):
        # The cipher works on bytes; str arguments are encoded as UTF-8.
        return value.encode("utf-8") if isinstance(value, str) else bytes(value)

    raw = _to_bytes(text)
    # Pad on the *byte* length so that non-ASCII text is padded correctly.
    pad = 16 - len(raw) % 16
    raw += bytes([pad]) * pad
    encryptor = AES.new(_to_bytes(key), AES.MODE_CBC, _to_bytes(iv))
    encrypted = encryptor.encrypt(raw)
    # b64encode returns bytes; decode so callers receive a str.
    return base64.b64encode(encrypted).decode("utf-8")
4.擷取 json 資料并抓取評論:
# 獲得評論json資料
def get_json(url, params, encSecKey, timeout=10):
    """POST the encrypted form to *url* and return the raw response body.

    Args:
        url: Comment endpoint to post to.
        params: Value of the ``params`` form field.
        encSecKey: Value of the ``encSecKey`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl.

    Returns:
        The response body as bytes (``response.content``).

    Raises:
        requests.exceptions.RequestException: On a timeout, a connection
            failure, or an HTTP error status.
    """
    payload = {
        "params": params,
        "encSecKey": encSecKey,
    }
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the JSON parser further down the line.
    response.raise_for_status()
    return response.content
# 抓取某一首歌的評論
def get_all_comments(url,page):
    """Fetch the text of every comment on the first *page* pages of a song.

    For each page number from 1 to *page*, the encrypted form fields are
    built, posted with get_json, and the ``content`` of every entry under
    the ``comments`` key of the decoded JSON is collected.

    Args:
        url: Comment endpoint of the song.
        page: Number of pages to crawl.

    Returns:
        A list with the content of every collected comment, as ``str``.
    """
    collected = []
    for page_no in range(1, page + 1):
        payload = json.loads(get_json(url, get_params(page_no), get_encSecKey()))
        collected.extend(str(entry['content']) for entry in payload['comments'])
        print('第%d頁抓取完畢!' % page_no)
        # Optional throttle: sleep here between pages when crawling too fast,
        # to reduce the load on the server.
    return collected
5.使用結巴分詞過濾停用詞并用 wordcloud 生成詞雲:
#生成詞雲
def wordcloud(all_comments):
    """Segment the comments, drop stop words and display a word cloud.

    Each comment is segmented with jieba (accurate mode). Words listed in
    ``stopwords.txt`` and tab characters are dropped, and the remaining words
    are appended to ``outputs.txt`` as one space-separated line per comment.
    The whole of ``outputs.txt`` is then read back and rendered with
    WordCloud and matplotlib.

    Args:
        all_comments: Iterable of comment strings.

    NOTE: ``outputs.txt`` is opened in append mode, so text left over from
    earlier runs is also part of the rendered cloud.
    """
    # Load the stop words once, as a set: the file does not change between
    # comments, and set membership tests are O(1).
    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    def seg_sentence(sentence):
        """Return the kept words of *sentence*, each followed by one space."""
        tokens = jieba.cut(sentence.strip(), cut_all=False)  # accurate mode
        return ''.join(
            word + " "
            for word in tokens
            if word not in stopwords and word != '\t'
        )

    # Open the output file once rather than once per comment.
    with open('outputs.txt', 'a', encoding='utf-8') as out_file:
        for line in all_comments:
            out_file.write(seg_sentence(line) + '\n')

    with open('outputs.txt', 'r', encoding='utf-8') as in_file:
        data = in_file.read()

    my_wordcloud = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words shown
        font_path=r'SimHei.ttf',  # a CJK font; Chinese is not rendered without it
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud
注意編碼格式為 'utf-8' 。
6.定義主函數并設定函數出口:
def main():
    """Crawl 2000 pages of comments, show the word cloud, print the run time."""
    started = time.time()
    # Replace with the R_SO link of the song you want to download.
    url = "https://music.163.com/weapi/v1/resource/comments/R_SO_4_28793052?csrf_token="
    comments = get_all_comments(url, page=2000)  # number of pages to crawl
    wordcloud(comments)
    elapsed = time.time() - started
    print('程式耗時%f秒.' % elapsed)


if __name__ == '__main__':
    main()
運作過程如下(個人爬取了《我以為》的前2000頁的評論):
生成詞雲:
完整代碼已上傳至 github:
https://github.com/weixuqin/PythonProjects/tree/master/wangyiyun