爬取哔哩哔哩彈幕

import re
import sqlite3
import requests
from lxml import etree
import threading
from queue import Queue
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import jieba
import random
import time
from wordcloud import WordCloud
import matplotlib.pyplot as plt
class BiliSpider:
    """Bilibili danmaku (bullet-comment) crawler.

    Fetches a mobile bangumi page, extracts every episode's cid, downloads
    each episode's danmaku XML, tokenises the comments with jieba, and
    finally hands the aggregated records to MySQL / a word cloud.
    """
    tindex = 0
    # NOTE(review): `global` at class scope means the four assignments
    # below create *module-level* globals, not class attributes.
    global g_wordlist
    global episodes
    global allBarrage
    global ct_episodes
    allBarrage = []   # records (word/episode/count dicts) destined for the DB
    episodes = {}     # episode titles; replaced by a list in run()
    ct_episodes = 1   # episode ordinal counter
    g_wordlist = []   # all jieba tokens accumulated across episodes

    def __init__(self):
        # Ask the user for an ep id and build the mobile page url.
        url_st = self.get_url()
        self.start_url = url_st
        # Forged mobile-browser headers so m.bilibili.com serves the page.
        self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
            'Host': 'm.bilibili.com',
        }
        # Danmaku XML endpoint, formatted with an episode's cid.
        self.barrage_url = 'https://comment.bilibili.com/{}.xml'
        # self.proxies = {'https': 'https://115.223.209.238:9000'}
        # Queue of danmaku-XML urls still to be requested.
        self.url_queue = Queue()
        # Queue of fetched html/xml response strings.
        self.html_str_q = Queue()
        # Queue holding the extracted episode-title list.
        self.ep_list_q = Queue()
        # Queue of parsed danmaku text lists.
        self.barrage_list_q = Queue()
        # print("在self前")
        print(self.barrage_list_q)

    def get_url(self):
        """Prompt for an ep id (e.g. ep63725) and return the mobile page url."""
        url_input = input("請輸入移動版的bilibili番劇ep号:\n(如ep63725)")
        # url ='https://m.bilibili.com/bangumi/play/ep63725'
        url = 'https://m.bilibili.com/bangumi/play/{}'.format(url_input)
        return url

    def parse_url(self, url=None, headers={}):
        """Fetch a url and return (or enqueue) its decoded response text.

        With url=None, one url is pulled from self.url_queue and the
        response text is pushed onto self.html_str_q instead of returned.
        NOTE(review): mutable default `headers={}` is a known anti-pattern;
        the `while True` loop runs exactly once because of the `return`
        inside it — both look like leftovers from the threaded version.
        """
        if url is None:
            while True:
                url = self.url_queue.get()
                print('\n彈幕xml為:')
                print(url)
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                self.html_str_q.put(res.text)
                # self.url_queue.task_done()
                return
        else:
            print('\n彈幕xml為:')
            print(url)
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return res.text

    def get_cid(self, html_str):
        """Extract every episode cid from the page's epList <script> JSON."""
        html = etree.HTML(html_str)
        print(html_str)
        script = html.xpath('//script[contains(text(),"epList")]/text()')[0]
        cid_list = re.findall(r'"cid":(\d+)', script)
        return cid_list

    # Episode titles and subtitles
    def get_episodes(self, html_str):
        """Extract episode titles (share_copy fields) and enqueue the list."""
        global episodes
        html = etree.HTML(html_str)
        ep_content = html.xpath('//script[contains(text(),"epList")]/text()')[0]
        print(ep_content)
        ep_list = re.findall(r'"share_copy":"(\S+)', ep_content)
        self.ep_list_q.put(ep_list)

    # Build danmaku-file urls
    def get_barrage_url(self, cid_list):
        """Enqueue one danmaku-XML url per cid (first cid is skipped)."""
        for i in (cid_list[1:]):
            self.url_queue.put(self.barrage_url.format(i))
        # return url_list

    def get_barrage_list(self):
        """Parse one queued XML string into a list of danmaku texts.

        NOTE(review): the `while True` runs once because of the `return`;
        leftover from the threaded version.
        """
        while True:
            barrage_str = self.html_str_q.get()
            barrage_str = barrage_str.encode('utf-8')
            barrage_xml = etree.HTML(barrage_str)
            barrage_list = barrage_xml.xpath('//d/text()')
            self.barrage_list_q.put(barrage_list)
            return barrage_list

    # NOTE(review): missing `self` — apparently meant as a sort key helper;
    # unused in this file.
    def takeSecond(elem):
        return elem[1]

    def save_barrage(self):
        """Tokenise one episode's danmaku and merge word counts into allBarrage.

        Reads one list from barrage_list_q, strips stop words (stop.txt),
        cuts with jieba, keeps per-word max counts per episode, and returns
        the accumulated allBarrage list.
        """
        global g_wordlist
        global ct_episodes
        global allBarrage
        ct_episodes += 1
        # Stop-word list, one word per line.
        stop = [line.strip() for line in open("stop.txt", 'r', encoding='utf-8').readlines()]
        barrage_list = self.barrage_list_q.get()
        # Dump raw danmaku to a scratch file.
        with open('barrage2.txt', 'w', encoding='utf-8') as f:
            for barrage in barrage_list:
                f.write(barrage)
                f.write('\n')
        fo = open('barrage2.txt', 'r', encoding='utf-8')
        tk = fo.read()
        for s in stop:
            tk = tk.replace(s, "")
        fo.close()
        wordlist = jieba.lcut(tk)
        b_ls = []
        # Per-word frequency for this episode.
        temp = {}
        for word in wordlist:
            duplicates = False
            if len(word) == 1:
                continue
            else:
                temp[word] = temp.get(word, 0) + 1
                count = temp[word]
                new = aBrrage(word, episodes[ct_episodes], count)
                # Keep only the highest count seen for each word.
                for n1 in b_ls[0:]:
                    if n1['word'] == new['word']:
                        duplicates = True
                        if int(new['count']) > int(n1['count']):
                            n_temp = new
                            b_ls.remove(n1)
                            b_ls.append(n_temp)
                        break
                # List of record dicts.
                if (duplicates == False):
                    b_ls.append(new)
        print("\n******")
        print(episodes[ct_episodes])
        print("字幕數量:", len(barrage_list))
        print("處理後彈幕數量:", len(b_ls))
        allBarrage.extend(b_ls)
        g_wordlist.extend(wordlist)
        print('擷取成功')
        return allBarrage

    # NOTE(review): dead code — builds and returns an empty dict.
    def create_dict(self):
        dict = {}
        wordlist = {}
        return dict

    def run(self):
        """Main pipeline: fetch page -> cids -> danmaku -> DB + word cloud."""
        global episodes
        # Request the initial video page.
        html_str = self.parse_url(url=self.start_url, headers=self.headers)
        # Extract the cids.
        cid_list = self.get_cid(html_str)
        print(cid_list)
        ep_list = self.get_episodes(html_str)
        # Build the danmaku urls.
        self.get_barrage_url(cid_list)
        # Episode titles (list) — first two entries are junk, hence -2 below.
        episodes = {}
        episodes = self.ep_list_q.get()
        ex_len = len(episodes)
        print('==========')
        for i in range(ex_len - 2):
            self.parse_url()
            self.get_barrage_list()
            res = self.save_barrage()
            time.sleep(random.random() * 3)  # polite delay between requests
            # Persist once, after the last episode.
            if (i == ex_len - 3):
                save_assql(res)
        # Build the overall word-frequency dict.
        wcdict = {}
        for word in g_wordlist:
            if len(word) == 1:
                continue
            else:
                wcdict[word] = wcdict.get(word, 0) + 1
        # Sort by frequency, descending.
        wcls = list(wcdict.items())
        wcls.sort(key=lambda x: x[1], reverse=True)
        # Print the top 25 words.
        print('輸出系列前二十五詞:')
        for i in range(25):
            print(wcls[i])
        ciyun(g_wordlist)
        print("儲存到資料庫")
        # Original threaded version (kept for reference):
        # for i in range(100):
        #     # barrage_str = self.parse_url(url)
        #     t_parse = threading.Thread(target=self.parse_url)
        #     t_parse.setDaemon(True)
        #     t_parse.start()
        #
        # # Extract the data
        # for i in range(2):
        #     # barrage_list = self.get_barrage_list(barrage_str)
        #     t_barrage_list = threading.Thread(target=self.get_barrage_list)
        #     t_barrage_list.setDaemon(True)
        #     t_barrage_list.start()
        #
        # # Write to file
        # for i in range(2):
        #     # self.save_barrage(barrage_list)
        #     t_save = threading.Thread(target=self.save_barrage)
        #     t_save.setDaemon(True)
        #     t_save.start()
        #
        # for q in [self.html_str_q, self.barrage_list_q, self.url_queue]:
        #     q.join()
        print('==========')
        print('主線程結束')
#儲存至資料庫
def save_assql(rows):
    """Append a list of danmaku record dicts to the MySQL table `bilibilitest`.

    Each record is a dict like {'word': ..., 'e_index': ..., 'count': ...}.
    An empty/None input is a no-op.

    Fixes vs original: the parameter no longer shadows the builtin `list`;
    the deprecated `encoding=` kwarg to create_engine is dropped (utf8 is
    already in the DSN and `encoding` was removed in SQLAlchemy 1.4+); the
    stray `pymysql.connect(...)` that opened and leaked an unused
    connection is removed.
    """
    if not rows:
        return
    con_info = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
    engine = create_engine(con_info)
    df = pd.DataFrame(rows)
    df.to_sql(name='bilibilitest', con=engine, if_exists='append', index=False)
def aBrrage(word, episodes, count):
    """Build one danmaku record dict.

    Args:
        word: the tokenised danmaku word.
        episodes: episode identifier/title this word belongs to.
        count: occurrence count of the word within the episode.

    Returns:
        dict with keys 'word', 'e_index', 'count'.

    Fix vs original: the first parameter no longer shadows the builtin
    `str`; the dict is built with a single literal.
    """
    return {'word': word, 'e_index': episodes, 'count': count}
def ciyun(wordlist):
    """Render and display a word cloud built from the given word list."""
    text = ''.join(wordlist)
    # Generate the cloud image and show it without axis decorations.
    cloud = WordCloud().generate(text)
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
if __name__ == '__main__':
    # Construct the spider (prompts for an ep id) and run the full pipeline.
    BiliSpider().run()
View Code
通過僞造headers通路
self.headers = {
'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
'Host': 'm.bilibili.com',
}
關鍵代碼:
1.擷取視訊id:cid

def get_cid(self, html_str):
    """Extract every episode cid from the page's epList <script> JSON."""
    html = etree.HTML(html_str)
    print(html_str)
    # The <script> element embedding the episode-list JSON.
    script = html.xpath('//script[contains(text(),"epList")]/text()')[0]
    # print(script)
    cid_list = re.findall(r'"cid":(\d+)', script)
    return cid_list
2.生成字典與詞雲

def save_barrage(self):
    """Earlier (threaded) variant: consume danmaku lists forever, counting words.

    NOTE(review): here `episodes` is treated as an int counter, unlike the
    final version where it holds the title list — TODO confirm against the
    main listing above in the post.
    """
    global episodes
    global g_wordlist
    episodes = episodes + 1
    # Stop-word list, one word per line.
    stop = [line.strip() for line in open("stop.txt", 'r', encoding='utf-8').readlines()]
    while True:
        barrage_list = self.barrage_list_q.get()
        # Dump the raw danmaku for this episode.
        g_wordlist = []
        # print(barrage_list)
        with open('barrage2.txt', 'w', encoding='utf-8') as f:
            for barrage in barrage_list:
                f.write(barrage)
                f.write('\n')
        fo = open('barrage2.txt', 'r', encoding='utf-8')
        tk = fo.read()
        for s in stop:
            tk = tk.replace(s, "")
        fo.close()
        wordlist = jieba.lcut(tk)
        b_ls = []
        ab_dict = {'word': "test", 'e_index': 999, 'count': 1}
        b_ls.append(ab_dict)
        # Per-word frequency dict.
        temp = {}
        for word in wordlist:
            duplicates = False
            if len(word) == 1:
                continue
            else:
                temp[word] = temp.get(word, 0) + 1
                count = temp[word]
                new = aBrrage(word, episodes, count)
                # ab_dict['word']= word
                # ab_dict['e_index'] = episodes
                # ab_dict['count']=ab_dict.get(word,0)+1
                # Keep only the highest count seen for each word.
                for n1 in b_ls[0:]:
                    if n1['word'] == new['word']:
                        duplicates = True
                        if int(new['count']) > int(n1['count']):
                            n_temp = new
                            b_ls.remove(n1)
                            b_ls.append(n_temp)
                        break
                # List of record dicts.
                if (duplicates == False):
                    b_ls.append(new)
        print("\n******")
        print("第", episodes, "集")
        print("字幕數量:", len(barrage_list))
        print("處理後彈幕數量:", len(b_ls))
        allBarrage.extend(b_ls)
        save_assql(allBarrage)
        ciyun(wordlist)
        print('儲存成功')
3.全局變量儲存彈幕等資訊

global g_wordlist   # jieba-tokenised danmaku words
global episodes     # episode titles parsed from the page
global allBarrage   # records (with episode info) destined for the database
global ct_episodes  # episode ordinal counter
allBarrage = []
episodes = {}
ct_episodes = 1     # the first two title entries are junk, so start at 1
g_wordlist = []
4.詞雲及資料庫儲存

def ciyun(wordlist):
    """Render and display a word cloud built from the given word list."""
    wl_split = ''.join(wordlist)
    # Generate and show the word cloud, hiding the axes.
    mywc = WordCloud().generate(wl_split)
    plt.imshow(mywc)
    plt.axis("off")
    plt.show()
def save_assql(list):
    """Append a list of danmaku record dicts to MySQL table `bilibilitest`.

    NOTE(review): the parameter shadows the builtin `list`; the trailing
    `pymysql.connect(...)` opens a connection that is never used or closed;
    `create_engine(encoding=...)` is deprecated in modern SQLAlchemy.
    """
    conInfo = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
    engine = create_engine(conInfo, encoding='utf-8')
    if (list != []):
        df = pd.DataFrame(list)
        df.to_sql(name='bilibilitest', con=engine, if_exists='append', index=False)
        pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='bilibili', charset='utf8')
    else:
        print('儲存失敗')
        return
5.主要邏輯

def run(self):
    """Main pipeline: fetch page -> cids -> danmaku -> DB + word cloud."""
    global episodes
    # Request the initial video page.
    html_str = self.parse_url(url=self.start_url, headers=self.headers)
    # Extract the cids.
    cid_list = self.get_cid(html_str)
    print(cid_list)
    ep_list = self.get_episodes(html_str)
    # Build the danmaku urls.
    self.get_barrage_url(cid_list)
    # Episode titles — first two entries are junk, hence -2 below.
    episodes = {}
    episodes = self.ep_list_q.get()
    ex_len = len(episodes)
    print('==========')
    for i in range(ex_len - 2):
        self.parse_url()
        self.get_barrage_list()
        res = self.save_barrage()
        time.sleep(random.random() * 3)  # polite delay between requests
        # Persist once, after the last episode.
        if (i == ex_len - 3):
            save_assql(res)
    # Build the overall word-frequency dict.
    wcdict = {}
    for word in g_wordlist:
        if len(word) == 1:
            continue
        else:
            wcdict[word] = wcdict.get(word, 0) + 1
    # Sort by frequency, descending.
    wcls = list(wcdict.items())
    wcls.sort(key=lambda x: x[1], reverse=True)
    # Print the top 25 words.
    print('輸出系列前二十五詞:')
    for i in range(25):
        print(wcls[i])
    ciyun(g_wordlist)
    print("儲存到資料庫")
    print('==========')
    print('主線程結束')
6.輸出詞雲能夠分析該視訊或者該系列視訊的關鍵詞
輸出系列劇集的前二十五詞:
拉取彈幕提示:
輸出該系列劇集的彈幕詞雲如下圖:
總結:
讀取的彈幕可以了解到該視訊的主要内容,能在看之前就較直覺地了解視訊的好評程度。若有鋪天遍地的謾罵那麼或許就不是一部适合大衆觀看的視訊。
若看到關鍵詞是自己喜歡的,那麼就是能很快選擇到自己喜歡的視訊。