天天看點

爬蟲綜合大作業

爬取哔哩哔哩彈幕

爬蟲綜合大作業
爬蟲綜合大作業

import re
import sqlite3

import requests
from lxml import etree
import threading
from queue import Queue
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import jieba
import random
import time
from wordcloud import WordCloud
import matplotlib.pyplot as plt

class BiliSpider:
    '''Bilibili danmaku (bullet-comment) crawler.

    Fetches a bangumi series page from the mobile site, extracts every
    episode's cid from the inline epList script, downloads each episode's
    danmaku XML, tokenizes the comments with jieba, and aggregates
    per-episode word counts into module-level lists that the helper
    functions (save_assql / ciyun) consume.
    '''
    tindex=0
    # NOTE(review): `global` at class-body scope makes the assignments
    # below create MODULE-level variables, not class attributes.  They
    # are shared mutable state used by the methods and the module-level
    # helper functions.
    global g_wordlist
    global episodes
    global allBarrage
    global ct_episodes
    allBarrage = []   # every processed barrage record (dicts built by aBrrage)
    episodes={}       # reassigned in run() to the list of episode share titles
    ct_episodes=1     # episode index into `episodes`; incremented before use
    g_wordlist=[]     # jieba tokens accumulated across all episodes
    def __init__(self):
        # Build the start URL from the user-supplied ep number (prompts on stdin).
        url_st = self.get_url()
        self.start_url = url_st
        # Mobile-browser headers so m.bilibili.com serves the page whose
        # inline script contains the epList data.
        self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
            'Host': 'm.bilibili.com',
        }

        # Danmaku XML endpoint, formatted with an episode's cid.
        self.barrage_url = 'https://comment.bilibili.com/{}.xml'

        # self.proxies = {'https': 'https://115.223.209.238:9000'}
        # Queue of danmaku XML urls still to be requested
        self.url_queue = Queue()
        # Queue of response bodies fetched by parse_url()
        self.html_str_q = Queue()
        # Queue carrying the extracted episode-title list
        self.ep_list_q = Queue()
        # Queue of per-episode lists of danmaku strings
        self.barrage_list_q = Queue()
        # Debug print of the (still empty) barrage queue object
        print(self.barrage_list_q)
    def get_url(self):
        """Prompt for an episode id (e.g. ep63725) and return the mobile page URL."""
        url_input = input("請輸入移動版的bilibili番劇ep号:\n(如ep63725)")
        # url ='https://m.bilibili.com/bangumi/play/ep63725'
        url='https://m.bilibili.com/bangumi/play/{}'.format(url_input)
        return url

    def parse_url(self, url=None, headers={}):
        """Fetch a URL and return or enqueue its decoded body.

        With an explicit `url`, GET it and return the text.  With
        `url=None`, take one URL off `url_queue`, GET it, and push the
        text onto `html_str_q` (consumed by get_barrage_list()).

        NOTE(review): `headers={}` is a mutable default argument; it is
        never mutated here, but the shared default dict is fragile style.
        """
        if url is None:
            while True:
                url = self.url_queue.get()
                print('\n彈幕xml為:')
                print(url)
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                self.html_str_q.put(res.text)
                # self.url_queue.task_done()
                # Single-shot: returns after one queue item, so the
                # `while True` only ever runs one iteration.
                return
        else:
            print('\n彈幕xml為:')
            print(url)
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return res.text

    def get_cid(self, html_str):
        """Extract every episode cid from the page's inline epList script."""
        html = etree.HTML(html_str)

        print(html_str)
        script = html.xpath('//script[contains(text(),"epList")]/text()')[0]
        cid_list = re.findall(r'"cid":(\d+)', script)
        return cid_list

    # Episode titles and subtitles
    def get_episodes(self, html_str):
        """Extract the share_copy episode titles and enqueue them on ep_list_q.

        Returns None; callers receive the titles via the queue.
        """
        global episodes

        html = etree.HTML(html_str)
        ep_content = html.xpath('//script[contains(text(),"epList")]/text()')[0]

        print(ep_content)
        ep_list = re.findall(r'"share_copy":"(\S+)', ep_content)
        self.ep_list_q.put(ep_list)

    # Build the danmaku XML url for each cid
    def get_barrage_url(self, cid_list):
        """Enqueue a danmaku XML URL per cid; the first cid is skipped."""
        for i in (cid_list[1:]):

            self.url_queue.put(self.barrage_url.format(i))
            # return url_list

    def get_barrage_list(self):
        """Parse one fetched XML body into its danmaku strings (<d> element text)."""
        while True:
            barrage_str = self.html_str_q.get()
            barrage_str = barrage_str.encode('utf-8')
            barrage_xml = etree.HTML(barrage_str)
            barrage_list = barrage_xml.xpath('//d/text()')

            self.barrage_list_q.put(barrage_list)
            # Single-shot: returns after one queue item.
            return barrage_list

    # NOTE(review): defined without `self`; if called as a method the
    # instance would be bound to `elem`.  Appears unused in this file.
    def takeSecond(elem):
        return elem[1]

    def save_barrage(self):
        """Process one episode's danmaku.

        Writes the raw comments to barrage2.txt, strips stop words
        (stop.txt), tokenizes with jieba, keeps the highest running count
        per multi-character word, and extends the module-level
        `allBarrage` / `g_wordlist` accumulators.  Returns `allBarrage`.
        """
        global g_wordlist
        global ct_episodes
        global allBarrage
        # Advance the episode index before use; the leading entries of
        # `episodes` are series-level metadata rather than episode titles.
        ct_episodes+=1



        # Stop-word list, one entry per line
        stop = [line.strip() for line in open("stop.txt", 'r', encoding='utf-8').readlines()]
        barrage_list = self.barrage_list_q.get()
        # Dump raw danmaku to a scratch file, then re-read it for cleaning
        with open('barrage2.txt', 'w', encoding='utf-8') as f:

            for barrage in barrage_list:

                f.write(barrage)
                f.write('\n')
            fo = open('barrage2.txt','r',encoding='utf-8')
            tk = fo.read()

            for s in stop:
                tk = tk.replace(s, "")
            fo.close()
            wordlist = jieba.lcut(tk)

            b_ls=[]

            # word -> running occurrence count within this episode
            temp ={}

            for word in wordlist:

                duplicates=False
                if len(word)==1:
                    # Skip single-character tokens
                    continue
                else:
                    temp[word]=temp.get(word,0)+1
                    count=temp[word]
                    new=aBrrage(word,episodes[ct_episodes],count)
                    # Keep only the record with the highest count per word
                    for n1 in b_ls[0:]:
                        if n1['word']==new['word']:
                            duplicates=True

                            if int(new['count'])>int(n1['count']):

                                n_temp = new
                                b_ls.remove(n1)
                                b_ls.append(n_temp)
                                break
                    # First occurrence of this word: append a new record
                    if(duplicates==False):
                        b_ls.append(new)


        print("\n******")
        print(episodes[ct_episodes])
        print("字幕數量:",len(barrage_list))
        print("處理後彈幕數量:",len(b_ls))

        allBarrage.extend(b_ls)
        g_wordlist.extend(wordlist)
        print('擷取成功')
        return allBarrage

    # NOTE(review): dead code — builds and discards two empty dicts
    # (and shadows the builtin `dict`).  Never called in this file.
    def create_dict(self):
        dict = {}
        wordlist = {}
        return dict

    def run(self):
        '''Main control flow.

        Fetch the series page, extract cids and episode titles, then
        sequentially download, parse and aggregate every episode's
        danmaku; after the last episode, persist to MySQL, print the top
        25 words and render a word cloud.
        '''
        global episodes
        # Request the initial series page
        html_str = self.parse_url(url=self.start_url, headers=self.headers)

        # Extract every episode's cid

        cid_list = self.get_cid(html_str)

        print(cid_list)

        # get_episodes() returns None; the titles travel via ep_list_q
        ep_list=self.get_episodes(html_str)
        # Build the danmaku XML urls
        self.get_barrage_url(cid_list)
        # Episode titles (leading entries are series-level metadata)
        episodes={}
        episodes = self.ep_list_q.get()


        ex_len=len(episodes)
        print('==========')


        for i in range(ex_len-2):
            self.parse_url()
            self.get_barrage_list()

            res = self.save_barrage()

            time.sleep(random.random() * 3)  # polite delay between requests
            if(i==ex_len-3):
                # Last episode processed: persist and summarize
                save_assql(res)
                # Build the series-wide word-frequency dict
                wcdict = {}
                for word in g_wordlist:
                    if len(word)==1:
                        continue
                    else:
                        wcdict[word]= wcdict.get(word,0)+1
                # Sort by frequency, descending
                wcls = list(wcdict.items())
                wcls.sort(key = lambda x:x[1],reverse=True)

                # Print the series' top 25 words
                print('輸出系列前二十五詞:')
                for i in range(25):
                    print(wcls[i])
                ciyun(g_wordlist)
                print("儲存到資料庫")



        # Threaded version kept from the reference implementation (unused)
        # for i in range(100):
        #     # barrage_str = self.parse_url(url)
        #     t_parse = threading.Thread(target=self.parse_url)
        #     t_parse.setDaemon(True)
        #     t_parse.start()
        #
        #     # 提取出資訊
        # for i in range(2):
        #     # barrage_list = self.get_barrage_list(barrage_str)
        #     t_barrage_list = threading.Thread(target=self.get_barrage_list)
        #     t_barrage_list.setDaemon(True)
        #     t_barrage_list.start()
        #
        # # 寫入檔案
        # for i in range(2):
        #     # self.save_barrage(barrage_list)
        #     t_save = threading.Thread(target=self.save_barrage)
        #     t_save.setDaemon(True)
        #     t_save.start()
        #
        #
        # for q in [self.html_str_q, self.barrage_list_q, self.url_queue]:
        #     q.join()
        print('==========')
        print('主線程結束')

    #儲存至資料庫
def save_assql(list):

        conInfo = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
        engine = create_engine(conInfo,encoding='utf-8')
        if(list!=[]):
            df = pd.DataFrame(list)
            df.to_sql(name = 'bilibilitest', con = engine, if_exists = 'append', index = False)
            pymysql.connect(host='localhost',port=3306,user='root',passwd='123456',db='bilibili',charset='utf8')
        else:
            return

def aBrrage(str, episodes, count):
    """Build one barrage record: a word, its episode label, and its count.

    NOTE(review): the first parameter shadows the builtin `str`; the name
    is kept for signature compatibility.
    """
    return {'word': str, 'e_index': episodes, 'count': count}

def ciyun(wordlist):
    """Join the tokens into a single string and display them as a word cloud."""
    text = ''.join(wordlist)
    cloud = WordCloud().generate(text)
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()
    
if __name__ == '__main__':
    # Entry point: construct the spider (prompts for an ep id) and run it.
    spider = BiliSpider()
    spider.run()

View Code

通過僞造headers通路

self.headers = {
            'Referer': 'https://www.bilibili.com/bangumi/play/ep7821',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # 'Cookie': 'finger=846f9182; LIVE_BUVID=AUTO7515275889865517; fts=1527589020; BANGUMI_SS_413_REC=7823; sid=bywgf18g; buvid3=89102350-5F5E-4056-A926-16EEC8780EE8140233infoc; rpdid=oqllxwklspdosimsqlwiw; bg_view_413=7820%7C7819%7C7823%7C7822',
            'Host': 'm.bilibili.com',
        }      

 關鍵代碼:

1.擷取視訊id:cid

爬蟲綜合大作業
爬蟲綜合大作業
def get_cid(self, html_str):
        html = etree.HTML(html_str)
        print(html_str)
        script = html.xpath('//script[contains(text(),"epList")]/text()')[0]

        # print(script)
        cid_list = re.findall(r'"cid":(\d+)', script)
        return cid_list      

2.生成字典與詞雲

爬蟲綜合大作業
爬蟲綜合大作業
def save_barrage(self):
        global episodes
        global g_wordlist
        episodes=episodes+1
        #停用詞表
        stop = [line.strip() for line in open("stop.txt", 'r', encoding='utf-8').readlines()]
        while True:
            barrage_list = self.barrage_list_q.get()
            #輸出彈幕
            g_wordlist=[]
            # print(barrage_list)
            with open('barrage2.txt', 'w', encoding='utf-8') as f:

                for barrage in barrage_list:

                    f.write(barrage)
                    f.write('\n')
                fo = open('barrage2.txt','r',encoding='utf-8')
                tk = fo.read()
                for s in stop:
                    tk = tk.replace(s, "")
                fo.close()
                wordlist = jieba.lcut(tk)

                b_ls=[]
                ab_dict = {'word':"test",'e_index':999,'count':1}
                b_ls.append(ab_dict)

                #生成一個字典

                temp ={}
                for word in wordlist:
                    duplicates=False
                    if len(word)==1:
                        continue
                    else:
                        temp[word]=temp.get(word,0)+1
                        count=temp[word]
                        new=aBrrage(word,episodes,count)
                        # ab_dict['word']= word
                        # ab_dict['e_index'] = episodes
                        # ab_dict['count']=ab_dict.get(word,0)+1
                        for n1 in b_ls[0:]:
                            if n1['word']==new['word']:
                                duplicates=True

                                if int(new['count'])>int(n1['count']):

                                    n_temp = new
                                    b_ls.remove(n1)
                                    b_ls.append(n_temp)
                                    break
                        #字典清單
                        if(duplicates==False):
                            b_ls.append(new)


            print("\n******")
            print("第",episodes,"集")
            print("字幕數量:",len(barrage_list))
            print("處理後彈幕數量:",len(b_ls))

            allBarrage.extend(b_ls)
            save_assql(allBarrage)
            ciyun(wordlist)
            print('儲存成功')      

3.全局變量儲存彈幕等資訊

爬蟲綜合大作業
爬蟲綜合大作業
global g_wordlist#儲存結巴彈幕
    global episodes#在存放xml的集标題
    global allBarrage#将儲存到資料庫有其他資訊如集資訊的清單
    global ct_episodes#集數序數
    allBarrage = []
    episodes={}
    ct_episodes=1#第一集集标題前兩項均為無效資訊
    g_wordlist=[]      

4.詞雲及資料庫儲存

爬蟲綜合大作業
爬蟲綜合大作業
def ciyun(wordlist):
    wl_split=''.join(wordlist)
    #生成詞雲

    mywc = WordCloud().generate(wl_split)
    plt.imshow(mywc)
    plt.axis("off")
    plt.show()

def save_assql(list):

        conInfo = "mysql+pymysql://root:123456@localhost:3306/bilibili?charset=utf8"
        engine = create_engine(conInfo,encoding='utf-8')
        if(list!=[]):
            df = pd.DataFrame(list)
            df.to_sql(name = 'bilibilitest', con = engine, if_exists = 'append', index = False)
            pymysql.connect(host='localhost',port=3306,user='root',passwd='123456',db='bilibili',charset='utf8')
        else:
            print('儲存失敗')
            return      

5.主要邏輯

爬蟲綜合大作業
爬蟲綜合大作業
def run(self):
        '''主要邏輯'''
        global episodes
        # 請求初始視訊url
        html_str = self.parse_url(url=self.start_url, headers=self.headers)

        # 提取資料cid

        cid_list = self.get_cid(html_str)

        print(cid_list)

        ep_list=self.get_episodes(html_str)
        # 組織彈幕的url
        self.get_barrage_url(cid_list)
        # 請求網址
        episodes={}
        episodes = self.ep_list_q.get()


        ex_len=len(episodes)
        print('==========')


        for i in range(ex_len-2):
            self.parse_url()
            self.get_barrage_list()

            res = self.save_barrage()

            time.sleep(random.random() * 3)#設定爬取的時間間隔
            if(i==ex_len-3):
                save_assql(res)
                #生成字典
                wcdict = {}
                for word in g_wordlist:
                    if len(word)==1:
                        continue
                    else:
                        wcdict[word]= wcdict.get(word,0)+1
                #排序
                wcls = list(wcdict.items())
                wcls.sort(key = lambda x:x[1],reverse=True)

                #輸出前二十五詞
                print('輸出系列前二十五詞:')
                for i in range(25):
                    print(wcls[i])
                ciyun(g_wordlist)
                print("儲存到資料庫")
        print('==========')
        print('主線程結束')      

6.輸出詞雲能夠分析該視訊或者該系列視訊的關鍵詞

輸出系列劇集的前十五詞:

爬蟲綜合大作業

拉取彈幕提示:

爬蟲綜合大作業

輸出該系列劇集的彈幕詞雲如下圖:

爬蟲綜合大作業

總結:

讀取的彈幕可以了解到該視訊的主要内容,能在看之前就較直覺地了解視訊的好評程度。若有鋪天遍地的謾罵那麼或許就不是一部适合大衆觀看的視訊。

若看到關鍵詞是自己喜歡的,那麼就是能很快選擇到自己喜歡的視訊。