天天看點

b站爬取視訊資訊,并以excel存儲資訊

運作環境:python 3.7.3

所需庫:

  1. requests
  2. xlwt
  3. bs4
  4. time
  5. multiprocessing

說明:b站有反爬機制,只能爬取50頁視訊的資訊

import xlwt
import requests
from bs4 import BeautifulSoup
import time
import os
from multiprocessing import Pool
# NOTE(review): `time` and `Pool` are imported but unused in this chunk — verify
# against the rest of the file before removing.
j=1# global row cursor: next excel row to write (row 0 holds the header)

def transform(string):
    """Normalize a bilibili count string to a plain-number string.

    Values given in units of 萬 (10,000) are expanded, e.g. '3.5萬' ->
    '35000.0'; any other string is returned unchanged.

    Args:
        string: raw count text scraped from the page (may be empty).

    Returns:
        str: the normalized count.
    """
    # endswith() also handles the empty string safely — the original
    # string[-1] raised IndexError on ''.
    if string.endswith('萬'):
        return str(float(string.replace('萬', '')) * 10000)
    return string

def singleweb(url):
    """Scrape one bilibili search-result page and append each video's info
    as one row of the module-level excel sheet.

    Relies on module globals: `head` (request headers), `sheet` (xlwt
    worksheet) and `j` (next row index, advanced once per row written).

    Args:
        url: full search-result page URL.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    global j
    r = requests.get(url, headers=head)
    r.raise_for_status()  # fail fast before touching the response body
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    def clean(info, title):
        # Extract the span with the given title attribute and strip the
        # newline/space padding the page puts around its text.
        span = info.find('span', title=title)
        if span is None:
            return ''
        return span.text.replace('\n', '').replace('\r', '').replace(' ', '')

    for info in soup.find_all('div', class_='info'):
        avid = info.find('span', class_='type avid')
        vtype = info.find('span', class_='type hide')
        if avid is None or vtype is None or info.a is None:
            continue  # skip a malformed entry instead of crashing the run
        things = [
            avid.text,
            vtype.text,
            info.a.get('title'),
            transform(clean(info, '觀看')),   # view count, 萬 expanded
            transform(clean(info, '彈幕')),   # danmaku count, 萬 expanded
            clean(info, '上傳時間'),
            clean(info, 'up主'),
        ]
        for k in range(0, 7):
            sheet.write(j, k, things[k])
        print('\r正在爬取第' + str(j) + '條資訊', end='')
        j = j + 1
if __name__ == "__main__":
    # Create the save directory; exist_ok=True replaces the old bare
    # `except: pass`, which silently hid real errors (e.g. permissions).
    os.makedirs('D://xlwt', exist_ok=True)
    keyword = input('輸入搜尋關鍵詞:')
    print('爬取的資訊将在操作完成後儲存在D://xlwt裡面')
    # Spreadsheet column headers (av id, type, title, views, danmaku,
    # upload time, uploader).
    header = ['av号', '類型', '标題', '觀看', '彈幕', '上傳時間', 'up主']
    # bilibili's anti-scraping cap: only the first 50 result pages.
    urls = ['https://search.bilibili.com/all?keyword=' + keyword + f'&page={i}'
            for i in range(1, 51)]
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    book = xlwt.Workbook(encoding='utf-8')  # create the excel workbook
    sheet = book.add_sheet('sheet1')
    for i, title in enumerate(header):  # write the header row
        sheet.write(0, i, title)
    for url in urls:
        singleweb(url)
    book.save('D://xlwt//' + keyword + '.xls')  # save the workbook
    print('\n操作完成')

           

運作結果如下圖所示:

b站爬取視訊資訊,并以excel存儲資訊