運作環境:python 3.7.3
所需庫:
- requests
- xlwt
- bs4
- time
- multiprocessing
說明:b站有反爬機制,只能爬取50頁視訊的資訊
import xlwt
import requests
from bs4 import BeautifulSoup
import time
import os
from multiprocessing import Pool
j=1  # Module-level counter: next worksheet row to write. Starts at 1 because row 0 holds the header.
def transform(string):
    """Normalize a count so that a trailing '萬' (x10,000) is expanded.

    Args:
        string: A count as text, e.g. ``'1.5萬'`` or ``'123'``.

    Returns:
        The expanded value as a string (``'1.5萬'`` -> ``'15000.0'``) when
        the input ends with '萬'; otherwise the input unchanged. An empty
        string is returned unchanged.

    Raises:
        ValueError: If the text before '萬' is not a valid float.
    """
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if string.endswith('萬'):
        number = string.replace('萬', '')
        return str(float(number) * 10000)
    return string
# Deletes the newline, carriage-return and space characters in one pass.
_DROP_CHARS = str.maketrans('', '', '\n\r ')


def _clean_text(tag):
    """Return the text of a parsed tag with '\\n', '\\r' and ' ' removed."""
    return tag.text.translate(_DROP_CHARS)


def singleweb(url, timeout=10):
    """Scrape one search-result page and write its entries to the worksheet.

    Reads the module-level ``head`` (request headers) and ``sheet``
    (worksheet), and advances the module-level row counter ``j`` by one
    for every entry written.

    Args:
        url: Address of the result page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    global j
    response = requests.get(url, headers=head, timeout=timeout)
    # Check the status first; no point sniffing the encoding of an error page.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    page = BeautifulSoup(response.text, 'html.parser')

    for info in page.find_all('div', class_='info'):
        avid = info.find('span', class_='type avid')
        category = info.find('span', class_='type hide')
        link = info.a
        views = info.find('span', title='觀看')
        danmaku = info.find('span', title='彈幕')
        uploaded = info.find('span', title='上傳時間')
        uploader = info.find('span', title='up主')

        # A block missing any expected element is skipped; previously it
        # raised AttributeError and aborted the whole crawl.
        required = (avid, category, link, views, danmaku, uploaded, uploader)
        if any(element is None for element in required):
            continue

        row = [
            avid.text,
            category.text,
            link.get('title'),
            transform(_clean_text(views)),
            transform(_clean_text(danmaku)),
            _clean_text(uploaded),
            _clean_text(uploader),
        ]
        for column, value in enumerate(row):
            sheet.write(j, column, value)
        # '\r' rewrites the same console line as a progress indicator.
        print('\r正在爬取第' + str(j) + '條資訊', end='')
        j += 1
if __name__ == "__main__":
    # Local import: only the script entry point needs URL escaping.
    from urllib.parse import quote

    # Create the output directory. exist_ok=True tolerates a directory that
    # is already there, while real failures (e.g. permissions) surface now
    # rather than at save time after the whole crawl.
    os.makedirs('D://xlwt', exist_ok=True)

    keyword = input('輸入搜尋關鍵詞:')
    print('爬取的資訊将在操作完成後儲存在D://xlwt裡面')
    header = ['av号', '類型', '标題', '觀看', '彈幕', '上傳時間', 'up主']

    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string. Pages 1..50 are requested.
    encoded_keyword = quote(keyword, safe='')
    urls = [
        'https://search.bilibili.com/all?keyword=' + encoded_keyword + f'&page={i}'
        for i in range(1, 51)
    ]

    # Read by singleweb() as a module-level global.
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

    # Create the Excel workbook; `sheet` is also read by singleweb().
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('sheet1')
    for column, title in enumerate(header):  # header goes in row 0
        sheet.write(0, column, title)

    try:
        for url in urls:
            singleweb(url)
    finally:
        # Save whatever was collected, even if a page failed part-way,
        # so a late error does not discard all earlier rows.
        book.save('D://xlwt//' + keyword + '.xls')
    print('\n操作完成')
運作結果如下圖所示:
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiAzNfRHLGZkRGZkRfJ3bs92YsYTMfVmepNHL1UFVPNzaE1UMRpHW4Z0MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnL3gjN1IDO0EjM1IzNwkTMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)