1. Without the video title
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import threading
import time
from concurrent import futures
import csv

import requests

total = 0
result = []
lock = threading.Lock()


def run(url):
    # Fetch the stat API for one video and collect its counters.
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        other_info = requests.get(url, headers=headers, timeout=6).json()
        time.sleep(0.5)
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # comments
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Skip videos whose request fails or whose response lacks the expected fields.
        pass


def save_csv(file_csv, result):
    # Append the collected rows to the CSV file (the header is written once in main).
    csv_writer = csv.writer(file_csv)
    for row in result:
        csv_writer.writerow(row)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    csv.writer(file_csv).writerow(
        ["video ID", "views", "danmaku", "comments", "favorites", "coins", "shares", "likes"])
    print("Crawler started, fetching data")
    for i in range(1, 100):
        begin = 30000 * i
        urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, urls)
        save_csv(file_csv, result)
        result = []
    print("Crawler finished, {} records collected".format(total))
    file_csv.close()
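Before launching the full crawl, it can help to sanity-check the stat endpoint for a single video. The snippet below is a minimal sketch that reuses the same endpoint URL as the script above; the aid value is arbitrary and chosen only for illustration, and if bilibili has changed its API since this was written, the URL may need updating.

import requests

# Minimal sanity check against the stat endpoint used by the crawler above.
# aid=2 is an arbitrary example value, not one the original script depends on.
url = "http://api.bilibili.com/archive_stat/stat?aid=2"
resp = requests.get(url, timeout=6).json()
print(resp.get("code"), resp.get("data"))
# The crawler expects "data" to contain: aid, view, danmaku, reply, favorite, coin, share, like.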
2. With the video title
After a few attempts, this version no longer seems able to crawl; any pointers would be appreciated (a possible workaround is sketched after the script below).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import threading
import time
from concurrent import futures
import csv

from bs4 import BeautifulSoup
import requests

total = 0
result = []
lock = threading.Lock()


def run(title_url, info_url):
    # Fetch the video page for the title and the stat API for the counters.
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        name_info = requests.get(title_url, headers=headers, timeout=6)
        title_txt = BeautifulSoup(name_info.text, 'lxml')
        title_info = title_txt.find('span', class_="tit tr-fix").text
        other_info = requests.get(info_url, headers=headers, timeout=6).json()
        time.sleep(0.5)
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                title_info,        # video title
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # comments
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        # Skip videos whose pages fail to load, whose title span is missing,
        # or whose responses lack the expected fields.
        pass


def save_csv(file_csv, result):
    # Append the collected rows to the CSV file (the header is written once in main).
    csv_writer = csv.writer(file_csv)
    for row in result:
        csv_writer.writerow(row)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    csv.writer(file_csv).writerow(
        ["video title", "video ID", "views", "danmaku", "comments", "favorites", "coins", "shares", "likes"])
    print("Crawler started, fetching data")
    for i in range(1, 100):
        begin = 30000 * i
        title_urls = [
            "https://www.bilibili.com/video/av{}".format(j)
            for j in range(begin, begin + 300)
        ]
        info_urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, title_urls, info_urls)
        save_csv(file_csv, result)
        result = []
    print("Crawler finished, {} records collected".format(total))
    file_csv.close()
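One plausible reason this version stopped working is that the video page markup changed (or the title is now rendered client-side), so the span.tit.tr-fix lookup finds nothing. A possible workaround is to fetch the title from a JSON endpoint instead of parsing HTML. The sketch below assumes bilibili's x/web-interface/view endpoint; that endpoint is my assumption, not part of the original scripts, so verify it before relying on it.

import requests

def fetch_title(aid):
    # Hedged sketch: look up a video's title via a JSON endpoint instead of
    # scraping the page. The x/web-interface/view URL is an assumption and may
    # need adjusting if bilibili's API differs or changes.
    url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(aid)
    resp = requests.get(url, timeout=6).json()
    if resp.get("code") == 0:
        return resp["data"].get("title")
    return None

# Usage idea: inside run(), replace the BeautifulSoup lookup with
#     title_info = fetch_title(aid)
# where aid is the video number being crawled.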
References:
Based on source code studied on GitHub.