
Scraping Bilibili Video Stats and Saving Them to CSV

1. Without the video title

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
from concurrent import futures
import csv
import requests

total = 0  # number of records scraped so far
result = []  # scraped rows, shared across worker threads
lock = threading.Lock()

def run(url):
    # Worker: fetch one video's stats and append a row to the shared result list
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        # keep the request inside the try block so network and JSON errors
        # are skipped instead of killing the worker thread
        other_info = requests.get(url, headers=headers, timeout=6).json()
        time.sleep(0.5)  # throttle a little so 64 threads do not hammer the API
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # replies
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except Exception:
        # missing/removed videos and network errors are simply skipped
        pass


def save_csv(file_csv, result):
    # Write the header row, then all scraped rows, to the open CSV file
    csv_writer = csv.writer(file_csv)
    csv_writer.writerow(["aid", "views", "danmaku", "replies",
                         "favorites", "coins", "shares", "likes"])
    csv_writer.writerows(result)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    print("Crawler started, scraping data")
    for i in range(1, 100):
        begin = 30000 * i
        urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, urls)
    save_csv(file_csv, result)
    print("Crawler finished, scraped {} records in total".format(total))
    file_csv.close()
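
A debugging note (my addition, not from the original post): run() deliberately swallows errors, and even without the try/except, executor.map() only re-raises a worker's exception when its result iterator is consumed, so failed requests vanish silently. A minimal sketch of submit() plus as_completed(), which reports each worker's error as it finishes, reusing the run and urls names from the script above:

from concurrent import futures

with futures.ThreadPoolExecutor(64) as executor:
    # submit() returns a Future per task; as_completed() yields each one as it
    # finishes, and exception() exposes any error the worker raised
    tasks = [executor.submit(run, url) for url in urls]
    for task in futures.as_completed(tasks):
        if task.exception() is not None:
            print("worker failed:", task.exception())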
           

2. With the video title

After a few more attempts, this version no longer seems to work; pointers from anyone who knows why would be appreciated.
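
One possible explanation, plus a hedged workaround (my assumption, not verified in the original post): Bilibili video pages are now rendered largely by JavaScript, so the 'tit tr-fix' span may simply no longer exist in the raw HTML. The title is also exposed through the public JSON view API, which avoids HTML parsing entirely; the endpoint and field names below reflect the current public API and are not part of the original code:

import requests

def fetch_title(aid):
    # Assumed endpoint: the web-interface "view" API returns video metadata,
    # including the title, as JSON, so BeautifulSoup is not needed at all
    url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(aid)
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=6).json()
    if resp.get("code") == 0:  # code 0 means success; nonzero means missing or blocked
        return resp["data"]["title"]
    return None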

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
from concurrent import futures
import csv
from bs4 import BeautifulSoup
import requests

total = 0  # number of records scraped so far
result = []  # scraped rows, shared across worker threads
lock = threading.Lock()


def run(title_url, info_url):
    # Worker: fetch one video's page for the title plus its stats, then append a row
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        name_info = requests.get(title_url, headers=headers, timeout=6)
        title_txt = BeautifulSoup(name_info.text, 'lxml')
        # find() returns None when the layout changes or the video is gone,
        # so keep this inside the try block
        title_info = title_txt.find('span', class_="tit tr-fix").text
        other_info = requests.get(info_url, headers=headers, timeout=6).json()
        time.sleep(0.5)  # throttle a little so 64 threads do not hammer the API
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                title_info,        # video title
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # replies
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except Exception:
        # missing/removed videos, layout changes, and network errors are skipped
        pass


def save_csv(file_csv, result):
    # Write the header row, then all scraped rows, to the open CSV file
    csv_writer = csv.writer(file_csv)
    csv_writer.writerow(["title", "aid", "views", "danmaku", "replies",
                         "favorites", "coins", "shares", "likes"])
    csv_writer.writerows(result)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    print("Crawler started, scraping data")
    for i in range(1, 100):
        begin = 30000 * i
        title_urls = [
            "https://www.bilibili.com/video/av{}".format(j)
            for j in range(begin, begin + 300)
        ]
        info_urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, title_urls, info_urls)
    save_csv(file_csv, result)
    print("Crawler finished, scraped {} records in total".format(total))
    file_csv.close()
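
A design note on save_csv() (my suggestion, not the post's approach): both versions buffer every row in memory and write the file only at the very end, so a crash partway through a long run loses everything scraped so far. A minimal sketch of incremental writing, where each row is appended under a lock as soon as it is scraped:

import csv
import threading

csv_lock = threading.Lock()

def open_incremental_writer(path):
    # Open the file once and write the header up front; workers then append
    # rows as they finish instead of buffering everything in result
    f = open(path, "w", encoding="utf-8-sig", newline="")
    writer = csv.writer(f)
    writer.writerow(["title", "aid", "views", "danmaku", "replies",
                     "favorites", "coins", "shares", "likes"])
    return f, writer

def append_row(writer, row):
    # csv.writer objects are not documented as thread-safe, so serialize writes
    with csv_lock:
        writer.writerow(row)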
           
