
Scraping Bilibili Video Stats and Saving Them to CSV

1. Without the video title

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
from concurrent import futures
import csv
import requests

total = 0  # number of records scraped so far
result = []  # scraped rows, shared across worker threads
lock = threading.Lock()

def run(url):
    # Worker: fetch one video's stats and append a row to the shared result list
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        # keep the request inside the try block so network and JSON errors
        # are skipped instead of killing the worker thread
        other_info = requests.get(url, headers=headers, timeout=6).json()
        time.sleep(0.5)  # throttle a little so 64 threads do not hammer the API
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # replies
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except Exception:
        # missing/removed videos and network errors are simply skipped
        pass


def save_csv(file_csv, result):
    # Write the header row, then all scraped rows, to the open CSV file
    csv_writer = csv.writer(file_csv)
    csv_writer.writerow(["aid", "views", "danmaku", "replies",
                         "favorites", "coins", "shares", "likes"])
    csv_writer.writerows(result)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    print("Crawler started, scraping data")
    for i in range(1, 100):
        begin = 30000 * i
        urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, urls)
    save_csv(file_csv, result)
    print("Crawler finished, scraped {} records in total".format(total))
    file_csv.close()
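
A debugging note (my addition, not from the original post): run() deliberately swallows errors, and even without the try/except, executor.map() only re-raises a worker's exception when its result iterator is consumed, so failed requests vanish silently. A minimal sketch of submit() plus as_completed(), which reports each worker's error as it finishes, reusing the run and urls names from the script above:

from concurrent import futures

with futures.ThreadPoolExecutor(64) as executor:
    # submit() returns a Future per task; as_completed() yields each one as it
    # finishes, and exception() exposes any error the worker raised
    tasks = [executor.submit(run, url) for url in urls]
    for task in futures.as_completed(tasks):
        if task.exception() is not None:
            print("worker failed:", task.exception())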
           

2. With the video title

After a few more attempts, this version no longer seems to work; pointers from anyone who knows why would be appreciated.
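
One possible explanation, plus a hedged workaround (my assumption, not verified in the original post): Bilibili video pages are now rendered largely by JavaScript, so the 'tit tr-fix' span may simply no longer exist in the raw HTML. The title is also exposed through the public JSON view API, which avoids HTML parsing entirely; the endpoint and field names below reflect the current public API and are not part of the original code:

import requests

def fetch_title(aid):
    # Assumed endpoint: the web-interface "view" API returns video metadata,
    # including the title, as JSON, so BeautifulSoup is not needed at all
    url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(aid)
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=6).json()
    if resp.get("code") == 0:  # code 0 means success; nonzero means missing or blocked
        return resp["data"]["title"]
    return None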

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
from concurrent import futures
import csv
from bs4 import BeautifulSoup
import requests

total = 0  # number of records scraped so far
result = []  # scraped rows, shared across worker threads
lock = threading.Lock()


def run(title_url, info_url):
    # Worker: fetch one video's page for the title plus its stats, then append a row
    global total
    headers = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    }
    try:
        name_info = requests.get(title_url, headers=headers, timeout=6)
        title_txt = BeautifulSoup(name_info.text, 'lxml')
        # find() returns None when the layout changes or the video is gone,
        # so keep this inside the try block
        title_info = title_txt.find('span', class_="tit tr-fix").text
        other_info = requests.get(info_url, headers=headers, timeout=6).json()
        time.sleep(0.5)  # throttle a little so 64 threads do not hammer the API
        data = other_info["data"]
        if data["view"] != "--" and data["aid"] != 0:
            video = [
                title_info,        # video title
                data["aid"],       # video ID (aid)
                data["view"],      # views
                data["danmaku"],   # danmaku count
                data["reply"],     # replies
                data["favorite"],  # favorites
                data["coin"],      # coins
                data["share"],     # shares
                data["like"],      # likes
            ]
            with lock:
                result.append(video)
                total += 1
                if total % 100 == 0:
                    print(total)
    except Exception:
        # missing/removed videos, layout changes, and network errors are skipped
        pass


def save_csv(file_csv, result):
    # Write the header row, then all scraped rows, to the open CSV file
    csv_writer = csv.writer(file_csv)
    csv_writer.writerow(["title", "aid", "views", "danmaku", "replies",
                         "favorites", "coins", "shares", "likes"])
    csv_writer.writerows(result)


if __name__ == "__main__":
    path = r'E:\python work\b站使用者資訊\B站使用者資訊.csv'  # edit to match your own storage location
    file_csv = open(path, 'w+', encoding='utf-8-sig', newline='')
    print("Crawler started, scraping data")
    for i in range(1, 100):
        begin = 30000 * i
        title_urls = [
            "https://www.bilibili.com/video/av{}".format(j)
            for j in range(begin, begin + 300)
        ]
        info_urls = [
            "http://api.bilibili.com/archive_stat/stat?aid={}".format(j)
            for j in range(begin, begin + 300)
        ]
        with futures.ThreadPoolExecutor(64) as executor:
            executor.map(run, title_urls, info_urls)
    save_csv(file_csv, result)
    print("Crawler finished, scraped {} records in total".format(total))
    file_csv.close()
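
A design note on save_csv() (my suggestion, not the post's approach): both versions buffer every row in memory and write the file only at the very end, so a crash partway through a long run loses everything scraped so far. A minimal sketch of incremental writing, where each row is appended under a lock as soon as it is scraped:

import csv
import threading

csv_lock = threading.Lock()

def open_incremental_writer(path):
    # Open the file once and write the header up front; workers then append
    # rows as they finish instead of buffering everything in result
    f = open(path, "w", encoding="utf-8-sig", newline="")
    writer = csv.writer(f)
    writer.writerow(["title", "aid", "views", "danmaku", "replies",
                     "favorites", "coins", "shares", "likes"])
    return f, writer

def append_row(writer, row):
    # csv.writer objects are not documented as thread-safe, so serialize writes
    with csv_lock:
        writer.writerow(row)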
           
