【python爬蟲實戰】爬取豆瓣影評資料

概述：

爬取豆瓣影評資料步驟：
1、擷取網頁請求
2、解析擷取的網頁
3、提取資料
4、儲存檔案
源代碼：

# 1、導入需要的庫
import urllib.request
from bs4 import BeautifulSoup
# 随機數的庫
import random
# 時間庫
import time
# 表格庫
import csv

# 2、分多個浏覽器通路豆瓣網，防止通路多頁時被拒絕
# 每個浏覽器在請求資料的時候，請求頭是不一樣
# 計算機命名規則：駝峰命名法
# url：傳值過來的通路位址
def getRequest(url):
    """Build a request for *url* carrying a randomly chosen browser header set.

    Two header sets are defined, one with a Chrome User-Agent and one with a
    Firefox User-Agent; one of them is picked at random on every call so that
    consecutive page requests do not all present the same browser identity.

    Args:
        url: Address to request.

    Returns:
        An unsent ``urllib.request.Request`` for *url* with the chosen headers.
    """
    # Chrome
    chromeHeaders = {
        "Host": "movie.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    }
    # Firefox
    firefoxHeaders = {
        "Host": "movie.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:73.0) Gecko/20100101 Firefox/73.0",
    }
    # random.choice replaces the manual randint index and avoids naming a
    # local variable `list`, which shadowed the builtin.
    headers = random.choice((chromeHeaders, firefoxHeaders))
    return urllib.request.Request(url=url, headers=headers)


# 封裝函數，爬取資料
def getData(url,commentAll):
    """Fetch one comments page and append its comments to *commentAll*.

    The page is requested with the headers chosen by ``getRequest``, parsed
    with BeautifulSoup, and every ``.comment-item`` inside the ``#comments``
    element is turned into a dict with the keys ``"author"``, ``"star"`` and
    ``"short"``.

    Args:
        url: Address of the comments page to fetch.
        commentAll: List that the parsed comment dicts are appended to
            (mutated in place).

    Returns:
        The same *commentAll* list, after the new entries were appended.

    Raises:
        IndexError: If the page has no ``#comments`` element, or a comment
            lacks the expected ``.comment-info`` / ``.short`` / second
            ``span`` element.
        Exceptions raised by ``urllib.request.urlopen`` are not caught here.
    """
    req = getRequest(url)
    # The context manager closes the HTTP response even if read() raises;
    # previously the response object was never closed.
    with urllib.request.urlopen(req) as html:
        data = html.read()

    soup = BeautifulSoup(data, "html.parser")
    # select() returns a list; the container with id "comments" is its
    # first (expected only) element.
    comments = soup.select("#comments")[0]
    for item in comments.select(".comment-item"):
        # Element holding the user name and the rating.
        info = item.select(".comment-info")[0]
        # The user name is the text of the first <a> inside it.
        author = info.find("a").string
        # Per the original author's note the spans are
        # [watched-marker, rating, time]; the rating text is the title
        # attribute of the second one.
        # NOTE(review): assumes every comment has a rating span in second
        # position - confirm for comments posted without a rating.
        star = info.select("span")[1]["title"]
        # Comment body: text of the element with class "short".
        short = item.select(".short")[0].string
        commentAll.append({"author": author, "star": star, "short": short})
    return commentAll

# 封裝函數，把資料裝入表格中
def writeInto(commentAll):
    """Append every comment in *commentAll* as a row of ``douban.csv``.

    Each row holds the ``"author"``, ``"star"`` and ``"short"`` values of one
    comment dict, in that order. The file is opened in the current working
    directory in ``"a+"`` (append) mode, so rows from earlier runs are kept.

    Args:
        commentAll: Iterable of dicts with the keys ``"author"``, ``"star"``
            and ``"short"``.

    Raises:
        KeyError: If a comment dict lacks one of the three keys.
    """
    # newline="" lets the csv module control line endings itself.
    # The with-statement closes the file on exit, so no explicit close()
    # is needed inside it.
    with open("douban.csv","a+",encoding="utf-8",newline="") as file:
        writer = csv.writer(file)
        writer.writerows(
            (comment["author"], comment["star"], comment["short"])
            for comment in commentAll
        )

# 函數的入口
# 直接輸入main，有提示
if __name__ == "__main__":
    # Script entry point. The quotes around __main__ were backslash-escaped
    # outside of a string, which is a syntax error; plain quoting restored.

    # Accumulates the comment dicts collected from all pages.
    commentAll = []
    # range(0, 3) yields 0, 1, 2 -> three pages are fetched.
    for i in range(0,3):
        # limit=20 asks for 20 comments per page; start is the index of the
        # first comment on the page (0, 20, 40, ...).
        url = "https://movie.douban.com/subject/25931446/comments?start=%d&limit=20&sort=new_score&status=P"%(i*20)
        # Fetch and parse this page; results are appended to commentAll.
        getData(url,commentAll)
        # Pause 10 seconds after each page to avoid getting blocked.
        time.sleep(10)
    # After all pages are fetched, write everything to the CSV file.
    writeInto(commentAll)

    # Original author's tip: open the CSV in Notepad and save it as ANSI;
    # convert it back to utf-8 if you want to process the data afterwards.
效果圖：

作者

1、作者個人網站

2、作者CSDN

3、作者部落格園

4、作者簡書