python爬取豆瓣電影排行榜前250名

爬取豆瓣電影排行榜

1. 選擇頁面

首先，我們打開豆瓣的電影排行榜的頁面。網頁連結：https://movie.douban.com/top250

2. 頁面分析

接下來，我們要在這個頁面提取每一部電影的詳情頁連結、總頁碼數，以及每一部電影的短評。

我們先來提取電影詳情頁的連結。

# 擷取豆瓣top250每個頁面下的電影豆瓣連結清單
        movies_link_list = html.xpath('//li//div[@class="info"]/div[@class="hd"]/a/@href')

提取導航頁的每一部電影的短評。

`# 擷取每個頁面下的電影的代表影評
        movies_quote_list = html.xpath('//p[@class="quote"]/span[@class="inq"]/text()')`

提取電影排行榜的總頁數

`total_page_num = int(html.xpath('string(//span[@class="next"]/preceding-sibling::a[1])'))`

下面，我們提取每一部電影的詳細資訊。

python爬取豆瓣電影排行榜前250名

我們提取的資訊包括上圖中的電影排名、名稱、連結、評分、評價人數以及各個星級的評價人數，還有電影的導演，編劇、主演、類型、制片國家/地區、語言、上映日期、片長、又名和IMDb連結。代碼如下：

# 擷取電影排名
                movie_rank = r'電影排名:{}'.format(html.xpath('string(//div[@class="top250"]/span[@class="top250-no"])'))
                # 擷取電影名稱
                movie_name = r'電影名稱:{}'.format(html.xpath('string(//span[@property="v:itemreviewed"])'))
                # 擷取電影連結
                movie_link = r'電影連結:{}'.format(movie_link)
                # 擷取電影評分
                movie_score = r'電影評分:{}'.format(html.xpath('string(//strong[@property="v:average"])'))
                # 擷取電影評價總人數
                movie_evaluated_people = r'電影評價人數:{}'.format(html.xpath('string(//a[@class="rating_people"])'))
                # 擷取電影5個評價星級
                movie_stars_list = [i.strip() for i in
                                    html.xpath('//div[@class="item"]//span[starts-with(@class, "stars")]/text()')]
                # 擷取電影5個評價星級對應的評價人數的百分比
                movie_evaluated_per_list = html.xpath('//div[@class="item"]//span[@class="rating_per"]/text()')

3. 下載頁面

try:
        response = requests.get(url, headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
    except requests.ConnectionError:
        print('網頁資料爬取錯誤！！！！')
        return None

4.儲存頁面

我把爬取到的電影詳細資訊儲存到JSON檔案。每部電影的資訊會以追加方式寫入，各筆之間以空行分隔。

with open(file_name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, indent=4, ensure_ascii=False))
        f.write('\n' * 2)

5.完整代碼

下面是完整的代碼。

import json
import re
import time

import requests
from lxml import etree


def get_page_source(url, headers):
    """
    Download a page and return its HTML as text.

    :param url: address of the page to download
    :param headers: dict of HTTP request headers (e.g. User-Agent)
    :return: the response body decoded as UTF-8 when the status code is 200;
             None when the status code is anything else or the request fails
    """
    try:
        # headers must be passed by keyword: the second positional parameter
        # of requests.get is `params` (the query string), so passing the dict
        # positionally never sends it as request headers.
        # The timeout keeps a stalled connection from blocking the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError and Timeout,
        # so connection failures are still reported the same way as before.
        print('網頁資料爬取錯誤！！！！')
        return None
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None


def get_page_num(page_source):
    """
    Read the total number of listing pages from the pagination bar.

    :param page_source: HTML text of a listing page
    :return: page count (int), parsed from the text of the link that
             immediately precedes the span with class "next"
    """
    tree = etree.HTML(page_source)
    # string(...) yields the text of the first matching node; the nearest <a>
    # before span.next is taken to carry the last page number.
    last_page_text = tree.xpath('string(//span[@class="next"]/preceding-sibling::a[1])')
    return int(last_page_text)


def get_movie_info(page_source, headers):
    """
    Yield the detailed information of every movie linked from a listing page.

    For each movie the detail page is downloaded and a list of strings is
    produced in this order: rank, name, link, score, number of ratings, one
    entry per star level with its percentage, the lines of the detail page's
    "info" block, and finally the quote shown on the listing page.

    :param page_source: HTML text of a listing page (nothing is yielded when
                        it is empty or None)
    :param headers: request headers forwarded to get_page_source
    :return: generator of lists of strings, one list per movie
    """
    if not page_source:
        return
    listing = etree.HTML(page_source)
    # Walk the listing entry by entry so that each link is paired with the
    # quote of the same entry. Zipping two document-wide lists misaligns the
    # pairs (and drops trailing movies) as soon as one entry has no quote.
    for item in listing.xpath('//li//div[@class="info"]'):
        # string(...) evaluates to '' when the entry has no quote.
        movie_quote = item.xpath('string(.//p[@class="quote"]/span[@class="inq"])')
        for movie_url in item.xpath('./div[@class="hd"]/a/@href'):
            time.sleep(0.5)  # delay between requests to avoid being blocked
            detail_source = get_page_source(movie_url, headers)
            if not detail_source:
                continue
            detail = etree.HTML(detail_source)

            # Rank and name are needed both for the record and for the
            # progress message, so evaluate them once.
            rank = detail.xpath('string(//div[@class="top250"]/span[@class="top250-no"])')
            name = detail.xpath('string(//span[@property="v:itemreviewed"])')
            movie_info = [
                '電影排名:{}'.format(rank),
                '電影名稱:{}'.format(name),
                '電影連結:{}'.format(movie_url),
                '電影評分:{}'.format(detail.xpath('string(//strong[@property="v:average"])')),
                '電影評價人數:{}'.format(detail.xpath('string(//a[@class="rating_people"])')),
            ]

            # Star levels and the share of ratings each one received.
            star_labels = [
                label.strip() for label in
                detail.xpath('//div[@class="item"]//span[starts-with(@class, "stars")]/text()')
            ]
            star_percents = detail.xpath('//div[@class="item"]//span[@class="rating_per"]/text()')
            movie_info.extend(
                '{}評價人數:{}'.format(label, percent)
                for label, percent in zip(star_labels, star_percents)
            )

            # Text of the "info" block, split on a newline followed by
            # eight spaces; empty pieces are discarded.
            info_text = detail.xpath('string(//div[@id="info"])').strip()
            movie_info.extend(piece for piece in re.split(r'\n        ', info_text) if piece)

            # The listing-page quote always goes last.
            movie_info.append('影評:' + movie_quote)

            yield movie_info
            # Runs when the consumer asks for the next movie, i.e. after it
            # has finished handling the one just yielded.
            print('已經寫入第{}部電影\t{}\t的資訊'.format(rank, name))


def save_to_json(content, file_name):
    """
    Append one record to a file as pretty-printed JSON.

    :param content: JSON-serialisable movie information
    :param file_name: path of the output file (opened in append mode, UTF-8)
    :return: None
    """
    with open(file_name, mode='a', encoding='utf-8') as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        serialized = json.dumps(content, ensure_ascii=False, indent=4)
        # Two newlines separate this record from the next one appended.
        out_file.writelines((serialized, '\n\n'))


def main():
    """
    Crawl every page of the Douban Top 250 list and append each movie's
    information to top250.json.

    :raises SystemExit: when the first listing page cannot be downloaded,
                        because the page count cannot be determined without it
    """
    file_name = r'top250.json'
    url = r'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }
    first_page_source = get_page_source(url, headers)
    if not first_page_source:
        # get_page_source returns None on failure; passing that on to
        # get_page_num would only fail inside the HTML parser with an
        # unrelated error, so stop here with an explicit message.
        raise SystemExit('無法取得排行榜首頁，爬取中止')
    total_page_num = get_page_num(first_page_source)
    for each_page in range(total_page_num):
        if each_page == 0:
            # The first listing page is already in hand; reuse it rather
            # than downloading it a second time.
            page_source = first_page_source
        else:
            # Each listing page shows 25 movies; start is the list offset.
            url = 'https://movie.douban.com/top250?start={}'.format(each_page * 25)
            page_source = get_page_source(url, headers)
        for _movie_info in get_movie_info(page_source, headers):
            save_to_json(_movie_info, file_name)


if __name__ == '__main__':
    # Script entry point: announce the start, run the crawl, then report
    # that the information has been written.
    print('----------爬取開始----------')
    main()
    print('豆瓣電影排行榜前250的電影資訊寫入完畢')

python爬取豆瓣電影排行榜前250名

繼續閱讀

無法解析的外部符号 wmain，該符号在函數 "void cdecl mainCRTStartupHelper(struct HINSTANCE *,unsigned short con......

TestLink導出用例轉換工具(XML2Excel)

YAML簡介和PyYAML安全操作YAML支援的類型YAML的優點：yaml的基本文法python操作

Small tricks

libsvm for python 安裝

學習軟體測試基礎測試第七天

Zeppelin 配置通路 REST APIApache Zeppelin Configuration REST API

【Torch】最簡潔logging使用指南

27. Remove Element(清單)題目代碼

sort()函數到底是怎樣進行數字排序的

Cloud Studio初體驗

使用 ctypes 進行 Python 和 C 的混合程式設計

【python】【資料處理】畫多元資料分布圖

【python】netconf協定對接管理裝置

「Python 網絡自動化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 網絡裝置

在python中建立excel并寫入