天天看點

python3爬取豆瓣電影資訊(前500部)

import requests
from bs4 import BeautifulSoup
import operator
from lxml import etree
import json

class Spider(object):
    """Crawl Douban's movie listing and save the best-rated entries.

    The listing endpoint is queried once, every returned movie is enriched
    with its directors and actors from the per-movie abstract endpoint, and
    the highest-rated entries are appended to a text file.
    """

    # Listing endpoint; the ``tag`` value is the URL-encoded form of
    # "豆瓣高分" and the query asks for up to 1000 entries sorted by rank.
    SEARCH_URL = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=rank&page_limit=1000&page_start=0"
    # Per-movie abstract endpoint; ``%s`` is filled with the movie id.
    DETAIL_URL = "https://movie.douban.com/j/subject_abstract?subject_id=%s"

    # Class-level default so instances that bypass ``__init__`` still have
    # a timeout available in ``getHtml``.
    timeout = 10

    def __init__(self, timeout=10):
        """Set up the request headers.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument for every request.
        """
        # A desktop-browser User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
        self.timeout = timeout

    def getHtml(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address to request with the spider's headers.

        Returns:
            The response body as ``str``.
        """
        # Without a timeout a stalled connection would block forever.
        res = requests.get(url, headers=self.headers, timeout=self.timeout)
        return res.content.decode("utf-8")

    def handleInfo(self, top_n=10, output_path="moviesInfoDetail.txt"):
        """Crawl the listing, enrich each movie and save the top-rated ones.

        Each saved movie is written as the ``str()`` of its dict followed by
        ``\\r\\n``; the file is opened in append mode, so repeated runs add to
        the existing content.

        Args:
            top_n: Maximum number of movies to save. Fewer are written when
                the listing contains fewer entries.
            output_path: File the results are appended to.

        Raises:
            KeyError: If a response lacks an expected key.
            ValueError: If a response is not valid JSON or a movie's
                ``rate`` cannot be converted to ``float``.
        """
        movies = json.loads(self.getHtml(self.SEARCH_URL))["subjects"]

        print("正在爬取...")
        # Fetch director and actor information for every listed movie.
        for movie in movies:
            detail_json = self.getHtml(self.DETAIL_URL % (movie["id"]))
            movie_detail = json.loads(detail_json)["subject"]
            movie["directors"] = movie_detail["directors"]
            movie["actors"] = movie_detail["actors"]
            # The rating is converted to float so the sort below is
            # numeric rather than lexicographic.
            movie["rate"] = float(movie["rate"])

        # reverse=True sorts in descending order (the default is ascending).
        sorted_movies = sorted(
            movies, key=operator.itemgetter('rate'), reverse=True
        )
        # Slicing never raises when fewer than ``top_n`` movies exist.
        top_movies = sorted_movies[:max(top_n, 0)]

        # newline="" disables newline translation so the explicit "\r\n"
        # is written verbatim on every platform.
        with open(output_path, "a", encoding="utf-8", newline="") as f:
            f.writelines(str(movie) + '\r\n' for movie in top_movies)
        print("爬取完畢,結果儲存在%s中" % output_path)
if __name__ == "__main__":
    # Script entry point: build a spider and run the full crawl.
    Spider().handleInfo()
           

參考:http://www.facesjoy.cn/article/2019/10/20/9.html