import requests
from bs4 import BeautifulSoup
import operator
from lxml import etree
import json
class Spider(object):
    """Crawl Douban's movie search API and save the highest-rated entries.

    The search endpoint is asked for movies tagged "豆瓣高分" (the
    percent-encoded ``tag`` value in ``SEARCH_URL``). The best-rated
    results are enriched with director and actor names from the
    per-subject abstract endpoint and appended to a text file.
    """

    # JSON search endpoint; the response is read as {"subjects": [...]}.
    SEARCH_URL = (
        "https://movie.douban.com/j/search_subjects"
        "?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86"
        "&sort=rank&page_limit=1000&page_start=0"
    )
    # JSON detail endpoint; the response is read as {"subject": {...}}.
    DETAIL_URL = "https://movie.douban.com/j/subject_abstract?subject_id=%s"

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }

    def getHtml(self, url, timeout=10):
        """Download ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address to fetch with a GET request.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled connection would block the
                crawler forever.

        Returns:
            The response body as a ``str``.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.RequestException: On connection errors or timeouts.
        """
        res = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail here with a clear HTTP error instead of letting an error page
        # reach json.loads() and blow up with an unrelated message.
        res.raise_for_status()
        return res.content.decode("utf-8")

    def handleInfo(self, top_n=10, output_path="moviesInfoDetail.txt"):
        """Fetch the movie list and append the top-rated movies to a file.

        Each selected movie dict is written as ``str(dict)`` followed by
        ``'\\r\\n'``. The file is opened in append mode, so repeated runs
        add to the existing content.

        Args:
            top_n: Maximum number of movies to save (non-negative). Fewer
                are written when the search returns fewer results.
            output_path: File the results are appended to.

        Raises:
            requests.RequestException: If any HTTP request fails.
            KeyError: If a response lacks one of the expected JSON keys.
            ValueError: If a response is not valid JSON or a movie's
                ``rate`` cannot be converted to ``float``.
        """
        html = self.getHtml(self.SEARCH_URL)
        movies = json.loads(html)["subjects"]

        print("正在爬取...")
        # Ratings arrive as strings; convert them so sorting is numeric.
        for movie in movies:
            movie["rate"] = float(movie["rate"])

        # reverse=True sorts in descending order (the default is ascending).
        # Slicing tolerates result lists shorter than top_n.
        ranked = sorted(movies, key=operator.itemgetter('rate'), reverse=True)
        top_movies = ranked[:top_n]

        # Fetch director and lead-actor information. Only the movies that
        # will be written need it, which avoids one request per search hit.
        for movie in top_movies:
            detail_json = self.getHtml(self.DETAIL_URL % (movie["id"],))
            detail = json.loads(detail_json)["subject"]
            movie["directors"] = detail["directors"]
            movie["actors"] = detail["actors"]

        # Open the output once rather than once per movie.
        with open(output_path, "a+", encoding="utf-8") as f:
            for movie in top_movies:
                f.write(str(movie) + '\r\n')

        print("爬取完畢,結果儲存在%s中" % output_path)
if __name__ == "__main__":
    # Script entry point: build the crawler and run it once.
    Spider().handleInfo()
# Reference: http://www.facesjoy.cn/article/2019/10/20/9.html