天天看點

scrapy ------ 爬取豆瓣電影TOP250

轉載自 —> 原文

#items.py
# -*- coding: utf-8 -*-
import scrapy

class DoubanMovieItem(scrapy.Item):
    """Item holding the fields scraped for one movie entry."""

    # Rank of the movie in the list.
    ranking = scrapy.Field()
    # Movie title.
    movie_name = scrapy.Field()
    # Rating score.
    score = scrapy.Field()
    # Number of people who rated/commented.
    score_num = scrapy.Field()
           
#douban_spider.py
#-*- coding:utf-8 -*-

from scrapy.spider import Spider
from scrapyspider.items import DoubanMovieItem
import scrapy


class DoubanMovieTop250spider(Spider):
    """Spider that walks the Douban movie Top 250 list page by page.

    For every ``<li>`` under ``<ol class="grid_view">`` it yields one
    ``DoubanMovieItem`` with ``ranking``, ``movie_name``, ``score`` and
    ``score_num`` filled from the first matching text node, then follows
    the "next" link while one is present.
    """

    name = 'douban_movie_top250'
    # Sent with every request issued by this spider.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Yield the request for the first page of the list."""
        url = 'https://movie.douban.com/top250'
        yield scrapy.Request(url, headers=self.headers)

    def parse(self, response):
        """Extract one item per movie entry, then request the next page.

        Yields:
            ``DoubanMovieItem`` objects for each entry on the page, followed
            by a ``scrapy.Request`` for the next page when a next link exists.
        """
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            # Build a fresh item per entry; reusing a single instance would
            # make every yielded item refer to the same mutable object.
            item = DoubanMovieItem()
            # extract_first() returns the first match, or None if the node is
            # missing, instead of raising IndexError and aborting the page.
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract_first()
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract_first()
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()').extract_first()
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span[4]/text()').extract_first()
            yield item

        # Link to the next page; absent on the last page.
        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href:
            # NOTE(review): the href is presumably a relative query string
            # such as "?start=25&filter=" -- urljoin resolves it against the
            # current page URL and also copes with absolute links.
            yield scrapy.Request(response.urljoin(next_href), headers=self.headers)
           

繼續閱讀