代码实现
打开终端输入
cd Desktop
scrapy startproject DouyuSpider
cd DouyuSpider
scrapy genspider douyu douyu.com

然后用 PyCharm 打开桌面上生成的 DouyuSpider 文件夹
douyu.py
# -*- coding: utf-8 -*-
import scrapy
import json
from ..items import DouyuspiderItem
class DouyuSpider(scrapy.Spider):
    """Spider that walks the Douyu vertical-room API and yields image items.

    Flow: ``parse`` (start URL) -> ``parse_all_page`` (one request per
    offset) -> ``parse_one_page`` (one item per room entry).
    """

    name = 'douyu'
    allowed_domains = ['douyu.com']
    start_urls = ['http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=1']

    def parse(self, response):
        """Handle the start URL by scheduling the paginated API requests.

        Returns the iterable of requests built by ``parse_all_page``.
        """
        # Previously this re-requested ``response.url`` only to reach
        # ``parse_all_page``, which ignores the response body anyway; delegating
        # directly saves one duplicate download.
        return self.parse_all_page(response)

    def parse_one_page(self, response):
        """Parse one API page and yield a ``DouyuspiderItem`` per room.

        The body is decoded as JSON and its ``data`` entry is iterated; each
        entry's ``room_src`` value becomes the item's image URL list.
        """
        payload = json.loads(response.text)
        rooms = payload.get('data')
        if not isinstance(rooms, list):
            # NOTE(review): presumably an error payload carries a non-list
            # ``data``; skip the page instead of crashing on it.
            self.logger.warning('Unexpected payload from %s', response.url)
            return
        for room in rooms:
            room_src = room.get('room_src')
            if not room_src:
                continue
            item = DouyuspiderItem()
            # This field must be an iterable of image links, otherwise the
            # pipeline raises an error.
            item['room_src'] = [room_src]
            # The original assigned ``[downloadUrl]``, an undefined name that
            # raised NameError on the first item. Nothing in this callback
            # provides a file link, so the list is left empty.
            item['downloadUrl'] = []
            yield item

    def parse_all_page(self, response):
        """Yield one request per page offset (1, 21, ..., 481).

        ``response`` is not read; it is accepted so the method can serve as a
        Scrapy callback.
        """
        for page in range(1, 501, 20):
            url = 'http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset={}'.format(page)
            print(url)
            # NOTE(review): the API host is not covered by ``allowed_domains``;
            # ``dont_filter=True`` presumably keeps these requests from being
            # dropped as offsite/duplicate - confirm before removing it.
            yield scrapy.Request(
                url=url,
                callback=self.parse_one_page,
                dont_filter=True,
                meta={}
            )
报错:ValueError: Missing scheme in request url: h
原因:下载管道会遍历该字段中的每个元素并把它当作一个 URL;如果字段是字符串,遍历得到的是单个字符(例如 'h'),因此提示 URL 缺少 scheme。
解决方法:将 item['x'] = x 改为 item['x'] = [x] 即可
items.py
class DouyuspiderItem(scrapy.Item):
    """Item carrying the media URLs scraped for one Douyu room."""

    # List of image URLs.
    room_src = scrapy.Field()
    # List of file URLs.
    downloadUrl = scrapy.Field()
    # Result fields for the stock media pipelines: Scrapy's ImagesPipeline and
    # FilesPipeline store their download results under ``images`` / ``files``
    # by default, and only when the item declares those fields.
    images = scrapy.Field()
    files = scrapy.Field()
settings.py
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # Project pipeline, left disabled:
    # 'DouyuSpider.pipelines.DouyuspiderPipeline': 300,

    # Image download pipeline
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # File download pipeline
    'scrapy.pipelines.files.FilesPipeline': 2,
}

# --- Image downloads ---
# Directory where downloaded images are stored.
IMAGES_STORE = 'file/image'
# Name of the item field whose content should be downloaded as images.
IMAGES_URLS_FIELD = 'room_src'

# --- File downloads ---
# Directory where downloaded files are stored.
FILES_STORE = 'file/book'
# Name of the item field whose content should be downloaded as files.
FILES_URLS_FIELD = 'downloadUrl'
运行结果
运行爬虫(在项目根目录执行 scrapy crawl douyu):
将在配置文件中所配置的图片保存路径下,生成如下目录结构:
full文件夹下的图片