settings.py(需要在settings中配置如下内容)
ITEM_PIPELINES = {
    # Keep Scrapy's built-in ImagesPipeline switched off: a value of None
    # disables a pipeline. The path matches the `scrapy.pipelines.images`
    # module that the custom pipeline imports from.
    'scrapy.pipelines.images.ImagesPipeline': None,
    # Project-specific image pipeline used in place of the built-in one.
    # NOTE: "ImagesPipiline" is the class name as actually declared in the
    # project's pipelines module, so the spelling must stay in sync with it.
    'chinazSpider.pipelines.ImagesPipiline': 1,
}
# Directory in which downloaded images are stored.
IMAGES_STORE = 'images'
# Item field that ImagesPipeline reads the image URL(s) from when downloading.
IMAGES_URLS_FIELD = 'img_url'
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
class ImagesPipiline(ImagesPipeline):
    """Image pipeline that files each downloaded image under its category.

    Reads the image URL from ``item['img_url']`` and the category label from
    ``item['sort_name']``, and stores the image at ``<category>/<file name>``.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The item is attached to ``request.meta`` so that ``file_path`` can
        still reach it when it is not handed the item directly.
        """
        image_url = item['img_url']
        yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for the image of ``request``.

        ``item`` is keyword-only and optional, so the method works both when
        the caller supplies the item and when it only passes the request;
        in the latter case the item stored in ``request.meta`` by
        ``get_media_requests`` is used.
        """
        if item is None:
            item = request.meta['item']
        # Category name: ``sort_name`` without its last two characters.
        # NOTE(review): presumably strips a fixed two-character suffix from
        # the scraped label; confirm against the spider that fills the item.
        category = item['sort_name'][:-2]
        # File name: the last path segment of the image URL.
        img_name = item['img_url'].split('/')[-1]
        # Join category and file name into the relative path of the image.
        return category + '/' + img_name

    def item_completed(self, results, item, info):
        """Print the download results for the item and pass the item on."""
        print(results)
        return item