本文章需要對Scrapy有一定基礎才可閱讀,講解過程不會面向新手。Scrapy爬蟲架構,入門案例
github源碼
目錄
1.ImagesPipeline模組說明
2.案例:百度圖檔爬取
1)URL分析
2)程式設計
3)執行程式與效果預覽
1.ImagesPipeline模組說明
ImagesPipeline是Scrapy中pipeline(管道)元件的一個插件模組,封裝了對圖檔爬取的一些處理,是Scrapy給出的圖檔處理方案,可以避免我們重複造輪子。我們只要拿到圖檔連結丟給這個管道,其他的不用我們管,包括圖檔的命名和副檔名處理、圖檔存儲路徑(需要修改預設值)、圖檔長寬處理等。
下面是ImagesPipeline的源碼,這裡僅講兩個函數
第一個函數是get_media_requests,該函數的作用是下載圖檔:取出item中的圖檔連結(我們一般将圖檔連結存在item中),調用Request進行下載。預設的函數寫死了(固定從images_urls_field指定的欄位取連結),我們還需要進行一些處理才可以拿到圖檔連結。
def get_media_requests(self, item, info):
    """Return one download ``Request`` per image URL found in *item*.

    The URLs are read from ``item[self.images_urls_field]``; when the item
    has no such key an empty list is returned, so nothing is downloaded.

    :param item: the scraped item (accessed through ``item.get``).
    :param info: not used by this implementation.
    :return: list of ``Request`` objects, one per URL.
    """
    return [Request(x) for x in item.get(self.images_urls_field, [])]
第二個函數是file_path,定義下載圖檔的存儲路徑,我們需要處理一下,以此把圖檔存儲到我們想存儲的地方
def file_path(self, request, response=None, info=None):
    """Return the storage path for the image fetched by *request*.

    The file name is the SHA-1 hex digest of the request URL, so the same
    URL always maps to the same path.

    :param request: the download request; only ``request.url`` is read.
    :param response: not used by this implementation.
    :param info: not used by this implementation.
    :return: a path of the form ``'full/<sha1>.jpg'``.
    """
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    return 'full/%s.jpg' % (image_guid)
"""
Images Pipeline
See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
import six
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
from PIL import Image
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.exceptions import DropItem
#TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.pipelines.files import FileException, FilesPipeline
class NoimagesDrop(DropItem):
    """Product with no images exception"""
class ImageException(FileException):
    """General image error exception"""
class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic

    Every downloaded image is re-encoded as JPEG and persisted under the path
    returned by ``file_path``; for each entry of ``self.thumbs`` an additional
    JPEG is persisted under the path returned by ``thumb_path``.
    """

    MEDIA_NAME = 'image'

    # Uppercase attributes kept for backward compatibility with code that subclasses
    # ImagesPipeline. They may be overridden by settings.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    EXPIRES = 90
    THUMBS = {}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    def __init__(self, store_uri, download_func=None, settings=None):
        """Configure the pipeline from *settings*.

        :param store_uri: storage location, forwarded to ``FilesPipeline``.
        :param download_func: forwarded to ``FilesPipeline`` unchanged.
        :param settings: a ``Settings`` object, a plain ``dict`` or ``None``;
            the latter two are wrapped in ``Settings``.
        """
        super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                             download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        # ``_key_for_pipe`` is inherited and not visible here; presumably it
        # maps a generic settings key to a subclass-specific one -- TODO confirm.
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        # Subclasses may predefine these attributes; otherwise fall back to
        # the class-level defaults.
        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        )

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from a settings object.

        Copies the AWS and GCS related settings onto the ``'s3'`` and ``'gs'``
        store classes registered in ``STORE_SCHEMES`` (class-level state),
        then instantiates the pipeline with ``IMAGES_STORE`` as store URI.
        """
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
        s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
        s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
        s3store.AWS_VERIFY = settings['AWS_VERIFY']
        s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
        gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

        store_uri = settings['IMAGES_STORE']
        return cls(store_uri, settings=settings)

    def file_downloaded(self, response, request, info):
        """Delegate to ``image_downloaded``."""
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        """Persist the image and its thumbnails; return the image checksum.

        The checksum is the ``md5sum`` of the first buffer yielded by
        ``get_images`` (the full-size image); ``None`` if nothing is yielded.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                # Rewind before hashing: the buffer was just written to.
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        """Yield ``(path, image, buffer)`` for the image and each thumbnail.

        :raises ImageException: if the image is narrower than
            ``self.min_width`` or shorter than ``self.min_height``.
        """
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(BytesIO(response.body))

        width, height = orig_image.size
        if width < self.min_width or height < self.min_height:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.min_width, self.min_height))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        # Thumbnails are derived from the already converted image.
        for thumb_id, size in six.iteritems(self.thumbs):
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        """Convert *image* to RGB, optionally shrink it, and encode as JPEG.

        :param image: a PIL image.
        :param size: optional size passed to ``Image.thumbnail``; when given,
            a copy of the image is resized in place.
        :return: ``(image, buf)`` where *buf* is a ``BytesIO`` holding the
            JPEG bytes.
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # Flatten transparency onto a white background.
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode == 'P':
            image = image.convert("RGBA")
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            image = image.copy()
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the filter
            # it used to alias. Fall back to ANTIALIAS on very old releases.
            resample = getattr(Image, 'LANCZOS', None)
            if resample is None:
                resample = Image.ANTIALIAS
            image.thumbnail(size, resample)

        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        """Return one ``Request`` per URL in ``item[self.images_urls_field]``."""
        return [Request(x) for x in item.get(self.images_urls_field, [])]

    def item_completed(self, results, item, info):
        """Store the successful results on the item and return the item.

        *results* is iterated as ``(ok, value)`` pairs; only values whose
        ``ok`` flag is truthy are kept. The field is written only for dict
        items or items that declare ``self.images_result_field``.
        """
        if isinstance(item, dict) or self.images_result_field in item.fields:
            item[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        """Return ``'full/<sha1 of request.url>.jpg'``."""
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        """Return ``'thumbs/<thumb_id>/<sha1 of request.url>.jpg'``."""
        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
2.案例:百度圖檔爬取
1)URL分析
打開百度圖檔:https://image.baidu.com/
随便搜一點東西

打開檢查,Network,ctrl+R重新整理,向下翻幾頁,可以看到百度是通過ajax的xhr來傳遞異步資料的
打開一個xhr,複制Request URL
可以看到請求參數還是蠻多的
ctrl+F搜尋thumbURL,可以得出一個xhr一共提供30個圖檔的連結
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
下面分析URL的請求參數,可以很明顯的看出queryWord:星際就是我們搜尋的内容
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
接着分析,word也有一個搜尋内容,然後是pn:30,這個pn就是頁碼數,經過觀察可以發現第一頁為30,第二頁為60,依次+30。
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
再看到這一段URL
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
URL裡的word=%E6%98%9F%E9%99%85和word=星際是同一個意思。這裡為什麼是一串百分号編碼?因為URL不支援中文,中文會被轉義。是以參數可以直接填中文,例如word=迪麗熱巴,queryWord=迪麗熱巴。把https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=迪麗熱巴&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=迪麗熱巴&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=30&rn=30&gsm=1e&1586660816262= 粘貼到浏覽器上看看效果,成功
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
2)程式設計
打開一個終端,建立scrapy項目(Linux打開Terminal,Windows打開cmd)
輸入下面三條指令建立項目和爬蟲;之後使用編輯器打開項目(我用的是PyCharm),項目結構就出來了
scrapy startproject baiduimage
cd baiduimage
scrapy genspider images image.baidu.com
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
由于這個程式比較簡單,我僅講解其中幾個坑點,源碼在開篇給出的github裡有,下載後用編輯器打開就可以運作
images.py
第一個要注意的地方就是轉義(http協定不支援中文),調用urllib提供的parse.quote即可轉義
再一個就是正則匹配,拿到thumbURL圖檔連結,關于正則匹配規則我這裡不多說,我部落格的網絡爬蟲欄裡有适合新手的教程
word_origin = input("請輸入搜尋關鍵字:")
word = parse.quote(word_origin)
regex = '"thumbURL":"(.*?)"'
pattern = re.compile(regex, re.S)
links = pattern.findall(response.text)
# -*- coding: utf-8 -*-
import scrapy
import json
import re
from urllib import parse

from ..items import BaiduimageItem


class ImagesSpider(scrapy.Spider):
    """Collect thumbnail links from Baidu image-search ajax result pages.

    The search keyword is read from standard input when the class body is
    executed. Each result page is requested with a different ``pn`` value and
    every ``thumbURL`` found in the response is emitted as a
    ``BaiduimageItem``.
    """

    name = 'images'
    allowed_domains = ['image.baidu.com']

    # NOTE(review): prompting here runs as soon as the module is imported;
    # kept because ``word_origin``/``word``/``url`` are class attributes that
    # other code may read.
    word_origin = input("請輸入搜尋關鍵字:")
    # URLs cannot carry raw Chinese characters, so percent-encode the keyword.
    word = parse.quote(word_origin)
    # ``{}`` is the placeholder that ``start_requests`` fills with ``pn``.
    # NOTE(review): the listing was cut off after "&height=&"; the remainder of
    # the query string is taken from the sample request URL quoted earlier in
    # the article -- confirm against the original source.
    url = (
        "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&is=&fp=result&queryWord="
        + word
        + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word="
        + word
        + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force="
        + "&pn={}&rn=30&gsm=1e&1586660816262="
    )

    # Captures the value of every "thumbURL" key in the raw response text.
    thumb_url_pattern = re.compile('"thumbURL":"(.*?)"', re.S)

    def start_requests(self):
        """Yield one request per result page.

        The page count is adjustable: ``pn`` starts at 30 and grows by 30
        per page (30, 60, ..., 150).
        """
        for pn in range(30, 151, 30):
            url = self.url.format(pn)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item per ``thumbURL`` found in *response*.

        A fresh item is built for every link; re-yielding one shared item
        would leave every consumer holding the same object, whose ``link`` is
        overwritten on each iteration.
        """
        links = self.thumb_url_pattern.findall(response.text)
        for link in links:
            item = BaiduimageItem()
            # The search keyword is used later to name the download folder.
            item["word"] = self.word_origin
            item["link"] = link
            yield item
pipelines.py
此處class BaiduimagePipeline繼承了scrapy給我們提供的ImagesPipeline圖檔處理管道,為了讓這個模組适應我們的程式,需要重寫兩個函數。
這裡我将item中的word參數賦值給了管道物件的屬性self.word(相當于一個共享變量),目的是傳遞給file_path。也可以使用scrapy.Request(meta={})去傳遞
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
import scrapy
import hashlib
from scrapy.utils.python import to_bytes


class BaiduimagePipeline(ImagesPipeline):
    """Image pipeline that stores each picture in a folder named after the
    search keyword carried by the item."""

    # Last keyword seen; kept for backward compatibility and used by
    # ``file_path`` only when a request carries no keyword of its own.
    word = ""

    def get_media_requests(self, item, info):
        """Yield the download request for ``item['link']``.

        Overrides ``ImagesPipeline.get_media_requests``, whose default way of
        reading URLs does not fit this item layout. The keyword travels with
        the request in ``meta`` so that ``file_path`` does not have to rely
        on state shared between items.
        """
        self.word = item['word']
        yield scrapy.Request(url=item['link'], meta={'word': item['word']})

    def file_path(self, request, response=None, info=None):
        """Return ``'<keyword>/<sha1 of request.url>.jpg'``.

        Overrides ``ImagesPipeline.file_path`` (which returns
        ``'full/%s.jpg' % (image_guid)``) to choose our own target folder.
        """
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        word = request.meta.get('word', self.word)
        return word + '/%s.jpg' % (image_guid)
settings.py
在settings.py裡IMAGES_STORE是圖檔存儲的路徑
ImagesPipeline中的from_settings會調用 IMAGES_STORE,store_uri=settings['IMAGES_STORE']
BOT_NAME = 'baiduimage'
SPIDER_MODULES = ['baiduimage.spiders']
NEWSPIDER_MODULE = 'baiduimage.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
ITEM_PIPELINES = {
    'baiduimage.pipelines.BaiduimagePipeline': 300,
}
IMAGES_STORE= 'D:\\圖檔\\'
@classmethod
def from_settings(cls, settings):
    """Create the pipeline from a settings object.

    Copies the AWS and GCS related settings onto the ``'s3'`` and ``'gs'``
    store classes registered in ``cls.STORE_SCHEMES``, then instantiates the
    pipeline with ``settings['IMAGES_STORE']`` as the store URI.
    """
    s3store = cls.STORE_SCHEMES['s3']
    s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
    s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
    s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
    s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
    s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
    s3store.AWS_VERIFY = settings['AWS_VERIFY']
    s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

    gcs_store = cls.STORE_SCHEMES['gs']
    gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
    gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

    # This is where IMAGES_STORE from settings.py is consumed.
    store_uri = settings['IMAGES_STORE']
    return cls(store_uri, settings=settings)
items.py
兩個參數,一個圖檔連結,一個搜尋内容
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class BaiduimageItem(scrapy.Item):
    """Item carrying one image link and the keyword it was found under."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()  # URL of the image to download
    word = scrapy.Field()  # search keyword
3)執行程式與效果預覽
建立一個執行程式
from scrapy import cmdline
cmdline.execute('scrapy crawl images'.split())
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()
![Scrapy爬蟲架構,ImagesPipeline的基本用法,圖檔爬取思路詳解]()