天天看點

scrapy爬取百度圖檔

百度圖檔基本沒什麼反爬蟲措施,我們爬取圖檔時直接滑鼠右鍵 → 檢查 → Network → XHR,往下拖動得到頁面,可以看到 Headers 下的 General,檢視實際的請求 Request URL,提取其中的關鍵資訊即可

話不多說,直接上代碼;

spider檔案:

class BaidupictureSpider(scrapy.Spider):
    """Scrape Baidu image search results for a list of keywords.

    For each keyword in ``search_lists`` it requests the corresponding number
    of result pages from ``search_pages`` and yields one item per response
    containing the extracted thumbnail URLs.
    """

    name = 'baidupicture'

    # allowed_domains entries must be bare domains, not full URLs;
    # 'http://image.baidu.com' would never match the offsite check.
    allowed_domains = ['image.baidu.com']

    # Search keyword list.
    # (The original used fullwidth quotes '‘…’', a Python syntax error.)
    search_lists = ['臭豆腐', '雞肉', '美女']

    # Number of result pages to fetch per keyword (parallel to search_lists).
    search_pages = [20, 10, 100]

    def start_requests(self):
        """Yield one request per result page for every configured keyword."""
        for idx, keyword in enumerate(self.search_lists):
            query_word = urlencode({'queryWord': keyword})
            word = urlencode({'word': keyword})
            # Distinct inner loop variable: the original reused `i` for both
            # loops, corrupting the outer index used in the request meta.
            for page in range(self.search_pages[idx]):
                # The original URL literal had unbalanced quotes and dropped
                # the '&pn=' (page offset) parameter name; each page holds
                # 30 results, so the offset is page * 30.
                url = (
                    "https://image.baidu.com/search/acjson?tn=resultjson_com"
                    "&ipn=rj&ct=201326592&is=&fp=result&" + query_word +
                    "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&" +
                    word +
                    "&s=&se=&tab=&width=&height=&pn=" + str(page * 30) +
                    "&rn=30"
                )
                # dont_filter=True: pagination URLs differ only in `pn`;
                # keep them all instead of letting the dupe filter drop any.
                yield scrapy.Request(
                    url,
                    callback=self.parse,
                    meta={'search': keyword},
                    dont_filter=True,
                )

    def parse(self, response):
        """Extract thumbnail image URLs from the JSON-like response body."""
        item = FacepictureItem()
        # Forward the keyword so the pipeline can use it as a directory name.
        item['recode'] = response.meta['search']
        data = response.text
        # Escape the dot before "jpg": the original unescaped '.' matched
        # any character (e.g. "Xjpg").
        item['imgs_url'] = re.findall(r'"thumbURL":"(https://.*?\.jpg)"', data)
        yield item

settings:

設定 

ROBOTSTXT_OBEY = False
      

pipeline:

from hashlib import md5

from urllib.request import urlretrieve

import os

class FacepicturePipeline(object):
    """Download every image URL carried by an item into a per-keyword directory."""

    def process_item(self, item, spider):
        """Save each URL in ``item['imgs_url']`` under the ``item['recode']`` directory.

        Files are named by the MD5 hex digest of their URL so repeated URLs
        are naturally deduplicated. Downloads are best-effort: a failing URL
        is skipped, never aborting the whole item.

        Returns the item unchanged so later pipelines can keep processing it.
        """
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...): os.mkdir(...)` pair.
        os.makedirs(item['recode'], exist_ok=True)
        for url in item['imgs_url']:
            print('正在寫的url是:', url)
            img_path = '{0}/{1}.{2}'.format(
                item['recode'], md5(url.encode("utf-8")).hexdigest(), 'jpg')
            try:
                if not os.path.exists(img_path):
                    urlretrieve(url, filename=img_path)
            except Exception:
                # Best-effort: the original used a bare `except:`, which also
                # swallowed KeyboardInterrupt/SystemExit. Exception keeps the
                # skip-on-failure behavior without trapping interpreter exits.
                continue
        return item

完畢