天天看點

爬蟲基礎 利用xpath,requests,lxml,多線程爬取鬥圖啦表情包

完整代碼如下:

import os
import re
import threading
import urllib.request
from queue import Queue
from queue import Empty

import requests
from lxml import etree
class ProducerThread(threading.Thread):
    """Producer worker: takes listing-page URLs from ``page_queue``,
    parses each page with lxml/XPath and puts ``(img_url, filename)``
    tuples onto ``img_queue`` for the consumer threads to download.
    """

    # Browser-like headers so the site serves the normal desktop page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared page-URL queue and image-task queue."""
        super(ProducerThread, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        # Drain the page queue; exit once no pages are left.
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            try:
                self.parse_img(url)
            except requests.RequestException as e:
                # One failed page must not kill the whole producer thread.
                print('page request failed: %s (%s)' % (url, e))

    def parse_img(self, url):
        """Fetch one listing page and enqueue every image on it.

        Raises ``requests.RequestException`` on network failure (handled
        by ``run``).
        """
        # timeout prevents a stalled connection from hanging the thread forever
        response = requests.get(url, headers=self.headers, timeout=10)
        text = response.content.decode('utf-8')

        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # Some <img> tags are lazy-load placeholders with no
                # data-original attribute; skip them instead of crashing.
                continue
            suffix = os.path.splitext(img_url)[1]  # file extension, e.g. '.jpg'
            alt = img.get('alt') or ''
            # Strip characters that are not allowed in Windows file names.
            alt = re.sub(r'[?\?\.。,!!\/< >]', '', alt)
            filename = alt + suffix
            # Block when the queue is full instead of silently dropping
            # images (the original ``if not full(): put`` lost work).
            self.img_queue.put((img_url, filename))


class CustomerThread(threading.Thread):
    """Consumer worker: takes ``(img_url, filename)`` tuples from
    ``img_queue`` and downloads each image into the ``images/`` folder.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared page-URL queue and image-task queue."""
        super(CustomerThread, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        # Make sure the target directory exists before the first write;
        # urlretrieve does not create intermediate directories.
        os.makedirs('images', exist_ok=True)
        while True:
            try:
                # A timed get fixes two races in the original
                # ``empty()``-then-``get()`` pattern: blocking forever when a
                # sibling consumer stole the last item after the check, and
                # exiting while producers were still filling the queue.
                img_url, filename = self.img_queue.get(timeout=5)
            except Empty:
                if self.page_queue.empty():
                    # No pending pages and no images for 5s: work is done.
                    break
                continue
            try:
                urllib.request.urlretrieve(img_url, 'images/' + filename)
                print(filename + "下載下傳完成")
            except OSError as e:
                # Skip a broken URL/file instead of killing the thread.
                print('download failed: %s (%s)' % (filename, e))




def main():
    """Seed the page queue with 50 listing-page URLs, then start five
    producer and five consumer threads and wait for them to finish.
    """
    # Create the output directory up front; the original script crashed
    # on the first download when ./images did not already exist.
    os.makedirs('images', exist_ok=True)

    page_queue = Queue(50)    # one slot per listing page to crawl
    img_queue = Queue(1000)   # buffer of pending (url, filename) tasks
    for page in range(1, 51):
        page_queue.put("http://www.doutula.com/photo/list/?page=%d" % page)

    workers = []
    for _ in range(5):
        t = ProducerThread(page_queue, img_queue)
        t.start()
        workers.append(t)

    for _ in range(5):
        t = CustomerThread(page_queue, img_queue)
        t.start()
        workers.append(t)

    # Wait for every worker so callers know the crawl is complete.
    for t in workers:
        t.join()

# Script entry point: announce the start and run the multi-threaded crawl.
if __name__ == '__main__':
    print('開始下載下傳')
    main()


           

萬水千山總是情,點個關注行不行。

你的一個小小舉動,将是我分享更多幹貨的動力。