天天看点

爬虫基础 利用xpath,requests,lxml,多线程爬取斗图啦表情包

完整代码如下:

import os
import re
import threading
import urllib.request
from queue import Empty, Queue

import requests
from lxml import etree
class ProducerThread(threading.Thread):
    """Producer: pulls page URLs from ``page_queue``, scrapes each listing
    page, and pushes ``(img_url, filename)`` tuples onto ``img_queue``."""

    # Browser-like UA so the site does not reject the request outright.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Keep references to the shared page-URL queue and image queue."""
        super(ProducerThread, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Consume page URLs until the page queue is exhausted."""
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            # Extract every image URL on this page into img_queue.
            self.parse_img(url)

    def parse_img(self, url):
        """Fetch one listing page and enqueue each image's URL and a
        filesystem-safe filename.

        :param url: listing-page URL to scrape.
        """
        response = requests.get(url, headers=self.headers)
        text = response.content.decode('utf-8')

        html = etree.HTML(text)
        # Skip the animated placeholder <img class="gif"> elements.
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            # BUGFIX: attributes may be missing; splitext(None)/re.sub(None)
            # would raise, so skip entries without a real image URL.
            if not img_url:
                continue
            suffix = os.path.splitext(img_url)[1]  # image file extension
            alt = img.get('alt') or ''
            # Strip characters that are illegal/awkward in Windows filenames.
            alt = re.sub(r'[?\?\.。,!!\/< >]', '', alt)
            filename = alt + suffix
            # BUGFIX: the original checked full() and then put() — a racy
            # check that silently dropped images when the queue filled.
            # A blocking put() guarantees nothing is lost.
            self.img_queue.put((img_url, filename))


class CustomerThread(threading.Thread):
    """Consumer: takes ``(img_url, filename)`` tuples off ``img_queue``
    and downloads each image into the ``images/`` directory."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Keep references to the shared page-URL queue and image queue."""
        super(CustomerThread, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues are drained."""
        # BUGFIX: urlretrieve fails if the target directory does not exist.
        os.makedirs('images', exist_ok=True)
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            # BUGFIX: the original blocking get() could hang forever when a
            # producer was mid-page (both queues momentarily empty-looking
            # is impossible to rule out with two separate checks). A timeout
            # lets us loop back and re-evaluate the exit condition.
            try:
                img_url, filename = self.img_queue.get(timeout=5)
            except Empty:
                continue
            # BUGFIX: one bad URL used to kill the whole consumer thread;
            # report the failure and keep going (best-effort download).
            try:
                urllib.request.urlretrieve(img_url, os.path.join('images', filename))
            except OSError as e:
                print("%s failed: %s" % (filename, e))
                continue
            print(filename+"下载完成")




def main():
    """Build the work queues, start 5 producer and 5 consumer threads,
    and wait for the whole crawl (50 listing pages) to finish."""
    page_queue = Queue(50)    # one entry per listing page to crawl
    img_queue = Queue(1000)   # buffered (img_url, filename) work items
    for page in range(1, 51):
        url = "http://www.doutula.com/photo/list/?page=%d" % page
        page_queue.put(url)

    # BUGFIX: create the output directory up front so downloads cannot
    # fail on a missing 'images/' folder.
    os.makedirs('images', exist_ok=True)

    threads = []
    for _ in range(5):
        producer = ProducerThread(page_queue, img_queue)
        producer.start()
        threads.append(producer)

    for _ in range(5):
        consumer = CustomerThread(page_queue, img_queue)
        consumer.start()
        threads.append(consumer)

    # BUGFIX: the original never joined its threads, so main() returned
    # immediately and gave callers no way to know when the crawl was done.
    for t in threads:
        t.join()

# Script entry point: announce the start of the crawl and launch the
# producer/consumer pipeline. ('开始下载' = "starting download".)
if __name__ == '__main__':
    print('开始下载')
    main()


           

万水千山总是情,点个关注行不行。

你的一个小小举动,将是我分享更多干货的动力。