"""Spider demo: download meme images ("表情包") from doutula.com.

A queue-based producer/consumer design: the main block seeds ``url_queue``
with list-page URLs, parser threads scrape image links into ``img_queue``,
and downloader threads save the files.
"""

import os
from queue import Empty, Queue
from threading import Thread
from urllib import request

import requests
from lxml import etree

# (image_src_url, local_save_path) tuples: parser threads -> downloader threads.
img_queue = Queue()
# (list_page_url, page_number) tuples: main block -> parser threads.
url_queue = Queue()

# Directory containing this script; downloads are saved under BASE_DIR/bqb/<page>/.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Request headers sent with every page fetch. NOTE(review): the Cookie and
# session tokens are hard-coded from a captured browser session and have
# presumably expired — refresh them before relying on this script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
    'Cookie': '__cfduid=dd64424b455df541fc9b3923cef61a9661573534965; UM_distinctid=16e5dfe20b27e2-0b235f820c1bfd-1d3c6a5a-13c680-16e5dfe20b3ad9; _ga=GA1.2.1548844508.1573534966; _gid=GA1.2.438142511.1573534966; __gads=Test; BAIDU_SSP_lcr=https://www.baidu.com/link?url=gM9DhsPWCFEV7dGzWICxtXEbS2dlLuXNBzPIuVY8K6YxGKHYgPzKlc3LjPB3x2eS&wd=&eqid=c5dcb28c00076e6a000000065dcb67f4; CNZZDATA1256911977=1171828334-1573530403-null%7C1573612169; XSRF-TOKEN=eyJpdiI6ImtFZEJTVVdMNzNNUDFXSE1cL2R3K09nPT0iLCJ2YWx1ZSI6IlEwSWk3aTZqMFE2Y3hYUTNrNmQwNjZ4cEZBRXFGa29wSnFMOWNwNDJpNXhMdHhtblJoXC9PRTlzZGdzbTB1cVJlIiwibWFjIjoiMGVhYjVkZmRlNzNmNjVlMWU1MWRhMWMwYWE0MTRiZDllNjBmMTA5MzM5NzljMDFhMTU4Zjg1OWM0ZDVlZTk3MSJ9; doutula_session=eyJpdiI6IjcyTzVRQUhBN3BNZmozaWFUTGo0M2c9PSIsInZhbHVlIjoiNUpidGlqd0FLdVU5VEVDQms2WFlKT0JvdTFEbldxRWljTmY0Zm5wRG9qb0grcjVWdWlTRjAzTUU5aklKN3RtUSIsIm1hYyI6ImY2YWNhZmVkNzA1NGRiOWNhMDFhMjljZGIzMjNmNzE3ZDVlYmM2YWUzYjhjZjI3Y2JjYTc0YmU0OGZkMDI2ZDYifQ%3D%3D',
    'Referer': 'http://www.doutula.com/photo/list/?page=3'
}

# Hard-coded HTTP proxy used for all page fetches.
# NOTE(review): this ad-hoc proxy IP is almost certainly dead — verify/replace.
PROXY = {
    'http': '49.79.195.69:4256'
}

           
def parse_img():
    """Parser worker: drain ``url_queue`` and feed ``img_queue``.

    Pulls ``(list_page_url, page_number)`` pairs off ``url_queue``, scrapes
    every image's ``data-original`` URL and ``alt`` title from the page, and
    pushes ``(image_url, local_save_path)`` tuples onto ``img_queue``.
    Returns when ``url_queue`` is exhausted.
    """
    while True:
        try:
            # get_nowait() is atomic; the original empty()-then-get()
            # pattern could block forever when several parser threads
            # raced for the last item.
            d_url, page = url_queue.get_nowait()
        except Empty:
            break

        try:
            res = requests.get(d_url, headers=HEADERS, proxies=PROXY)
            res.raise_for_status()
        except requests.RequestException:
            # Best effort: a single failed page must not kill the worker.
            continue

        html = etree.HTML(res.text)

        srcs = html.xpath('//div[@class="random_picture"]//img/@data-original')
        alts = html.xpath('//div[@class="random_picture"]//img/@alt')

        img_path = os.path.join(BASE_DIR, 'bqb/%s' % page)
        # makedirs(exist_ok=True) also creates the parent 'bqb' directory
        # and is safe when several workers race to create the same path
        # (os.mkdir raised if 'bqb' was missing or the dir already existed).
        os.makedirs(img_path, exist_ok=True)

        for src, alt in zip(srcs, alts):
            ext = os.path.splitext(src)[1]
            # A path separator in the alt text would escape img_path.
            img_name = alt.replace(os.sep, '_') + ext
            img_save_path = os.path.join(img_path, img_name)
            img_queue.put((src, img_save_path))

           
def down_img():
    """Downloader worker: save every ``(url, path)`` queued on ``img_queue``.

    Exits once ``img_queue`` stays empty and ``url_queue`` is drained
    (i.e. no parser can still produce work).
    """
    while True:
        try:
            # A short timeout keeps this worker alive while parser threads
            # are still producing. The original
            # ``img_queue.empty() and url_queue.empty()`` check could exit
            # early (url_queue drains as soon as pages are *taken*, not
            # parsed), and its unconditional get() could then block forever.
            src, img_path = img_queue.get(timeout=1)
        except Empty:
            if url_queue.empty():
                break
            continue
        try:
            request.urlretrieve(src, img_path)
        except OSError:
            # Skip individual images that fail; don't kill the worker.
            pass
           
if __name__ == '__main__':
    # Seed the work queue with list pages 1..49.
    for page_num in range(1, 50):
        url = 'http://www.doutula.com/photo/list/?page=%s' % page_num
        url_queue.put((url, page_num))

    # 5 parser threads + 5 downloader threads.
    threads = [Thread(target=parse_img) for _ in range(5)]
    threads += [Thread(target=down_img) for _ in range(5)]

    for t in threads:
        t.start()

    # Wait for all workers so the script ends only when every queued
    # download has been attempted (the original never joined its threads).
    for t in threads:
        t.join()

           

# 繼續閱讀 ("continue reading") — leftover site-navigation text from the scraped article, not code.