import os
from queue import Empty
from queue import Queue
from threading import Thread
from urllib import request

import requests
from lxml import etree
# Work queue of (image URL, local save path) tuples: filled by parse_img(),
# drained by down_img().
img_queue = Queue()
# Work queue of (listing-page URL, page number) tuples: filled by the
# __main__ section, drained by parse_img().
url_queue = Queue()
# Directory containing this script; parse_img() builds its 'bqb/<page>'
# download folders underneath it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# HTTP headers sent with every listing-page request made by parse_img().
HEADERS = {
# Desktop Chrome user-agent string.
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
# NOTE(review): hard-coded session cookie (includes XSRF-TOKEN and
# doutula_session values) committed to source; presumably captured from a
# browser session and possibly expired by now -- confirm, and consider
# loading it from the environment instead of keeping it in the file.
'Cookie': '__cfduid=dd64424b455df541fc9b3923cef61a9661573534965; UM_distinctid=16e5dfe20b27e2-0b235f820c1bfd-1d3c6a5a-13c680-16e5dfe20b3ad9; _ga=GA1.2.1548844508.1573534966; _gid=GA1.2.438142511.1573534966; __gads=Test; BAIDU_SSP_lcr=https://www.baidu.com/link?url=gM9DhsPWCFEV7dGzWICxtXEbS2dlLuXNBzPIuVY8K6YxGKHYgPzKlc3LjPB3x2eS&wd=&eqid=c5dcb28c00076e6a000000065dcb67f4; CNZZDATA1256911977=1171828334-1573530403-null%7C1573612169; XSRF-TOKEN=eyJpdiI6ImtFZEJTVVdMNzNNUDFXSE1cL2R3K09nPT0iLCJ2YWx1ZSI6IlEwSWk3aTZqMFE2Y3hYUTNrNmQwNjZ4cEZBRXFGa29wSnFMOWNwNDJpNXhMdHhtblJoXC9PRTlzZGdzbTB1cVJlIiwibWFjIjoiMGVhYjVkZmRlNzNmNjVlMWU1MWRhMWMwYWE0MTRiZDllNjBmMTA5MzM5NzljMDFhMTU4Zjg1OWM0ZDVlZTk3MSJ9; doutula_session=eyJpdiI6IjcyTzVRQUhBN3BNZmozaWFUTGo0M2c9PSIsInZhbHVlIjoiNUpidGlqd0FLdVU5VEVDQms2WFlKT0JvdTFEbldxRWljTmY0Zm5wRG9qb0grcjVWdWlTRjAzTUU5aklKN3RtUSIsIm1hYyI6ImY2YWNhZmVkNzA1NGRiOWNhMDFhMjljZGIzMjNmNzE3ZDVlYmM2YWUzYjhjZjI3Y2JjYTc0YmU0OGZkMDI2ZDYifQ%3D%3D',
# Fixed Referer: every request names listing page 3 as its origin,
# regardless of which page is actually being fetched.
'Referer': 'http://www.doutula.com/photo/list/?page=3'
}
# Proxy mapping passed as ``proxies=`` to requests.get() in parse_img().
# Only an 'http' entry is defined. down_img() downloads with
# urllib's request.urlretrieve and is not handed this mapping.
# NOTE(review): hard-coded proxy address; presumably a short-lived proxy --
# confirm it is still reachable before running.
PROXY = {
'http': '49.79.195.69:4256'
}
def parse_img():
    """Scrape listing pages from ``url_queue`` and enqueue their images.

    Intended as a worker-thread target. Each ``(url, page)`` item taken from
    ``url_queue`` is fetched with ``HEADERS`` and ``PROXY``; every ``<img>``
    carrying a ``data-original`` attribute inside a ``random_picture`` div
    produces one ``(src, save_path)`` tuple on ``img_queue``. Save paths live
    under ``BASE_DIR/bqb/<page>/`` and are named ``<alt><ext>``, where
    ``ext`` is the extension of the image URL; when ``alt`` is missing or
    empty the URL's basename is used instead.

    A page that cannot be fetched (connection error, timeout, HTTP error
    status) is reported on stdout and skipped, so one bad page does not take
    the worker down. The function returns once ``url_queue`` is empty.
    """
    while True:
        try:
            # Take-or-quit in a single atomic step. A separate empty() check
            # followed by a blocking get() lets another worker grab the last
            # item in between, leaving this thread blocked forever.
            d_url, page = url_queue.get_nowait()
        except Empty:
            break

        try:
            # The timeout keeps a stalled proxy or server from hanging the
            # worker indefinitely.
            res = requests.get(d_url, headers=HEADERS, proxies=PROXY, timeout=30)
            res.raise_for_status()
        except requests.RequestException as err:
            print('skip page %s (%s): %s' % (page, d_url, err))
            continue

        html = etree.HTML(res.text)

        # makedirs also creates the parent 'bqb' folder on a first run, and
        # exist_ok tolerates several workers creating it at the same time.
        img_path = os.path.join(BASE_DIR, 'bqb', str(page))
        os.makedirs(img_path, exist_ok=True)

        # Read both attributes from the same element so a tag lacking one of
        # them cannot shift names against URLs.
        for img in html.xpath('//div[@class="random_picture"]//img[@data-original]'):
            src = img.get('data-original')
            # Path separators in the caption would point the save path into
            # a directory that does not exist.
            alt = (img.get('alt') or '').replace('/', '_').replace('\\', '_')
            if alt:
                img_name = alt + os.path.splitext(src)[1]
            else:
                img_name = os.path.basename(src)
            img_queue.put((src, os.path.join(img_path, img_name)))
def down_img(idle_timeout=30):
    """Download queued images to disk until the pipeline has run dry.

    Intended as a worker-thread target. Repeatedly takes a
    ``(src, img_path)`` tuple from ``img_queue`` and saves ``src`` to
    ``img_path`` with ``urllib.request.urlretrieve``.

    Args:
        idle_timeout: Seconds to wait for a new image before checking
            whether the worker should stop. The worker stops only when no
            image arrived within this window *and* ``url_queue`` is empty.

    A failed download is reported on stdout and skipped so that a single bad
    URL or unwritable path does not terminate the worker.

    Note: the stop condition is a heuristic. If a page fetch started before
    ``url_queue`` drained takes longer than ``idle_timeout`` to produce
    images, those images can still be missed; raise ``idle_timeout`` if that
    matters.
    """
    while True:
        try:
            # A timed get() replaces the empty()-then-get() pair: that pair
            # could block forever when another worker took the last item,
            # and it quit immediately while pages were still being parsed.
            src, img_path = img_queue.get(timeout=idle_timeout)
        except Empty:
            if url_queue.empty():
                break
            # Pages are still waiting to be parsed; keep waiting for images.
            continue

        try:
            request.urlretrieve(src, img_path)
        except (OSError, ValueError) as err:
            # OSError covers urllib's URLError/HTTPError and local file
            # errors; ValueError is raised for URLs urllib cannot interpret.
            print('download failed %s -> %s: %s' % (src, img_path, err))
if __name__ == '__main__':
    # Seed the work queue with one (listing URL, page number) pair for each
    # of pages 1 through 49.
    for page_no in range(1, 50):
        page_url = 'http://www.doutula.com/photo/list/?page=%s' % page_no
        url_queue.put((page_url, page_no))

    # Five page parsers followed by five downloaders, created and started in
    # that order.
    workers = [Thread(target=parse_img) for _ in range(5)]
    workers.extend(Thread(target=down_img) for _ in range(5))
    for worker in workers:
        worker.start()