完整代码如下:
import os
import re
import threading
import urllib.request
from queue import Empty, Queue

import requests
from lxml import etree
class ProducerThread(threading.Thread):
    """Worker that turns listing-page URLs into (image URL, filename) jobs.

    Each worker pulls page URLs from ``page_queue``, downloads and parses the
    page, and pushes one ``(img_url, filename)`` tuple per image found onto
    ``img_queue``.
    """

    # Browser-like User-Agent sent with every page request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }

    # Seconds to wait for a listing page before giving up on it; without a
    # timeout a stalled connection would block the worker indefinitely.
    request_timeout = 10

    # Characters stripped from the alt text before it is used as a file name:
    # the punctuation the original pattern removed, plus the remaining
    # characters Windows forbids in file names (\ : * " |).
    _unsafe_chars = re.compile(r'[?\?\.。,!!\/< >\\:*"|]')

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the queue of page URLs to read and the queue of images to fill.

        Extra positional and keyword arguments are forwarded to
        ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Process page URLs until ``page_queue`` is exhausted."""
        while True:
            # get_nowait() makes "is there work?" and "take it" one atomic
            # step. A separate empty() check followed by a blocking get()
            # lets a worker that loses the race for the last URL hang forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_img(url)
            except (requests.RequestException, UnicodeDecodeError) as exc:
                # One bad page must not take the whole worker down.
                print("页面抓取失败: %s (%s)" % (url, exc))

    def parse_img(self, url):
        """Fetch the page at ``url`` and enqueue every image it lists.

        Raises:
            requests.RequestException: the page could not be fetched or the
                server answered with an error status.
            UnicodeDecodeError: the response body is not valid UTF-8.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.request_timeout)
        response.raise_for_status()
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        if html is None:
            # lxml yields None for an empty document; nothing to extract.
            return
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # No data-original attribute on this element, so nothing to download.
                continue
            filename = self._build_filename(img_url, img.get('alt'))
            # Blocking put: when the image queue is full, wait for the
            # downloaders to catch up instead of silently dropping the image.
            self.img_queue.put((img_url, filename))

    @classmethod
    def _build_filename(cls, img_url, alt):
        """Return a file name built from the alt text plus the URL's extension.

        Falls back to the URL's base name when the alt text is missing or
        consists only of stripped characters, so such images do not all
        collapse onto the same bare-extension file name.
        """
        suffix = os.path.splitext(img_url)[1]
        stem = cls._unsafe_chars.sub('', alt or '')
        if not stem:
            stem = os.path.splitext(os.path.basename(img_url))[0]
        return stem + suffix
class CustomerThread(threading.Thread):
    """Worker that downloads queued images into the ``images/`` directory.

    Each worker takes ``(img_url, filename)`` tuples from ``img_queue`` and
    saves the image as ``images/<filename>``.
    """

    # Seconds to wait for a new image before checking whether the page queue
    # has drained. This should exceed the time a producer needs to fetch and
    # parse one page; otherwise the worker can exit while the last pages are
    # still being processed.
    idle_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the page queue (used to detect completion) and the image queue.

        Extra positional and keyword arguments are forwarded to
        ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until no work arrives and no pages remain."""
        while True:
            # A timed get() replaces the empty()-then-get() pair. That pair
            # could block forever when another worker took the last item,
            # and could quit too early when the image queue was only
            # momentarily empty.
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            try:
                # Create the target directory on demand; exist_ok keeps this
                # safe when several workers reach it at the same time.
                os.makedirs('images', exist_ok=True)
                urllib.request.urlretrieve(img_url, 'images/' + filename)
            except (OSError, ValueError) as exc:
                # OSError covers URLError/HTTPError and file-system failures;
                # ValueError is raised for URLs without a usable scheme.
                # A single failed image must not kill the worker.
                print("%s下载失败: %s" % (filename, exc))
                continue
            print(filename+"下载完成")
def main():
    """Queue the 50 listing pages and start five producers and five consumers."""
    pages = Queue(50)      # one slot per listing page to crawl
    images = Queue(1000)   # holds up to 1000 pending image jobs

    for page_no in range(1, 51):
        pages.put("http://www.doutula.com/photo/list/?page=%d" % page_no)

    # Producers come first in the list so they are started before consumers.
    workers = [ProducerThread(pages, images) for _ in range(5)]
    workers += [CustomerThread(pages, images) for _ in range(5)]
    for worker in workers:
        worker.start()
# Script entry point: announce the start, then launch the crawler.
if "__main__" == __name__:
    print('开始下载')
    main()
万水千山总是情,点个关注行不行。
你的一个小小举动,将是我分享更多干货的动力。