完整代碼如下:
import os
import re
import threading
import urllib.request
from queue import Empty, Queue

import requests
from lxml import etree
class ProducerThread(threading.Thread):
    """Worker thread that turns listing-page URLs into download jobs.

    Each worker repeatedly takes a page URL from ``page_queue``, fetches and
    parses that page, and puts one ``(img_url, filename)`` tuple on
    ``img_queue`` for every matching ``<img>`` element.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    # Seconds to wait for a listing page; without a timeout a stalled
    # connection would block the worker forever.
    request_timeout = 10
    # Characters removed from the alt text before it is used as a file name.
    # Compiled once instead of once per image.
    _unsafe_chars = re.compile(r'[?\?\.。,!!\/< >]')

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the queue of page URLs to read and the job queue to fill."""
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Process page URLs until ``page_queue`` is drained."""
        while True:
            # get_nowait() makes "check and take" a single atomic step.
            # Checking empty() and then calling a blocking get() lets another
            # worker steal the last URL in between and hang this thread.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_img(url)

    def parse_img(self, url):
        """Fetch ``url`` and enqueue ``(img_url, filename)`` for each image.

        Images are the ``<img>`` elements under the
        ``page-content text-center`` div whose ``class`` is not ``gif``.
        Elements without a ``data-original`` attribute are skipped.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.request_timeout
        )
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        if html is None:
            # Nothing parseable came back (e.g. an empty body).
            return
        imgs = html.xpath(
            "//div[@class='page-content text-center']//img[@class!='gif']"
        )
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                continue
            filename = self._build_filename(img_url, img.get('alt'))
            # Blocking put: wait for a consumer to free a slot rather than
            # silently dropping the image when the queue is full.
            self.img_queue.put((img_url, filename))

    @classmethod
    def _build_filename(cls, img_url, alt):
        """Return the sanitized alt text plus the URL's file extension.

        Falls back to the URL's base name when ``alt`` is missing or empty
        after sanitizing, so unrelated images do not all map to the same
        extension-only file name.
        """
        stem_from_url, suffix = os.path.splitext(img_url)
        stem = cls._unsafe_chars.sub('', alt or '')
        if not stem:
            stem = os.path.basename(stem_from_url)
        return stem + suffix
class CustomerThread(threading.Thread):
    """Worker thread that downloads queued images into ``images/``.

    Each worker repeatedly takes an ``(img_url, filename)`` tuple from
    ``img_queue`` and saves the image as ``images/<filename>``.
    """

    # Seconds to wait for a new job before checking whether work is finished.
    # The wait gives producers that already took the last page URL time to
    # enqueue its images. This is a heuristic: a producer slower than this
    # can still lose the race.
    idle_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the page queue (used to detect completion) and job queue."""
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until both queues stay empty."""
        # urlretrieve does not create missing directories.
        os.makedirs('images', exist_ok=True)
        while True:
            # A timed get() replaces the empty()-then-get() sequence, which
            # could block forever when another worker took the last job.
            try:
                img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            try:
                urllib.request.urlretrieve(img_url, 'images/' + filename)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError covers
                # malformed URLs. One bad image must not kill the worker.
                print(filename + "下載失敗: " + str(exc))
                continue
            print(filename + "下載下傳完成")
def main(page_count=50, img_queue_size=1000, producer_count=5, consumer_count=5):
    """Crawl the listing pages and download their images with worker threads.

    Args:
        page_count: Number of listing pages to crawl, starting at page 1.
        img_queue_size: Maximum number of pending download jobs.
        producer_count: Number of ``ProducerThread`` workers parsing pages.
        consumer_count: Number of ``CustomerThread`` workers downloading.

    The defaults reproduce the original hard-coded configuration. The call
    returns once every worker thread has finished.
    """
    # Size the page queue from page_count so the two cannot drift apart.
    page_queue = Queue(page_count)
    img_queue = Queue(img_queue_size)
    for page in range(1, page_count + 1):
        page_queue.put("http://www.doutula.com/photo/list/?page=%d" % page)

    workers = [
        ProducerThread(page_queue, img_queue) for _ in range(producer_count)
    ]
    workers += [
        CustomerThread(page_queue, img_queue) for _ in range(consumer_count)
    ]
    for worker in workers:
        worker.start()
    # Wait for completion so callers know the downloads are done on return.
    for worker in workers:
        worker.join()
# Script entry point: announce the start, then run the crawler.
if __name__ == "__main__":
    print('開始下載下傳')
    main()
萬水千山總是情,點個關注行不行。
你的一個小小舉動,將是我分享更多乾貨的動力。