網站:http://www.win4000.com(圖檔的品質還不錯)
基本環境配置:
- Python版本:3.6
相關模組:
import re
import requests
import time
from multiprocessing import Pool
from lxml import etree
import os
import uuid
####完整代碼:
import re
import requests
import time
from multiprocessing import Pool
from lxml import etree
import os
import uuid
'''
遇到不懂的問題?Python學習交流群:821460695滿足你的需求,資料都已經上傳群檔案,可以自行下載下傳!
'''
# Prefix of the topic's index pages; the page number and ".html" are appended in __main__.
rooturl = 'http://www.win4000.com/zt/huyan_'
# Example of a complete index-page URL: http://www.win4000.com/zt/fengjing.html
# Browser-like request headers so the site serves normal (non-bot) pages.
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
" AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/70.0.3538.110 Safari/537.36"
}
# NOTE(review): appears unused — oringin() binds its own local `count`; confirm before removing.
count = 0
# Gallery-set URL collector follows.
def graph_set(rooturl):
    """Fetch one index page and return its gallery titles and gallery URLs.

    rooturl: URL of an index page that lists wallpaper galleries.
    Returns (titles, urls) — two parallel lists; title[i] belongs to urls[i].
    """
    results = requests.get(rooturl, headers=header)
    text = results.text
    # Gallery links on this site carry exactly this generated attribute chain.
    urls = re.findall('.*href="(.*)" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" alt="', text)
    selector = etree.HTML(text)
    nodes = selector.xpath('//div[contains(@class,"tab_tj")]//li//p')
    # Only the first 24 <p> titles correspond to the galleries matched above
    # (the rest of the tab holds unrelated recommendations).
    titles = [node.text for node in nodes[:24]]
    return titles, urls
# Resolve one gallery page to its "view original image" link and download it.
def parser(tup):
    """Download one gallery's original images.

    tup: (gallery_page_url, gallery_title) pair, as produced by graph_set().
    Fetches the gallery page, extracts the "view original" link and hands it
    to oringin() together with the title.
    """
    url, name = tup
    response = requests.get(url, headers=header)
    text = response.text
    # The "檢視原圖" (view original) anchor carries this attribute chain.
    originset = re.findall('href="(.*)" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" class=.*檢視原圖', text)
    time.sleep(1)  # be polite to the server between page fetches
    if originset:
        oringin(originset.pop(), name)
    else:
        # Guard: the original called .pop() unconditionally and crashed with
        # IndexError whenever the page layout yielded no match.
        print(name + ': no original-image link found, skipped')
# Download every original-resolution image of one gallery.
def oringin(page, name):
    """Save all original images linked from *page* into the target folder.

    page: URL of the gallery page that links the original-resolution images.
    name: gallery title, used only for progress output.
    Each image is written under a fresh uuid1-based filename, so no counter
    state is needed and concurrent pool workers cannot collide.
    """
    print(name + '正在爬取')
    # NOTE(review): hard-coded Windows path — consider making this configurable.
    save_dir = 'G:\python 資源\python project\美桌網桌面爬取\護眼圖檔\\'
    # Create the folder up front instead of crashing on the first open().
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(page, headers=header)
    links = re.findall('li.*href="(.*)" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" .*><img.*/li', response.text)
    for raw in links:
        # Each match still drags trailing attributes along; keep the URL part only.
        parts = re.findall('(.*)" target', raw)
        if not parts:
            continue  # layout changed for this entry: skip instead of raising
        res = requests.get(parts[-1], headers=header)
        filename = save_dir + str(uuid.uuid1()) + '.jpg'
        with open(filename, 'wb') as file:
            file.write(res.content)
def main(rooturl):
    """Crawl one index page: collect its galleries and download them in parallel.

    rooturl: URL of a single index page (one page of the topic).
    """
    pagename, pageset = graph_set(rooturl)
    # Fan the (url, title) pairs out over a worker pool.  The context manager
    # closes and joins the pool; the original leaked it (no close()/join()).
    with Pool() as p:
        p.map(parser, zip(pageset, pagename))
if __name__ == '__main__':
    # Walk index pages 1 through 5 of the topic, crawling each in turn.
    for page_no in range(1, 6):
        page_url = rooturl + str(page_no) + '.html'
        print(str(page_no) + '頁面開始爬取......')
        main(page_url)