# -*- coding: utf-8 -*-
# !/usr/bin/env python
# 擷取站長之家風景圖:https://sc.chinaz.com/tupian/fengjingtupian.html,長時間爬取會出現圖檔響應逾時的問題。
# 首先從第一頁中擷取該頁所有圖檔詳情頁連結和下一頁的連結
# 對詳情頁進行解析并下載,下載完畢後請求下一頁,并重複上一步操作,直到最後一頁為止。
# 在下載圖檔前,先擷取所有已下載的圖檔名字,如果存在則不再下載
import os, re, time
import requests
# 導入自定義随機請求頭的包
from utils.header import get_ua
class Chinaz():
    """Crawler that downloads landscape pictures from sc.chinaz.com.

    Starting from a list page it follows every picture-detail link, saves the
    image referenced there into ``self.img_file`` and then moves on to the
    "next page" link until none is found. Pictures whose file name is already
    known (present on disk at start-up or saved during this run) are skipped.
    """

    def __init__(self):
        """Set the entry URLs and index the files already in the image folder."""
        self.url = "https://sc.chinaz.com/tupian/fengjingtupian.html"
        self.base_url = "https://sc.chinaz.com/tupian/"
        self.img_file = "imgs"
        # exist_ok makes this safe whether or not the folder already exists.
        os.makedirs(self.img_file, exist_ok=True)
        # ``files`` must always be defined, including when the folder was just
        # created. Only top-level entries are indexed because images are
        # written directly into ``img_file``.
        self.files = {
            name for name in os.listdir(self.img_file)
            if os.path.isfile(os.path.join(self.img_file, name))
        }

    def get_html(self, url, timeout=10):
        """Send a GET request and return the response object.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the crawl forever.

        Returns:
            The ``requests`` response when the status code is 200,
            otherwise ``None``.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        resp = requests.get(url, headers={"User-Agent": get_ua()}, timeout=timeout)
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp
        return None

    def get_all(self, html):
        """Download every picture linked from ``html`` and from following pages.

        Args:
            html: Text of a picture list page.

        Pages are processed in a loop rather than by recursion, so the number
        of pages is not bounded by the interpreter's recursion limit.
        """
        # NOTE(review): the repeated ``target="_blank" rel="external nofollow"``
        # attributes in the two patterns below, and the wording of the
        # next-page label, look like artifacts of the text having been
        # republished -- confirm against the live page HTML.
        # Pattern for detail-page links on a list page.
        img_url_list_patt = re.compile(r'<p><a target="_blank" href="(.*?)" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" alt=".*?">')
        # Pattern for the image address and title on a detail page.
        img_url_patt = re.compile(r'<a href="(.*?)" target="_blank" rel="external nofollow" target="_blank" rel="external nofollow" title="(.*?)" class="image_gall">')
        # Pattern for the next-page link on a list page.
        next_patt = re.compile(r'(fengjing.*?)"\s+class="nextpage">下一頁</a>')

        while True:
            for detail_link in img_url_list_patt.findall(html):
                self._handle_detail_page("https:" + detail_link, img_url_patt)

            next_page = next_patt.findall(html)
            if not next_page:
                print("沒有下一頁了!")
                return
            next_page = self.base_url + next_page[0].split('"')[-1]
            print("即将處理連結:", next_page)
            try:
                resp = self.get_html(next_page)
            except Exception as e:
                print("出錯了:", e)
                return
            if not resp:
                # Non-200 answer for the next list page: nothing left to parse.
                return
            html = resp.text

    def _handle_detail_page(self, img_url, img_url_patt):
        """Fetch one detail page and download its picture unless already saved.

        Any failure is reported and skipped so that a single bad page does
        not stop the crawl.
        """
        try:
            img_html = self.get_html(img_url)
        except Exception as e:
            print("%s,該詳情頁請求失敗,跳過,出錯原因:%s" % (img_url, e))
            return
        if not img_html:
            return
        res_img_urls = img_url_patt.findall(img_html.text)
        if not res_img_urls:
            # No image link matched; indexing [0] would raise IndexError.
            print("未能解析圖檔位址,跳過:%s" % img_url)
            return
        res_img_url = "https:" + res_img_urls[0][0]
        # File name = title + last URL segment,
        # e.g. 江面風景唯美意境圖檔zzpic9603.jpg
        res_img_title = res_img_urls[0][1] + res_img_url.split("/")[-1]
        if not self.img_exist(res_img_title):
            print("該圖檔已存在,無需下載下傳:%s" % res_img_title)
            return
        try:
            self.download_img(res_img_url, res_img_title)
        except Exception as e:
            print("%s,該圖檔下載下傳失敗,跳過,出錯原因:%s" % (res_img_title, e))

    def download_img(self, img_url, res_img_title):
        """Fetch ``img_url`` and store it as ``res_img_title`` in the image folder.

        Sleeps one second before every request to throttle the crawl. A
        non-200 answer is reported and ignored. Exceptions from the request
        or from writing the file propagate to the caller.
        """
        time.sleep(1)
        print("下載下傳圖檔:", res_img_title)
        resp = self.get_html(img_url)
        if resp:
            with open(os.path.join(self.img_file, res_img_title), 'wb') as f:
                f.write(resp.content)
            # Remember the file so the same picture is not fetched again
            # later in this run.
            self.files.add(res_img_title)
        else:
            print("%s下載下傳圖檔失敗,忽略~" % res_img_title)

    def img_exist(self, res_img_title):
        """Return True when the picture still has to be downloaded.

        Despite its name, the result is True when ``res_img_title`` is NOT
        among the known files and False when it is already present.
        """
        return res_img_title not in self.files
if __name__ == '__main__':
    # Entry point: fetch the first list page and crawl onward from it.
    cz = Chinaz()
    html = cz.get_html(cz.url)
    if html is not None:
        cz.get_all(html.text)
    else:
        # get_html returns None for any non-200 answer; reading ``.text``
        # from it would raise AttributeError.
        print("首頁請求失敗,無法開始爬取:", cz.url)
# get_ua 請求頭可以自己随機設定一個,或者參考:https://blog.csdn.net/z564359805/article/details/111354241