天天看點

python正則擷取站長之家風景圖,儲存到本地

# -*- coding: utf-8 -*-
# !/usr/bin/env python
# 擷取站長之家風景圖:https://sc.chinaz.com/tupian/fengjingtupian.html,長時間爬取會出現圖檔響應逾時的問題。
# 首先從第一頁中擷取第一頁所有圖檔詳情頁連結和下一頁的連結
# 對詳情頁進行解析并下載,下載完畢後請求下一頁,并重複上一步操作,直到最後一頁為止。
# 在下載圖檔前,先擷取所有已下載的圖檔名字,如果存在則不下載

import os, re, time
import requests
# 導入自定義随機請求頭的包
from utils.header import get_ua


class Chinaz():
    """Download the landscape pictures listed on sc.chinaz.com into a folder.

    The crawl starts at ``self.url``; for every list page it visits each
    picture's detail page, downloads the picture into ``self.img_file``
    (skipping names that are already present there) and then follows the
    "next page" link until none is found.

    Attributes:
        url: First list page to crawl.
        base_url: Prefix joined with the relative "next page" link.
        img_file: Directory the pictures are written to.
        files: Names of the pictures already present in ``img_file``.
    """

    # NOTE(review): the two patterns below contain duplicated
    # ``target``/``rel`` attributes, which looks like an artifact of the page
    # this script was copied from -- verify them against the live HTML.
    # Links to the detail pages on a list page.
    _LIST_PATT = re.compile(r'<p><a target="_blank" href="(.*?)" target="_blank" rel="external nofollow"  target="_blank" rel="external nofollow"  alt=".*?">')
    # Picture URL and title on a detail page.
    _DETAIL_PATT = re.compile(r'<a href="(.*?)" target="_blank" rel="external nofollow"  target="_blank" rel="external nofollow"  title="(.*?)" class="image_gall">')
    # "Next page" link on a list page.
    _NEXT_PATT = re.compile(r'(fengjing.*?)"\s+class="nextpage">下一頁</a>')

    def __init__(self):
        self.url = "https://sc.chinaz.com/tupian/fengjingtupian.html"
        self.base_url = "https://sc.chinaz.com/tupian/"
        self.img_file = "imgs"
        os.makedirs(self.img_file, exist_ok=True)
        # Always define ``files`` (also for a freshly created folder) and only
        # look at the top level of the folder, where pictures are written.
        self.files = [
            name for name in os.listdir(self.img_file)
            if os.path.isfile(os.path.join(self.img_file, name))
        ]

    def get_html(self, url, timeout=10):
        """Send a GET request and return the response object.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Returns:
            The response when the status code is 200, otherwise ``None``.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        resp = requests.get(url, headers={"User-Agent": get_ua()}, timeout=timeout)
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp
        return None

    def get_all(self, html):
        """Download every picture of a list page and of all following pages.

        Args:
            html: HTML text of the list page to start from.
        """
        # Iterate instead of recursing so that a long crawl does not add one
        # stack frame per page.
        while True:
            self._download_page_images(html)
            next_page = self._find_next_page(html)
            if next_page is None:
                print("沒有下一頁了!")
                return
            print("即将處理連結:", next_page)
            try:
                resp = self.get_html(next_page)
            except Exception as e:
                print("出錯了:", e)
                return
            if resp is None:
                return
            html = resp.text

    def _find_next_page(self, html):
        """Return the absolute URL of the next list page, or ``None``."""
        found = self._NEXT_PATT.findall(html)
        if not found:
            return None
        # The lazy group may still start at an earlier "fengjing" occurrence;
        # the wanted href is whatever follows the last double quote.
        return self.base_url + found[0].split('"')[-1]

    def _download_page_images(self, html):
        """Visit each detail page linked from ``html`` and fetch its picture."""
        for href in self._LIST_PATT.findall(html):
            img_url = "https:" + href
            try:
                img_html = self.get_html(img_url)
            except requests.RequestException as e:
                # One unreachable detail page must not abort the whole crawl.
                print("詳情頁請求失敗,跳過:%s,出錯原因:%s" % (img_url, e))
                continue
            if not img_html:
                continue
            matches = self._DETAIL_PATT.findall(img_html.text)
            if not matches:
                # Without this guard the IndexError would bubble up and be
                # mistaken for "no next page".
                print("詳情頁解析失敗,跳過:%s" % img_url)
                continue
            res_img_url = "https:" + matches[0][0]
            # File name = title + last URL segment, e.g. "<title>zzpic9603.jpg".
            res_img_title = matches[0][1] + res_img_url.split("/")[-1]
            # The title comes from the remote page; keep it inside img_file.
            res_img_title = re.sub(r'[\\/]', '_', res_img_title)
            if self.img_exist(res_img_title):
                try:
                    self.download_img(res_img_url, res_img_title)
                except Exception as e:
                    print("%s,該圖檔下載下傳失敗,跳過,出錯原因:%s" % (res_img_title, e))
            else:
                print("該圖檔已存在,無需下載下傳:%s" % res_img_title)

    def download_img(self, img_url, res_img_title):
        """Fetch one picture and store it in ``img_file``.

        Args:
            img_url: Absolute URL of the picture.
            res_img_title: File name to store the picture under.
        """
        # Pause between downloads to go easy on the server.
        time.sleep(1)
        print("下載下傳圖檔:", res_img_title)
        resp = self.get_html(img_url)
        if resp:
            with open(os.path.join(self.img_file, res_img_title), 'wb') as f:
                f.write(resp.content)
            # Remember the file so it is not fetched again during this run.
            if res_img_title not in self.files:
                self.files.append(res_img_title)
        else:
            print("%s下載下傳圖檔失敗,忽略~" % res_img_title)

    def img_exist(self, res_img_title):
        """Tell whether a picture still has to be downloaded.

        Despite the name, the result is true when the picture is NOT yet in
        ``self.files`` (i.e. it should be downloaded) and false otherwise.
        """
        return res_img_title not in self.files


if __name__ == '__main__':
    # Script entry point: fetch the first list page and crawl from there.
    cz = Chinaz()
    html = cz.get_html(cz.url)
    # get_html returns None for a non-200 response; guard against it instead
    # of failing with AttributeError on ``html.text``.
    if html is not None:
        cz.get_all(html.text)
    else:
        print("首頁請求失敗,無法開始爬取:", cz.url)
           
get_ua請求頭可以自己随機設定一個,或者參考:https://blog.csdn.net/z564359805/article/details/111354241      

繼續閱讀