# --- blog-post header carried over with the paste; commented out so the file imports cleanly ---
# 天天看點
# python爬蟲學習筆記-1  (Python web-scraping study notes, part 1)

# Import required packages (requests, BeautifulSoup, threading, ...)
import urllib.request
import re
import requests
from bs4 import BeautifulSoup
import time
from threading import Lock, Thread  # 線程包

# HTTP request headers sent with every request to the target site.
# Bug fix: the key was misspelled "cooklie", so the JSESSIONID session
# cookie was never actually transmitted as a Cookie header.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    "Referer": "https://www.baidu.com/link?url=S9qOAJnnFVvK9MaArz9E-MpFsvJW2y3H8fAo044AAz1EfTWte8eO3ny3aXgoBIE2&wd=&eqid=8e91efc50002b2cb000000066183ad90",
    "Host": "www.kluniv.edu.cn",
    "Cookie": "JSESSIONID=A764E4DE3C7C57E3F13BB55DADAABA90",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # NOTE(review): "Remote Address" is not a real HTTP request header (it is
    # a browser devtools display field); servers ignore it. Kept for now.
    "Remote Address": "111.123.226.32:443",
}

# Proxy table, shape {'scheme': 'scheme://IP:port'}.
# NOTE(review): defined but never passed to any request in this file.
proxies = {
    'http': 'http://8.129.28.247:8888',
    'https': 'https://8.129.28.247:8888',
}

# Base URL of the target site (www.kluniv.edu.cn).
texturl = "https://www.kluniv.edu.cn/"
session = requests.Session()  # one shared Session so cookies persist across all requests
# Initial request to warm up the session before scraping.
# NOTE(review): a POST to the homepage is unusual — presumably a GET was
# intended; confirm the server accepts POST here. Runs at import time.
text = session.post(url=texturl, headers=header).content.decode()  # first visit


def paqutu1(xx):
    """Scrape one news-list page of www.kluniv.edu.cn and download every
    jpg/png image found in the linked news articles into ``tu/``.

    Args:
        xx: 0-based page index. Page 0 is ``/index/xw.htm``; page N > 0 is
            ``/index/xw/N.htm`` (the site's own paging scheme).

    Side effects:
        Writes image files under ``tu/`` (directory created if missing)
        and prints progress to stdout. Network I/O via the shared session.
    """
    import os  # local import so this fix is self-contained
    # Bug fix: the original urlretrieve calls crashed with FileNotFoundError
    # when the 'tu/' output directory did not already exist.
    os.makedirs('tu', exist_ok=True)
    if xx == 0:
        url = 'https://www.kluniv.edu.cn/index/xw.htm'
    else:
        url = 'https://www.kluniv.edu.cn/index/xw/' + str(xx) + '.htm'
    time.sleep(0.1)  # small delay to be gentle on the server
    page = session.get(url=url, headers=header).content  # fetch list-page HTML
    soup = BeautifulSoup(page, "html.parser", from_encoding="utf-8")
    # Every <a> whose href contains "info/1028" is a news-article link.
    link_node = soup.find_all('a', href=re.compile(r"info/1028"))
    for j, node in enumerate(link_node):
        hrefs = re.findall(r"/info\S*htm", str(node))
        if not hrefs:
            # Robustness fix: the original indexed p1[0] unconditionally and
            # raised IndexError when the pattern did not match this anchor.
            continue
        url2 = 'https://www.kluniv.edu.cn' + hrefs[0]
        time.sleep(0.1)
        article = session.get(url=url2, headers=header).content
        soup1 = BeautifulSoup(article, "html.parser", from_encoding="utf-8")
        # CMS-uploaded images carry an orisrc attribute containing "local".
        src1 = soup1.find_all('img', orisrc=re.compile(r"local"))
        print(url2)
        print(src1)
        for k, img in enumerate(src1):
            tag = str(img)
            # jpg takes priority over png, matching the original if/elif order.
            for ext in ('jpg', 'png'):
                hits = re.findall(r"/__local\S*" + ext, tag)
                if hits:
                    print(hits)
                    img_url = 'https://www.kluniv.edu.cn' + hits[0]
                    print(img_url)
                    # Bug fix: urllib.request.urlretrieve bypassed the session
                    # entirely (default urllib User-Agent, no cookie) and could
                    # be rejected by the server. Download through the session
                    # with the custom headers instead, as the commented-out
                    # code originally intended. Also fixes the inconsistent
                    # filename typo ("...張u.jpg" vs "...張tu.png").
                    data = session.get(url=img_url, headers=header).content
                    with open(f'tu/第{xx}頁-第{j + 1}個新聞-第{k + 1}張tu.{ext}', 'wb') as f:
                        f.write(data)
                    break  # at most one image saved per <img>, like the original
        print(f"第{xx}頁的第{j}個新聞完成")


if __name__ == '__main__':
    # One worker thread per news-list page (0..280).
    # NOTE(review): 281 simultaneous threads share the single module-level
    # Session and all hit the same host — a bounded pool
    # (concurrent.futures.ThreadPoolExecutor) would be kinder; kept one
    # thread per page to preserve the original behaviour.
    workers = []
    for i in range(281):
        t = Thread(target=paqutu1, args=(i,))  # target: callable, args: its arguments
        t.start()
        workers.append(t)
    # Improvement: keep the handles and join, so the main thread waits for
    # every worker explicitly instead of relying on non-daemon-thread exit.
    for t in workers:
        t.join()