# Standard library
import os
import re
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock, Thread  # threading primitives

# Third-party
import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch made through ``session``.
# The former "cooklie" entry (a misspelled cookie header) and the
# "Remote Address" entry (a browser dev-tools field, not a request header, and
# not a legal header field name because it contains a space) were dropped:
# session cookies are handled by the Session's own cookie jar.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    "Referer": "https://www.baidu.com/link?url=S9qOAJnnFVvK9MaArz9E-MpFsvJW2y3H8fAo044AAz1EfTWte8eO3ny3aXgoBIE2&wd=&eqid=8e91efc50002b2cb000000066183ad90",
    "Host": "www.kluniv.edu.cn",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}

# Proxy mapping in the form {'scheme': 'scheme://IP:port'}.
# NOTE(review): this mapping is not passed to any request in the visible code.
proxies = {
    'http': 'http://{}'.format('8.129.28.247:8888'),
    'https': 'https://{}'.format('8.129.28.247:8888'),
}

texturl = "https://www.kluniv.edu.cn/"

# A single Session is shared by all requests so cookies received from the
# site are reused on later fetches.
session = requests.Session()

# Initial visit to the home page; the decoded body is kept in ``text``.
# NOTE(review): this is sent as a POST — confirm that is intended rather than GET.
text = session.post(url=texturl, headers=header).content.decode()
_BASE_URL = 'https://www.kluniv.edu.cn'
_IMAGE_DIR = 'tu'

# Patterns are compiled once; the path patterns are applied to the string form
# of a whole tag, not to a single attribute value.
_ARTICLE_HREF_RE = re.compile(r"info/1028")
_ARTICLE_PATH_RE = re.compile(r"/info\S*htm")
_LOCAL_IMAGE_RE = re.compile(r"local")
_JPG_PATH_RE = re.compile(r"/__local\S*jpg")
_PNG_PATH_RE = re.compile(r"/__local\S*png")


def _listing_url(xx):
    """Return the news-listing URL for page ``xx`` (0 selects the first page)."""
    if xx == 0:
        return _BASE_URL + '/index/xw.htm'
    return _BASE_URL + '/index/xw/' + str(xx) + '.htm'


def _fetch_soup(url):
    """Fetch ``url`` through the shared session and parse the body as HTML."""
    time.sleep(0.1)  # short pause before every page request
    body = session.get(url=url, headers=header).content
    return BeautifulSoup(body, "html.parser", from_encoding="utf-8")


def _save_image(tag, xx, j, k):
    """Download the image referenced by ``tag`` when it has a jpg or png path.

    ``xx``, ``j`` and ``k`` are the page, article and image indices used to
    build the output file name.
    """
    markup = str(tag)
    # jpg is tried first; png is only considered when no jpg path is found.
    for pattern, suffix in ((_JPG_PATH_RE, 'u.jpg'), (_PNG_PATH_RE, 'tu.png')):
        paths = pattern.findall(markup)
        if not paths:
            continue
        print(paths)
        image_url = _BASE_URL + paths[0]
        print(image_url)
        target = f'{_IMAGE_DIR}/第{xx}頁-第{j + 1}個新聞-第{k + 1}張{suffix}'
        urllib.request.urlretrieve(image_url, target)
        return


def paqutu1(xx):
    """Download the images of every news article linked from listing page ``xx``.

    The listing page is fetched, every ``<a>`` whose href matches ``info/1028``
    is followed, and each ``<img>`` in the article whose ``orisrc`` attribute
    matches ``local`` is saved under ``tu/`` when a ``.jpg`` or ``.png`` path
    can be extracted from the tag.

    Args:
        xx: Listing page index; 0 selects ``/index/xw.htm``, any other value
            selects ``/index/xw/<xx>.htm``.
    """
    # urlretrieve does not create missing directories; exist_ok keeps this
    # safe when several threads call the function at the same time.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    listing = _fetch_soup(_listing_url(xx))
    link_nodes = listing.find_all('a', href=_ARTICLE_HREF_RE)
    for j, link in enumerate(link_nodes):
        paths = _ARTICLE_PATH_RE.findall(str(link))
        if not paths:
            # The href matched "info/1028" but contains no "/info...htm"
            # segment; skip it instead of failing on paths[0].
            continue
        url2 = _BASE_URL + paths[0]
        article = _fetch_soup(url2)
        images = article.find_all('img', orisrc=_LOCAL_IMAGE_RE)
        print(url2)
        print(images)
        for k, image in enumerate(images):
            _save_image(image, xx, j, k)
        print(f"第{xx}頁的第{j}個新聞完成")
if __name__ == '__main__':
    # Number of listing pages to crawl (page indices 0..280).
    page_count = 281
    # Cap on concurrent page workers, so the site is not hit by one thread
    # per page all at once.
    max_workers = 16

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(paqutu1, page): page for page in range(page_count)
        }
        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as exc:
                # Top-level boundary: report the failed page and keep
                # crawling the remaining ones.
                print(f"page {pending[finished]} failed: {exc!r}")