# 注:由于我進行的并不是規範爬蟲,每次爬都是有點心虛,是以下手對象也是一些不法網站QAQ,其中涉及敏感資訊,是以就不給出網站URL,哈哈
import requests
from bs4 import BeautifulSoup
import time
import socket
import os
import re
import bs4
def getText(url):
    """Download a binary resource (the video file) and return its bytes.

    Returns None on any request failure so the caller can detect the miss.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header, stream=True)
        r.raise_for_status()
        # .content reads the entire stream into memory; fine for single files
        return r.content
    except requests.RequestException:
        # was a bare except that also hid programming errors; narrowed to
        # network/HTTP failures (covers url=None via MissingSchema too)
        print("申請視訊錯誤")
        return None
def writeFile(path, content):
    """Write binary content to path unless the file already exists.

    Existing files are skipped so re-runs do not re-download. A None
    content (failed download) is ignored — the original opened the file
    first and left a zero-byte artifact before the TypeError fired.
    """
    if content is None:
        return
    if not os.path.exists(path):
        # 'with' closes (and flushes) the handle; explicit flush was redundant
        with open(path, "wb") as fh:
            fh.write(content)
def dealUrl(url, dir):
    """Build the local save path for a video URL and ensure dir exists.

    The filename is "<parent-segment>_<basename-stem>.mp4", e.g.
    'http://h/vid123/file.mp4' -> dir + 'vid123_file.mp4'. `dir` is
    expected to end with a path separator (see the caller).
    """
    if not os.path.exists(dir):
        # makedirs creates missing parents too; os.mkdir failed on them
        os.makedirs(dir)
    parts = url.split("/")  # renamed: original shadowed builtin `str`
    stem = parts[-1].split(".")
    name = parts[-2] + "_" + stem[0] + ".mp4"
    return dir + name
def getHtml(url):
    """Fetch a page and return its decoded text.

    On failure, return the HTTP status code when a response exists, else
    404, so the caller's `html == 404` skip check still fires. The
    original did `return r.status_code` inside a bare except — a NameError
    when requests.get itself raised, since `r` was never bound.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding  # let requests guess the charset
        r.raise_for_status()
        return r.text
    except requests.RequestException as exc:
        print("申請網站錯誤")
        resp = getattr(exc, "response", None)
        return resp.status_code if resp is not None else 404
def getMvUrl(html):
    """Collect unique '/all/' hrefs from anchors that carry no rel
    attribute and contain at least one child tag."""
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for anchor in soup("a", href=re.compile("/all/")):
        if not isinstance(anchor, bs4.element.Tag):
            continue
        if anchor.has_attr('rel'):
            continue
        # keep only anchors that wrap at least one real tag (thumbnails etc.)
        if not anchor(lambda t: t.name is not None):
            continue
        href = anchor["href"]
        if href not in links:
            links.append(href)
    return links
def dealMvUrl(mvs, url):
    """Turn the relative video-page paths into absolute URLs rooted at
    the scheme and host of `url` (expected form 'scheme://host//page')."""
    pieces = url.split("//")
    absolute = []
    for rel in mvs:
        pieces[-1] = rel
        absolute.append(pieces[0] + "//" + pieces[1] + pieces[2])
    return absolute
def getVideoUrl(url):
    """Fetch a video page and return the media URL parsed from the
    '#vod' div, or None on failure.

    The div's text is '$'-delimited; the media URL sits second-to-last.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        video = soup.find("div", {"id": "vod"})
        # video.string is None for multi-child divs — guard instead of
        # letting the old bare except eat the AttributeError
        if isinstance(video, bs4.element.Tag) and video.string:
            return video.string.split("$")[-2]
        return None
    except (requests.RequestException, IndexError):
        # IndexError: div text without enough '$' separators
        print("申請視訊頁面錯誤")
        return None
def getPages(html):
    """Return the '/all/' hrefs found inside the 'menu on' div.

    Returns [] when the div is absent — the original called
    `menus(...)` on None and died with a TypeError.
    """
    pages = []
    soup = BeautifulSoup(html, "html.parser")
    menus = soup.find("div", {"class": "menu on"})
    if menus is None:
        return pages
    for a in menus("a", href=re.compile("/all/")):
        if isinstance(a, bs4.element.Tag):
            pages.append(a["href"])
    return pages
def dealPageUrl(pages, url):
    """Expand each sub-page path into the absolute URLs of pages 2-12.

    E.g. '/all/x.html' against 'http://host//index.html' yields
    'http://host/all/x2.html' ... 'http://host/all/x12.html'.
    """
    abs_urls = []
    parts = url.split("/")  # renamed: original shadowed builtin `str`
    for page in pages:
        base = parts[0] + "//" + parts[2] + page  # scheme + host + path
        segs = base.split(".")
        for index in range(2, 13):
            # splice the page number just before the '.html' suffix
            abs_urls.append(
                "{0}.{1}.{2}{3}.{4}".format(segs[0], segs[1], segs[2], index, segs[3])
            )
    return abs_urls
if __name__ == '__main__':
    dir_url = "D:\\mp4_MV\\"  # local folder for downloaded videos
    url = "http://www.****.com//index.html"
    mvUrls = []  # relative video-page paths of the current sub-page
    mv_num = 0   # running total of videos seen
    now_op = 1   # 1-based counter for progress messages
    html_main = getHtml(url)                     # fetch the site's main page
    pageUrls = getPages(html_main)               # sub-page paths from the menu
    abs_page_url = dealPageUrl(pageUrls, url)    # absolute URLs, pages 2-12
    for page in abs_page_url:
        html = getHtml(page)
        if html == 404:  # page failed to load — skip it
            print(html)
            continue
        mvUrls = getMvUrl(html)                  # video pages on this listing
        True_mv_path = dealMvUrl(mvUrls, url)    # make them absolute
        mv_num += len(True_mv_path)
        for mvitem in True_mv_path:
            # defined before the try: the original's except printed `path`,
            # which was unbound (NameError) when dealUrl hadn't run yet
            path = ""
            try:
                print("開始第-" + str(now_op) + "-個")
                time.sleep(1)  # throttle so we don't hammer the server
                src = getVideoUrl(mvitem)        # direct media URL
                path = dealUrl(src, dir_url)     # local target path
                content = getText(src)           # download the bytes
                writeFile(path, content)
                print("第" + str(now_op) + "-個-this_ok--" + path)
                now_op += 1
            except Exception:
                # was a bare except, which also swallowed KeyboardInterrupt
                print("第-" + str(now_op) + "-個-this--" + path + "-出現錯誤")
                now_op += 1
                continue
    print("all_ok")
    print("{:-^20}".format("共有_" + str(mv_num) + "_個視訊"))