天天看點

【Python~分享】爬取 mp4 格式視訊

注:由於我進行的並不是規範爬蟲,每次爬都有點心虛,所以下手對象也是一些不法網站QAQ,其中涉及敏感資訊,所以就不給出網站URL,哈哈

import os
import re
import socket
import time
from urllib.parse import urljoin

import bs4
import requests
from bs4 import BeautifulSoup


def getText(url):
    """Download the resource at ``url`` and return its raw bytes.

    Args:
        url: Address of the file to fetch.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status). A message is
        printed in the failure case.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        # The context manager releases the connection even if
        # raise_for_status() fails; with stream=True the body is only
        # downloaded when .content is read.
        with requests.get(url, timeout=20, headers=header, stream=True) as r:
            r.raise_for_status()
            return r.content
    except requests.RequestException:
        # Only request failures are swallowed, so Ctrl+C still stops the run.
        print("申請視訊錯誤")
        return None


def writeFile(path, content):
    """Write ``content`` to ``path`` unless the file already exists.

    Args:
        path: Destination file path.
        content: Bytes-like payload to store.

    Raises:
        TypeError: If ``content`` is not bytes-like (for example ``None``
            from a failed download). The check runs before the file is
            opened, so a failed call never leaves an empty file behind that
            would make later calls skip the download.
    """
    if os.path.exists(path):
        # An existing file is treated as already downloaded.
        return
    if not isinstance(content, (bytes, bytearray, memoryview)):
        raise TypeError(
            "content must be bytes-like, got " + type(content).__name__
        )
    with open(path, "wb") as file:
        file.write(content)


def dealUrl(url, dir):
    """Build the local ``.mp4`` path for a video URL, creating ``dir`` if needed.

    The file name is ``<second-to-last URL segment>_<last segment up to its
    first dot>.mp4``.

    Args:
        url: Video source URL; must contain at least two ``/``-separated
            segments.
        dir: Target directory. It is created, including missing parents,
            when it does not exist. A trailing separator is optional.

    Returns:
        The path of the file inside ``dir``.
    """
    # makedirs handles nested directories and tolerates an existing one.
    os.makedirs(dir, exist_ok=True)
    segments = url.split("/")
    stem = segments[-1].split(".")[0]
    filename = segments[-2] + "_" + stem + ".mp4"
    # join() gives the same result as plain concatenation when dir already
    # ends with a separator, and a correct path when it does not.
    return os.path.join(dir, filename)


def getHtml(url):
    """Fetch a page and return its decoded text.

    Args:
        url: Page address to request.

    Returns:
        * ``str`` - the page text, decoded with the detected encoding, on
          success;
        * ``int`` - the HTTP status code when the server answers with an
          error status;
        * ``None`` - when no response was received at all (connection
          error, timeout, invalid URL).

        A message is printed in both failure cases.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
    except requests.RequestException:
        # There is no response object here, so there is no status code to
        # report.
        print("申請網站錯誤")
        return None
    if not r.ok:
        print("申請網站錯誤")
        return r.status_code
    r.encoding = r.apparent_encoding
    return r.text


def getMvUrl(html):
    """Collect the distinct ``href`` values of the video links on a page.

    A link qualifies when it is an ``<a>`` whose ``href`` matches ``/all/``,
    it has no ``rel`` attribute, and it contains at least one nested tag.

    Args:
        html: Markup of a listing page.

    Returns:
        The qualifying hrefs in document order, without duplicates.
    """
    document = BeautifulSoup(html, "html.parser")
    links = []
    for anchor in document("a", href=re.compile("/all/")):
        if not isinstance(anchor, bs4.element.Tag):
            continue
        if anchor.has_attr('rel'):
            continue
        # Skip anchors that contain no nested tag at all.
        if not anchor(lambda child: child.name is not None):
            continue
        target = anchor["href"]
        if target in links:
            continue
        links.append(target)
    return links


def dealMvUrl(mvs, url):
    """Turn the hrefs in ``mvs`` into absolute URLs relative to ``url``.

    Args:
        mvs: Iterable of hrefs taken from a page (typically absolute paths
            such as ``/all/1.html``).
        url: URL of the site the hrefs were found on.

    Returns:
        A list with one absolute URL per href, in the same order.
    """
    # urljoin resolves each href against the base URL, so the result does
    # not depend on how many "//" separators the base URL contains, and
    # hrefs that are already absolute are kept as they are.
    return [urljoin(url, mvitem) for mvitem in mvs]


def getVideoUrl(url):
    """Extract the video source from the ``<div id="vod">`` of a video page.

    The div's text is split on ``$`` and the second-to-last field is
    returned; the caller uses it as the video source URL.

    Args:
        url: Address of the video page.

    Returns:
        The extracted field, or ``None`` when the request fails or the page
        does not contain the expected div text. A message is printed on
        failure.
    """
    header = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=20, headers=header)
        r.raise_for_status()
    except requests.RequestException:
        print("申請視訊頁面錯誤")
        return None

    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    video = soup.find("div", {"id": "vod"})
    # .string is None when the div has no single text child.
    text = video.string if isinstance(video, bs4.element.Tag) else None
    fields = text.split("$") if text is not None else []
    if len(fields) < 2:
        # Covers a missing div, missing text, and text without a "$".
        print("申請視訊頁面錯誤")
        return None
    return fields[-2]


def getPages(html):
    """Return the ``/all/`` link targets found in the site's menu.

    Args:
        html: Markup of the main page. Anything that is not text or bytes
            (for example the status code or ``None`` that ``getHtml``
            returns on failure) yields an empty result.

    Returns:
        The ``href`` of each ``<a>`` inside ``<div class="menu on">`` whose
        href matches ``/all/``; an empty list when that div is absent.
    """
    if not isinstance(html, (str, bytes)):
        return []
    soup = BeautifulSoup(html, "html.parser")
    menus = soup.find("div", {"class": "menu on"})
    if menus is None:
        # The page has no menu, so there are no sub-pages to follow.
        return []
    return [
        a["href"]
        for a in menus("a", href=re.compile("/all/"))
        if isinstance(a, bs4.element.Tag)
    ]


def dealPageUrl(pages, url, first_page=2, last_page=12):
    """Expand each sub-page path into numbered absolute page URLs.

    For every entry of ``pages`` the page number is inserted just before
    the file extension, e.g. ``/all/list.html`` becomes
    ``<site>/all/list2.html`` up to ``<site>/all/list12.html`` with the
    defaults.

    Args:
        pages: Site-relative paths taken from the main page.
        url: Any absolute URL of the site (``scheme://host/...``); only its
            scheme and host are used.
        first_page: First page number to generate (inclusive).
        last_page: Last page number to generate (inclusive).

    Returns:
        A list of absolute URLs, grouped by page in input order.
    """
    parts = url.split("/")
    site = parts[0] + "//" + parts[2]
    page_urls = []
    for page in pages:
        # Only the file name is split on its last dot, so the number of
        # dots in the host name or directories does not matter.
        directory, slash, filename = page.rpartition("/")
        stem, dot, extension = filename.rpartition(".")
        if dot:
            prefix = site + directory + slash + stem
            suffix = dot + extension
        else:
            # No extension: append the page number to the path.
            prefix = site + page
            suffix = ""
        for index in range(first_page, last_page + 1):
            page_urls.append(prefix + str(index) + suffix)
    return page_urls


def main():
    """Crawl the site's listing pages and download every video found.

    Steps: fetch the main page, derive the numbered listing pages, collect
    the video page links on each one, resolve each video's source URL and
    save the file into the local folder. A failure for one video is
    reported and the crawl continues with the next one.
    """
    dir_url = "D:\\mp4_MV\\"  # local folder the videos are stored in
    url = "http://www.****.com//index.html"
    mv_num = 0  # number of video links found
    now_op = 1  # running number of the video being processed

    html_main = getHtml(url)  # markup of the main page
    if not isinstance(html_main, str):
        # getHtml reports failure with a non-text value; nothing to crawl.
        print(html_main)
        return
    pageUrls = getPages(html_main)  # sub-page paths
    abs_page_url = dealPageUrl(pageUrls, url)  # absolute listing page URLs

    for page in abs_page_url:
        html = getHtml(page)
        if not isinstance(html, str):
            # Skip pages answered with any error status, not only 404.
            print(html)
            continue

        mvUrls = getMvUrl(html)  # relative video page links
        True_mv_path = dealMvUrl(mvUrls, url)  # absolute video page URLs
        mv_num += len(True_mv_path)

        for mvitem in True_mv_path:
            # Fall back to the page URL in the error message until the
            # local path is known.
            path = mvitem
            try:
                print("開始第-"+str(now_op)+"-個")
                time.sleep(1)
                src = getVideoUrl(mvitem)
                if src is None:
                    raise ValueError("no video source found on " + mvitem)
                path = dealUrl(src, dir_url)  # also creates the folder
                content = getText(src)
                if content is None:
                    raise ValueError("download failed for " + src)
                writeFile(path, content)
                print("第"+str(now_op)+"-個-this_ok--" + path)
            except Exception:
                print("第-"+str(now_op)+"-個-this--" + path + "-出現錯誤")
            now_op += 1

    print("all_ok")
    print("{:-^20}".format("共有_" + str(mv_num) + "_個視訊"))


if __name__ == '__main__':
    main()
           

繼續閱讀