天天看點

Step05:爬蟲小項目,爬取最新電影迅雷下載位址

1.簡述

電影天堂的廣告實在令人不勝其煩,但其影片資源確實有可取之處。因此,趁著學習爬蟲技術的這段時間,簡單實作了一個完整的小項目。

(完整代碼——連結)

2.技術準備

IDE:PyCharm,Python 3.6.5。使用 requests + re 從電影天堂爬取最新電影資源的下載位址,並使用 tkinter 設計簡單的界面;其中還用到了多線程技術,Python 的 threading 庫對多線程提供了支援,簡化了許多工作。

3.項目步驟

進入Pycharm建立project,實作以下目錄結構:

Step05:爬蟲小項目,爬取最新電影迅雷下載位址

\ThunderAndSpider\message_spider\spider_config.py

# Request settings and regular expressions used by the dytt8.net spider.

# Header values that are identical in both header sets below.
_HOST = 'www.dytt8.net'
_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400'

# Header set carrying a session cookie and an explicit Accept-Encoding.
headers = {
    'Cookie': '37cs_user=37cs63629906334; XLA_CI=3e976860bea5549a9a73e10df8153fcd; 37cs_pidx=2; 37cs_show=253%2C75; cscpvrich5041_fidx=3',
    'Host': _HOST,
    'Accept': _ACCEPT,
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': _USER_AGENT,
}

# Header set carrying a Referer that points at the site root instead of a cookie.
other_headers = {
    'User-Agent': _USER_AGENT,
    'Referer': 'http://www.dytt8.net/',
    'Host': _HOST,
    'Accept': _ACCEPT,
}

# Site root; relative links found on the index are appended to it.
url_dytt = "http://www.dytt8.net"

# Two capture groups: (relative detail-page href, link text).
# NOTE(review): the literal anchor text must match the site's HTML exactly - confirm against the live page.
re_strIndex = r'>最新電影下載下傳</a>]<a\shref=\'(.*?)\'>(.*?)</a><br'

# One capture group: the link text inside the WORD-WRAP table cell.
re_strLink = r'<td\sstyle="WORD-WRAP.*?<a\shref=".*?" target="_blank" rel="external nofollow" >(.*?)</a></td>'
           

\ThunderAndSpider\message_spider\dytt_spider.py(簡單實作的爬蟲類)

import requests
import re

from message_spider.spider_config import *
from requests.exceptions import RequestException

class dytt_spider:
    """Crawl a movie index page and the download links on each detail page.

    The spider fetches the page at ``self.url``, extracts (href, title) pairs
    with a regular expression, fetches every linked detail page and extracts
    the download links found there.
    """

    def __init__(self, url=None):
        """Create a spider.

        Args:
            url: Base URL of the site to crawl. When omitted, ``url_dytt``
                from the spider configuration is used.
        """
        self.url = url_dytt if url is None else url

    def get_html(self, url, headers, timeout=10):
        """Fetch *url* and return its decoded text.

        Args:
            url: Address of the page to download.
            headers: Mapping of HTTP request headers to send.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the caller forever.

        Returns:
            The page text, or ``None`` when the request fails or the status
            code is not 200.
        """
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        # Let requests guess the charset from the body; the declared one
        # produces garbled Chinese text on this site.
        response.encoding = response.apparent_encoding
        return response.text

    def _get_re_findall_items(self, html, re_str):
        """Return every match of *re_str* in *html* ('.' also matches newlines).

        An empty or missing page yields an empty list instead of an error.
        """
        if not html:
            return []
        return re.compile(re_str, re.S).findall(html)

    def get_index(self, html, re_str):
        """Yield ``{"name", "url"}`` dicts for each index entry in *html*.

        *re_str* must have two groups: the relative href and the link text.
        The href is appended to the spider's base URL.
        """
        for href, name in self._get_re_findall_items(html, re_str):
            yield {
                "name": name,
                "url": self.url + href,
            }

    def get_thunder_link(self, html, re_str):
        """Yield ``{"thunder": link}`` for every match of *re_str* in *html*."""
        for link in self._get_re_findall_items(html, re_str):
            yield {
                "thunder": link,
            }

    def get_all_thunderlink(self):
        """Yield one dict per download link reachable from the index page.

        Each dict maps "影片名:" to the movie title and "磁力連結:" to the link.
        Nothing is yielded when the index page cannot be fetched; detail
        pages that fail to download are skipped.
        """
        index = self.get_html(self.url, headers)
        if index is None:
            # The index request failed; there is nothing to parse.
            return
        for item in self.get_index(index, re_strIndex):
            html = self.get_html(item['url'], other_headers)
            if not html:
                continue
            for x in self.get_thunder_link(html, re_strLink):
                yield {
                    "影片名:": item["name"],
                    "磁力連結:": x['thunder'],
                }

if __name__ == "__main__":
    # Manual smoke test: crawl the configured site and print every link found.
    # The spider defaults to the configured site URL, so no argument is needed.
    dytt = dytt_spider()
    for mess in dytt.get_all_thunderlink():
        print(mess)
           

\ThunderAndSpider\thunder\thunder_config.py

# NOTE: change this to the full path of the Thunder executable on your machine.
# Raw string so the backslash is kept literally rather than parsed as an escape.
thunder_path = r'E:\Thunder.exe'
# NOTE: change this to the directory where Thunder stores downloaded files.
save_path = 'G:\\thunder_download\\'
           

\ThunderAndSpider\thunder\dytt_thunder.py

import os, time
#import threading
from thunder.thunder_config import *

class my_thunder:
    """Hand one download URL to the Thunder client and look for its output files.

    Attributes:
        url: The download link passed to the constructor.
        filename: Last path component of ``url``.
        args: Shell-style command string ``"<thunder_path>" <url>``. Kept for
            backward compatibility; it is no longer executed through a shell.
    """

    def __init__(self, url):
        self.url = url
        self.filename = os.path.split(self.url)[1]
        self.args = r'"{thunder_path}" {url}'.format(thunder_path=thunder_path, url=url)

    def start_target(self):
        """Launch the Thunder executable with the URL as its only argument.

        The URL comes from scraped HTML, so it is passed as a separate
        argument instead of being spliced into a shell command line: shell
        metacharacters such as ``&`` would otherwise split or inject commands.
        The call blocks until the launched process returns. A failure to
        start the executable is reported on stdout rather than raised.
        """
        import subprocess

        print("準備下載下傳---{name}".format(name=self.filename))
        try:
            subprocess.run([thunder_path, self.url])
        except OSError as exc:
            print("Could not launch {path}: {err}".format(path=thunder_path, err=exc))

    def check_start(self):
        """Return True when ``<filename>.xltd`` exists in ``save_path``."""
        check_file = self.filename + ".xltd"
        return os.path.exists(os.path.join(save_path, check_file))

    def check_end(self):
        """Return True when ``<filename>`` exists in ``save_path``."""
        return os.path.exists(os.path.join(save_path, self.filename))

    # Disabled polling helper, kept verbatim as an inert string literal.
    '''
    def download(self):
        self.start_target()
        print("正在下載下傳{name}".format(name=self.filename))
        if self.check_start():
            while True:
                time.sleep(60)
                if self.check_end():
                    print("下載下傳完成")
                    return True
        else:
            print("下載下傳失敗")
            return False
    '''
           

\ThunderAndSpider\win_gui\main_gui.py(此處設計界面)

from tkinter import *

class MainGUI:
    """Fixed-size 700x500 Tk window with three labelled inputs and three buttons.

    Widgets are exposed as attributes so callers can fill them and attach
    commands: ``entry_01``, ``text_01``, ``entry_02``, ``frm``, ``btn_01``,
    ``btn_02`` and ``btn_03``. No button commands are attached here.
    """

    def __init__(self):
        window = Tk()
        window.title("電影下載下傳")
        window.geometry("700x500")
        window.resizable(False, False)
        self.root = window
        self._set_gui()

    def open_gui(self):
        """Enter the Tk main loop of the root window."""
        self.root.mainloop()

    def _set_gui(self):
        """Create and grid all widgets of the window."""
        root = self.root

        # One labelled input per grid row: (caption, attribute name, factory).
        labelled_rows = (
            ("資源來源:", "entry_01", lambda: Entry(root)),
            ("資源種子:", "text_01", lambda: Text(root)),
            ("目前電影:", "entry_02", lambda: Entry(root, width=300)),
        )
        for row, (caption, attr, build) in enumerate(labelled_rows):
            Label(root, text=caption).grid(row=row, column=0)
            widget = build()
            widget.grid(row=row, column=1, sticky=W)
            setattr(self, attr, widget)

        # Button strip in its own frame below the inputs.
        self.frm = Frame(root)
        self.frm.grid(row=3, column=1, sticky=W)
        button_specs = (
            ("btn_01", "上一部"),
            ("btn_02", "下一部"),
            ("btn_03", "下載下傳目前部"),
        )
        for column, (attr, caption) in enumerate(button_specs, start=1):
            button = Button(self.frm, text=caption)
            button.grid(row=0, column=column)
            setattr(self, attr, button)
           

\ThunderAndSpider\main.py

from win_gui.main_gui import *
from thunder.dytt_thunder import *
from message_spider.dytt_spider import *
from message_spider.spider_config import url_dytt

import threading

# Module-level state shared by the callbacks below:
#   urls         - download links in the order they were collected
#   link_message - text listing every title followed by its link
#   urls_index   - position in `urls` of the link currently selected
urls, link_message, urls_index = [], "", 0

def get_urls_and_linkmessage(spider):
    """Drain *spider* into the module-level ``urls`` and ``link_message``.

    For every record produced by ``spider.get_all_thunderlink()`` the link is
    appended to ``urls`` and "<title>\\n<link>\\n\\n" is appended to
    ``link_message``.
    """
    global urls
    global link_message
    for record in spider.get_all_thunderlink():
        title = record["影片名:"]
        link = record["磁力連結:"]
        link_message = link_message + title + "\n" + link + "\n\n"
        urls.append(link)

def change_entry_a(entry):
    """Select the previous link, wrapping to the last one, and show it in *entry*.

    Args:
        entry: Widget offering ``delete(first, last)`` and
            ``insert(index, text)``; its content is replaced by the link.

    Does nothing when no links have been collected, instead of raising
    ``IndexError`` on the empty list.
    """
    global urls_index
    if not urls:
        return
    # Modulo arithmetic gives the wrap-around from the first link to the last.
    urls_index = (urls_index - 1) % len(urls)
    entry.delete(0, END)
    # The entry was just emptied, so the text goes in at the start.
    entry.insert(0, urls[urls_index])

def change_entry_b(entry):
    """Select the next link, wrapping to the first one, and show it in *entry*.

    Args:
        entry: Widget offering ``delete(first, last)`` and
            ``insert(index, text)``; its content is replaced by the link.

    Does nothing when no links have been collected, instead of raising
    ``IndexError`` on the empty list.
    """
    global urls_index
    if not urls:
        return
    # Modulo arithmetic gives the wrap-around from the last link to the first.
    urls_index = (urls_index + 1) % len(urls)
    entry.delete(0, END)
    # The entry was just emptied, so the text goes in at the start.
    entry.insert(0, urls[urls_index])

def download_current():
    """Start a Thunder download of the currently selected link.

    Does nothing when no links have been collected, instead of raising
    ``IndexError`` inside the button callback.
    """
    if not urls:
        return
    thunder = my_thunder(urls[urls_index])
    # start_target runs an external command, so it is run on its own thread
    # rather than inside the calling button callback.
    new_thread = threading.Thread(target=thunder.start_target)
    new_thread.start()

def mainGUI_config(mainWin):
    """Fill the window's widgets with the crawl results and wire up its buttons.

    Args:
        mainWin: Window object exposing ``entry_01``, ``text_01``,
            ``entry_02`` and ``btn_01``..``btn_03``.
    """
    # The END index inserts after any existing content.
    mainWin.entry_01.insert(END, url_dytt)
    # Text indices are "line.column" strings; "1.0" is the very first character.
    mainWin.text_01.insert("1.0", link_message)
    if urls:
        # Only show a link when the crawl found one; indexing an empty list
        # would abort start-up before the window is ever displayed.
        mainWin.entry_02.insert(0, urls[0])
    mainWin.btn_01.config(command=lambda: change_entry_a(mainWin.entry_02))
    mainWin.btn_02.config(command=lambda: change_entry_b(mainWin.entry_02))
    mainWin.btn_03.config(command=download_current)

def main():
    """Build the window, crawl the links, populate the widgets and run the GUI."""
    print("++++主程式啟動++++")
    window = MainGUI()

    # The crawl runs to completion here, before the window's main loop starts.
    get_urls_and_linkmessage(dytt_spider())
    mainGUI_config(window)
    window.open_gui()

# Run the application only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main()
           

4.項目成果

目標站點:

Step05:爬蟲小項目,爬取最新電影迅雷下載位址

項目主界面

Step05:爬蟲小項目,爬取最新電影迅雷下載位址