天天看點

百度貼吧 | 通用抓圖腳本

多程序優勢:單個程序的崩潰,不會影響其它程序

随之而來的問題是,程序之間,資源不共享,資訊不共享,是以程序通訊的問題,是實作多程序協作,必須解決的問題

為解決程序間的通訊,人們常用的方法是 --> 建立一個中間人(隊列),作為他們交流的中介...

以爬取某貼吧文章中的所有圖檔為例:

大緻需要四步

一. 擷取所有的 文章的目錄

二. 根據文章的目錄,擷取 文章詳情頁

三. 根據 文章詳情頁 的源碼,擷取内嵌的 圖檔url資訊

四. 根據 圖檔url資訊,下載圖檔到本地

通信圖

動圖_腳本運作效果

運作中

from multiprocessing import Process, Queue
import time
from time import sleep
import requests
from lxml import etree
import os, sys
import re

class BaiduTb(object):
    """Multi-process crawler that downloads every image from the posts of a
    Baidu Tieba forum.

    Pipeline — one worker process per stage, linked by multiprocessing queues:
        1. get_list_page_urls   -> q_list   (forum listing-page URLs)
        2. get_detail_page_urls -> q_detail ((title, post URL) tuples)
        3. get_image_urls       -> q_image  ((title, image URL) tuples)
        4. save_image           -> image files on disk, one folder per post
    """

    def __init__(self, tb_name):
        # First listing page of the requested forum (tb_name is the forum name).
        self.start_url = "https://tieba.baidu.com/f?ie=utf-8&kw=" + tb_name
        # Inter-process queues wiring the four pipeline stages together.
        self.q_list = Queue()
        self.q_detail = Queue()
        self.q_image = Queue()
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0) "
        }
        # Worker process handles, populated by run().
        self.p1 = None
        self.p2 = None
        self.p3 = None
        self.p4 = None

    def get_response_content(self, url):
        """GET *url* and return the raw response body as bytes.

        A timeout is set so a stalled connection cannot hang a worker process
        forever (the original call had none); callers catch the resulting
        exception along with other network errors.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        print("-->", response)
        return response.content

    def get_next_url(self, now_url):
        """Return the absolute URL of the listing page that follows *now_url*.

        Raises IndexError when the page has no "next" link (i.e. last page).
        """
        content = self.get_response_content(now_url)
        # Parse the response into an lxml element tree.
        html = etree.HTML(content)
        # NOTE: the trailing space inside the class attribute value matches
        # the actual markup served by tieba — do not "fix" it.
        next_url = "http:" + html.xpath('//a[@class="next pagination-item "]/@href')[0]
        return next_url

    def get_list_page_urls(self, q_list):
        """Stage 1: push every listing-page URL of the forum onto *q_list*.

        Fix over the original: the bare ``except`` used to swallow the
        IndexError raised on the last page and the loop then re-queued the
        same URL forever; we now stop cleanly instead.
        """
        while True:
            sleep(1)
            q_list.put(self.start_url)
            try:
                self.start_url = self.get_next_url(self.start_url)
            except Exception:
                # No "next" link (last page) or a network failure: stop feeding.
                print("主程式..")
                break

    def get_detail_page_urls(self, q_list, q_detail):
        """Stage 2: read listing-page URLs from *q_list*, scrape post titles
        and URLs out of each page, and push (title, absolute URL) tuples
        onto *q_detail*.

        Fix over the original: a failed fetch used to fall through and use
        an undefined ``list_url`` (NameError); failures now skip the
        iteration instead.
        """
        while True:
            sleep(1)
            if q_list.empty():
                continue
            try:
                list_url = q_list.get(True)
                content = self.get_response_content(list_url)
            except Exception:
                print("第二程序已完成..")
                continue

            # Parse the listing page and pull out title/url pairs.
            html = etree.HTML(content)
            detail_titles_list = html.xpath('//*[@id="thread_list"]//div/div[2]/div[1]/div[1]/a/text()')
            detail_urls_list = html.xpath('//*[@id="thread_list"]//div/div[2]/div[1]/div[1]/a/@href')

            # Post links in the listing are relative; make them absolute.
            for detail_title, rel_url in zip(detail_titles_list, detail_urls_list):
                q_detail.put((detail_title, "https://tieba.baidu.com" + rel_url))

    def get_image_urls(self, q_detail, q_image):
        """Stage 3: read (title, post URL) tuples from *q_detail*, scrape the
        post page for embedded image URLs, and push (title, image URL)
        tuples onto *q_image*.

        Fix over the original: on a failed queue read / fetch the code used
        to continue into code referencing the undefined ``detail_url``
        (NameError); failures now skip the iteration.
        """
        while True:
            sleep(3)
            try:
                detail_title, detail_url = q_detail.get()
                content = self.get_response_content(detail_url)
            except Exception:
                print("第三程序已完成")
                continue
            # Images inside a post body are nested under <cc> elements.
            html = etree.HTML(content)
            for image_url in html.xpath('//cc//img/@src'):
                q_image.put((detail_title, image_url))

    @staticmethod
    def _image_filename(image_url):
        """Derive a short local file name from an image URL.

        Keeps the original behaviour (last 10 characters of the URL) but no
        longer raises AttributeError on URLs shorter than 10 characters —
        those fall back to the whole URL.
        """
        match = re.match(r".*(.{10})", image_url)
        return match.group(1) if match else image_url

    def save_image(self, q_image):
        """Stage 4: read (title, image URL) tuples from *q_image* and write
        each image into a directory named after its post title.  The worker
        stops once all three pipeline queues are drained.
        """
        while True:
            sleep(3)
            if q_image.empty():
                continue
            image_info = q_image.get()
            title, image_url = image_info[0], image_info[1]
            image_name = self._image_filename(image_url)
            print("圖檔名稱為:", image_name, "圖檔位址為:", image_url, "文章标题為:", title)
            # exist_ok replaces the original try/mkdir/except-pass dance.
            os.makedirs("./%s" % title, exist_ok=True)
            try:
                # Download BEFORE opening the file: the original opened the
                # file first, leaving an empty file behind on a failed fetch.
                data = self.get_response_content(image_url)
                with open("./%s/%s" % (title, image_name), "wb") as f:
                    f.write(data)
            except Exception:
                print("第四程序已完成")
                continue
            # Whole pipeline drained -> this worker's job is done.
            if self.q_list.empty() and self.q_detail.empty() and self.q_image.empty():
                break

    def run(self):
        """Start the four pipeline stages as separate processes and wait for
        them to finish."""
        print("開始執行")

        self.p1 = Process(target=self.get_list_page_urls, args=(self.q_list,))
        self.p2 = Process(target=self.get_detail_page_urls, args=(self.q_list, self.q_detail))
        self.p3 = Process(target=self.get_image_urls, args=(self.q_detail, self.q_image))
        self.p4 = Process(target=self.save_image, args=(self.q_image,))

        for worker in (self.p1, self.p2, self.p3, self.p4):
            worker.start()
        for worker in (self.p1, self.p2, self.p3, self.p4):
            worker.join()


def main():
    """Entry point: ask the user for a forum name and run the crawler."""
    tieba_name = input("請輸入貼吧名稱:")
    crawler = BaiduTb(tieba_name)
    crawler.run()


if __name__ == '__main__':
    main()

           

多程序和和多線程哪個更好用?

追求資源使用率,考慮多線程

追求程式穩定性,考慮多程序

搞專業爬蟲的話,先保證網速夠好,再考慮多程序還是多線程~

繼續閱讀