天天看點

Python爬蟲多線程爬搜尋引擎

爬搜尋引擎的資訊要注意page和key的變化,還有正規表達式一定要正确

爬下面的URL:    http://weixin.sogou.com/weixin?type=2&query=

後面再跟page資訊

一共三個線程:第一個負責把URL存到隊列中去,第二個負責從隊列取出URL、讀取需要的資訊并儲存,第三個負責監控,如果隊列為空,則結束

import queue
import threading
import urllib.request
import urllib.error
import re
import time

# Shared FIFO queue of article URLs: filled by the GetUrl producer thread,
# drained by the GetConnect consumer thread, polled by the Conrl watchdog.
urlqueue = queue.Queue()

# Fetch one HTML document.
def GetData(url):
    """Fetch *url* with a browser-like User-Agent and return the body
    decoded as UTF-8.

    Returns None when the request fails; the error is printed and the
    calling thread pauses briefly (10 s for URL errors, 1 s otherwise)
    before giving up on this URL.
    """
    try:
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        # BUG fix: the original called urllib.request.install_opener(opener),
        # mutating process-global state from multiple threads (a race).
        # Using the opener directly is equivalent and thread-safe.
        data = opener.open(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    # Explicit (was implicit) failure value so callers can test for it.
    return None


# thread1: producer
class GetUrl(threading.Thread):
    """Producer thread: crawl Sogou Weixin search result pages for *key*
    from page *pagestart* to *pageend* (inclusive) and put every article
    URL found into *urlqueue*."""

    def __init__(self, key, pagestart, pageend, urlqueue):
        threading.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        keycode = urllib.request.quote(self.key)
        pagecode = urllib.request.quote("&page=")
        for page in range(self.pagestart, self.pageend+1):
            url = "http://weixin.sogou.com/weixin?type=2&query="+keycode+pagecode+str(page)
            data = GetData(url)
            # BUG fix: GetData returns None on failure; findall(None) would
            # raise TypeError and kill the producer thread.
            if data is None:
                continue
            listurlpattern = '<div class="txt-box">.*?(http://.*?)"'
            page_urls = re.compile(listurlpattern, re.S).findall(data)
            for page_url in page_urls:
                # Sogou escapes '&' as '&amp;' inside href attributes.
                page_url = page_url.replace("amp;", "")
                # BUG fix: removed the task_done() call that followed put().
                # task_done() must be called by the *consumer* after get();
                # calling it in the producer corrupts the queue's
                # unfinished-task count and breaks any future queue.join().
                self.urlqueue.put(page_url)


# thread2: consumer
class GetConnect(threading.Thread):
    """Consumer thread: take article URLs off the queue, fetch each page,
    extract its title and content with regexes, and append them to the
    report file 1.html (header + one <p> pair per article + footer)."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
        <html>
        <head>
        <title>微信文章</title>
        </head>
        <body>
        '''
        # 'with' guarantees the handle is closed even on error
        # (the original leaked fh when the loop never terminated).
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        i = 1
        with open("1.html", 'ab') as fh:
            while True:
                try:
                    # BUG fix: the original blocking get() never returned when
                    # the queue stayed empty, so the loop -- and the footer
                    # code after it -- was unreachable dead code.  Stop after
                    # the producer has been idle for a minute.
                    url = self.urlqueue.get(timeout=60)
                except queue.Empty:
                    break
                try:
                    print(url)
                    data = GetData(url)
                    # GetData returns None on failure; skip the article.
                    if data is None:
                        continue
                    titlepat = '<title>(.*?)</title>'
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    title = re.compile(titlepat, re.S).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    thistitle = "no"
                    thiscontent = "no"
                    if (title != []):
                        thistitle = title[0]
                    if(content != []):
                        thiscontent = content[0]
                    dataall = "<p>标題是:"+thistitle+"</p><p>内容是:"+thiscontent+"</p><br>"
                    fh.write(dataall.encode('utf-8'))
                    print("第"+str(i)+"個網頁處理")
                    i += 1
                except urllib.error.URLError as e:
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
        # Now reachable: close the HTML document properly.
        html2 = '''
                </body>
                </html>
                '''
        with open("1.html", 'ab') as fh:
            fh.write(html2.encode('utf-8'))


# thread3: watchdog ("Conrl" kept for compatibility; presumably "Control")
class Conrl(threading.Thread):
    """Watchdog thread: print a heartbeat once a minute and finish once
    the URL queue has been drained."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while(True):
            print("程式執行ing")
            time.sleep(60)
            if self.urlqueue.empty():
                print("執行完畢")
                # BUG fix: exit() only raises SystemExit inside *this*
                # thread (and relies on the optional site module); it never
                # stopped the rest of the program.  Returning from run() is
                # the correct way to end a thread.
                break

# Entry point: guard so that importing this module does not start the
# crawler threads as a side effect.
if __name__ == "__main__":
    key = "IT"        # search keyword
    pagestart = 1     # first result page to crawl
    pageend = 2       # last result page to crawl (inclusive)
    thread1 = GetUrl(key, pagestart, pageend, urlqueue)
    thread1.start()
    thread2 = GetConnect(urlqueue)
    thread2.start()
    thread3 = Conrl(urlqueue)
    thread3.start()