"""Crawl Sogou's WeChat article search (http://weixin.sogou.com/weixin?type=2&query=...).

When scraping a search engine, watch how the ``page`` and ``key`` parameters
change between result pages, and make sure the regular expressions are correct.
The page number is appended after the query.  Three threads cooperate: the
first pushes result URLs onto a queue, the second reads each URL and saves the
wanted information, and the third ends the program once the queue is empty.
"""
import os
import queue
import re
import threading
import time
import urllib.error
import urllib.request
# Shared FIFO queue: GetUrl (producer) puts article URLs, GetConnect (consumer)
# takes them, and Conrl watches it to decide when the program is finished.
urlqueue = queue.Queue()
# Fetch one URL and return the HTML document.
def GetData(url):
    """Download *url* and return the response body decoded as UTF-8.

    Returns None when the request fails; the error is printed and the
    thread backs off briefly.  Callers must check for None before
    parsing the result.
    """
    try:
        # Build a per-request Request object instead of the original
        # build_opener()/install_opener() dance: install_opener mutates
        # process-wide state, which is unsafe while two threads fetch
        # concurrently.
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/62.0.3202.94 Safari/537.36",
        })
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)  # back off after a network/HTTP failure
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return None  # explicit: every error path yields None
# Thread 1 (producer): crawl the Sogou WeChat result pages and enqueue
# every article URL found on them.
class GetUrl(threading.Thread):
    def __init__(self, key, pagestart, pageend, urlqueue):
        """Search *key* on pages *pagestart*..*pageend* (inclusive) and
        put each extracted article URL onto *urlqueue*."""
        threading.Thread.__init__(self)
        self.key = key
        self.pagestart = pagestart
        self.pageend = pageend
        self.urlqueue = urlqueue

    def run(self):
        keycode = urllib.request.quote(self.key)
        for page in range(self.pagestart, self.pageend + 1):
            # BUG FIX: the original also percent-quoted "&page=", turning
            # the separator into "%26page%3D" so the page number was folded
            # into the query value and paging never worked.
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + "&page=" + str(page))
            data = GetData(url)
            if data is None:
                continue  # fetch failed; GetData already reported it
            listurlpattern = '<div class="txt-box">.*?(http://.*?)"'
            for page_url in re.compile(listurlpattern, re.S).findall(data):
                # Sogou HTML-escapes "&" as "&amp;" inside href attributes.
                self.urlqueue.put(page_url.replace("amp;", ""))
        # NOTE: the original called urlqueue.task_done() here; task_done()
        # belongs to the *consumer* side of a Queue, so it was removed.
# Thread 2 (consumer): pull article URLs off the queue, extract title and
# body text, and append them to "1.html".
class GetConnect(threading.Thread):
    def __init__(self, urlqueue):
        """Consume article URLs from *urlqueue* until it stays empty."""
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        html1 = '''
<html>
<head>
<title>微信文章</title>
</head>
<body>
'''
        # Write the page header, then keep the file open in append mode
        # for the whole crawl (with-blocks guarantee the handle is closed).
        with open("1.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
        i = 1
        with open("1.html", 'ab') as fh:
            while True:
                try:
                    # BUG FIX: a plain get() blocks forever once the
                    # producer finishes, so the footer below and the
                    # close were never reached.  Time out and treat a
                    # persistently empty queue as "done".
                    url = self.urlqueue.get(timeout=15)
                except queue.Empty:
                    break
                try:
                    print(url)
                    data = GetData(url)
                    if data is None:
                        continue  # fetch failed; skip this article
                    titlepat = '<title>(.*?)</title>'
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    title = re.compile(titlepat, re.S).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fall back to "no" when a pattern found nothing.
                    thistitle = title[0] if title else "no"
                    thiscontent = content[0] if content else "no"
                    dataall = "<p>标題是:"+thistitle+"</p><p>内容是:"+thiscontent+"</p><br>"
                    fh.write(dataall.encode('utf-8'))
                    print("第"+str(i)+"個網頁處理")
                    i += 1
                except urllib.error.URLError as e:
                    # (was urllib.request.URLError — URLError lives in
                    # urllib.error)
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
        html2 = '''
</body>
</html>
'''
        with open("1.html", 'ab') as fh:
            fh.write(html2.encode('utf-8'))
# Thread 3 (watchdog): report progress once a minute and terminate the
# whole process when the URL queue has drained.
class Conrl(threading.Thread):
    def __init__(self, urlqueue):
        """Watch *urlqueue* and stop the program once it is empty."""
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程式執行ing")
            # Sleep first so the producer gets time to enqueue URLs
            # before the very first emptiness check.
            time.sleep(60)
            if self.urlqueue.empty():
                print("執行完畢")
                # BUG FIX: exit() only raises SystemExit inside *this*
                # thread and never stops the other (non-daemon) threads;
                # os._exit() actually terminates the process.
                os._exit(0)
if __name__ == "__main__":
    # Crawl result pages 1-2 of the Sogou WeChat search for keyword "IT".
    # The guard keeps the threads from starting when this module is
    # merely imported.
    key = "IT"
    pagestart = 1
    pageend = 2
    thread1 = GetUrl(key, pagestart, pageend, urlqueue)   # producer
    thread1.start()
    thread2 = GetConnect(urlqueue)                        # consumer
    thread2.start()
    thread3 = Conrl(urlqueue)                             # watchdog
    thread3.start()