性能相關
學習參考:http://www.cnblogs.com/wupeiqi/articles/6229292.html
在編寫爬蟲時,性能的消耗主要在IO請求中,當單程序單線程模式下請求URL時必然會引起等待,進而使得請求整體變慢。

import requests
def fetch_async(url):
    """Issue a blocking GET for ``url`` and return the response object."""
    return requests.get(url)


# Synchronous baseline: every call blocks until its response has arrived,
# so the URLs are fetched strictly one after another.
url_list = ['http://www.github.com', 'http://www.bing.com']
for url in url_list:
    fetch_async(url)
1.同步執行

from concurrent.futures import ThreadPoolExecutor
import requests
def fetch_async(url):
    """Issue a blocking GET for ``url``; submitted to the thread pool below."""
    return requests.get(url)


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
with pool:
    # Leaving the ``with`` block performs pool.shutdown(wait=True).
    for url in url_list:
        pool.submit(fetch_async, url)
2.多線程執行

from concurrent.futures import ThreadPoolExecutor
import requests
def fetch_async(url):
    """Issue a blocking GET for ``url``; submitted to the thread pool below."""
    return requests.get(url)


def callback(future):
    """Print the result carried by a completed future."""
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']
pool = ThreadPoolExecutor(5)
with pool:
    # Leaving the ``with`` block performs pool.shutdown(wait=True).
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
2.多線程+回調函數執行

from concurrent.futures import ProcessPoolExecutor
import requests
def fetch_async(url):
    """Issue a blocking GET for ``url``; executed inside a worker process."""
    return requests.get(url)


url_list = ['http://www.github.com', 'http://www.bing.com']

# The pool must only be created by the main module: worker processes started
# with the "spawn" method re-import this file, and without the guard each
# worker would try to build its own pool.
if __name__ == '__main__':
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        pool.submit(fetch_async, url)
    pool.shutdown(wait=True)
3.多程序執行

from concurrent.futures import ProcessPoolExecutor
import requests
def fetch_async(url):
    """Issue a blocking GET for ``url``; executed inside a worker process."""
    return requests.get(url)


def callback(future):
    """Print the result carried by a completed future."""
    print(future.result())


url_list = ['http://www.github.com', 'http://www.bing.com']

# The pool must only be created by the main module: worker processes started
# with the "spawn" method re-import this file, and without the guard each
# worker would try to build its own pool.
if __name__ == '__main__':
    pool = ProcessPoolExecutor(5)
    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)
    pool.shutdown(wait=True)
3.多程序+回調函數執行
通過上述代碼均可以完成對請求性能的提高,但多線程和多程序的缺點是在IO阻塞時會造成線程和程序的浪費,是以異步IO會是首選:

import asyncio
async def func1():
    """Print a marker, sleep five seconds without blocking the loop, print again."""
    print('before...func1......')
    await asyncio.sleep(5)
    print('end...func1......')


async def _gather_all(coros):
    """Await all ``coros`` concurrently from inside the running loop."""
    return await asyncio.gather(*coros)


tasks = [func1(), func1()]
# Use an explicit loop: generator-based coroutines (@asyncio.coroutine) and
# implicit loop creation are no longer available on current Python versions.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(_gather_all(tasks))
loop.close()
1.asyncio示例1

import asyncio
async def fetch_async(host, url='/'):
    """Send a raw HTTP/1.0 GET to ``host`` on port 80 and print the reply.

    :param host: server host name; also used for the Host header
    :param url: request path, defaults to '/'
    :return: None
    """
    print(host, url)
    reader, writer = await asyncio.open_connection(host, 80)
    try:
        request_header_content = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n' % (url, host)
        writer.write(request_header_content.encode('utf-8'))
        await writer.drain()
        # read() with no size reads until the peer closes the connection.
        text = await reader.read()
        print(host, url, text)
    finally:
        # Release the socket even when writing or reading fails.
        writer.close()


async def _gather_all(coros):
    """Await all ``coros`` concurrently from inside the running loop."""
    return await asyncio.gather(*coros)


tasks = [
    fetch_async('www.cnblogs.com', '/wupeiqi/'),
    fetch_async('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091'),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
results = loop.run_until_complete(_gather_all(tasks))
loop.close()
1.asyncio示例2

import aiohttp
import asyncio
async def fetch_async(url):
    """GET ``url`` through aiohttp and print the response object.

    The session and the response are both used as async context managers so
    the connection is released even if printing or reading fails.
    """
    print(url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # To inspect the body instead:
            #     data = await response.read()
            #     print(url, data)
            print(url, response)


async def _gather_all(coros):
    """Await all ``coros`` concurrently from inside the running loop."""
    return await asyncio.gather(*coros)


tasks = [fetch_async('http://www.google.com/'), fetch_async('http://www.chouti.com/')]
event_loop = asyncio.new_event_loop()
asyncio.set_event_loop(event_loop)
results = event_loop.run_until_complete(_gather_all(tasks))
event_loop.close()
2.asyncio + aiohttp

import asyncio
import requests
async def fetch_async(func, *args):
    """Run the blocking call ``func(*args)`` on the default executor.

    The awaited result is expected to expose ``url`` and ``content``
    attributes, which are printed.
    """
    loop = asyncio.get_event_loop()
    # None selects the loop's default executor.
    response = await loop.run_in_executor(None, func, *args)
    print(response.url, response.content)


async def _gather_all(coros):
    """Await all ``coros`` concurrently from inside the running loop."""
    return await asyncio.gather(*coros)


tasks = [
    fetch_async(requests.get, 'http://www.cnblogs.com/wupeiqi/'),
    fetch_async(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
results = loop.run_until_complete(_gather_all(tasks))
loop.close()
3.asyncio + requests

# Patch the standard library first: requests must be imported only after
# monkey.patch_all() so that the socket/ssl modules it uses are the
# cooperative versions.
from gevent import monkey
monkey.patch_all()

import gevent
import requests


def fetch_async(method, url, req_kwargs):
    """Send one HTTP request and print the final URL and body.

    :param method: HTTP method name passed to ``requests.request``
    :param url: target URL
    :param req_kwargs: dict of extra keyword arguments for ``requests.request``
    """
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# ##### Send the requests #####
gevent.joinall([
    gevent.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(fetch_async, method='get', url='https://github.com/', req_kwargs={}),
])

# ##### Send the requests (a pool caps the number of greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(fetch_async, method='get', url='https://www.python.org/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     pool.spawn(fetch_async, method='get', url='https://www.github.com/', req_kwargs={}),
# ])
4.gevent + requests

import grequests
# Unsent request objects; nothing goes over the wire until grequests.map()
# is called on the list (see the commented-out variants below).
request_list = [
    grequests.get(target, **options)
    for target, options in (
        ('http://httpbin.org/delay/1', {'timeout': 0.001}),
        ('http://fakedomain/', {}),
        ('http://httpbin.org/status/500', {}),
    )
]

# ##### Run the requests and collect the list of responses #####
# response_list = grequests.map(request_list)
# print(response_list)

# ##### Run the requests and collect the responses (handling exceptions) #####
# def exception_handler(request, exception):
#     print(request, exception)
#     print("Request failed")
# response_list = grequests.map(request_list, exception_handler=exception_handler)
# print(response_list)
5.grequests

from twisted.web.client import getPage, defer
from twisted.internet import reactor
def all_done(arg):
    """Stop the reactor; attached to the DeferredList below."""
    reactor.stop()


def callback(contents):
    """Print the contents delivered by one page download."""
    print(contents)


url_list = ['http://www.bing.com', 'http://www.baidu.com', ]
deferred_list = []
for url in url_list:
    # getPage is given the URL as UTF-8 encoded bytes.
    deferred = getPage(url.encode('utf8'))
    deferred.addCallback(callback)
    deferred_list.append(deferred)

# all_done is registered with addBoth, so it runs whether the combined
# deferred ends in success or failure.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(all_done)
reactor.run()
6.Twisted示例

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop
# Number of fetches whose outcome has not been reported yet; the IO loop is
# stopped when it drops back to zero.
_pending = 0


def handle_response(response):
    """Print the outcome of one fetch: its error if set, otherwise its body.

    :param response: response object; ``error`` and ``body`` are read
    :return: None
    """
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)


def _on_fetch_done(future):
    """Report one finished fetch and stop the IO loop after the last one."""
    global _pending
    try:
        response = future.result()
    except Exception as exc:
        # Failures that do not produce a response object (e.g. connection
        # errors) must still be counted, or the loop would never stop.
        print("Error:", exc)
    else:
        handle_response(response)
    _pending -= 1
    if _pending == 0:
        ioloop.IOLoop.current().stop()


def func():
    """Start one asynchronous fetch per URL."""
    global _pending
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    http_client = AsyncHTTPClient()
    _pending = len(url_list)
    for url in url_list:
        print(url)
        # fetch() returns a future; the callback argument of older Tornado
        # releases is not used so the code also runs on Tornado 6+.
        future = http_client.fetch(HTTPRequest(url), raise_error=False)
        future.add_done_callback(_on_fetch_done)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()
7.Tornado

from twisted.internet import reactor
from twisted.web.client import getPage
import urllib.parse
def one_done(arg):
    """Print whatever the request produced, then stop the reactor."""
    print(arg)
    reactor.stop()


# Form-encode the payload and convert it to bytes for getPage.
post_data = urllib.parse.urlencode({'check_data': 'adf'}).encode('utf8')
headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
response = getPage(
    b'http://dig.chouti.com/login',
    method=b'POST',
    postdata=post_data,
    cookies={},
    headers=headers,
)
# addBoth: one_done runs on success and on failure alike.
response.addBoth(one_done)
reactor.run()
Twisted更多
以上均是Python内置以及第三方子產品提供異步IO請求子產品,使用簡便大大提高效率,而對于異步IO請求的本質則是【非阻塞Socket】+【IO多路複用】:

import select
import socket
import time
class AsyncTimeoutException(TimeoutError):
    """Raised to signal that a request timed out."""

    def __init__(self, msg):
        super().__init__(msg)
        # Keep the message available as an attribute as well.
        self.msg = msg
class HttpContext(object):
    """Hold the basic data of one request and its buffered response."""

    def __init__(self, sock, host, port, method, url, data, callback, timeout=5):
        """
        :param sock: client socket used for the request
        :param host: host name of the server
        :param port: port of the server
        :param method: HTTP method
        :param url: request URL (path)
        :param data: request body
        :param callback: called as ``callback(context, response, exc)`` when
            the request finishes
        :param timeout: timeout in seconds, measured from construction
        """
        self.sock = sock
        self.callback = callback
        self.host = host
        self.port = port
        self.method = method
        self.url = url
        self.data = data
        self.timeout = timeout
        self.__start_time = time.time()
        self.__buffer = []

    def is_timeout(self):
        """Return True when more than ``timeout`` seconds have elapsed, else False."""
        return (self.__start_time + self.timeout) < time.time()

    def fileno(self):
        """Return the socket's file descriptor so select() can watch this object."""
        return self.sock.fileno()

    def write(self, data):
        """Append one chunk of response data to the buffer."""
        self.__buffer.append(data)

    def finish(self, exc=None):
        """Invoke the callback with the joined response, or with ``exc`` on failure."""
        if not exc:
            response = b''.join(self.__buffer)
            self.callback(self, response, exc)
        else:
            self.callback(self, None, exc)

    def send_request_data(self):
        """Build the raw HTTP/1.0 request as UTF-8 encoded bytes.

        A Content-Length header is added when there is a body, so the server
        can tell where the body ends. Empty or ``None`` data yields a request
        without a body.
        """
        data = self.data
        if isinstance(data, bytes):
            body = data
        else:
            body = (data or '').encode('utf8')
        header_lines = [
            '%s %s HTTP/1.0' % (self.method.upper(), self.url),
            'Host: %s' % self.host,
        ]
        if body:
            header_lines.append('Content-Length: %d' % len(body))
        head = '\r\n'.join(header_lines) + '\r\n\r\n'
        return head.encode('utf8') + body
class AsyncRequest(object):
    """Small select()-based event loop that drives non-blocking HTTP requests."""

    def __init__(self):
        # Contexts still waiting for response data (watched for readability).
        self.fds = []
        # Contexts whose request has not been sent yet (watched for writability).
        self.connections = []

    def add_request(self, host, port, method, url, data, callback, timeout=5):
        """Create a non-blocking socket, start connecting and register the request."""
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect((host, port))
        except BlockingIOError:
            # Expected for a non-blocking socket: the connect continues in the
            # background and completion is detected through select().
            pass
        req = HttpContext(client, host, port, method, url, data, callback, timeout)
        self.connections.append(req)
        self.fds.append(req)

    def _discard(self, context, exc=None):
        """Unregister ``context``, run its callback and close its socket."""
        for registry in (self.fds, self.connections):
            if context in registry:
                registry.remove(context)
        try:
            context.finish(exc)
        finally:
            # Always release the descriptor, even if the callback raises.
            context.sock.close()

    def _read_available(self, context):
        """Drain everything currently readable from ``context``'s socket."""
        while True:
            try:
                data = context.sock.recv(8096)
            except BlockingIOError:
                # Nothing more to read for now; wait for the next select().
                return
            except OSError as exc:
                # Timeouts, resets and refused connections end the request.
                self._discard(context, exc)
                return
            if not data:
                # Empty read: the peer closed the connection, response complete.
                self._discard(context)
                return
            context.write(data)

    def check_conn_timeout(self):
        """Abort every request in ``connections`` that has exceeded its timeout.

        NOTE(review): only requests that have not been sent yet are checked;
        a request that was sent but never answered is not timed out here.
        """
        expired = [context for context in self.connections if context.is_timeout()]
        for context in expired:
            self._discard(context, AsyncTimeoutException('請求逾時'))

    def running(self):
        """Event loop: poll the sockets until every request has finished."""
        # Checking before select() avoids polling with empty lists.
        while self.fds:
            r, w, _ = select.select(self.fds, self.connections, self.fds, 0.05)
            for context in r:
                self._read_available(context)
            for context in w:
                # Writable means the connect finished: send the request once.
                # Contexts already finished during the read phase are skipped.
                if context in self.fds:
                    try:
                        context.sock.sendall(context.send_request_data())
                    except OSError as exc:
                        self._discard(context, exc)
                        continue
                    self.connections.remove(context)
            self.check_conn_timeout()
if __name__ == '__main__':
    def callback_func(context, response, ex):
        """Print the outcome of one request.

        :param context: HttpContext object wrapping the request information
        :param response: response content of the request
        :param ex: the exception object if one occurred, otherwise None
        :return: None
        """
        print(context, response, ex)

    obj = AsyncRequest()
    # One plain GET for "/" on port 80 per host, all sharing the same callback.
    hosts = ('www.google.com', 'www.baidu.com', 'www.bing.com')
    url_list = [
        {'host': host, 'port': 80, 'method': 'GET', 'url': '/', 'data': '', 'timeout': 5,
         'callback': callback_func}
        for host in hosts
    ]
    for item in url_list:
        print(item)
        obj.add_request(**item)
    obj.running()
史上最牛逼的異步IO子產品
Scrapy
Scrapy是一個為了爬取網站資料,提取結構性資料而編寫的應用架構。 其可以應用在資料挖掘,資訊處理或存儲曆史資料等一系列的程式中。
其最初是為了頁面抓取 (更确切來說, 網絡抓取 )所設計的, 也可以應用在擷取API所傳回的資料(例如 Amazon Associates Web Services ) 或者通用的網絡爬蟲。Scrapy用途廣泛,可以用于資料挖掘、監測和自動化測試。
Scrapy 使用了 Twisted異步網絡庫來處理網絡通訊。整體架構大緻如下
Scrapy主要包括了以下元件:
-
引擎(Scrapy)
用來處理整個系統的資料流處理, 觸發事務(架構核心)
-
排程器(Scheduler)
用來接受引擎發過來的請求, 壓入隊列中, 并在引擎再次請求的時候傳回. 可以想像成一個URL(抓取網頁的網址或者說是連結)的優先隊列, 由它來決定下一個要抓取的網址是什麼, 同時去除重複的網址
-
下載下傳器(Downloader)
用于下載下傳網頁内容, 并将網頁内容傳回給蜘蛛(Scrapy下載下傳器是建立在twisted這個高效的異步模型上的)
-
爬蟲(Spiders)
爬蟲是主要幹活的, 用于從特定的網頁中提取自己需要的資訊, 即所謂的實體(Item)。使用者也可以從中提取出連結,讓Scrapy繼續抓取下一個頁面
-
項目管道(Pipeline)
負責處理爬蟲從網頁中抽取的實體,主要的功能是持久化實體、驗證實體的有效性、清除不需要的資訊。當頁面被爬蟲解析後,将被發送到項目管道,并經過幾個特定的次序處理資料。
-
下載下傳器中間件(Downloader Middlewares)
位于Scrapy引擎和下載下傳器之間的架構,主要是處理Scrapy引擎與下載下傳器之間的請求及響應。
-
爬蟲中間件(Spider Middlewares)
介于Scrapy引擎和爬蟲之間的架構,主要工作是處理蜘蛛的響應輸入和請求輸出。
-
排程中間件(Scheduler Middlewares)
介于Scrapy引擎和排程之間的中間件,從Scrapy引擎發送到排程的請求和響應。
Scrapy運作流程大概如下:
- 引擎從排程器中取出一個連結(URL)用于接下來的抓取
- 引擎把URL封裝成一個請求(Request)傳給下載下傳器
- 下載下傳器把資源下載下傳下來,并封裝成應答包(Response)
- 爬蟲解析Response
- 解析出實體(Item),則交給實體管道進行進一步的處理
- 解析出的是連結(URL),則把URL交給排程器等待抓取
一、安裝
Linux
pip3 install scrapy
Windows
a. pip3 install wheel
b. 下載下傳twisted http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. 進入下載目錄,執行 pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
d. pip3 install scrapy
e. 下載下傳并安裝pywin32:https://sourceforge.net/projects/pywin32/files/
二、基本使用
1. 基本指令
1. scrapy startproject 項目名稱
- 在目前目錄中建立一個項目檔案(類似于Django)
2. scrapy genspider [-t template] <name> <domain>
- 建立爬蟲應用
如:
scrapy genspider chouti chouti.com
scrapy genspider -t xmlfeed autohome autohome.com.cn
PS:
檢視所有可用模闆:scrapy genspider -l
檢視模闆內容:scrapy genspider -d 模闆名稱
3. scrapy list
- 展示爬蟲應用清單
4. scrapy crawl 爬蟲應用名稱
- 運作單獨爬蟲應用
2.項目結構以及爬蟲應用簡介
project_name/
scrapy.cfg
project_name/
__init__.py
items.py
pipelines.py
settings.py
spiders/
__init__.py
爬蟲1.py
爬蟲2.py
爬蟲3.py
檔案說明:
- scrapy.cfg 項目的主配置資訊。(真正爬蟲相關的配置資訊在settings.py檔案中)
- items.py 設定資料存儲模闆,用于結構化資料,如:Django的Model
- pipelines 資料處理行為,如:一般結構化的資料持久化
- settings.py 配置檔案,如:遞歸的層數、并發數,延遲下載下傳等
- spiders 爬蟲目錄,如:建立檔案,編寫爬蟲規則
注意:一般建立爬蟲檔案時,以網站域名命名

import scrapy
class XiaoHuarSpider(scrapy.spiders.Spider):
    """Skeleton spider showing the attributes a spider declares."""

    name = "xiaohuar"  # spider name
    allowed_domains = ["xiaohuar.com"]  # allowed domains
    start_urls = [
        "http://www.xiaohuar.com/hua/",  # start URL
    ]

    def parse(self, response):
        """Callback run with the result of requesting a start URL."""
        # Placeholder body: a method needs at least one statement.
        pass
爬蟲1.py

import io
import os
import sys

# Re-wrap stdout's underlying byte stream so printed text is encoded as
# gb18030. The io module must be imported for io.TextIOWrapper to resolve.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
關于windows編碼
3. 小試牛刀
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
class DigSpider(scrapy.Spider):
    """Follow the pagination links of dig.chouti.com, visiting each page once."""

    # Spider name; the crawl command starts the spider by this name.
    name = "dig"
    # Allowed domains.
    allowed_domains = ["chouti.com"]
    # Start URLs.
    start_urls = [
        'http://dig.chouti.com/',
    ]
    # md5(page URL) -> page URL for every page already scheduled.
    has_request_set = {}

    def parse(self, response):
        """Print the page URL and yield a request for each unseen pagination link."""
        print(response.url)
        # Raw string: "\d" must reach the XPath regex unchanged.
        page_list = response.xpath(
            r'//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href'
        ).extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = self.md5(page_url)
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = page_url
            yield Request(url=page_url, method='GET', callback=self.parse)

    @staticmethod
    def md5(val):
        """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``val``."""
        import hashlib
        return hashlib.md5(val.encode('utf-8')).hexdigest()
執行此爬蟲檔案,則在終端進入項目目錄執行如下指令:
scrapy crawl dig --nolog
對于上述代碼重要之處在于:
- Request是一個封裝使用者請求的類,在回調函數中yield該對象表示繼續通路
- HtmlXpathSelector用于結構化HTML代碼并提供選擇器功能
4. 選擇器
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">first item</a></li>
<li class="item-0"><a id='i2' href="llink.html">first item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
# Wrap the HTML text above in a response object so selectors can query it.
response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
# Each commented-out pair below is an independent example; uncomment one
# pair at a time to try it.

# Build an HtmlXPathSelector from the response.
# hxs = HtmlXPathSelector(response)
# print(hxs)

# Every <a> element in the document.
# hxs = Selector(response=response).xpath('//a')
# print(hxs)

# <a> elements that are the second <a> child of their parent.
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)

# <a> elements that have an id attribute.
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)

# <a> elements whose id equals "i1".
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)

# <a> elements matching both attribute conditions.
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)

# <a> elements whose href contains "link".
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)

# <a> elements whose href starts with "link".
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)

# <a> elements whose id matches the regular expression i\d+.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)

# Text of those elements, extracted as a list of strings.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)

# href attribute values of those elements.
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)

# Absolute path from the document root.
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)

# extract_first() returns only the first match.
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# Queries relative to each previously selected node.
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     # NOTE(review): '*/a/span' starts from the children of item, so it looks
#     # one level deeper than the two forms above - confirm this is intended.
#     print(v)

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest
class ChouTiSpider(scrapy.Spider):
    """Log in to dig.chouti.com, up-vote every listed item and follow pagination."""

    # Spider name; the crawl command starts the spider by this name.
    name = "chouti"
    # Allowed domains.
    allowed_domains = ["chouti.com"]
    # Cookie name -> value, collected from the first response and sent with
    # every later request.
    cookie_dict = {}
    # md5(page URL) -> page URL for every listing page already scheduled.
    has_request_set = {}

    def start_requests(self):
        """Fetch the front page first so its cookies can be captured."""
        url = 'http://dig.chouti.com/'
        yield Request(url=url, callback=self.login)

    def login(self, response):
        """Store the cookies set by the front page, then post the login form."""
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        # NOTE(review): relies on the private _cookies attribute; presumably
        # a domain -> path -> name -> cookie mapping - confirm against the
        # installed Scrapy version.
        for by_path in cookie_jar._cookies.values():
            for by_name in by_path.values():
                for cookie_name, cookie in by_name.items():
                    self.cookie_dict[cookie_name] = cookie.value
        # WARNING: credentials are hard-coded in the request body; do not
        # commit real account data in this form.
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8615131255089&password=pppppppp&oneMonth=1',
            cookies=self.cookie_dict,
            callback=self.check_login
        )

    def check_login(self, response):
        """After the login response, reload the front page with the stored cookies."""
        yield Request(
            url='http://dig.chouti.com/',
            method='GET',
            callback=self.show,
            cookies=self.cookie_dict,
            # The front page was already requested once; bypass the dupe filter.
            dont_filter=True
        )

    def show(self, response):
        """Yield a vote request per news item and a request per unseen page link."""
        import hashlib

        news_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            if link_id is None:
                # No id found for this item: there is nothing to vote for.
                continue
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )
        # Raw string: "\d" must reach the XPath regex unchanged.
        page_list = response.xpath(
            r'//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href'
        ).extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = page_url
            yield Request(
                url=page_url,
                method='GET',
                callback=self.show
            )

    def do_favor(self, response):
        """Print the body of the vote response."""
        print(response.text)
示例:自動登陸抽屜并點贊

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.response.html import HtmlResponse
from scrapy.http import Request
from scrapy.http.cookies import CookieJar
class ChoutiSpider(scrapy.Spider):
    """Login flow that passes a ``cookiejar`` key in the request meta."""

    name = "chouti"
    allowed_domains = ["chouti.com"]
    start_urls = ('http://www.chouti.com/',)

    def start_requests(self):
        """Request the front page, tagging the request with a cookiejar key."""
        yield Request(
            url='http://dig.chouti.com/',
            callback=self.login,
            meta={'cookiejar': True},
        )

    def login(self, response):
        """Print the Set-Cookie headers received, then post the login form."""
        print(response.headers.getlist('Set-Cookie'))
        yield Request(
            url='http://dig.chouti.com/login',
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            body='phone=8613121758648&password=woshiniba&oneMonth=1',
            callback=self.check_login,
            meta={'cookiejar': True},
        )

    def check_login(self, response):
        """Print the body of the login response."""
        print(response.text)
處理Cookie
注意:settings.py中設定DEPTH_LIMIT = 1來指定“遞歸”的層數。
5. 格式化處理
上述執行個體隻是簡單的處理,是以在parse方法中直接處理。如果對于想要擷取更多的資料處理,則可以利用Scrapy的items将資料格式化,然後統一交由pipelines來處理。

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.http.cookies import CookieJar
from scrapy import FormRequest
class XiaoHuarSpider(scrapy.Spider):
    """Scrape image entries from xiaohuar.com list pages and follow pagination."""

    # Spider name; the crawl command starts the spider by this name.
    name = "xiaohuar"
    # Allowed domains.
    allowed_domains = ["xiaohuar.com"]
    start_urls = [
        "http://www.xiaohuar.com/list-1-1.html",
    ]
    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'spider1.pipelines.JsonPipeline': 100
    #     }
    # }
    # md5(page URL) -> page URL for every list page already scheduled.
    has_request_set = {}

    def parse(self, response):
        """Yield one XiaoHuarItem per entry and one request per unseen list page."""
        from ..items import XiaoHuarItem

        entries = response.xpath('//div[@class="item_list infinite_scroll"]/div')
        for item in entries:
            src = item.xpath('.//div[@class="img"]/a/img/@src').extract_first()
            name = item.xpath('.//div[@class="img"]/span/text()').extract_first()
            school = item.xpath('.//div[@class="img"]/div[@class="btns"]/a/text()').extract_first()
            if src is None:
                # Without an image path there is no valid URL to build.
                continue
            url = "http://www.xiaohuar.com%s" % src
            yield XiaoHuarItem(name=name, school=school, url=url)

        # .extract() turns the selectors into strings; md5() and Request
        # both need plain URLs. The dot before "html" is escaped so it only
        # matches a literal dot.
        urls = response.xpath(
            r'//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+\.html")]/@href'
        ).extract()
        for url in urls:
            key = self.md5(url)
            if key in self.has_request_set:
                continue
            self.has_request_set[key] = url
            yield Request(url=url, method='GET', callback=self.parse)

    @staticmethod
    def md5(val):
        """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``val``."""
        import hashlib
        return hashlib.md5(val.encode('utf-8')).hexdigest()
spiders/xiahuar.py

import scrapy
class XiaoHuarItem(scrapy.Item):
    """Item with three declared fields: name, school and url."""

    name = scrapy.Field()
    school = scrapy.Field()
    url = scrapy.Field()
items

import json
import os
import requests
class JsonPipeline(object):
    """Write every item to ``xiaohua.txt`` as one JSON document per line."""

    def __init__(self):
        # ensure_ascii=False below emits raw non-ASCII text, so the encoding
        # is pinned instead of depending on the platform default.
        self.file = open('xiaohua.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialise ``item``, append it to the file and pass the item on."""
        v = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(v + '\n')
        # Flush per item so the file is current while the crawl is running.
        self.file.flush()
        return item

    def close_spider(self, spider):
        """Close the output file when the spider is closed."""
        self.file.close()
class FilePipeline(object):
    """Download each item's ``url`` into the ``imgs`` directory."""

    def __init__(self):
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs('imgs', exist_ok=True)

    def process_item(self, item, spider):
        """Save the image as ``<name>_<school>.jpg`` and pass the item on."""
        response = requests.get(item['url'])
        file_name = '%s_%s.jpg' % (item['name'], item['school'])
        # name/school are scraped text: keep path separators from moving the
        # file outside imgs/.
        file_name = file_name.replace('/', '_').replace('\\', '_')
        if response.ok:
            # Only store successful downloads; an error page is not an image.
            with open(os.path.join('imgs', file_name), mode='wb') as f:
                f.write(response.content)
        return item
pipelines

# The integer after each pipeline determines the run order: items pass
# through the pipelines from the lowest number to the highest. The numbers
# are conventionally chosen in the 0-1000 range.
ITEM_PIPELINES = {
    'spider1.pipelines.JsonPipeline': 100,
    'spider1.pipelines.FilePipeline': 300,
}
settings
對于pipeline可以做更多,如下:

from scrapy.exceptions import DropItem
class CustomPipeline(object):
    """Pipeline template showing the hooks a pipeline can define."""

    def __init__(self, v):
        self.value = v

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the integer setting ``MMMM``.

        :param crawler: object whose ``settings.getint`` is read
        :return: a new pipeline instance
        """
        return cls(crawler.settings.getint('MMMM'))

    def open_spider(self, spider):
        """Called when the spider starts running.

        :param spider: the spider being opened
        :return: None
        """
        print('000000')

    def process_item(self, item, spider):
        """Process and persist the item here.

        Returning the item lets later pipelines keep processing it; raising
        DropItem() instead discards it so later pipelines never see it.
        """
        return item

    def close_spider(self, spider):
        """Called when the spider is closed.

        :param spider: the spider being closed
        :return: None
        """
        print('111111')
自定義pipeline
6.中間件

class SpiderMiddleware(object):
    """Spider middleware template; every hook passes its input through."""

    def process_spider_input(self, response, spider):
        """Runs once the download is complete, before the response goes to parse.

        :param response: the downloaded response
        :param spider: the spider the response is intended for
        :return: None
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Called with what the spider returned after processing the response.

        :param response: the response that was processed
        :param result: what the spider returned
        :param spider: the spider that produced the result
        :return: must be an iterable of Request or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """Called when an exception is raised.

        :param response: the response being processed
        :param exception: the exception raised
        :param spider: the spider involved
        :return: None to let later middlewares handle the exception, or an
            iterable of Request or Item objects to hand to the scheduler or
            the pipelines
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Called when the spider starts.

        :param start_requests: the start requests
        :param spider: the spider being started
        :return: an iterable of Request objects
        """
        return start_requests
爬蟲中間件

class DownMiddleware1(object):
    """Downloader middleware template documenting each hook's return values."""

    def process_request(self, request, spider):
        """Called for each request that needs downloading, in every
        downloader middleware.

        :param request: the request about to be downloaded
        :param spider: the spider that issued the request
        :return:
            None: continue with later middlewares and the download;
            Response object: stop process_request and start process_response;
            Request object: stop the middleware chain and reschedule the request;
            raise IgnoreRequest: stop process_request and start process_exception
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response on its way back.

        :param request: the request that produced the response
        :param response: the response being processed
        :param spider: the spider the response is intended for
        :return:
            Response object: handed to the other middlewares' process_response;
            Request object: stop the chain, the request is rescheduled for download;
            raise IgnoreRequest: Request.errback is called
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or a process_request() of a
        downloader middleware raises an exception.

        :param request: the request that caused the exception
        :param exception: the exception raised
        :param spider: the spider that issued the request
        :return:
            None: let later middlewares handle the exception;
            Response object: stop calling later process_exception methods;
            Request object: stop the chain, the request is rescheduled for download
        """
        return None
下載下傳器中間件
7. 自定制指令
- 在spiders同級建立任意目錄,如:commands
- 在其中建立 crawlall.py 檔案 (此處檔案名就是自定義的指令)

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
class Command(ScrapyCommand):
    """Custom ``scrapy`` command that schedules every spider of the project."""

    requires_project = True

    def syntax(self):
        """Return the usage syntax shown for the command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description shown for the command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Schedule each spider, then start the crawler process once."""
        # spider_loader replaces the deprecated ``crawler_process.spiders``.
        for name in self.crawler_process.spider_loader.list():
            # Every parsed command-line option is forwarded as a spider argument.
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
crawlall.py
- 在settings.py 中添加配置 COMMANDS_MODULE = '項目名稱.目錄名稱'
- 在項目目錄執行指令:scrapy crawlall

from scrapy.cmdline import execute
if __name__ == '__main__':
    # Equivalent to running "scrapy crawl github --nolog" in a shell: the
    # "crawl" subcommand is required in front of the spider name.
    execute(["scrapy", "crawl", "github", "--nolog"])
單個爬蟲
9. 避免重複通路
scrapy預設使用 scrapy.dupefilters.RFPDupeFilter 進行去重,相關配置有:
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "儲存通路記錄的日志路徑,如:/root/" # 最終路徑為 /root/requests.seen

class RepeatUrl:
    """URL de-duplication filter that remembers visited URLs in a set."""

    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """Called at initialisation to build the filter.

        :param settings: settings object (not used)
        :return: a new filter instance
        """
        return cls()

    def request_seen(self, request):
        """Check whether the request's URL has been visited before.

        :param request: object with a ``url`` attribute
        :return: True if already visited; False otherwise (the URL is then
            recorded)
        """
        seen = request.url in self.visited_url
        if not seen:
            self.visited_url.add(request.url)
        return seen

    def open(self):
        """Called when crawling starts.

        :return: None
        """
        print('open replication')

    def close(self, reason):
        """Called when crawling ends.

        :param reason: why the crawl ended
        :return: None
        """
        print('close replication')

    def log(self, request, spider):
        """Log a repeated request.

        :param request: the repeated request
        :param spider: the spider that issued it
        :return: None
        """
        print('repeat', request.url)
自定義URL去重操作
10.其他

# -*- coding: utf-8 -*-
# Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# 1. 爬蟲名稱
BOT_NAME = 'step8_king'
# 2. 爬蟲應用路徑
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. 用戶端 user-agent請求頭
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)'
# Obey robots.txt rules
# 4. 禁止爬蟲配置
# ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. 并發請求數
# CONCURRENT_REQUESTS = 4
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. 延遲下載下傳秒數
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# 7. 單域名通路并發數,并且延遲下次秒數也應用在每個域名
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# 單IP通路并發數,如果有值則忽略:CONCURRENT_REQUESTS_PER_DOMAIN,并且延遲下次秒數也應用在每個IP
# CONCURRENT_REQUESTS_PER_IP = 3
# Disable cookies (enabled by default)
# 8. 是否支援cookie,cookiejar進行操作cookie
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
# 9. Telnet用于檢視目前爬蟲的資訊,操作爬蟲等...
# 使用telnet ip port ,然後通過指令操作
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
# 10. 預設請求頭
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. 定義pipeline處理請求
# ITEM_PIPELINES = {
# 'step8_king.pipelines.JsonPipeline': 700,
# 'step8_king.pipelines.FilePipeline': 500,
# }
# 12. 自定義擴充,基于信号進行調用
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# # 'step8_king.extensions.MyExtension': 500,
# }
# 13. 爬蟲允許的最大深度,可以通過meta檢視目前深度;0表示無深度
# DEPTH_LIMIT = 3
# 14. 爬取時,0表示深度優先Lifo(預設);1表示廣度優先FiFo
# 後進先出,深度優先
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# 先進先出,廣度優先
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
# 15. 排程器隊列
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler
# 16. 通路URL去重
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
"""
17. 自動限速算法
from scrapy.contrib.throttle import AutoThrottle
自動限速設定
1. 擷取最小延遲 DOWNLOAD_DELAY
2. 擷取最大延遲 AUTOTHROTTLE_MAX_DELAY
3. 設定初始下載下傳延遲 AUTOTHROTTLE_START_DELAY
4. 當請求下載下傳完成後,擷取其"連接配接"時間 latency,即:請求連接配接到接受到響應頭之間的時間
5. 用于計算的... AUTOTHROTTLE_TARGET_CONCURRENCY
target_delay = latency / self.target_concurrency
new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延遲時間
new_delay = max(target_delay, new_delay)
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
slot.delay = new_delay
"""
# 開始自動限速
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# 初始下載下傳延遲
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# 最大下載下傳延遲
# AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to each remote server
# 平均每秒并發數
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# 是否顯示
# AUTOTHROTTLE_DEBUG = True
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
"""
18. 啟用緩存
目的用于将已經發送的請求或相應緩存下來,以便以後使用
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# 是否啟用緩存政策
# HTTPCACHE_ENABLED = True
# 緩存政策:所有請求均緩存,下次在請求直接通路原來的緩存即可
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# 緩存政策:根據Http響應頭:Cache-Control、Last-Modified 等進行緩存的政策
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# 緩存逾時時間
# HTTPCACHE_EXPIRATION_SECS = 0
# 緩存儲存路徑
# HTTPCACHE_DIR = 'httpcache'
# 緩存忽略的Http狀态碼
# HTTPCACHE_IGNORE_HTTP_CODES = []
# 緩存存儲的插件
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
"""
19. 代理,需要在環境變量中設定
from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
方式一:使用預設
os.environ
{
http_proxy:http://root:[email protected]:9999/
https_proxy:http://192.168.11.11:9999/
}
方式二:使用自定義下載下傳中間件
def to_bytes(text, encoding=None, errors='strict'):
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors)
class ProxyMiddleware(object):
def process_request(self, request, spider):
PROXIES = [
{'ip_port': '111.11.228.75:80', 'user_pass': ''},
{'ip_port': '120.198.243.22:80', 'user_pass': ''},
{'ip_port': '111.8.60.9:8123', 'user_pass': ''},
{'ip_port': '101.71.27.120:80', 'user_pass': ''},
{'ip_port': '122.96.59.104:80', 'user_pass': ''},
{'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]
proxy = random.choice(PROXIES)
if proxy['user_pass'] is not None:
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
encoded_user_pass = base64.encodestring(to_bytes(proxy['user_pass']))
request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
print "**************ProxyMiddleware have pass************" + proxy['ip_port']
else:
print "**************ProxyMiddleware no pass************" + proxy['ip_port']
request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
DOWNLOADER_MIDDLEWARES = {
'step8_king.middlewares.ProxyMiddleware': 500,
}
"""
"""
20. Https通路
Https通路時有兩種情況:
1. 要爬取網站使用的可信任證書(預設支援)
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
2. 要爬取網站使用的自定義證書
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
# https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)
class MySSLFactory(ScrapyClientContextFactory):
    '''Client context factory that presents a custom client certificate.'''

    def getCertificateOptions(self):
        '''Build the TLS options from the PEM key and certificate on disk.

        :return: a ``CertificateOptions`` carrying the private key and the
            X509 certificate loaded from the hard-coded paths below
        '''
        from OpenSSL import crypto

        # Context managers close the files; the original left both handles
        # open until garbage collection.
        with open('/Users/wupeiqi/client.key.unsecure', mode='r') as key_file:
            private_key = crypto.load_privatekey(crypto.FILETYPE_PEM, key_file.read())
        with open('/Users/wupeiqi/client.pem', mode='r') as cert_file:
            certificate = crypto.load_certificate(crypto.FILETYPE_PEM, cert_file.read())

        return CertificateOptions(
            privateKey=private_key,  # PKey object
            certificate=certificate,  # X509 object
            # NOTE(review): verify=False presumably disables checking the
            # server certificate; confirm this is intended outside a demo.
            verify=False,
            method=getattr(self, 'method', getattr(self, '_ssl_method', None))
        )
其他:
相關類
scrapy.core.downloader.handlers.http.HttpDownloadHandler
scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
相關配置
DOWNLOADER_HTTPCLIENTFACTORY
DOWNLOADER_CLIENTCONTEXTFACTORY
"""
"""
21. 爬蟲中間件
class SpiderMiddleware(object):
    '''Pass-through spider middleware showing the four available hooks.'''

    def process_spider_input(self, response, spider):
        '''Called once the download has finished, before ``parse`` runs.

        :param response: downloaded response
        :param spider: spider that will handle the response
        :return: None
        '''
        return None

    def process_spider_output(self, response, result, spider):
        '''Called with whatever the spider returned for ``response``.

        :param response: response the spider processed
        :param result: value returned by the spider callback
        :param spider: spider that produced ``result``
        :return: must be an iterable of Request or Item objects; here
            ``result`` is returned untouched
        '''
        return result

    def process_spider_exception(self, response, exception, spider):
        '''Called when an exception is raised while handling ``response``.

        :param response: response being processed
        :param exception: exception that was raised
        :param spider: spider that raised it
        :return: None to let later middlewares handle the exception, or an
            iterable of Request or Item objects to hand to the scheduler
            or the pipelines
        '''
        return None

    def process_start_requests(self, start_requests, spider):
        '''Called when the spider starts.

        :param start_requests: the spider's initial requests
        :param spider: spider being started
        :return: an iterable of Request objects; here the input unchanged
        '''
        return start_requests
内置爬蟲中間件:
'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
"""
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'step8_king.middlewares.SpiderMiddleware': 543,
}
"""
22. 下載中間件
class DownMiddleware1(object):
    '''Pass-through downloader middleware showing the three hooks.'''

    def process_request(self, request, spider):
        '''Called for every request that is about to be downloaded.

        :param request: request to download
        :param spider: spider that issued the request
        :return:
            None: continue with the remaining middlewares and download;
            Response: stop calling process_request and start process_response;
            Request: stop the middleware chain and reschedule the request;
            raise IgnoreRequest: stop process_request and start
            process_exception
        '''
        return None

    def process_response(self, request, response, spider):
        '''Called with the response on its way back from the downloader.

        :param request: request that produced the response
        :param response: response being returned
        :param spider: spider the response is meant for
        :return:
            Response: handed on to the other middlewares' process_response;
            Request: stop the chain, the request is rescheduled for download;
            raise IgnoreRequest: Request.errback is called
        '''
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        '''Called when a download handler or a process_request() raises.

        :param request: request being processed when the error occurred
        :param exception: exception that was raised
        :param spider: spider that issued the request
        :return:
            None: let the following middlewares handle the exception;
            Response: stop calling further process_exception methods;
            Request: stop the chain, the request is scheduled again
        '''
        return None
預設下載中間件
{
'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
}
"""
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'step8_king.middlewares.DownMiddleware1': 100,
# 'step8_king.middlewares.DownMiddleware2': 500,
# }
settings
11.TinyScrapy

from twisted.web.client import getPage
from twisted.internet import reactor
from twisted.internet import defer
url_list = ['http://www.bing.com', 'http://www.baidu.com', ]


def callback(arg):
    """Report the result of one finished download."""
    print('回來一個', arg)


def stop(arg):
    """Report the combined results and shut the reactor down."""
    print('已經全部現在完畢', arg)
    reactor.stop()


# Start every download first; addCallback returns the same Deferred, so
# the list holds one Deferred per URL.
defer_list = [
    getPage(bytes(url, encoding='utf8')).addCallback(callback)
    for url in url_list
]

# stop() runs after every Deferred in the list has fired.
d = defer.DeferredList(defer_list)
d.addBoth(stop)
reactor.run()
twisted示例一

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from twisted.web.client import getPage
from twisted.internet import reactor
from twisted.internet import defer
def callback(arg):
    """Report the result of one finished download."""
    print('回來一個', arg)


@defer.inlineCallbacks
def task(url):
    """Download ``url`` and wait until ``callback`` has processed it."""
    page = getPage(bytes(url, encoding='utf8'))
    page.addCallback(callback)
    yield page


def stop(arg):
    """Report the combined results and shut the reactor down."""
    print('已經全部現在完畢', arg)
    reactor.stop()


url_list = ['http://www.bing.com', 'http://www.baidu.com', ]

# One inlineCallbacks Deferred per URL; stop() runs when all have fired.
defer_list = [task(url) for url in url_list]
d = defer.DeferredList(defer_list)
d.addBoth(stop)
reactor.run()
twisted示例二

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor
import threading
def _next_request():
    """Engine tick: delegate to the scheduler-driven download step."""
    _next_request_from_scheduler()


def _next_request_from_scheduler():
    """Download the page once, report it, then queue the next tick."""
    page = getPage(bytes('http://www.chouti.com', encoding='utf8'))
    page.addCallback(callback)
    # Re-arm the loop via the reactor instead of recursing directly.
    page.addCallback(lambda _: reactor.callLater(0, _next_request))


# Deferred that keeps the "engine" alive; callback() fires it.
_closewait = None


@defer.inlineCallbacks
def engine_start():
    """Create the close-wait Deferred and block on it."""
    global _closewait
    _closewait = defer.Deferred()
    yield _closewait


@defer.inlineCallbacks
def task(url):
    """Kick off the download loop and wait for the engine to close.

    ``url`` is accepted but not used: every download targets the URL
    hard-coded in ``_next_request_from_scheduler``.
    """
    reactor.callLater(0, _next_request)
    yield engine_start()


# Number of downloads completed so far.
counter = 0


def callback(arg):
    """Count one finished download; release the engine at the tenth."""
    global counter
    counter += 1
    if counter == 10:
        _closewait.callback(None)
    print('one', len(arg))


def stop(arg):
    """Report the combined results and shut the reactor down."""
    print('all done', arg)
    reactor.stop()


if __name__ == '__main__':
    url = 'http://www.cnblogs.com'
    v = defer.DeferredList([task(url)])
    v.addBoth(stop)
    reactor.run()
twisted示例三

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from twisted.web.client import getPage, defer
from twisted.internet import reactor
import queue
class Response(object):
    """Downloaded body together with the request that produced it."""

    def __init__(self, body, request):
        """Store the raw ``body``, the ``request`` and the request's URL."""
        self.body = body
        self.request = request
        self.url = request.url

    @property
    def text(self):
        """The body decoded as UTF-8."""
        return self.body.decode(encoding='utf-8')
class Request(object):
    """A URL to download plus the optional callback for its response."""

    def __init__(self, url, callback=None):
        """Remember ``url`` and ``callback`` (None when not given)."""
        self.callback = callback
        self.url = url
class Scheduler(object):
    """FIFO queue of pending requests owned by an engine."""

    def __init__(self, engine):
        """Create an empty queue and remember the owning ``engine``."""
        self.q = queue.Queue()
        self.engine = engine

    def enqueue_request(self, request):
        """Append ``request`` to the end of the queue."""
        self.q.put(request)

    def next_request(self):
        """Return the oldest pending request, or None when the queue is empty."""
        try:
            return self.q.get(block=False)
        except queue.Empty:
            # Only "nothing queued" means None; any other error should
            # surface instead of being silently swallowed.
            return None

    def size(self):
        """Return the number of requests currently queued."""
        return self.q.qsize()
class ExecutionEngine(object):
    """Drives one spider: feeds the scheduler, downloads, runs callbacks."""

    def __init__(self, max_concurrency=5):
        """Set up an idle engine.

        :param max_concurrency: maximum number of downloads in flight
            (defaults to 5, the value that was previously hard-coded)
        """
        self._closewait = None
        self.running = True
        self.start_requests = None
        self.spider = None  # set by open_spider()
        self.max_concurrency = max_concurrency
        self.scheduler = Scheduler(self)
        self.inprogress = set()

    def check_empty(self, response):
        """Fire the close-wait Deferred once the engine is no longer running."""
        if not self.running:
            self._closewait.callback('......')

    def _next_request(self):
        """Move start requests into the scheduler, start downloads, and
        finish the crawl when nothing is queued or in flight."""
        # Drain the start-request iterator into the scheduler.
        while self.start_requests:
            try:
                request = next(self.start_requests)
            except StopIteration:
                self.start_requests = None
            else:
                self.scheduler.enqueue_request(request)

        # Start downloads up to the concurrency limit.
        while len(self.inprogress) < self.max_concurrency and self.scheduler.size() > 0:
            request = self.scheduler.next_request()
            if not request:
                break
            self.inprogress.add(request)
            d = getPage(bytes(request.url, encoding='utf-8'))
            # addBoth everywhere: bookkeeping and the next tick must run
            # whether the download (or the spider callback) succeeded or not.
            d.addBoth(self._handle_downloader_output, request)
            d.addBoth(lambda x, req: self.inprogress.remove(req), request)
            d.addBoth(lambda x: self._next_request())

        # Nothing queued and nothing in flight: the crawl is complete.
        if len(self.inprogress) == 0 and self.scheduler.size() == 0:
            self._closewait.callback(None)

    def _handle_downloader_output(self, body, request):
        """Run the request's callback on the result and queue what it yields.

        :param body: result of the download Deferred
        :param request: request that was downloaded
        :return: None
        """
        import types

        response = Response(body, request)
        # Fall back to the spider's parse() when the request has no callback.
        func = request.callback or self.spider.parse
        gen = func(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                self.scheduler.enqueue_request(req)

    @defer.inlineCallbacks
    def start(self):
        """Create the close-wait Deferred and wait until it is fired."""
        self._closewait = defer.Deferred()
        yield self._closewait

    def open_spider(self, spider, start_requests):
        """Attach ``spider`` and its start requests, then schedule the first tick."""
        self.start_requests = start_requests
        self.spider = spider
        reactor.callLater(0, self._next_request)
class Crawler(object):
    """Binds one spider class to its own execution engine."""

    def __init__(self, spidercls):
        """Remember the spider class; spider and engine are created in crawl()."""
        self.spidercls = spidercls
        self.spider = None
        self.engine = None

    @defer.inlineCallbacks
    def crawl(self):
        """Instantiate engine and spider, open the spider, wait for the engine."""
        self.engine = ExecutionEngine()
        self.spider = self.spidercls()
        # iter() on an iterator returns that same iterator, so a single
        # call is equivalent to wrapping it twice.
        request_iter = iter(self.spider.start_requests())
        self.engine.open_spider(self.spider, request_iter)
        yield self.engine.start()
class CrawlerProcess(object):
    """Runs several crawlers in one reactor and stops it when all finish."""

    def __init__(self):
        """Start with no crawlers and no active crawl Deferreds."""
        self._active = set()
        self.crawlers = set()

    def crawl(self, spidercls, *args, **kwargs):
        """Create a crawler for ``spidercls`` and start its crawl.

        :return: the Deferred returned by the crawler's ``crawl``
        """
        crawler = Crawler(spidercls)
        self.crawlers.add(crawler)
        finished = crawler.crawl(*args, **kwargs)
        self._active.add(finished)
        return finished

    def start(self):
        """Run the reactor until every active crawl has finished."""
        all_done = defer.DeferredList(self._active)
        all_done.addBoth(self._stop_reactor)
        reactor.run()

    def _stop_reactor(self, _=None):
        """Stop the reactor; the argument is ignored."""
        reactor.stop()
class Spider(object):
    """Base spider: turns ``start_urls`` into the initial requests."""

    def start_requests(self):
        """Yield one Request per entry of ``self.start_urls``."""
        for start_url in self.start_urls:
            yield Request(start_url)
class ChoutiSpider(Spider):
    """Spider that prints the text of the chouti front page."""

    name = "chouti"
    start_urls = [
        'http://dig.chouti.com/',
    ]

    def parse(self, response):
        """Print the decoded text of ``response``."""
        print(response.text)
class CnblogsSpider(Spider):
    """Spider that prints the text of the cnblogs front page."""

    name = "cnblogs"
    start_urls = [
        'http://www.cnblogs.com/',
    ]

    def parse(self, response):
        """Print the decoded text of ``response``."""
        print(response.text)
if __name__ == '__main__':
    # Register one crawl per spider class, then run the reactor until
    # all of them have finished.
    crawler_process = CrawlerProcess()
    for spider_cls in (ChoutiSpider, CnblogsSpider):
        crawler_process.crawl(spider_cls)
    crawler_process.start()
模拟scrapy架構
模拟scrapy架構

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import types
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor
class Request(object):
    """A URL to download, the callback for its response, and a priority."""

    def __init__(self, url, callback):
        """Store ``url`` and ``callback``; priority starts at 0."""
        self.priority = 0
        self.callback = callback
        self.url = url
class HttpResponse(object):
    """Downloaded content paired with the request that fetched it."""

    def __init__(self, content, request):
        """Store ``content`` and ``request`` as given."""
        self.request = request
        self.content = content
class ChouTiSpider(object):
    """Demo spider: requests two fixed URLs and prints each one's URL."""

    def start_requests(self):
        """Yield one Request per start URL, each handled by ``parse``."""
        for url in ('http://www.cnblogs.com/', 'http://www.bing.com'):
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print the URL of the request that produced ``response``."""
        print(response.request.url)
        # To follow up with more downloads, yield further requests here:
        # yield Request(url="http://www.baidu.com", callback=self.parse)
from queue import Queue
Q = Queue()
class CallLaterOnce(object):
    """Schedules a function on the reactor, with at most one call pending."""

    def __init__(self, func, *a, **kw):
        """Remember ``func`` and the arguments to call it with."""
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None  # the pending reactor call, if any

    def schedule(self, delay=0):
        """Schedule a call after ``delay`` unless one is already pending."""
        if self._call is not None:
            return
        self._call = reactor.callLater(delay, self)

    def cancel(self):
        """Cancel the pending call, if there is one."""
        if self._call:
            self._call.cancel()

    def __call__(self):
        """Clear the pending marker, then run the stored function."""
        self._call = None
        return self._func(*self._a, **self._kw)
class Engine(object):
    """Minimal crawl engine: pulls requests from ``Q`` and downloads them
    with at most ``self.max`` downloads in flight."""

    def __init__(self):
        """Start idle; ``crawl`` wires up the scheduling helper."""
        self.nextcall = None
        self.crawlling = []  # Deferreds of the downloads in flight
        self.max = 5  # concurrency limit
        self._closewait = None

    def get_response(self, content, request):
        """Run the request's callback and queue any requests it yields.

        :param content: downloaded body
        :param request: request that was downloaded
        """
        response = HttpResponse(content, request)
        gen = request.callback(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                # Follow-up requests sit one level deeper than their parent.
                req.priority = request.priority + 1
                Q.put(req)

    def rm_crawlling(self, response, d):
        """Drop the finished download ``d`` from the in-flight list."""
        self.crawlling.remove(d)

    def _report_failure(self, failure, request):
        """Print a failed download or callback so the crawl can go on."""
        print('request failed', request.url, failure.getErrorMessage())

    def _next_request(self, spider):
        """Finish the crawl when idle, otherwise start more downloads."""
        from queue import Empty

        if Q.qsize() == 0 and len(self.crawlling) == 0:
            self._closewait.callback(None)
            return  # nothing left to schedule once the crawl is closed

        # Honour the configured limit instead of a second hard-coded 5.
        while len(self.crawlling) < self.max:
            try:
                req = Q.get(block=False)
            except Empty:
                return
            d = getPage(req.url.encode('utf-8'))
            self.crawlling.append(d)
            d.addCallback(self.get_response, req)
            d.addErrback(self._report_failure, req)
            # addBoth: bookkeeping and the next tick must also run after a
            # failure, or the download stays "in flight" and the crawl
            # never finishes.
            d.addBoth(self.rm_crawlling, d)
            d.addBoth(lambda _: self.nextcall.schedule())

    @defer.inlineCallbacks
    def crawl(self):
        """Queue the spider's start requests, schedule the first tick and
        wait until the crawl is closed."""
        spider = ChouTiSpider()
        for req in spider.start_requests():
            Q.put(req)
        self.nextcall = CallLaterOnce(self._next_request, spider)
        self.nextcall.schedule()
        # The scheduled tick runs later on the reactor, so the Deferred
        # exists before _next_request can fire it.
        self._closewait = defer.Deferred()
        yield self._closewait

    @defer.inlineCallbacks
    def pp(self):
        """Wait for ``crawl`` to complete."""
        yield self.crawl()
# Run a single engine and stop the reactor once its crawl has finished.
obj = Engine()
_active = {obj.crawl()}
li = defer.DeferredList(_active)
li.addBoth(lambda _, *a, **kw: reactor.stop())
reactor.run()
參考版
參考版
更多文檔參見:http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html