Python 爬蟲 Scrapy架構示例 #chouti.py:
import scrapy,sys,io
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChoutiItem
from scrapy.dupefilters import RFPDupeFilter
# Re-wrap stdout so print() emits GB18030 — presumably for a Chinese Windows
# console that cannot display the default encoding's output.  NOTE(review):
# this permanently replaces sys.stdout for the whole process.
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding="gb18030")
class ChoutiSpider(scrapy.Spider):
    """Scrape hot-news titles/links from dig.chouti.com and follow the
    numbered pagination links.

    Scrapy performs request de-duplication itself, so no custom
    visited-URL set is required here (see the DUPEFILTER_CLASS setting /
    the RepeatFilter example in duplication.py).
    """
    name = 'chouti'
    allowed_domains = ["chouti.com"]
    start_urls = ["http://dig.chouti.com"]

    def parse(self, response):
        """Default callback for every listing page.

        :param response: Response for a hot-news listing page
        :yields: ChoutiItem(title, href) for each entry, plus follow-up
                 Requests for each pagination link (depth is capped by the
                 DEPTH_LIMIT setting).
        """
        # Every hot-news entry is a <div class="item"> under #content-list.
        rows = Selector(response=response).xpath(
            "//div[@id='content-list']/div[@class='item']")
        for row in rows:
            # BUG FIX: the original called the nonexistent XPath function
            # 'test()' instead of text(), so extract_first() returned None
            # and .strip() raised AttributeError.
            title = row.xpath(
                ".//a[@class='show-content']/text()").extract_first().strip()
            href = row.xpath(
                ".//a[@class='show-content']/@href").extract_first().strip()
            # BUG FIX: the item was constructed but never yielded, so the
            # item pipeline (Scrapy1Pipeline) never received any data.
            yield ChoutiItem(title=title, href=href)

        # Pagination links look like /all/hot/recent/<n>.  Raw string keeps
        # the \d escape valid for the embedded regex.
        page_links = Selector(response=response).xpath(
            r"//a[re:test(@href,'/all/hot/recent/\d+')]/@href").extract()
        for link in page_links:
            print(link)
            url = "https://dig.chouti.com%s" % link
            # Hand the next page back to the scheduler.  Note: the callback
            # is passed uncalled (no parentheses).
            yield Request(url=url, callback=self.parse)
#chouti2.py:
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.cookies import CookieJar
class Chouti2Spider(scrapy.Spider):
    """Log in to dig.chouti.com, then up-vote every hot-news entry on every
    reachable listing page."""
    name = 'chouti2'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # Session cookies captured from the first response; reused for voting.
    cookie_dict = None

    def parse(self, response):
        """Capture the anonymous session cookies, then POST the login form."""
        jar = CookieJar()
        jar.extract_cookies(response, response.request)
        self.cookie_dict = jar._cookies
        login_request = Request(
            url="https://dig.chouti.com/login",
            method="POST",
            # Note: the request body must be a urlencoded string, not a dict.
            body="phone=12345678901&password=aaabbbccc&oneMonth=1",
            headers={
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            },
            cookies=jar._cookies,
            callback=self.check_login,
        )
        yield login_request

    def check_login(self, response):
        """Show the login result, then fetch the front page to start voting."""
        print(response.text)
        # Proceed to the next step (up-voting).
        yield Request(url="https://dig.chouti.com", callback=self.like)

    def like(self, response):
        """Up-vote each entry on this page, then recurse into pagination."""
        news_ids = Selector(response=response).xpath(
            "//div[@share-linkid]/@share-linkid").extract()
        # One POST per hot-news entry.
        for news_id in news_ids:
            yield Request(
                url="https://dig.chouti.com/link/vote?linksId=%s" % news_id,
                method="POST",
                cookies=self.cookie_dict,
                callback=self.show,
            )
        # Follow every pagination link so each page's entries are voted too.
        page_hrefs = Selector(response=response).xpath(
            "//div[@id='dig_lcpage']//a/@href").extract()
        for page_href in page_hrefs:
            yield Request(url="https://dig.chouti.com%s" % page_href,
                          callback=self.like)

    def show(self, response):
        print(response.text)
#cnblogs.py:
import scrapy
class CnblogsSpider(scrapy.Spider):
    """Skeleton spider for cnblogs.com; parsing is not implemented yet."""
    name = 'cnblogs'
    allowed_domains = ['cnblogs.com']
    start_urls = ['http://cnblogs.com/']

    def parse(self, response):
        # TODO: extract data from the response.
        pass
# duplication.py:
# BUG FIX: BaseDupeFilter was referenced as the base class but never imported
# in this module.
from scrapy.dupefilters import BaseDupeFilter

class RepeatFilter(BaseDupeFilter):
    """Custom request de-duplication filter.

    Enable it via the DUPEFILTER_CLASS setting.  Scrapy calls every method
    below by name, so none of them may be renamed.
    """

    def __init__(self, path=None, debug=False):
        # URLs already scheduled; set membership makes the check O(1).
        self.visited_urls = set()

    @classmethod
    def from_settings(cls, settings):
        """Factory used internally by Scrapy; cls is the concrete filter
        class (here RepeatFilter), so subclasses are constructed correctly."""
        return cls()

    def request_seen(self, request):
        """Return True if request.url was already seen (skip the request);
        otherwise record it and return False (let it through)."""
        if request.url in self.visited_urls:
            return True
        self.visited_urls.add(request.url)
        return False

    def request_fingerprint(self, request):
        print("request_fingerprint")

    def close(self, reason):
        # Invoked once when the crawl finishes.
        print("close")

    def log(self, request, spider):
        # Logging hook for filtered requests.
        print("log")
#items.py:
import scrapy
class ChoutiItem(scrapy.Item):
    """Container for one hot-news entry scraped by the chouti spider."""
    title = scrapy.Field()  # headline text
    href = scrapy.Field()   # link to the full story
#middlewares.py:
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DownMiddleware1(object):
    """Example downloader middleware.

    Note: a downloader middleware may only define these three hook methods.
    """

    def process_request(self, request, spider):
        """Called for every request that is about to be downloaded; every
        downloader middleware's process_request() runs in turn.

        :param request: the Request object
        :param spider: the spider that issued the request
        :return:
            None -- hand over to the next middleware's process_request()
            Response -- stop the process_request() chain and start running
                        process_response()
            Request -- stop the middleware chain and put the Request back
                       on the scheduler
            raise IgnoreRequest -- process_exception() hooks are invoked
        """
        pass

    def process_response(self, request, response, spider):
        """Called on the way back, once the download has completed.

        :param request: the Request object
        :param response: the Response object
        :param spider: the spider object
        :return:
            Response -- passed to the next middleware's process_response()
            Request -- stop the chain; the request is rescheduled for
                       download
            raise IgnoreRequest -- Request.errback() is invoked
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """Called when the download handler or a process_request() raises.

        :param request: the Request object
        :param exception: the error that was raised
        :param spider: the spider object
        :return:
            None -- hand over to the next middleware's process_exception();
                    if every hook returns None, the error propagates
            Response -- stop running further process_exception() hooks
            Request -- stop the chain; the request is rescheduled for
                       download
        """
        return None
class SpiderMiddleware(object):
    """Example spider middleware sitting between the downloader and the
    spider callbacks."""

    def process_spider_input(self, response, spider):
        """Called once the download finishes, before parse() gets the
        response.

        :param response: the Response handed over by the downloader
        :param spider: the spider object
        :return:
        """
        pass

    def process_spider_output(self, response, result, spider):
        """Called when the spider callback returns, before its output goes
        to the scheduler or the item pipeline.

        :param response: the Response object
        :param result: the Requests/Items produced by parse()
        :param spider: the spider object
        :return: must be an iterable of Request and/or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider callback raises.

        :param response:
        :param exception:
        :param spider:
        :return:
            None -- let the remaining middlewares handle the exception
            iterable of Response/Item -- handed to the scheduler or pipeline
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Called once at spider start-up; not called again while the crawl
        iterates deeper.

        :param start_requests: iterable of Requests built from the start URLs
        :param spider: the spider object
        :return: an iterable of Request objects
        """
        return start_requests
#pipelines.py:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class Scrapy1Pipeline:
    """Append chouti items to news.json, one title/href pair per entry."""

    def __init__(self, conn_s):
        # conn_s comes from the 'DB' setting (see from_crawler).
        self.conn_s = conn_s
        self.conn = None  # placeholder; no DB connection is opened yet

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy so the pipeline can read project settings."""
        conn_s = crawler.settings.getint('DB')
        return cls(conn_s)

    def open_spider(self, spider):
        # BUG FIX: open the file with an explicit encoding.  The scraped
        # titles are Chinese, and the platform default (e.g. gbk/cp936 on
        # Windows) can raise UnicodeEncodeError on write.
        self.f = open("news.json", "a+", encoding="utf-8")

    def process_item(self, item, spider):
        """Persist items from the 'chouti' spider; pass every item through.

        :param item: the scraped item (title, href)
        :param spider: the spider that produced it
        :return: the item, so any later pipeline still receives it
        """
        if spider.name == "chouti":
            print(item, spider)
            tpl = "%s\n%s\n\n" % (item["title"], item["href"])
            self.f.write(tpl)
        return item

    def close_spider(self, spider):
        self.f.close()
class Scrapy1Pipeline2:
    """Demo pipeline: print cnblogs items, then drop the item so no later
    pipeline processes it."""

    def process_item(self, item, spider):
        if spider.name == "cnblogs":
            print(item)
        # NOTE(review): in the flat source the raise reads as unconditional
        # (drops every item, whatever the spider); confirm it was not meant
        # to sit inside the if-branch.
        raise DropItem()
#crawlall.py中:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
class Command(ScrapyCommand):
    """Custom 'crawlall' command that launches every spider in the project."""
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        # Text shown next to this command in `scrapy --help`.
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Queue every registered spider, then start the crawl.

        :param opts: options passed on the command line
        """
        # Schedule each spider by name — the same mechanism the built-in
        # `crawl` command uses.
        for spider_name in self.crawler_process.spiders.list():
            self.crawler_process.crawl(spider_name, **opts.__dict__)
        self.crawler_process.start()
# settings.py:
BOT_NAME = 'scrapy1'
SPIDER_MODULES = ['scrapy1.spiders']
NEWSPIDER_MODULE = 'scrapy1.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy1.middlewares.SpiderMiddleware': 550,
}
# BUG FIX: this slot pointed at SpiderMiddleware (a spider middleware);
# DownMiddleware1 is the downloader middleware defined in middlewares.py.
DOWNLOADER_MIDDLEWARES = {
    'scrapy1.middlewares.DownMiddleware1': 550,
}
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
}
ITEM_PIPELINES = {
    'scrapy1.pipelines.Scrapy1Pipeline': 300,
    # BUG FIX: the module path said 'scrapy.pipelines...' (the framework
    # package) instead of this project's 'scrapy1.pipelines...'.
    'scrapy1.pipelines.Scrapy1Pipeline2': 200,
}
# Crawl depth cap.  With DEPTH_LIMIT=1, URLs found on the start pages are
# visited, but URLs found on those pages are not.
DEPTH_LIMIT = 4
# DUPEFILTER_CLASS = 'scrapy1.duplication.RepeatFilter'  # custom de-dup class
COMMANDS_MODULE = 'scrapy1.commands'