
Python Crawler: Scrapy Framework Examples

#chouti.py:
import scrapy,sys,io
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChoutiItem
from scrapy.dupefilters import RFPDupeFilter

sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding="gb18030")  # re-wrap stdout so Chinese text prints correctly on a GBK/GB18030 console

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ["chouti.com"]
    start_urls = ["http://dig.chouti.com"]
    #visited_urls=set()  # would hold already-visited URLs to avoid revisiting them; Scrapy deduplicates requests itself, so this is unnecessary

    #def start_requests(self):  # override scrapy.Spider's start_requests()
    #    for url in self.start_urls:
    #        yield Request(url,callback=self.run)

    #def run(self,response):
    #    pass

    def parse(self, response):
        #Get the title and link of every hot news item on the current page:
        sel1=Selector(response=response).xpath("//div[@id='content-list']/div[@class='item']")
        for obj in sel1:
            title=obj.xpath(".//a[@class='show-content']/text()").extract_first().strip()
            href=obj.xpath(".//a[@class='show-content']/@href").extract_first().strip()
            item_obj=ChoutiItem(title=title,href=href)  # wrap the scraped fields in an Item
            yield item_obj  # hand the Item to the item pipelines

        #Get the hrefs of the page-number links shown under the news list (the pages reachable directly from this one):
        sel2=Selector(response=response).xpath(r"//a[re:test(@href,'/all/hot/recent/\d+')]/@href").extract()
        for url in sel2:
            #md5_url=self.md5(url)
            #if url in self.visited_urls:  # if this URL has already been visited
            #    print("URL already visited")
            #else:
            #    self.visited_urls.add(url)
            print(url)
            url="https://dig.chouti.com%s"%url
            #Put the URL into the scheduler (remember to set DEPTH_LIMIT in settings.py):
            yield Request(url=url,callback=self.parse)  # note: pass the callback without parentheses

    #def md5(self,url):
    #    #URLs to be stored can be long, so store their md5 digest instead (digests all have the same length)
    #    import hashlib
    #    obj=hashlib.md5()
    #    obj.update(bytes(url,encoding="utf-8"))
    #    return obj.hexdigest()
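
The spider above is normally started with the scrapy CLI from the project root (scrapy crawl chouti). A small driver script works as well; a minimal sketch, assuming it sits next to scrapy.cfg (the file name run.py is hypothetical):

#run.py:
from scrapy.cmdline import execute

#Equivalent to typing "scrapy crawl chouti --nolog" in the project root:
execute(["scrapy", "crawl", "chouti", "--nolog"])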
           
#chouti2.py:
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.cookies import CookieJar

class Chouti2Spider(scrapy.Spider):
    name = 'chouti2'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']
    cookie_dict = None

    def parse(self, response):
        cookie_obj = CookieJar()
        cookie_obj.extract_cookies(response, response.request)  # collect the cookies set by the first response
        self.cookie_dict = cookie_obj._cookies  # keep them for later requests (e.g. the upvote POSTs)
        yield Request(
            url = "https://dig.chouti.com/login",
            method = "POST",
            body = "phone=12345678901&password=aaabbbccc&oneMonth=1",#注意:請求體不支援使用字典
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            },
            cookies = cookie_obj._cookies,
            callback = self.check_login
        )

    def check_login(self, response):
        print(response.text)
        yield Request(  # move on to the next step (e.g. upvoting)
            url="https://dig.chouti.com",
            callback=self.like
        )

    def like(self, response):
        id_list = Selector(response=response).xpath("//div[@share-linkid]/@share-linkid").extract()
        for nid in id_list:  # upvote every hot news item
            url = "https://dig.chouti.com/link/vote?linksId=%s" % nid
            yield Request(
                url=url,
                method="POST",
                cookies=self.cookie_dict,
                callback=self.show
            )
        page_urls=Selector(response=response).xpath("//div[@id='dig_lcpage']//a/@href").extract()
        for page in page_urls:
            url="https://dig.chouti.com%s"%page
            yield Request(url=url,callback=self.like)  # follow the pagination so the news on every page gets upvoted

    def show(self, response):
        print(response.text)
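
In chouti2.py the login POST builds the urlencoded body by hand. Scrapy's FormRequest can do the encoding and set the Content-Type header itself; a minimal sketch of such a login step (the field names are taken from the body string above, and the method is not wired into the spider):

    #Hypothetical method, assuming "from scrapy.http import FormRequest" at the top of chouti2.py:
    def login_with_formrequest(self, response):
        #FormRequest urlencodes formdata and sets the Content-Type header itself
        yield FormRequest(
            url="https://dig.chouti.com/login",
            formdata={"phone": "12345678901", "password": "aaabbbccc", "oneMonth": "1"},
            callback=self.check_login,
        )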
           
#cnblogs.py:
import scrapy

class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['cnblogs.com']
    start_urls = ['http://cnblogs.com/']

    def parse(self, response):
        pass
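
The cnblogs spider above is only a stub; if its parse() yielded anything, Scrapy1Pipeline2 (see pipelines.py below) would print it and then drop it. A minimal sketch of such a parse(), meant to replace the stub inside CnblogsSpider (the XPath is an assumption about the page markup, not verified):

    def parse(self, response):
        #Hypothetical selector; adjust it to the real cnblogs front-page markup
        for title in response.xpath("//a[@class='post-item-title']/text()").extract():
            yield {"title": title.strip()}  # plain dicts are also accepted by the item pipelines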
           
#duplication.py:
from scrapy.dupefilters import BaseDupeFilter

class RepeatFilter(BaseDupeFilter):
    def __init__(self, path=None, debug=False):
        self.visited_urls=set()

    #None of the following methods may be renamed:
    @classmethod
    def from_settings(cls, settings):  # cls is the class itself (here: RepeatFilter)
        #Scrapy creates the RepeatFilter instance internally via RepeatFilter.from_settings()
        return cls()

    def request_seen(self, request):
        if request.url in self.visited_urls:
            return True
        else:
            self.visited_urls.add(request.url)
            return False

    def request_fingerprint(self, request):
        print("request_fingerprint")

    def close(self, reason):  # called when the crawl finishes
        print("close")

    def log(self, request, spider):  # logging
        print("log")
           
#items.py:
import scrapy

class ChoutiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    href=scrapy.Field()
           
#middlewares.py:
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

class DownMiddleware1(object):
    #Note: only the following three methods are used in a downloader middleware
    def process_request(self,request,spider):
        '''
        Called for every request that passes through the downloader middlewares before it is downloaded
        :param request: the Request object
        :param spider: the Spider object
        :return:
            None: hand the request to the next downloader middleware's process_request()
            Response object: stop running process_request() and start running process_response()
            Request object: stop the middleware chain and put the Request back into the scheduler
            raise IgnoreRequest: process_exception() is called
        '''
        pass

    def process_response(self,request,response,spider):
        '''
        Called when the download has finished and the response is on its way back
        :param request: the Request object
        :param response: the Response object
        :param spider: the Spider object
        :return:
            Response object: handed to the next downloader middleware's process_response()
            Request object: stop the middleware chain; the request is rescheduled for download
            raise IgnoreRequest: Request.errback() is called
        '''
        print('response1')
        return response

    def process_exception(self,request,exception,spider):
        '''
        Called when the download handler or a process_request() raises an exception
        :param request: the Request object
        :param exception: the raised exception
        :param spider: the Spider object
        :return:
            None: hand the exception to the next downloader middleware's process_exception();
              if every process_exception() returns None, the error propagates
            Response object: stop running the remaining process_exception() methods
            Request object: stop the middleware chain; the request is rescheduled for download
        '''
        return None

class SpiderMiddleware(object):
    def process_spider_input(self,response,spider):
        '''
        Called once the download has finished, before the response is handed to parse()
        :param response: the Response object passed on by the downloader
        :param spider: the Spider object
        :return:
        '''
        pass

    def process_spider_output(self,response,result,spider):
        '''
        Called with what the spider returns, before it goes on to the scheduler or the pipelines
        :param response: the Response object
        :param result: the Request/Item objects returned by parse()
        :param spider: the Spider object
        :return: must be an iterable of Request and/or Item objects
        '''
        return result

    def process_spider_exception(self,response, exception, spider):
        '''
        Called when the spider raises an exception
        :param response:
        :param exception:
        :param spider:
        :return:
            None: keep handing the exception to the remaining middlewares
            iterable of Response/Item objects: handed to the scheduler or the pipelines
        '''
        return None

    def process_start_requests(self,start_requests,spider):
        '''
        Called once when the spider starts; not called again for requests generated at deeper levels
        :param start_requests: an iterable of Request objects built from the start URLs
        :param spider: the Spider object
        :return: an iterable of Request objects
        '''
        return start_requests
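
As a concrete use of the hooks documented above, a downloader middleware often rewrites outgoing requests in process_request(). A minimal sketch (the class name and header value are illustrative; it would also need its own entry in DOWNLOADER_MIDDLEWARES):

class UserAgentDownMiddleware(object):
    #Hypothetical example: give every outgoing request a fixed User-Agent
    def process_request(self, request, spider):
        request.headers.setdefault("User-Agent", "Mozilla/5.0 (scrapy1 demo)")
        return None  # None -> continue with the next downloader middleware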
           
#pipelines.py:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class Scrapy1Pipeline:
    def __init__(self,conn_s):
        self.conn_s=conn_s
        self.conn=None

    def open_spider(self,spider):
        self.f=open("news.json","a+")

    def process_item(self,item,spider):
        if spider.name=="chouti":
            print(item,spider)
            tpl="%s\n%s\n\n"%(item["title"],item["href"])
            self.f.write(tpl)
        return item

    def close_spider(self,spider):
        self.f.close()

    @classmethod
    def from_crawler(cls,crawler):
        #Called before __init__; lets the pipeline pull values from settings.py
        conn_s=crawler.settings.getint('DB')
        return cls(conn_s)

class Scrapy1Pipeline2:
    def process_item(self,item,spider):
        if spider.name=="cnblogs":
            print(item)
        raise DropItem()  # stop any later pipeline from processing this item
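
Scrapy1Pipeline writes plain text into news.json despite the .json name; if real JSON output is wanted, each item can be serialized instead. A minimal sketch (the class, and the file name news.jsonl, are hypothetical and would need their own ITEM_PIPELINES entry):

import json

class JsonLinesPipeline:
    #Hypothetical pipeline: one JSON object per line
    def open_spider(self, spider):
        self.f = open("news.jsonl", "a", encoding="utf-8")

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.f.close()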
           
#crawlall.py:
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project=True

    def syntax(self):
        return '[options]'

    def short_desc(self):  # the hint shown for this command in `scrapy --help`
        return 'Runs all of the spiders'

    def run(self,args,opts):  # opts holds the options passed to the command
        #Find the names of all spiders in the project:
        spider_list=self.crawler_process.spiders.list()

        #Start crawling (the built-in crawl command is implemented in a similar way):
        for name in spider_list:
            self.crawler_process.crawl(name,**opts.__dict__)
        self.crawler_process.start()
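
For Scrapy to discover this command, crawlall.py has to sit in the package that COMMANDS_MODULE in settings.py points to. A sketch of the expected layout and invocation (directory names follow the project name used here):

#Expected layout:
#  scrapy1/
#      commands/
#          __init__.py      (empty file so the directory is a package)
#          crawlall.py
#Then, from the project root:
#  scrapy crawlall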
           
#settings.py:
BOT_NAME = 'scrapy1'

SPIDER_MODULES = ['scrapy1.spiders']
NEWSPIDER_MODULE = 'scrapy1.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy1.middlewares.SpiderMiddleware':550
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy1.middlewares.DownMiddleware1':550
}

EXTENSIONS = {
    #'scrapy.extensions.telnet.TelnetConsole': None,
}

ITEM_PIPELINES = {
    'scrapy1.pipelines.Scrapy1Pipeline': 300,
    'scrapy1.pipelines.Scrapy1Pipeline2':200,
}

DEPTH_LIMIT=4  # maximum crawl depth
#With DEPTH_LIMIT=1, URLs found on the start page (call them URL1) are visited, but URLs found on the URL1 pages are not

#DUPEFILTER_CLASS='scrapy1.duplication.RepeatFilter'  # class used for request deduplication

COMMANDS_MODULE='scrapy1.commands'  # package that holds custom commands such as crawlall