
Scrapy example ----- crawling novels

Reference: the 靜覓 Scrapy tutorial

Crawl target: 頂點小說網, http://www.23us.com/


Hopefully 頂點小說網 won't mind.


First, write items.py:

#-*- coding:utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BingdianItem(scrapy.Item):
    name = scrapy.Field()           # novel name
    author = scrapy.Field()         # author
    novelurl = scrapy.Field()       # novel URL
    serialstatus = scrapy.Field()   # serialization status
    serialnumber = scrapy.Field()   # word count
    category = scrapy.Field()       # category
    name_id = scrapy.Field()        # novel id


class DcontentItem(scrapy.Item):
    id_name = scrapy.Field()            # novel id
    chaptercontent = scrapy.Field()     # chapter content
    num = scrapy.Field()                # used to keep chapters in order
    chapterurl = scrapy.Field()         # chapter URL
    chaptername = scrapy.Field()        # chapter name
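For reference, a scrapy.Item behaves like a dict whose allowed keys are exactly the declared Fields; assigning to an undeclared key raises a KeyError. A quick sketch (the values below are made up for illustration):

from bingdian.items import BingdianItem

item = BingdianItem()
item['name'] = u'some novel'       # declared field, OK
item['author'] = u'some author'
# item['publisher'] = u'...'       # not declared above -> raises KeyError
print(dict(item))                  # {'name': u'some novel', 'author': u'some author'}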
           

The spider file:

#bingdian.py
#-*- coding:utf-8 -*-
import scrapy
# import re
# from bs4 import BeautifulSoup
# from scrapy.http import Response
from bingdian.items import BingdianItem, DcontentItem
from bingdian.SQLitepipelines.sql import Sql

class bingdian_spider(scrapy.Spider):
    name = 'bingdian'
    allowed_domains = ['23us.com']
    start_urls = [
        # category listing pages, one per genre (kept on 23us.com to match allowed_domains)
        'http://www.23us.com/class/1_1.html',
        'http://www.23us.com/class/2_1.html',
        'http://www.23us.com/class/3_1.html',
        'http://www.23us.com/class/4_1.html',
        'http://www.23us.com/class/5_1.html',
        'http://www.23us.com/class/6_1.html',
        'http://www.23us.com/class/7_1.html',
        'http://www.23us.com/class/8_1.html',
        'http://www.23us.com/class/9_1.html',
        'http://www.23us.com/class/10_1.html'
    ]

    def parse(self, response):
        books = response.xpath('//dd/table/tr[@bgcolor="#FFFFFF"]')  # one novel per row; alt: /table/tbody/tr[@bgcolor="#FFFFFF"]
        print(books.extract())
        for book in books:
            name = book.xpath('.//td[1]/a[2]/text()').extract()[0]
            author = book.xpath('.//td[3]/text()').extract()[0]
            novelurl = book.xpath('.//td[1]/a[2]/@href').extract()[0]
            serialstatus = book.xpath('.//td[6]/text()').extract()[0]
            serialnumber = book.xpath('.//td[4]/text()').extract()[0]
            category = book.xpath('//dl/dt/h2/text()').re(u'(.+) - 文章列表')[0]
            jianjieurl = book.xpath('.//td[1]/a[1]/@href').extract()[0]

            item = BingdianItem()
            item['name'] = name
            item['author'] = author
            item['novelurl'] = novelurl
            item['serialstatus'] = serialstatus
            item['serialnumber'] = serialnumber
            item['category'] = category
            item['name_id'] = jianjieurl.split('/')[-1]

            yield item
            yield scrapy.Request(novelurl, callback=self.get_chapter, meta={'name_id': item['name_id']})

        next_page = response.xpath('//dd[@class="pages"]/div/a[12]/@href').extract_first()  # URL of the next page, if any
        if next_page:
            yield scrapy.Request(next_page)

    # collect chapter names and URLs
    def get_chapter(self, response):
        num = 0
        allurls = response.xpath('//tr')
        for trurls in allurls:
            tdurls = trurls.xpath('.//td[@class="L"]')
            for url in tdurls:
                num = num + 1
                chapterurl = response.url + url.xpath('.//a/@href').extract()[0]
                chaptername = url.xpath('.//a/text()').extract()[0]
                rets = Sql.select_chapter(chapterurl)
                if rets[0] == 1:
                    print(u'Chapter already stored')
                    pass
                else:
                    yield scrapy.Request(url=chapterurl, callback=self.get_chaptercontent,
                                         meta={'num': num,
                                               'name_id': response.meta['name_id'],
                                               'chaptername': chaptername,
                                               'chapterurl': chapterurl})
    # fetch the chapter body
    def get_chaptercontent(self,response):
        item = DcontentItem()
        item['num'] = response.meta['num']
        item['id_name'] = response.meta['name_id']
        item['chaptername'] = response.meta['chaptername']
        item['chapterurl'] = response.meta['chapterurl']
        content = response.xpath('//dd[@id="contents"]/text()').extract()
        item['chaptercontent'] = '\n   '.join(content)
        return item
           

name:

The string that defines the spider's name. The name is how Scrapy locates (and instantiates) the spider, so it must be unique. That said, nothing stops you from creating more than one instance of the same spider. name is the most important spider attribute, and it is required.

allowed_domains:

Optional. A list of the domains this spider is allowed to crawl. When OffsiteMiddleware is enabled, URLs whose domains are not on this list will not be followed.

parse():

A method of the spider and the default callback of Request() (the callback can be changed by passing callback=<function>, as done later with the get_chapter() function). When called, the Response object produced after each start URL finishes downloading is passed in as the only argument. The method is responsible for parsing the response data, extracting data (producing items), and generating Request objects for URLs that need further processing.
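A minimal sketch of the callback/meta mechanism described above (the spider, URLs and meta keys here are hypothetical and only for illustration):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'                           # unique spider name, used by `scrapy crawl demo`
    allowed_domains = ['example.com']       # off-site links will not be followed
    start_urls = ['http://example.com/']    # placeholder start URL

    def parse(self, response):
        # default callback for the start URLs; hand data to the next callback via meta
        yield scrapy.Request('http://example.com/page2',
                             callback=self.parse_page,
                             meta={'tag': 'from-parse'})

    def parse_page(self, response):
        # meta travels with the request and comes back on response.meta
        self.logger.info('got %s with tag %s', response.url, response.meta['tag'])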

Modify the settings.py file:

ITEM_PIPELINES = {
   # 'bingdian.pipelines.BingdianPipeline': 300,
    'bingdian.SQLitepipelines.pipelines.BingdingPipeline': 1,  # enable the custom pipeline; the number is its priority (0-1000, lower values mean higher priority)
}
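The dotted path above assumes the custom pipeline and SQL helper live in their own SQLitepipelines package inside the bingdian project (matching the imports used earlier), roughly:

bingdian/
    items.py
    settings.py
    SQLitepipelines/
        __init__.py
        pipelines.py
        sql.py
    spiders/
        bingdian.py

With everything in place, the crawl is started from the project root with scrapy crawl bingdian.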
           
#pipelines.py
#-*- coding:utf-8 -*-
from .sql import Sql
from bingdian.items import BingdianItem,DcontentItem


class BingdingPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, BingdianItem):
            name_id = item['name_id']
            ret = Sql.select_name(name_id)
            if ret[0] == 1:
                print(u'Novel already stored')
                pass
            else:
                xs_name = item['name']
                xs_author = item['author']
                category = item['category']
                Sql.insert_dd_name(xs_name, xs_author, category, name_id)
                print(u'Storing novel title')
        if isinstance(item, DcontentItem):
            url = item['chapterurl']
            name_id = item['id_name']
            num_id = item['num']
            xs_chaptername = item['chaptername']
            xs_content = item['chaptercontent']
            Sql.insert_dd_chaptername(xs_chaptername, xs_content, name_id, num_id, url)
            print(u'%s stored' % xs_chaptername)
        return item
           

The SQL helper file:

#-*- coding:utf-8 -*-
import sqlite3

conn = sqlite3.connect('test.db')
cursor = conn.cursor()
# (re)create the dd_name table -- note that any existing data is dropped on every run
cursor.execute('DROP TABLE IF EXISTS dd_name')
cursor.execute('''CREATE TABLE dd_name(xs_name VARCHAR(255) DEFAULT NULL, xs_author VARCHAR(255),
                                        category VARCHAR(255), name_id VARCHAR(255))''')
# (re)create the dd_chaptername table
cursor.execute('DROP TABLE IF EXISTS dd_chaptername')
cursor.execute('''CREATE TABLE dd_chaptername(xs_chaptername VARCHAR(255) DEFAULT NULL, xs_content TEXT,
                                               id_name INT(11) DEFAULT NULL, num_id INT(11) DEFAULT NULL,
                                               url VARCHAR(255))''')
class Sql:
    # insert a novel record (parameterized to avoid breaking on quotes in titles)
    @classmethod
    def insert_dd_name(cls, xs_name, xs_author, category, name_id):
        sql = "insert into dd_name (xs_name, xs_author, category, name_id) values (?, ?, ?, ?)"
        cursor.execute(sql, (xs_name, xs_author, category, name_id))
        conn.commit()

    # duplicate check: returns (1,) if the novel is already stored, (0,) otherwise
    @classmethod
    def select_name(cls, name_id):
        sql = "SELECT EXISTS (select 1 from dd_name where name_id = ?)"
        cursor.execute(sql, (name_id,))
        return cursor.fetchall()[0]

    # insert a chapter record
    @classmethod
    def insert_dd_chaptername(cls, xs_chaptername, xs_content, id_name, num_id, url):
        sql = '''INSERT INTO dd_chaptername(xs_chaptername, xs_content, id_name, num_id, url)
                 VALUES (?, ?, ?, ?, ?)'''
        cursor.execute(sql, (xs_chaptername, xs_content, id_name, num_id, url))
        conn.commit()

    # duplicate check: returns (1,) if the chapter URL is already stored, (0,) otherwise
    @classmethod
    def select_chapter(cls, url):
        sql = "SELECT EXISTS (select 1 from dd_chaptername where url = ?)"
        cursor.execute(sql, (url,))
        return cursor.fetchall()[0]
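After a crawl finishes, the data in test.db can be spot-checked with a short standalone script (a minimal sketch; the file name test.db and the table names come from the code above):

import sqlite3

conn = sqlite3.connect('test.db')
cursor = conn.cursor()

# how many novels and chapters made it into the database
cursor.execute('SELECT COUNT(*) FROM dd_name')
print('novels:', cursor.fetchone()[0])
cursor.execute('SELECT COUNT(*) FROM dd_chaptername')
print('chapters:', cursor.fetchone()[0])

# the first few chapters, in the order the spider numbered them
cursor.execute('SELECT id_name, num_id, xs_chaptername FROM dd_chaptername ORDER BY id_name, num_id LIMIT 5')
for id_name, num_id, chaptername in cursor.fetchall():
    print(id_name, num_id, chaptername)

conn.close()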
           