參考: 靜覓scrapy教程
爬取目标:頂點小說網 http://www.23us.com/
希望頂點小說網不要生氣
首先來編寫items.py
#-*- coding:utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class BingdianItem(scrapy.Item):
    """Item holding the per-novel metadata scraped from the listing pages."""
    name = scrapy.Field()            # novel title
    author = scrapy.Field()          # author name
    novelurl = scrapy.Field()        # URL of the novel's page
    serialstatus = scrapy.Field()    # serialization status (ongoing/finished)
    serialnumber = scrapy.Field()    # serialized word count
    category = scrapy.Field()        # listing category of the novel
    name_id = scrapy.Field()         # numeric id of the novel (from its URL)
class DcontentItem(scrapy.Item):
    """Item holding one chapter's content and its ordering information."""
    id_name = scrapy.Field()         # numeric id of the novel this chapter belongs to
    chaptercontent = scrapy.Field()  # chapter body text
    num = scrapy.Field()             # running ordinal used to keep chapter order
    chapterurl = scrapy.Field()      # URL of the chapter page
    chaptername = scrapy.Field()     # chapter title
spider檔案
#bingdian.py
#-*- coding:utf-8 -*-
import scrapy
# import re
# from bs4 import BeautifulSoup
# from scrapy.http import Response
from bingdian.items import BingdianItem ,DcontentItem
from bingdian.SQLitepipelines.sql import Sql
class bingdian_spider(scrapy.Spider):
    """Spider that crawls novel listings, then each novel's chapter index,
    then each chapter's content, emitting BingdianItem / DcontentItem."""
    name = 'bingdian'
    allowed_domains = ['23us.com']
    # NOTE(review): start_urls mixes 23us.com and 23wx.com while
    # allowed_domains only lists 23us.com — the 23wx.com requests are likely
    # filtered by OffsiteMiddleware; confirm which domain is intended.
    start_urls = [
        'http://www.23us.com/class/1_1.html',
        'http://www.23wx.com/class/2_1.html',
        'http://www.23wx.com/class/3_1.html',
        'http://www.23wx.com/class/4_1.html',
        'http://www.23wx.com/class/5_1.html',
        'http://www.23wx.com/class/6_1.html',
        'http://www.23wx.com/class/7_1.html',
        'http://www.23wx.com/class/8_1.html',
        'http://www.23wx.com/class/9_1.html',
        'http://www.23wx.com/class/10_1.html'
    ]

    def parse(self, response):
        """Parse one listing page: yield a BingdianItem per novel row, a
        Request for each novel's chapter index, and a Request for the next
        listing page when present."""
        # Each novel is one white table row on the listing page.
        books = response.xpath('//dd/table/tr[@bgcolor="#FFFFFF"]')
        for book in books:
            name = book.xpath('.//td[1]/a[2]/text()').extract()[0]
            author = book.xpath('.//td[3]/text()').extract()[0]
            novelurl = book.xpath('.//td[1]/a[2]/@href').extract()[0]
            serialstatus = book.xpath('.//td[6]/text()').extract()[0]
            serialnumber = book.xpath('.//td[4]/text()').extract()[0]
            # Category is taken from the page heading; NOTE(review): this
            # pattern text was machine-converted to traditional Chinese and
            # may no longer match the site's actual heading — verify.
            category = book.xpath('//dl/dt/h2/text()').re(u'(.+) - 文章清單')[0]
            jianjieurl = book.xpath('.//td[1]/a[1]/@href').extract()[0]
            item = BingdianItem()
            item['name'] = name
            item['author'] = author
            item['novelurl'] = novelurl
            item['serialstatus'] = serialstatus
            item['serialnumber'] = serialnumber
            item['category'] = category
            # assumes the intro URL ends with ".../<name_id>" — TODO confirm
            # the split index against a real page URL.
            item['name_id'] = jianjieurl.split('/')[-1]
            yield item
            yield scrapy.Request(novelurl, callback=self.get_chapter,
                                 meta={'name_id': item['name_id']})
        # a[12] is the "next page" link; extract_first() returns None on the
        # last page instead of raising IndexError like extract()[0] would.
        next_page = response.xpath('//dd[@class="pages"]/div/a[12]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page)

    def get_chapter(self, response):
        """Parse a novel's chapter index and request every unseen chapter."""
        num = 0  # running ordinal, forwarded in meta to preserve chapter order
        allurls = response.xpath('//tr')
        for trurls in allurls:
            tdurls = trurls.xpath('.//td[@class="L"]')
            for url in tdurls:
                num = num + 1
                # Chapter links are relative, so prefix the index page URL.
                chapterurl = response.url + url.xpath('.//a/@href').extract()[0]
                chaptername = url.xpath('.//a/text()').extract()[0]
                # select_chapter runs SELECT EXISTS(...): row[0] is 1 if the
                # chapter URL is already stored.
                rets = Sql.select_chapter(chapterurl)
                if rets[0] == 1:
                    print(u'章節已經存在了')
                else:
                    yield scrapy.Request(url=chapterurl,
                                         callback=self.get_chaptercontent,
                                         meta={'num': num,
                                               'name_id': response.meta['name_id'],
                                               'chaptername': chaptername,
                                               'chapterurl': chapterurl})

    def get_chaptercontent(self, response):
        """Parse one chapter page and return the filled DcontentItem."""
        item = DcontentItem()
        item['num'] = response.meta['num']
        item['id_name'] = response.meta['name_id']
        item['chaptername'] = response.meta['chaptername']
        item['chapterurl'] = response.meta['chapterurl']
        # The chapter body is the text nodes of <dd id="contents">.
        content = response.xpath('//dd[@id="contents"]/text()').extract()
        item['chaptercontent'] = '\n '.join(content)
        return item
name:
定義spider名字的字串(string)。spider的名字定義了Scrapy如何定位(並初始化)spider,所以其必須是唯一的。 不過您可以生成多個相同的spider實例(instance),這沒有任何限制。 name是spider最重要的屬性,而且是必須的。
allowed_domains:
可選。包含了spider允許爬取的域名(domain)清單(list)。 當 OffsiteMiddleware 啟用時, 域名不在清單中的URL不會被跟進。
parse():
是spider的一個方法。Request()預設回調函數(可以通過傳遞callback=(函數名)修改回調函數,例如後面的get_chapter()函數)。被調用時,每個初始URL完成下載後生成的 Response 對象將會作為唯一的參數傳遞給該函數。 該方法負責解析返回的資料(response data),提取資料(生成item)以及生成需要進一步處理的URL的 Request 對象。
修改settings.py檔案
ITEM_PIPELINES = {
    # 'bingdian.pipelines.BingdianPipeline': 300,
    # Enable the custom pipeline; 1 is the priority (any value 1-1000,
    # lower runs earlier). The value was lost in the blog export — restored.
    'bingdian.SQLitepipelines.pipelines.BingdingPipeline': 1,
}
#pipelines.py
#-*- coding:utf-8 -*-
from .sql import Sql
from bingdian.items import BingdianItem,DcontentItem
class BingdingPipeline(object):
    """Pipeline persisting BingdianItem / DcontentItem into SQLite via Sql."""

    def process_item(self, item, spider):
        """Store the item unless the Sql duplicate check says it exists.

        Returns the item unchanged so later pipelines still receive it.
        """
        if isinstance(item, BingdianItem):
            name_id = item['name_id']
            # select_name runs SELECT EXISTS(...): row[0] is 1 when the
            # novel was already stored.
            ret = Sql.select_name(name_id)
            if ret[0] == 1:
                print('已經存在了')
            else:
                xs_name = item['name']
                xs_author = item['author']
                category = item['category']
                Sql.insert_dd_name(xs_name, xs_author, category, name_id)
                print(u'開始存小說标題')
        if isinstance(item, DcontentItem):
            url = item['chapterurl']
            name_id = item['id_name']
            num_id = item['num']
            xs_chaptername = item['chaptername']
            xs_content = item['chaptercontent']
            Sql.insert_dd_chaptername(xs_chaptername, xs_content, name_id,
                                      num_id, url)
            # Bug fix: the '%' formatting must be inside print(); the original
            # applied '%' to print()'s None return and raised TypeError.
            print(u'%s 存儲完畢' % xs_chaptername)
        return item
sql操作檔案
#-*- coding:utf-8 -*-
import sqlite3
# Module-level SQLite connection and cursor shared by the Sql class below.
conn = sqlite3.connect('test.db')
cursor = conn.cursor()
# Create the novel table. IF NOT EXISTS replaces the original DROP+CREATE:
# dropping on every start wiped previously scraped data and made the
# duplicate checks in Sql.select_name/select_chapter useless across runs.
cursor.execute('''CREATE TABLE IF NOT EXISTS dd_name(
    xs_name VARCHAR (255) DEFAULT NULL,
    xs_author VARCHAR (255),
    category VARCHAR (255),
    name_id VARCHAR (255))''')
# Create the chapter table.
cursor.execute('''CREATE TABLE IF NOT EXISTS dd_chaptername(
    xs_chaptername VARCHAR(255) DEFAULT NULL,
    xs_content TEXT,
    id_name INT(11) DEFAULT NULL,
    num_id INT(11) DEFAULT NULL,
    url VARCHAR(255))''')
class Sql:
    """Classmethod helpers over the module-level SQLite connection.

    All queries use '?' placeholders instead of %-string interpolation:
    the original formatting broke on any quote character in titles or
    chapter text and was vulnerable to SQL injection.
    """

    @classmethod
    def insert_dd_name(cls, xs_name, xs_author, category, name_id):
        """Insert one novel's metadata row and commit."""
        cursor.execute(
            "insert into dd_name (xs_name,xs_author,category,name_id) "
            "values (?,?,?,?)",
            (xs_name, xs_author, category, name_id))
        conn.commit()

    @classmethod
    def select_name(cls, name_id):
        """Return a 1-tuple: (1,) if a novel with name_id exists, else (0,)."""
        cursor.execute(
            "SELECT EXISTS (select 1 from dd_name where name_id = ?)",
            (name_id,))
        return cursor.fetchall()[0]

    @classmethod
    def insert_dd_chaptername(cls, xs_chaptername, xs_content, id_name,
                              num_id, url):
        """Insert one chapter row and commit."""
        cursor.execute(
            "INSERT INTO dd_chaptername(xs_chaptername , xs_content , "
            "id_name ,num_id , url) VALUES (?,?,?,?,?)",
            (xs_chaptername, xs_content, id_name, num_id, url))
        conn.commit()

    @classmethod
    def select_chapter(cls, url):
        """Return a 1-tuple: (1,) if a chapter with this url exists, else (0,)."""
        cursor.execute(
            "SELECT EXISTS (select 1 from dd_chaptername where url = ?)",
            (url,))
        return cursor.fetchall()[0]