First, create the crawler project with the command scrapy startproject csdnspider.
Then, under the spiders directory, create a csdnspider.py file; this is where the main spider code lives. The directory structure is as follows:
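For reference, scrapy startproject generates the standard skeleton sketched below; csdnspider.py is the file we add by hand under spiders/ (the annotations are mine):

csdnspider/
├── scrapy.cfg                # deploy configuration
└── csdnspider/               # the project's Python module
    ├── __init__.py
    ├── items.py              # item definitions
    ├── pipelines.py          # item pipelines
    ├── settings.py           # project settings
    └── spiders/
        ├── __init__.py
        └── csdnspider.py     # our spider (added manually)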
Locate and open the items.py file and define the fields we want to scrape:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field

class CsdnspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class PaperItem(Item):
    title = Field()        # post title
    link = Field()         # post link
    writetime = Field()    # date the post was written
    readers = Field()      # read count
    comments = Field()     # comment count
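As a quick sanity check, a Scrapy Item behaves like a dictionary restricted to its declared fields; a minimal sketch, assuming the PaperItem definition above:

from csdnspider.items import PaperItem

item = PaperItem()
item['title'] = 'My first post'  # OK: 'title' is a declared field
print(item['title'])
# item['author'] = 'yr'          # would raise KeyError: 'author' is not declared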
Open the newly created csdnspider.py file and implement the code:
############################################################################
# Program:  CSDN blog crawler
# Function: scrape all of my CSDN blog posts
# Date:     2016/06/01
# Author:   yr
#############################################################################
import scrapy, re, json, sys
# Import the framework's built-in base class scrapy.spider.Spider
try:
    from scrapy.spider import Spider
except ImportError:
    from scrapy.spider import BaseSpider as Spider
# Import CrawlSpider, the class commonly used for crawling regular sites, and the Rule class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from bs4 import BeautifulSoup
from csdnspider.items import PaperItem

# Set the default encoding (Python 2 only)
reload(sys)
sys.setdefaultencoding('utf-8')

add = 0

class CsdnPaperSpider(CrawlSpider):
    name = "csdnspider"
    allowed_domains = ["csdn.net"]
    # Define the spider's entry page
    start_urls = ["http://blog.csdn.net/fly_yr/article/list/1"]
    # Custom rule: follow every paging link of the form /article/list/N
    rules = [Rule(LxmlLinkExtractor(allow=('/article/list/\d{,2}')), follow=True, callback='parseItem')]

    # Extract the page data into items
    def parseItem(self, response):
        global add
        items = []
        data = response.body
        soup = BeautifulSoup(data, "html5lib")
        # Find all blog-post blocks
        sites = soup.find_all('div', "list_item article_item")
        for site in sites:
            item = PaperItem()
            # Title, link, date, read count, comment count
            item['title'] = site.find('span', "link_title").a.get_text()
            item['link'] = site.find('span', "link_title").a.get('href')
            item['writetime'] = site.find('span', "link_postdate").get_text()
            # The counts render as "阅读(123)" / "评论(4)"; pull the number out of the parentheses
            item['readers'] = re.findall(re.compile(r'\((\d+)\)'), site.find('span', "link_view").get_text())[0]
            item['comments'] = re.findall(re.compile(r'\((\d+)\)'), site.find('span', "link_comments").get_text())[0]
            add += 1
            items.append(item)
        print("the total number:", add)
        return items
As the code above shows, page elements are extracted with the BeautifulSoup library. I also defined a global variable add to count the total number of posts extracted, so it can be compared against the output to verify correctness.
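To see the extraction logic in isolation, here is a minimal sketch that runs the same BeautifulSoup and regex steps on a hard-coded HTML fragment; the fragment and its values are illustrative, modeled on the CSDN list markup the spider targets:

# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup

# Illustrative fragment modeled on a single CSDN list entry
html = '''
<div class="list_item article_item">
  <span class="link_title"><a href="/fly_yr/article/details/1">Sample post</a></span>
  <span class="link_postdate">2016-06-01</span>
  <span class="link_view">阅读(394)</span>
  <span class="link_comments">评论(7)</span>
</div>
'''

soup = BeautifulSoup(html, "html5lib")
site = soup.find('div', "list_item article_item")
print(site.find('span', "link_title").a.get_text())    # Sample post
# Pull the read count out of the parentheses, as the spider does
print(re.findall(r'\((\d+)\)', site.find('span', "link_view").get_text())[0])  # 394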
Locate and open the pipelines.py file and add the code:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json, codecs

class CsdnspiderPipeline(object):
    def process_item(self, item, spider):
        return item

class JsonWithEncodingCsdnPipeline(object):
    def __init__(self):
        self.file = codecs.open('papers.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        writetime = json.dumps("date: " + str(item['writetime']), ensure_ascii=False) + "\n"
        title = json.dumps("title: " + str(item['title']), ensure_ascii=False) + "\n"
        link = json.dumps("link: " + str(item['link']), ensure_ascii=False) + "\n"
        readers = json.dumps("reads: " + str(item['readers']), ensure_ascii=False) + "\t"
        comments = json.dumps("comments: " + str(item['comments']), ensure_ascii=False) + "\n\n"
        line = writetime + title + link + readers + comments
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider on each pipeline when the spider finishes
        self.file.close()
Locate and open the settings.py file:
# Scrapy settings for csdnspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
# http://doc.scrapy.org/en/latest/topics/settings.html
BOT_NAME = 'csdnspider'

SPIDER_MODULES = ['csdnspider.spiders']
NEWSPIDER_MODULE = 'csdnspider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'csdnspider (+http://www.yourdomain.com)'

# Register the JSON pipeline; the value (0-1000) sets its run order
ITEM_PIPELINES = {
    'csdnspider.pipelines.JsonWithEncodingCsdnPipeline': 300,
}

LOG_LEVEL = 'INFO'
Run scrapy crawl csdnspider from the project root directory to get the result:
At present I have 394 blog posts in total, so the result is correct. Open the papers.json file in the project root to view the scraped post information:
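Based on the pipeline code above, each record in papers.json takes the following shape (the values here are illustrative, not actual output):

"date: 2016-06-01"
"title: Sample post title"
"link: /fly_yr/article/details/12345678"
"reads: 394"	"comments: 7"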