First, create the crawler project with the command scrapy startproject csdnspider.
Then, under the spiders directory, create a file named csdnspider.py; this is where our main spider code lives. The directory structure looks like this:
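(A sketch of the typical layout; the exact files generated by scrapy startproject can differ slightly between Scrapy versions.)

csdnspider/
    scrapy.cfg                 # deploy configuration
    csdnspider/
        __init__.py
        items.py               # item definitions (edited below)
        pipelines.py           # item pipelines (edited below)
        settings.py            # project settings (edited below)
        spiders/
            __init__.py
            csdnspider.py      # our spider, created by hand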
Locate and open the items.py file and define the fields we want to scrape:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field

class CsdnspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class PaperItem(Item):
    title = Field()        # post title
    link = Field()         # post link
    writetime = Field()    # time the post was written
    readers = Field()      # view count
    comments = Field()     # comment count
Open the csdnspider.py file we created and implement the spider:
############################################################################
# Program: CSDN blog crawler
# Purpose: crawl all of my CSDN blog posts
# Date:    2016/06/01
# Author:  yr
#############################################################################
import scrapy, re, json, sys
# import the framework's built-in base class scrapy.spider.Spider
try:
    from scrapy.spider import Spider
except ImportError:
    from scrapy.spider import BaseSpider as Spider
# import CrawlSpider (the class commonly used for crawling regular sites) and the Rule class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from bs4 import BeautifulSoup
from csdnspider.items import PaperItem
# set the default encoding (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

add = 0

class CSDNPaperSpider(CrawlSpider):
    name = "csdnspider"
    allowed_domains = ["csdn.net"]
    # entry page of the crawl
    start_urls = ["http://blog.csdn.net/fly_yr/article/list/1"]
    # custom crawl rules
    rules = [Rule(LxmlLinkExtractor(allow=(r'/article/list/\d{,2}')), follow=True, callback='parseitem')]

    # callback that extracts the page data into items
    def parseitem(self, response):
        global add
        items = []
        data = response.body
        soup = BeautifulSoup(data, "html5lib")
        # find all blog-post blocks
        sites = soup.find_all('div', "list_item article_item")
        for site in sites:
            item = PaperItem()
            # title, link, date, view count, comment count
            item['title'] = site.find('span', "link_title").a.get_text()
            item['link'] = site.find('span', "link_title").a.get('href')
            item['writetime'] = site.find('span', "link_postdate").get_text()
            item['readers'] = re.findall(re.compile(r'\((.*?)\)'), site.find('span', "link_view").get_text())[0]
            item['comments'] = re.findall(re.compile(r'\((.*?)\)'), site.find('span', "link_comments").get_text())[0]
            add += 1
            items.append(item)
        print("the total number:", add)
        return items
As the code above shows, the page elements are extracted with the BeautifulSoup library. I also defined a global variable add that counts the total number of posts extracted, so it can be compared against the output to verify correctness.
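To make the extraction logic easier to follow, here is a small standalone sketch of the same BeautifulSoup calls, run against a hand-written fragment that only approximates CSDN's real list markup (the class names come from the spider above; the HTML content itself is made up for illustration):

import re
from bs4 import BeautifulSoup

html = '''
<div class="list_item article_item">
  <span class="link_title"><a href="/fly_yr/article/details/1">Sample post</a></span>
  <span class="link_postdate">2016-06-01</span>
  <span class="link_view">阅读(394)</span>
</div>
'''
soup = BeautifulSoup(html, "html5lib")
for site in soup.find_all('div', "list_item article_item"):
    print(site.find('span', "link_title").a.get_text())    # Sample post
    print(site.find('span', "link_title").a.get('href'))   # /fly_yr/article/details/1
    print(site.find('span', "link_postdate").get_text())   # 2016-06-01
    # the view/comment counts sit inside parentheses, hence the regex
    print(re.findall(r'\((.*?)\)', site.find('span', "link_view").get_text())[0])  # 394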
Locate and open the pipelines.py file and add the following code:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json, codecs

class CsdnspiderPipeline(object):
    def process_item(self, item, spider):
        return item

class JsonWithEncodingCSDNPipeline(object):
    def __init__(self):
        self.file = codecs.open('papers.json', 'w', encoding='utf-8')

    # write one formatted record per item to papers.json
    def process_item(self, item, spider):
        writetime = json.dumps("日期:" + str(item['writetime']), ensure_ascii=False) + "\n"
        title = json.dumps("标题:" + str(item['title']), ensure_ascii=False) + "\n"
        link = json.dumps("链接:" + str(item['link']), ensure_ascii=False) + "\n"
        readers = json.dumps("阅读次数:" + str(item['readers']), ensure_ascii=False) + "\t"
        comments = json.dumps("评论数量:" + str(item['comments']), ensure_ascii=False) + "\n\n"
        line = writetime + title + link + readers + comments
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.file.close()
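One caveat worth flagging: Scrapy only calls open_spider()/close_spider() on item pipelines automatically, so a method named spider_closed is not invoked unless it is registered as a handler for the spider_closed signal (or simply renamed to close_spider). A minimal sketch of one way to register it, keeping the same pipeline class and leaving process_item exactly as written above:

from scrapy import signals
import codecs

class JsonWithEncodingCSDNPipeline(object):
    def __init__(self):
        self.file = codecs.open('papers.json', 'w', encoding='utf-8')

    @classmethod
    def from_crawler(cls, crawler):
        # build the pipeline and connect spider_closed() to the matching signal
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    # process_item as above ...

    def spider_closed(self, spider):
        self.file.close()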
Locate and open the settings.py file and register the pipeline:
# Scrapy settings for csdnspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
# http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'csdnspider'

SPIDER_MODULES = ['csdnspider.spiders']
NEWSPIDER_MODULE = 'csdnspider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'csdnspider (+http://www.yourdomain.com)'

ITEM_PIPELINES = {
    'csdnspider.pipelines.JsonWithEncodingCSDNPipeline': 300,
}

LOG_LEVEL = 'INFO'
Run scrapy crawl csdnspider from the project root to get the result:
At the moment I have 394 posts in total, so the result is correct. Open the papers.json file in the project root to view the scraped post information: