惊了个呆 不到20行爬完~
cmd:
scrapy startproject toubiao
cd toubiao
scrapy genspider -t crawl gg bidchance.com
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class GgSpider(CrawlSpider):
    """CrawlSpider that scrapes tender-announcement pages from bidchance.com.

    Detail pages (info-gonggao-<id>.html) are parsed by ``parse_item``;
    listing/pagination pages (outlinegonggao<n>.html) are followed to
    discover more detail links.
    """
    name = 'gg'
    allowed_domains = ['bidchance.com']
    start_urls = ['http://www.bidchance.com/outlinegonggao.html']

    # Rules: (link extractor, callback destination, whether to keep following).
    # Dots in the URL patterns are escaped — the originals used bare '.',
    # which matches any character.
    rules = (
        # Announcement detail pages: parse, do not follow links from them.
        Rule(LinkExtractor(allow=r'www\.bidchance\.com/info-gonggao-(\d+)\.html'),
             callback='parse_item', follow=False),
        # Pagination pages: follow to reach further detail links.
        Rule(LinkExtractor(allow=r'http://www\.bidchance\.com/outlinegonggao\d+\.html'),
             follow=True),
    )

    def parse_item(self, response):
        """Extract the title and publication date from one announcement page.

        Yields a dict item so Scrapy's pipelines / feed exports receive the
        data (the original only printed it and returned nothing).  Missing
        fields become ``None`` instead of raising.
        """
        item = {}
        # extract_first() returns None when the node is absent; guard before strip().
        title = response.xpath('//div[@class="xlh"]/text()').extract_first()
        item["title"] = title.strip() if title else None
        # Year generalized from the hard-coded 2019 so the spider keeps
        # working in other years; re.search avoids IndexError on no match.
        date_match = re.search(r'发布日期:(\d{4}年\d{2}月\d{2}日)', response.text)
        item["date"] = date_match.group(1) if date_match else None
        print(item)
        yield item