1. Create the Scrapy project:
scrapy startproject XPC_Redis
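The command creates a project skeleton that should look roughly like this (Scrapy's default template):
XPC_Redis/
    scrapy.cfg
    XPC_Redis/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py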
2. Enter the project directory and create the Spider with the genspider command (note that more allowed domains will have to be added later):
scrapy genspider xpc_redis xinpianchang.com
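The generated stub inherits from scrapy.Spider and carries a start_urls list. Step 4 below converts it for distributed crawling: the base class becomes scrapy_redis's RedisSpider, start_urls is replaced by a redis_key, and the JSON API domain is appended to allowed_domains. In outline:
from scrapy_redis.spiders import RedisSpider

class XpcRedisSpider(RedisSpider):                # stub had: scrapy.Spider
    name = 'xpc_redis'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    redis_key = 'xpc_redis:start_urls'            # replaces the stub's start_urls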
3. Define the data to extract (items.py):
# -*- coding: utf-8 -*-
import scrapy
class XpcRedisItem(scrapy.Item):
    # video id
    v_id = scrapy.Field()
    # video name
    video_name = scrapy.Field()
    # video category
    category = scrapy.Field()
    # upload time
    up_time = scrapy.Field()
    # play count
    play_counts = scrapy.Field()
    # like count
    like_counts = scrapy.Field()
    # video URL (the playable address)
    video_url = scrapy.Field()
    # video description
    video_info = scrapy.Field()
    # JSON API URL; this page exposes the playable address video_url
    json_url = scrapy.Field()
    # video detail page URL
    video_detail_url = scrapy.Field()
    # time this record was added
    add_time = scrapy.Field()
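Since both the spider and the pipeline treat the item as a dict (item['v_id'] = ..., zip(*item.items())), it may help to see that a scrapy Item supports exactly that interface; a minimal illustration:
item = XpcRedisItem()
item['v_id'] = '123456'
item['video_name'] = 'demo'
# dict-style iteration is what the pipeline's zip(*item.items()) relies on
keys, values = zip(*item.items())
# keys   -> ('v_id', 'video_name')
# values -> ('123456', 'demo')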
4. Write the Spider that extracts the item data (in the spiders folder: xpc_redis.py):
# -*- coding: utf-8 -*-
import re
import datetime
import scrapy
import json
from XPC_Redis.items import XpcRedisItem
from scrapy_redis.spiders import RedisSpider
class XpcRedisSpider(RedisSpider):
    name = 'xpc_redis'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1']
    redis_key = 'xpc_redis:start_urls'
    # lpush xpc_redis:start_urls https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1

    def parse(self, response):
        # Get the video ids, 40 per page
        video_id = response.xpath('//div[@class="channel-con"]/ul[@class="video-list"]/li/@data-articleid').extract()
        for vid in video_id:
            # video detail page URL
            video_detail_url = 'https://www.xinpianchang.com/a{}'.format(vid)
            yield scrapy.Request(url=video_detail_url, meta={'meta_1': video_detail_url}, callback=self.video_detail)
        # Without logging in, only 20 pages are accessible
        total_page = 20
        for page in range(2, total_page + 1):
            # print("Processing page %s..." % page)
            url = 'https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-'
            yield scrapy.Request(url=url + str(page), callback=self.parse)
    # video detail page
    def video_detail(self, response):
        # Pause the spider here to inspect the response being processed:
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        meta_1 = response.meta['meta_1']
        # with open(meta_1.split('a')[-1] + ".html", 'w', encoding='utf-8') as f:
        #     f.write(response.text)
        item = XpcRedisItem()
        # video detail page URL
        item['video_detail_url'] = meta_1
        item['v_id'] = meta_1.split('a')[-1]
        # video name
        video_name = response.xpath('//div[@class="title-wrap"]/h3/text()').extract_first()
        item['video_name'] = video_name.strip()
        # video category
        # category = response.xpath('//span/span[contains(@class,"cate")]//text()').extract()
        # item['category'] = "".join([s.strip() for s in category])
        # A video may have several categories, so count them first. In the span list,
        # the odd-numbered spans are categories and the even-numbered ones are "|" separators.
        category_count = len(response.xpath("//span[contains(@class,'cate-box')]/span/a[1]"))
        if category_count > 1:
            category_list = []
            for i in range(1, category_count + 1):
                c = response.xpath("//span[contains(@class,'cate-box')]/span[" + str(2 * i - 1) + "]/a/text()").extract()
                category_list.append("-".join([s.strip() for s in c]))
            item['category'] = ",".join(category_list)
        else:
            category = response.xpath('//span/span[contains(@class,"cate")]//text()').extract()
            item['category'] = "".join([s.strip() for s in category])
        # upload time: the page may show "昨天" (yesterday) or "今天" (today)
        # instead of a date, so convert those to real dates
        up_time = response.xpath('//div/span[contains(@class,"update-time")]/i/text()').get()
        today = datetime.datetime.today()
        if '昨天' in up_time:
            yes = today - datetime.timedelta(days=1)
            up_time = up_time.replace('昨天', yes.strftime("%Y-%m-%d"))
        elif '今天' in up_time:
            up_time = up_time.replace('今天', today.strftime("%Y-%m-%d"))
        item['up_time'] = up_time
        # play count
        play_counts = response.xpath('//div/i[contains(@class,"play-counts")]/@data-curplaycounts').get()
        item['play_counts'] = play_counts
        # like count
        like_counts = response.xpath('//span/span[contains(@class,"like-counts")]/@data-counts').get()
        item['like_counts'] = like_counts
        # video URL
        # video_url = response.xpath('//*[@id="xpc_video"]/source/@src').extract_first()
        # item['video_url'] = video_url.strip()
        # video description
        video_info = response.xpath('//div[@class="filmplay-info"]/div/p[1]/text()').extract()
        video_info = [s.strip() for s in video_info]
        item['video_info'] = ','.join(video_info)
        # data-vid is part of the JSON API URL, e.g. 960VAm7OGE7DRnW8:
        # https://openapi-vtom.vmovier.com/v3/video/960VAm7OGE7DRnW8?expand=resource&usage=xpc_web&appKey=61a2f329348b3bf77
        # Option 1: get data_vid via XPath
        # data_vid = response.xpath('//div[@class="filmplay-data"]/div/span/a/@data-vid').extract_first()
        # Option 2: get data_vid via regex
        patt_vid = re.compile(r'vid = "(\w+)";')
        data_vid = patt_vid.findall(response.text)[0]
        # modeServerAppKey=61a2f329348b3bf77; unclear whether this value changes,
        # so extract it from the page rather than hard-coding it
        patt_modeServerAppKey = re.compile(r'modeServerAppKey = "(\w+)";')
        data_modeServerAppKey = patt_modeServerAppKey.findall(response.text)[0]
        # JSON API URL; this page exposes the playable address video_url
        json_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web&appKey={}'.format(data_vid, data_modeServerAppKey)
        item['json_url'] = json_url
        yield scrapy.Request(url=json_url, meta={'meta_2': item}, callback=self.video_address)
    # video playback address
    def video_address(self, response):
        item = XpcRedisItem()
        meta_2 = response.meta['meta_2']
        item['v_id'] = meta_2['v_id']
        item['video_name'] = meta_2['video_name']
        item['video_detail_url'] = meta_2['video_detail_url']
        item['video_info'] = meta_2['video_info']
        item['json_url'] = meta_2['json_url']
        item['category'] = meta_2['category']
        item['up_time'] = meta_2['up_time']
        item['play_counts'] = meta_2['play_counts']
        item['like_counts'] = meta_2['like_counts']
        # time this record was added
        item['add_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        json_html = json.loads(response.text)
        # resource holds different resolutions, e.g. 'default', 'progressive',
        # 'lowest'; check which key is present
        resource = json_html['data']['resource']
        if 'default' in resource.keys():
            item['video_url'] = json_html['data']['resource']['default']['url']
        elif 'progressive' in resource.keys():
            item['video_url'] = json_html['data']['resource']['progressive'][0]['url']
        else:
            item['video_url'] = json_html['data']['resource']['lowest']['url']
        yield item
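For reference, this is the shape of the JSON payload that video_address assumes, inferred purely from the keys the parsing code reads (the real API response carries more fields):
# illustrative only; structure inferred from the code above
{
    "data": {
        "resource": {
            "default": {"url": "..."},         # preferred rendition
            "progressive": [{"url": "..."}],   # list of renditions, first one used
            "lowest": {"url": "..."}           # fallback
        }
    }
}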
5. Write the pipeline to save the data (store the results in the database: pipelines.py):
# -*- coding: utf-8 -*-
import pymysql
class XpcRedisPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def __init__(self):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                  passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Try to create the xpc table
            # self.cursor.execute('DROP table IF EXISTS xpc')
            sql = 'CREATE TABLE IF NOT EXISTS xpc(v_id BIGINT primary key not null COMMENT "video page id",' \
                  'video_name varchar(200),category varchar(100),up_time VARCHAR(50),add_time DATETIME,play_counts INT(13),like_counts INT(13),' \
                  'video_detail_url varchar(100),video_url varchar(200),video_info LONGTEXT,' \
                  'json_url varchar(300))ENGINE =InnoDB DEFAULT CHARSET=utf8mb4;'
            self.cursor.execute(sql)
        except Exception as e:
            print("xpc table already exists; no need to create it!")
        try:
            # Deduplicate
            self.cursor.execute("SELECT v_id from xpc WHERE v_id=%s;", (item['v_id'],))
            repetition = self.cursor.fetchone()
            keys, values = zip(*item.items())
            # If the record already exists, don't re-insert it, only update it
            if repetition:
                # ON DUPLICATE KEY UPDATE: when the row already exists, only the
                # field values are updated; a plain INSERT of a duplicate key would raise an error
                sql = """
                INSERT INTO xpc({})VALUES ({}) ON DUPLICATE KEY UPDATE {};""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(['{}=%s'.format(k) for k in keys]))
                self.cursor.execute(sql, values * 2)
            else:
                sql = """
                INSERT INTO xpc({})VALUES ({});""".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)))
                self.cursor.execute(sql, values)
            self.db.commit()
            # print(self.cursor._last_executed)
            return item
        except Exception as e:
            print("ERROR:", e)
            self.db.rollback()

    def close_spider(self, spider):
        print("MySQL processing finished")
        self.cursor.close()
        self.db.close()
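A design note on the pipeline: because the INSERT already uses ON DUPLICATE KEY UPDATE, MySQL resolves the "row already exists" case by itself, so the SELECT pre-check and the if/else branches could be collapsed into one statement. A sketch:
keys, values = zip(*item.items())
sql = "INSERT INTO xpc({}) VALUES ({}) ON DUPLICATE KEY UPDATE {};".format(
    ','.join(keys),
    ','.join(['%s'] * len(values)),
    ','.join(['{}=%s'.format(k) for k in keys]))
# values doubled: one set for the INSERT, one for the UPDATE clause
self.cursor.execute(sql, values * 2)
self.db.commit()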
6. Configure the settings file (settings.py):
ROBOTSTXT_OBEY = False
PROXY_REDIS_KEY = 'xpc:proxies'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DBNAME = 'python4'
MYSQL_CHARSET = 'utf8mb4'
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    # 'XPC_Redis.middlewares.XpcRedisDownloaderMiddleware': 543,
    'XPC_Redis.middlewares.RandomProxyMiddleware': 749,
}
DOWNLOAD_TIMEOUT = 5
ITEM_PIPELINES = {
    'XPC_Redis.pipelines.XpcRedisPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Logs can also be written to a local file (optional settings):
LOG_FILE = "xpc_redis.log"
LOG_LEVEL = "DEBUG"
# also write print output into the log file
LOG_STDOUT = True
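Note that the settings above cover MySQL, the proxy key, and logging, but not the scrapy-redis scheduler itself. For the request queue and the dupefilter to be shared through Redis across machines, the usual scrapy-redis settings are also required (the Redis connection values shown are the library defaults):
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# keep the Redis queue and dupefilter between runs instead of clearing them
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379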
7. (General) Add a random-proxy downloader middleware (middlewares.py):
# -*- coding: utf-8 -*-
import random
import redis,requests
from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError,TimeoutError
from scrapy.core.downloader.handlers.http11 import TunnelError
class RandomProxyMiddleware(object):
    def __init__(self, settings):
        self.r = redis.StrictRedis(host='127.0.0.1', decode_responses=True)
        self.proxy_key = settings.get('PROXY_REDIS_KEY')
        # Normalize every proxy in the pool: rewrite an uppercase HTTP scheme to lowercase http
        for i, proxy in enumerate(self.r.lrange(self.proxy_key, 0, -1)):
            self.r.lset(self.proxy_key, i, proxy.lower())
        # key that records how many times each proxy has failed
        self.proxy_stats_key = self.proxy_key + '_stats'
        # maximum number of failures before a proxy is dropped
        self.max_failed = 3

    # get all proxies
    @property
    def proxies(self):
        return self.r.lrange(self.proxy_key, 0, -1)

    @classmethod
    def from_crawler(cls, crawler):
        # When HTTPPROXY_ENABLED is false, stay disabled.
        # NotConfigured is the exception a component (e.g. a downloader middleware)
        # raises to declare that it should remain disabled.
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        mw = cls(crawler.settings)
        # proxies is a property, so it must be checked on an instance (checking
        # cls.proxies would always be truthy); also stay disabled when the pool is empty
        if not mw.proxies:
            raise NotConfigured
        return mw

    def process_request(self, request, spider):
        if self.proxies and not request.meta.get('proxy'):
            request.meta['proxy'] = random.choice(self.proxies)
            print("Current proxy in use: %s" % request.meta['proxy'])

    def process_response(self, request, response, spider):
        # get the status code
        get_status = response.status
        cur_proxy = request.meta.get('proxy')
        if get_status in (400, 403, 404):
            # increment the failure count of cur_proxy by 1
            self.r.hincrby(self.proxy_stats_key, cur_proxy, 1)
            # failure count (str type)
            failed_times = self.r.hget(self.proxy_stats_key, cur_proxy) or 0
            # if the count exceeds the maximum and cur_proxy is not None
            # (it is None when the proxy pool is empty)
            if (int(failed_times) > self.max_failed) and cur_proxy:
                print("got error http code(%s) when use proxy:%s" % (get_status, cur_proxy))
                self.remove_proxy(cur_proxy)
            if cur_proxy:
                del request.meta['proxy']
            # retry the request; a new proxy is assigned in process_request
            return request
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta.get('proxy')
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError, TunnelError)):
            print("ERROR(%s) when use proxy:%s" % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request

    def remove_proxy(self, cur_proxy):
        if cur_proxy in self.proxies:
            # remove this unusable proxy from the list so it is not picked again
            # self.proxies.remove(cur_proxy)
            self.r.lrem(self.proxy_key, 0, cur_proxy)
            # delete its failure statistics
            self.r.hdel(self.proxy_stats_key, cur_proxy)
            print("remove proxy:%s from proxy list" % cur_proxy)
8. Set up and start the Redis database; for reference see:
https://blog.csdn.net/z564359805/article/details/80808155
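With a default local install this boils down to roughly (assuming redis-server and redis-cli are on the PATH):
redis-server        # start the server with the default config
redis-cli ping      # should answer PONG if the server is up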
9. With the above configured, start the crawl by running the Spider (from the spiders directory); it will sit idle until a start URL is pushed in step 10:
scrapy runspider xpc_redis.py
10. On the Master node (the core server), push the start URL in redis-cli; reference format:
lpush xpc_redis:start_urls https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-1
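To verify, still in redis-cli (the second key assumes scrapy_redis's RedisPipeline default of storing serialized items under <spider>:items):
llen xpc_redis:start_urls     # 1 right after the push, 0 once the spider consumes it
lrange xpc_redis:items 0 0    # first item stored by scrapy_redis.pipelines.RedisPipeline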