Setting Up a Proxy Pool in Scrapy
- Review of the basics
- Scrapy hits error 10060 or your IP gets banned
- Setting up IP and UA proxies in Scrapy
- Next, modify the settings file
- Modify the middlewares file
- Finally, update the settings file
- Summary
Review of the basics
- First, let's recall how a Scrapy project is created. The command is:

  ```
  scrapy startproject <project name>
  ```

- Second, enter the project directory:

  ```
  cd <project name>
  ```

- Third, create the spider:

  ```
  scrapy genspider <spider name> <url>
  ```

- Note that the url in the third step is written without the scheme (no http:// or https:// prefix); a concrete run is sketched below.
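For example, with a hypothetical project named bookspider targeting books.toscrape.com (both names are mine, purely for illustration), the three steps look like this:

```
scrapy startproject bookspider
cd bookspider
scrapy genspider books books.toscrape.com
```

Note how the last argument carries no http:// prefix.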
Scrapy hits error 10060 or your IP gets banned
You will hit plenty of bugs while crawling; don't panic, most turn out to be small. Recently I ran into error 10060 (the Windows socket connection timeout). It shows up when a firewall on your LAN blocks the connection, or when you crawl too fast and the target site blacklists your IP. The cure is an IP proxy pool; a lighter first step is to throttle the crawl, as in the settings sketch below.
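Before touching proxies at all (this sketch is my addition, not part of the original walkthrough), Scrapy's built-in throttling and retry settings already reduce the odds of a ban; all of these are standard Scrapy settings:

```python
# settings.py -- built-in Scrapy knobs that make bans less likely
DOWNLOAD_DELAY = 2           # wait 2 seconds between requests to the same site
AUTOTHROTTLE_ENABLED = True  # adapt the delay to the server's response times
RETRY_ENABLED = True
RETRY_TIMES = 3              # retry timed-out requests a few times before failing
```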
Setting up IP and UA proxies in Scrapy
Setting up an IP proxy pool inside a project is actually quite simple.
First, write a script that fetches and validates proxy IPs, ip_proxy_list.py. The code is as follows:
```python
# -*- coding: utf-8 -*-
import queue
import threading
from threading import Lock

import requests

# Optional Redis storage, left commented out as in the original:
# from BookToscrape.settings import R, IP_PROXY_WRITE_TYPE

g_lock = Lock()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}


def store_file(ip_port):
    # Append each validated ip:port pair to proxy_list.txt.
    with open("proxy_list.txt", "a+", encoding="utf-8") as f:
        f.write(f"{ip_port}\n")


# def store_redis(ip_port):
#     R.sadd("ip_port_set", ip_port)     # store the pair in a Redis set
#     R.expire("ip_port_set", 24*60*60)  # expire after 24 hours
#
# STORE_MAP = {
#     'file': store_file,
#     'redis': store_redis,
# }


def fetch_web_data(url, proxies=None, timeout=10):
    try:
        r = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
        return r.text
    except Exception as e:
        print(f"fetch_web_data has error with url:{url}, error:{e}")
        return None


class FetchProxyListThread(threading.Thread):
    '''
    Downloads proxy data from the API at
    http://www.thebigproxylist.com/members/proxy-api.php?output=all&user=list&pass=8a544b2637e7a45d1536e34680e11adf
    '''
    def __init__(self, url, mq):
        threading.Thread.__init__(self)
        self.__url = url
        self.__mq = mq

    def run(self):
        '''
        Download the API data and push each candidate ip:port onto the queue.
        '''
        data = fetch_web_data(self.__url)
        if data is None:  # the download failed; nothing to enqueue
            return
        for ip_pool in data.split("\n"):
            if ip_pool.strip():  # skip blank lines
                self.__mq.put(ip_pool.split(",")[0])


CHECK_URL = "http://httpbin.org/get?x=2&y=4"


class IPProxyCheckThread(threading.Thread):
    def __init__(self, mq):
        threading.Thread.__init__(self)
        self.__queue = mq

    def run(self):
        global g_lock
        while True:
            try:
                ip_port = self.__queue.get(timeout=10)
            except queue.Empty:
                break  # queue stayed empty for 10s; assume the producer is done
            print(f"current data is {ip_port}")
            proxies = {
                'http': ip_port,
            }
            data = fetch_web_data(CHECK_URL, proxies=proxies, timeout=5)
            if data is None:
                print(f"Proxy {ip_port} failed validation, discarding.")
                continue
            print(f"Proxy {ip_port} validated and usable.")
            # g_lock.acquire()
            store_file(ip_port)
            # STORE_MAP[IP_PROXY_WRITE_TYPE](ip_port)
            # g_lock.release()


def process():
    mq = queue.Queue()
    url = "http://www.thebigproxylist.com/members/proxy-api.php?output=all&user=list&pass=8a544b2637e7a45d1536e34680e11adf"
    fth = FetchProxyListThread(url, mq)
    thread_num = 10
    thread_list = [IPProxyCheckThread(mq) for _ in range(thread_num)]
    fth.start()
    for th in thread_list:
        th.start()
    fth.join()
    for th in thread_list:
        th.join()
    print("all work is done.")


if __name__ == "__main__":
    process()
```
Once the script finishes, a proxy_list.txt file will appear in your current folder, holding one validated ip:port pair per line.
Second, create a text file named ua_list.txt and fill it with user agents, such as:
```
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5
Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24
```
Next, modify the settings file
Add the following to the settings file:

```python
# set up the UA pool
USER_AGENT_LIST = []
with open("<path to the file>/ua_list.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        USER_AGENT_LIST.append(line.strip())

# set up the IP proxy pool
IP_PROXY_LIST = []
with open("<path to the file>/proxy_list.txt", "r", encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        IP_PROXY_LIST.append(line.strip())
```
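One hardening step from me (not in the original): a hard-coded "path to the file" breaks as soon as the project moves. If the two text files live next to settings.py (my assumption about where you saved them), you can resolve them relative to that module with the standard-library pathlib:

```python
from pathlib import Path

# Resolve data files relative to settings.py itself, so the project runs
# from any working directory. Assumes ua_list.txt and proxy_list.txt sit
# in the same folder as settings.py.
BASE_DIR = Path(__file__).resolve().parent

USER_AGENT_LIST = [
    line.strip()
    for line in (BASE_DIR / "ua_list.txt").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
IP_PROXY_LIST = [
    line.strip()
    for line in (BASE_DIR / "proxy_list.txt").read_text(encoding="utf-8").splitlines()
    if line.strip()
]
```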
Modify the middlewares file
Add the following to the middlewares file:
```python
import logging
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# "sofangwang" is this project's package name; replace it with your own.
from sofangwang.settings import USER_AGENT_LIST, IP_PROXY_LIST


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # pick a random UA for every request
    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            print(f"User-Agent:{user_agent} is using.")
        return None

    def process_exception(self, request, exception, spider):
        error_info = f"spider:{spider.name} RotateUserAgentMiddleware has error with {exception}"
        print(error_info)
        logging.error(error_info)


class MyIPProxyMiddleWare(object):
    '''
    IP proxy pool
    '''
    def process_request(self, request, spider):
        # pick an IP from the list and attach it to the request
        ip_proxy = random.choice(IP_PROXY_LIST)
        if ip_proxy:
            # The meta key must be 'proxy' (singular) -- that is what Scrapy's
            # built-in HttpProxyMiddleware reads -- and the value needs a
            # scheme, since proxy_list.txt holds bare ip:port pairs.
            request.meta['proxy'] = f"http://{ip_proxy}"
            print(f"IP_PROXY:{ip_proxy}")

    def process_exception(self, request, exception, spider):
        error_info = f"spider:{spider.name} MyIPProxyMiddleWare has error with {exception}"
        print(error_info)
        logging.error(error_info)
```
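A refinement worth considering (my sketch, not part of the original code): instead of only logging inside process_exception, swap in a fresh proxy and return the request, which tells Scrapy to reschedule the download. The proxy_retry_times meta key and the retry cap are my own conventions:

```python
import random
from sofangwang.settings import IP_PROXY_LIST

class RetryWithFreshProxyMiddleware(object):
    MAX_PROXY_RETRIES = 3  # hypothetical cap; tune to taste

    def process_exception(self, request, exception, spider):
        retries = request.meta.get('proxy_retry_times', 0)
        if retries >= self.MAX_PROXY_RETRIES:
            return None  # give up and let other middlewares handle the error
        request.meta['proxy_retry_times'] = retries + 1
        # pick a new proxy and hand the request back to Scrapy for a retry
        request.meta['proxy'] = f"http://{random.choice(IP_PROXY_LIST)}"
        request.dont_filter = True  # keep the dupefilter from dropping the retry
        return request
```

If you use it, register it in DOWNLOADER_MIDDLEWARES with its own priority, just like the middlewares below.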
Finally, update the settings file
Modify DOWNLOADER_MIDDLEWARES:

```python
DOWNLOADER_MIDDLEWARES = {
    '<project name>.middlewares.SofangwangDownloaderMiddleware': 543,
    # disable the built-in UA middleware so it cannot overwrite the rotated one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    '<project name>.middlewares.RotateUserAgentMiddleware': 500,
    '<project name>.middlewares.MyIPProxyMiddleWare': 505,
}
```
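Two details worth knowing here (my explanation, consistent with Scrapy's documented middleware ordering): lower numbers run their process_request earlier, so the UA rotator (500) and the proxy middleware (505) fire before the default project middleware (543); and mapping the built-in UserAgentMiddleware to None disables it so it cannot overwrite the rotated header. Start the spider as usual with `scrapy crawl <spider name>` and the printed User-Agent and IP_PROXY lines will confirm the rotation is working.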
Summary
This chapter covers one of the essentials of crawling. In the struggle with anti-crawling engineers, the IP proxy pool is the most important weapon of all: skip it, and a single ban can lock your whole company's or school's LAN out of the target site. So the very first step after creating a crawler project should be setting up an IP proxy pool.
To close, a saying for everyone out on the crawler sea: where there's a will, there's a way.