隨機 User-Agent(Scrapy)
fake_useragent庫,僞裝請求頭
from fake_useragent import UserAgent
# Shared fake_useragent generator; the attributes read below are printed as
# User-Agent values for the named browser.
ua = UserAgent()

# Browser-specific User-Agent values, in order:
# IE, Opera, Chrome, Firefox, Safari.
for _browser in ("ie", "opera", "chrome", "firefox", "safari"):
    print(getattr(ua, _browser))

# The most common usage: a crawler benefits from headers that change freely,
# so randomness matters. `random` is read three times to show it varying.
for _ in range(3):
    print(ua.random)
擷取送出請求的ip位址
def get_local_ip(timeout=10):
    """Request the IP-echo URL through the configured proxy and print the reply.

    A random User-Agent from fake_useragent is sent with the request.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a dead proxy.

    Returns:
        The response body text (also printed), presumably the IP address the
        remote service saw -- confirm against the service.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {"User-Agent": UserAgent().random}
    url = 'http://ip.hahado.cn/ip'
    # Abuyun paid proxy.
    # NOTE(review): credentials are hard-coded, and the password/host part
    # ("[email protected]") looks redacted or garbled -- replace with real values
    # loaded from configuration before use.
    proxy = {'http': 'http://HC9XY1E5IT9P:[email protected]:9010'}
    response = requests.get(url=url, headers=headers, proxies=proxy, timeout=timeout)
    print(response.text)
    return response.text
from fake_useragent import UserAgent
class RandomUserAgentMiddlware(object):
    """Scrapy-style middleware that supplies a User-Agent header per request.

    The header value is read from a fake_useragent ``UserAgent`` object using
    the attribute named by the ``RANDOM_UA_TYPE`` setting (default
    ``"random"``).
    """

    def __init__(self, crawler):
        """Create the generator and read the attribute name from settings."""
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from a crawler object."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set ``User-Agent`` on the request unless one is already present."""
        # Look the attribute up on every call so each request gets a fresh
        # value from the generator.
        user_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', user_agent)
代理ip
先把有效的免費代理 IP 保存到一個檔案裡
class GetIP(object):
    """Pick and validate proxies listed in local CSV files."""

    def delete_ip(self):
        """Read ``xici_ip.csv`` (no header row) and print its first column.

        NOTE(review): despite the name, nothing is deleted here.
        """
        dd = pd.read_csv('xici_ip.csv', header=None)
        print(dd[0])

    def judge_ip(self, ip, port, http):
        """Check whether a proxy can fetch the Baidu home page.

        Args:
            ip: Proxy host.
            port: Proxy port.
            http: Protocol label; ``'HTTP'`` selects an http proxy, any other
                value is treated as https.

        Returns:
            True if the probe request through the proxy answers with a 2xx
            status, False on any request failure or other status.
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
        proxy_url = "{0}:{1}".format(ip, port)
        scheme = 'http' if http == 'HTTP' else 'https'
        proxy_dict = {scheme: scheme + '://' + proxy_url}
        # requests chooses a proxy by the scheme of the *target* URL, so the
        # probe URL must use the same scheme as the proxy entry; otherwise an
        # https proxy is never exercised and always looks valid.
        # NOTE(review): an 'https://' proxy URL makes newer urllib3 speak TLS
        # to the proxy itself -- confirm that is what the CSV rows mean.
        probe_url = scheme + "://www.baidu.com"
        try:
            response = requests.get(probe_url, proxies=proxy_dict, timeout=5, headers=headers)
        except requests.RequestException:
            # Only network/HTTP failures mean "bad proxy"; programming errors
            # are allowed to propagate instead of being reported as invalid.
            print('無效')
            return False
        print(response)
        if 200 <= response.status_code < 300:
            print('有效')
            return True
        print('wuxiao')
        return False

    def get_random_ip(self, max_attempts=None):
        """Return a random proxy URL from ``xici_ip3.csv`` that passes judge_ip.

        The CSV has no header row; columns 1, 2 and 3 are read as ip, port and
        protocol label. Rows are sampled at random until one validates.

        Args:
            max_attempts: Maximum number of rows to try. ``None`` (default)
                keeps trying until a proxy validates, as before, but without
                growing the call stack.

        Returns:
            ``"http://ip:port"`` when the label is ``'HTTP'``, otherwise
            ``"https://ip:port"``.

        Raises:
            RuntimeError: If ``max_attempts`` rows were tried and none passed.
        """
        # Read the file once instead of on every retry.
        candidates = pd.read_csv('xici_ip3.csv', header=None)
        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            row = candidates.sample(n=1, random_state=None)
            ip = row.iloc[0, 1]
            port = row.iloc[0, 2]
            http = row.iloc[0, 3]
            if self.judge_ip(ip, port, http):
                scheme = "http" if http == 'HTTP' else "https"
                return "{0}://{1}:{2}".format(scheme, ip, port)
        raise RuntimeError(
            "no working proxy found after {0} attempts".format(attempts)
        )