# Proxy generation
# -*- coding=utf-8 -*-
import urllib2
import re
import requests
import random
import time
class Proxy():
def init(self):
# 靜态ip池
self.pool = []
self.agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:50.0) Gecko/20100101 Firefox/50.0'
self.header = {'User-Agent' : self.agent}
def isAlive(self, ip):
# 代理,字典
proxy = {'http' : ip}
# urllib2
proxy_handler = urllib2.ProxyHandler(proxies= proxy)
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener= opener)
# 通路百度
baidu_url = 'https://www.baidu.com/'
req = urllib2.Request(url= baidu_url, headers= self.header)
try:
# timeout = 2
res = urllib2.urlopen(req, timeout= 2)
if res.code == 200:
return True
else:
return False
except:
return False
def initPool(self):
haodaili_seed = 'http://www.kuaidaili.com/proxylist/'
# 一共(1,11)
for i in xrange(1, 11):
haodaili_url = haodaili_seed + str(i)
haodaili_page = requests.get(url=haodaili_url)
# print haodaili_page.status_code
haodaili_content = haodaili_page.text
pat = re.compile(r'<tr>\s*?<td\sdata-title="IP">(.*?)</td>\s*?<td\sdata-title="PORT">(\d*?)</td>.*?</tr>',re.S)
proxy_list = re.findall(pattern= pat, string= haodaili_content)
for p in proxy_list:
url = 'http://'+ p[0]+ ':'+ p[1]
self.pool.append(url)
def refreshPool(self):
# 清洗ip
self.pool = filter(lambda ip: self.isAlive(ip), self.pool)
# 最終pool中的位址
for ip in self.pool:
with open('./ip_proxy.dat', 'a') as f:
f.write(ip + '\n')
def getProxy(self):
rand_index = random.randint(0, len(self.pool))
return self.pool[rand_index]
def test(self):
print 'pool making...'
# 初始化
self.init()
# 初始化pool
self.initPool()
# 清洗pool
self.refreshPool()
# 随機傳回代理ip
for i in xrange(len(self.pool)):
time.sleep(2)
print self.getProxy()
if __name__ == '__main__':
p = Proxy()
p.test()
# Using the proxy
# Fetch one listing page through a random live proxy and print the realty
# records scraped from it.
# NOTE(review): this is a method of a scraper class defined elsewhere (it
# reads self.getProxy); the enclosing class is not visible in this chunk,
# and the trailing comment suggests the body continues below -- confirm.
def climb(self, url):
# Use the same randomly chosen proxy for both http and https requests.
ip = self.getProxy()
proxies = {
"http": ip,
"https": ip
}
anjuke_page= requests.get(url=url, proxies= proxies)
# Decode the raw body as UTF-8 and re-encode it, silently dropping any
# bytes that are not valid UTF-8 in either direction.
anjuke_loop= (anjuke_page.content.decode(encoding='utf-8', errors='ignore').encode(encoding='utf-8', errors='ignore'))
# Captures per listing: detail-page URL, name (also: status, price,
# coordinates, floor plan per the original note).
pat = re.compile(r'<div class="infos">.*?<a class="items-name"\shref="(.*?)".*?>(.*?)</a>.*?', re.S)
realty_info = re.findall(pattern=pat, string= anjuke_loop)
print realty_info
# Price, phone number