Python - Static IP Proxy Pool

Generating proxies

The Proxy class below scrapes free proxies from kuaidaili.com, checks each candidate against Baidu with a two-second timeout, and keeps the live ones in a pool.

# -*- coding: utf-8 -*-

import urllib2
import re
import requests
import random
import time

class Proxy():

    def init(self):
        # static IP pool
        self.pool = []
        self.agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:50.0) Gecko/20100101 Firefox/50.0'
        self.header = {'User-Agent': self.agent}
    def isAlive(self, ip):
        # proxy mapping for urllib2
        proxy = {'http': ip}
        # install an opener that routes requests through the proxy
        proxy_handler = urllib2.ProxyHandler(proxies=proxy)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)
        # hit Baidu to see whether the proxy answers
        baidu_url = 'https://www.baidu.com/'
        req = urllib2.Request(url=baidu_url, headers=self.header)
        try:
            # give up after a 2-second timeout
            res = urllib2.urlopen(req, timeout=2)
            return res.code == 200
        except Exception:
            return False
    def initPool(self):
        haodaili_seed = 'http://www.kuaidaili.com/proxylist/'
        # loop over list pages 1 through 10
        for i in xrange(1, 11):
            haodaili_url = haodaili_seed + str(i)
            haodaili_page = requests.get(url=haodaili_url)
            # print haodaili_page.status_code
            haodaili_content = haodaili_page.text
            pat = re.compile(r'<tr>\s*?<td\sdata-title="IP">(.*?)</td>\s*?<td\sdata-title="PORT">(\d*?)</td>.*?</tr>',re.S)
            proxy_list = re.findall(pattern=pat, string=haodaili_content)
            for p in proxy_list:
                url = 'http://' + p[0] + ':' + p[1]
                self.pool.append(url)
    def refreshPool(self):
        # drop proxies that no longer respond
        self.pool = filter(lambda ip: self.isAlive(ip), self.pool)
        # persist the surviving addresses
        with open('./ip_proxy.dat', 'a') as f:
            for ip in self.pool:
                f.write(ip + '\n')
    def getProxy(self):
        # random.choice avoids the off-by-one of randint(0, len(pool))
        return random.choice(self.pool)
    def test(self):
        print 'pool making...'
        # set up instance state
        self.init()
        # scrape candidate proxies
        self.initPool()
        # keep only the live ones
        self.refreshPool()
        # print a random proxy every two seconds
        for i in xrange(len(self.pool)):
            time.sleep(2)
            print self.getProxy()

if __name__ == '__main__':
    p = Proxy()
    p.test()
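
For reference, the same liveness check can also be written with requests, which the script already imports. This is a minimal sketch rather than the author's code, and the check_alive name is invented for this example:

def check_alive(ip, timeout=2):
    # ip is a full 'http://host:port' address from the pool
    proxies = {'http': ip}
    try:
        res = requests.get('https://www.baidu.com/', proxies=proxies, timeout=timeout)
        return res.status_code == 200
    except requests.RequestException:
        return False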

Using a proxy

Once the pool is built, getProxy() returns a random live address that can be passed straight to requests:

def climb(self, url):
    ip = self.getProxy()
    proxies = {
        "http": ip,
        "https": ip
    }
    anjuke_page = requests.get(url=url, proxies=proxies)
    # decode/encode round trip drops bytes that are not valid UTF-8
    anjuke_loop = anjuke_page.content.decode(encoding='utf-8', errors='ignore').encode(encoding='utf-8', errors='ignore')
    # detail-page URL, name, status, price, coordinates, floor plan
    pat = re.compile(r'<div class="infos">.*?<a class="items-name"\shref="(.*?)".*?>(.*?)</a>', re.S)
    realty_info = re.findall(pattern=pat, string=anjuke_loop)
    print realty_info
    # price, phone
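
Free proxies die quickly, so in practice it is worth retrying with a different address when a request fails. A minimal sketch under that assumption; fetch_with_retry and max_tries are names made up for this example:

def fetch_with_retry(self, url, max_tries=3):
    # try up to max_tries proxies before giving up
    for _ in xrange(max_tries):
        ip = self.getProxy()
        proxies = {'http': ip, 'https': ip}
        try:
            return requests.get(url=url, proxies=proxies, timeout=5)
        except requests.RequestException:
            # this proxy failed; drop it and pick another
            self.pool.remove(ip)
    return None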