天天看點

python爬蟲之代理池的擷取和可用性檢測0x01 簡介0x02 代碼0x03 結果

python爬蟲之代理池的擷取和可用性檢測

  • 0x01 簡介
  • 0x02 代碼
  • 0x03 結果

0x01 簡介

本文介紹免費代理 IP 的爬取與可用性檢測:使用 BeautifulSoup 解析網頁,擷取免費代理的 IP 和端口,資訊來源為公開的免費代理池。

0x02 代碼

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/21/021 22:09
# @Author : H
# @File : getproxy.py

import requests
from bs4 import BeautifulSoup
import re


def getProxyIP_61(page):
    """Scrape free proxies from page *page* of www.66ip.cn.

    Args:
        page: Page number (or any value formattable into the URL).

    Returns:
        A list of "ip:port" strings; empty if the request fails or the
        page layout does not match expectations.
    """
    base_url = f"http://www.66ip.cn/{page}.html"
    sub = []
    try:
        # Timeout keeps the crawler from hanging on a dead endpoint.
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup.find_all("table")
        # The proxy list lives in the third table of the page layout.
        if len(tables) < 3:
            return sub
        for row in tables[2].find_all("tr"):
            cells = row("td")
            # Guard against separator/short rows and empty cells.
            if len(cells) < 2:
                continue
            ip = cells[0].string
            port = cells[1].string
            # The first row is the column header ("ip" / port); skip it.
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub


def getProxyIP_61_areaindex_1(page):
    """Scrape free proxies from the "areaindex_1" section of www.66ip.cn.

    NOTE(review): the original fetched the exact same URL as
    getProxyIP_61, scraping each page twice; the function name implies
    the /areaindex_1/ section, so that path is used here — confirm
    against the site before relying on it.

    Args:
        page: Page number within the area section.

    Returns:
        A list of "ip:port" strings; empty on failure or layout mismatch.
    """
    base_url = f"http://www.66ip.cn/areaindex_1/{page}.html"
    sub = []
    try:
        # Timeout keeps the crawler from hanging on a dead endpoint.
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup.find_all("table")
        # The proxy list lives in the third table of the page layout.
        if len(tables) < 3:
            return sub
        for row in tables[2].find_all("tr"):
            cells = row("td")
            # Guard against separator/short rows and empty cells.
            if len(cells) < 2:
                continue
            ip = cells[0].string
            port = cells[1].string
            # The first row is the column header ("ip" / port); skip it.
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub


def getProxyIP_xicaidaili(page):
    """Scrape free proxies from page *page* of ip.jiangxianli.com.

    Args:
        page: 1-based page number of the proxy list.

    Returns:
        A list of "ip:port" strings; empty if the request fails or the
        page layout does not match expectations.
    """
    base_url = f"https://ip.jiangxianli.com/?page={page}"
    sub = []
    try:
        # Timeout keeps the crawler from hanging on a dead endpoint.
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup("table")
        if not tables:
            return sub
        for row in tables[0].find_all("tr"):
            cells = row("td")
            # Header rows have no <td> cells; also skip short/empty rows.
            if len(cells) < 2:
                continue
            ip = cells[0].string
            port = cells[1].string
            # .string is None for cells with nested markup — skip those.
            if ip and port:
                sub.append(ip + ":" + port)
    return sub


def getIPproxy_ihuan(page):
    """Scrape free proxies from ip.ihuan.me.

    The site uses opaque page tokens rather than numbers, e.g.:
    b97827cc -> page 1, 4ce63706 -> page 2, 5crfe930 -> page 3,
    f3k1d581 -> page 4.

    Args:
        page: Page token string for the ?page= query parameter.

    Returns:
        A list of "ip:port" strings; empty on failure or layout mismatch.
    """
    base_url = f"https://ip.ihuan.me/?page={page}"
    sub = []
    try:
        # Timeout keeps the crawler from hanging on a dead endpoint.
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        bodies = soup.find_all("tbody")
        if not bodies:
            return sub
        for row in bodies[0].find_all("tr"):
            cells = row("td")
            if len(cells) < 2:
                continue
            anchors = cells[0]("a")
            if not anchors:
                continue
            # The IP is the anchor's text; get_text() replaces the
            # original regex over str(tag), which was fragile.
            ip = anchors[0].get_text()
            port = cells[1].string
            if ip and port:
                sub.append(ip + ":" + port)
    return sub


def huizong(sub, res):
    """Append every element of *res* onto *sub* in place.

    Args:
        sub: Accumulator list, mutated in place.
        res: Iterable of items to add.

    Returns:
        The same *sub* list, for call-chaining convenience.
    """
    # list.extend does the manual append loop in one C-level call.
    sub.extend(res)
    return sub


if __name__ == '__main__':
    # Gather candidate proxies ("ip:port") from all free sources.
    pages = ['b97827cc', '4ce63706', '5crfe930']
    sub = []
    for page in pages:
        huizong(sub, getIPproxy_ihuan(page))
    for i in range(1, 3):
        huizong(sub, getProxyIP_61(i))
        huizong(sub, getProxyIP_61_areaindex_1(i))
        huizong(sub, getProxyIP_xicaidaili(i))

    # Probe each proxy against a reachable URL.  Collect survivors into a
    # fresh list instead of calling sub.remove() while iterating sub,
    # which skips the element after every removal.
    url = "https://www.baidu.com"
    usable = []
    for ip in sub:
        proxy_host = "http://" + ip
        # Map BOTH schemes: with only an "http" key, an https:// test URL
        # bypasses the proxy entirely and every proxy looks "available".
        proxies = {"http": proxy_host, "https": proxy_host}
        try:
            # Timeout so a dead proxy fails fast instead of hanging.
            res = requests.get(url, proxies=proxies, timeout=5)
        except requests.RequestException:
            continue
        if res.ok:
            print("可用代理:\t" + proxy_host)
            usable.append(ip)
    sub = usable

           

0x03 結果

python爬蟲之代理池的擷取和可用性檢測0x01 簡介0x02 代碼0x03 結果