python爬蟲之代理池的擷取和可用性檢測
- 0x01 簡介
- 0x02 代碼
- 0x03 結果
0x01 簡介
本文主要介紹免費代理 IP 的爬取和檢測:使用 BeautifulSoup 解析網頁資訊,擷取免費的代理 IP 和端口,爬取資訊來源為公開的免費代理池。
0x02 代碼
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/21/021 22:09
# @Author : H
# @File : getproxy.py
import requests
from bs4 import BeautifulSoup
import re
def getProxyIP_61(page):
    """Scrape "ip:port" proxy strings from page *page* of www.66ip.cn.

    Parameters:
        page: page number (int or str) interpolated into the site URL.

    Returns:
        list[str]: proxies as "ip:port"; empty on network failure or when
        the page layout does not match expectations.
    """
    base_url = f"http://www.66ip.cn/{page}.html"
    sub = []
    try:
        # timeout so a stalled server cannot hang the whole scrape
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup.find_all('table')
        if len(tables) < 3:
            # layout changed (or an anti-bot page was served) — bail out
            return sub
        for row in tables[2].find_all("tr"):
            cells = row("td")
            if len(cells) < 2:
                continue  # e.g. spacer rows without data cells
            ip = cells[0].string
            port = cells[1].string
            # skip the header row ("ip" ...) and cells whose .string is None
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub
def getProxyIP_61_areaindex_1(page):
    """Scrape "ip:port" proxy strings from the area-1 index of www.66ip.cn.

    The original body duplicated getProxyIP_61 byte-for-byte, including the
    URL, so it re-scraped the exact same page; the function name indicates
    the ``areaindex_1`` path was intended, which is used here.

    Parameters:
        page: page number (int or str) interpolated into the site URL.

    Returns:
        list[str]: proxies as "ip:port"; empty on failure.
    """
    base_url = f"http://www.66ip.cn/areaindex_1/{page}.html"
    sub = []
    try:
        # timeout so a stalled server cannot hang the whole scrape
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup.find_all('table')
        if len(tables) < 3:
            # layout changed (or an anti-bot page was served) — bail out
            return sub
        for row in tables[2].find_all("tr"):
            cells = row("td")
            if len(cells) < 2:
                continue  # e.g. spacer rows without data cells
            ip = cells[0].string
            port = cells[1].string
            # skip the header row ("ip" ...) and cells whose .string is None
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub
def getProxyIP_xicaidaili(page):
    """Scrape "ip:port" proxy strings from ip.jiangxianli.com.

    NOTE(review): the name says "xicaidaili" but the URL is jiangxianli;
    the name is kept so existing callers keep working.

    Parameters:
        page: 1-based page number for the site's ``?page=`` query.

    Returns:
        list[str]: proxies as "ip:port"; empty on failure.
    """
    base_url = f"https://ip.jiangxianli.com/?page={page}"
    sub = []
    try:
        # timeout so a stalled server cannot hang the whole scrape
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup("table")
        if not tables:
            return sub
        for row in tables[0].find_all("tr"):
            cells = row("td")
            # header rows have no <td>; also guard None .string values
            if len(cells) >= 2 and cells[0].string and cells[1].string:
                sub.append(cells[0].string + ":" + cells[1].string)
    return sub
def getIPproxy_ihuan(page):
    """Scrape "ip:port" proxy strings from ip.ihuan.me.

    Parameters:
        page: the site's opaque page token, e.g.:
              'b97827cc' -> 1, '4ce63706' -> 2, '5crfe930' -> 3,
              'f3k1d581' -> 4

    Returns:
        list[str]: proxies as "ip:port"; empty on failure.
    """
    base_url = f"https://ip.ihuan.me/?page={page}"
    sub = []
    try:
        # timeout so a stalled server cannot hang the whole scrape
        res = requests.get(base_url, timeout=10)
    except requests.RequestException:
        return sub
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        bodies = soup.find_all('tbody')
        if not bodies:
            return sub
        for row in bodies[0].find_all("tr"):
            cells = row("td")
            if len(cells) < 2 or not cells[0]("a"):
                continue  # malformed row — skip rather than crash
            # The IP is the anchor's text; get_text() replaces the old
            # fragile regex over str(tag) and also copes with nested markup.
            ip = cells[0]("a")[0].get_text(strip=True)
            port = cells[1].string
            if ip and port:
                sub.append(ip + ":" + port)
    return sub
def huizong(sub, res):
    """Append every item of *res* onto *sub* in place and return *sub*."""
    sub.extend(res)
    return sub
if __name__ == '__main__':
    # 擷取代理IP和端口号 — gather "ip:port" candidates from every source.
    pages = ['b97827cc', '4ce63706', '5crfe930']
    sub = []
    for page in pages:
        huizong(sub, getIPproxy_ihuan(page))
    for i in range(1, 3):
        huizong(sub, getProxyIP_61(i))
        huizong(sub, getProxyIP_61_areaindex_1(i))
        huizong(sub, getProxyIP_xicaidaili(i))
    # 檢測代理可用性 — probe each proxy against a known-good URL.
    url = "https://www.baidu.com"
    alive = []  # build a new list: removing from `sub` while iterating it skips items
    for ip in sub:
        proxy_host = "http://" + ip
        # Map BOTH schemes to the proxy. The old code mapped only "http"
        # while the target was https, so requests bypassed the proxy and
        # every candidate looked "usable".
        proxies = {"http": proxy_host, "https": proxy_host}
        try:
            # timeout keeps one dead proxy from stalling the whole check
            res = requests.get(url, proxies=proxies, timeout=5)
            if res.ok:
                alive.append(ip)
                print("可用代理:\t" + proxy_host)
        except Exception:
            continue  # dead/broken proxy — drop it
    sub = alive  # keep only the proxies that answered