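# Source code. Note: free proxy IPs are usually not very stable; they are fine
# for occasional testing, but for regular use a paid proxy is more reliable.
import threading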
import pymongo
import requests
from lxml import html
# Listing page that is scraped for candidate proxies.
url = 'https://www.xicidaili.com/nn/1'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
}
etree = html.etree
# A timeout keeps the script from hanging forever if the listing site stalls.
response = requests.get(url=url, headers=headers, timeout=10)
# Bound to its own name so the imported ``html`` module is not shadowed.
page = etree.HTML(response.text)
# IP address column
ip = page.xpath("//tr[@class='odd']/td[2]/text()")
# Port column
port = page.xpath("//tr[@class='odd']/td[3]/text()")
# Anonymity level column
anon = page.xpath("//tr[@class='odd']/td[5]/text()")
# Protocol column (http or https)
prot = page.xpath("//tr[@class='odd']/td[6]/text()")
# "ip:port" strings of proxies listed as http
http_list = []
# "ip:port" strings of every other proxy (treated as https)
https_list = []
# Proxies that passed the usability check; filled in by save_ip()
goodip_list = []
# zip() stops at the shortest column, so a ragged scrape cannot raise
# IndexError the way indexing four parallel lists by position could.
for address, port_no, level, scheme in zip(ip, port, anon, prot):
    # Only high-anonymity ('高匿') proxies are kept.
    if level.strip() != '高匿':
        continue
    ip_port = address.strip() + ':' + port_no.strip()
    # Compare case-insensitively so 'HTTP' and 'http' are classified alike.
    if scheme.strip().lower() == 'http':
        http_list.append(ip_port)
    else:
        https_list.append(ip_port)
def ip_test(proxy):
    """Check whether a scraped proxy is usable.

    Requests http://ip.tool.chinaz.com/ through ``proxy`` with a 5 second
    timeout, reads the first text node of the ``<dd class="fz24">`` element
    (presumably the IP address the site sees for the caller -- verify against
    the live page) and compares it with the host part of ``proxy``.

    Args:
        proxy: Proxy address in ``"ip:port"`` form.

    Returns:
        True if the reported IP equals the proxy's host part; False if it
        differs or if anything goes wrong (connection error, timeout,
        unparsable page, element missing).
    """
    try:
        urls = r'http://ip.tool.chinaz.com/'
        result = requests.get(
            url=urls,
            headers=headers,
            proxies={
                'http': f'http://{proxy}',
                'https': f'https://{proxy}',
            },
            timeout=5,
        )
        # Parse the returned page.
        htmls = etree.HTML(result.text)
        # xpath() returns a list; it is empty when the element is absent,
        # in which case the [0] below raises and the proxy is rejected.
        ip_get = htmls.xpath(r'//dd[@class="fz24"]/text()')
        # Split the port off at the last colon instead of slicing a fixed
        # number of characters, so ports of any length (":80", ":808",
        # ":8080", ":53281") are handled correctly.
        host = proxy.rsplit(':', 1)[0]
        return ip_get[0].strip() == host
    except Exception:
        # Deliberately best-effort: any failure just means "not usable".
        # ``Exception`` (not a bare except) lets SystemExit and
        # KeyboardInterrupt propagate.
        return False
def save_ip(proxy, proxy_type):
    """Record ``proxy`` in ``goodip_list`` if ``ip_test`` accepts it.

    Args:
        proxy: Proxy address in ``"ip:port"`` form.
        proxy_type: Protocol label; only used in the printed message.
    """
    usable = ip_test(proxy)
    if not usable:
        return
    print(f'可使用ip:{proxy}, 类型为:{proxy_type}')
    goodip_list.append({'ip': proxy})
if __name__ == '__main__':
    # One checker thread per candidate proxy: http candidates first, then
    # https, each tagged with the label that save_ip() prints.
    candidates = [(addr, 'http') for addr in http_list]
    candidates += [(addr, 'https') for addr in https_list]
    tasks = [
        threading.Thread(target=save_ip, args=candidate)
        for candidate in candidates
    ]
    for task in tasks:
        task.start()
    # Wait for every check to finish before touching the database.
    for task in tasks:
        task.join()

    # Connect to MongoDB with the driver's default host and port.
    conn = pymongo.MongoClient()
    try:
        # Database "proxy", collection "proxys".
        proxys = conn.proxy.proxys
        # Clear any previous results. No existence check is needed:
        # delete_many() on a collection that does not exist yet deletes
        # nothing, and the collection is created on first insert.
        proxys.delete_many({})
        # insert_many() rejects an empty list, which is what goodip_list is
        # when no proxy passed the check, so only insert when there is data.
        if goodip_list:
            proxys.insert_many(goodip_list)
    finally:
        # Always release the connection, even if a database call failed.
        conn.close()
    print('已完成所有操作!')