#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import contextlib
import requests
import time
import re
import sys
from pymongo import MongoClient
'''
Workflow:
  1. Fetch a URL and obtain its HTML.
  2. Extract tags with regexes into label lists.
  3. Post-process the label lists.
  4. Store the results in MongoDB.
  5. For each extracted link, decide whether it is a page URL:
     - no:  download the image
     - yes: follow the URL and repeat
'''
def get_html(url):
    """Fetch *url* and return the response body as text.

    The timeout keeps a dead server from hanging the crawl forever, and
    raise_for_status() surfaces HTTP errors instead of silently handing
    an error page to the parsers downstream.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text
def html_analysis(res, text):
    """Run every regex in *res* over *text*.

    Returns one match-list per pattern, in the same order as *res*.
    Duplicate matches are removed via set(); the order of the surviving
    items is therefore unspecified, exactly as before.
    """
    return [list(set(re.findall(pattern, text))) for pattern in res]
def label_handle(html_label_list):
    """Placeholder for post-processing the extracted label lists.

    Not implemented yet; the call site in data_handle() is commented out
    and the raw label lists are stored directly.
    """
    pass
def save_to_mongo(label_data):
    """Store the two extracted label lists in MongoDB, then display them.

    label_data[0] -> collection ``base_label`` (opening-tag matches),
    label_data[1] -> collection ``back_label`` (closing-tag matches).
    Each document carries a 1-based sequence id and a creation timestamp.
    """
    conn = MongoClient("localhost", 27017)
    try:
        db = conn.label_file
        base_set = db.base_label
        back_set = db.back_label
        # insert_one() replaces the long-deprecated Collection.insert();
        # enumerate() replaces the two hand-rolled counters.
        for seq, label in enumerate(label_data[0], start=1):
            base_set.insert_one(
                {"id": seq, "base_label": "{}".format(label), 'datetime': time.ctime()})
        for seq, label in enumerate(label_data[1], start=1):
            back_set.insert_one(
                {"id": seq, "back_label": "{}".format(label), 'datetime': time.ctime()})
        show_database(db)
    finally:
        # Close the connection even when an insert raises (the original
        # leaked it on any error).
        conn.close()
def show_database(db):
    """Dump the ``base_label`` collection to stdout, one document per line.

    NOTE(review): the original also grabbed a handle to ``back_label``
    but never printed it; that unused local is dropped here.
    """
    collection = db.base_label
    # Cursor.count() was removed in PyMongo 4.x; count on the collection.
    print("collection counts=", collection.count_documents({}))
    for doc in collection.find({}, {"_id": 0}):
        print(doc)
        time.sleep(2)  # throttle so the output is readable
# Sample markup fragments the get_info() regex must match
# (cover image URL, detail-page URL, and an alt-text title):
# https://img.ugirls.tv/uploads/magazine/cover/8c85d8088c672d0e3d4a22eb74f3018a_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-477.html
# https://img.ugirls.tv/uploads/magazine/cover/016bfa12acd8867904233b3b119a7747_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-476.html
# https://img.ugirls.tv/uploads/magazine/cover/269e51b772878a239a101a38b0d2cb84_cover_web_l.jpg
# https://www.ugirls.com/Shop/Detail/Magazine-475.html
# alt="[U373]杜花花"
def data_handle(url):
    """Download *url*, extract every HTML tag name and persist the lists."""
    # Fetch the page.
    page = get_html(url)
    # Regex table keyed by purpose; only the tag patterns are used here.
    patterns = {"label": [r"<\w*\b", r"</\w*\b"], "pic": ""}
    # Pull opening- and closing-tag names out of the HTML.
    tag_lists = html_analysis(patterns["label"], page)
    # label_handle() is still a stub, so the raw lists go straight to Mongo.
    save_to_mongo(tag_lists)
def save_to_mongodb(total_info):
    """Insert one document per scraped magazine into person_file.person_info.

    total_info: iterable of dicts as produced by get_info().
    """
    conn = MongoClient("localhost", 27017)
    try:
        collection = conn.person_file.person_info
        for person_info in total_info:
            # insert_one() replaces the deprecated Collection.insert().
            collection.insert_one(person_info)
        print("The data was stored.")
    finally:
        conn.close()  # the original never closed the connection
def save_to_file(total_info, dest_dir=r"C:\Users\Mi\Pictures\Saved Pictures"):
    """Download every cover image in *total_info* into *dest_dir*.

    total_info: iterable of dicts with at least ``title`` and ``pic`` keys.
    dest_dir:   target directory (new parameter; defaults to the path the
                original hard-coded, so existing callers are unaffected).
    """
    for info in total_info:
        path = os.path.join(dest_dir, "%s.jpg" % info["title"])
        res = requests.get(info['pic'], timeout=10)
        # Fail loudly instead of writing an HTML error page into a .jpg.
        res.raise_for_status()
        with open(path, "wb") as ff:
            ff.write(res.content)
        time.sleep(2)  # be polite to the image host
def get_pic(url):
    """Scrape one listing page: parse it, store metadata, download covers."""
    page = get_html(url)        # fetch the listing HTML
    entries = get_info(page)    # -> list of {title, url, pic} dicts
    save_to_mongodb(entries)    # persist the metadata
    save_to_file(entries)       # download the cover images
def get_info(text):
    """Parse a listing page and return one dict per magazine entry.

    Each dict has keys ``title`` (alt text with its leading album id such
    as ``[U373]`` stripped), ``url`` (detail-page link) and ``pic``
    (cover-image URL). Chunks without a match are skipped.
    """
    pattern = (
        r'<a href="(?P<url>https://www.ugirls.com/Shop/Detail/.*)"'
        r' target="_blank" rel="external nofollow" .*target="_blank">'
        r'<img src="(?P<pic>https://img.ugirls.tv/uploads/magazine/cover/.*\.jpg)"'
        r'.*alt="(?P<title>.*)" /></a>'
    )
    # re.search only yields the first hit per string, so split the page
    # into chunks (roughly one per "div") and search each chunk.
    chunks = re.split("div", text)
    total_info = []
    for chunk in chunks:
        match = re.search(pattern, chunk)
        if not match:
            continue  # most chunks hold no magazine entry
        # Strip the leading "[...]" album id by pattern instead of the
        # original hard-coded [6:] slice, which breaks as soon as the id
        # grows past three digits (e.g. "[U1234]").
        title = re.sub(r"^\[[^\]]*\]", "", match.group("title"))
        total_info.append({
            "title": title,
            "url": match.group("url"),
            "pic": match.group("pic"),
        })
    return total_info
def create_path(pic_name, n=None):
    """Build the output path for one image, creating the per-day folder.

    pic_name: accepted for interface compatibility but — as in the
              original — it does not influence the generated filename.
    n:        sequence number used in the filename. Defaults to the
              module-level global ``n`` the original implicitly relied
              on (falling back to 1 if that global is absent).
    Returns the full path ``C:\\pic_spy\\YYYY-MM-DD\\meinv<n>.jpg``.
    """
    if n is None:
        # Preserve the original's (fragile) reliance on a global counter.
        n = globals().get("n", 1)
    day = time.strftime("%Y-%m-%d", time.gmtime())
    folder = "C:\\pic_spy\\%s" % day
    # exist_ok replaces the original LBYL os.path.exists(s[:23]) check.
    os.makedirs(folder, exist_ok=True)
    return r"%s\meinv%s.jpg" % (folder, n)
def save_img(path, rs):
    """Stream the body of HTTP response *rs* to *path* in 1 KiB chunks."""
    with open(path, "wb") as out:
        for chunk in rs.iter_content(1024):
            out.write(chunk)
if __name__ == "__main__":
    # Crawl listing pages 1..8 and harvest each page's magazine covers.
    # (The original assigned two dead values to ``url`` and duplicated
    # the page number with a manual counter; both removed.)
    for page in range(1, 9):
        url = "https://www.ugirls.com/Content/Page-%d.html" % page
        get_pic(url)