1.安装模块
使用国内镜像源安装
pip install beautifulsoup4 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
pip install requests -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
2.Demo
import requests
from bs4 import BeautifulSoup
def get_images(page):
    """Fetch one listing page and print the source URL of every image on it.

    The raw response body is also written to ``a{page}.html`` in the current
    working directory, and the prettified document is printed before the
    image URLs.

    Args:
        page: Page number interpolated into the listing URL. For ``page > 1``
            the URL is read from each image's ``src`` attribute, otherwise
            from ``data-original``.
    """
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # from the original demo, but it should not be used outside of testing.
    response = requests.get(f'https://www.ishsh.com/gaoqing/page/{page}', verify=False)
    data = response.content.decode('utf8')
    with open(f'a{page}.html', 'wb') as f:
        f.write(response.content)
    soup = BeautifulSoup(data, 'html.parser')
    print(soup.prettify())

    attr_name = 'src' if page > 1 else 'data-original'
    for item in soup.find_all('img'):
        # Keep the try body limited to the lookup that can raise KeyError.
        try:
            data_original = item[attr_name]
        except KeyError:
            print('无此节点')
            continue
        # The real image URL follows a '&src=' query parameter. When the
        # marker is absent, print the whole value instead of slicing from a
        # bogus offset (str.find() would return -1 and drop leading chars).
        _, marker, tail = data_original.partition('&src')
        print(tail[1:] if marker else data_original)
def get_image():
    """Fetch the listing front page and print its first ``<img>`` element.

    Prints, in order: the decoded HTML, the first ``img`` tag found by
    ``soup.find``, and that tag's ``data-original`` attribute.
    """
    html = requests.get('https://www.ishsh.com/gaoqing', verify=False).content.decode('utf8')
    print(html)
    first_img = BeautifulSoup(html, 'html.parser').find('img')
    print(first_img)
    print(first_img['data-original'])
def select_image():
    """Print every element matching the CSS selector ``.img`` and its title.

    Demonstrates ``soup.select``: each matched element is printed, followed
    by the value of its ``title`` attribute.
    """
    html = requests.get('https://www.ishsh.com/gaoqing', verify=False).content.decode('utf8')
    document = BeautifulSoup(html, 'html.parser')
    # Alternative: document.select('img') selects by tag name instead of class.
    for node in document.select('.img'):
        print(node)
        # Other things worth inspecting on each node while experimenting:
        # type(node), node.string, node.content, node.get_text()
        print(node['title'])
if __name__ == '__main__':
    # Script entry point: crawl the first listing page and print its image URLs.
    page = 1
    get_images(page)
# print(soup.title) # 选择标签,选择第一个
# print(soup.title.name)# 获取标签名
# print(soup.title.string)# 标签内容
# print(soup.img['src'])# 获取属性
# print(soup.head.title.string)# 嵌套选择
# print(soup.p.contents)# 获取p标签下所有子标签的内容
# print(soup.p.children)#获取p标签下所有子标签的可迭代对象
#
# print(soup.a.parent)# 获取a标签的父节点
# print(list(enumerate(soup.img.parent)))
# print(next(soup.span.next_siblings))# 获取后面的兄弟节点
# # print(next(soup.span.previous_siblings))# 获取前面的兄弟节点
# print(soup.div.next_sibling)# 获取下一个兄弟节点
# print(soup.div.previous_sibling)#获取前一个兄弟节点
# print(soup.find_all('img'))# 根据标签名查找元素
# print(soup.find_all(attrs={'class':'img'}))# 根据属性查找元素
# print(soup.find_all(text='''丝袜美腿'''))# 根据内容查找元素
3.备注
find_all、find、select 等方法查到的 bs4.element.Tag 类型的对象,可继续使用这些方法进行链式查询。
参考:https://www.cnblogs.com/zhaof/p/6930955.html