天天看點

beautifulsoup

1.安裝模組

使用國内鏡像源安裝

pip install beautifulsoup4  -i  http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
pip install requests -i  http://pypi.douban.com/simple/ --trusted-host pypi.douban.com      

2.Demo

import requests
from bs4 import BeautifulSoup


def get_images(page):
    """Download one listing page, save it to disk, and print its image URLs.

    The raw response body is written to ``a{page}.html`` in the current
    directory, the prettified HTML is printed, and then one line is printed
    for every ``<img>`` tag found on the page.

    Args:
        page: Page number to fetch. For ``page > 1`` the URL is read from
            each tag's ``src`` attribute, otherwise from ``data-original``
            (presumably the first page lazy-loads its images -- confirm
            against the live site).
    """
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(f'https://www.ishsh.com/gaoqing/page/{page}', verify=False)
    data = response.content.decode('utf8')

    with open(f'a{page}.html', 'wb') as f:
        f.write(response.content)
    soup = BeautifulSoup(data, 'html.parser')
    print(soup.prettify())

    attr_name = 'src' if page > 1 else 'data-original'
    for item in soup.find_all('img'):
        # Only the attribute lookup can raise KeyError, so keep the try minimal.
        try:
            data_original = item[attr_name]
        except KeyError:
            print('無此節點')
            continue
        # Print what follows the '&src' marker plus one separator character.
        # When the marker is missing, print the URL unchanged; the previous
        # find()-based slice used index -1 and chopped off the first 4 chars.
        _, marker, remainder = data_original.partition('&src')
        print(remainder[1:] if marker else data_original)


def get_image():
    """Fetch the listing page and print its first <img> tag.

    Prints, in order: the decoded HTML, the first ``<img>`` tag found, and
    that tag's ``data-original`` attribute.
    """
    raw_body = requests.get('https://www.ishsh.com/gaoqing', verify=False).content
    html = raw_body.decode('utf8')
    print(html)
    # find() returns only the first matching tag.
    first_img = BeautifulSoup(html, 'html.parser').find('img')
    print(first_img)
    print(first_img['data-original'])


def select_image():
    """Fetch the listing page and print every element matching ``.img``.

    For each match the element itself is printed, followed by its ``title``
    attribute.
    """
    html = requests.get('https://www.ishsh.com/gaoqing', verify=False).content.decode('utf8')
    document = BeautifulSoup(html, 'html.parser')
    # '.img' is a CSS class selector; select('img') would match by tag name.
    # A matched tag's text is also available via tag.string / tag.get_text().
    for tag in document.select('.img'):
        print(tag)
        print(tag['title'])


if __name__ == '__main__':
    # Demo entry point: process the first listing page.
    page = 1
    get_images(page)

# print(soup.title) # 選擇标簽,選擇第一個
# print(soup.title.name)# 擷取标簽名
# print(soup.title.string)# 标簽内容
# print(soup.img['src'])# 擷取屬性
# print(soup.head.title.string)# 嵌套選擇
# print(soup.p.contents)# 擷取p标簽下所有子标簽的内容
# print(soup.p.children)#擷取p标簽下所有子标簽的可疊代對象
#
# print(soup.a.parent)# 擷取a标簽的父節點
# print(list(enumerate(soup.img.parent)))
# print(next(soup.span.next_siblings))# 擷取後面的兄弟節點
# # print(next(soup.span.previous_siblings))# 擷取前面的兄弟節點
# print(soup.div.next_sibling)# 擷取下一個兄弟節點
# print(soup.div.previous_sibling)#擷取前一個兄弟節點

# print(soup.find_all('img'))# 根據标簽名查找元素
# print(soup.find_all(attrs={'class':'img'}))# 根據屬性查找元素
# print(soup.find_all(text='''絲襪美腿'''))# 根據内容查找元素      

3.備注

find_all,find,select等方法查到的結果是bs4.element.Tag類型的對象(find_all和select回傳的是由Tag組成的列表),每個Tag都可繼續使用這些方法進行鏈式查詢

參考https://www.cnblogs.com/zhaof/p/6930955.html