天天看點

Python學習筆記(BeautifulSoup選擇器)

Beautiful Soup 是一個可以從HTML或XML檔案中提取資料的Python庫.它能夠通過你喜歡的轉換器實作慣用的文檔導航,查找,修改文檔的方式.Beautiful Soup會幫你節省數小時甚至數天的工作時間。

Beautiful Soup也是解析網頁内容最好的工具之一,解析内容大多數是通過選擇器來實作的。這兩天剛好進行一些爬蟲實驗,遇到一些瓶頸,乾脆一股腦地把Beautiful Soup再看一遍,後續會講解一些爬蟲實例。

代碼示例

#"""
# Demo setup: parse a small HTML sample into `soup`, which every selector
# example further down in this script queries.
from bs4 import BeautifulSoup

# Sample markup used by all examples. It contains three <a class="sister">
# links inside a <p>, three <div> elements carrying data-* attributes, and a
# <p> with two CSS classes ("body strikeout"). Note that <body> and <html>
# are never closed here; the parser is relied on to cope with that.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie1</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<div data-foo="abc">foo!</div>
<div data-foo="bcd">foo!</div>
<div data-data="bcd">data-data-value!</div>
<p class="body strikeout"></p>
<p class="story">...</p>
"""

# 'lxml' selects the lxml-backed parser, so the lxml package must be installed.
soup = BeautifulSoup(html_doc, 'lxml')
# Dump the parsed tree so the results below can be compared against it.
print('soup=',soup)
# ---------------------------------------------------------------------------
# find_all() examples: search by tag name, attribute, CSS class and text.
# Every call queries the `soup` object built earlier in this script and
# returns a bs4.element.ResultSet (a list-like collection of matching tags).
# ---------------------------------------------------------------------------
print('-----------------------------find_all-----------------------------')

# Search by tag name only.
print('soup.find_all("title")=',soup.find_all("title"))
#[<title>The Dormouse's story</title>]

# Tag name plus a second positional string, which is matched against the
# CSS class of the tag.
print('soup.find_all("p", "title")=',soup.find_all("p", "title"))
#[<p class="title"><b>The Dormouse's story</b></p>]

# All <a> tags.
print('soup.find_all("a")=',soup.find_all("a"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>,
#<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Search by id attribute, regardless of tag name.
print('soup.find_all(id="link2")=',soup.find_all(id="link2"))
#[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# Attribute names such as data-foo are not valid Python keyword arguments,
# so they are passed through the attrs dictionary instead.
print('soup.find_all(attrs={"data-foo": "abc"})=',soup.find_all(attrs={"data-foo": "abc"}))

# Tag name plus CSS class. `class_` is used because `class` is a Python keyword.
print('soup.find_all("a", class_="sister")=',soup.find_all("a", class_="sister"))

# class is a multi-valued attribute: searching for a single class name matches
# any tag that carries it, so both calls find <p class="body strikeout">.
print('soup.find_all("p", class_="strikeout")=',soup.find_all("p", class_="strikeout"))
print('soup.find_all("p", class_="body")=',soup.find_all("p", class_="body"))

# A value containing a space is compared with the exact class string, so the
# order of the class names matters here ("strikeout body" would not match).
print('soup.find_all("p", class_="body strikeout")=',soup.find_all("p", class_="body strikeout"))

# The CSS selector form matches tags having both classes, in any order.
print('soup.select("p.strikeout.body")=',soup.select("p.strikeout.body"))

# Tag name plus the exact text contained in the tag.
print('soup.find_all("a", string="Elsie1")=',soup.find_all("a", string="Elsie1"))

# `text` is the older name of the `string` argument; it is kept here only to
# show that it behaves the same. Prefer `string` in new code.
print('soup.find_all("a", text="Elsie1")=',soup.find_all("a", text="Elsie1"))
# ---------------------------------------------------------------------------
# select() examples: query the `soup` object (built earlier in this script)
# with CSS selectors. select() returns a list of every matching tag.
#
# Selector cheat sheet:
#   el#id                element with an ID, e.g. div#logo
#   el.class             element with a class, e.g. div.masthead
#   el[attr]             element with an attribute, e.g. a[href]
#   any combination      e.g. a[href].highlight
#   ancestor child       descendants at any depth, e.g. ".body p" finds every
#                        p element below the element with class "body"
#   parent > child       direct children only, e.g. "div.content > p";
#                        "body > *" finds all direct children of body
#   siblingA + siblingB  the sibling B immediately FOLLOWING A, e.g. div.head + div
#   siblingA ~ siblingX  every sibling X that FOLLOWS A, e.g. h1 ~ p
#   el, el, el           selector list: the unique elements matching any of
#                        the selectors, e.g. "div.masthead, div.logo"
# ---------------------------------------------------------------------------
print('--------------------------CSS selectors---------------------------')

# Find tags by name.
print('soup.select("title")=',soup.select("title"))
# The third <p> among its siblings (counting only <p> elements).
print('soup.select("p:nth-of-type(3)")=',soup.select("p:nth-of-type(3)"))

# Find tags beneath other tags (descendants at any depth).
print('soup.select("body a")=',soup.select("body a"))
print('soup.select("p a")=',soup.select("p a"))
print('soup.select("html head title")=',soup.select("html head title"))

# Find tags directly beneath other tags (direct children only).
# In the sample document the <a> tags are nested inside a <p>, so
# "body > a" is expected to return an empty list.
print('soup.select("body > a")=',soup.select("body > a"))
print('soup.select("p > a")=',soup.select("p > a"))
print('soup.select("html > head > title")=',soup.select("html > head > title"))
# The second <a> among the direct children of a <p>.
print('soup.select("p > a:nth-of-type(2)")=',soup.select("p > a:nth-of-type(2)"))
# A direct child of <p> selected by ID.
print('soup.select("p > #link1")=',soup.select("p > #link1"))

# Find the siblings of tags.
# "~" selects every following sibling of #link1 that has class "sister".
print('soup.select("#link1 ~ .sister")=',soup.select("#link1 ~ .sister"))
# "+" selects only the immediately following sibling, if it has class "sister".
print('soup.select("#link1 + .sister")=',soup.select("#link1 + .sister"))

# Find tags by CSS class, regardless of tag name.
print('soup.select(".sister")=',soup.select(".sister"))
# Attribute-selector form: class contains the whitespace-separated word "sister".
print('soup.select("[class~=sister]")=',soup.select("[class~=sister]"))

# Find tags by ID, with and without a tag name.
print('soup.select("#link1")=',soup.select("#link1"))
print('soup.select("a#link2")=',soup.select("a#link2"))

# Find tags that match any selector from a list of selectors.
print('soup.select("#link1,#link2")=',soup.select("#link1,#link2"))

# Test for the existence of an attribute.
print('soup.select(a[href])=',soup.select('a[href]'))

# Find tags by attribute value.
# Exact value.
print('soup.select(a[href="http://example.com/elsie"])=',soup.select('a[href="http://example.com/elsie"]'))
# ^= : attribute value starts with the given prefix.
print('soup.select(a[href^="http://example.com/"])=',soup.select('a[href^="http://example.com/"]'))
# $= : attribute value ends with the given suffix.
print('soup.select(a[href$="tillie"])=',soup.select('a[href$="tillie"]'))
# *= : attribute value contains the given substring.
print('soup.select(a[href*=".com/el"])=',soup.select('a[href*=".com/el"]'))
# ---------------------------------------------------------------------------
# Attribute-presence searches and find() versus find_all(), run against the
# `soup` object built earlier in this script.
# ---------------------------------------------------------------------------

# A value of True matches every tag that has the attribute, whatever its value.
print('soup.find_all(attrs={"data-foo": True})=',soup.find_all(attrs={"data-foo":True}))
#[<div data-foo="abc">foo!</div>, <div data-foo="bcd">foo!</div>]

# Same search for another attribute; find_all() still returns a collection.
print('soup.find_all(attrs={"data-data": True})=',soup.find_all(attrs={"data-data":True}))
#[<div data-data="bcd">data-data-value!</div>]

# find() returns only the first matching tag, not a collection.
print('soup.find(attrs={"data-data": True})=',soup.find(attrs={"data-data":True}))
#<div data-data="bcd">data-data-value!</div>

# Subscript the tag returned by find() to read an attribute value. find()
# returns None when nothing matches, which would make the subscript fail;
# that is safe here only because the sample document has a data-data tag.
print('soup.find(attrs={"data-data": True})["data-data"]=',soup.find(attrs={"data-data":True})['data-data'])

# find_all() returns every matching tag ...
print('soup.find_all("a", class_="sister")=',soup.find_all("a", class_="sister"))
#[<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# ... while find() returns just the first one.
print('soup.find("a", class_="sister")=',soup.find("a", class_="sister"))
#<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>

# The two methods also differ in return type.
print('type(soup.find_all("a", class_="sister"))=',type(soup.find_all("a", class_="sister")))  #<class 'bs4.element.ResultSet'>
print('type(soup.find("a", class_="sister"))=',type(soup.find("a", class_="sister")))          #<class 'bs4.element.Tag'>

複制