天天看點

Python 爬取 <span></span> 標籤之間的內容

# Sample HTML document: the goal is to extract the text between <span></span> tags.
# It contains one <span> identified by class ("red") and one nested <span>
# identified by id ("s1").
html = """
<div>
    <span class='red'>item1</span>
    <div>
        <span id='s1'>item2</span>
    </div>
</div>
"""
# Approach 1: Scrapy's Selector.
from scrapy.selector import Selector

# Scrapy selectors support both CSS and XPath queries; the examples below use
# CSS, whose syntax will look familiar to anyone who has used jQuery.
# Build the selector once and run every query against it.
page = Selector(text=html)

# "." selects by class.
t1 = page.css('span.red::text').extract()
print(t1)  # ['item1']

# A bare tag name selects the text of every <span>.
t2 = page.css('span::text').extract()
print(t2)  # ['item1','item2']

# "#" selects by id.
t3 = page.css('span#s1::text').extract()
print(t3)  # ['item2']

# ">" selects direct children: the <span> inside the nested <div>.
t4 = page.css('div>div>span::text').extract()
print(t4)  # ['item2']

# Approach 2: BeautifulSoup (bs4).
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')

# First <span> whose class is "red".
s1 = soup.find('span', class_='red')

# Every <span> in the document, then the text of each one.
s2 = soup.find_all('span')
result = [tag.text for tag in s2]
print(result)  # ['item1', 'item2']
           

1、正規表達式擷取<td></td>标簽之間的内容

例如:從 <td class="label">行政相對人名稱:</td> 中擷取「行政相對人名稱:」

import re

# Extract the text between <td class="label"> and its closing </td> tag.
# The closing tag must be </td> (the original pattern said </tb>, which never
# matches). The non-greedy group (.*?) stops at the first closing tag.
# `text` is the HTML source to search; it is not defined in this snippet and
# must be supplied by the surrounding code.
# [0] takes the first match and raises IndexError if nothing matches.
Name = re.findall(r'<td class="label">(.*?)</td>', text)[0]