有一段 HTML 文本:
<html>
<head>
<title>這是标題</title>
</head>
<body>
<div>這是内容</div>
<div>要移除的内容</div>
</body>
</html>
我希望把
<div>要移除的内容</div>
移除
安裝 lxml:
pip install lxml
代碼範例:
# -*- coding: utf-8 -*-
from lxml import etree
text = """
<html>
<head>
<title>這是标題</title>
</head>
<body>
<div>這是内容</div>
<div>要移除的内容</div>
</body>
</html>"""
# Parse the sample markup with etree.fromstring and keep the resulting element.
root = etree.fromstring(text)
# xpath() returns a list of matches; only the first match is removed.
matches = root.xpath('//div[last()]')
for node in matches[:1]:
    # An element is detached through its parent.
    node.getparent().remove(node)
print(etree.tounicode(root))
# Expected output of the print() call above:
"""
<html>
<head>
<title>這是标題</title>
</head>
<body>
<div>這是内容</div>
</body>
</html>
"""
最後也實現了我的需求。可以將其封裝為單獨的函數,以便調用:
from lxml import html, etree
def remove_elements(html, xpath):
    """
    Remove every element matched by an XPath expression from HTML text.

    The markup is wrapped in a temporary ``<div>`` and parsed with
    ``lxml.html.fragment_fromstring``; the returned string is the
    serialization of that wrapper element after the matches are removed.

    :param html: str, the HTML text to clean
    :param xpath: str, XPath expression selecting the elements to remove
    :return: str, the serialized tree after removal
    """
    # The ``html`` parameter shadows the ``lxml.html`` module imported at file
    # level, so the module is imported here under a non-conflicting name.
    from lxml import html as lxml_html

    # Parse the caller's markup (the ``html`` argument, not a global variable).
    tree = lxml_html.fragment_fromstring(f'<div>{html}</div>')
    # xpath() returns a list of matches.
    for remove_tag in tree.xpath(xpath):
        parent = remove_tag.getparent()
        # The wrapper <div> is the root of the tree and has no parent; an
        # expression such as '//div[last()]' can match it, so skip it instead
        # of calling .remove() on None.
        if parent is not None:
            parent.remove(remove_tag)
    return etree.tounicode(tree)
if __name__ == '__main__':
    text = """
<html>
<head>
<title>這是标題</title>
</head>
<body>
<div>這是内容</div>
<div>要移除的内容</div>
</body>
</html>"""
    # Remove the elements matched by the XPath expression and print the result.
    xpath_expr = '//div[last()]'
    cleaned = remove_elements(text, xpath_expr)
    print(cleaned)
參考資料:
https://lxml.de/tutorial.html