有一段html文本
<html>
<head>
<title>这是标题</title>
</head>
<body>
<div>这是内容</div>
<div>要移除的内容</div>
</body>
</html>
我希望把
<div>要移除的内容</div>
移除
安装lxml
pip install lxml
代码实例
# -*- coding: utf-8 -*-
from lxml import etree
# Sample document. It is well-formed markup, so the XML parser behind
# etree.fromstring() accepts it.
text = """
<html>
<head>
<title>这是标题</title>
</head>
<body>
<div>这是内容</div>
<div>要移除的内容</div>
</body>
</html>"""
tree = etree.fromstring(text)

# xpath() returns a list of the matching elements.
remove_tags = tree.xpath('//div[last()]')
if remove_tags:
    # Only the first match is removed; an element is detached through its
    # parent.
    remove_tag = remove_tags[0]
    remove_tag.getparent().remove(remove_tag)

# tostring(..., encoding='unicode') returns str and replaces the deprecated
# etree.tounicode().
print(etree.tostring(tree, encoding='unicode'))

# Expected output:
#
# <html>
# <head>
# <title>这是标题</title>
# </head>
# <body>
# <div>这是内容</div>
# </body>
# </html>
这样就实现了我的要求。还可以把这段逻辑封装为单独的函数,以便重复调用:
from lxml import html, etree
def remove_elements(html, xpath):
    """Remove every element matched by an XPath expression from HTML text.

    The input is wrapped in a ``<div>`` before parsing so that text holding
    several top-level nodes can be parsed as a single fragment; the returned
    markup therefore includes that wrapper element.

    :param html: HTML source text
    :param xpath: XPath expression selecting the elements to remove
    :return: the serialized markup that remains after the removal
    """
    # The ``html`` parameter shadows lxml's ``html`` module inside this
    # function, so the module is imported locally under a different name.
    from lxml import etree, html as lxml_html

    # Parse the argument that was passed in, not a module-level variable.
    tree = lxml_html.fragment_fromstring(f'<div>{html}</div>')

    # xpath() returns a list; looping handles zero, one or many matches.
    for remove_tag in tree.xpath(xpath):
        if remove_tag.getparent() is None:
            # A node without a parent cannot be detached from anything.
            continue
        # drop_tree() removes the element together with its children but
        # keeps the tail text that follows it, which
        # getparent().remove() would silently discard.
        remove_tag.drop_tree()

    # Same serialization as the deprecated etree.tounicode().
    return etree.tostring(tree, encoding='unicode')
if __name__ == '__main__':
    # Demo: remove each <div> that is the last <div> among its siblings
    # from a sample page and print the remaining markup.
    text = """
<html>
<head>
<title>这是标题</title>
</head>
<body>
<div>这是内容</div>
<div>要移除的内容</div>
</body>
</html>"""
    print(remove_elements(text, '//div[last()]'))
参考
https://lxml.de/tutorial.html