# Documentation: https://lxml.de/lxmlhtml.html#cleaning-up-html (code example)
# -*- coding: utf-8 -*-
from lxml.html.clean import Cleaner
# Sample news fragment: a paragraph carrying a CMS-specific attribute, with an
# image wrapped in a link that has href/class/target attributes.
html = """
<p cms-style="font-L">
<strong>鐵打的騰訊</strong>
<a href="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg" class="keyword f_st" target="_blank">
<img src="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg" alt="">
</a>
</p>
"""

# The <a> wrapper is not wanted either; only the content inside it is kept.
remove_tags = frozenset(('a',))

# Most attributes are not worth storing with a news article (they only take up
# disk space), so the whitelist keeps just the src attribute of image tags.
safe_attrs = frozenset(('src',))

cleaner = Cleaner(remove_tags=remove_tags, safe_attrs=safe_attrs)
cleaned_html = cleaner.clean_html(html)
print(cleaned_html)

# Output shown by the original author, kept as a no-op string literal:
'''
<p>
<strong>鐵打的騰訊</strong>
<img src="//n.sinaimg.cn/tech/crawl/115/w550h365/20200326/963a-irkazzv3237667.jpg">
</p>
'''
# After cleaning, the content is much more concise.