lxml 是一个 HTML/XML 解析器,主要功能是解析 HTML/XML 文档并从中提取数据,可以通过 pip install lxml 安装。
1、lxml 可以自动修正 HTML 代码:下面的例子里不仅补全了 li 标签,还自动添加了 html、body 标签。另外需要注意,XPath 中节点的索引从 1 开始。
from lxml import etree
# Sample HTML fragment.  Everything between the triple quotes is document
# content, including the "# ..." note after the last <a> element -- it is text
# inside the string, not a Python comment.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a> # 注意,此
处缺少一个 </li> 闭合标签
</ul>
'''
# Parse the string into an lxml element tree; the HTML parser repairs the
# markup and wraps the fragment in <html><body>.
root = etree.HTML(text)
# Serialize the tree back to a str.  encoding="unicode" returns text directly;
# the default serialization is ASCII bytes, which would turn the non-ASCII
# characters in the fragment into numeric character references.
html = etree.tostring(root, encoding="unicode")
print(html)
print(type(html))
<html><body><div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a> # 注意,此
处缺少一个 </li> 闭合标签
</ul>
</div></body></html>
<class 'str'>
2、获取文本内容、标签属性值
from lxml import etree
# Sample HTML fragment.  The "# ..." note after the last <a> element is part
# of the document text, not a Python comment.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a> # 注意,此
处缺少一个 </li> 闭合标签
</ul>
'''
html = etree.HTML(text)
# Direct text nodes of every <li class="item-0"> under a <ul>.
result = html.xpath('//ul/li[@class="item-0"]/text()')
# Direct text nodes of the fifth <li> (XPath positions start at 1).
_result = html.xpath('//ul/li[5]/text()')
print(result)
print(_result)
# All <li> elements in the document.
result = html.xpath('//li')
print(len(result))
for con in result:
    # Text of the <a> directly under this <li>; [0] takes the first match.
    content = con.xpath('./a/text()')[0]
    # Text of the <a> only when its href contains "1.html"; an empty list
    # otherwise.
    data = con.xpath('./a[contains(@href, "1.html")]/text()')
    # Value of the <a>'s href attribute (shown for illustration, not printed).
    href = con.xpath('./a/@href')[0]
    print(data)
    print(content)
[' # 注意,此\n处缺少一个 ']
[' # 注意,此\n处缺少一个 ']
5
['first item']
first item
[]
second item
[]
third item
[]
fourth item
[]
fifth item
3、从文件中读取 HTML 文本
from lxml import etree
# Read and parse the file with an explicit HTML parser.  etree.parse() uses
# the XML parser by default, which raises on HTML that is not well-formed
# (for example, unclosed tags).
html = etree.parse('test.html', etree.HTMLParser())
# Show the type returned by etree.parse().
print(type(html))
result = html.xpath('//li')
# Print the list of matched <li> elements.
print(result)
print(len(result))
print(type(result))
# NOTE: result[0] assumes the file contains at least one <li>.
print(type(result[0]))