Using a Parsing Library (XPath), from 〈Python3网络爬虫开发实战〉

These are just notes for my own reference.

The workflow of parsing a web page with XPath:

from lxml import etree
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">first item</a></li>
        <li class="item-inactive"><a href="link3.html">first item</a></li>
        <li class="item-1"><a href="link4.html">first item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

html = etree.HTML(text)
result = etree.tostring(html)
print(result.decode('utf-8'))  # prints as a str
print(result)  # prints as bytes
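Note that etree.HTML() repairs the incomplete markup here (the last <li> is left unclosed on purpose), and tostring() adds the html and body wrappers. The examples below read the same markup from ./test.html; a minimal sketch that creates that file first, assuming test.html is meant to contain this very snippet:

with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(text)  # save the snippet so the etree.parse('./test.html', ...) calls below have it to load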
# Alternatively, load an external file and parse it:
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
# All nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)
# All nodes with a specified tag name
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li')
print(result)  # the result is a list
print(result[0])
# Child nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a')  # all a nodes that are direct children of li nodes
result2 = html.xpath('//ul//a')  # all a nodes that are descendants of ul
result3 = html.xpath('//ul/a')  # a nodes that are direct children of ul (empty: a is not a direct child of ul)
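A quick sanity check of the three queries above (a sketch; the counts assume the five-li snippet in test.html):

print(len(result), len(result2), len(result3))  # expected: 5 5 0, because every a is a grandchild of ul, not a direct child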
# Parent nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
result2 = html.xpath('//a[@href="link3.html"]/parent::*/@class') #另一种获取父节点的方法

# Attribute matching
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-1"]') # 返回class属性为item-1的两个li元素
# Text extraction
# Via a direct child (text under a specific child node)
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')
# Via descendants (all text)
html = etree.parse('./test.html', etree.HTMLParser())
result2 = html.xpath('//li[@class="item-0"]//text()') #最后一个〈li>上的换行符也返回

# Attribute extraction (note how this differs from attribute matching such as [@href="link4.html"])
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a/@href')
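This returns the href values themselves rather than elements; with the five-li snippet:

print(result)  # expected ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']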

# Matching an attribute that has multiple values
text = '''
<li class="li li-first"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)  # parse the new text snippet defined just above
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
# Matching on multiple attributes
text = '''
<li class="myli li-first" name="item"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "myli") and @name="item"]/a/text()')
print(result)
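The and used above is an XPath operator; or works the same way. A small sketch against the same single-li snippet (the class value no-such-class is made up for illustration):

result = html.xpath('//li[@name="item" or @class="no-such-class"]/a/text()')  # matches because the name condition holds
print(result)  # expected ['first item']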
# Selecting by position (indices start at 1, not 0)
html = etree.parse('./test.html', etree.HTMLParser())  # back to the original five-li snippet; the single-li text above would not match the comments below
result = html.xpath('//li[1]/a/text()')  # text of the first li
print(result)
result = html.xpath('//li[last()]/a/text()')  # text of the last li
print(result)
result = html.xpath('//li[position()<3]/a/text()')  # text of the li elements whose position is less than 3 (i.e. the first and second)
print(result)
result = html.xpath('//li[last()-2]/a/text()')  # text of the third-to-last li
print(result)
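For reference, re-running the four queries in a loop should show the expected lists in order (assuming the five-li snippet):

for query in ('//li[1]/a/text()', '//li[last()]/a/text()', '//li[position()<3]/a/text()', '//li[last()-2]/a/text()'):
    print(query, html.xpath(query))
# expected, in order: ['first item'], ['fifth item'], ['first item', 'second item'], ['third item']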
# Node axis selection
html = etree.parse('./test.html', etree.HTMLParser())  # again using the original five-li snippet
result = html.xpath('//li[1]/ancestor::*')  # all ancestor nodes of the first li
print(result)
result = html.xpath('//li[1]/ancestor::div')  # only the ancestor div of the first li
print(result)
result = html.xpath('//li[1]/attribute::*')  # all attribute values of the first li
print(result)
result = html.xpath('//li[1]/child::a[@href="link1.html"]')  # among the direct children of the first li, the a node whose href is link1.html
print(result)
result = html.xpath('//li[1]/descendant::span')  # only span descendants of the first li (a nodes excluded); empty here, since this snippet has no span
print(result)
result = html.xpath('//li[1]/following::*[2]')  # the 2nd node after the first li in document order (li elements count too): an a node
print(result)
result = html.xpath('//li[1]/following::*[1]')  # the 1st node after the first li in document order: an li node
print(result)
result = html.xpath('//li[1]/following-sibling::*')  # all sibling nodes that come after the current node
print(result)
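Putting the pieces together, a minimal sketch (assuming the five-li snippet is saved in ./test.html) that pairs every link's href with its text:

from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
for a in html.xpath('//li/a'):                     # every a that is a direct child of an li
    href = a.xpath('./@href')[0]                   # attribute extraction, relative to the current a node
    label = ''.join(a.xpath('.//text()')).strip()  # text extraction, relative to the current a node
    print(href, label)                             # e.g. "link1.html first item"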

Reposted from blog.csdn.net/sisqzy86/article/details/84203440