Using a Parsing Library (XPath), from 〈Python3网络爬虫开发实战〉

These are just notes for my own reference.

The workflow of parsing a web page with XPath:

from lxml import etree
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">first item</a></li>
        <li class="item-inactive"><a href="link3.html">first item</a></li>
        <li class="item-1"><a href="link4.html">first item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

html = etree.HTML(text)
result = etree.tostring(html)
print(result.decode('utf-8'))  # prints as a str
print(result)  # prints as bytes
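Note that etree.HTML() repairs the incomplete markup here (the last <li> is left unclosed on purpose), and tostring() adds the html and body wrappers. The examples below read the same markup from ./test.html; a minimal sketch that creates that file first, assuming test.html is meant to contain this very snippet:

with open('./test.html', 'w', encoding='utf-8') as f:
    f.write(text)  # save the snippet so the etree.parse('./test.html', ...) calls below have it to load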
# Alternatively, load an external file and parse it:
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
# All nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)
# All nodes with a specified tag name
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li')
print(result)  # the result is a list
print(result[0])
# Child nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a')  # all a nodes that are direct children of li nodes
result2 = html.xpath('//ul//a')  # all a nodes that are descendants of ul
result3 = html.xpath('//ul/a')  # a nodes that are direct children of ul (empty: a is not a direct child of ul)
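A quick sanity check of the three queries above (a sketch; the counts assume the five-li snippet in test.html):

print(len(result), len(result2), len(result3))  # expected: 5 5 0, because every a is a grandchild of ul, not a direct child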
# Parent nodes
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
result2 = html.xpath('//a[@href="link3.html"]/parent::*/@class') #另一种获取父节点的方法

# Attribute matching
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-1"]') # 返回class属性为item-1的两个li元素
# Text extraction
# Via a direct child (text under a specific child node)
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')
# Via descendants (all text)
html = etree.parse('./test.html', etree.HTMLParser())
result2 = html.xpath('//li[@class="item-0"]//text()') #最后一个〈li>上的换行符也返回

# Attribute extraction (note how this differs from attribute matching such as [@href="link4.html"])
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a/@href')
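This returns the href values themselves rather than elements; with the five-li snippet:

print(result)  # expected ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']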

# Matching an attribute that has multiple values
text = '''
<li class="li li-first"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)  # parse the new text snippet defined just above
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
# Matching on multiple attributes
text = '''
<li class="myli li-first" name="item"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "myli") and @name="item"]/a/text()')
print(result)
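The and used above is an XPath operator; or works the same way. A small sketch against the same single-li snippet (the class value no-such-class is made up for illustration):

result = html.xpath('//li[@name="item" or @class="no-such-class"]/a/text()')  # matches because the name condition holds
print(result)  # expected ['first item']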
# Selecting by position (indices start at 1, not 0)
html = etree.parse('./test.html', etree.HTMLParser())  # back to the original five-li snippet; the single-li text above would not match the comments below
result = html.xpath('//li[1]/a/text()')  # text of the first li
print(result)
result = html.xpath('//li[last()]/a/text()')  # text of the last li
print(result)
result = html.xpath('//li[position()<3]/a/text()')  # text of the li elements whose position is less than 3 (i.e. the first and second)
print(result)
result = html.xpath('//li[last()-2]/a/text()')  # text of the third-to-last li
print(result)
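For reference, re-running the four queries in a loop should show the expected lists in order (assuming the five-li snippet):

for query in ('//li[1]/a/text()', '//li[last()]/a/text()', '//li[position()<3]/a/text()', '//li[last()-2]/a/text()'):
    print(query, html.xpath(query))
# expected, in order: ['first item'], ['fifth item'], ['first item', 'second item'], ['third item']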
# Node axis selection
html = etree.parse('./test.html', etree.HTMLParser())  # again using the original five-li snippet
result = html.xpath('//li[1]/ancestor::*')  # all ancestor nodes of the first li
print(result)
result = html.xpath('//li[1]/ancestor::div')  # only the ancestor div of the first li
print(result)
result = html.xpath('//li[1]/attribute::*')  # all attribute values of the first li
print(result)
result = html.xpath('//li[1]/child::a[@href="link1.html"]')  # among the direct children of the first li, the a node whose href is link1.html
print(result)
result = html.xpath('//li[1]/descendant::span')  # only span descendants of the first li (a nodes excluded); empty here, since this snippet has no span
print(result)
result = html.xpath('//li[1]/following::*[2]')  # the 2nd node after the first li in document order (li elements count too): an a node
print(result)
result = html.xpath('//li[1]/following::*[1]')  # the 1st node after the first li in document order: an li node
print(result)
result = html.xpath('//li[1]/following-sibling::*')  # all sibling nodes that come after the current node
print(result)
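Putting the pieces together, a minimal sketch (assuming the five-li snippet is saved in ./test.html) that pairs every link's href with its text:

from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
for a in html.xpath('//li/a'):                     # every a that is a direct child of an li
    href = a.xpath('./@href')[0]                   # attribute extraction, relative to the current a node
    label = ''.join(a.xpath('.//text()')).strip()  # text extraction, relative to the current a node
    print(href, label)                             # e.g. "link1.html first item"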

Reposted from blog.csdn.net/sisqzy86/article/details/84203440