# Python web scraping: a guide to using XPath
"""<<常用表达式规则>>
表达式: 含义:
nodename 选择此节点的所有子节点
/ 从当前节点选取直接子节点
// 从当前节点选取子孙节点
. 选取当前节点
.. 选取当前节点的父节点
@ 选取属性
* 选取所有信息
<<多属性匹配运算符介绍>>
运算符: 描述: 示例:
and 与 age>19 and age<21
or 或 age=19 or age=20
mod 计算除法的余数 5 mod 7
| 计算两个节点集的并集 //book | //cd
+ 加法 6 + 4
- 减法 6 - 4
* 乘法 6 * 4
div 除法 8 div 4
= 等于 age=19
!= 不等于 age!=19
< 小于 age<19
<= 小于等于 age<=19
> 大于 age>19
>= 大于等于 age>=19
"""
from lxml import etree
# Sample HTML document used as the fixture for the XPath examples in parse().
# It contains two <ul> lists (class "list" / "list two") of three <li> items
# each, plus paragraphs holding <a> links with "sister"/"child"/"parent"
# classes. Note that the ids "link1" and "link3" each appear twice in this
# fixture (once inside a list item and once inside the story paragraph).
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<ul class="list" id="list-1">
<li class="element"><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list two" id="list-2">
<li class="element"><a href="http://example.com/tillie" class="parent" id="link3">Tillie</a>evan</li>
<li class="element">jane</li>
<li class="element">summer</li>
</ul>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="child" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="parent" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">End...</p>
</body>
</html>
"""
def parse():
    """Run a series of demonstration XPath queries against ``html_doc``.

    Parses the module-level ``html_doc`` string with ``etree.HTML``, prints
    the serialized tree, and then prints the result of every XPath
    expression listed below, one result per line, in the listed order.
    Nothing is returned.
    """
    tree = etree.HTML(html_doc)

    # Show the tree as parsed, serialized back to text.
    serialized = etree.tostring(tree)
    print(serialized.decode('utf-8'))

    queries = (
        # --- text and attribute access ---
        '//a[@class="parent"]/text()',   # text of <a> whose class is "parent"
        '//ul/@class',                   # class attribute of every <ul>
        '//ul/attribute::*',             # every attribute of every <ul>
        # --- attribute predicates ---
        '//ul[@class="list"]',           # class must equal "list" exactly
        '//ul[contains(@class, "two")]',  # class merely contains "two"
        '//ul[contains(@class, "two") and @id="list-2"]',  # combined test
        # --- all nodes / plain element selection ---
        '//*',
        '//li',
        # --- parent and ancestor axes ---
        '//li/parent::*',
        '//li/..',                       # abbreviated form of parent::node()
        '//li/ancestor::*',
        '//li/ancestor::ul',
        # --- child and descendant axes ---
        '//ul/child::*',
        '//ul/child::li',
        '//ul/li',                       # abbreviated form of child::li
        '//ul/descendant::*',
        '//ul/descendant::a',
        '//ul//a',                       # abbreviated descendant lookup
        # --- sibling / following axes ---
        '//li[1]/following-sibling::*',
        '//li[1]/following::*',
        '//li[1]/following::*[2]',
        # --- positional predicates (XPath positions are 1-based) ---
        '//ul/li[1]',                    # first <li> of each <ul>
        '//ul/li[last()]',               # last <li> of each <ul>
        '//ul/li[last()-2]',             # third-from-last <li> of each <ul>
        '//ul/li[position()<3]',         # first two <li> of each <ul>
    )
    for query in queries:
        print(tree.xpath(query))
# Run the XPath demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    parse()