Python3 网络爬虫学习:第四章 解析库的使用 —— 4.1 XPath 解析库的使用

from lxml import etree
# xpath("//*")  获取文档中的所有节点
# xpath("//body//*")  获取body节点的所有子孙节点
# xpath("/html/*")  获取html节点的所有直接子节点

# xpath("//body/p[@class='title']")  body节点下class属性值为'title'的p节点
#获取节点属性
# xpath("//body/p/@name")  body节点下所有p节点的name属性值,有多个时会全部返回

# xpath("//body/p[@class='title']/..")   body节点下class属性值为'title'的p节点的父节点
#文本获取:需要先定位到直接包含文本的节点,再用text()取出文本
#xpath("//body/p[@class='title']/b/text()")

#属性值多匹配:当某个节点的同一属性包含多个值(例如 class="story first")时,用 = 无法匹配,可以使用contains(@属性名, '属性值')进行选择
#例如 xpath("//body/p[contains(@class, 'story')]")

#多属性匹配,中间用and连接
#xpath("//body/p[@class='title' and @name = 'dromouse']/b/text()")
#这里 and 其实是运算符,还有许多运算符:
# or 或      and 且      | 求两个节点集的并集,例如 //b | //cd 返回由所有b元素和cd元素组成的节点集
#xpath("//body/p[@class='title' or @class = 'story']//text()")
#xpath("//head//text() | //body/p[@class='title']//text()")


#当匹配到多个节点时,可以在XPath表达式中用方括号加序号选择其中某一个;注意序号是从1开始,而不是Python下标常用的0,并且返回结果仍然是列表
# result = html.xpath("//body/p[1]")
# result1 = html.xpath("//body/p[2]")
# result2 = html.xpath("//body/p[3]")
# print(result,result1,result2)

#节点轴的使用
#ancestor轴:获取所有祖先节点。轴名后面需要两个冒号,再后面是节点选择器
# xpath("//p[@class = 'title']/ancestor::*")
#attribute轴:获取节点的属性值。轴名后面需要两个冒号,再后面是属性选择器(*表示所有属性)
# xpath("//p[@class = 'title']/attribute::*")
#child轴:获取直接子节点。轴名后面需要两个冒号,再后面是节点选择器
# xpath("//p[@class = 'title']/child::b")
#descendant轴:获取所有子孙节点。轴名后面需要两个冒号,再后面是节点选择器
# xpath("//p[@class = 'story']/descendant::*")




# text = """
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...</p>
# """
# html =  etree.HTML(text)
# result = etree.tostring(html)
# with open("./test.html","wb")as f :
#     f.write(result)
"""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""
from lxml import etree

# Load the saved page from disk using lxml's HTML parser.
html_parser = etree.HTMLParser()
html = etree.parse("./test.html", parser=html_parser)

# descendant axis: select every element nested at any depth beneath
# each <p class="story"> node.
story_descendants_query = "//p[@class = 'story']/descendant::*"
result = html.xpath(story_descendants_query)
print(result)

猜你喜欢

转载自blog.csdn.net/luslin/article/details/81704359
今日推荐