Crawler Chapter 2 (Data Parsing with XPath)

The second part covers data parsing, which can be done with three tools:

  1. xpath
  2. bs4 (BeautifulSoup 4)
  3. Regular expressions
    This chapter mainly introduces XPath expressions.
from lxml import etree
import parsel
# Sample HTML fragment used by every XPath example below.
html_str = """
    <div>
			<ul>
				<li class="item1">
				    <a href="link1.html">1</a>
				</li>
				<li class="item2">
			    	<a href="link1.htm2">2</a>
				</li>
				<li class="item3">
				    <a href="link1.htm3">3</a>
				</li>
				<li class="item4">
				    <a href="link1.htm4">4</a>
				</li>
				<li class="item5">
				    <a href="link1.htm5">5</a>
				</li>
			</ul>
		</div>
"""

# 1. Convert the raw string into a Selector object.
#    The parser fills in missing tags, which is why the absolute path in
#    example (1) can start at /html/body even though html_str has neither.
#    Append .extract() to a query to get the matched data out as strings.
data = parsel.Selector(text=html_str)

# 2. Parse the data. Uncomment one example at a time to try it.

# (1) Start from the root node and walk down to every <a> tag.
# result = data.xpath('/html/body/div/ul/li/a').extract()
# print(result)

# (2) Skip intermediate nodes with "//" to get every <a> tag.
# result0 = data.xpath('//a').extract()
# print(result0)

# (3) Select relative to the current node with "./".
#     Use case: several follow-up queries on the children of one selection.
# result1 = data.xpath('//ul')
# result2 = result1.xpath('./li').extract()
# result3 = result1.xpath('./li/a').extract()
# print(result2)
# print(result3)

# (4) Read an attribute of the current node's parent with "../".
#     Common tag attributes: class id href tag src title
# result = data.xpath('//a')
# result4 = result.xpath('../@class').extract()
# print(result4)

# (5) Get the third <li> node.
# result = data.xpath('//li[3]').extract()   # XPath indexes start at 1
# print(result)
# result = data.xpath('//li/a').extract()    # Python list indexes start at 0
# print(result[2])

# (6) Locate the fourth <a> tag by its attribute value.
# result = data.xpath('//a[@href="link1.htm4"]').extract()
# print(result)

# (7) Locate an <a> tag by its attribute and get the text it wraps.
#     NOTE: the href used here matches the fifth <a> tag, not the fourth.
# result = data.xpath('//a[@href="link1.htm5"]/text()')[0].extract()
# print(result)

# (8) Get the href attribute value of the fifth <a> tag.
# result = data.xpath('//li[5]/a/@href').extract()
# print(result)

# Fuzzy matching with contains().
# result = data.xpath('//li[contains(@class,"ite")]').extract()
# print(result)

# Select several things at once with the "|" operator in the path expression:
# here, the class attribute of each <li> together with the text of each <a>.
# result = data.xpath('//li/@class|//a/text()').extract()
# print(result)

# Summary
# 1. Overview: XPath is a language for parsing, finding and extracting information.
# 2. Node relationships: root node, child node, sibling node.
# 3. Key syntax - select nodes anywhere in the document: //
# 4. Key syntax - select nodes by attribute: tag[@attribute='value']
# 5. Get the text of a node: text()
# 6. Get the value of an attribute: @attribute

Guess you like

Origin blog.csdn.net/weixin_45079974/article/details/109145281