Xpath解析库的使用

### Xpath常用规则
## nodename     选取当前节点下名称为nodename的子节点
## /            从当前节点选取直接子节点
## //           从当前节点选取子孙节点
## .            选取当前节点
## ..           选取当前节点的父节点
## @            选取属性

### 测试文本(注意:第三个<a>标签缺少闭合的</a>,后面的示例会用它演示lxml对html的自动修正)
text = '''
<ul id="dmr" name="liebiao">
<li data-closeper="" aria-label="查看更多" role="menuitem" aria-haspopup="true" data-groupid="104" class="J_Cat a-all">
<a data-cid="1" data-dataid="222878" >家电</a>
<a data-cid="1" data-dataid="222908" >数码</a>
<a data-cid="1" data-dataid="222879" >手机

<i aria-hidden="true" class="tb-ifont service-arrow"></i>
</li>
</ul>
'''

1. etree示例引入

## Introductory etree example
from lxml import etree

# Build an etree HTML element from the string (XPath queries can then be run on it)
html = etree.HTML(text)
# Build a tree by parsing the file at path './text' with an HTML parser.
# NOTE(review): this reads a file on disk, not the `text` variable; presumably the file holds the same markup.
html2 = etree.parse('./text', etree.HTMLParser())
# tostring serializes the tree to bytes; the result shows the repaired markup,
# e.g. the </a> tag that is missing from the sample markup
result = etree.tostring(html)
result2 = etree.tostring(html2)
print(html, html2)
print(type(html), type(html2))
'''
输出内容:
<Element html at 0x2b47848> <lxml.etree._ElementTree object at 0x0000000002B47788>
<class 'lxml.etree._Element'> <class 'lxml.etree._ElementTree'>
'''
# Print the repaired html
print(result.decode('utf-8'))
print(result2.decode('utf-8'))

2. 提取页面下的所有节点

## Select every node in the page
from lxml import etree

html = etree.HTML(text)
# '//*' matches every element in the document, including the <html> and <body>
# wrappers that show up in the recorded output below
result = html.xpath('//*')
print(len(result))
print(result)

'''
输出结果:
8
[<Element html at 0x2b539c8>, <Element body at 0x2b53948>, <Element ul at 0x2b53a08>, <Element li at 0x2b53a48>, <Element a at 0x2b53a88>, <Element a at 0x2b53b08>, <Element a at 0x2b53b48>, <Element i at 0x2b53b88>]
'''

3. 提取子节点

## Select child nodes
from lxml import etree

# NOTE(review): parses the file at path './text', not the `text` variable
html = etree.parse('./text', etree.HTMLParser())
# Use / to find the <a> elements that are direct children of <li>
result = html.xpath('//li/a')
# Use // to find the <a> elements anywhere below <ul> (children and deeper descendants)
result2 = html.xpath('//ul//a')
print(len(result), len(result2))
print(result, result2)

'''
运行结果:
3 3
[<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>] [<Element a at 0x2963cc8>, <Element a at 0x2963d08>, <Element a at 0x2963d48>]
'''

4. 提取父节点

## Select the parent node
from lxml import etree

html = etree.HTML(text)
# Find the <li> whose role attribute is "menuitem", step up to its parent with '..',
# and read that parent's name attribute
result = html.xpath('//li[@role="menuitem"]/../@name')
print(result)

'''
输出结果:
['liebiao']
'''

5. 属性匹配

## Match on an attribute value
html = etree.HTML(text)
# Match the <a> element whose data-dataid attribute equals "222878"
result = html.xpath('//a[@data-dataid="222878"]')
print(result)

'''
输出内容:
[<Element a at 0x2973c48>]
'''

6. 提取文本内容

## Extract text content
html = etree.HTML(text)
# text() returns the text of the <a> element whose data-dataid equals "222878"
result = html.xpath('//a[@data-dataid="222878"]/text()')
print(result)

'''
输出内容:
['家电']
'''

7. 属性值获取

## Read an attribute value
from lxml import etree

html = etree.HTML(text)
# '@aria-label' as the last step returns the attribute's value instead of the element
result = html.xpath('//li/@aria-label')
print(result)

'''
输出内容:
['查看更多']
'''

8. 属性多值匹配

## Match an attribute that holds several values
from lxml import etree

html = etree.HTML(text)
# '=' compares the whole attribute string, so "J_Cat" alone does not match class="J_Cat a-all"
result = html.xpath('//li[@class="J_Cat"]')
# Matching the complete attribute string works
result2 = html.xpath('//li[@class="J_Cat a-all"]//text()')
# contains() matches when the attribute string contains the given value
result3 = html.xpath('//li[contains(@class, "J_Cat")]//text()')
print(result, result2, result3)

'''
输出结果:
[] ['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n'] ['\n', '家电', '\n', '数码', '\n', '手机\n\n', '\ue62e', '\n']
'''

9. 多属性匹配

## Match on several attributes at once
## XPath operators
# or        logical or
# and       logical and
# mod       modulo (remainder of a division)
# |         union of two node sets
# +         addition
# -         subtraction
# *         multiplication
# =         equal
# !=        not equal
# <         less than
# <=        less than or equal
# >         greater than
# >=        greater than or equal
from lxml import etree

html = etree.HTML(text)
# Both conditions must hold: class contains "J_Cat" and role equals "menuitem"
result = html.xpath('//li[contains(@class, "J_Cat") and @role="menuitem"]/a/text()')
print(result)

'''
输出结果:
['家电', '数码', '手机\n\n', '\n']
'''

10. 按序选择,通过索引的方式进行选择

## Select by position (index-based selection)
from lxml import etree

html = etree.HTML(text)
# Text of the first <a> under <li> (XPath positions start at 1)
print(html.xpath('//li/a[1]/text()'))
# Text of the last <a> under <li>
print(html.xpath('//li/a[last()]/text()'))
# Text of the <a> elements under <li> whose position is less than 3
print(html.xpath('//li/a[position()<3]/text()'))
# Text of the second-to-last <a> under <li>
print(html.xpath('//li/a[last()-1]/text()'))

# NOTE(review): the recorded output below has three lines for four print calls;
# the line for the first print (presumably ['家电']) appears to be missing.
'''
输出结果:
['手机\n\n', '\n']
['家电', '数码']
['数码']
'''

11. 节点轴选择

## Selecting along node axes
# ancestor axis: all ancestors of the node
# attribute axis: all attribute values of the node
# child axis: all direct children of the node
# descendant axis: all descendants of the node (children and deeper)
# following axis: every node that comes after the node
# following-sibling axis: the siblings that come after the node (not every sibling)
from lxml import etree

html = etree.HTML(text)
print(html.xpath('//li/a[1]/ancestor::*'))
print(html.xpath('//li/a[1]/ancestor::ul'))
print(html.xpath('//li/a[1]/attribute::*'))
print(html.xpath('//li[1]/child::*'))
print(html.xpath('//ul[1]/descendant::a'))
print(html.xpath('//a[1]/following::*'))
print(html.xpath('//a[1]/following-sibling::*'))

'''
输出结果:
[<Element html at 0x2b53b88>, <Element body at 0x2b53b48>, <Element ul at 0x2b53d88>, <Element li at 0x2b53bc8>]
[<Element ul at 0x2b53b48>]
['1', '222878']
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
[<Element a at 0x2b53b48>, <Element a at 0x2b53d88>, <Element i at 0x2b53bc8>]
[<Element a at 0x2b53d88>, <Element a at 0x2b53bc8>]
'''

12. 用Xpath解析爬取豆瓣top250

### 用Xpath解析爬取豆瓣top250

from lxml import etree
import requests, json

def get_page(url):
    '''
    Download a page and return its HTML source.

    :param url: address of the page to fetch
    :return: the response body as text
    :raises SystemExit: when the server answers with a status code other than 200
    :raises requests.exceptions.RequestException: on connection errors or when
        the server does not respond within the timeout
    '''

    # A browser-like User-Agent is sent with the request instead of the requests default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    # requests has no default timeout; without one a stalled server blocks the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        # Equivalent to exit('...') but does not rely on the helper injected by the site module.
        raise SystemExit('get page fail...')
    print('get page success...')
    return response.text

def parse_page(text):
    '''
    Parse one Douban Top 250 listing page and yield one record per movie.

    :param text: HTML source of a listing page
    :return: generator of dicts with the keys img, details, name, director,
             actor, time, nation, type, score and introduction.
             'type' is a list of tokens; 'introduction' is the list of quote
             strings, or the string 'None' when the item has no quote;
             'actor' is the string 'None' when the first info line has
             five tokens or fewer.
    :raises IndexError: when an item lacks one of the expected nodes or tokens
    '''

    html = etree.HTML(text)
    items = html.xpath('//ol[@class="grid_view"]/li/div[@class="item"]')
    for item in items:
        # Evaluate the info-paragraph XPath once per item instead of once per field.
        info = item.xpath('.//div[@class="bd"]/p[@class=""]/text()')
        # First text node: whitespace-separated tokens, index 1 is used as the
        # director and index 5 as the actor.
        # NOTE(review): the exact page layout behind these indexes is not visible here.
        people = info[0].split()
        # Second text node: index 0 is the year, index 2 the nation, index 4+ the genres.
        facts = info[1].split()
        quote = item.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
        yield {
            'img': item.xpath('.//div[@class="pic"]//img/@src')[0],
            'details': item.xpath('.//div[@class="hd"]/a/@href')[0],
            'name': item.xpath('.//div[@class="hd"]//span[1]/text()')[0],
            'director': people[1],
            'actor': people[5] if len(people) > 5 else 'None',
            'time': facts[0],
            'nation': facts[2],
            'type': facts[4:],
            'score': item.xpath('.//div[@class="bd"]/div/span[@class="rating_num"]/text()')[0],
            'introduction': quote or 'None',
        }

def save_to_file(data):
    '''
    Append one scraped record to the output text file as a single JSON line.

    :param data: JSON-serializable record to store
    :return: None
    '''
    with open('豆瓣电影top250.txt', mode='a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        record = json.dumps(data, ensure_ascii=False)
        out.write(record + '\n')

def main(start):
    '''
    Fetch one listing page, then print and store every movie found on it.

    :param start: value of the "start" query parameter for the page to fetch
    :return: None
    '''
    page_source = get_page(f'https://movie.douban.com/top250?start={start!s}')
    for movie in parse_page(page_source):
        print(movie)
        save_to_file(movie)



# Script entry point: process ten pages, advancing the start offset by 25 per page (0, 25, ..., 225).
if __name__ == '__main__':
    for i in range(10):
        start = i * 25
        main(start)
View Code

猜你喜欢

转载自www.cnblogs.com/Caiyundo/p/12503931.html