Python crawling with XPath: a technique-sharing guide to writing more efficient and concise crawler code.

1. Get the document tree object

First build a document tree object with lxml's etree; once you have that object, you can run XPath expressions against it to select elements in the tree.

2. Text conversion document tree object

def strToEleObj():
    """Parse an HTML string into an lxml element tree and print it back out.

    Note: etree.HTML() repairs malformed markup (the deliberately missing
    </li> below) and wraps the fragment in <html><body> tags.
    """
    markup = '''
          <div>
              <ul>
                   <li class="item-0"><a href="link1.html">first item</a></li>
                   <li class="item-1"><a href="link2.html">second item</a></li>
                   <li class="item-inactive"><a href="link3.html">third item</a></li>
                   <li class="item-1"><a href="link4.html">fourth item</a></li>
                   <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
               </ul>
           </div>
          '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # Serialize the (repaired) tree back to bytes, decode for display.
    serialized = etree.tostring(tree)
    print(serialized.decode('utf-8'))

3. File conversion to a document tree object

def fileToEleObj():
    """Parse the external file ``./index.html`` into an lxml document tree
    and print a pretty-formatted serialization of it.

    Raises:
        OSError / lxml parse errors if ``./index.html`` is missing or invalid.
    """
    # Read the external file index.html into a document tree.
    html = etree.parse('./index.html')
    # pretty_print=True produces indented, human-readable output.
    result = etree.tostring(html, pretty_print=True)
    # BUG FIX: tostring() returns bytes; printing them raw shows the
    # b'...' repr with literal "\n" escapes, which defeats pretty_print.
    # Decode first, matching strToEleObj's behavior.
    print(result.decode('utf-8'))

node, element, attribute, content

The idea of XPath is to find nodes through path expressions. Nodes include elements, attributes, and content.

path expression

/ root node, node separator,
// anywhere
. current node
.. parent node (two dots, not three)
@ attribute (e.g. @href selects the href attribute)

example:

from lxml import etree
'''
    路径表达式
'''
def get_el_list():
    """Demonstrate basic XPath path expressions: '.', '/', '//' and '@'."""
    markup = '''
              <div>
                  <ul>
                       <li class="item-0"><a href="link1.html">first item</a></li>
                       <li class="item-1"><a href="link2.html">second item</a></li>
                       <li class="item-inactive"><a href="link3.html">third item</a></li>
                       <li class="item-1"><a href="link4.html">fourth item</a></li>
                       <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
                   </ul>
               </div>
              '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # '.' selects the current node — the <html> wrapper that etree.HTML
    # adds around the fragment (see etree.tostring(tree)).
    print('获取当前节点---> ', tree.xpath('.'))
    # '/' looks for a root above the current node; the current node is
    # already <html>, so nothing matches and an empty list comes back.
    print('获取 根节点 标签---> ', tree.xpath('/'))
    # '//li' selects every <li> anywhere in the document.
    print('获取 li 标签---> ', tree.xpath('//li'))
    # '@href' pulls the attribute values of the matched <a> elements.
    print('获取li下的 a 标签属性----> ', tree.xpath('//li/a/@href'))
    # No <p> exists, so the result is an empty list.
    print('获取 p 标签----> ', tree.xpath('//p '))

output result

获取当前节点--->  [<Element html at 0x2a989854200>]
获取 根节点 标签--->  []
获取 li 标签--->  [<Element li at 0x2a9898ece40>, <Element li at 0x2a9899240c0>, <Element li at 0x2a989924180>, <Element li at 0x2a9899241c0>, <Element li at 0x2a989924200>]
获取li下的 a 标签属性---->  ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
获取 p 标签---->  []

  1. After converting the doc into a document object, the content is wrapped in <html></html> tags; so the current node obtained is the html element;
  2. The current node is HTML, and there is no root node, so the return is an empty array: [];
  3. When querying a node that does not exist, an empty array is returned: []

wildcard

*   任意元素
@*  任意属性
node()  任意子节点(元素,属性,内容)

example:


'''
    通配符
'''
from lxml import etree
def get_el_by_anyChar():
    """Demonstrate the XPath wildcards node(), * and @*."""
    markup = '''
              <div>
                  <ul class="ul" >
                       <li class="item-0"><a href="link1.html">first item</a></li>
                       <li class="item-1"><a href="link2.html">second item</a></li>
                       <li class="item-inactive"><a href="link3.html">third item</a></li>
                       <li class="item-1"><a href="link4.html">fourth item</a></li>
                       <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
                   </ul>
               </div>
              '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # node() matches every child node: elements AND text nodes
    # (including the pure-whitespace text between tags).
    print('获取 ul 下的所有子节点---> ', tree.xpath('//ul/node()'))
    # * matches any child element; text nodes are excluded.
    print('获取 ul 下 任意元素[所有的]---> ', tree.xpath('//ul/*'))
    # @* matches any attribute of <ul> — here just class="ul".
    print('获取 ul 下 任意属性[所有的]---> ', tree.xpath('//ul/@*'))

output result

获取 ul 下的所有子节点--->  ['\n                       ', <Element li at 0x1d4792b5e80>, '\n                       ', <Element li at 0x1d4792b5e00>, '\n                       ', <Element li at 0x1d4792b5f00>, '\n                       ', <Element li at 0x1d4792b5f40>, '\n                       ', <Element li at 0x1d4792b5ec0>, ' 闭合标签\n                   ']
获取 任意元素--->  [<Element li at 0x1d47928dd80>, <Element li at 0x1d4792b5e80>, <Element li at 0x1d4792b5fc0>, <Element li at 0x1d4792b5e00>, <Element li at 0x1d4792b5f00>]
获取 任意属性--->  ['ul']

predicate

//a[n] n为大于零的整数,代表子元素排在第n个位置的<a>元素
//a[last()]   last()  代表子元素排在最后个位置的<a>元素
//a[last()-1]  和上面同理,代表倒数第二个
//a[position()<3] 位置序号小于3,也就是前两个,这里我们可以看出xpath中的序列是从1开始
//a[@href]    拥有href的<a>元素
//a[@href='www.baidu.com']    href属性值为'www.baidu.com'<a>元素
//book[@price>2]   price值大于2<book>元素

example;

from lxml import etree
def get_el_by_wei():
    """Demonstrate XPath predicates: positions, last(), and attribute tests."""
    markup = '''
                  <div>
                      <ul class="ul" >
                           <li class="item-0"><a href="link1.html">first item</a></li>
                           <li class="item-1"><a href="link2.html">second item</a></li>
                           <li class="item-inactive"><a href="link3.html">third item</a></li>
                           <li class="item-1"><a href="link4.html">fourth item</a></li>
                           <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
                       </ul>
                   </div>
                  '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # [1]: XPath positions start at 1, not 0.
    print('获取第一个 ---> ', tree.xpath('//li[1]/a/text()'))
    # last(): the final <li>.
    print('获取最后一个 ---> ', tree.xpath('//li[last()]/a/text()'))
    # last()-1: the second-to-last <li>.
    print('获取 倒数第二个---> ', tree.xpath('//li[last()-1]/a/text()'))
    # position()<3: the first two <li> elements.
    print('获取位置序号小于3 ---> ', tree.xpath('//li[position()<3]/a/text()'))
    # [@href]: only <a> elements that carry an href attribute.
    print('获取第一个 ---> ', tree.xpath('//a[@href]/text()'))
    # Attribute equality uses a single '=', not '=='.
    print('获取 a 标签下 href = link3.html的<a>元素---> ', tree.xpath('//a[@href="link3.html"]/text()'))
    # Match the <ul> whose class attribute equals "ul".
    print('获取 ul class == ul  ---> ', tree.xpath('//ul[@class="ul"]'))

multiple paths

Use | to connect two expressions, you can perform or match

//book/title | //book/price  

example;

from lxml import etree
def get_el_mutil_path():
    """Demonstrate combining two XPath expressions with the | (union) operator."""
    markup = '''
                  <div>
                      <ul class="ul" >
                           <li class="item-0"><a href="link1.html">first item</a></li>
                           <li class="item-1"><a href="link2.html">second item</a></li>
                           <li class="item-inactive"><a href="link3.html">third item</a></li>
                           <li class="item-1"><a href="link4.html">fourth item</a></li>
                           <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
                       </ul>
                   </div>
                  '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # '|' unions the match sets of both expressions — an "or" over paths.
    print('获取li 下 class = item-inactive 或者 item-1 ---> ', tree.xpath('//li[@class="item-inactive"] | //li[@class="item-1"] '))

output result;

获取li 下 class = item-inactive 或者 item-1 --->  [<Element li at 0x1b490955f40>, <Element li at 0x1b490966200>, <Element li at 0x1b490966180>]

function

contains(string1,string2)
starts-with(string1,string2)
# 文本
text()
# 最后一个
last()
# 位置
position()
# 获取所有节点 (get all nodes)
node()

'''
    函数
'''
from lxml import etree
def get_el_func():
    """Demonstrate XPath functions: contains(), text(), last(), position()."""
    markup = '''
                     <div>
                         <ul class="ul" >
                              <li class="item-0 active"><a href="link1.html">first item</a></li>
                              <li class="item-1"><a href="link2.html">second item</a></li>
                              <li class="item-inactive"><a href="link3.html">third item</a></li>
                              <li class="item-1"><a href="link4.html">fourth item</a></li>
                              <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
                          </ul>
                         </div>
                     '''
    # Convert the text into a document tree object.
    tree = etree.HTML(markup)
    # contains() is a substring match: it hits "item-0 active" and ALSO
    # "item-inactive", since that string contains "active" too.
    print(tree.xpath("//*[contains(@class,'active')]"))
    # text() of every <a> under a <li>.
    print(tree.xpath("//li/a/text()"))
    # Direct text of the last <li> — the stray text after its </a>.
    print(tree.xpath("//li[last()]/text()"))
    # position()=1: XPath indexing starts at 1, not 0.
    print(tree.xpath("//li[position()=1]/a/text()"))

output result;

[<Element li at 0x23ea36d0400>, <Element li at 0x23ea36d0180>]
['first item', 'second item', 'third item', 'fourth item', 'fifth item']
[' # 注意,此处缺少一个 ']
['first item']

Practical example

Get the movie name, brief description, and picture of a movie website

import requests
from lxml import etree

'''
     获取电影信息列表
'''
def get_moive_info_list(url, timeout=10):
    """Fetch one page of a movie-listing site and print the movie names,
    short descriptions, and poster image URLs found on it.

    Args:
        url: Page URL of the movie list to crawl.
        timeout: Seconds to wait for the HTTP response. Added because
            requests.get() without a timeout can block indefinitely on a
            dead server; default 10 keeps existing call sites working.

    Returns:
        A (title_list, desc_list, img_list) tuple of lists, so callers
        can consume the data instead of only reading stdout.
    """
    # Send a desktop browser User-Agent so the site serves the normal page.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    # BUG FIX: always pass a timeout — without one the request can hang forever.
    res = requests.get(url, headers=headers, timeout=timeout)
    # res.text is the decoded response body.
    html = res.text
    print('输出响应信息->', html)
    # Convert the text into a document tree object.
    selector = etree.HTML(html)
    # Movie names: the <h3> inside each card link.
    title_list = selector.xpath('//a[@class="pic-pack-outer"]/h3/text()')
    print('电影名称列表:', title_list)
    # Short descriptions: the <p> inside the same links.
    desc_list = selector.xpath('//a[@class="pic-pack-outer"]/p/text()')
    print('电影名称简单描述:', desc_list)
    # Poster images: the src attribute of each card's <img>.
    img_list = selector.xpath('//a[@class="pic-pack-outer"]/img/@src')
    print('图片列表:', img_list)
    return title_list, desc_list, img_list

# Script entry point: crawl one page of the movie listing.
if __name__ == '__main__':
        # NOTE(review): host is redacted ('xxxxxxx') — substitute the real
        # domain before running; output suggests an m1905.cn-style site.
        url = 'https://xxxxxxx/vod/list/n_1_t_25/o1p1.html'
        get_moive_info_list(url)

output result

输出响应信息-> <!DOCTYPE html>
·······
</body>
</html>
电影名称列表: ['辣妈犟爸', '五月梨花香', '岁岁平安',.....]
电影名称简单描述: ['年轻村官奋斗历程', '脱贫致富振兴家乡', .....]
图片列表: ['https://image11.m1905.cn/uploadfile/2022/0804/thumb_1_150_203_20220804094442559303.jpg',  .... 'https://image11.m1905.cn/uploadfile/2016/0926/thumb_1_150_85_20160926105222739343.jpg']

Please add a picture description

↓ ↓ ↓ Add the business card below to find me, directly get the source code and cases ↓ ↓ ↓

Guess you like

Origin blog.csdn.net/weixin_45841831/article/details/130505804