Python crawler series, episode 06 (XPath parsing, the lxml parsing library, hands-on cases)

One. XPath parsing

  • Install the XPath Helper plugin for Google Chrome (search Baidu for installation instructions)

1. Definition

XPath is the XML Path Language, a language for locating parts of an XML document. The same syntax also works for querying HTML documents.
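
As a minimal sketch of what that means in practice, the same XPath syntax can be evaluated against an HTML string with lxml (the HTML snippet below is made up for illustration):

from lxml import etree

html = '<ul><li class="item"><a href="/a">Link A</a></li></ul>'
p = etree.HTML(html)
print(p.xpath('//li[@class="item"]/a/text()'))   # ['Link A']
print(p.xpath('//li[@class="item"]/a/@href'))    # ['/a']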

2. Case demonstration: Lianjia hands-on project

import time
import random
import requests
from lxml import etree
from fake_useragent import UserAgent

class LianjiaSpider(object):
    def __init__(self):
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'

    def parse_html(self, url):
        """Fetch one page and hand the HTML to the extraction method"""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).content.decode('utf8', 'ignore')
        self.get_data(html)

    def get_data(self, html):
        """Extract each listing's fields with xpath"""
        p = etree.HTML(html)
        # Base xpath: [<element li at xxx>, <element li>, ...]
        li_list = p.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        # Iterate over the listings, putting each one's fields into a dict
        for li in li_list:
            item = {}
            # Name + location
            name_list = li.xpath('.//div[@class="positionInfo"]/a[1]/text()')
            item['name'] = name_list[0].strip() if name_list else None
            address_list = li.xpath('.//div[@class="positionInfo"]/a[2]/text()')
            item['address'] = address_list[0].strip() if address_list else None
            # Layout + area + orientation + furnishing + floor + year + type
            # h_list holds a single string with all seven fields separated by '|'
            h_list = li.xpath('.//div[@class="houseInfo"]/text()')
            info_list = h_list[0].split('|') if h_list else []
            if len(info_list) == 7:
                item['model'] = info_list[0].strip()
                item['area'] = info_list[1].strip()
                item['direct'] = info_list[2].strip()
                item['perfect'] = info_list[3].strip()
                item['floor'] = info_list[4].strip()
                item['year'] = info_list[5].strip()
                item['type'] = info_list[6].strip()
            else:
                item['model'] = item['area'] = item['direct'] = item['perfect'] = \
                    item['floor'] = item['year'] = item['type'] = None

            # Total price + unit price
            total_list = li.xpath('.//div[@class="totalPrice"]/span/text()')
            item['total'] = total_list[0].strip() if total_list else None
            unit_list = li.xpath('.//div[@class="unitPrice"]/span/text()')
            item['unit'] = unit_list[0].strip() if unit_list else None
            print(item)

    def run(self):
        for page in range(1, 2):
            url = self.url.format(page)
            self.parse_html(url)
            time.sleep(random.randint(1, 2))

if __name__ == '__main__':
    spider = LianjiaSpider()
    spider.run()

【Note】
1> Whenever a condition is involved, add [] : //dl[@class="xxx"]  //dl/dd[2]
2> Whenever you fetch an attribute value, add @ : //dl[@class="xxx"]  //p/a/@href
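
A quick sketch of both rules, run against a hypothetical snippet (the HTML is invented for illustration):

from lxml import etree

html = '<dl class="xxx"><dd><a href="/first">one</a></dd><dd><a href="/second">two</a></dd></dl>'
p = etree.HTML(html)
# [] adds a condition; XPath positions start at 1, so dd[2] is the second dd
print(p.xpath('//dl[@class="xxx"]/dd[2]/a/text()'))   # ['two']
# @ fetches an attribute value
print(p.xpath('//dl[@class="xxx"]/dd/a/@href'))       # ['/first', '/second']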

3. Selecting nodes

1】// : search among all nodes (including child and descendant nodes)
2】@  : get attribute values
  2.1> Scenario 1 (attribute value used as a condition)
       //div[@class="movie-item-info"]
  2.2> Scenario 2 (fetching the attribute value directly)
       //div[@class="movie-item-info"]/a/img/@src
3】Practice: Maoyan top100 movies (see the sketch after this list)
  3.1> Match the movie title
      //div[@class="movie-item-info"]/p[1]/a/@title
  3.2> Match the leading actors
      //div[@class="movie-item-info"]/p[2]/text()
  3.3> Match the release date
      //div[@class="movie-item-info"]/p[3]/text()
  3.4> Match the movie link
      //div[@class="movie-item-info"]/p[1]/a/@href
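
Here is how those practice expressions would behave with lxml, checked against a simplified snippet modeled on Maoyan's markup (not the real page):

from lxml import etree

html = '''
<div class="movie-item-info">
  <p class="name"><a href="/films/1203" title="Farewell My Concubine">Farewell My Concubine</a></p>
  <p class="star">Stars: Leslie Cheung, Zhang Fengyi, Gong Li</p>
  <p class="releasetime">Released: 1993-01-01</p>
</div>
'''
p = etree.HTML(html)
print(p.xpath('//div[@class="movie-item-info"]/p[1]/a/@title'))   # movie title
print(p.xpath('//div[@class="movie-item-info"]/p[2]/text()'))     # leading actors
print(p.xpath('//div[@class="movie-item-info"]/p[3]/text()'))     # release date
print(p.xpath('//div[@class="movie-item-info"]/p[1]/a/@href'))    # movie link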

4. Matching multiple paths (or)

xpath expression 1 | xpath expression 2 | xpath expression 3
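
For example (on a made-up snippet), matches from all the unioned expressions come back in one list, in document order:

from lxml import etree

html = '<div><h2>Title</h2><p>Body</p><span>Note</span></div>'
p = etree.HTML(html)
print(p.xpath('//h2/text() | //span/text()'))   # ['Title', 'Note']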

5. Commonly used functions

1】text() : gets a node's text content
    Without a trailing /text(), the xpath expression yields node objects
    With a trailing /text() or /@href, it yields strings
2】contains() : matches nodes whose attribute value contains a given substring
    Match div nodes whose class attribute contains the string 'movie-item':
     //div[contains(@class,"movie-item")]
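
A sketch of contains() on an invented snippet; this is handy when class attributes carry extra tokens:

from lxml import etree

html = '<div class="movie-item-hot">A</div><div class="movie-item">B</div><div class="other">C</div>'
p = etree.HTML(html)
print(p.xpath('//div[contains(@class,"movie-item")]/text()'))   # ['A', 'B']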

Two. Summary

1】Strings: when the xpath expression ends in /text() or /@href, the resulting list holds 'strings'
2】Node objects: in all other cases the resulting list holds 'node objects'
    [<element dd at xxxa>, <element dd at xxxb>, <element dd at xxxc>]
    [<element div at xxxa>, <element div at xxxb>]
    [<element p at xxxa>, <element p at xxxb>, <element p at xxxc>]
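
The two result types side by side, on an illustrative snippet:

from lxml import etree

html = '<dd><a href="/x">hello</a></dd>'
p = etree.HTML(html)
print(p.xpath('//dd'))            # node objects: [<Element dd at 0x...>]
print(p.xpath('//dd/a/text()'))   # strings: ['hello']
print(p.xpath('//dd/a/@href'))    # strings: ['/x']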

Three. The lxml parsing library

1. Installation

1】Ubuntu:  sudo pip3 install lxml
2】Windows: python -m pip install lxml

2. Usage steps

1. Import the module
   from lxml import etree
2. Create the parse object
   parse_html = etree.HTML(html)
3. Call xpath() on the parse object
   r_list = parse_html.xpath('xpath expression')
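
The three steps end to end, on a made-up HTML string:

from lxml import etree                                        # 1. import the module

html = '<p class="name"><a>Movie</a></p>'
parse_html = etree.HTML(html)                                 # 2. create the parse object
r_list = parse_html.xpath('//p[@class="name"]/a/text()')      # 3. call xpath()
print(r_list)                                                 # ['Movie']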

3. The most common way XPath is used

1】Base xpath: match the node-object list for all movies
   //dl[@class="board-wrapper"]/dd
   [<element dd at xxx>, <element dd at xxx>, ...]
2】Iterate over the node list and extract each movie's information in turn
   for dd in dd_list:
       item = {}
       item['name'] = dd.xpath('.//p[@class="name"]/a/text()')[0].strip()
       item['star'] = dd.xpath('.//p[@class="star"]/text()')[0].strip()
       item['time'] = dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
   (xpath() returns a list, so take element [0] before calling strip())

Four. Maoyan movie case: XPath implementation

"""
猫眼电影top100抓取(电影名称、主演、上映时间)
"""
import requests
import time
import random
from lxml import etree

class MaoyanSpider:
    def __init__(self):
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {
    
    'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).text
        # 直接调用解析函数
        self.parse_html(html)

    def parse_html(self, html):
        """解析提取数据 - xpath"""
        p = etree.HTML(html)
        # 基准xpath:每个电影信息的节点对象dd列表 [<element dd at xxx>, <element dd at xxx>,...]
        dd_list = p.xpath('//dl[@class="board-wrapper"]/dd')
        print(dd_list)
        item = {
    
    }
        for dd in dd_list:
            item['name'] = dd.xpath('.//p[@class="name"]/a/@title')[0].strip()
            item['star'] = dd.xpath('.//p[@class="star"]/text()')[0].strip()
            item['time'] = dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
            print(item)

    def run(self):
        """程序入口函数"""
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url=url)
            # 控制数据抓取频率:uniform()生成指定范围内的浮点数
            time.sleep(random.uniform(0,1))

if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.run()
