Python crawler: XPath data-parsing basics
Disclaimer: from the time of publication, this article is for reference only and may not be reproduced or copied. If any reader of this article violates national laws and regulations, all consequences shall be borne by that reader and have nothing to do with the author of this article. Likewise, any disputes and consequences arising from readers reprinting, copying, or otherwise using this article in violation of national laws and regulations shall be borne by those readers and have nothing to do with the author of this article.
XPath parsing is more commonly used than bs4 (BeautifulSoup) parsing.
import requests
from lxml import etree
1. Basic Grammar
1.1 parsing html grammar
Parse local files
# Parse a local HTML file with an explicit UTF-8 parser.
html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("./data/base/taobao.html", parser=html_parser)
Parse web files
# Target URL and a browser User-Agent so the site does not reject the request.
url = "https://www.aqistudy.cn/historydata/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
text = requests.get(url=url, headers=headers).text
# Parse the downloaded HTML into an element tree.
tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
# xpath() requires an expression argument; the original `tree.xpath()`
# with no arguments raises a TypeError.  A minimal valid call:
result = tree.xpath("/html")
Takes an XPath expression and returns the matching elements as a list.
1.2 Get tags
/
: Represents a single level of the hierarchy; positioning starts from the root node.
//
: Represents any number of levels; positioning can start from anywhere in the document.
[n]
: Positions the n-th tag; the index starts from 1.
# "/" walks one level at a time, starting at the document root.
div_list = tree.xpath("/html/body/div")
print(div_list)
# "//" matches at any depth below the given node.
print(tree.xpath("/html//div"))
# XPath indices are 1-based: div[2] is the second <div> under <body>.
print(tree.xpath("/html/body/div[2]"))
1.3 Get the content in the label
Locate an attribute tag
[@attribute='attributeValue']
: exact match, not a substring match.
# Attribute-based location: [@class='...'] is an exact match.
print(tree.xpath("//div[@class='site-nav']"))
# Index-based location: indices start at 1, so a[2] is the second <a>.
updater_box = "//div[@class='tb-ie-updater-box']"
print(tree.xpath(updater_box + "/a[2]"))
1.4 Get the attributes in the label
/text()
: the element's own (direct) text content; returns a list.
//text()
: all descendant text content; returns a list.
/@attribute
: gets the attribute's value; returns a list.
# Extracting data: /text() returns the element's own text nodes,
# //text() returns all descendant text nodes; both return a list.
box = "//div[@class='tb-ie-updater-box']"
print(tree.xpath(box + "/a[2]/text()"))
print(tree.xpath(box + "/a[2]//text()"))
# Extracting an attribute: /@href returns the values as a list of strings.
print(tree.xpath(box + "/a/@href"))
1.5 Finding nodes through content
Fuzzy match:
[contains(text(),"text")]
Exact match:
[text()="text"]
2. Examples
Crawl the city
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl the city list from aqistudy.cn and write one line per initial letter."""
import os

import requests
from lxml import etree

if __name__ == '__main__':
    # Target URL and a browser User-Agent so the site serves the page.
    url = "https://www.aqistudy.cn/historydata/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
    }
    text = requests.get(url=url, headers=headers).text
    # Parse the page; each <ul> groups the cities under one initial letter.
    tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
    ul = tree.xpath("//div[@class='all']/div[@class='bottom']/ul")
    # Create the output directory up front, and use `with` so the file is
    # closed even if an exception is raised mid-loop (the original leaked
    # the handle on error and crashed if ./data/city was missing).
    os.makedirs("./data/city", exist_ok=True)
    with open("./data/city/city_zm.txt", "w", encoding="utf-8") as fp:
        for ul_zm in ul:
            # div[1] holds the initial letter, div[2] the city links.
            zm = ul_zm.xpath("./div[1]/b/text()")[0]
            fp.write(zm + " " + ", ".join(ul_zm.xpath("./div[2]/li/a/text()")) + "\n")
            print(zm + " 添加完成")
Second-hand housing information
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl second-hand housing listings from 58.com and save them as TSV."""
import os
import re

import requests
from lxml import etree

if __name__ == '__main__':
    url = "https://bj.58.com/ershoufang/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    # Fetch and parse the listing page.
    text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
    li_list = tree.xpath("//ul[@class='house-list-wrap']/li")
    # Remove tabs/newlines, then squeeze runs of spaces to a single space.
    # BUG FIX: the original pattern "[\t|\n]" also stripped literal '|'
    # characters, because '|' is not alternation inside a character class.
    ws_pattern = re.compile(r"[\t\n]")
    space_pattern = re.compile(r" +")

    def _clean(parts):
        """Join extracted text fragments and normalize their whitespace."""
        return space_pattern.sub(" ", ws_pattern.sub("", " ".join(parts)))

    os.makedirs("data/esf", exist_ok=True)
    with open("data/esf/ershoufang.txt", "w", encoding="utf-8") as fp:
        fp.write("title\tprice\tbase_info\tjjr_info\tpeople_name\n")
        for li_path in li_list:
            title = li_path.xpath("./div[@class='list-info']/h2/a/text()")
            # Skip ad/placeholder <li> entries with no title link;
            # title[0] would raise IndexError on them.
            if not title:
                continue
            price = li_path.xpath("./div[@class='price']/p//text()")
            base_info = li_path.xpath("./div[@class='list-info']/p/span//text()")
            jjr_info = li_path.xpath("./div[@class='list-info']/div/span[1]/text()")
            people_name = li_path.xpath("./div[@class='list-info']/div/a/span/text()")
            row = (f"{''.join(title[0].split())}\t{' '.join(price)}\t"
                   f"{_clean(base_info)}\t{_clean(jjr_info)}\t{' '.join(people_name)}")
            fp.write(row + "\n")
            print(row)
Crawl beauty photos
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl picture thumbnails from pic.netbian.com, one folder per listing page."""
import os

import requests
from lxml import etree

if __name__ == '__main__':
    # The headers never change, so build them once instead of per iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    for page in range(1, 172):
        # Page 1 has no index suffix; later pages are index_<n>.html.
        url = "http://pic.netbian.com/4kmeinv/" if page == 1 else f"http://pic.netbian.com/4kmeinv/index_{page}.html"
        response = requests.get(url=url, headers=headers)
        # The site does not declare UTF-8; re-decode with the detected
        # encoding to avoid mojibake in the alt texts used as file names.
        response.encoding = response.apparent_encoding
        tree = etree.HTML(response.text, etree.HTMLParser(encoding="utf-8"))
        li_list = tree.xpath("//ul[@class='clearfix']/li")
        # exist_ok=True replaces the separate os.path.exists() check.
        os.makedirs("./data/mn/%d" % page, exist_ok=True)
        for li_path in li_list:
            img_url = "http://pic.netbian.com" + li_path.xpath(".//img/@src")[0]
            # NOTE(review): the alt text is used verbatim as a file name —
            # assumes it never contains path separators; confirm if needed.
            img_name = li_path.xpath(".//img/@alt")[0]
            with open("./data/mn/%d/%s.jpg" % (page, img_name), "wb") as f:
                f.write(requests.get(url=img_url, headers=headers).content)
        print("%d 页下载完成" % page)