Python crawler: XPath data-parsing basics
Disclaimer: from the time of publication, this article is for reference only and may not be reproduced or copied. If any reader of this article violates national laws and regulations, all consequences shall be borne by that reader and have nothing to do with the author of this article. Likewise, any disputes and consequences arising from readers reprinting, copying, or otherwise using this article in violation of national laws and regulations shall be borne by those readers and have nothing to do with the author of this article.
XPath parsing is more commonly used than bs4 (BeautifulSoup) parsing.
import requests
from lxml import etree
1. Basic Grammar
1.1 parsing html grammar
Parse local files
# Parse a local HTML file with an explicit UTF-8 parser.
html_parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse("./data/base/taobao.html", parser=html_parser)
Parse web files
# Target URL and a browser User-Agent so the site does not reject the request.
url = "https://www.aqistudy.cn/historydata/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
text = requests.get(url=url, headers=headers).text
# Parse the downloaded HTML into an element tree.
tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
# xpath() requires an expression argument; the original `tree.xpath()`
# with no arguments raises a TypeError.  A minimal valid call:
result = tree.xpath("/html")
Takes an XPath expression and returns the matching elements as a list.
1.2 Get tags
/
: Represents a single level of the hierarchy; positioning starts from the root node.
//
: Represents any number of levels; positioning can start from anywhere in the document.
[n]
: Positions the n-th tag; the index starts from 1.
# "/" walks one level at a time, starting at the document root.
div_list = tree.xpath("/html/body/div")
print(div_list)
# "//" matches at any depth below the given node.
print(tree.xpath("/html//div"))
# XPath indices are 1-based: div[2] is the second <div> under <body>.
print(tree.xpath("/html/body/div[2]"))
1.3 Get the content in the label
Locate an attribute tag
[@attribute='attributeValue']
: exact match, not a substring match.
# Attribute-based location: [@class='...'] is an exact match.
print(tree.xpath("//div[@class='site-nav']"))
# Index-based location: indices start at 1, so a[2] is the second <a>.
updater_box = "//div[@class='tb-ie-updater-box']"
print(tree.xpath(updater_box + "/a[2]"))
1.4 Get the attributes in the label
/text()
: the element's own (direct) text content; returns a list.
//text()
: all descendant text content; returns a list.
/@attribute
: gets the attribute's value; returns a list.
# Extracting data: /text() returns the element's own text nodes,
# //text() returns all descendant text nodes; both return a list.
box = "//div[@class='tb-ie-updater-box']"
print(tree.xpath(box + "/a[2]/text()"))
print(tree.xpath(box + "/a[2]//text()"))
# Extracting an attribute: /@href returns the values as a list of strings.
print(tree.xpath(box + "/a/@href"))
1.5 Finding nodes through content
Fuzzy match:
[contains(text(),"text")]
Exact match:
[text()="text"]
2. Examples
Crawl the city
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl the city list from aqistudy.cn and write one line per initial letter."""
import os

import requests
from lxml import etree

if __name__ == '__main__':
    # Target URL and a browser User-Agent so the site serves the page.
    url = "https://www.aqistudy.cn/historydata/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
    }
    text = requests.get(url=url, headers=headers).text
    # Parse the page; each <ul> groups the cities under one initial letter.
    tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
    ul = tree.xpath("//div[@class='all']/div[@class='bottom']/ul")
    # Create the output directory up front, and use `with` so the file is
    # closed even if an exception is raised mid-loop (the original leaked
    # the handle on error and crashed if ./data/city was missing).
    os.makedirs("./data/city", exist_ok=True)
    with open("./data/city/city_zm.txt", "w", encoding="utf-8") as fp:
        for ul_zm in ul:
            # div[1] holds the initial letter, div[2] the city links.
            zm = ul_zm.xpath("./div[1]/b/text()")[0]
            fp.write(zm + " " + ", ".join(ul_zm.xpath("./div[2]/li/a/text()")) + "\n")
            print(zm + " 添加完成")
Second-hand housing information
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl second-hand housing listings from 58.com and save them as TSV."""
import os
import re

import requests
from lxml import etree

if __name__ == '__main__':
    url = "https://bj.58.com/ershoufang/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    # Fetch and parse the listing page.
    text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(text, etree.HTMLParser(encoding="utf-8"))
    li_list = tree.xpath("//ul[@class='house-list-wrap']/li")
    # Remove tabs/newlines, then squeeze runs of spaces to a single space.
    # BUG FIX: the original pattern "[\t|\n]" also stripped literal '|'
    # characters, because '|' is not alternation inside a character class.
    ws_pattern = re.compile(r"[\t\n]")
    space_pattern = re.compile(r" +")

    def _clean(parts):
        """Join extracted text fragments and normalize their whitespace."""
        return space_pattern.sub(" ", ws_pattern.sub("", " ".join(parts)))

    os.makedirs("data/esf", exist_ok=True)
    with open("data/esf/ershoufang.txt", "w", encoding="utf-8") as fp:
        fp.write("title\tprice\tbase_info\tjjr_info\tpeople_name\n")
        for li_path in li_list:
            title = li_path.xpath("./div[@class='list-info']/h2/a/text()")
            # Skip ad/placeholder <li> entries with no title link;
            # title[0] would raise IndexError on them.
            if not title:
                continue
            price = li_path.xpath("./div[@class='price']/p//text()")
            base_info = li_path.xpath("./div[@class='list-info']/p/span//text()")
            jjr_info = li_path.xpath("./div[@class='list-info']/div/span[1]/text()")
            people_name = li_path.xpath("./div[@class='list-info']/div/a/span/text()")
            row = (f"{''.join(title[0].split())}\t{' '.join(price)}\t"
                   f"{_clean(base_info)}\t{_clean(jjr_info)}\t{' '.join(people_name)}")
            fp.write(row + "\n")
            print(row)
Crawl beauty photos
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Crawl picture thumbnails from pic.netbian.com, one folder per listing page."""
import os

import requests
from lxml import etree

if __name__ == '__main__':
    # The headers never change, so build them once instead of per iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    for page in range(1, 172):
        # Page 1 has no index suffix; later pages are index_<n>.html.
        url = "http://pic.netbian.com/4kmeinv/" if page == 1 else f"http://pic.netbian.com/4kmeinv/index_{page}.html"
        response = requests.get(url=url, headers=headers)
        # The site does not declare UTF-8; re-decode with the detected
        # encoding to avoid mojibake in the alt texts used as file names.
        response.encoding = response.apparent_encoding
        tree = etree.HTML(response.text, etree.HTMLParser(encoding="utf-8"))
        li_list = tree.xpath("//ul[@class='clearfix']/li")
        # exist_ok=True replaces the separate os.path.exists() check.
        os.makedirs("./data/mn/%d" % page, exist_ok=True)
        for li_path in li_list:
            img_url = "http://pic.netbian.com" + li_path.xpath(".//img/@src")[0]
            # NOTE(review): the alt text is used verbatim as a file name —
            # assumes it never contains path separators; confirm if needed.
            img_name = li_path.xpath(".//img/@alt")[0]
            with open("./data/mn/%d/%s.jpg" % (page, img_name), "wb") as f:
                f.write(requests.get(url=img_url, headers=headers).content)
        print("%d 页下载完成" % page)