1. 代码一:
# 视频网址:https://www.bilibili.com/video/BV1ha4y1H7sx?p=11&spm_id_from=pageDriver
# 爬取的网址主页:http://scxk.nmpa.gov.cn:81/xk/
import csv
import requests
import time
# 请求为post请求
# data-url: http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList
# 获取主页的数据
def get_data(post_url, start=1, end=1):  # defaults: crawl only page 1
    """Fetch the company list from the portal, pages ``start``..``end`` inclusive.

    Sends one POST per page to ``post_url`` (method=getXkzsList) and collects
    the results.

    Returns a list of dicts, one per company, each with keys:
      - 'ID': the company id, later used to build the detail-page URL
      - 'EPS_NAME': the company name (field name as returned by the API)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29'
    }
    params = {
        "method": "getXkzsList"
    }
    all_data_list = []
    for page in range(start, end + 1):
        print(f"正在爬取第{page}页:")
        form_data = {
            "on": "true",
            "page": page,
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        resp = requests.post(url=post_url, data=form_data, params=params, headers=headers)
        datas = resp.json()
        print("该页有数据条数:", len(datas['list']))
        resp.close()
        for data in datas['list']:
            # BUG FIX: the original stored the name under the misspelled key
            # 'ESP_NAME'; use 'EPS_NAME' to match the API field it reads.
            all_data_list.append({'ID': data['ID'], 'EPS_NAME': data['EPS_NAME']})
    return all_data_list
def generate_detail_url(all_datas_):
    """Attach each company's detail-page URL under the 'URL' key.

    Mutates the dicts in ``all_datas_`` in place (adds 'URL' built from 'ID')
    and returns a new list containing those same dict objects.
    """
    enriched = []
    for record in all_datas_:
        record['URL'] = f"http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id={record['ID']}"
        enriched.append(record)
    return enriched
def get_detail_data(referer, id):
    """Fetch one company's licence detail record (method=getXkzsById).

    ``referer`` is sent as the Referer header (anti-hotlinking measure);
    ``id`` is the company id posted as form data. Returns the parsed JSON.
    """
    detail_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
        "Referer": referer
    }
    response = requests.post(url=detail_url, data={"id": id}, headers=request_headers)
    try:
        return response.json()
    finally:
        response.close()
def main():
    """Interactive entry point: crawl a page range of the company list,
    fetch every company's detail record, and save them all to one CSV file.
    """
    import os
    post_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do"
    # Ask the user for the page range to crawl.
    start_page = int(input("请输入您要爬取的开始页:"))
    end_page = int(input("请输入您要爬取的结束页:"))
    # Company name + ID dicts from the listing pages.
    all_datas_ = get_data(post_url, start_page, end_page)
    # Add each company's detail-page link.
    all_datas = generate_detail_url(all_datas_)
    # Fetch every detail record; the page URL doubles as the Referer header.
    target_datas = []
    for datas in all_datas:
        referer_url = datas['URL']
        company_id = referer_url.split("id=")[-1]  # renamed: `id` shadowed the builtin
        print(company_id)
        detail_data = get_detail_data(referer_url, company_id)
        print(detail_data)
        target_datas.append(detail_data)
    # BUG FIX: with no rows, target_datas[0] raised IndexError; bail out cleanly.
    if not target_datas:
        print("未爬取到任何数据,未生成CSV文件。")
        return
    # BUG FIX: open() failed when the ./6 directory did not exist.
    os.makedirs("./6", exist_ok=True)
    with open("./6/国家药管局化妆品生产许可证.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=target_datas[0].keys())
        writer.writeheader()
        writer.writerows(target_datas)
    print("爬取并保存完毕!")


if __name__ == '__main__':
    main()
爬取结果:
2. 代码二:
import json
import requests
# data-url:http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById
if __name__ == '__main__':
    import os
    # Step 1: collect company IDs from the first 5 listing pages.
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
    }
    id_list = []        # company IDs
    all_data_list = []  # detail records for every company
    for page in range(1, 6):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        resp = requests.post(url=url, data=data, headers=headers)
        json_ids = resp.json()
        resp.close()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    # Step 2: fetch each company's detail record by ID.
    post_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    for company_id in id_list:  # renamed: `id` shadowed the builtin
        data = {
            "id": company_id
        }
        resp = requests.post(url=post_url, headers=headers, data=data)
        detail_json = resp.json()
        resp.close()
        all_data_list.append(detail_json)
        print(detail_json)
    # BUG FIX: open() failed when the ./6 directory did not exist.
    os.makedirs("./6", exist_ok=True)
    # BUG FIX: bare open()/close() leaked the handle on error; use `with`
    # so the file is closed even if json.dump raises.
    with open("./6/6.2 allData.json", "w", encoding="utf-8") as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print("保存完毕!")
运行结果: