1-6 requests模块之爬取化妆品生产许可证

1. 代码一:

# 视频网址:https://www.bilibili.com/video/BV1ha4y1H7sx?p=11&spm_id_from=pageDriver
# 爬取的网址主页:http://scxk.nmpa.gov.cn:81/xk/
import csv

import requests
import time


# 请求为post请求
# data-url: http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList
# 获取主页的数据
def get_data(post_url, start=1, end=1):  # by default only page 1 is scraped
    """Fetch the company list from the portal's paginated POST API.

    Args:
        post_url: base URL of the ``portalAction.do`` endpoint (the
            ``method=getXkzsList`` query parameter is added here).
        start: first page to fetch, 1-based, inclusive.
        end: last page to fetch, inclusive.

    Returns:
        A list of dicts, each holding two fields of one company:
        ``'ID'`` (used later to build the detail-page URL) and
        ``'EPS_NAME'`` (the company name).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29'
    }
    params = {
        "method": "getXkzsList"
    }
    all_data_list = []  # list of {'ID': ..., 'EPS_NAME': ...} dicts
    for page in range(start, end + 1):
        print(f"正在爬取第{page}页:")
        form_data = {
            "on": "true",
            "page": page,
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        resp = requests.post(url=post_url, data=form_data, params=params, headers=headers)
        try:
            datas = resp.json()
        finally:
            # Close the response even if the body is not valid JSON.
            resp.close()
        print("该页有数据条数:", len(datas['list']))
        for data in datas['list']:
            # Fix: the key was previously misspelled 'ESP_NAME' although the
            # source field is 'EPS_NAME'; keep the two names consistent.
            all_data_list.append({'ID': data['ID'], 'EPS_NAME': data['EPS_NAME']})
    return all_data_list


# 拼接并添加子页面的URL
def generate_detail_url(all_datas_):
    """Attach each company's detail-page URL, built from its 'ID' field.

    Note: the dicts in *all_datas_* are mutated in place (a 'URL' key is
    added) and returned in a new list.

    Args:
        all_datas_: list of dicts, each containing at least an 'ID' key.

    Returns:
        The same dicts, each extended with a 'URL' key pointing at the
        company's detail page.
    """
    all_datas = []
    for datas in all_datas_:
        # Fix: the f-string was broken across lines in the original paste,
        # which is a syntax error; rebuild it on one line.
        datas['URL'] = f"http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id={datas['ID']}"
        all_datas.append(datas)
    return all_datas


# 获得子页面的数据
def get_detail_data(referer, id):
    # data-url:http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById
    post_url= "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    headers = {
    
    
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
        "Referer": referer
    }
    # params = {
    
    
    #     "method": "getXkzsList"
    # }
    form_data = {
    
    
        "id": id
    }
    resp = requests.post(url=post_url, data=form_data, headers=headers)
    detail_datas = resp.json()
    resp.close()
    return detail_datas


def main():
    """Drive the scrape: list companies, build detail URLs, fetch every
    detail record, then persist everything to a CSV file."""
    post_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do"
    # Ask the user for the page range to scrape:
    start_page = int(input("请输入您要爬取的开始页:"))
    end_page = int(input("请输入您要爬取的结束页:"))
    # Company name + ID dicts from the list API:
    all_datas_ = get_data(post_url, start_page, end_page)
    # Add each company's detail-page URL:
    all_datas = generate_detail_url(all_datas_)

    # Fetch every detail record via get_detail_data():
    target_datas = []
    for datas in all_datas:
        referer_url = datas['URL']  # doubles as the anti-hotlink Referer value
        company_id = referer_url.split("id=")[-1]
        print(company_id)
        detail_data = get_detail_data(referer_url, company_id)
        print(detail_data)
        target_datas.append(detail_data)
    # Fix: target_datas[0] would raise IndexError when nothing was fetched.
    if not target_datas:
        print("未爬取到任何数据!")
        return
    with open("./6/国家药管局化妆品生产许可证.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=target_datas[0].keys())
        # Header row:
        writer.writeheader()
        # Data rows:
        writer.writerows(target_datas)
    print("爬取并保存完毕!")


if __name__ == '__main__':
    main()

爬取结果:
（原文此处为运行结果截图,转载时图片未保留。）

2. 代码二:

import json

import requests

# data-url:http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById
if __name__ == '__main__':
    # Step 1: collect company IDs page by page from the list endpoint.
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29',
    }
    id_list = []        # company IDs harvested from the list pages
    all_data_list = []  # detail record of every company
    for page in range(1, 6):
        data = {
            "on": "true",
            "page": str(page),  # the API expects the page number as a string
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        resp = requests.post(url=url, data=data, headers=headers)
        json_ids = resp.json()
        resp.close()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])

    # Step 2: fetch the detail record for each collected ID.
    post_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    for company_id in id_list:  # renamed: 'id' shadowed the builtin
        data = {
            "id": company_id
        }
        resp = requests.post(url=post_url, headers=headers, data=data)
        detail_json = resp.json()
        resp.close()
        all_data_list.append(detail_json)
        print(detail_json)
    # Fix: use a context manager instead of manual open()/close() so the
    # file is closed even if json.dump raises.
    with open("./6/6.2 allData.json", "w", encoding="utf-8") as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print("保存完毕!")

运行结果:
（原文此处为运行结果截图,转载时图片未保留。）

猜你喜欢

转载自blog.csdn.net/ungoing/article/details/124083584
1-6