Basic usage of Python crawler requests

Disclaimer: This article is for reference only and may not be reproduced or copied. If a reader of this article violates national laws or regulations in any way, all consequences are borne by that reader and have nothing to do with the author of this article. Likewise, any disputes or consequences arising from a reader's reprinting, copying, or similar actions that violate national laws and regulations are borne by that reader and have nothing to do with the author of this article.

1. Install the module

PyCharm: https://blog.csdn.net/YKenan/article/details/96290603

Console

pip3 install requests
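
To confirm that the library is installed, a quick check (a minimal sketch; the printed version number will depend on your environment) can be run from Python:

# Import requests and print the installed version
import requests

print(requests.__version__)  # e.g. 2.25.1, depending on what pip installed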

2. Request data

Import the module

import requests

Syntax

GET parameters:

  1. url : The URL for the GET request.
  2. params : The data sent with the GET request (appended as the query string).
  3. headers : Request headers; User-Agent is the one set most often.
  4. timeout : Connection timeout limit, in seconds.

POST parameters:

  1. url : The URL for the POST request.
  2. data : The data sent in the body of the POST request.
  3. headers : Request headers; User-Agent is the one set most often.
  4. timeout : Connection timeout limit, in seconds.

# GET request
requests.get(url, params=params, headers=headers, timeout=timeout)
# POST request
requests.post(url, data=data, headers=headers, timeout=timeout)
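
As a quick illustration of the GET form, here is a minimal sketch that sends a query parameter, a User-Agent header, and a timeout; httpbin.org is used only as an assumed test endpoint and is not part of the original example:

import requests

# Target URL and query-string parameters
url = "https://httpbin.org/get"
params = {"kw": "python"}
# Request headers; User-Agent identifies the client to the server
headers = {"User-Agent": "Mozilla/5.0"}
# Send the GET request and wait at most 5 seconds
response = requests.get(url, params=params, headers=headers, timeout=5)
print(response.status_code)  # 200 on success
print(response.url)          # the full URL including the encoded query string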

For example, simulating a request to Baidu Translate:

    # URL, UA, and parameters
    url = "https://fanyi.baidu.com/sug"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    word = input("Enter a word: ")
    # Data passed in the POST request
    data = {
        "kw": word
    }
    # Crawl
    response = requests.post(url=url, data=data, headers=headers, timeout=5)

How to find the request URL:
Open the Baidu Translate page, right-click and choose Inspect Element (or press F12), switch to the Network tab, and refresh the page.

3. Get the response data

# Returns the binary response content
response.content
# Returns the page content as text
response.text
# Returns JSON data
response.json()
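
For instance, a minimal sketch showing all three accessors on one response (httpbin.org is an assumed test endpoint that returns JSON, used here only for illustration):

import requests

response = requests.get("https://httpbin.org/get", timeout=5)

print(response.status_code)    # HTTP status code, e.g. 200
print(len(response.content))   # size of the raw binary body in bytes
print(response.text[:60])      # body decoded as text
print(response.json()["url"])  # body parsed as JSON; only works when the server returns JSON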

When crawling a larger amount of data, wrap the requests in try/except so that a network problem does not stop the whole crawl.
requests is usually combined with the re module, using regular expressions to extract the desired content from the crawled pages, as sketched below.
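
A minimal sketch combining the two ideas; the URL and the regular expression are illustrative assumptions, not part of the original article:

import re

import requests

url = "https://example.com/"  # placeholder page, used only for illustration
headers = {"User-Agent": "Mozilla/5.0"}

try:
    # Limit how long one request may take so a hung connection cannot stall the crawl
    response = requests.get(url, headers=headers, timeout=5)
    # Capture every link target with a regular expression
    links = re.findall(r'href="(.*?)"', response.text, re.S)
    print(links)
except requests.RequestException as e:
    # Timeouts, DNS failures and similar problems are reported but do not stop the program
    print("request failed:", e)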

4. Examples

Baidu Translate

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Imports
import requests
import json


if __name__ == '__main__':

    # URL, UA, and parameters
    url = "https://fanyi.baidu.com/sug"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    word = input("Enter a word: ")
    # Data passed in the POST request
    data = {
        "kw": word
    }
    # Crawl
    response = requests.post(url=url, data=data, headers=headers)
    # The response is JSON; print the first translation
    print(response.json()["data"][0]["v"])
    # Save the result (the ./data/fanyi directory must already exist)
    fp = open(f"./data/fanyi/{word}.json", "w", encoding="utf-8")
    # ensure_ascii=False keeps the Chinese characters readable in the file
    json.dump(response.json(), fp=fp, ensure_ascii=False)
    fp.close()
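
One detail worth noting: open() fails if the ./data/fanyi directory does not exist yet. An optional line before the open call (an addition of mine, not in the original) creates it:

import os

# Create the output directory (and any missing parents) before writing into it
os.makedirs("./data/fanyi", exist_ok=True)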


Downloading images

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import requests
import re

if __name__ == '__main__':

    # Iterate over the result pages
    for page in range(21, 40):
        # URL and query parameters
        url = "http://aspx.sc.chinaz.com/query.aspx"
        params = {
            "keyword": "可爱",  # search keyword ("cute")
            "classID": "11",
            "page": page
        }
        # UA
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
        }
        # Crawl
        response = requests.get(url=url, params=params, headers=headers)
        # Page content
        text = response.text
        # The capture group () in the regular expression matches the content we want
        image_urls = re.findall('<img data-src="(.*?)" alt=".*?" class="preview" />', text, re.S)
        print(image_urls)

        # Iterate over the images
        for index, image_url in enumerate(image_urls):
            # Get the binary data of the image
            image_url__content = requests.get("http:" + image_url, headers=headers).content
            with open(f"./data/image/{page}-{index}.jpg", "wb") as f:
                f.write(image_url__content)
                print(image_url + " download complete")
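
An optional refinement, not part of the original script: give each image request a timeout and a short pause so a slow or unreachable image does not stall the loop. download_image below is a hypothetical helper illustrating that idea:

import time

import requests


def download_image(image_url, path, headers):
    # Fetch one image with a timeout and write it to disk; return True on success
    try:
        content = requests.get("http:" + image_url, headers=headers, timeout=5).content
        with open(path, "wb") as f:
            f.write(content)
        return True
    except requests.RequestException:
        return False
    finally:
        time.sleep(0.5)  # short pause between requests to be polite to the server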


Crawling data from the National Medical Products Administration
http://scxk.nmpa.gov.cn:81/xk/

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Imports
import requests
import pandas as pd


# The return value can be fed back in as the input for a retry
def getID(list_page):
    # Record the pages that were not crawled successfully
    list_error = []

    # Iterate over the pages
    for page in list_page:
        # URL, UA, and parameters
        url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
        }
        # Data passed in the POST request
        data = {
            "method": "getXkzsList",
            "page": page,
        }
        try:
            # Crawl with a 3-second timeout
            response = requests.post(url=url, data=data, headers=headers, timeout=3)
            # The response is JSON; get the list of records
            json_list_ = response.json()["list"]
            # Get each ID
            for json_list_data in json_list_:
                print(json_list_data["ID"])
                # Append it to a file
                with open("./data/nmpa/list_ID.txt", "a", encoding="utf-8") as f:
                    f.write(json_list_data["ID"] + "\n")
        except Exception:
            list_error.append(page)
    return list_error


# The return value can be fed back in as the input for a retry
def get_detail(list_id):
    # Record the IDs that were not crawled successfully
    list_error = []

    columns = ['businessLicenseNumber', 'businessPerson', 'certStr', 'cityCode', 'countyCode', 'creatUser', 'createTime', 'endTime', 'epsAddress', 'epsName', 'epsProductAddress', 'id', 'isimport', 'legalPerson', 'offDate', 'offReason', 'parentid', 'preid', 'processid', 'productSn', 'provinceCode', 'qfDate', 'qfManagerName', 'qualityPerson', 'rcManagerDepartName', 'rcManagerUser', 'startTime', 'xkCompleteDate', 'xkDate', 'xkDateStr', 'xkName', 'xkProject', 'xkRemark', 'xkType']
    df = pd.DataFrame(columns=columns)

    for ID in list_id:
        # URL, UA, and parameters
        url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
        }
        # Data passed in the POST request
        data = {
            "method": "getXkzsById",
            "id": ID,
        }
        try:
            # Crawl with a 1-second timeout
            response = requests.post(url=url, data=data, headers=headers, timeout=1)
            # The response is JSON; turn its values into a list
            list_detail = list(dict(response.json()).values())
            print(list_detail)
            df.loc[len(df)] = pd.Series(list_detail, index=columns)
        except Exception:
            list_error.append(ID)
    df.to_csv("./data/nmpa/list_detail_2.csv", encoding='utf_8_sig', index=False)
    return list_error


if __name__ == '__main__':

    # Get the IDs (prints any pages that failed)
    print(getID(range(1, 360)))
    # Read the IDs back from the file
    f = open("./data/nmpa/list_ID.txt", "r", encoding="utf-8")
    lines = f.read().splitlines()
    f.close()
    # Crawl the details (prints any IDs that failed)
    print(get_detail(lines))
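
Because getID returns the pages it failed to crawl, those pages can be fed straight back into it until nothing is left. A minimal sketch of that retry loop (the cap of 5 passes is an arbitrary assumption):

# Assumes getID from the script above is already defined
remaining_pages = list(range(1, 360))
for attempt in range(5):
    if not remaining_pages:
        break
    remaining_pages = getID(remaining_pages)
print("pages still failing:", remaining_pages)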


Origin: blog.csdn.net/YKenan/article/details/111936873