Using urllib for AJAX GET requests

The GET example below fetches the first page of Douban movie data and saves it to a local file.

# _*_ coding : utf-8 _*_
# @Time : 2023/1/19 16:17
# @Author : 李阶熊
# @File : urllib_ajax的get请求
# @Project : pythonProject
# GET request
# Fetch the first page of Douban movie data and save it locally

import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# (1) Build the request object
request = urllib.request.Request(url=url, headers=headers)

# (2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

# (3) Save the data locally
# By default open() here uses the gbk encoding; to save Chinese characters correctly,
# pass encoding='utf-8' to open()
# fp = open('douban.json', 'w', encoding='utf-8')
# fp.write(content)

with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
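As a quick check, the saved file can be read back with the standard json module. A minimal sketch, assuming douban1.json was written by the code above and that the endpoint returned a JSON array:

import json

# Load the file saved above and inspect the first record
with open('douban1.json', 'r', encoding='utf-8') as fp:
    movies = json.load(fp)

print(len(movies))   # number of records in the page
print(movies[0])     # the first movie entry as a dict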

Save the first 10 pages of Douban data

# _*_ coding : utf-8 _*_
# @Time : 2023/1/19 17:16
# @Author : 李阶熊
# @File : urllib_get方法请求豆瓣前10页数据
# @Project : pythonProject
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=0&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=20&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=40&limit=20

# page  1  2  3  4
# start 0  20 40 60


import urllib.request


# start = (page - 1) * 20

# start=0

# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
# }
#
# for i in range(10):
#     url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&limit=20&start='
#     start = i * 20
#     url = url + str(start)
#     # 1. Build the request object
#     request = urllib.request.Request(url=url, headers=headers)
#
#     # 2. Get the response data
#     response = urllib.request.urlopen(request)
#
#     # 3. Save the data locally
#     content = response.read().decode('utf-8')
#     print('Saving page', i + 1)
#     with open('douban.json', 'a', encoding='utf-8') as fp:
#         fp.write(content)


# Download the first 10 pages of Douban movie data
# 1. Build the request object
# 2. Get the response data
# 3. Download (save) the data

def create_request(start):
    url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&limit=20&start=' + str(start)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    request1 = urllib.request.Request(url=url, headers=headers)
    return request1


def get_content(request1):
    response = urllib.request.urlopen(request1)
    content = response.read().decode('utf-8')
    return content


def down_load(datas):
    with open('douban.json', 'a', encoding='utf-8') as fp:
        fp.write(datas)


# Program entry point
if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        # Each page gets its own request object
        request = create_request((page - 1) * 20)
        data = get_content(request)
        print('Saving page', page)
        down_load(data)
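As a side note, the query string can also be built with urllib.parse.urlencode instead of string concatenation. A minimal sketch of an alternative to create_request (the name create_request_urlencode is illustrative; the parameters are the ones from the URL above):

import urllib.parse
import urllib.request


def create_request_urlencode(start):
    # Same query parameters as above, encoded with urlencode
    base_url = 'https://movie.douban.com/j/chart/top_list?'
    params = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': str(start),
        'limit': '20',
    }
    url = base_url + urllib.parse.urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)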


Exception handling

# _*_ coding : utf-8 _*_
# @Time : 2023/1/20 11:24
# @Author : 李阶熊
# @File : urllib_异常
# @Project : pythonProject
import urllib.request
import urllib.error

#url = 'https://blog.csdn.net/weixin_44211968/article/details/120129476?spm=1001.2100.3001.7377'

url = 'https://www.doudan111.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    # Raised when the server returns an HTTP error status code
    print('HTTPError: the page could not be retrieved')
except urllib.error.URLError:
    # Raised when the host cannot be reached (e.g. a mistyped domain)
    print('URLError: the server address is wrong or unreachable')

Using urllib to fetch KFC store address data (POST request)

# POST http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# Form data:
#   cname: 北京
#   pid:
#   pageIndex: 2
#   pageSize: 10

import urllib.request
import urllib.parse


def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    data = {
        'cname': '北京',  # city name (Beijing); the API expects the Chinese value
        'pid': '',
        'pageIndex': str(page),
        'pageSize': '10'
    }

    data = urllib.parse.urlencode(data).encode('utf-8')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request


def get_content(request2):
    response = urllib.request.urlopen(request2)
    content = response.read().decode('utf-8')
    return content


def down_load(page1, content2):
    with open('kfc'+str(page1)+'.json', 'w', encoding='utf-8') as fp:
        fp.write(content2)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        print('Downloading page', page)
        # Build the request object
        request1 = create_request(page)
        # Get the page source
        content1 = get_content(request1)
        # Save it locally
        down_load(page, content1)
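The response body is JSON, so each saved page can be parsed with the standard json module. A minimal sketch, assuming kfc1.json was produced by running the code above for page 1; the field names inside the payload are not listed here and should be checked against the actual response:

import json

# Load one of the pages saved above and inspect its structure
with open('kfc1.json', 'r', encoding='utf-8') as fp:
    payload = json.load(fp)

# Print the top-level keys (or the length) to see how the store records are organised
print(list(payload) if isinstance(payload, dict) else len(payload))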

Using urllib with cookies to request a Weibo profile page

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 10:46
# @Author : 李阶熊
# @File : urllib微博的cookie登录
# @Project : pythonProject

# Typical scenario: during data collection we need to bypass the login step and go straight to a page

# The profile page is utf-8, yet a decoding error still occurs, because without a valid cookie
# the request never reaches the profile page and is redirected to the login page instead

import urllib.request

url = 'https://weibo.com/p/1005055645428311/info'

headers = {
    # ':authority': 'weibo.com',
    # ':method': 'GET',
    # ':path': '/p/1005055645428311/info',
    # ':scheme': 'https',
    # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # The cookie carries your login information; with a logged-in cookie we can reach any page that requires login
    'cookie': 'XSRF-TOKEN=k4W3jQdQTRV51BLNdh9Bgq0-; login_sid_t=d411c86abec42d79d18b6cceee924f67; cross_origin_proto=SSL; _s_tentry=weibo.com; Apache=8527055933097.476.1674873488755; SINAGLOBAL=8527055933097.476.1674873488755; ULV=1674873488761:1:1:1:8527055933097.476.1674873488755:; wb_view_log=1920*10801; SSOLoginState=1674873548; SUB=_2A25O0PqdDeRhGeNI71cV8ibPyj2IHXVtpGtVrDV8PUNbmtANLRnYkW9NSH6t1nfm884z1WFXmugFuTB63RebtLjK; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhwxxJNcNRR3I77oivLQP-95JpX5KzhUgL.Fo-cSh-Xeon0eK22dJLoIESgdcpDUsLadbH8SFHF1F-RxFH81F-4eFHFxFH8SCHWSF-4SBtt; ALF=1706409546; wvr=6; wb_view_log_5645428311=1920*10801; WBPSESS=OqWss3qcIkOBUh2PKbOPqTbgHlGSou7iMWhXp2LBefg7_ovvOndEatgx8HSk8ZivKAM7rJIPFI5_1Bn6c7IcMhoi2kWc4QmQ_w2Nhr5gHM0JqKR98rxkJqTauOun-TSmMAgvzFzQc-P1VFVeDtjGZw==; PC_TOKEN=8fdcf40040; webim_unReadCount=%7B%22time%22%3A1674874761720%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D',
    'referer': 'https://weibo.cn/',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
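Instead of copying the cookie string out of the browser, urllib can also manage cookies itself through http.cookiejar. A minimal sketch (not the approach used above; performing the actual login is out of scope here):

import http.cookiejar
import urllib.request

# A CookieJar stores any cookies the server sets and sends them back on later requests
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

# Every request made through this opener carries the accumulated cookies
response = opener.open('https://weibo.com')
print(len(cookie_jar), 'cookies received')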

Using urllib handlers (proxy)

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 11:46
# @Author : 李阶熊
# @File : urllib代理
# @Project : pythonProject
import urllib.request

url = 'https://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# response = urllib.request.urlopen(request)

proxies = {
    # 'http': '202.109.157.63:9000'
}

# Three steps: build a handler, build an opener from it, then open the request
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
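If the proxied opener should be used for every later call as well, it can be installed globally with install_opener; a short sketch reusing the handler built above:

# After install_opener, plain urllib.request.urlopen() also goes through the proxy handler
urllib.request.install_opener(opener)

response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')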


urllib proxy pool

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 14:07
# @Author : 李阶熊
# @File : urllib_代理池
# @Project : pythonProject
import urllib.request
import random

url = 'https://www.baidu.com/s?wd=ip'

# In practice each entry would point to a different proxy server
proxies_pool = [
    {'http': '202.109.157.63:9000'},
    {'http': '202.109.157.63:9000'},
]

proxies = random.choice(proxies_pool)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

handler = urllib.request.ProxyHandler(proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
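A hedged sketch of how such a pool could be used with a simple retry: pick a random proxy and fall back to another one if the request fails (the function name and retry count are illustrative):

import random
import urllib.error
import urllib.request


def open_with_proxy_pool(request, pool, attempts=3):
    # Try up to `attempts` randomly chosen proxies before giving up
    for _ in range(attempts):
        proxy = random.choice(pool)
        handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(handler)
        try:
            return opener.open(request, timeout=10)
        except urllib.error.URLError:
            continue  # this proxy failed, try another one
    raise RuntimeError('all proxy attempts failed')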
