Using urllib for AJAX GET requests

The GET example below fetches the first page of Douban movie data and saves it to a local file.

# _*_ coding : utf-8 _*_
# @Time : 2023/1/19 16:17
# @Author : 李阶熊
# @File : urllib_ajax的get请求
# @Project : pythonProject
# GET request
# Fetch the first page of Douban movie data and save it locally

import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# (1) Build the request object
request = urllib.request.Request(url=url, headers=headers)

# (2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

# (3) Save the data locally
# By default open() here uses the gbk encoding; to save Chinese characters correctly,
# pass encoding='utf-8' to open()
# fp = open('douban.json', 'w', encoding='utf-8')
# fp.write(content)

with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
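As a quick check, the saved file can be read back with the standard json module. A minimal sketch, assuming douban1.json was written by the code above and that the endpoint returned a JSON array:

import json

# Load the file saved above and inspect the first record
with open('douban1.json', 'r', encoding='utf-8') as fp:
    movies = json.load(fp)

print(len(movies))   # number of records in the page
print(movies[0])     # the first movie entry as a dict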

Save the first 10 pages of Douban data

# _*_ coding : utf-8 _*_
# @Time : 2023/1/19 17:16
# @Author : 李阶熊
# @File : urllib_get方法请求豆瓣前10页数据
# @Project : pythonProject
# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=0&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=20&limit=20

# https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&
# start=40&limit=20

# page  1  2  3  4
# start 0  20 40 60


import urllib.request


# start = (page - 1) * 20

# start=0

# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
# }
#
# for i in range(10):
#     url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&limit=20&start='
#     start = i * 20
#     url = url + str(start)
#     # 1. Build the request object
#     request = urllib.request.Request(url=url, headers=headers)
#
#     # 2. Get the response data
#     response = urllib.request.urlopen(request)
#
#     # 3. Save the data locally
#     content = response.read().decode('utf-8')
#     print('Saving page', i + 1)
#     with open('douban.json', 'a', encoding='utf-8') as fp:
#         fp.write(content)


# Download the first 10 pages of Douban movie data
# 1. Build the request object
# 2. Get the response data
# 3. Download (save) the data

def create_request(start):
    url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&limit=20&start=' + str(start)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    request1 = urllib.request.Request(url=url, headers=headers)
    return request1


def get_content(request1):
    response = urllib.request.urlopen(request1)
    content = response.read().decode('utf-8')
    return content


def down_load(datas):
    with open('douban.json', 'a', encoding='utf-8') as fp:
        fp.write(datas)


# Program entry point
if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        # Each page gets its own request object
        request = create_request((page - 1) * 20)
        data = get_content(request)
        print('Saving page', page)
        down_load(data)
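As a side note, the query string can also be built with urllib.parse.urlencode instead of string concatenation. A minimal sketch of an alternative to create_request (the name create_request_urlencode is illustrative; the parameters are the ones from the URL above):

import urllib.parse
import urllib.request


def create_request_urlencode(start):
    # Same query parameters as above, encoded with urlencode
    base_url = 'https://movie.douban.com/j/chart/top_list?'
    params = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': str(start),
        'limit': '20',
    }
    url = base_url + urllib.parse.urlencode(params)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)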


Exception handling

# _*_ coding : utf-8 _*_
# @Time : 2023/1/20 11:24
# @Author : 李阶熊
# @File : urllib_异常
# @Project : pythonProject
import urllib.request
import urllib.error

#url = 'https://blog.csdn.net/weixin_44211968/article/details/120129476?spm=1001.2100.3001.7377'

url = 'https://www.doudan111.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    # Raised when the server returns an HTTP error status code
    print('HTTPError: the page could not be retrieved')
except urllib.error.URLError:
    # Raised when the host cannot be reached (e.g. a mistyped domain)
    print('URLError: the server address is wrong or unreachable')

Using urllib to fetch KFC store address data (POST request)

# POST http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
# Form data:
#   cname: 北京
#   pid:
#   pageIndex: 2
#   pageSize: 10

import urllib.request
import urllib.parse


def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    data = {
        'cname': '北京',  # city name (Beijing); the API expects the Chinese value
        'pid': '',
        'pageIndex': str(page),
        'pageSize': '10'
    }

    data = urllib.parse.urlencode(data).encode('utf-8')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request


def get_content(request2):
    response = urllib.request.urlopen(request2)
    content = response.read().decode('utf-8')
    return content


def down_load(page1, content2):
    with open('kfc'+str(page1)+'.json', 'w', encoding='utf-8') as fp:
        fp.write(content2)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))

    for page in range(start_page, end_page + 1):
        print('Downloading page', page)
        # Build the request object
        request1 = create_request(page)
        # Get the page source
        content1 = get_content(request1)
        # Save it locally
        down_load(page, content1)
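The response body is JSON, so each saved page can be parsed with the standard json module. A minimal sketch, assuming kfc1.json was produced by running the code above for page 1; the field names inside the payload are not listed here and should be checked against the actual response:

import json

# Load one of the pages saved above and inspect its structure
with open('kfc1.json', 'r', encoding='utf-8') as fp:
    payload = json.load(fp)

# Print the top-level keys (or the length) to see how the store records are organised
print(list(payload) if isinstance(payload, dict) else len(payload))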

Using urllib with cookies to request a Weibo profile page

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 10:46
# @Author : 李阶熊
# @File : urllib微博的cookie登录
# @Project : pythonProject

# Typical scenario: during data collection we need to bypass the login step and go straight to a page

# The profile page is utf-8, yet a decoding error still occurs, because without a valid cookie
# the request never reaches the profile page and is redirected to the login page instead

import urllib.request

url = 'https://weibo.com/p/1005055645428311/info'

headers = {
    # ':authority': 'weibo.com',
    # ':method': 'GET',
    # ':path': '/p/1005055645428311/info',
    # ':scheme': 'https',
    # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # The cookie carries your login information; with a logged-in cookie we can reach any page that requires login
    'cookie': 'XSRF-TOKEN=k4W3jQdQTRV51BLNdh9Bgq0-; login_sid_t=d411c86abec42d79d18b6cceee924f67; cross_origin_proto=SSL; _s_tentry=weibo.com; Apache=8527055933097.476.1674873488755; SINAGLOBAL=8527055933097.476.1674873488755; ULV=1674873488761:1:1:1:8527055933097.476.1674873488755:; wb_view_log=1920*10801; SSOLoginState=1674873548; SUB=_2A25O0PqdDeRhGeNI71cV8ibPyj2IHXVtpGtVrDV8PUNbmtANLRnYkW9NSH6t1nfm884z1WFXmugFuTB63RebtLjK; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhwxxJNcNRR3I77oivLQP-95JpX5KzhUgL.Fo-cSh-Xeon0eK22dJLoIESgdcpDUsLadbH8SFHF1F-RxFH81F-4eFHFxFH8SCHWSF-4SBtt; ALF=1706409546; wvr=6; wb_view_log_5645428311=1920*10801; WBPSESS=OqWss3qcIkOBUh2PKbOPqTbgHlGSou7iMWhXp2LBefg7_ovvOndEatgx8HSk8ZivKAM7rJIPFI5_1Bn6c7IcMhoi2kWc4QmQ_w2Nhr5gHM0JqKR98rxkJqTauOun-TSmMAgvzFzQc-P1VFVeDtjGZw==; PC_TOKEN=8fdcf40040; webim_unReadCount=%7B%22time%22%3A1674874761720%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D',
    'referer': 'https://weibo.cn/',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

request = urllib.request.Request(url=url, headers=headers)

response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')

with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
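Instead of copying the cookie string out of the browser, urllib can also manage cookies itself through http.cookiejar. A minimal sketch (not the approach used above; performing the actual login is out of scope here):

import http.cookiejar
import urllib.request

# A CookieJar stores any cookies the server sets and sends them back on later requests
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

# Every request made through this opener carries the accumulated cookies
response = opener.open('https://weibo.com')
print(len(cookie_jar), 'cookies received')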

Using urllib handlers (proxy)

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 11:46
# @Author : 李阶熊
# @File : urllib代理
# @Project : pythonProject
import urllib.request

url = 'https://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# response = urllib.request.urlopen(request)

proxies = {
    # 'http': '202.109.157.63:9000'
}

# Three steps: build a handler, build an opener from it, then open the request
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
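If the proxied opener should be used for every later call as well, it can be installed globally with install_opener; a short sketch reusing the handler built above:

# After install_opener, plain urllib.request.urlopen() also goes through the proxy handler
urllib.request.install_opener(opener)

response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')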


urllib proxy pool

# _*_ coding : utf-8 _*_
# @Time : 2023/1/28 14:07
# @Author : 李阶熊
# @File : urllib_代理池
# @Project : pythonProject
import urllib.request
import random

url = 'https://www.baidu.com/s?wd=ip'

# In practice each entry would point to a different proxy server
proxies_pool = [
    {'http': '202.109.157.63:9000'},
    {'http': '202.109.157.63:9000'},
]

proxies = random.choice(proxies_pool)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

handler = urllib.request.ProxyHandler(proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
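A hedged sketch of how such a pool could be used with a simple retry: pick a random proxy and fall back to another one if the request fails (the function name and retry count are illustrative):

import random
import urllib.error
import urllib.request


def open_with_proxy_pool(request, pool, attempts=3):
    # Try up to `attempts` randomly chosen proxies before giving up
    for _ in range(attempts):
        proxy = random.choice(pool)
        handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(handler)
        try:
            return opener.open(request, timeout=10)
        except urllib.error.URLError:
            continue  # this proxy failed, try another one
    raise RuntimeError('all proxy attempts failed')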
