Example 1: Send a GET request and save the response data to a local file.
# Fetch the first page of the Douban movie chart via a GET request and
# store the raw JSON response in a local file.
import urllib.request

url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'

# Spoof a desktop browser User-Agent so the server does not reject the request.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

req = urllib.request.Request(url=url, headers=browser_headers)
resp = urllib.request.urlopen(req)
body = resp.read().decode('utf-8')

with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(body)
Example 2: Save several pages of Douban movie-chart data (the page range is entered at runtime).
import urllib.request
def create_request(start):
    """Build a GET Request for one page of the Douban chart.

    *start* is the zero-based record offset passed through in the
    query string; each page holds 20 records.
    """
    base = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&limit=20&start='
    # Browser-like User-Agent so Douban serves the JSON endpoint.
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    return urllib.request.Request(url=base + str(start), headers=ua_headers)
def get_content(request1):
    """Send *request1* and return the response body decoded as UTF-8.

    Fix: the original never closed the HTTP response, leaking the
    underlying socket; a context manager guarantees it is released.
    """
    with urllib.request.urlopen(request1) as response:
        return response.read().decode('utf-8')
def down_load(datas):
    """Append *datas* to douban.json, encoded as UTF-8."""
    out = open('douban.json', 'a', encoding='utf-8')
    try:
        out.write(datas)
    finally:
        out.close()
if __name__ == '__main__':
    # Ask the user for an inclusive page range, then fetch each page and
    # append its JSON payload to the local file.
    first = int(input('请输入起始的页码'))
    last = int(input('请输入结束的页面'))
    for page_no in range(first, last + 1):
        # Each page holds 20 records, so page N starts at offset (N-1)*20.
        req = create_request((page_no - 1) * 20)
        page_data = get_content(req)
        print('当前保存数据第', page_no, '页')
        down_load(page_data)
Example 3: Exception handling with urllib.error (HTTPError and URLError).
# Demonstrate urllib error handling: the hostname below does not exist,
# so the request fails and one of the except branches reports it.
# HTTPError is a subclass of URLError, so it must be caught first.
import urllib.request
import urllib.error

target = 'https://www.doudan111.com'
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

try:
    req = urllib.request.Request(url=target, headers=ua_headers)
    resp = urllib.request.urlopen(req)
    body = resp.read().decode('utf-8')
    print(body)
except urllib.error.HTTPError:
    print("hahahha")
except urllib.error.URLError:
    print("xixixi")
Example 4: Use a urllib POST request to fetch KFC store address data.
import urllib.request
import urllib.parse
def create_request(page):
    """Build a POST Request for one page of KFC Beijing store listings.

    The form payload is URL-encoded and attached as the request body,
    which makes urllib issue a POST instead of a GET.
    """
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    form = {
        'cname': '北京',
        'pid': '',
        'pageIndex': str(page),
        'pageSize': '10',
    }
    payload = urllib.parse.urlencode(form).encode('utf-8')
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }
    return urllib.request.Request(url=base_url, data=payload, headers=ua_headers)
def get_content(request2):
    """Send *request2* and return the response body decoded as UTF-8.

    Fix: the original never closed the HTTP response, leaking the
    underlying socket; a context manager guarantees it is released.
    """
    with urllib.request.urlopen(request2) as response:
        return response.read().decode('utf-8')
def down_load(page1, content2):
    """Write *content2* to kfc<page1>.json, overwriting any existing file."""
    target = 'kfc' + str(page1) + '.json'
    with open(target, 'w', encoding='utf-8') as fp:
        fp.write(content2)
if __name__ == '__main__':
    # Read an inclusive page range, then download each page of store
    # data into its own numbered JSON file.
    first = int(input('请输入起始页码'))
    last = int(input('请输入结束页码'))
    for current in range(first, last + 1):
        print('开始下载第', current, '页')
        req = create_request(current)
        body = get_content(req)
        down_load(current, body)
Example 5: Send a logged-in session cookie with urllib to request a Weibo profile page.
# Request a Weibo profile page while replaying a captured browser session.
# The 'cookie' header carries the login session — without it Weibo serves a
# login/redirect page instead of the profile. The remaining headers mirror
# what Chrome sends so the request looks like a normal browser navigation.
# NOTE(review): the hard-coded cookie is tied to one account/session and
# expires; it must be refreshed from a live browser session to work again.
import urllib.request

url = 'https://weibo.com/p/1005055645428311/info'
headers = {
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # Session cookie copied from a logged-in browser; this is the header
    # that actually authenticates the request.
    'cookie': 'XSRF-TOKEN=k4W3jQdQTRV51BLNdh9Bgq0-; login_sid_t=d411c86abec42d79d18b6cceee924f67; cross_origin_proto=SSL; _s_tentry=weibo.com; Apache=8527055933097.476.1674873488755; SINAGLOBAL=8527055933097.476.1674873488755; ULV=1674873488761:1:1:1:8527055933097.476.1674873488755:; wb_view_log=1920*10801; SSOLoginState=1674873548; SUB=_2A25O0PqdDeRhGeNI71cV8ibPyj2IHXVtpGtVrDV8PUNbmtANLRnYkW9NSH6t1nfm884z1WFXmugFuTB63RebtLjK; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhwxxJNcNRR3I77oivLQP-95JpX5KzhUgL.Fo-cSh-Xeon0eK22dJLoIESgdcpDUsLadbH8SFHF1F-RxFH81F-4eFHFxFH8SCHWSF-4SBtt; ALF=1706409546; wvr=6; wb_view_log_5645428311=1920*10801; WBPSESS=OqWss3qcIkOBUh2PKbOPqTbgHlGSou7iMWhXp2LBefg7_ovvOndEatgx8HSk8ZivKAM7rJIPFI5_1Bn6c7IcMhoi2kWc4QmQ_w2Nhr5gHM0JqKR98rxkJqTauOun-TSmMAgvzFzQc-P1VFVeDtjGZw==; PC_TOKEN=8fdcf40040; webim_unReadCount=%7B%22time%22%3A1674874761720%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D',
    'referer': 'https://weibo.cn/',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# assumes the page is served as UTF-8 — TODO confirm against the
# response's Content-Type charset.
content = response.read().decode('utf-8')
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
Example 6: Use a urllib ProxyHandler with a custom opener.
# Demonstrate building a custom opener from a ProxyHandler.  The proxy
# mapping is empty here, so the request actually goes out directly; fill
# in {'http': 'host:port'} entries to route traffic through a proxy.
import urllib.request

url = 'https://www.baidu.com/s?wd=ip'
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=browser_headers)

proxies = {}
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)

# Use the custom opener instead of urlopen so the handler is applied.
resp = opener.open(req)
page = resp.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(page)
Example 7: Pick a proxy at random from a proxy pool before sending the request.
# Choose one proxy at random from a small pool, then route the request
# through it via a ProxyHandler-based opener.
import urllib.request
import random

url = 'https://www.baidu.com/s?wd=ip'

# NOTE(review): both pool entries are the same address in this demo; a
# real pool would list several distinct proxies.
proxies_pool = [
    {'http': '202.109.157.63:9000'},
    {'http': '202.109.157.63:9000'},
]
proxies = random.choice(proxies_pool)

browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
req = urllib.request.Request(url=url, headers=browser_headers)

opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
resp = opener.open(req)
page = resp.read().decode('utf-8')

with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(page)