版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/weixin_39532362/article/details/87920600
导入模块
# 用于请求
from urllib import request
# 用于编码及解码
from urllib import parse
# 用于错误处理
import urllib.error
设置请求
request.Request():构造请求对象
req.add_header():添加请求头
req.get_header():获取请求头
from urllib import request
from urllib import parse
import random  # was missing in the original -> NameError on random.choice

url = "http://httpbin.org/get"
# Pool of User-Agent strings; rotating them keeps repeated requests from
# sharing a single browser fingerprint.
ua_list = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
headers = {'User-Agent': random.choice(ua_list)}
# GET parameters belong in the query string, not in a request body:
# the original passed data= together with method='GET', which is
# non-standard and ignored by many servers.
query = parse.urlencode({'wd': 'jjj'})
# Build the request object (headers may also be set at construction time).
req = request.Request(url=url + '?' + query, headers=headers, method='GET')
# Alternative way to set a header after construction.
req.add_header("User-Agent", random.choice(ua_list))
# urllib normalizes stored header names to "User-agent" capitalization.
print(req.get_header("User-agent"))
高级请求设置
request.OpenerDirector():初始化空Opener
request.build_opener():指定参数构造Opener
opener.add_handler():Opener增加handler
request.ProxyHandler():构造代理proxy_handler
request.HTTPCookieProcessor():构造cookies_handler
http.cookiejar.CookieJar():生成cookie格式
import http.cookiejar

# --- Proxy handler --------------------------------------------------
# Route traffic through an HTTP(S) proxy.
proxy_handler = request.ProxyHandler({
'http': 'http://211.147.67.150:80',
'https': 'https://211.147.67.150:80'
})
# Manual assembly: start from an empty OpenerDirector and attach handlers.
opener = request.OpenerDirector()
opener.add_handler(proxy_handler)
# Or let build_opener wire in the default handlers as well.
opener = request.build_opener(proxy_handler)

# --- Cookie handler -------------------------------------------------
# A CookieJar stores cookies; HTTPCookieProcessor applies them to requests.
cookie = http.cookiejar.CookieJar()
cookie_handler = request.HTTPCookieProcessor(cookie)
opener = request.OpenerDirector()
opener.add_handler(cookie_handler)
opener = request.build_opener(cookie_handler)
提交请求
request.urlopen():基本提交
opener.open():opener提交
# Basic submission (the tiny timeout is deliberate: it demonstrates
# timeout errors, handled in the exception section below).
response = request.urlopen(url, timeout=0.001)
# Submit a prepared Request object.
# (The original line had a stray double comma: urlopen(req,,timeout=...) — SyntaxError.)
response = request.urlopen(req, timeout=0.001)
# Submit through a custom opener built earlier.
response = opener.open(req, timeout=10)
响应体处理
response.readline():二进制读取一行
response.readlines():二进制读取所有行返回列表
response.read():二进制读取文本
response.geturl():实际url
response.info():响应头信息,返回字符串
response.getheaders():响应头信息,返回二元元组列表
response.getheader(name="Content-Type"):特定响应头信息
response.version:HTTP协议版本号
response.status:响应码
response.getcode():响应码
编码处理
parse.urlparse():解析url返回ParseResult对象,类似dict
parse.urlunparse():将ParseResult编码为url规则字符串
parse.quote():把字符串转码为网页中转码带%
parse.unquote():把网页中转码转换为可读字符串
parse.urlencode():把字典编码为网页中转码带%
parse.ParseResult():构造ParseResult对象
ParseResult的参数
scheme | 协议 |
netloc | 主机 |
path | 路径 |
params | 参数 |
query | get的属性值 |
fragment | 锚点 |
from urllib import parse

# urlparse & urlunparse: split a URL into a ParseResult and rebuild it.
url = 'http://httpbin.org/get'
pr = parse.urlparse(url)
parse.urlunparse(pr)

# quote & unquote: percent-encode a string and decode it back.
qt = parse.quote('字符串', encoding='utf-8')
parse.unquote(qt, encoding='utf-8')

# urlencode: dict -> query string (quotes each value as it encodes).
urlstr = parse.urlencode({'key1': '值1', 'key2': '值2'})
# Round-trip back to a dict. parse_qsl also *unquotes* values; the
# original manual split('&')/split('=') left them percent-encoded,
# so the round trip did not recover the original dict.
obj = dict(parse.parse_qsl(urlstr))
异常处理
# Error handling: an absurdly small timeout forces urlopen to fail, so we
# can inspect URLError.reason for the underlying cause (a socket.timeout).
from urllib import request
import urllib
import socket

try:
    response = request.urlopen(url, timeout=0.001)
except urllib.error.URLError as err:
    print(type(err.reason))
    if isinstance(err.reason, socket.timeout):
        print('is timed out')