Using the urllib Library for Web Scraping in Python 3

Fetching a page

from urllib.request import Request, urlopen

url = 'http://www.xx.com'   # placeholder URL
req = Request(url)          # build the request object
resp = urlopen(req)         # send it and receive the response

Reading the response

html = resp.read().decode()   # read the body as bytes and decode it (UTF-8 by default)
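
A minimal end-to-end sketch of the two steps above; the URL is a placeholder:

from urllib.request import Request, urlopen

resp = urlopen(Request('http://www.xx.com'))   # placeholder URL
print(resp.getcode())                          # HTTP status code, e.g. 200
print(resp.geturl())                           # final URL after any redirects
html = resp.read().decode()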

Adding request headers

from urllib.request import Request
from fake_useragent import UserAgent   # third-party package: pip install fake-useragent

headers = {
    'User-Agent': UserAgent().chrome   # a random Chrome User-Agent string
}
url = 'http://www.xx.com'
req = Request(url, headers=headers)
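
To check that the header is really sent, you can point the same kind of request at an echo service such as http://httpbin.org/get (also used in the Proxy section below), which returns the headers it received:

from urllib.request import Request, urlopen
from fake_useragent import UserAgent

req = Request('http://httpbin.org/get',
              headers={'User-Agent': UserAgent().chrome})
print(urlopen(req).read().decode())   # the JSON echo includes our User-Agent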

Accessing HTTPS pages

# Ignore certificate verification (insecure; for testing only)
import ssl
from urllib.request import urlopen

context = ssl._create_unverified_context()
response = urlopen(request, context=context)   # request built as shown above
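
A self-contained sketch, assuming a placeholder HTTPS URL; skipping certificate checks should only be done for testing:

import ssl
from urllib.request import Request, urlopen

context = ssl._create_unverified_context()   # disables certificate verification
req = Request('https://www.xx.com')          # placeholder URL
html = urlopen(req, context=context).read().decode()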

POST requests

from urllib.parse import urlencode

my_data = {
    'usr': '123',
    'pwd': '123456'
}
# urlencode() builds 'usr=123&pwd=123456'; encode() converts it to bytes
f_data = urlencode(my_data).encode()
request = Request(url, data=f_data, headers=headers)
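
Supplying the data argument is what turns the request into a POST. A short sketch against the httpbin echo service, which returns the form fields it received:

from urllib.parse import urlencode
from urllib.request import Request, urlopen

f_data = urlencode({'usr': '123', 'pwd': '123456'}).encode()
req = Request('http://httpbin.org/post', data=f_data)
print(urlopen(req).read().decode())   # the 'form' field echoes usr and pwd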

Using a proxy

# Test URL: http://httpbin.org/get (echoes the origin IP)
from urllib.request import build_opener, ProxyHandler

# usr:pwd@ip:port are placeholders for the proxy credentials and address
proxy = ProxyHandler({'http': 'http://usr:pwd@ip:port'})
opener = build_opener(proxy)
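
Requests made through this opener are routed via the proxy; fetching the httpbin URL from the comment shows which IP the server sees:

response = opener.open('http://httpbin.org/get')
print(response.read().decode())   # the 'origin' field should be the proxy's IP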

Cookie

# Method 1: set the Cookie header manually
headers = {
    'User-Agent': UserAgent().chrome,
    'Cookie': 'xxxx'   # paste the raw cookie string here
}
request = Request(url, headers=headers)

# Method 2: manage cookies with a cookie jar
from urllib.request import build_opener, HTTPCookieProcessor
from http.cookiejar import MozillaCookieJar

cookie_jar = MozillaCookieJar()
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
response = opener.open(request)

# Save cookies; ignore_expires=True also saves expired cookies,
# ignore_discard=True also saves cookies marked to be discarded
cookie_jar.save('cookie.txt', ignore_expires=True, ignore_discard=True)

# Load previously saved cookies
cookie_jar.load('cookie.txt', ignore_expires=True, ignore_discard=True)
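
To reuse the saved cookies in a later session, load them into a fresh jar before building the opener; a minimal sketch (cookie.txt is the file saved above, the URL is a placeholder):

from urllib.request import build_opener, HTTPCookieProcessor
from http.cookiejar import MozillaCookieJar

cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt', ignore_expires=True, ignore_discard=True)
opener = build_opener(HTTPCookieProcessor(cookie_jar))
response = opener.open('http://www.xx.com')   # saved cookies are sent automatically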

URLError

from urllib.request import build_opener
from urllib.error import URLError

try:
    opener = build_opener()
    response = opener.open(request)   # request built as in the sections above
    info = response.read().decode()
except URLError as e:
    if e.args == ():              # HTTPError (a URLError subclass) has empty args
        print(e.code)             # HTTP status code, e.g. 404
    else:                         # a plain URLError wraps the underlying socket error
        print(e.args[0].errno)
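
The args check distinguishes the two cases indirectly; an arguably clearer equivalent is to catch HTTPError (a subclass of URLError) first:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

try:
    response = urlopen('http://www.xx.com')   # placeholder URL
except HTTPError as e:
    print(e.code)      # the server responded with an error status
except URLError as e:
    print(e.reason)    # the server could not be reached at all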

Reposted from blog.csdn.net/kkLeung/article/details/105423675