Python 3 Web Crawler Study Notes — Chapter 3, 3.1 Using urllib

These notes mainly cover the urllib module.

#request: the basic HTTP request module, used to simulate sending a request
#error: the exception handling module; if a request fails we can catch the exception, then retry or take other action so the program does not stop
#parse: a utility module that provides many URL handling methods
#robotparser: parses a site's robots.txt file to determine which pages may be crawled; rarely used

#3.1.1 Sending requests
#1. urlopen(): urllib.request provides this basic method for constructing an HTTP request. With it we can simulate the request process a browser initiates, and it also handles authorization, redirects, cookies and more
# import urllib.request
#
# response = urllib.request.urlopen("https://www.python.org")
# print(response.read().decode("utf-8"))
#The response body is long; let's check what type the return value is
# print(type(response))  #<class 'http.client.HTTPResponse'>
#We can see it is an HTTPResponse object. Its main methods include read(), readinto(), getheader(name), getheaders(), fileno(), etc., and its attributes include msg, version, status, reason, debuglevel and closed
# print(response.status)       #check the status code of the response
# print(response.getheaders())  #view the response headers
# print(response.getheader('Server'))   #view the value of Server in the response headers
#def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,*, cafile=None, capath=None, cadefault=False, context=None):
#Now let's look at how each parameter is used
#(1) The data parameter
#The data parameter is optional. If data is added it must be converted to bytes with the bytes() method, and once data is passed the request method becomes POST instead of GET
# Example
# import urllib.request
# import urllib.parse
# data = bytes(urllib.parse.urlencode({'word':'hello'}),encoding='utf8')
# response = urllib.request.urlopen("http://httpbin.org/post",data = data)
# print(response.read())
#Here we pass a parameter 'word' with the value hello. It has to be converted to bytes: urlencode() from urllib.parse turns the parameter dict into a string, and the second argument of bytes() is the encoding, here 'utf8'
#The target site http://httpbin.org/post is used for testing POST requests. It echoes back information including the data we sent; our parameter appears under form, which indicates a simulated form submission
#Returned result:
"""
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Connection": "close", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "json": null, \n  "origin": "114.242.26.54", \n  "url": "http://httpbin.org/post"\n}\n'
"""
#(2) The timeout parameter
#Sets a timeout in seconds: if no response arrives within that time an exception is raised. It can be used to skip a site, or take other action, when it has not responded for too long
# import urllib.request
# response = urllib.request.urlopen("https://www.baidu.com",timeout=0.01)
# print(response.read())
#Result: socket.timeout: timed out
#So we can use try/except exception handling to take other action instead
# import urllib.request
# try:
#     response = urllib.request.urlopen("https://www.baidu.com",timeout = 0.01)
# except urllib.error.URLError as e:
#     print("time out")
#The output is: time out
#def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,*, cafile=None, capath=None, cadefault=False, context=None):
#The remaining parameters: cafile and capath specify a CA certificate file and its directory, and context specifies SSL settings; it must be an ssl.SSLContext object
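#A minimal sketch of the context parameter, assuming the system's default CA store is enough; cafile/capath would instead point at a custom certificate bundle
# import ssl
# import urllib.request
# context = ssl.create_default_context()   # an ssl.SSLContext built from the system's trusted CAs
# response = urllib.request.urlopen("https://www.python.org", context=context)
# print(response.status)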
#2. Request: lets us add request headers and other options to a request
# def __init__(self, url, data=None, headers={},origin_req_host=None, unverifiable=False,method=None):
"""
The first parameter is the URL and is required; all the others are optional.
The second parameter data, if passed, must be bytes, encoded with urllib.parse.urlencode().
The third parameter headers={} is the request headers; it can be used to disguise the request as a browser, e.g. headers={"User-Agent": ...}.
The fourth parameter origin_req_host is the host name or IP address of the requester.
The fifth parameter unverifiable indicates whether the request cannot be verified; the default is False.
The sixth parameter method is the request method such as GET, POST or PUT, e.g. method="POST".
"""
#Example:
# from urllib import request,parse
# url = 'http://httpbin.org/post'
# headers = {'User-Agent':"Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50","HOST":'httpbin.org'}
# dict ={'name':'Germey'}
# data = bytes(parse.urlencode(dict),encoding='utf8')
# req = request.Request(url,data = data, headers = headers,method='POST')
# response = request.urlopen(req)
# print(response.read().decode("utf-8"))
#Output:
"""
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germey"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50"
  }, 
  "json": null, 
  "origin": "114.242.26.65", 
  "url": "http://httpbin.org/post"
}
"""
#Headers can also be added with the add_header() method
# import urllib.request
# req = urllib.request.Request("https://www.baidu.com")
# req.add_header("User-Agent","Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50")
# response = urllib.request.urlopen(req)
# print(response.read().decode("utf-8"))
#3. Advanced usage
#Handlers can be thought of as processors for specific tasks: handling login authentication, cookies, proxy settings and so on. With them almost everything in an HTTP request can be done
#First, the BaseHandler class in the urllib.request module is the parent class of all other handlers
#Various handlers inherit from it, for example:
"""
HTTPDefaultErrorHandler: handles server response errors
HTTPRedirectHandler: handles redirects
HTTPCookieProcessor: handles cookies
ProxyHandler: sets a proxy; the default proxy is empty
HTTPPasswordMgr: manages passwords; it maintains a table of usernames and passwords
HTTPBasicAuthHandler: manages authentication; if a link requires authentication when opened, it can be used to solve the authentication problem
"""
"""
Another important class is OpenerDirector; a page can be opened with an opener's open() method
"""
# Example
# from urllib.request import HTTPPasswordMgrWithDefaultRealm,HTTPBasicAuthHandler,build_opener
# from urllib.error import URLError
# import urllib
#
# username = "username"
# password = "password"
# url = "http://localhost:5000/"
# p = HTTPPasswordMgrWithDefaultRealm()
# p.add_password(None,url,username,password)
# auth_handler = HTTPBasicAuthHandler(p)
# opener = build_opener(auth_handler)
# try :
#     result = opener.open(url)
#     html = result.read().decode("utf-8")
#     print(html)
# except URLError as e:
#     print(e.reason)
#Adding a proxy
# from urllib.error import URLError
# from urllib.request import ProxyHandler,build_opener
#
# proxy_Handler = ProxyHandler({"http":"http://10.0.126.188:9743","https":"https://10.0.126.188:9743"})
# opener = build_opener(proxy_Handler)
# try:
#     response = opener.open("https://www.baidu.com")
#     print(response.read().decode("utf-8"))
# except URLError as e:
#     print(e.reason)
#Cookies
#Example: getting cookies
# import http.cookiejar,urllib.request
# cookie = http.cookiejar.CookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open("https://www.baidu.com")
# for item in cookie:
#     print(item.name+"="+item.value)
"""
BIDUPSID=72075B8181DFB5A84474B9BC92FAD19C
PSTM=1534211858
BD_NOT_HTTPS=1
"""
#Saving cookies to a file:
# import http.cookiejar,urllib.request
# filename = "cookies.txt"
# cookie = http.cookiejar.MozillaCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open("https://www.baidu.com")
# cookie.save(ignore_discard=True,ignore_expires=True)
"""
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

.baidu.com  TRUE    /   FALSE   3681696015  BIDUPSID    4CC88CCA0BE2A983D53109DD2A130597
.baidu.com  TRUE    /   FALSE   3681696015  PSTM    1534212370
www.baidu.com   FALSE   /   FALSE   1534212668  BD_NOT_HTTPS    1
"""
#3.1.2 Handling exceptions:
# The URLError class: by catching it we can get the reason for the error and handle it
# from urllib import request, error
# try:
#     response = request.urlopen("https://cuiqingcai.com/index.html")
# except error.URLError as e:
#     print(e.reason)
#The HTTPError class: a subclass of URLError, specialised for handling HTTP request errors
#Example
# from urllib import request,error
# try:
#     response = request.urlopen("https://cuiqingcai.com/index.html")
# except error.HTTPError as e:
#     print(e.reason,e.code,e.headers,sep='\n')
"""
Not Found
404
Server: nginx/1.10.3 (Ubuntu)
Date: Tue, 14 Aug 2018 02:19:34 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/
"""
#Because URLError is the parent class of HTTPError, a better approach is to catch the subclass error (HTTPError) first and then the parent class error (URLError)
# from urllib.error import URLError,HTTPError
# from urllib import request
# try:
#     response = request.urlopen('https://cuiqingcai.com/index.html')
# except HTTPError as e:
#     print(e.reason,e.code,e.headers)
# except URLError as e:
#     print(e.reason)
# else:
#     print("Request Successfully")
#Here we first check whether the exception is an HTTPError, then whether it is a URLError, and finally run the normal logic if no exception occurred
#Sometimes e.reason is not a string but an object; in that case isinstance() can be used to check its type
# import socket
# import urllib.request
# import urllib.error
# try:
#     response = urllib.request.urlopen("https://www.baidu.com",timeout=0.001)
# except urllib.error.URLError as e :
#     print(type(e.reason))
#     if isinstance(e.reason,socket.timeout):
#         print("time out")
#3.1.3 Parsing links
#The urllib library provides the parse module, which defines the standard interface for handling URLs

#1. urlparse()
# from urllib.parse import urlparse
# result = urlparse("https://www.baidu.com/index.html;user?id=5#comment")
# print(type(result),result)
"""
<class 'urllib.parse.ParseResult'> ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
"""
"""
scheme='https' is the protocol
netloc='www.baidu.com' is the domain name
path='/index.html' is the access path
params='user' are the parameters
query='id=5' is the query string
fragment='comment' is the anchor that jumps to a position inside the page
"""
"""
def urlparse(url, scheme='', allow_fragments=True):
Parameters: 1. url: the URL to parse
            2. scheme: the default protocol (e.g. http or https), used only when the URL itself carries no scheme
            3. allow_fragments: whether to parse the fragment. If set to False the fragment is not split out as a separate component and is parsed into the query, params or path instead (see the sketch after the example below)
"""
# from urllib.parse import urlparse
# result = urlparse("www.baidu.com")
# print(result)
"""
ParseResult(scheme='', netloc='', path='www.baidu.com', params='', query='', fragment='')
"""

#2. urlunparse(): the opposite of urlparse(), used to construct a URL. It must receive an iterable, and its length must be exactly 6
# from urllib.parse import urlunparse
# data = ['http','www.baidu.com','index.html','user','id=5','comment']
# print(urlunparse(data))
#http://www.baidu.com/index.html;user?id=5#comment

#3. urlsplit(): similar to urlparse(), but it does not parse params as a separate component and returns only 5 parts; params stays in the path
# from urllib.parse import urlsplit
# result = urlsplit("https://www.baidu.com/index.html;user?id=5#comment")
# print(result)
#SplitResult(scheme='https', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')

#4. urlunsplit(): similar to urlunparse(), but takes an iterable of length 5
# from urllib.parse import urlunsplit
# result = ["https","www.baidu.com","index.html","id=5","comment"]
# print(urlunsplit(result))

#5. urljoin(): given a base_url and a new link, it analyses the scheme, netloc and path of the base link and uses them to fill in whichever of these parts the new link is missing
# from urllib.parse import urljoin
# print(urljoin('http://www.baidu.com','FAQ.html'))
# print(urljoin('http://www.baidu.com','https://www.cuiqingcai.com/FAQ.html'))
"""
http://www.baidu.com/FAQ.html
https://www.cuiqingcai.com/FAQ.html
"""
#If the new link already contains the scheme, netloc and path, they are used as-is; any of these parts it is missing are supplied from the base link

#6. urlencode(): used when constructing the parameters of a GET request
# from urllib.parse import urlencode
# params = {'name':"user",'age':22}
# base_url = "https://www.baidu.com?"
# url = base_url + urlencode(params)
# print(url)
#https://www.baidu.com?name=user&age=22
# This method is very commonly used; it makes it easy to build query parameters from a dict

#7. parse_qs(): converts GET request parameters back into a dict, the inverse of urlencode()
# from urllib.parse import parse_qs
#
# url = "name=user&age=22"
# print(parse_qs(url))
# Result: {'name': ['user'], 'age': ['22']}
#8. parse_qsl(): converts GET request parameters into a list of tuples
# from urllib.parse import parse_qsl
# url = "name=user&age=22"
# print(parse_qsl(url))
# [('name', 'user'), ('age', '22')]
#9. quote(): converts content to URL-encoded form, solving the problem of Chinese characters in URL parameters
# from urllib.parse import quote
# keyword = "壁纸"
# base_url = "https://www.baidu.com/wd="
# url = base_url + quote(keyword)
# print(url)
# https://www.baidu.com/wd=%E5%A3%81%E7%BA%B8
#10. unquote(): decodes a URL-encoded string
# from urllib.parse import unquote
# url = "https://www.baidu.com/wd=%E5%A3%81%E7%BA%B8"
# print(unquote(url))
# https://www.baidu.com/wd=壁纸
#3.1.4 The Robots protocol: tells crawlers which pages of a site may be crawled and which may not
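#A minimal sketch using urllib.robotparser; the target URL is only an example and the result depends on that site's robots.txt
# from urllib import robotparser
# rp = robotparser.RobotFileParser()
# rp.set_url("https://www.baidu.com/robots.txt")
# rp.read()
# print(rp.can_fetch("*", "https://www.baidu.com/index.html"))   # True if the rules allow crawling this page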


Reposted from blog.csdn.net/luslin/article/details/81664670