# Python库的解析--urllib.request 用于打开 URL 的可扩展库(urllib.request库)

import urllib.request as ur
import urllib.parse as up
import socket
import os
import urllib.error as ue
import http.cookiejar


# 普通的opener对象
def get_res(timeout=10.0):
    """Demonstrate ``urllib.request.Request`` and ``urllib.request.urlopen``.

    Builds a POST request for https://httpbin.org/post carrying the form
    field ``wd=python`` and a browser-like ``User-Agent`` header, prints the
    attributes of the ``Request`` object, sends it, and prints the body and
    metadata of the response.

    Args:
        timeout: Seconds to wait on blocking network operations before
            giving up. Without an explicit timeout the 'Time Out' branch
            below would only fire if a global socket default had been set.

    Returns:
        None. Everything is written to stdout; network failures are
        reported on stdout instead of being raised.
    """
    url = 'https://httpbin.org/post'
    wd = 'python'

    # The request body must be bytes: URL-encode the form, then encode it.
    data = bytes(up.urlencode({'wd': wd}), encoding='UTF-8')

    # Request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/86.0.4209.2 Safari/537.36'
    }

    # urllib.request.Request(url, data=None, headers={}, origin_req_host=None,
    #                        unverifiable=False, method=None)
    #   data            -- must be bytes
    #   headers         -- dict of request headers; headers can also be added
    #                      afterwards with Request.add_header(), e.g.
    #                          req = ur.Request(url=url, data=data, method='POST')
    #                          req.add_header('User-Agent', '...')
    #   origin_req_host -- host name or IP address of the requesting side
    #   unverifiable    -- whether the user had no option to approve the request
    #   method          -- HTTP method to use
    req = ur.Request(url=url, data=data, headers=headers, method='POST')

    print("*"*20, 'request对象', "*"*20)

    # Full target URL of the request.
    print(req.full_url)

    # URL scheme of the request.
    print(req.type)

    # Target host of the request.
    print(req.host)

    # Host of the origin transaction (defaults to the request host).
    print(req.origin_req_host)

    # Path part of the URL that is sent to the server.
    print(req.selector)

    # Body data attached to the request.
    print(req.data)

    # The ``unverifiable`` flag of the request.
    print(req.unverifiable)

    # HTTP method, both as the attribute and via the accessor.
    print(req.method)
    print(req.get_method())

    # Other Request methods, not exercised here:
    #   Request.add_header(key, val)               -- add a request header
    #   Request.add_unredirected_header(key, val)  -- add a header that is not
    #                                                 copied to redirected requests
    #   Request.has_header(header)                 -- check regular and
    #                                                 unredirected headers
    #   Request.remove_header(header)              -- remove from both kinds

    print(req.get_full_url())

    print("*" * 53)

    # urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None,
    #                        capath=None, cadefault=False, context=None)
    #   data     -- when given, the request is sent as POST instead of GET
    #   timeout  -- only applies to HTTP, HTTPS and FTP connections
    #   cafile / capath -- CA certificate file and directory
    #   context  -- an ssl.SSLContext describing the SSL settings
    try:
        # The context manager closes the connection even if printing fails.
        with ur.urlopen(req, timeout=timeout) as res:
            print(res.read().decode("UTF-8"))

            # res.geturl(): URL of the resource that was retrieved.
            print(res.geturl())
            print("-"*15)

            # res.info(): response meta-information (the headers).
            print(res.info())
            print(type(res.info()))
            print("-"*15)

            # res.getcode(): numeric status code.
            print(res.getcode())
            print("-"*15)

            # res.status: status code as an attribute.
            print(res.status)

            # res.getheaders(): list of (header, value) pairs.
            print(res.getheaders())

            # res.reason: reason phrase returned by the server.
            print(res.reason)

            # res.debuglevel: debugging level of the response object.
            print(res.debuglevel)
    except ue.URLError as e:
        # The socket module defines the timeout exception type.
        if isinstance(e.reason, socket.timeout):
            print('Time Out')
        else:
            # Report every other failure instead of dropping it silently.
            print(e.reason)
    except socket.timeout:
        # NOTE(review): a timeout while reading the body appears to be raised
        # directly rather than wrapped in URLError, so handle it as well.
        print('Time Out')


# 高级用法，构建handler对象
class HandlerCreateBySelf:
    """Advanced usage: build custom ``urllib.request`` openers from handlers."""

    def http_auth_handler(self, url, username='Mike', password='5201314'):
        """Fetch *url* through an opener that supplies HTTP Basic credentials.

        Args:
            url: Address to open; also the URI the credentials are
                registered for.
            username: User name to register for the URL.
            password: Password to register for the URL.

        Returns:
            None. The decoded page is printed; on ``URLError`` the reason is
            printed instead.
        """
        # HTTPPasswordMgrWithDefaultRealm keeps a database of
        # (realm, uri) -> (user, password) mappings. Related classes:
        #   urllib.request.HTTPPasswordMgr
        #   urllib.request.HTTPPasswordMgrWithDefaultRealm
        #   urllib.request.HTTPPasswordMgrWithPriorAuth
        password_mgr = ur.HTTPPasswordMgrWithDefaultRealm()

        # add_password(realm, uri, user, passwd) registers the credentials;
        # a realm of None is passed here.
        #
        # Other password-manager methods, not used here:
        #   HTTPPasswordMgr.find_user_password(realm, authuri)
        #       get the user/password for the given realm and URI, if any
        #   HTTPPasswordMgr.add_password(realm, uri, user, passwd)
        #       the credentials are used as authentication tokens for the
        #       realm and for super-URIs of the given URIs
        #   HTTPPasswordMgrWithPriorAuth.update_authenticated(uri, is_authenticated=False)
        #       update the is_authenticated flag for a URI or list of URIs
        #   HTTPPasswordMgrWithPriorAuth.is_authenticated(authuri)
        #       current state of the is_authenticated flag for the URI
        password_mgr.add_password(None, url, username, password)

        # HTTPBasicAuthHandler handles authentication with the remote host.
        # It raises ValueError when presented with a wrong authentication
        # scheme.
        auth_handler = ur.HTTPBasicAuthHandler(password_mgr)

        # urllib.request.build_opener([handler, ...]) returns an
        # OpenerDirector. Handlers may be BaseHandler instances or
        # subclasses (a subclass must be constructible without arguments).
        # When Python has SSL support (the ssl module can be imported) an
        # HTTPSHandler is added as well.
        opener = ur.build_opener(auth_handler)

        # Other OpenerDirector methods, not used here:
        #   OpenerDirector.add_handler(handler) looks for handler methods
        #   named after the protocol they serve:
        #       <protocol>_open()      -- knows how to open protocol URLs
        #       http_error_<type>()    -- knows how to handle HTTP error <type>
        #       <protocol>_error()     -- knows how to handle (non-http)
        #                                 protocol errors
        #       <protocol>_request()   -- knows how to pre-process requests
        #       <protocol>_response()  -- knows how to post-process responses
        #   OpenerDirector.error(proto, *args) handles an error of the
        #   given protocol.
        try:
            # Close the response as soon as the body has been read.
            with opener.open(url) as result:
                html = result.read().decode("utf-8")
        except ue.URLError as e:
            print(e.reason)
        else:
            print(html)

    def get_proxy(self, url, proxy_address):
        """Install a global opener that routes HTTP traffic through a proxy.

        Args:
            url: Address that returns proxy addresses as whitespace-separated
                text. Only fetched when *proxy_address* is falsy.
            proxy_address: Proxy URL to use. When falsy, the first token
                downloaded from *url* is used instead.

        Returns:
            The ``OpenerDirector`` that was installed as the global default.

        Raises:
            ValueError: If *proxy_address* is falsy and *url* yields no
                proxy address.
        """
        if not proxy_address:
            with ur.urlopen(url) as response:
                candidates = response.read().decode('utf-8').split()
            if not candidates:
                raise ValueError('no proxy address found at %r' % (url,))
            # ProxyHandler needs one URL string per scheme, not the whole
            # list produced by split().
            proxy_address = candidates[0]

        # urllib.request.ProxyHandler(proxies=None): when given, *proxies*
        # is a dict mapping protocol names to proxy URLs.
        # If no proxy environment variables are set, the settings are read
        # from the registry's Internet Settings on Windows and from the
        # System Configuration Framework on Mac OS X.
        proxy_handler = ur.ProxyHandler({'http': proxy_address})

        # urllib.request.build_opener([handler, ...])
        proxy_opener = ur.build_opener(proxy_handler)

        # urllib.request.install_opener(opener) installs the OpenerDirector
        # as the default global opener, so later urlopen() calls use it.
        ur.install_opener(proxy_opener)
        return proxy_opener

    def get_cookie(self, url):
        """Open *url*, print the response line by line and persist its cookies.

        The cookies are written to ``cookie.txt`` and then read back from
        the same file.

        Args:
            url: Address to open.

        Returns:
            The ``MozillaCookieJar`` holding the cookies.
        """
        # File the cookies are written to.
        filename = 'cookie.txt'

        # A plain http.cookiejar.CookieJar() cannot be saved; to persist
        # cookies use a subclass such as MozillaCookieJar or LWPCookieJar.
        cookie = http.cookiejar.MozillaCookieJar(filename)

        # urllib.request.HTTPCookieProcessor handles the cookies.
        cookie_handler = ur.HTTPCookieProcessor(cookie)
        cookie_opener = ur.build_opener(cookie_handler)

        # Close the response once all of its lines have been printed.
        with cookie_opener.open(url) as res:
            for item in res:
                print(item)

        # Save the cookie file.
        cookie.save(ignore_discard=True, ignore_expires=True)

        # Read the cookie file back in.
        cookie.load(filename, ignore_discard=True, ignore_expires=True)
        return cookie


if __name__ == '__main__':
    # Run the basic Request/urlopen demo.
    get_res()

    handler = HandlerCreateBySelf()
    # Uncomment to run the cookie demo:
    # handler.get_cookie(url='http://www.baidu.com')

    # Absolute form of the sample path, computed once and reused below.
    local_path = os.path.abspath('./html/baidu.txt')
    print(local_path)

    # urllib.request.pathname2url(path)
    # Convert a path from the local syntax to the form used in the path
    # component of a URL.
    url_path = ur.pathname2url(local_path)
    print(url_path)

    # urllib.request.url2pathname(path)
    # Convert a percent-encoded URL path component back to the local path
    # syntax. It must be given the URL form, not the local path.
    print(ur.url2pathname(url_path))

    # urllib.request.getproxies()
    # Return a dictionary mapping schemes to proxy server URLs.
    print(ur.getproxies())
# Python库的解析--urllib.request 用于打开 URL 的可扩展库(urllib.request库)

# 猜你喜欢