1.get传参
(1)汉字报错:解释器ascii没有汉字 url汉字转码
urllib.parse.quote(url, safe=string.printable)
(2)字典传参
urllib.parse.urlencode()
post:
urllib.request.urlopen(url, data="服务器接受的数据")
handler:处理器的自定义:
User-Agent:
(1)模拟真实的浏览器发送请求:(1)百度批量搜索(2)检查元素(百度搜索useragent大全)
(2)request.add_header(动态添加head数据)
(3)响应头 response.headers
(4)创建request:urllib.request.Request(url)
2.IP代理:
(1)免费的IP:时效性差,错误率高
(2)付费的IP:贵花钱,也有失效不能用的
IP分类:
透明:对方知道我们真实的ip
匿名:对方不知道我们真实的ip,但知道你使用了代理
高匿:对方不知道我们真实的IP,也不知道我们使用了代理
handler:
(1)系统的urlopen()不支持代理的添加
创建对应的处理器(handler)
1.代理处理器:ProxyHandler
2.拿着ProxyHandler创建opener:build_opener()
3.opener.open(url)就可以请求数据
auth认证handler
Cookieshandler
URLError
requests(第三方模块):简单易用
数据解析:
数据存储:json csv MongoDB redis mysql
import urllib.request
import urllib.parse
import string


def build_search_url(base_url, params):
    """Return *base_url* plus the percent-encoded query built from *params*.

    urlencode() already percent-encodes non-ASCII values (e.g. Chinese
    text); the extra quote() pass escapes any remaining non-printable
    characters while leaving printable ASCII (including '%') untouched.
    """
    query = urllib.parse.urlencode(params)
    return urllib.parse.quote(base_url + query, safe=string.printable)


def get_params():
    """Fetch a Baidu search page whose query contains Chinese characters."""
    url = "http://www.baidu.com/s?"
    params = {
        "wd": "中文",
        "key": "zhang",
        "value": "san",
    }
    end_url = build_search_url(url, params)
    print(end_url)
    response = urllib.request.urlopen(end_url)
    data = response.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    get_params()
import urllib.request


def load_baidu():
    """Request the Baidu homepage with a browser User-Agent and save it.

    Writes the decoded response body to ``02header.html``.
    """
    url = "https://www.baidu.com"
    headers = {
        # Pretend to be a real desktop browser so the server returns
        # the full page instead of a stripped-down one.
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/69.0.3497.100 Safari/537.36"
        ),
    }
    # urlopen() itself has no headers parameter, so build a Request
    # object carrying the headers (fixes the original, which built a
    # header dict but never passed it anywhere).
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    data = response.read().decode("utf-8")
    # Full URL of the request (useful for debugging params/redirects).
    print(request.get_full_url())
    # Request.get_header() capitalizes only the first letter, so the
    # lookup key must be spelled "User-agent".
    print(request.get_header("User-agent"))
    with open("02header.html", "w") as f:
        f.write(data)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    load_baidu()
import urllib.request


def load_baidu():
    """Request the Baidu homepage with default headers and save the body.

    Demonstrates that a Request created without explicit headers has an
    empty ``request.headers`` dict. Writes the decoded response body to
    ``02header.html``.
    """
    url = "http://www.baidu.com"
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    print(response)
    data = response.read().decode("utf-8")
    # Headers attached to the Request object (empty here — none added).
    print(request.headers)
    with open("02header.html", "w") as f:
        f.write(data)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    load_baidu()
import urllib.request
import random


def load_baidu():
    """Request Baidu with a User-Agent picked at random on every call.

    Rotating the browser identity per request makes simple per-UA
    blocking by the server harder to trigger.
    """
    url = "http://www.baidu.com"
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    ]
    random_user_agent = random.choice(user_agent_list)
    request = urllib.request.Request(url)
    # Attach the chosen User-Agent to this request.
    request.add_header("User-Agent", random_user_agent)
    response = urllib.request.urlopen(request)
    print(response)
    # get_header() expects "User-agent" (only first letter capitalized).
    print(request.get_header("User-agent"))


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    load_baidu()
import urllib.request


def handler_openner():
    """Fetch a CSDN article via an explicitly built opener.

    urlopen() uses handlers internally; building an opener by hand is
    the prerequisite for adding features urlopen() does not expose
    (proxies, cookies, auth). Writes the decoded page body to
    ``02header.html``.
    """
    url = "https://blog.csdn.net/m0_37499059/article/details/79003731"
    # Plain HTTP handler — same behavior as urlopen(), but now the
    # handler chain is under our control.
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    # Use our own opener instead of the module-level urlopen().
    response = opener.open(url)
    data = response.read().decode("utf-8")
    with open("02header.html", "w") as f:
        f.write(data)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    handler_openner()
import urllib.request


def create_proxy_handler():
    """Fetch a CSDN article through a (placeholder) HTTP proxy.

    The proxy dict maps scheme -> "host:port"; an empty string means
    no proxy, so the request goes out directly. Paid proxies use the
    form "scheme": "user:password@host:port". Writes the decoded page
    body to ``03header.html``.
    """
    url = "https://blog.csdn.net/m0_37499059/article/details/79003731"
    proxy = {
        # Free-proxy style entry; empty string -> direct connection.
        "http": "",
        # "http": "120.77.249.46:8080",
    }
    proxy_handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_handler)
    # The request is routed through the configured proxy (if any).
    response = opener.open(url)
    data = response.read().decode("utf-8")
    with open("03header.html", "w") as f:
        f.write(data)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    create_proxy_handler()
import urllib.request


def proxy_user():
    """Try each proxy in a list against Baidu, printing body or error.

    A short timeout plus a broad except is deliberate best-effort
    handling: free proxies fail often, and one dead proxy must not
    abort the whole scan.
    """
    proxy_list = [
        {"https": ""},
        # {"https": "106.75.226.36:808"},
        # {"https": "61.135.217.7:80"},
        # {"https": "125.70.13.77:8080"},
        # {"https": "118.190.95.35:9001"},
    ]
    for proxy in proxy_list:
        print(proxy)
        # Build a fresh opener around each candidate proxy.
        proxy_handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_handler)
        try:
            response = opener.open("http://www.baidu.com", timeout=1)
            print(response.read())
        except Exception as e:
            # Expected with free proxies — log and move on.
            print(e)


if __name__ == "__main__":
    # Guard so importing this module does not fire a network request.
    proxy_user()