Article Directory
urllib
Installation: none required — urllib is part of the Python 3 standard library (do not run pip install urllib)
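Import: import urllib.request, urllib.parse (the submodules must be imported explicitly; a bare import urllib does not load them)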
urllib.request.urlopen
Parameters: url: the address of the web page to open; data: None by default (a GET request) — if it is not None, the request is sent as a POST; timeout: how long to wait, in seconds, before the request times out
import urllib.request
# Fetch the python.org home page and print what the response object exposes.
# The response is used as a context manager so the underlying connection is
# closed even if one of the calls below raises.
with urllib.request.urlopen('https://python.org/') as response:
    print("查看 response 的返回类型:", type(response))
    print("查看反应地址信息: ", response)
    # info() and getheaders() are two ways of reading the response headers.
    # info() is a legacy accessor: the urllib.request documentation marks it
    # deprecated since Python 3.9 in favour of the `headers` attribute.
    print("查看头部信息1(http header):\n", response.info())
    print("查看头部信息2(http header):\n", response.getheaders())
    # Look up a single header by name.
    print("输出头部属性信息:", response.getheader("Server"))
    # `status` and the legacy getcode() both report the HTTP status code.
    print("查看响应状态信息1(http status):\n", response.status)
    print("查看响应状态信息2(http status):\n", response.getcode())
    # geturl() (legacy alias of the `url` attribute) gives the address of the
    # resource that was actually retrieved.
    print("查看响应 url 地址:\n", response.geturl())
    # read() returns the raw body as bytes; it must happen before the
    # response is closed at the end of the `with` block.
    page = response.read()
print("输出网页源码:", page.decode('utf-8'))
urllib.parse.urlencode
URL-encode the request parameters
import urllib.parse
import urllib.request

# NOTE(review): the original snippet formats `url` with the encoded query
# string but never defines it, so it fails with NameError. httpbin's echo
# endpoint (the same host the requests examples in this article use) is
# assumed here — confirm against the original article.
url = 'http://httpbin.org/get?%s'

params = {'age': 35, 'sex': '男', 'work_years': 15}
# URL-encode the mapping into a percent-escaped query string.
params = urllib.parse.urlencode(params)
# Passing `data` to urlopen would turn this into a POST. It is omitted, so
# this is a GET request and every parameter is exposed in the URL itself.
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(url=url % params) as response:
    print(response.read().decode())
urllib.request.urlretrieve
Download video directly
# High-level helper: urlretrieve copies the remote resource to a local file
# in one call, so there is no need to open and write the file by hand.
# NOTE(review): `url` is not assigned anywhere in this snippet; it must
# already hold the address of the video — confirm against the original
# article.
print('视频开始下载…………')
urllib.request.urlretrieve(url, './airplane.mp4')
print('视频保存成功!')
Download by opening a file and writing the response bytes manually
# Address of the image to download.
picture = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1572253550306&di=3e39d6daed1f3fbddb40eaf09868232a&imgtype=0&src=http%3A%2F%2Fimg.pconline.com.cn%2Fimages%2Fupload%2Fupc%2Ftx%2Fitbbs%2F1406%2F10%2Fc21%2F35150441_1402409900118_mthumb.jpg'
# Manual download: read the whole body, then write it to disk ourselves.
# The response is a context manager, so the connection is released as soon
# as the body has been read.
with urllib.request.urlopen(url=picture) as response:
    # Despite its name, `text` holds the raw response bytes.
    text = response.read()
# Binary mode ('wb') because the payload is image data, not text.
with open('./flower.jpg', mode='wb') as fp:
    fp.write(text)
print('网页上的数据保存成功')
urllib.request.ProxyHandler
Use a proxy IP
import urllib
from urllib import request
from urllib import response
# Uses a free proxy address taken from the Xici proxy list.
if __name__ == '__main__':
    url = 'http://httpbin.org/ip'
    # First request: sent directly, without a proxy. The context manager
    # closes the connection once the body has been read.
    with urllib.request.urlopen(url=url) as response:
        print(response.read().decode())
    # Route HTTP traffic through a proxy so the crawler is disguised and its
    # own IP is less likely to be banned.
    ph = urllib.request.ProxyHandler({'http': '117.69.201.206:9999'})
    # An opener is the object that actually opens URLs; this one is built
    # with the proxy handler installed.
    opener = urllib.request.build_opener(ph)
    # Second request: same URL, opened through the proxy.
    with opener.open(url) as response2:
        print('使用代理,ip是:', response2.read().decode())
    # Also available on a response object: getcode() for the status code
    # (e.g. 200, 304, 404) and geturl() for the retrieved URL.
requests
Installation: pip install requests
Import: import requests
requests.get
import requests
if __name__ == '__main__':
    # Plain GET of the Baidu home page.
    response = requests.get('http://www.baidu.com/')
    # Set the encoding explicitly to UTF-8 before reading `response.text`.
    response.encoding = 'utf-8'
    print(response.text)
    print('---------------', response.status_code)
    # `content` is the undecoded body (bytes), in contrast to `text` above.
    print(response.content)
    # requests keeps things simple: query parameters are passed as a dict
    # through `params` instead of being encoded into the URL by hand.
    response = requests.get(
        'http://httpbin.org/get',
        params={'age': 28, 'salary': '两万整'},
    )
    print(response.text)
requests.post
import requests
# Endpoint that receives the POST request.
url = 'http://httpbin.org/post'
if __name__ == '__main__':
    # The dict passed as `data` is sent as the body of the POST request.
    response = requests.post(
        url,
        data={'sex': '男', 'class': 'Python', 'score': 108},
    )
    # Set the encoding explicitly to UTF-8 before reading `response.text`.
    response.encoding = 'utf-8'
    print(response.text)
requests.Session
import requests
# Page that is only reachable with a valid login session.
url = 'http://oa.1000phone.net/oa.php/Expense/index'
if __name__ == '__main__':
    # Session cookie identifying the login.
    # NOTE(review): this is a hard-coded session id; it presumably has to be
    # replaced with a fresh value before the example works — confirm.
    cookies = {
        'PHPSESSID': 'ST-56995-8t1zsY2JpoqzcaRuLLlNvq5-Pks-izm5ejd5j1npj2pjc7i3v4z',
    }
    # A Session keeps state (such as cookies set by the server) across
    # requests. Using it as a context manager closes it when the block ends.
    with requests.Session() as sess:
        # Visit the home page first so the session records whatever the
        # server sets during verification. The response itself is not needed,
        # so it is no longer bound to a variable (the original bound it to
        # `re`, shadowing the name of the standard-library module).
        sess.get(url='http://oa.1000phone.net/oa.php', cookies=cookies)
        # Reuse the same session to request the target URL.
        response = sess.get(url, cookies=cookies)
        print(response.text)
private proxy for requests
import requests
import os

url = 'http://httpbin.org/ip'
if __name__ == '__main__':
    # A private proxy requires a username and password, embedded in the proxy
    # URL as http://user:password@host:port.
    # NOTE(review): in the original snippet the password and host were
    # replaced by the placeholder text "[email protected]" (it looks like an
    # e-mail obfuscation artifact), which left an invalid proxy URL. The lost
    # parts are read from the environment instead; set PROXY_PASSWORD and
    # PROXY_HOST to the real values before running.
    proxy_password = os.environ['PROXY_PASSWORD']
    proxy_host = os.environ['PROXY_HOST']
    proxy_url = 'http://455098435:%s@%s:16816' % (proxy_password, proxy_host)
    # Only plain-HTTP traffic is routed through the proxy; give up after
    # 20 seconds instead of waiting indefinitely.
    response = requests.get(url=url, proxies={'http': proxy_url}, timeout=20)
    print(response.text)
Request headers
# Request headers imitating a desktop Chrome 78 browser on Windows 10, as
# stated by the User-Agent value below.
headers = {
    # Content types, encodings and languages the client accepts.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Session cookie sent with the request.
    # NOTE(review): hard-coded session id — presumably needs refreshing.
    'Cookie': 'PHPSESSID=ST-56995-8t1zsY2JpoqzcaRuLLlNvq5-Pks-izm5ejd5j1npj2pjc7i3v4z',
    'Host': 'oa.1000phone.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}