python3爬虫之requests库基本使用


官方文档链接(中文)

https://2.python-requests.org/zh_CN/latest/


requests  基于  urllib3 ,python编写。

安装 pip install requests  (python3)

如果使用的是 Anaconda,需要在 Anaconda Prompt 中输入上面的 pip 安装命令


如果提示 Requirement already satisfied: requests in xxxxxx,表示已经安装过了


import requests

# Issue a plain GET request for the Baidu home page.
resp = requests.get('http://www.baidu.com')

# Type of the object returned by requests.get().
print(type(resp))
# HTTP status code of the response.
print(resp.status_code)
# Type of the page source: `text` is already a str, so no manual decode()
# step is needed (unlike urllib).
print(type(resp.text))
# The page source itself.
print(resp.text)
# Cookies attached to the response.
print(resp.cookies)


requests.get

import requests

# Query-string parameters; requests encodes them onto the URL after '?'.
query = {
    'name': 'germey',
    'age': 22,
}
reply = requests.get('http://httpbin.org/get', params=query)
# httpbin echoes the request back, so the printed body shows the final URL.
print(reply.text)
输出结果:
{
  "args": {
    "age": "22", 
    "name": "germey"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "xxx.xxx.xxx.xxx, xxx.xxx.xxx.xxx", 
  "url": "https://httpbin.org/get?name=germey&age=22"
}
可以看到,请求的URL在原始地址后面追加了
?name=germey&age=22
问号之后的部分就是通过params传入的数据

解析json

import json

import requests

reply = requests.get('http://httpbin.org/get')
# The raw body is a plain string.
print(type(reply.text))
# The next two statements print the same result: reply.json() parses the
# body just like json.loads() applied to the text.
print(reply.json())
print(json.loads(reply.text))

# Type of the parsed body.
print(type(reply.json()))
输出结果:
<class 'str'>
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.21.0'}, 'origin': 'xxx.xxx.xxx.xxx', 'url': 'https://httpbin.org/get'}
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.21.0'}, 'origin': 'xxx.xxx.xxx.xxx', 'url': 'https://httpbin.org/get'}
<class 'dict'>

可以看到用json解析后的为字典类型


获取二进制文件
# Download a binary file

import requests

response=requests.get('https://github.com/favicon.ico')
# `content` is the raw, undecoded response body.
print(type(response.content))

# Save the binary payload to disk. The `with` statement closes the file
# automatically when the block exits, so no explicit f.close() is needed.
with open('favicon.ico','wb') as f:
    f.write(response.content)    # write response.content into favicon.ico

# 'favicon.ico' is the output file name.
# In the mode string 'wb', 'w' means write and 'b' means binary;
# 'r' would mean read.


headers
import requests

EXPLORE_URL = 'https://www.zhihu.com/explore'

# Request sent with a browser-like User-Agent header.
# Per the original note, this one is expected to return 200.
browser_headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'
}
with_agent = requests.get(EXPLORE_URL, headers=browser_headers)
print(with_agent.status_code)

# Same request without custom headers.
# Per the original note, this one is expected to return 400.
without_agent = requests.get(EXPLORE_URL)
print(without_agent.status_code)


User-Agent表示的是 浏览器的客户端信息

http://www.useragentstring.com/

中可以查询


# Attributes of a response

response = requests.get('http://www.baidu.com')

# Print each attribute in turn: status code, response headers, cookies,
# final URL, and the redirect history.
for attribute in ('status_code', 'headers', 'cookies', 'url', 'history'):
    print(getattr(response, attribute))



# File upload

# Open the file in a `with` block so the handle is closed once the upload
# request has completed, instead of being left open for the rest of the script.
with open('favicon.ico','rb') as upload:
    files={'file':upload}
    response=requests.post('http://httpbin.org/post',files=files)
print(response.text)



# Reading cookies

import requests

reply = requests.get('http://www.baidu.com')
jar = reply.cookies
print(jar)
# Print every cookie as name=value.
for name, val in jar.items():
    print('='.join((name, val)))



#其他

# Proxy configuration samples; keep whichever one fits your setup.
# As written, the three assignments run in order, so only the last one is
# in effect when the request below is sent.

# HTTP proxy that requires credentials (user:password@host:port).
proxies={
    "http":"http://user:password@xxx.xxx.xxx.xxx:xxxx/"
}
# SOCKS5 proxy. NOTE: per the requests documentation, SOCKS support is an
# optional extra (pip install requests[socks]).
proxies={
    "http":"socks5://xxx.xxx.xxx.xxx:xxxx",
    "https":"socks5://xxx.xxx.xxx.xxx:xxxx"
}
# Plain HTTP/HTTPS proxy without credentials.
proxies={
    "http":"http://xxx.xxx.xxx.xxx:xxxx",
    "https":"https://xxx.xxx.xxx.xxx:xxxx"
}
r=requests.get('url',proxies=proxies,timeout=1)
# `timeout` sets the time limit in seconds.
# If it is exceeded, an exception is raised; uncaught, it aborts the program.
# Use try/except to handle it. Both timeout flavours are caught: a slow
# connection attempt raises ConnectTimeout, a slow response raises ReadTimeout.
from requests.exceptions import ConnectTimeout, ReadTimeout
try:
    r=requests.get('url',proxies=proxies,timeout=1)
    print(r.status_code)
except (ConnectTimeout, ReadTimeout):
    print('timeout')


#认证设置

import requests
from requests.auth import HTTPBasicAuth

# First request: no credentials supplied.
r=requests.get('http://xxx.xxx.xxx.xxx:xxxx')
print(r.status_code)

# A 401 status code means the server requires authentication.

# Second request: send HTTP Basic credentials. Passing the plain tuple
# auth=('user','passwd') is the documented shorthand for the same thing.
r=requests.get('http://xxx.xxx.xxx.xxx:xxxx',auth=HTTPBasicAuth('user','passwd'))
print(r.status_code)

猜你喜欢

转载自www.cnblogs.com/XUEYEYU/p/11310910.html