Python爬虫(二) | requests

1.发送请求

import requests

# requests exposes one top-level helper per HTTP verb;
# every call returns a Response object.
response = requests.get('http://httpbin.org/get')
response = requests.post('http://httpbin.org/post', data={'name': 'tom'})
response = requests.put('http://httpbin.org/put', data={'name': 'tom'})
response = requests.delete('http://httpbin.org/delete')
response = requests.options('http://httpbin.org/get')
response = requests.head('http://httpbin.org/get')

2.GET

get(url, params=None, **kwargs)

import requests

# Issue a plain GET and inspect the basic attributes of the Response.
response = requests.get('http://httpbin.org/get')
print(response)          # the Response object, e.g. <Response [200]>
print(response.headers)  # response headers as a dict-like object
print(response.url)      # the URL that was requested
print(response.text)     # body decoded as text

<Response [200]>
{'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Content-Encoding': 'gzip', 'Content-Type': 'application/json', 'Date': 'Sun, 24 Mar 2019 05:24:59 GMT', 'Server': 'nginx', 'Content-Length': '186', 'Connection': 'keep-alive'}
http://httpbin.org/get
{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "124.234.225.107, 124.234.225.107", 
  "url": "https://httpbin.org/get"
}

参数 params:用来构造 URL 中的查询字符串参数(会自动进行 URL 编码)

import requests

# params is URL-encoded into the query string for you.
response = requests.get('http://httpbin.org/get', params={'name': 'tom'})
print(response.url)   # -> http://httpbin.org/get?name=tom
print(response.text)

http://httpbin.org/get?name=tom
{
  "args": {
    "name": "tom"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "124.234.225.107, 124.234.225.107", 
  "url": "https://httpbin.org/get?name=tom"
}

如果响应内容是 JSON 格式,可以直接调用 json() 方法将其解析为 Python 对象

import requests

# When the body is JSON, .json() parses it into Python objects
# (.text stays a plain str).
response = requests.get('http://httpbin.org/get')
print(type(response.text))    # <class 'str'>
print(type(response.json()))  # <class 'dict'>
print(response.json())

<class 'str'>
<class 'dict'>
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.21.0'}, 'origin': '124.234.225.107, 124.234.225.107', 'url': 'https://httpbin.org/get'}

对于知乎来说,User-Agent是第一道防线,不加Headers会返回400错误

import requests

# Without a browser User-Agent header, Zhihu rejects the request with 400.
response = requests.get('https://www.zhihu.com/explore')
print(response)  # <Response [400]>

<Response [400]>

在加上Headers之后,显示200,请求成功

import requests
import re

# Supplying a browser User-Agent gets past Zhihu's first line of defence (400 otherwise).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
# Use a raw string for the regex so backslash escapes (\s) are passed to `re`
# verbatim; re.S lets .*? match across newlines in the HTML.
pattern = re.compile(r'<h2><a\s.*?>(.*?)</a></h2>', re.S)
result = pattern.findall(response.text)
print(response)
print(result)

<Response [200]>
['\n2019 LPL 春季赛 TheShy 逆天表现 iG 2:1 力克 RNG,如何评价这轮?\n', '\n石油是牛顿流体吗?\n', '《比赛间隙》第一章“The Game”(一)', '\n《哆啦A梦》到底想告诉我们什么?\n', '科研有点难,怎么办?小赛和刘看山帮你搬救兵', '\n阿拉伯帝国征服前后,埃及的税负状况是怎样的?\n', '\n你所知道的最黑的黑科技是什么?\n', '\n现代埃及人伊拉克人会对古两河文明,古埃及文明产生自豪感吗?\n', '\n是什么让你突然决定再也不穿 JK制服、Lolita 裙出门?\n', '\n猫是否知道自己是猫?\n', '\n你有哪些珍藏许久的短句?\n', '\n如何评价《东宫》男主李承鄞?\n', '\n你被吓的最惨的一次是什么情况?\n', '\n如何评价 18-19 赛季英格兰联赛杯决赛上,切尔西门将凯帕拒绝被换下的行为?\n', '\n朱一龙是怎么红起来的?\n']

抓取二进制数据

import requests

# .content is the raw byte body -- use it (with a binary-mode file)
# for images and other binary data.
response = requests.get('https://github.com/favicon.ico')
print(response.content)
with open('a.ico', 'wb') as icon_file:
    icon_file.write(response.content)

b'\x00\x00\x01\x00\x02\x00\x10\x10\x00\x00\x01\x00 \x00(\x05\x00\x00&\x00\x00\x00  \x0000\x00\x00\x00\x0f\x0f\x0f\x11\x11\x11\x14\………………

3.POST

import requests

# data is sent form-encoded (application/x-www-form-urlencoded) in the body;
# httpbin echoes it back under "form".
response = requests.post('http://httpbin.org/post', data={'name': 'tom'})
print(response.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "tom"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "8", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "json": null, 
  "origin": "124.234.225.107, 124.234.225.107", 
  "url": "https://httpbin.org/post"
}

4.响应

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
}
# The commonly used attributes of a Response.
response = requests.get('http://jianshu.com', headers=headers)
print(response.status_code)  # numeric status, e.g. 200
print(response.headers)      # response headers
print(response.url)          # final URL after any redirects
print(response.cookies)      # RequestsCookieJar set by the server
print(response.history)      # the redirect chain, e.g. [<Response [301]>, ...]

200
{'Server': 'Tengine', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Sun, 24 Mar 2019 07:45:05 GMT', 'Vary': 'Accept-Encoding', 'X-Frame-Options': 'DENY', 'X-XSS-Protection': '1; mode=block', 'X-Content-Type-Options': 'nosniff', 'ETag': 'W/"d651179adccf529ba325a7613096dc3b"', 'Cache-Control': 'max-age=0, private, must-revalidate', 'Set-Cookie': 'locale=zh-CN; path=/', 'X-Request-Id': 'f340316b-782a-4dda-9d06-d2b6c399d68a', 'X-Runtime': '0.014229', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Encoding': 'gzip', 'Via': 'cache22.l2nu29-1[24,0], cache8.cn247[31,0]', 'Timing-Allow-Origin': '*', 'EagleId': '2a51048815534135050143291e'}
https://www.jianshu.com/
<RequestsCookieJar[<Cookie locale=zh-CN for www.jianshu.com/>]>
[<Response [301]>, <Response [301]>]

5.上传文件

import requests

# Open the file in a context manager so the handle is closed after the
# request is sent (the original left the file object open -- a descriptor leak).
with open('a.ico', 'rb') as icon_file:
    response = requests.post('http://httpbin.org/post', files={'file': icon_file})
print(response.text)

{
  "args": {}, 
  "data": "", 
  "files": {
    "file": "data:application/octet-stream;base64,AAA...."
  }, 
  "form": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "6659", 
    "Content-Type": "multipart/form-data; boundary=56705873483d2e90796e290dcadb5c38", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "json": null, 
  "origin": "124.234.225.107, 124.234.225.107", 
  "url": "https://httpbin.org/post"
}

6.Cookies

  • 获取cookies

import requests

# response.cookies is a RequestsCookieJar; items() yields (name, value) pairs.
response = requests.get('https://baidu.com')
print(response.cookies)
for name, value in response.cookies.items():
    print(name + '=' + value)

<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
BDORZ=27315
  • 添加cookies

import requests

# Reuse a logged-in browser session by sending its Cookie header verbatim.
headers = {
    'Cookie': '_zap=83b2a9e3-3467-4608-a0a8-a9914abf7e5b; _xsrf=pOwHEM584xMgdHUugml43XDHsSyEdL1O; d_c0="ALDjyO7F5A6PTsp8AFj7UAcuB67UJSVBIqE=|1548650876"; q_c1=507f77901a68433383921f0d2f2ae5a2|1553406141000|1548652074000; capsion_ticket="2|1:0|10:1553414326|14:capsion_ticket|44:MmMxY2UzYTllNTMyNGZlZmEyYWYwYzNjZTYzMjY5ZWE=|84bbd0e3fff6f6d6c42b08c6a6ccf629201ea66147bb4eb0455b4150e13a205f"; tst=r; __utma=51854390.1343962778.1553406149.1553406149.1553406149.1; __utmc=51854390; __utmz=51854390.1553406149.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.100--|2=registration_date=20160212=1^3=entry_date=20160212=1; tgw_l7_route=4860b599c6644634a0abcd4d10d37251; z_c0="2|1:0|10:1553414329|4:z_c0|92:Mi4xWXUyWUFnQUFBQUFBc09QSTdzWGtEaVlBQUFCZ0FsVk51WWFFWFFCZmtUMHlzTmM5ejZuYms3MlIyek1PMzhKQ0hn|ab06de83d9ac8115d235b76e5500b63e15cf55103c2ff3566c5dff1e2b9ce4ef"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/signup?next=%2F',
}
r = requests.get('https://www.zhihu.com/', headers=headers)
print(r.text)


登录成功。也可以用 RequestsCookieJar 构造 cookies 参数:
import requests
from requests.cookies import RequestsCookieJar

# Cookie header value copied from the browser; entries are '; '-separated.
Cookies='_zap=83b2a9e3-3467-4608-a0a8-a9914abf7e5b; _xsrf=pOwHEM584xMgdHUugml43XDHsSyEdL1O; d_c0="ALDjyO7F5A6PTsp8AFj7UAcuB67UJSVBIqE=|1548650876"; q_c1=507f77901a68433383921f0d2f2ae5a2|1553406141000|1548652074000; capsion_ticket="2|1:0|10:1553414326|14:capsion_ticket|44:MmMxY2UzYTllNTMyNGZlZmEyYWYwYzNjZTYzMjY5ZWE=|84bbd0e3fff6f6d6c42b08c6a6ccf629201ea66147bb4eb0455b4150e13a205f"; tst=r; __utma=51854390.1343962778.1553406149.1553406149.1553406149.1; __utmc=51854390; __utmz=51854390.1553406149.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=51854390.100--|2=registration_date=20160212=1^3=entry_date=20160212=1; tgw_l7_route=4860b599c6644634a0abcd4d10d37251; z_c0="2|1:0|10:1553414329|4:z_c0|92:Mi4xWXUyWUFnQUFBQUFBc09QSTdzWGtEaVlBQUFCZ0FsVk51WWFFWFFCZmtUMHlzTmM5ejZuYms3MlIyek1PMzhKQ0hn|ab06de83d9ac8115d235b76e5500b63e15cf55103c2ff3566c5dff1e2b9ce4ef"'
jar = RequestsCookieJar()
for cookie in Cookies.split(';'):
    # split(';') leaves a leading space on every entry after the first, which
    # would corrupt the cookie names -- strip it before splitting name/value.
    # maxsplit=1 keeps '=' characters inside the value intact.
    k, v = cookie.strip().split('=', 1)
    jar.set(k, v)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/signup?next=%2F',
}
r = requests.get('https://www.zhihu.com/', headers=headers, cookies=jar)
print(r.text)

7.会话保持

import requests

# Two independent top-level requests share no state: the cookie set by the
# first call is NOT sent with the second, so httpbin reports empty cookies.
r1 = requests.get('http://httpbin.org/cookies/set/number/123456')
r2 = requests.get('http://httpbin.org/cookies')
print(r2.text)

import requests

# A Session persists cookies between requests, so the cookie set by the
# first call is automatically sent with the second.
# Session() is the documented constructor; lowercase session() is a legacy alias.
s = requests.Session()
r1 = s.get('http://httpbin.org/cookies/set/number/123456')
r2 = s.get('http://httpbin.org/cookies')
print(r2.text)


{
  "cookies": {}
}

{
  "cookies": {
    "number": "123456"
  }
}

8.代理

import requests

# Map each URL scheme to a proxy; requests routes matching traffic through it.
# httpbin's "origin" field then shows the proxy's IP instead of ours.
proxy = {
    'http': "http://111.79.198.187:9999",
    'https': "https://111.79.198.187:9999",
}
r = requests.get('https://httpbin.org/get', proxies=proxy)
print(r.text)

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "111.79.198.187, 111.79.198.187", 
  "url": "https://httpbin.org/get"
}

9.超时

import requests

# A single number bounds the whole request; a 2-tuple sets the
# connect timeout and the read timeout separately.
r = requests.get('https://httpbin.org/get', timeout=1)
r = requests.get('https://httpbin.org/get', timeout=(2, 4))  # (connect, read)
print(r.text)

10.身份验证

import requests
from requests.auth import HTTPBasicAuth

# A plain (user, password) tuple is shorthand for HTTPBasicAuth --
# the two calls below send the same Authorization header.
r = requests.get('https://httpbin.org/get', auth=('username', 'password'))
r = requests.get('https://httpbin.org/get', auth=HTTPBasicAuth('username', 'password'))

11.Prepared Request

参数:method=None, url=None, headers=None, files=None, data=None, params=None, auth=None, cookies=None, hooks=None, json=None

from requests import session, Request

# Build the Request by hand, have the Session turn it into a
# PreparedRequest, then send it -- useful when the request must be
# inspected or tweaked before going on the wire.
url = 'https://httpbin.org/post'
payload = {
    'name': 'tom',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
}
s = session()
req = Request('POST', url, headers=header, data=payload)
prepped = s.prepare_request(req)
r = s.send(prepped)
print(r.text)

猜你喜欢

转载自blog.csdn.net/qq_41179280/article/details/88775884