python爬虫一：requests.get()

转:https://zhuanlan.zhihu.com/p/26701898

# -*- coding: utf-8 -*-
import requests

# Send a plain GET request; the Response object `r` is reused by the
# statements that follow.
r = requests.get("http://www.baidu.com")

# HTTP status code of the response
print(r.status_code)  # 200

# HTTP response headers (a sample of the output is in the string below)
print(r.headers)
'''
{'X-Cache': 'MISS from netentsec-nps-172.17.18.80', 'Content-Encoding': 'gzip',
 'Transfer-Encoding': 'chunked', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/',
  'Server': 'bfe/1.0.8.18', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:36 GMT', 'Connection': 'keep-alive',
   'Pragma': 'no-cache', 'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 
   'Date': 'Thu, 24 May 2018 01:56:11 GMT', 'Content-Type': 'text/html'}
'''

# Encoding guessed from the response headers only.
# NOTE(review): the sample headers above declare 'Content-Type': 'text/html'
# with no charset, in which case requests is documented to fall back to
# ISO-8859-1 rather than utf-8 -- confirm against real output.
print(r.encoding)
# Encoding detected by analysing the response body
print(r.apparent_encoding)  # utf-8

# Response body as raw bytes (not decoded, so no mis-decoded characters)
print(r.content)

'''
<a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a>
 
 <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript>
  
  <a href=http://www.baidu.com/bdorz/login.gif?login&

tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a>
'''




#源码中的get()方法
'''
def get(url, params=None, **kwargs):
    模拟发送get请求
    Sends a GET request.
    模拟获取页面的URL连接
    :param url: URL for the new :class:`Request` object.
    额外的参数字典或字节流格式
    :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
    十二个控制访问参数  header  cookies
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    返回一个response对象
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
'''

'''
 **kwargs可代表
kwargs: 控制访问的参数，均为可选项

params : 字典或字节序列，作为参数增加到url中

data : 字典、字节序列或文件对象，作为Request的内容

json : JSON格式的数据，作为Request的内容

headers : 字典，HTTP定制头

cookies : 字典或CookieJar，Request中的cookie

auth : 元组，支持HTTP认证功能

files : 字典类型，传输文件

timeout : 设定超时时间，秒为单位

proxies : 字典类型，设定访问代理服务器，可以增加登录认证

allow_redirects : True/False，默认为True，重定向开关 

stream : True/False，默认为False，即默认立即下载响应内容；设为True时延迟下载

verify : True/False，默认为True，认证SSL证书开关

cert : 本地SSL证书路径

url: 拟更新页面的url链接

data: 字典、字节序列或文件，Request的内容 

json: JSON格式的数据，Request的内容
'''


# Response body decoded to text using r.encoding (sample output below)
print(r.text)

'''
<!DOCTYPE html>
<!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8>
<meta http-equiv=X-UA-Compatible content=IE=Edge>
<meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css>
...

'''
# Send the GET request with a custom request header
header = {'User-agent': '123'}

r = requests.get("http://www.baidu.com", headers=header)
# Headers that were actually sent with the request, e.g.
# {'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'User-agent': '123'}
print(r.request.headers)



#自定义代理池
#pxs = { 'http': 'http://user:pass@10.10.10.1:1234',
#        'https': 'https://10.10.10.1:4321' }
#r = requests.get('http://www.baidu.com', proxies=pxs)
#
#print r.text


#抓取网页的通用框架
def getHtml(url):
    """Fetch ``url`` with a GET request, print the page text and return it.

    :param url: address of the page to download.
    :return: the decoded response text on success, or the string
        ``"wrong!"`` when the request fails (connection error, timeout,
        or an HTTP 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
    except requests.RequestException:
        # Only request-related failures are turned into the sentinel;
        # KeyboardInterrupt and programming errors now propagate.
        return "wrong!"

    # Decode with the encoding detected from the body instead of the one
    # guessed from the headers.
    r.encoding = r.apparent_encoding

    # Printing is kept from the original behaviour; the text is now also
    # returned so callers can actually use the downloaded page.
    print(r.text)
    return r.text

python爬虫一：requests.get()

猜你喜欢