爬虫urllib.request

https://www.cnblogs.com/xingzhui/p/7845675.html
urlopen方法
打开指定的URL

urllib.request.urlopen(url, data=None, [timeout, ]*,
cafile=None, capath=None, cadefault=False, context=None)
url参数，可以是一个string，或者一个Request对象。
data必须是bytes对象，是传递给服务器的数据，或者为None。目前只有HTTP请求会使用data：提供data时发送的是POST请求，没有data时则是GET请求。data在使用前需要先用urllib.parse.urlencode()函数编码成查询字符串，再通过encode()转换成bytes。

from urllib import request

resp=request.urlopen('http://www.baidu.com')
print(type(resp))
#可以看出，urlopen返回的是一个HTTPResponse对象
<class 'http.client.HTTPResponse'>
print(dir(resp))
#resp具有的方法和属性如下，我们最常用的是read和readline
['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', '_close_conn', '_get_chunk_left', '_method', '_peek_chunked', '_read1_chunked', '_read_and_discard_trailer', '_read_next_chunk_size', '_read_status', '_readall_chunked', '_readinto_chunked', '_safe_read', '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 'code', 'debuglevel', 'detach', 'fileno', 'flush', 'fp', 'getcode', 'getheader', 'getheaders', 'geturl', 'headers', 'info', 'isatty', 'isclosed', 'length', 'msg', 'peek', 'read', 'read1', 'readable', 'readinto', 'readinto1', 'readline', 'readlines', 'reason', 'seek', 'seekable', 'status', 'tell', 'truncate', 'url', 'version', 'will_close', 'writable', 'write', 'writelines']
Request类
URL请求的抽象类。

urllib.request.Request(url, data=None, headers={},
origin_req_host=None, unverifiable=False, method=None)
Example
import urllib.request
with urllib.request.urlopen("http://www.baidu.com") as f:
    print(f.read(300))
#最简单的打开一个url的方法
#由于urlopen无法判断数据的encoding，所以返回的是bytes对象。一般会对返回的数据进行decode。
b’ \xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b\xef\xbc\x8c\xe4\xbd\xa0\xe5\xb0\xb1\xe7\x9f\xa5\xe9\x81\x93

<link rel="prefetch" href="//ajax.googleapis.com/ajax/libs/jqu

#如果想要检索URL资源，并将其保存到一个临时位置，可以使用urlretrieve函数
import urllib.request
local_filename,headers=urllib.request.urlretrieve('http://www.baidu.com')
print(local_filename)
print(headers)
C:\Users\张名昊\AppData\Local\Temp\tmpgvshi0fc
Content-Type: text/html; charset=utf-8
Content-Length: 561
Cache-Control: no-cache
Set-Cookie: jdERAezKGXbgHHHMMBwTrqQ=1;max-age=20;
Connection: close
#urlopen还可以接受Request对象，推荐使用这种方法，因为可以对Request对象进行深度的定制，不仅仅传入一个URL
import urllib.request
req=urllib.request.Request('http://www.baidu.com')
with urllib.request.urlopen(req) as response:
    page=response.read(300).decode('utf-8')#read()返回的是bytes，这里按utf-8解码成str。
print(page)

百度一下，你就知道

Data
有时候，我们想向一个URL发送一些数据，一般使用POST请求，不过数据需要先encode，然后传递给Request对象。encoding一般由urllib.parse库的函数实现。

import urllib.parse as up
import urllib.request as ur
url='http://www.baidu.com'
values={
    'name':'ZhangMinghao',
    'location':'Shanghai',
    'language':'Python3'
}
data=up.urlencode(values)
data=data.encode('ascii')#data必须是bytes，所以发送给server之前需要先编码（这里使用ascii）。
req=ur.Request(url,data)
with ur.urlopen(req) as response:
    page=response.read()
如果我们不想使用data参数，那么使用GET请求，将data内容与url连接到一起，发送到server。

import urllib.request
import urllib.parse
data={
    'name':'ZhangMinghao',
    'location':'Shanghai',
    'language':'Python3'
}
url_values=urllib.parse.urlencode(data)
print(url_values)
url='http://www.baidu.com'
full_url=url+'?'+url_values
response=urllib.request.urlopen(full_url)
name=ZhangMinghao&location=Shanghai&language=Python3
Headers
有些网页不希望被程序访问，或者会向不同的浏览器发送不同的内容。urllib默认把自己标识为Python-urllib/3.5（版本号随Python版本变化），这可能使server感到疑惑或者返回出错的内容。可以通过设置User-Agent来指定我们程序的浏览器标识：创建Request对象时，传入一个headers字典。

import urllib.parse
import urllib.request

url='http://www.baidu.com'
user_agent='Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
values={
    'name':'ZhangMinghao',
    'location':'Shanghai',
    'language':'Python3'
}
headers={'User-Agent':user_agent}

扫描二维码关注公众号，回复： 6000606 查看本文章

data=urllib.parse.urlencode(values)
data=data.encode('ascii')
req=urllib.request.Request(url,data,headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
处理错误
urllib.error模块中包含了相关的各种错误。
URLError，HTTPError等
URLError一般是因为没有网络连接或者server不存在。这种情况下，会产生一个reason属性，是一个tuple，包含了错误码和错误文本

import urllib.error
import urllib.request

req=urllib.request.Request('http://www.pretend_server.com')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    print(e.reason)
[Errno 11001] getaddrinfo failed
HTTPError
每个http请求都会返回一个状态码，一般处理程序会处理某些状态码，但是有一些处理不了，就会抛出HTTPError，比如404找不到页面，403请求被禁止，401请求授权。不展开讲其他错误码了。

info和geturl
urllib.response模块
geturl返回真正访问的URL地址。
info，返回一个类似字典的对象，来描述获取的对象，尤其是headers。

import urllib.request

resp=urllib.request.urlopen('http://www.baidu.com')
print(resp.info())
Content-Type: text/html; charset=utf-8
Content-Length: 561
Cache-Control: no-cache
Set-Cookie: tSMHvfupnLIgHHHMVijdstJ=1;max-age=20;
Connection: close

猜你喜欢