21 Days to Build a Distributed Crawler: The urllib Library (1)

 1.1. urlopen function usage

#encoding:utf-8

from urllib import request

res = request.urlopen("https://www.cnblogs.com/")

print(res.readlines())


# urlopen parameters
#def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
#            *, cafile=None, capath=None, cadefault=False, context=None):
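
The object returned by urlopen behaves like a file and also carries status and header information. A small supplementary sketch of the commonly used response methods (these are standard http.client.HTTPResponse methods, not part of the original example):

from urllib import request

res = request.urlopen("https://www.cnblogs.com/")
print(res.getcode())      # HTTP status code, e.g. 200
print(res.geturl())       # the URL that was actually fetched
print(res.getheaders())   # list of (header, value) tuples
print(res.read(100))      # first 100 bytes of the response body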

 1.2.urlretrieve function

 Saves a web page or a file from a URL to the local filesystem.

#coding:utf-8

from urllib import request

res = request.urlretrieve("https://www.cnblogs.com/",'cnblog.html')


# urlretrieve parameters
#def urlretrieve(url, filename=None, reporthook=None, data=None):
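
The reporthook parameter shown above can be used to track download progress: urlretrieve calls it repeatedly with the block count so far, the block size, and the total file size. A minimal sketch (the percentage display is just one possible way to use it):

from urllib import request

def progress(block_num, block_size, total_size):
    # called by urlretrieve after each block is downloaded
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(downloaded * 100 / total_size, 100)
        print("downloaded %.1f%%" % percent)

request.urlretrieve("https://www.cnblogs.com/", 'cnblog.html', reporthook=progress)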

 1.3. Parameter encoding and decoding functions

 The urlencode function encodes Chinese characters and special symbols in URL parameters.

# urlencode function

# simple usage
# from urllib import parse
# data = {'name': '德瑞克', 'age': 100}
# qs = parse.urlencode(data)
# print(qs)   # name=%E5%BE%B7%E7%91%9E%E5%85%8B&age=100

# practical example
from urllib import request,parse
url = "http://www.baidu.com/s"
params = {"wd": "博客园"}
qs = parse.urlencode(params)
url = url + "?" + qs
res = request.urlopen(url)
print(res.read())

The parse_qs function decodes encoded query parameters from a URL.

from urllib import parse

qs = "name=%E5%BE%B7%E7%91%9E%E5%85%8B&age=100"
print(parse.parse_qs(qs))   #{'name': ['德瑞克'], 'age': ['100']}
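
For encoding or decoding a single string rather than a whole dict of parameters, urllib.parse also provides quote and unquote. This is a small supplementary sketch, not part of the original example:

from urllib import parse

s = parse.quote('博客园')    # percent-encode a single string
print(s)                     # %E5%8D%9A%E5%AE%A2%E5%9B%AD
print(parse.unquote(s))      # 博客园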

 1.4.urlparse and urlsplit function usage

 Both urlparse and urlsplit are used to split a URL into its component parts; the only difference is that urlsplit does not have the "params" attribute.

from urllib import request,parse

url = "https://www.baidu.com/s?wd=cnblog#2"
result = parse.urlparse(url)
print(result)
#ParseResult(scheme='https', netloc='www.baidu.com', path='/s', params='', query='wd=cnblog', fragment='2')

print('scheme:',result.scheme)   # protocol
print('netloc:',result.netloc)   # domain
print('path:',result.path)       # path
print('query:',result.query)     # query parameters

# Results
# scheme: https
# netloc: www.baidu.com
# path: /s
# query: wd=cnblog
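
To see the difference described above, running urlsplit on the same URL returns a SplitResult that has no params field:

from urllib import parse

url = "https://www.baidu.com/s?wd=cnblog#2"
result = parse.urlsplit(url)
print(result)
# SplitResult(scheme='https', netloc='www.baidu.com', path='/s', query='wd=cnblog', fragment='2')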

 1.5. Using the Request class to crawl Lagou job listings

Request class parameters

class Request:

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):

Crawling Lagou job listings

The job listings on Lagou are loaded via Ajax (the positionAjax.json endpoint).

 

 Code:

# Use the Request class to crawl Lagou job listings

from urllib import request,parse

url = "https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false"

# request headers
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
    "Referer":"https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput="
}
# data to submit with the POST request
data = {
    'first':'true',
    'pn':1,
    'kd':'python'
}
# the data of a POST request must be URL-encoded and converted to bytes
req = request.Request(url, headers=headers, data=parse.urlencode(data).encode('utf-8'), method='POST')   # build a request object
res = request.urlopen(req)
# the response body is bytes and needs to be decoded
print(res.read().decode('utf-8'))
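
Since this endpoint returns JSON, the decoded text can be parsed with the json module instead of just being printed. A sketch that would replace the final print above; note that the 'content'/'positionResult'/'result' keys are an assumption about Lagou's response layout, not something taken from the original post:

import json

# assumed layout of the JSON response; res.read() can only be consumed once,
# so store the decoded text instead of printing it directly
text = res.read().decode('utf-8')
payload = json.loads(text)
jobs = payload.get('content', {}).get('positionResult', {}).get('result', [])
for job in jobs:
    print(job.get('positionName'), job.get('companyFullName'))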

1.6. ProxyHandler proxies

How a proxy works: before requesting the target site, we first send the request to a proxy server; the proxy server then requests the target site, gets the data, and returns it to us.

# using a proxy
from urllib import request

url = "https://www.baidu.com/s?wd=cnblog"

# 1. pass the proxy to ProxyHandler to build a handler
handler = request.ProxyHandler({'http':'115.210.31.236.55:9000'})
# 2. use the handler to build an opener
opener = request.build_opener(handler)
# 3. use the opener to send a request
res = opener.open(url)
print(res.read())
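
If every subsequent request.urlopen call should also go through the proxy, the opener can be installed globally with install_opener. A short sketch using the same proxy as above:

from urllib import request

handler = request.ProxyHandler({'http':'115.210.31.236.55:9000'})
opener = request.build_opener(handler)
# install the opener so that plain request.urlopen() also uses the proxy
request.install_opener(opener)
res = request.urlopen("https://www.baidu.com/s?wd=cnblog")
print(res.read())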
