HTTP libraries for writing web crawlers

urllib

Installation: none required. urllib is part of the Python 3 standard library.

Import: import urllib.request (and import urllib.parse for URL encoding)

urllib.request.urlopen

Parameters: url is the web page to open; data is None by default, and when it is supplied the request is sent as a POST; timeout is how long to wait, in seconds, before the request times out.

import urllib.request
# Open the page. The context manager closes the underlying connection once
# the block is left, so the socket is not leaked.
with urllib.request.urlopen('https://python.org/') as response:
    # Type and repr of the returned response object.
    print("查看 response 的返回类型:",type(response))
    print("查看反应地址信息: ",response)
    # Response headers: as a message object, then as a list of (name, value).
    # info(), getcode() and geturl() are the older accessor methods; the
    # attributes headers, status and url expose the same data.
    print("查看头部信息1(http header):\n",response.info())
    print("查看头部信息2(http header):\n",response.getheaders())
    # A single header looked up by name.
    print("输出头部属性信息:",response.getheader("Server"))
    # HTTP status code, via the attribute and via the accessor method.
    print("查看响应状态信息1(http status):\n",response.status)
    print("查看响应状态信息2(http status):\n",response.getcode())
    # URL that was actually retrieved.
    print("查看响应 url 地址:\n",response.geturl())
    # Read the whole body as bytes while the connection is still open.
    page = response.read()
# Decode the body bytes to text before printing the page source.
print("输出网页源码:",page.decode('utf-8'))
urllib.parse.urlencode

Url-encode the data

# Query parameters to send with the request.
params = {'age': 35, 'sex': '男', 'work_years': 15}
# URL-encode the mapping into a query string.
query = urllib.parse.urlencode(params)
# Address template; %s is replaced by the encoded query string.
# NOTE(review): the original snippet formatted an undefined `url`. httpbin's
# echo endpoint is assumed here -- substitute the real target address.
url_template = 'http://httpbin.org/get?%s'
# Passing `data` to urlopen would turn the request into a POST. It is
# omitted, so this is a GET and every parameter is exposed in the address.
with urllib.request.urlopen(url=url_template % query) as response:
    print(response.read().decode())
urllib.request.urlretrieve

Download video directly

# High-level helper: urlretrieve downloads the resource and writes it to the
# given path itself, so the file does not have to be opened by hand.
# NOTE(review): `url` is not assigned in this snippet; it must already hold
# the address of the video to download.
print('视频开始下载…………')
urllib.request.urlretrieve(url, './airplane.mp4')
print('视频保存成功!')

Download by opening a file and writing the response yourself

picture = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1572253550306&di=3e39d6daed1f3fbddb40eaf09868232a&imgtype=0&src=http%3A%2F%2Fimg.pconline.com.cn%2Fimages%2Fupload%2Fupc%2Ftx%2Fitbbs%2F1406%2F10%2Fc21%2F35150441_1402409900118_mthumb.jpg'
# Fetch the raw bytes; the context manager closes the connection afterwards.
with urllib.request.urlopen(url=picture) as response:
    text = response.read()
# Write in binary mode because the payload is bytes, not text. The write
# must be indented under the `with` so it runs while the file is open.
with open('./flower.jpg', mode='wb') as fp:
    fp.write(text)
print('网页上的数据保存成功')
urllib.request.ProxyHandler

Use a proxy IP

import urllib
from urllib import request
from urllib import response
# Free proxy address, taken from the Xici proxy list
if __name__ == '__main__':
    url = 'http://httpbin.org/ip'
    # Request sent directly, without a proxy. The context manager closes the
    # connection once the body has been read.
    with urllib.request.urlopen(url=url) as response:
        print(response.read().decode())
    # Route plain-HTTP traffic through a proxy so the crawler is disguised
    # and its own IP is less likely to be banned.
    ph = urllib.request.ProxyHandler({'http': '117.69.201.206:9999'})
    # Build an opener that sends its requests through the handler above.
    opener = urllib.request.build_opener(ph)
    # Open the same address through the proxy.
    with opener.open(url) as response2:
        print('使用代理,ip是:',response2.read().decode())
    # The response also offers getcode() (status code, e.g. 200, 304, 404)
    # and geturl().

requests

Installation: pip install requests

Import: import requests

requests.get
import requests

if __name__ == '__main__':
    # Plain GET request for a page.
    response = requests.get(url='http://www.baidu.com/')
    # Set the encoding used when the body is decoded into response.text.
    response.encoding = 'utf-8'
    print(response.text)
    print('---------------', response.status_code)
    # response.content is the undecoded body as bytes.
    print(response.content)

    # requests builds the query string itself from the `params` mapping.
    query = {'age': 28, 'salary': '两万整'}
    response = requests.get(url='http://httpbin.org/get', params=query)
    print(response.text)
requests.post
import requests

url = 'http://httpbin.org/post'

if __name__ == '__main__':
    # Form fields sent in the body of the POST request.
    form = {'sex': '男', 'class': 'Python', 'score': 108}
    response = requests.post(url=url, data=form)
    # Set the encoding used when the body is decoded into response.text.
    response.encoding = 'utf-8'
    print(response.text)
requests.Session
import requests

url = 'http://oa.1000phone.net/oa.php/Expense/index'

if __name__ == '__main__':
    # Session cookie of an already logged-in browser session.
    # NOTE: a hard-coded session id is a credential; replace it with your own.
    cookies = {'PHPSESSID': 'ST-56995-8t1zsY2JpoqzcaRuLLlNvq5-Pks-izm5ejd5j1npj2pjc7i3v4z'}
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as sess:
        # Store the cookie on the session itself. Cookies passed to a single
        # request are not persisted, so this is what makes the session
        # remember them for every later request.
        sess.cookies.update(cookies)
        # Fetch the home page first through the session.
        sess.get(url='http://oa.1000phone.net/oa.php')
        # Then request the target page through the same session.
        response = sess.get(url)
    print(response.text)
private proxy for requests
import requests

url = 'http://httpbin.org/ip'

# Settings of the private (authenticated) proxy.
# NOTE(review): the password and host of the original proxy URL were replaced
# by an e-mail obfuscation marker during extraction and cannot be recovered.
# They are placeholders here and must be filled in before running.
PROXY_USER = '455098435'
PROXY_PASSWORD = 'YOUR_PASSWORD'
PROXY_HOST = 'YOUR_PROXY_HOST'
PROXY_PORT = 16816

if __name__ == '__main__':
    # A private proxy needs a user name and password, which are embedded in
    # the proxy URL as user:password@host:port.
    proxy_url = 'http://%s:%s@%s:%d' % (
        PROXY_USER, PROXY_PASSWORD, PROXY_HOST, PROXY_PORT)
    response = requests.get(url=url, proxies={'http': proxy_url}, timeout=20)
    print(response.text)

Request header

# Browser-like request headers, one entry per header name.
headers = {
    # Content negotiation.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # Caching and connection handling.
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Session cookie sent with the request.
    'Cookie': 'PHPSESSID=ST-56995-8t1zsY2JpoqzcaRuLLlNvq5-Pks-izm5ejd5j1npj2pjc7i3v4z',
    'Host': 'oa.1000phone.net',
    'Upgrade-Insecure-Requests': '1',
    # Browser identification string.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}

Guess you like

Origin blog.csdn.net/qq_42546127/article/details/106383160