Python: Implementing a Simple Crawler (Supporting POST and GET Requests)

Copyright notice: This is the blogger's original article and may not be reproduced without permission. When reprinting, please credit the source: https://blog.csdn.net/l1028386804/article/details/91511464

Without further ado, here is the code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Date: 2019/6/12
# Created by 冰河
# Description: POST and GET requests implemented with urllib, usable as a simple crawler
# Blog: https://blog.csdn.net/l1028386804
# Imports
from urllib import request, parse
from urllib.error import HTTPError, URLError
# cookiejar stores cookies between requests
from http import cookiejar

# Custom Session class (see the usage sketch after the __main__ block below)
class Session(object):
    def __init__(self):
        cookie_object = cookiejar.CookieJar()
        # The handler is responsible for one operation: processing cookies
        handler = request.HTTPCookieProcessor(cookie_object)
        # When the opener receives a response that sets cookies,
        # it calls the handler, which stores them in cookie_object
        self.opener = request.build_opener(handler)

    # Wrapper methods around the module-level helpers
    def get(self, url, headers=None):
        return get(url, headers, self.opener)

    def post(self, url, form, headers=None):
        return post(url, form, headers, self.opener)
# GET request
def get(url, headers=None, opener=None):
    return urlrequests(url, headers=headers, opener=opener)

# POST request
def post(url, form, headers=None, opener=None):
    return urlrequests(url, form, headers=headers, opener=opener)
 
# Shared request helper used by both get() and post()
def urlrequests(url, form=None, headers=None, opener=None):
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    if headers is None:
        # The request headers act like a key: some sites accept any client,
        # others only respond to a browser-like User-Agent. Callers can also
        # pass their own headers dict (see the sketch at the end of the post).
        headers = {
            'User-Agent': user_agent
        }
    html_bytes = b''  # response body as raw bytes
    try:
        if form:  # POST request
            form_str = parse.urlencode(form, encoding='utf-8')
            form_bytes = form_str.encode('utf-8')
            req = request.Request(url, data=form_bytes, headers=headers)
        else:  # GET request
            req = request.Request(url, headers=headers)
        if opener:
            response = opener.open(req)
        else:
            # Pass timeout= here to make request timeouts raise the URLError handled below
            response = request.urlopen(req)
        html_bytes = response.read()
    except HTTPError as e:
        print(e)
    except URLError as e:
        print(e)

    return html_bytes
 
if __name__ == '__main__':
    # POST request example (see the JSON-decoding sketch below)
    # url = 'http://fanyi.baidu.com/sug'
    # form = {
    #     'kw': '呵呵'
    # }
    # html_bytes = post(url, form=form)
    # print(html_bytes)

    # GET request example
    url = 'http://www.baidu.com'  # put the site you want to fetch here
    html_byte = get(url)
    # Print the page
    print(html_byte.decode('utf-8'))
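The commented-out POST example above only prints the raw bytes returned by fanyi.baidu.com. Assuming that endpoint still responds with JSON as it did when this post was written (not verified here), the response could be decoded and parsed like this:

import json

url = 'http://fanyi.baidu.com/sug'
form = {'kw': '呵呵'}
html_bytes = post(url, form=form)
if html_bytes:
    # Decode the raw bytes, then parse the JSON body
    result = json.loads(html_bytes.decode('utf-8'))
    print(result)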

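The __main__ block only exercises the plain get() helper, so the Session class is never used. As a minimal sketch of how it keeps cookies across requests (the URLs below are placeholders, not from the original post), it could be called like this:

s = Session()
# First request: any Set-Cookie headers in the response are stored in the CookieJar
first_page = s.get('http://www.example.com/login')
# Second request: the opener automatically sends the stored cookies back
second_page = s.get('http://www.example.com/profile')
print(len(first_page), len(second_page))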
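Since get() and post() both accept a headers dict, a caller can override the default User-Agent or add extra headers; the values below are purely illustrative:

custom_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://www.baidu.com'  # illustrative value only
}
html = get('http://www.baidu.com', headers=custom_headers)
print(len(html))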