Using L2 regularization to prevent overfitting in a multilayer neural network (pure numpy implementation)

1. Background

A multilayer neural network has considerable learning capacity. When data is plentiful this is not a problem, but for some tasks, computer vision for example, data is often scarce, and training then tends to overfit. Loosely speaking, overfitting means the model has learned peculiarities of the training set that are not shared by real-world data in general. When this happens the training error is small but the model generalizes poorly, so we need ways to prevent overfitting. Two common approaches exist: use more training data, or regularize. More data solves the problem directly, but for some tasks collecting more data is expensive, and for others, such as early cancer detection, more data simply cannot be obtained; in those cases regularization is the more practical choice.

2. A brief introduction to L2 regularization

Suppose the objective function used to train the network is the cross-entropy cost (the same one used in the code below):

$$J = -\frac{1}{m}\sum_{i=1}^{m}\Big[\,y^{(i)}\log a^{(i)} + \big(1-y^{(i)}\big)\log\big(1-a^{(i)}\big)\Big],$$

where $m$ is the number of training examples and $a^{(i)}$ is the network output for example $i$.

L2 regularization adds a penalty term to the original objective:

$$J_{reg} = J + \frac{\lambda}{2m}\sum_{l}\lVert W^{[l]}\rVert_F^2,$$

i.e. the sum of the squared entries of every weight matrix, scaled by $\lambda/2m$.

With the new objective, the parameter update only requires a small change when taking the gradient:

$$\frac{\partial J_{reg}}{\partial W^{[l]}} = \frac{\partial J}{\partial W^{[l]}} + \frac{\lambda}{m}W^{[l]},
\qquad
W^{[l]} \leftarrow \Big(1-\frac{\alpha\lambda}{m}\Big)W^{[l]} - \alpha\,\frac{\partial J}{\partial W^{[l]}}.$$

From the update rule above we can see that, once the regularization term is added, each update of W first multiplies W by a factor smaller than 1 and then applies the usual gradient step of the unregularized objective. Over many iterations this yields smaller W and therefore smaller pre-activations Z. With tanh or sigmoid activations, small Z falls in the nearly linear region of the function, and we know that if the activations behave linearly, the output of forward propagation is still just a linear combination of the input, no different from a perceptron. The resulting model therefore cannot express overly complex decision functions; in other words, its capacity is deliberately restricted, which is what curbs overfitting.
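This "shrink first, then step" view is easy to check numerically. The following minimal sketch (the matrix shapes and the value of m are arbitrary and purely illustrative; alpha and lambd match the script in section 4) shows that adding the L2 term to the gradient is equivalent to weight decay:

import numpy as np

m = 100                               # assumed number of training examples (illustrative only)
alpha, lambd = 0.02, 0.7              # learning rate and regularization strength, as in section 4
W = np.random.randn(4, 3)             # weight matrix of some layer (arbitrary shape)
dW_unreg = np.random.randn(4, 3)      # placeholder for the unregularized gradient dJ/dW

# update using the gradient with the L2 term ...
W_new = W - alpha * (dW_unreg + (lambd / m) * W)
# ... equals shrinking W by (1 - alpha*lambd/m) and then taking the plain gradient step
W_decay = (1 - alpha * lambd / m) * W - alpha * dW_unreg
print(np.allclose(W_new, W_decay))    # True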

3. Experimental results

First, the results for the model without regularization (lambd = 0):

Iterations:0---cost:0.6972652257211792
Iterations:2000---cost:0.21165381376399547
Iterations:4000---cost:0.1951425385136149
Iterations:6000---cost:0.1740360233286938
Iterations:8000---cost:0.15274737230486157
Iterations:10000---cost:0.133325170802212
Iterations:12000---cost:0.12762253797215065
Iterations:14000---cost:0.1240078424815226
Iterations:16000---cost:0.12780531249704044
Iterations:18000---cost:0.13313835960940024
Iterations:20000---cost:0.057367278096741206
Iterations:22000---cost:0.09418080799882553
Iterations:24000---cost:0.024817611835795575
Iterations:26000---cost:0.01912050781220849
Iterations:28000---cost:0.009494663286156817
Training accuracy: 1.0
Test accuracy: 0.935

Next, the results for the model with L2 regularization (lambd = 0.7):

Iterations:0---cost:0.8759553784922353
Iterations:2000---cost:0.3653597750925469
Iterations:4000---cost:0.3330295441809763
Iterations:6000---cost:0.30973576048901186
Iterations:8000---cost:0.28918653965297925
Iterations:10000---cost:0.2746114296324272
Iterations:12000---cost:0.26338130156127554
Iterations:14000---cost:0.2546441027617421
Iterations:16000---cost:0.2477042933674601
Iterations:18000---cost:0.241885525892029
Iterations:20000---cost:0.23707870137611828
Iterations:22000---cost:0.23331474815492242
Iterations:24000---cost:0.23046153380456355
Iterations:26000---cost:0.2282299050072628
Iterations:28000---cost:0.22629647982079956
Training accuracy: 0.9478672985781991
Test accuracy: 0.955

Although the training accuracy drops, the test accuracy rises, and the learned decision boundary is much simpler, i.e. the model's complexity is greatly reduced.

4. Python implementation

import numpy as np
import matplotlib.pyplot as plt
from load_datasets import load_2D_dataset   # local helper module that loads the 2D toy dataset

def relu(x):
    return np.maximum(x, 0)

def sigmoid(x):
    # numerically stable sigmoid, computed element-wise to avoid overflow in exp
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0/(1+z), z/(1+z))
def plot_decision_boundary(W, b, X, Y):
    x_min, x_max = X[0,:].min(), X[0,:].max()  # plotting range
    y_min, y_max = X[1,:].min(), X[1,:].max()
    step = 0.01   # grid resolution
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))  # build a grid
    plot_samples = np.array([xx.ravel(), yy.ravel()])
    A = plot_samples.copy()
    # forward-propagate every grid point through the trained network
    for l in range(1, len(W)):
        Z = np.dot(W[l], A) + b[l]
        if l == len(W)-1:
            A = sigmoid(Z)
        else:
            A = relu(Z)
    A[A > 0.5] = 1
    A[A <= 0.5] = 0
    A = A.reshape(xx.shape)
    plt.contourf(xx, yy, A, cmap=plt.cm.Spectral)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.scatter(X[0,:], X[1,:], c=Y[0,:])
    plt.show()
# load the training and test data
train_x, train_y, test_x, test_y = load_2D_dataset()
# basic hyperparameters
nTrain = train_x.shape[1]
nTest = test_x.shape[1]
Iterations = 30000                               # number of iterations
Layers = [train_x.shape[0], 50, 35, 20, 10, 1]   # network architecture
nLayers = len(Layers) - 1
alpha = 0.02    # learning rate
lambd = 0.7     # L2 regularization coefficient (set to 0 to reproduce the unregularized run in section 3)
# initialize weights (scaled by 1/sqrt(fan-in)) and biases
W = [[] for l in range(nLayers+1)]
b = [[] for l in range(nLayers+1)]
for l in range(1, nLayers+1):
    W[l] = np.random.randn(Layers[l], Layers[l-1])/np.sqrt(Layers[l-1])
    b[l] = np.zeros((Layers[l], 1))
    print(W[l].shape)
    print(b[l].shape)
dW = W.copy()
db = b.copy()
# initialize caches for pre-activations and activations
A = [[] for l in range(nLayers+1)]
Z = [[] for l in range(nLayers+1)]
for l in range(1, nLayers+1):
    A[l] = np.zeros((Layers[l], nTrain))
    Z[l] = np.zeros((Layers[l], nTrain))
    print(A[l].shape)
    print(Z[l].shape)
dA = A.copy()
dZ = Z.copy()
A[0] = train_x
cost = []
# training loop
for i in range(Iterations):
    # forward propagation
    for l in range(1, nLayers+1):
        Z[l] = np.dot(W[l], A[l-1]) + b[l]
        if l == nLayers:
            A[l] = sigmoid(Z[l])   # sigmoid on the output layer
        else:
            A[l] = relu(Z[l])      # relu on the hidden layers
    dZ[nLayers] = (A[nLayers] - train_y)/nTrain
    # backward propagation; lambd/nTrain*W[l] is the gradient of the L2 penalty
    for l in np.arange(nLayers, 0, -1):
        dW[l] = np.dot(dZ[l], A[l-1].T) + lambd/nTrain*W[l]
        db[l] = np.sum(dZ[l], axis=1, keepdims=True)
        if l > 1:
            dA[l-1] = np.dot(W[l].T, dZ[l])
            dZ[l-1] = dA[l-1].copy()
            dZ[l-1][Z[l-1] < 0] = 0   # relu derivative
    # gradient-descent update
    for l in range(1, nLayers+1):
        W[l] = W[l] - alpha*dW[l]
        b[l] = b[l] - alpha*db[l]
    if i % 2000 == 0:
        # sum of squared weights over all layers, for the L2 penalty in the cost
        W_sum = 0
        for l in range(1, nLayers+1):
            W_sum += np.sum(W[l]**2)
        # add a small 0.0001 offset inside log to avoid log(0)
        cur_cost = -np.sum(train_y*np.log(A[nLayers]+0.0001) + (1-train_y)*np.log(1-A[nLayers]+0.0001))/nTrain + (lambd/nTrain/2)*W_sum
        cost.append(cur_cost)
        print("Iterations:"+str(i)+"---cost:"+str(cur_cost))
train_err = np.sum(A[nLayers][train_y==1] < 0.5) + np.sum(A[nLayers][train_y==0] > 0.5)   # number of training errors
print("Training accuracy: " + str(1 - train_err/nTrain))
# prediction on the test set
A_predict = test_x
for l in range(1, nLayers+1):
    Z_predict = np.dot(W[l], A_predict) + b[l]
    if l == nLayers:
        A_predict = sigmoid(Z_predict)
    else:
        A_predict = relu(Z_predict)
test_err = np.sum(A_predict[test_y==1] < 0.5) + np.sum(A_predict[test_y==0] > 0.5)
print("Test accuracy: " + str(1 - test_err/nTest))

plt.plot(cost)    # cost sampled every 2000 iterations
plt.show()
plot_decision_boundary(W, b, train_x, train_y)

Reposted from blog.csdn.net/qq1440643730/article/details/103975768