Regularization solves overfitting

This article compares three setups side by side: no regularization, L2 regularization, and dropout regularization.

First, here is reg_utils.py, which contains the helper functions needed for forward and backward propagation, loading the data, and plotting:

# -*- coding: utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio

def sigmoid(x):
    """
    Apply the logistic sigmoid function element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- 1 / (1 + exp(-x)), with the same shape as x
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
 
def relu(x):
    """
    Apply the rectified linear unit element-wise.

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    s -- max(0, x), with the same shape as x
    """
    return np.maximum(0, x)

def initialize_parameters(layer_dims):
    """
    Create the initial weights and biases of a fully connected network.

    Each weight matrix is drawn from a standard normal distribution and divided
    by sqrt(layer_dims[l-1]); each bias vector starts at zero. The global NumPy
    random generator is re-seeded with 3 on every call, so calling this function
    twice with the same layer_dims returns identical values.

    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network,
                  input layer first

    Returns:
    parameters -- python dictionary containing the parameters "W1", "b1", ..., "WL", "bL"
                  (L = len(layer_dims) - 1):
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)

    Example:
    - layer_dims = [2, 2, 1] gives W1 of shape (2, 2), b1 of shape (2, 1),
      W2 of shape (1, 2) and b2 of shape (1, 1).
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network, counting the input layer

    for l in range(1, L):
        n_prev, n_curr = layer_dims[l - 1], layer_dims[l]
        W = np.random.randn(n_curr, n_prev) / np.sqrt(n_prev)
        b = np.zeros((n_curr, 1))

        # Compare each shape against a real tuple. Writing `assert(cond, other)`
        # asserts a non-empty tuple, which is always true and checks nothing.
        assert W.shape == (n_curr, n_prev)
        assert b.shape == (n_curr, 1)

        parameters['W' + str(l)] = W
        parameters['b' + str(l)] = b

    return parameters

def forward_propagation(X, parameters):
    """
    Run the forward pass of the three-layer network
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    parameters -- python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    a3 -- output of the final sigmoid activation
    cache -- tuple (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) of the
             intermediate values and parameters used by this pass
    """
    # retrieve parameters layer by layer
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    W3, b3 = parameters["W3"], parameters["b3"]

    # hidden layer 1: LINEAR -> RELU
    z1 = W1.dot(X) + b1 if hasattr(W1, "dot") else np.dot(W1, X) + b1
    a1 = relu(z1)

    # hidden layer 2: LINEAR -> RELU
    z2 = np.dot(W2, a1) + b2
    a2 = relu(z2)

    # output layer: LINEAR -> SIGMOID
    z3 = np.dot(W3, a2) + b3
    a3 = sigmoid(z3)

    return a3, (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
 
def compute_cost(a3, Y):
    """
    Compute the average cross-entropy cost.

    Arguments:
    a3 -- post-activation, output of forward propagation
    Y -- "true" labels vector, same shape as a3

    Returns:
    cost - value of the cost function; NaN terms are ignored by the sum
    """
    n_examples = Y.shape[1]

    positive_term = (-np.log(a3)) * Y
    negative_term = (-np.log(1 - a3)) * (1 - Y)
    # nansum skips NaN entries (e.g. from 0 * log(0)) instead of propagating them
    total = np.nansum(positive_term + negative_term)

    return 1./n_examples * total

def backward_propagation(X, Y, cache):
    """
    Run the backward pass of the three-layer network (no regularization).

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- true "label" vector, same shape as the network output a3
    cache -- cache output from forward_propagation()

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter,
                 activation and pre-activation variable. The 1/m factor is folded
                 into dz3, so every downstream gradient is already averaged.
    """
    m = X.shape[1]
    (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache

    # output layer
    dz3 = 1./m * (a3 - Y)
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)

    # second hidden layer: the gradient flows only where the ReLU output was positive
    da2 = np.dot(W3.T, dz3)
    relu_mask2 = np.int64(a2 > 0)
    dz2 = da2 * relu_mask2
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)

    # first hidden layer
    da1 = np.dot(W2.T, dz2)
    relu_mask1 = np.int64(a1 > 0)
    dz1 = da1 * relu_mask1
    dW1 = np.dot(dz1, X.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    return dict(
        dz3=dz3, dW3=dW3, db3=db3,
        da2=da2, dz2=dz2, dW2=dW2, db2=db2,
        da1=da1, dz1=dz1, dW1=dW1, db1=db1,
    )

def update_parameters(parameters, grads, learning_rate):
    """
    Apply one step of gradient descent to every weight and bias.

    Arguments:
    parameters -- python dictionary containing your parameters ("W1", "b1", ...)
    grads -- python dictionary containing your gradients ("dW1", "db1", ...)
    learning_rate -- step size multiplied with each gradient

    Returns:
    parameters -- the same dictionary object, with each entry replaced by
                  parameter - learning_rate * gradient
    """
    n_layers = len(parameters) // 2  # one "W" and one "b" entry per layer

    for layer in range(1, n_layers + 1):
        suffix = str(layer)
        w_key = "W" + suffix
        b_key = "b" + suffix
        parameters[w_key] = parameters[w_key] - learning_rate * grads["dW" + suffix]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["db" + suffix]

    return parameters

def load_2D_dataset(is_plot=True):
    """
    Load the 2-D training and test sets from 'datasets/data.mat'.

    The arrays stored under the keys 'X', 'y', 'Xval' and 'yval' are transposed
    before being returned.

    Arguments:
    is_plot -- if true, show a scatter plot of the training points colored by label

    Returns:
    train_X, train_Y, test_X, test_Y -- the transposed 'X', 'y', 'Xval' and 'yval' arrays
    """
    data = sio.loadmat('datasets/data.mat')
    train_X = data['X'].T
    train_Y = data['y'].T
    test_X = data['Xval'].T
    test_Y = data['yval'].T
    if is_plot:
        # Flatten the labels: a 2-D (1, m) array is not accepted as a color
        # sequence by every matplotlib release, a 1-D array always is.
        plt.scatter(train_X[0, :], train_X[1, :], c=train_Y.ravel(), s=40, cmap=plt.cm.Spectral)
        plt.show()

    return train_X, train_Y, test_X, test_Y

def predict(X, y, parameters):
    """
    Predict 0/1 labels with the trained network and print the accuracy against y.

    Arguments:
    X -- data set of examples you would like to label
    y -- true labels; row 0 is compared with the predictions to compute the accuracy
    parameters -- parameters of the trained model

    Returns:
    p -- integer array of shape (1, number of examples) holding the 0/1 predictions for X
    """
    # Forward propagation
    a3, _ = forward_propagation(X, parameters)

    # Convert probabilities to 0/1 predictions in one vectorized step. The builtin
    # `int` replaces the `np.int` alias, which was removed from NumPy.
    p = (a3[0:1, :] > 0.5).astype(int)

    # print results
    print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))

    return p

def plot_decision_boundary(model, X, y):
    """
    Plot the decision regions of `model` on a 2-D grid together with the data points.

    Arguments:
    model -- callable that receives the grid points as an array of shape
             (number of points, 2) and returns one prediction per point
             (any array that can be reshaped to the grid shape)
    X -- data points; row 0 is drawn on the x-axis and row 1 on the y-axis
    y -- labels used to color the data points, one per column of X
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    # Flatten the labels: a 2-D (1, m) array is not accepted as a color sequence
    # by every matplotlib release, a 1-D array always is.
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)
    plt.show()
 
def predict_dec(parameters, X):
    """
    Threshold the network output at 0.5; used for plotting the decision boundary.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data, passed unchanged to forward_propagation()

    Returns
    predictions -- boolean array, True where the network output is greater than 0.5
    """
    probabilities, _ = forward_propagation(X, parameters)
    return probabilities > 0.5

You can first draw the data to see what it looks like:

# Load the training/test split; is_plot=True also shows a scatter plot of the training data.
train_X, train_Y, test_X, test_Y = reg_utils.load_2D_dataset(is_plot=True)

insert image description here
Then start testing the code:

no regularization

First, we train without any regularization: the lambd parameter (spelled without the final "a" so that it does not clash with the Python keyword lambda) and keep_prob keep their default values of 0 and 1, which means that neither kind of regularization is used.

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import reg_utils  


plt.rcParams['figure.figsize'] = (7.0, 4.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load the dataset (without plotting it)
train_X, train_Y, test_X, test_Y = reg_utils.load_2D_dataset(is_plot=False)

def model(X, Y, learning_rate=0.3, num_iterations=30000, print_cost=True, is_plot=True, lambd=0, keep_prob=1):
    """
    Train a three-layer neural network: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.

    Arguments:
        X - input data, of shape (number of features, number of examples)
        Y - labels, 0 (blue) | 1 (red), of shape (1, number of examples)
        learning_rate - learning rate of gradient descent
        num_iterations - number of gradient descent iterations
        print_cost - whether to print the cost; it is printed every 10000 iterations
                     and recorded every 1000 iterations
        is_plot - whether to plot the recorded cost curve after training
        lambd - L2 regularization hyperparameter, a real number; 0 disables L2 regularization
        keep_prob - probability of keeping a node during dropout, in (0, 1]; 1 disables dropout
    Returns:
        parameters - the learned parameters
    Raises:
        ValueError - if keep_prob is not in (0, 1]
        AssertionError - if L2 regularization and dropout are requested together
    """
    # Validate once, before any work is done. The original in-loop `exit` was a
    # bare name (never called), so an invalid keep_prob fell through to an
    # unrelated UnboundLocalError.
    if not 0 < keep_prob <= 1:
        print("keep_prob参数错误!程序退出。")
        raise ValueError("keep_prob must be in the interval (0, 1], got " + str(keep_prob))

    # L2 regularization and dropout could be combined, but this experiment
    # never uses both at the same time.
    assert (lambd == 0 or keep_prob == 1)

    use_dropout = keep_prob < 1
    use_l2 = lambd != 0

    costs = []
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialize the parameters
    parameters = reg_utils.initialize_parameters(layers_dims)

    # Training loop
    for i in range(0, num_iterations):
        # Forward propagation, with or without randomly dropped nodes
        if use_dropout:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)
        else:
            a3, cache = reg_utils.forward_propagation(X, parameters)

        # Cost, with or without the L2 penalty
        if use_l2:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)
        else:
            cost = reg_utils.compute_cost(a3, Y)

        # Backward propagation matching the forward pass / cost that was used
        if use_l2:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif use_dropout:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)
        else:
            grads = reg_utils.backward_propagation(X, Y, cache)

        # Update the parameters
        parameters = reg_utils.update_parameters(parameters, grads, learning_rate)

        # Record and print the cost
        if i % 1000 == 0:
            costs.append(cost)
            if (print_cost and i % 10000 == 0):
                print("第" + str(i) + "次迭代,成本值为:" + str(cost))

    # Optionally plot the cost curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (x1,000)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    # Return the learned parameters
    return parameters

# Train the model to obtain the final parameters, then report the accuracy
# on the training set and on the test set.
parameters = model(train_X, train_Y, is_plot=True)
print("训练集:")
predictions_train = reg_utils.predict(train_X, train_Y, parameters)
print("测试集:")
predictions_test = reg_utils.predict(test_X, test_Y, parameters)

The result after running is as follows:

第0次迭代,成本值为:0.6557412523481002
第10000次迭代,成本值为:0.16329987525724213
第20000次迭代,成本值为:0.13851642423265018
训练集:
Accuracy: 0.9478672985781991
测试集:
Accuracy: 0.915

insert image description here
This result looks reasonable (because of the data set, the signs of overfitting are not obvious from the accuracies alone); plotting the decision boundary makes them much easier to see:

# Draw the decision boundary of the trained model over the training data.
plt.title("Model without regularization")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
# The grid points arrive as rows, so they are transposed (x.T) before being fed to predict_dec.
reg_utils.plot_decision_boundary(lambda x: reg_utils.predict_dec(parameters, x.T), train_X, train_Y)

The results of the operation are as follows:
insert image description here
It can be clearly seen that the model is overfitting: it has learned a few local features of the training data too closely.
Next, experiment with the effect of introducing regularization.

Use L2 regularization

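The L2-regularized cost is as follows (L2 regularization shows up mainly in the loss formula):
insert image description here
$$J_{regularized} = \underbrace{-\frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\left(1-a^{[3](i)}\right)\right)}_{\text{cross-entropy cost}} + \underbrace{\frac{\lambda}{2m}\sum_{l}\sum_{k}\sum_{j}\left(W_{k,j}^{[l]}\right)^{2}}_{\text{L2 regularization cost}}$$
The L2 regularization cost is the sum of the squares of the weights of every layer, scaled by $\frac{\lambda}{2m}$; in code, the sum of squares of one layer is np.sum(np.square(Wl)).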
In backpropagation, every weight gradient gains an extra term: $dW^{[l]} = (\text{from backprop}) + \frac{\lambda}{m} W^{[l]}$, where $(\text{from backprop})$ is the $dW^{[l]}$ computed without regularization.

When updating parameters: $W^{[l]} = W^{[l]} - \alpha \, dW^{[l]}$
Substituting the gradient and collecting like terms gives: $W^{[l]} = \left(1 - \frac{\alpha\lambda}{m}\right) W^{[l]} - \alpha \cdot (\text{from backprop})$
It can be seen from the update formula that L2 regularization uses the regularization parameter $\lambda$ to shrink the weights of the network at every step (weight decay), which weakens the influence of many neurons and thereby reduces overfitting.
Add the following code to calculate the loss and reverse gradient of L2 regularization:

def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Compute the cost with the L2 regularization penalty added.

    Arguments:
        A3 - output of forward propagation, of shape (number of output nodes, number of examples)
        Y - label vector matching the data, of shape (number of output nodes, number of examples)
        parameters - dictionary containing the model parameters; "W1", "W2" and "W3" are read
        lambd - L2 regularization hyperparameter
    Returns:
        cost - cross-entropy cost plus lambd * (sum of squared weights of all layers) / (2 * m)

    """
    m = Y.shape[1]
    weights = (parameters["W1"], parameters["W2"], parameters["W3"])

    # Cost without regularization
    cross_entropy_cost = reg_utils.compute_cost(A3, Y)

    # L2 penalty: lambd times the summed squares of every weight matrix, over 2m
    squared_weight_total = sum(np.sum(np.square(W)) for W in weights)
    L2_regularization_cost = lambd * squared_weight_total / (2 * m)

    return cross_entropy_cost + L2_regularization_cost

# Because the cost function has changed, backward propagation has to change too:
# every gradient must be computed with respect to the new cost.
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Run the backward pass of the model with the L2 regularization term added.

    Arguments:
        X - input dataset, of shape (number of input nodes, number of examples)
        Y - labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation()
        lambd - regularization hyperparameter, a real number

    Returns:
        gradients - dictionary with the gradients with respect to each parameter,
                    activation and pre-activation variable
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache
    scale = 1 / m

    # Output layer; the first term of each dW is the unregularized gradient,
    # the second term comes from the L2 penalty.
    dZ3 = A3 - Y
    dW3 = scale * np.dot(dZ3, A2.T) + ((lambd * W3) / m)
    db3 = scale * np.sum(dZ3, axis=1, keepdims=True)

    # Second hidden layer
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2 = scale * np.dot(dZ2, A1.T) + ((lambd * W2) / m)
    db2 = scale * np.sum(dZ2, axis=1, keepdims=True)

    # First hidden layer
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1 = scale * np.dot(dZ1, X.T) + ((lambd * W1) / m)
    db1 = scale * np.sum(dZ1, axis=1, keepdims=True)

    return dict(
        dZ3=dZ3, dW3=dW3, db3=db3, dA2=dA2,
        dZ2=dZ2, dW2=dW2, db2=db2, dA1=dA1,
        dZ1=dZ1, dW1=dW1, db1=db1,
    )

Add the lambd parameter when calling the model function:

# Train with L2 regularization enabled (lambd != 0); keep_prob stays at its default of 1.
parameters = model(train_X, train_Y, lambd=0.7,is_plot=True)

The result of running the code is as follows:

第0次迭代,成本值为:0.6974484493131264
第10000次迭代,成本值为:0.2684918873282239
第20000次迭代,成本值为:0.2680916337127301
训练集:
Accuracy: 0.9383886255924171
测试集:
Accuracy: 0.93

Loss trend curve:
insert image description here
drawing decision boundaries:
the title here can be changed:

# Title for the decision-boundary plot of the L2-regularized model.
plt.title("Model with L2-regularization")

insert image description here
It can be seen that there is almost no gap between the accuracy of the training set and the test set, or that it is smaller than the gap without regularization. From the drawing boundary, it can be seen that there are no over-fitting features.
L2 regularization will make the decision boundary smoother. But be careful, if λ is too large, it may also be "over-smoothed", resulting in high bias in the model, which becomes an under-fitting state.

Regularize with dropout

The principle is to choose a probability keep_prob for a given layer and to randomly deactivate each node of that layer with probability 1 - keep_prob. The deactivated nodes take no part in forward or backward propagation during the current iteration; that is, their parameters are not updated in this round of training, while the parameters of the nodes that stay active are updated. Assuming dropout is applied to the third layer, the following three steps are performed during forward propagation:

  1. d3 = np.random.rand(a3.shape[0], a3.shape[1]) < keep_prob. This creates a random matrix with the same shape as a3 and compares each value with keep_prob: entries smaller than keep_prob become True (treated as 1 in arithmetic), and all other entries become False (treated as 0).
  2. a3 = np.multiply(a3, d3). Multiplying by d3 zeroes the deactivated nodes (a fraction of about 1 - keep_prob), so they do not take part in the calculation.
  3. a3 /= keep_prob. This rescaling keeps the expected value of the activations approximately unchanged when the cost is computed; the technique is called inverted dropout.
    The following two steps are required for backpropagation (again assuming dropout on the third layer):
  4. dA3 = dA3 * D3. The nodes that were dropped in forward propagation are dropped here too, so no gradient flows through them.
  5. dA3 /= keep_prob. Apply the same scaling as in the forward pass to keep the expected value approximately unchanged.
    Add the following code for forward and backward propagation with dropout:
def forward_propagation_with_dropout(X, parameters, keep_prob=0.5, seed=1):
    """
    Run the forward pass with inverted dropout applied to both hidden layers.
    LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Arguments:
        X  - input dataset, of shape (number of features, number of examples)
        parameters - python dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
        keep_prob  - probability of KEEPING a node (not of dropping it), a real number
        seed - value used to re-seed the global NumPy random generator before the
               masks are drawn. The default of 1 reproduces the original behavior:
               every call draws exactly the same masks. Pass None to leave the
               generator untouched, so that each call draws fresh masks.
    Returns:
        A3  - final activation, output of the forward pass (one column per example)
        cache - tuple (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) of the
                values needed by backward_propagation_with_dropout()
    """
    if seed is not None:
        # NOTE: re-seeding on every call makes the dropout masks identical across
        # training iterations; this is kept as the default for reproducibility.
        np.random.seed(seed)

    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = reg_utils.relu(Z1)

    D1 = np.random.rand(A1.shape[0], A1.shape[1])
    D1 = D1 < keep_prob  # step 1: boolean mask, True with probability keep_prob
    A1 = A1 * D1  # step 2: zero the dropped nodes
    A1 = A1 / keep_prob  # step 3: rescale so the expected activation is unchanged

    Z2 = np.dot(W2, A1) + b2
    A2 = reg_utils.relu(Z2)

    D2 = np.random.rand(A2.shape[0], A2.shape[1])
    D2 = D2 < keep_prob  # step 1
    A2 = A2 * D2  # step 2
    A2 = A2 / keep_prob  # step 3

    Z3 = np.dot(W3, A2) + b3
    A3 = reg_utils.sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache

def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Run the backward pass of the model trained with dropout.

    Arguments:
        X  - input dataset; its second dimension is the number of examples
        Y  - labels, of shape (number of output nodes, number of examples)
        cache - cache output from forward_propagation_with_dropout()
        keep_prob  - the keep probability that was used in the forward pass, a real number

    Returns:
        gradients - dictionary with the gradients with respect to each parameter,
                    activation and pre-activation variable
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    # Output layer (no dropout was applied here)
    dZ3 = A3 - Y
    dW3 = (1 / m) * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    # Second hidden layer: re-apply the forward mask (step 1), then the same
    # scaling as in the forward pass (step 2)
    dA2 = np.dot(W3.T, dZ3)
    dA2 = dA2 * D2 / keep_prob
    dZ2 = dA2 * np.int64(A2 > 0)
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    # First hidden layer: same two steps with its own mask
    dA1 = np.dot(W2.T, dZ2)
    dA1 = dA1 * D1 / keep_prob
    dZ1 = dA1 * np.int64(A1 > 0)
    dW1 = 1. / m * np.dot(dZ1, X.T)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    return dict(
        dZ3=dZ3, dW3=dW3, db3=db3, dA2=dA2,
        dZ2=dZ2, dW2=dW2, db2=db2, dA1=dA1,
        dZ1=dZ1, dW1=dW1, db1=db1,
    )

When calling the model function, add the keep_prob parameter and set it to 0.86, that is, 14% of the nodes in the first and second layers will not participate in the calculation in each iteration:

# Train with dropout enabled (keep_prob < 1); lambd stays at its default of 0.
parameters = model(train_X, train_Y, keep_prob=0.86, learning_rate=0.3,is_plot=True)

The result of running the code is as follows:

第10000次迭代,成本值为:0.061016986574905605
第20000次迭代,成本值为:0.060582435798513114
训练集:
Accuracy: 0.9289099526066351
测试集:
Accuracy: 0.95

insert image description here
The title can be changed here:

# Title for the decision-boundary plot of the dropout model.
plt.title("Model with dropout")

insert image description here
It can be seen that the accuracy of the training set is slightly reduced by using dropout, but the accuracy of the test set is improved and the generalization ability is improved, which is still very successful.

The reason why dropout prevents overfitting: no neuron can rely too heavily on any single input feature, because any feature may be zeroed out in a given iteration.
Note that dropout is not used in the test phase, because it is necessary to ensure the stability of the test results.

Guess you like

Origin blog.csdn.net/weixin_45354497/article/details/130600757