import numpy as np

def zeros_initializations(layer_dims):
    """
    Arguments:
    layer_dims -- list containing the size of each layer

    Returns:
    parameters -- dictionary containing the parameters of each layer {'W1', 'b1', ..., 'WL', 'bL'}
    """
    parameters = {}
    L = len(layer_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.zeros((layer_dims[l], layer_dims[l - 1]))
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
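A minimal usage sketch; the layer sizes below are only an illustrative example:

# Hypothetical network: 2 input features, one hidden layer of 4 units, 1 output unit
params = zeros_initializations([2, 4, 1])
print(params['W1'].shape)   # (4, 2)
print(params['b1'].shape)   # (4, 1)
print(params['W2'].shape)   # (1, 4)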
Random initialization
def random_initializations(layer_dims):
    """
    Arguments:
    layer_dims -- list containing the size of each layer

    Returns:
    parameters -- dictionary containing the parameters of each layer {'W1', 'b1', ..., 'WL', 'bL'}
    """
    parameters = {}
    L = len(layer_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
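A quick sketch of why random initialization matters: with all-zero weights every hidden unit computes the same thing, while random weights break that symmetry. The seed and layer sizes are assumptions for the sketch only:

np.random.seed(3)                                  # assumed seed, just to make the sketch reproducible
zero_params = zeros_initializations([2, 4, 1])
rand_params = random_initializations([2, 4, 1])
print(np.all(zero_params['W1'] == 0))                             # True: every hidden unit starts identical
print(np.allclose(rand_params['W1'][0], rand_params['W1'][1]))    # False: rows differ, symmetry is broken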
He initialization
def he_initialize(layers_dims):
    """
    Arguments:
    layers_dims -- list containing the size of each layer

    Returns:
    parameters -- dictionary containing the parameters of each layer {'W1', 'b1', ..., 'WL', 'bL'}
    """
    parameters = {}
    L = len(layers_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2. / layers_dims[l - 1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
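A small sanity check of the He scaling: for a layer with n_prev inputs, the weights should have a standard deviation close to sqrt(2 / n_prev). The seed and layer sizes are illustrative assumptions:

np.random.seed(1)                      # assumed seed for a reproducible sketch
params = he_initialize([500, 300, 1])
print(params['W1'].std())              # roughly 0.063
print(np.sqrt(2. / 500))               # sqrt(2 / n_prev) ≈ 0.0632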
Regularization
L2 regularization
# GRADED FUNCTION: compute_cost_with_regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization. See formula (2) above.

    Arguments:
    A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model
    lambd -- regularization hyperparameter, scalar

    Returns:
    cost -- value of the regularized loss function (formula (2))
    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = compute_cost(A3, Y)  # This gives you the cross-entropy part of the cost

    ### START CODE HERE ### (approx. 1 line)
    L2_regularization_cost = (1. / m) * (lambd / 2) * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
    ### END CODE HERE ###

    cost = cross_entropy_cost + L2_regularization_cost
    return cost
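A quick numeric check of the penalty term alone, on toy values that are not from the assignment:

# Illustrative only: L2 penalty for one toy weight matrix, lambd = 0.7, m = 5 examples
lambd, m = 0.7, 5
W1_toy = np.array([[1., -2.], [0.5, 3.]])
l2_term = (1. / m) * (lambd / 2) * np.sum(np.square(W1_toy))
print(l2_term)   # (0.7 / 10) * (1 + 4 + 0.25 + 9) = 0.9975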
Gradient descent
# GRADED FUNCTION: backward_propagation_with_regularization
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Implements the backward propagation of our baseline model to which we added an L2 regularization.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation()
    lambd -- regularization hyperparameter, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    ### START CODE HERE ### (approx. 1 line)
    dW3 = 1. / m * (np.dot(dZ3, A2.T) + lambd * W3)
    ### END CODE HERE ###
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW2 = 1. / m * (np.dot(dZ2, A1.T) + lambd * W2)
    ### END CODE HERE ###
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW1 = 1. / m * (np.dot(dZ1, X.T) + lambd * W1)
    ### END CODE HERE ###
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    return gradients
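A minimal sketch of the gradient-descent update that would consume these gradients; update_parameters and the learning rate are assumptions for illustration, not part of the assignment code above:

def update_parameters(parameters, gradients, learning_rate=0.01):
    # Hypothetical helper: one gradient-descent step, W := W - alpha * dW and b := b - alpha * db
    L = len(parameters) // 2   # number of layers with W/b pairs
    for l in range(1, L + 1):
        parameters['W' + str(l)] -= learning_rate * gradients['dW' + str(l)]
        parameters['b' + str(l)] -= learning_rate * gradients['db' + str(l)]
    return parameters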