- Improving Deep Neural Networks
- 加强深度网络性能的一些技巧
- 参数初始化技巧
- 正则化技巧,l2, dropout
- 梯度检验
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
import sklearn
import sklearn.datasets
import copy
# IPython/Jupyter magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline
# Global matplotlib defaults used by every plot in this notebook.
plt.rcParams['figure.figsize'] = (7.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
def func_sigmoid(Z):
    """
    Implement the sigmoid activation in numpy.

    Arguments:
    Z -- numpy array of any shape

    Returns:
    A -- output of sigmoid(Z), same shape as Z
    cache -- Z itself, kept for the backward pass
    """
    # exp(-|Z|) always lies in (0, 1], so neither branch below can overflow,
    # unlike the naive 1 / (1 + exp(-Z)) for large negative Z.
    e = np.exp(-np.abs(Z))
    A = np.where(np.asarray(Z) >= 0, 1 / (1 + e), e / (1 + e))
    cache = Z
    return A, cache
def func_relu(Z):
    """
    Implement the RELU function.

    Arguments:
    Z -- output of the linear layer, numpy array of any shape

    Returns:
    A -- post-activation value, same shape as Z
    cache -- Z itself (not a dictionary), kept for the backward pass
    """
    A = np.maximum(0, Z)
    assert A.shape == Z.shape
    cache = Z
    return A, cache
def func_relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- the pre-activation Z stored during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z
    """
    Z = cache
    # Work on a copy so the caller's dA is left untouched.
    dZ = np.array(dA, copy=True)
    # The RELU derivative is 0 wherever Z <= 0.
    dZ[Z <= 0] = 0
    assert dZ.shape == Z.shape
    return dZ
def func_sigmoid_backward(dA, cache):
    """
    Implement the backward propagation for a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- the pre-activation Z stored during the forward pass

    Returns:
    dZ -- gradient of the cost with respect to Z
    """
    Z = cache
    # Overflow-free sigmoid: exp(-|Z|) always lies in (0, 1].
    e = np.exp(-np.abs(Z))
    s = np.where(np.asarray(Z) >= 0, 1 / (1 + e), e / (1 + e))
    dZ = dA * s * (1 - s)
    assert dZ.shape == Z.shape
    return dZ
def func_L_layers_initialize_parameters(layer_dims, seed=1):
    """
    Initialize the parameters of an L-layer network.

    :param layer_dims: python array (list) containing the dimensions of each layer in our network
    :param seed: random seed; the same seed always produces the same weights
    :return: parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
             Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
             bl -- bias vector of shape (layer_dims[l], 1)
    """
    # A private generator honours `seed` without disturbing the global numpy RNG.
    rng = np.random.RandomState(seed)
    parameters = {}
    L = len(layer_dims)
    for layer in range(1, L):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]
        # Scaling by 1/sqrt(n_in) matters: with a plain * 0.01 the cost does not go down.
        parameters['W' + str(layer)] = rng.randn(n_out, n_in) / np.sqrt(n_in)
        parameters['b' + str(layer)] = np.zeros((n_out, 1))
        assert parameters['W' + str(layer)].shape == (n_out, n_in)
        assert parameters['b' + str(layer)].shape == (n_out, 1)
    return parameters
def func_linear_forward(A, W, b):
    """
    Linear part of a layer's forward propagation: Z = W.A + b.

    :param A: activations of the previous layer (or the input data)
    :param W: weight matrix of the current layer
    :param b: bias vector of the current layer
    :return: (Z, cache)
             Z -- the input of the activation function, also called pre-activation parameter
             cache -- the tuple (A, W, b), stored for computing the backward pass efficiently
    """
    Z = np.dot(W, A) + b
    assert Z.shape == (W.shape[0], A.shape[1])
    cache = (A, W, b)
    return Z, cache
def func_linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer.

    :param A_prev: activations from previous layer (or input data): (size of previous layer, number of examples)
    :param W: weights matrix: numpy array of shape (size of current layer, size of previous layer)
    :param b: bias vector, numpy array of shape (size of the current layer, 1)
    :param activation: the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    :return: (A, cache)
             A -- the output of the activation function, also called the post-activation value
             cache -- the tuple (linear_cache, activation_cache), stored for the backward pass
    :raises ValueError: if activation is neither "sigmoid" nor "relu"
    """
    Z, linear_cache = func_linear_forward(A_prev, W, b)
    if activation == 'sigmoid':
        A, activation_cache = func_sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = func_relu(Z)
    else:
        # Only unknown activation names may reach the error.
        raise ValueError('activation param')
    assert A.shape == (W.shape[0], A_prev.shape[1])
    cache = (linear_cache, activation_cache)
    return A, cache
def func_L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation.

    :param X: data, numpy array of shape (input size, number of examples)
    :param parameters: dictionary with "W1", "b1", ..., "WL", "bL"
    :return: (AL, caches)
             AL -- last post-activation value, shape (1, number of examples)
             caches -- list with one cache per layer:
                       the relu layers at indices 0 .. L-2 and the sigmoid layer at index L-1
    """
    caches = []
    A = X
    L = len(parameters) // 2  # each layer contributes one W and one b
    # Hidden layers: LINEAR -> RELU.
    for layer in range(1, L):
        W, b = parameters['W' + str(layer)], parameters['b' + str(layer)]
        A, cache = func_linear_activation_forward(A, W, b, 'relu')
        # The backward pass needs every layer's cache.
        caches.append(cache)
    # Output layer: LINEAR -> SIGMOID.
    W, b = parameters['W' + str(L)], parameters['b' + str(L)]
    AL, cache = func_linear_activation_forward(A, W, b, 'sigmoid')
    caches.append(cache)
    assert AL.shape == (1, X.shape[1])
    return AL, caches
compute cost
def func_compute_cost(AL, Y):
    """
    Compute the cross-entropy cost.

    :param AL: probability vector corresponding to the label predictions, shape (1, number of examples)
    :param Y: true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)
    :return: cost -- cross-entropy cost, a 0-d numpy value
    """
    m = Y.shape[1]
    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    AL = np.clip(AL, 1e-10, 1 - 1e-10)
    cost = (1. / m) * (-np.dot(Y, np.log(AL).T) - np.dot(1 - Y, np.log(1 - AL).T))
    # Turn the (1, 1) result into a scalar-shaped value (e.g. [[17]] -> 17).
    cost = np.squeeze(cost)
    assert cost.shape == ()
    return cost
def func_linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l).

    :param dZ: gradient of the cost with respect to the linear output (of current layer l)
    :param cache: tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
    :return: (dA_prev, dW, db)
             dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
             dW -- gradient of the cost with respect to W (current layer l), same shape as W
             db -- gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = 1. / m * np.dot(dZ, A_prev.T)
    # axis=1 sums over the examples, giving one value per unit.
    db = 1. / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    assert dW.shape == W.shape
    assert db.shape == b.shape
    assert dA_prev.shape == A_prev.shape
    return dA_prev, dW, db
def func_linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    :param dA: post-activation gradient for current layer l
    :param cache: tuple of values (linear_cache, activation_cache) stored during the forward pass
    :param activation: the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    :return: (dA_prev, dW, db)
             dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
             dW -- gradient of the cost with respect to W (current layer l), same shape as W
             db -- gradient of the cost with respect to b (current layer l), same shape as b
    :raises ValueError: if activation is neither "relu" nor "sigmoid"
    """
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = func_relu_backward(dA, activation_cache)
    elif activation == 'sigmoid':
        dZ = func_sigmoid_backward(dA, activation_cache)
    else:
        # Only unknown activation names may reach the error.
        raise ValueError('activation param')
    dA_prev, dW, db = func_linear_backward(dZ, linear_cache)
    return dA_prev, dW, db
def func_L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group.

    :param AL: probability vector, output of the forward propagation
    :param Y: true "label" vector (containing 0 if non-cat, 1 if cat)
    :param caches: list of caches containing:
                   every relu-layer cache (caches[l] for l = 0 ... L-2)
                   the sigmoid-layer cache (caches[L-1])
    :return: grads -- a dictionary with the gradients
             grads["dA" + str(l)] for l = 0 ... L-1
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1 ... L
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)
    # Keep AL strictly inside (0, 1), matching the clipping in the cost, so the
    # divisions are always defined. np.divide(..., where=mask) without `out=`
    # would leave the masked entries uninitialised.
    AL = np.clip(AL, 1e-10, 1 - 1e-10)
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (sigmoid).
    cur_cache = caches[L - 1]
    grads['dA' + str(L - 1)], grads['dW' + str(L)], grads['db' + str(L)] = \
        func_linear_activation_backward(dAL, cur_cache, activation='sigmoid')

    # Hidden layers (relu), from the last hidden layer back to the first.
    for layer in reversed(range(L - 1)):
        cur_cache = caches[layer]
        dA_prev_tmp, dW_tmp, db_tmp = func_linear_activation_backward(
            grads['dA' + str(layer + 1)], cur_cache, activation='relu')
        grads['dA' + str(layer)] = dA_prev_tmp
        grads['dW' + str(layer + 1)] = dW_tmp
        grads['db' + str(layer + 1)] = db_tmp
    return grads
update parameters
def func_update_parameters(parameters, grads, lr):
    """
    Update parameters using gradient descent.

    :param parameters: python dictionary containing the parameters "W1", "b1", ..., "WL", "bL"
    :param grads: python dictionary containing the gradients "dW1", "db1", ..., "dWL", "dbL"
    :param lr: learning rate
    :return: parameters -- the same dictionary, with every entry replaced by its updated value
    """
    L = len(parameters) // 2  # each layer contributes one W and one b
    for layer in range(1, L + 1):
        w_key, b_key = 'W' + str(layer), 'b' + str(layer)
        parameters[w_key] = parameters[w_key] - lr * grads['d' + w_key]
        parameters[b_key] = parameters[b_key] - lr * grads['d' + b_key]
    return parameters
def func_predict(X, y, parameters):
    """
    Predict the labels of a dataset with an L-layer neural network and print the accuracy.

    :param X: data set of examples you would like to label
    :param y: true labels, compared element-wise with the predictions
    :param parameters: parameters of the trained model
    :return: p -- predictions for the given dataset X, array of 0./1. values
    """
    m = X.shape[1]
    probas, _ = func_L_model_forward(X, parameters)
    # Threshold at 0.5. The element-by-element loop reset every entry to 0
    # after setting it, because its else branch was missing.
    p = (probas > 0.5).astype(float)
    print('acc: {}'.format(np.sum(p==y)/m))
    return p
dnn model
def func_L_layer_dnn_model(X, Y, layer_dims, lr=0.001, num_epochs=10000, print_cost=False):
    """
    Train an L-layer network with batch gradient descent.

    :param X: input data
    :param Y: true labels
    :param layer_dims: python array (list) containing the dimensions of each layer in our network
    :param lr: learning rate
    :param num_epochs: number of gradient-descent iterations
    :param print_cost: if True, print the cost every 100 epochs
    :return: (parameters, costs) -- learned parameters and the cost sampled every 100 epochs
    """
    costs = []
    # Parameter initialization.
    parameters = func_L_layers_initialize_parameters(layer_dims)
    for epoch in range(num_epochs):
        # Forward propagation.
        AL, caches = func_L_model_forward(X, parameters)
        # Cost.
        cost = func_compute_cost(AL, Y)
        # Backward propagation.
        grads = func_L_model_backward(AL, Y, caches)
        # Parameter update.
        parameters = func_update_parameters(parameters, grads, lr)
        # Record (and optionally print) the cost every 100 epochs.
        if epoch % 100 == 0:
            if print_cost:
                print('cost after epoch {}: {}'.format(epoch, np.squeeze(cost)))
            costs.append(cost)
    return parameters, costs
def load_cat_dataset():
    """
    Load the cat / non-cat HDF5 dataset, flatten the images and scale them by 1/255.

    :return: (train_set_x, train_set_y, test_set_x, test_set_y, classes)
             *_set_x -- one flattened example per column, values divided by 255
             *_set_y -- labels reshaped to (1, number of examples)
             classes -- the "list_classes" array of the test file
    """
    # Context managers close both HDF5 files once the arrays are copied out.
    with h5py.File('./深度学习之吴恩达课程作业1/train_catvnoncat.h5', "r") as train_dataset:
        train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # train set features
        train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # train set labels

    with h5py.File('./深度学习之吴恩达课程作业1/test_catvnoncat.h5', "r") as test_dataset:
        test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # test set features
        test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # test set labels
        classes = np.array(test_dataset["list_classes"][:])  # the list of classes

    train_set_y = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))

    # Flatten every example into one column.
    train_set_x_orig = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
    test_set_x_orig = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T

    train_set_x = train_set_x_orig/255
    test_set_x = test_set_x_orig/255
    return train_set_x, train_set_y, test_set_x, test_set_y, classes
def load_dataset():
    """
    Generate a noisy "circles" train set (300 samples) and test set (100 samples).

    The training points are also drawn as a scatter plot.

    :return: (train_X, train_Y, test_X, test_Y) with examples stored as columns
             and labels reshaped to (1, number of examples)
    """
    train_points, train_labels = sklearn.datasets.make_circles(n_samples=300, noise=.05)
    test_points, test_labels = sklearn.datasets.make_circles(n_samples=100, noise=.05)

    # Visualize the training data before it is transposed.
    plt.scatter(train_points[:, 0], train_points[:, 1], c=train_labels, s=40, cmap=plt.cm.Spectral)

    train_Y = train_labels.reshape((1, train_labels.shape[0]))
    test_Y = test_labels.reshape((1, test_labels.shape[0]))
    return train_points.T, train_Y, test_points.T, test_Y
def load_planar_dataset(seed):
    """
    Generate the two-class planar "flower" dataset (400 points, 200 per class).

    NOTE: a later definition with the same name, taking (randomness, seed),
    replaces this one once the whole file has been executed.

    :param seed: seed for the noise; the same seed always produces the same data
    :return: (X, Y) -- X of shape (2, 400), Y of shape (1, 400) with labels 0 / 1
    """
    # A private generator honours `seed` without touching the global numpy RNG.
    rng = np.random.RandomState(seed)
    m = 400  # number of examples
    N = m // 2  # number of points per class
    D = 2  # dimensionality
    X = np.zeros((m, D))  # data matrix where each row is a single example
    Y = np.zeros((m, 1), dtype='uint8')  # labels vector (0 for red, 1 for blue)
    a = 4  # maximum ray of the flower

    for j in range(2):
        rows = slice(N * j, N * (j + 1))
        t = np.linspace(j * 3.12, (j + 1) * 3.12, N) + rng.randn(N) * 0.2  # theta
        r = a * np.sin(4 * t) + rng.randn(N) * 0.2  # radius
        X[rows] = np.c_[r * np.sin(t), r * np.cos(t)]
        Y[rows] = j

    return X.T, Y.T
def load_planar_dataset(randomness, seed):
    """
    Generate a small two-class spiral dataset (50 points, 25 per class).

    :param randomness: scale of the Gaussian noise added to the radius
    :param seed: seed for the noise; the same seed always produces the same data
    :return: (X, Y) -- X of shape (2, 50), Y of shape (1, 50) with labels 0 / 1
    """
    # A private generator honours `seed` without touching the global numpy RNG.
    rng = np.random.RandomState(seed)
    m = 50
    N = m // 2  # number of points per class
    D = 2  # dimensionality
    X = np.zeros((m, D))  # data matrix where each row is a single example
    Y = np.zeros((m, 1), dtype='uint8')  # labels vector (0 for red, 1 for blue)

    for j in range(2):
        rows = slice(N * j, N * (j + 1))
        if j == 0:
            t = np.linspace(j, 4 * 3.1415 * (j + 1), N)  # theta
            r = 0.3 * np.square(t) + rng.randn(N) * randomness  # radius
        else:
            t = np.linspace(j, 2 * 3.1415 * (j + 1), N)  # theta
            r = 0.2 * np.square(t) + rng.randn(N) * randomness  # radius
        X[rows] = np.c_[r * np.cos(t), r * np.sin(t)]
        Y[rows] = j

    return X.T, Y.T
def func_predict_dec(parameters, X):
    """
    Used for plotting decision boundary.

    :param parameters: python dictionary containing the model parameters
    :param X: input data, passed unchanged to func_L_model_forward
    :return: predictions -- boolean array, True where the predicted probability exceeds 0.5
    """
    # Forward pass followed by a 0.5 classification threshold.
    probabilities, _ = func_L_model_forward(X, parameters)
    return probabilities > 0.5
def func_plot_decision_boundary(model, X, y):
    """
    Draw the decision regions of a 2-D classifier together with the data points.

    :param model: callable taking an (n_points, 2) array of grid points and returning one prediction per point
    :param X: data with the two features stored as rows X[0, :] and X[1, :]
    :param y: labels used to colour the points; flattened before plotting
    """
    # Set min and max values and give it some padding.
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid.
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples.
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    # A (1, m) label array is rejected as a colour argument by some matplotlib
    # versions; a flat vector is accepted by all of them.
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)
- 一个好的参数初始化选择能够:
- Speed up the convergence of gradient descent
- Increase the odds of gradient descent converging to a lower training (and generalization) error
1-zero initialization
def func_initialize_parameters_zeros(layer_dims):
    """
    Zero initialization.

    :param layer_dims: python array (list) containing the size of each layer
    :return: parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
             Wl -- all-zero weight matrix of shape (layer_dims[l], layer_dims[l-1])
             bl -- all-zero bias vector of shape (layer_dims[l], 1)
    """
    parameters = {}
    for layer in range(1, len(layer_dims)):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]
        parameters['W' + str(layer)] = np.zeros((n_out, n_in))
        parameters['b' + str(layer)] = np.zeros((n_out, 1))
    return parameters
# Demo: zero-initialize a 3-2-1 network.
parameters = func_initialize_parameters_zeros([3,2,1])
2-random initialization
def func_initialize_parameters_random(layer_dims):
    """
    Random initialization with deliberately large weights (standard normal * 10).

    :param layer_dims: python array (list) containing the size of each layer
    :return: parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
             Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
             bl -- all-zero bias vector of shape (layer_dims[l], 1)
    """
    parameters = {}
    for layer in range(1, len(layer_dims)):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]
        parameters['W' + str(layer)] = np.random.randn(n_out, n_in) * 10
        parameters['b' + str(layer)] = np.zeros((n_out, 1))
    return parameters
# Demo: randomly initialize a 3-2-1 network.
parameters = func_initialize_parameters_random([3,2,1])
3-He initialization
This function is similar to the previous initialize_parameters_random(...)
. The only difference is that instead of multiplying np.random.randn(..,..)
by 10, you will multiply it by $\sqrt{\frac{2}{\text{dimension of the previous layer}}}$, which is what He initialization recommends for layers with a ReLU activation.
def func_initialize_parameters_he(layer_dims):
    """
    He initialization: standard normal weights scaled by sqrt(2 / size of the previous layer).

    :param layer_dims: python array (list) containing the size of each layer
    :return: parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
             Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
             bl -- all-zero bias vector of shape (layer_dims[l], 1)
    """
    parameters = {}
    for layer in range(1, len(layer_dims)):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]
        # 2.0 keeps the ratio a true division whatever the integer semantics.
        parameters['W' + str(layer)] = np.random.randn(n_out, n_in) * np.sqrt(2.0 / n_in)
        parameters['b' + str(layer)] = np.zeros((n_out, 1))
    return parameters
# Demo: He-initialize a 2-4-1 network.
parameters = func_initialize_parameters_he([2,4,1])
overwrite dnn model
def func_L_layer_dnn_model(X, Y, layer_dims, lr=0.01, num_epochs=15000, print_cost=True, initialization=None):
    """
    Train an L-layer network with batch gradient descent and a selectable initialization.

    :param X: input data
    :param Y: true labels
    :param layer_dims: python array (list) containing the dimensions of each layer in our network
    :param lr: learning rate
    :param num_epochs: number of gradient-descent iterations
    :param print_cost: if True, print the cost every 1000 epochs
    :param initialization: flag to choose which initialization to use ("zeros", "random" or "he");
                           any other value (including None) uses func_L_layers_initialize_parameters
    :return: (parameters, costs) -- learned parameters and the cost sampled every 1000 epochs
    """
    costs = []
    # Parameter initialization.
    if initialization == 'zeros':
        parameters = func_initialize_parameters_zeros(layer_dims)
    elif initialization == 'random':
        parameters = func_initialize_parameters_random(layer_dims)
    elif initialization == 'he':
        parameters = func_initialize_parameters_he(layer_dims)
    else:
        # Default scheme; it must not overwrite an explicitly selected one.
        parameters = func_L_layers_initialize_parameters(layer_dims)
    print('parameters: \n', parameters)

    for epoch in range(num_epochs):
        # Forward propagation.
        AL, caches = func_L_model_forward(X, parameters)
        # Cost.
        cost = func_compute_cost(AL, Y)
        # Backward propagation.
        grads = func_L_model_backward(AL, Y, caches)
        # Parameter update.
        parameters = func_update_parameters(parameters, grads, lr)
        # Record (and optionally print) the cost every 1000 epochs.
        if epoch % 1000 == 0:
            if print_cost:
                print('cost after epoch {}: {}'.format(epoch, np.squeeze(cost)))
            costs.append(cost)
    return parameters, costs
# Build the dataset and the hyper-parameters shared by the experiments below.
train_X, train_Y, test_X, test_Y = load_dataset()
# Input size comes from the data, followed by 10, 5 and 1 units.
layer_dims = [train_X.shape[0], 10, 5, 1]
lr = 0.01
num_epochs = 15000
[2, 10, 5, 1]
# Train with the default initialization, then report accuracy on both splits.
# NOTE(review): print_cost is not assigned anywhere in the visible part of the file - confirm it is defined before this call.
parameters, costs = func_L_layer_dnn_model(train_X, train_Y, layer_dims, lr, num_epochs, print_cost)
print('on the train set:')
pred_train = func_predict(train_X, train_Y, parameters)
print('on the test set:')
pred_test = func_predict(test_X, test_Y, parameters)
# NOTE(review): the training loop samples the cost on epoch % 1000, so "per hundreds" looks stale - confirm.
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(lr))
cost after epoch 0: 0.6960659020466242
cost after epoch 1000: 0.6892005439078841
cost after epoch 2000: 0.6850049132576146
cost after epoch 3000: 0.6798306324866793
cost after epoch 4000: 0.6729361200642482
cost after epoch 5000: 0.6578134850604765
cost after epoch 6000: 0.6308733720773914
cost after epoch 7000: 0.5847718750117528
cost after epoch 8000: 0.512240914977723
cost after epoch 9000: 0.41737297954575764
cost after epoch 10000: 0.32777046260752574
cost after epoch 11000: 0.25687394311562106
cost after epoch 12000: 0.20428449077110009
cost after epoch 13000: 0.16602627481234417
cost after epoch 14000: 0.13073861742391021
on the train set:
acc: 0.99
on the test set:
acc: 0.94
# Decision boundary of the model trained with the default initialization.
plt.title("Model with np.sqrt(layer_dims[layer-1]) initialization")
axes = plt.gca()
# The grid arrives as (n_points, 2); the model expects examples as columns, hence x.T.
func_plot_decision_boundary(lambda x: func_predict_dec(parameters, x.T), train_X, train_Y)
zero 初始化
# Train with zero initialization, then report accuracy on both splits.
# NOTE(review): print_cost is not assigned anywhere in the visible part of the file - confirm it is defined before this call.
parameters, costs = func_L_layer_dnn_model(train_X, train_Y, layer_dims, lr, num_epochs, print_cost, initialization='zeros')
print('on the train set:')
pred_train = func_predict(train_X, train_Y, parameters)
print('on the test set:')
pred_test = func_predict(test_X, test_Y, parameters)
# NOTE(review): the training loop samples the cost on epoch % 1000, so "per hundreds" looks stale - confirm.
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(lr))
on the train set:
acc: 0.5
on the test set:
acc: 0.5
# Decision boundary of the model trained with zero initialization.
plt.title("Model with Zeros initialization")
axes = plt.gca()
# The grid arrives as (n_points, 2); the model expects examples as columns, hence x.T.
func_plot_decision_boundary(lambda x: func_predict_dec(parameters, x.T), train_X, train_Y)
random 初始化
# Train with large random initialization, then report accuracy on both splits.
# NOTE(review): print_cost is not assigned anywhere in the visible part of the file - confirm it is defined before this call.
parameters, costs = func_L_layer_dnn_model(train_X, train_Y, layer_dims, lr, num_epochs, print_cost, initialization='random')
# print(parameters)
print('on the train set:')
pred_train = func_predict(train_X, train_Y, parameters)
print('on the test set:')
pred_test = func_predict(test_X, test_Y, parameters)
# NOTE(review): the training loop samples the cost on epoch % 1000, so "per hundreds" looks stale - confirm.
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(lr))
on the train set:
acc: 0.5
on the test set:
acc: 0.5
# Decision boundary of the model trained with large random initialization.
plt.title("Model with random initialization")
axes = plt.gca()
# The grid arrives as (n_points, 2); the model expects examples as columns, hence x.T.
func_plot_decision_boundary(lambda x: func_predict_dec(parameters, x.T), train_X, train_Y)
- 不同的参数初始化导致不同的结果
- 随机参数初始化是为了打破对称性以及确保各隐藏units可以学习不同的特征
- 不要初始化太大的值,会导致梯度消失或者爆炸
- 对于relu激活函数来说,he初始化方式能表现很好
Deep Learning models have so much flexibility and capacity that overfitting can be a serious problem, if the training dataset is not big enough. Sure it does well on the training set, but the learned network doesn’t generalize to new examples that it has never seen!
1-L2 Regularization
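The standard way to avoid overfitting is called L2 regularization. It consists of appropriately modifying your cost function, from:
$$J = -\frac{1}{m} \sum_{i=1}^{m} \left( y^{(i)} \log a^{[L](i)} + (1-y^{(i)}) \log\left(1-a^{[L](i)}\right) \right) \tag{1}$$
to:
$$J_{regularized} = J + \frac{1}{m} \frac{\lambda}{2} \sum_{l} \sum_{k} \sum_{j} \left(W_{k,j}^{[l]}\right)^{2} \tag{2}$$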
1.1 overwrite compute cost
def func_compute_cost_with_regularization(AL, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization.

    :param AL: post-activation, output of forward propagation, of shape (output size, number of examples)
    :param Y: "true" labels vector, of shape (output size, number of examples)
    :param parameters: python dictionary containing parameters of the model
    :param lambd: regularization coefficient
    :return: cost -- cross-entropy cost plus lambd / (2 * m) times the sum of all squared weights
    """
    cross_entropy_cost = func_compute_cost(AL, Y)
    m = Y.shape[1]
    L = len(parameters) // 2  # each layer contributes one W and one b
    # Only the weight matrices are penalized, not the biases.
    squared_weights = sum(np.sum(np.square(parameters['W' + str(layer)])) for layer in range(1, L + 1))
    L2_regularization_cost = squared_weights * lambd / 2 / m
    return cross_entropy_cost + L2_regularization_cost
def func_compute_cost_with_regularization_test_case():
    """
    Evaluate the regularized cost on the notebook-level fixtures with lambd=0.1.

    NOTE(review): A3, Y_assess and parameters are read as module-level names;
    confirm they are defined before this helper is called.

    :return: the regularized cost (the computed value used to be discarded)
    """
    return func_compute_cost_with_regularization(A3, Y_assess, parameters, lambd=0.1)
1.2 overwrite backward propagation
def func_linear_backward_with_regularization(dZ, cache, lambd):
    """
    Implement the linear portion of backward propagation for a single layer (layer l), with L2 regularization.

    :param dZ: gradient of the cost with respect to the linear output (of current layer l)
    :param cache: tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
    :param lambd: regularization coefficient
    :return: (dA_prev, dW, db)
             dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
             dW -- gradient of the cost with respect to W (current layer l), same shape as W
             db -- gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]
    # The L2 penalty adds (lambd / m) * W to the weight gradient only.
    dW = 1. / m * np.dot(dZ, A_prev.T) + lambd / m * W
    # axis=1 sums over the examples, giving one value per unit.
    db = 1. / m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    assert dW.shape == W.shape
    assert db.shape == b.shape
    assert dA_prev.shape == A_prev.shape
    return dA_prev, dW, db
def func_linear_activation_backward_with_regularization(dA, cache, lambd, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer, with L2 regularization.

    :param dA: post-activation gradient for current layer l
    :param cache: tuple of values (linear_cache, activation_cache) stored during the forward pass
    :param lambd: regularization coefficient
    :param activation: the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    :return: (dA_prev, dW, db)
             dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
             dW -- gradient of the cost with respect to W (current layer l), same shape as W
             db -- gradient of the cost with respect to b (current layer l), same shape as b
    :raises ValueError: if activation is neither "relu" nor "sigmoid"
    """
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = func_relu_backward(dA, activation_cache)
    elif activation == 'sigmoid':
        dZ = func_sigmoid_backward(dA, activation_cache)
    else:
        # Only unknown activation names may reach the error.
        raise ValueError('activation param')
    dA_prev, dW, db = func_linear_backward_with_regularization(dZ, linear_cache, lambd)
    return dA_prev, dW, db
def func_L_model_backward_with_regularization(AL, Y, caches, lambd):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group, with L2 regularization.

    :param AL: probability vector, output of the forward propagation
    :param Y: true "label" vector (containing 0 if non-cat, 1 if cat)
    :param caches: list of caches containing:
                   every relu-layer cache (caches[l] for l = 0 ... L-2)
                   the sigmoid-layer cache (caches[L-1])
    :param lambd: regularization coefficient
    :return: grads -- a dictionary with the gradients
             grads["dA" + str(l)] for l = 0 ... L-1
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1 ... L
    """
    grads = {}
    L = len(caches)
    Y = Y.reshape(AL.shape)
    # Keep AL strictly inside (0, 1), matching the clipping in the cost, so a
    # saturated sigmoid output cannot cause a division by zero.
    AL = np.clip(AL, 1e-10, 1 - 1e-10)
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Output layer (sigmoid).
    cur_cache = caches[L - 1]
    grads['dA' + str(L - 1)], grads['dW' + str(L)], grads['db' + str(L)] = \
        func_linear_activation_backward_with_regularization(dAL, cur_cache, lambd, activation='sigmoid')

    # Hidden layers (relu), from the last hidden layer back to the first.
    for layer in reversed(range(L - 1)):
        cur_cache = caches[layer]
        dA_prev_tmp, dW_tmp, db_tmp = func_linear_activation_backward_with_regularization(
            grads['dA' + str(layer + 1)], cur_cache, lambd, activation='relu')
        grads['dA' + str(layer)] = dA_prev_tmp
        grads['dW' + str(layer + 1)] = dW_tmp
        grads['db' + str(layer + 1)] = db_tmp
    return grads
def func_backward_propagation_with_regularization_test_case():
    """
    Build the fixture used to exercise backward propagation with regularization.

    The cache tuple holds, in order, (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    for a 3-2-3-1 network. The values are mutually consistent: Z2 = W2.A1 + b2 and
    Z3 = W3.A2 + b3, which is how the truncated b2 and b3 entries were restored.

    :return: (X_assess, Y_assess, cache)
             X_assess -- random input of shape (3, 5), drawn from the global numpy RNG
             Y_assess -- labels of shape (1, 5)
             cache -- the 12-element tuple described above
    """
    X_assess = np.random.randn(3, 5)
    Y_assess = np.array([[1, 1, 0, 1, 0]])
    cache = (
        # Z1
        np.array([[-1.52855314, 3.32524635, 2.13994541, 2.60700654, -0.75942115],
                  [-1.98043538, 4.1600994, 0.79051021, 1.46493512, -0.45506242]]),
        # A1
        np.array([[0., 3.32524635, 2.13994541, 2.60700654, 0.],
                  [0., 4.1600994, 0.79051021, 1.46493512, 0.]]),
        # W1
        np.array([[-1.09989127, -0.17242821, -0.87785842],
                  [0.04221375, 0.58281521, -1.10061918]]),
        # b1
        np.array([[1.14472371],
                  [0.90159072]]),
        # Z2
        np.array([[0.53035547, 5.94892323, 2.31780174, 3.16005701, 0.53035547],
                  [-0.69166075, -3.47645987, -2.25194702, -2.65416996, -0.69166075],
                  [-0.39675353, -4.62285846, -2.61101729, -3.22874921, -0.39675353]]),
        # A2
        np.array([[0.53035547, 5.94892323, 2.31780174, 3.16005701, 0.53035547],
                  [0., 0., 0., 0., 0.],
                  [0., 0., 0., 0., 0.]]),
        # W2
        np.array([[0.50249434, 0.90085595],
                  [-0.68372786, -0.12289023],
                  [-0.93576943, -0.26788808]]),
        # b2 (equals the columns of Z2 where A1 is zero)
        np.array([[0.53035547],
                  [-0.69166075],
                  [-0.39675353]]),
        # Z3
        np.array([[-0.3771104, -4.10060224, -1.60539468, -2.18416951, -0.3771104]]),
        # A3
        np.array([[0.40682402, 0.01629284, 0.16722898, 0.10118111, 0.40682402]]),
        # W3
        np.array([[-0.6871727, -0.84520564, -0.67124613]]),
        # b3 (Z3 - W3.A2)
        np.array([[-0.0126646]]),
    )
    return X_assess, Y_assess, cache
# Build the fixture and echo it (a bare expression is displayed as notebook cell output).
X_assess, Y_assess, cache = func_backward_propagation_with_regularization_test_case()
X_assess, Y_assess, cache
# NOTE(review): A3 and caches are not assigned in the visible part of the file (the fixture is named `cache`) - confirm where they come from.
grads = func_L_model_backward_with_regularization(A3, Y_assess, caches, lambd=0.7)
At each iteration, you shut down (= set to zero) each neuron of a layer with probability $1 - keep\_prob$, or keep it with probability $keep\_prob$ (50% here). The dropped neurons do not contribute to training in either the forward or the backward propagation of that iteration.
$1^{st}$ layer: we shut down on average 40% of the neurons. $3^{rd}$ layer: we shut down on average 20% of the neurons.
Note that regularization hurts training set performance! This is because it limits the ability of the network to overfit to the training set. But since it ultimately gives better test accuracy, it is helping your system.
- 检查反向传播过程是否正确?则需要梯度检验
- 确保你的前向传播和计算loss完全正确!
- 检验反向传播过程,也就是检查$\frac{\partial J}{\partial \theta}$是否计算正确
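- 回顾导数的定义：$$\frac{\partial J}{\partial \theta} = \lim_{\varepsilon \to 0} \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2 \varepsilon} \tag{1}$$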
- 则可以使用$J(\theta + \varepsilon)$ and $J(\theta - \varepsilon)$ (in the case that $\theta$ is a real number), since you’re confident your implementation for $J$ is correct
1D 梯度检验
To show that the func_1d_backward_propagation()
function is correctly computing the gradient $\frac{\partial J}{\partial \theta}$, let’s implement gradient checking.
- First compute “gradapprox” using the formula above (1) and a small value of $\varepsilon$. Here are the Steps to follow:
- $\theta^{+} = \theta + \varepsilon$
- $\theta^{-} = \theta - \varepsilon$
- $J^{+} = J(\theta^{+})$
- $J^{-} = J(\theta^{-})$
- $gradapprox = \frac{J^{+} - J^{-}}{2 \varepsilon}$
- Then compute the gradient using backward propagation, and store the result in a variable “grad”
- Finally, compute the relative difference between “gradapprox” and the “grad” using the following formula: $$difference = \frac{\| grad - gradapprox \|_2}{\| grad \|_2 + \| gradapprox \|_2} \tag{2}$$ You will need 3 steps to compute this formula:
- 1’. compute the numerator using np.linalg.norm(…)
- 2’. compute the denominator. You will need to call np.linalg.norm(…) twice.
- 3’. divide them.
- If this difference is small (say less than $10^{-7}$), you can be quite confident that you have computed your gradient correctly. Otherwise, there may be a mistake in the gradient computation.
def func_1d_forward_propagation(x, theta):
    """
    Implement the linear forward propagation J(theta) = theta * x.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    J -- the value of function J, computed using the formula J(theta) = theta * x
    """
    return theta * x
# Sample input and parameter reused by the 1-D gradient-check demo below.
x, theta = 2, 4
J = func_1d_forward_propagation(x, theta)
def func_1d_backward_propagation(x, theta):
    """
    Return the derivative of J(theta) = theta * x with respect to theta.

    Arguments:
    x -- a real-valued input
    theta -- our parameter (unused: the derivative does not depend on it)

    Returns:
    dtheta -- the gradient of J with respect to theta, which is x
    """
    return x
# Analytic gradient for the sample x and theta defined above.
dtheta = func_1d_backward_propagation(x, theta)
def func_1d_gradient_check(x, theta, epsilon=1e-7):
    """
    Check the analytic gradient of the 1-D model against a centered finite difference.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well
    epsilon -- tiny shift of theta used to compute the approximated gradient

    Returns:
    difference -- relative difference between the approximated gradient and the backward propagation gradient
    """
    # Centered difference: (J(theta + eps) - J(theta - eps)) / (2 * eps).
    theta_plus = theta + epsilon
    theta_minus = theta - epsilon
    J_plus = func_1d_forward_propagation(x, theta_plus)
    J_minus = func_1d_forward_propagation(x, theta_minus)
    grad_approx = (J_plus - J_minus) / 2 / epsilon

    grad = func_1d_backward_propagation(x, theta)

    numerator = np.linalg.norm(grad - grad_approx)
    denominator = np.linalg.norm(grad) + np.linalg.norm(grad_approx)
    if denominator == 0:
        # Both gradients are exactly zero, so they agree; avoid 0 / 0.
        difference = 0.0
    else:
        difference = numerator / denominator

    # Exactly one verdict is printed.
    if difference < 1e-7:
        print('the gradient is correct!')
    else:
        print('the gradient is wrong!')
    return difference
# Run the 1-D gradient check on the sample x and theta defined above.
difference = func_1d_gradient_check(x, theta)
the gradient is correct!
N-dimensional gradient checking
check function=自己写的
def func_nd_gradient_check(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Check backward-propagation gradients against two-sided finite differences
    of the cost, perturbing one scalar parameter at a time.

    Arguments:
    parameters -- python dictionary of parameter arrays (e.g. "W1", "b1", ...);
                  every entry is perturbed, whatever the number of layers
    gradients -- dictionary with the analytic gradients; must contain a
                 'd' + key entry for every key in parameters
    X -- input data, forwarded to func_L_model_forward
    Y -- true labels, forwarded to func_compute_cost
    epsilon -- tiny shift applied to each parameter to approximate its gradient

    Returns:
    difference -- relative difference between the approximated gradient and the
                  backward propagation gradient:
                  ||grad - grad_approx|| / (||grad|| + ||grad_approx||)
    """
    gradients_approx = {}
    for key, value in parameters.items():
        # Shallow copy is enough: only the entry for `key` is rebound to a new
        # array, the original arrays are never modified in place.
        parameters_copy = copy.copy(parameters)
        value_shape = value.shape
        value_vector = np.reshape(value, (-1, 1))  # flatten to (n, 1)
        gradapprox_vector = np.zeros((value_vector.shape[0], 1))

        for i in range(value_vector.shape[0]):
            # Cost with component i shifted by +epsilon.
            theta_plus = np.copy(value_vector)
            theta_plus[i][0] = theta_plus[i][0] + epsilon
            parameters_copy[key] = theta_plus.reshape(value_shape)
            AL, _ = func_L_model_forward(X, parameters_copy)
            cost_plus = func_compute_cost(AL, Y)

            # Cost with component i shifted by -epsilon.
            theta_minus = np.copy(value_vector)
            theta_minus[i][0] = theta_minus[i][0] - epsilon
            parameters_copy[key] = theta_minus.reshape(value_shape)
            AL, _ = func_L_model_forward(X, parameters_copy)
            cost_minus = func_compute_cost(AL, Y)

            gradapprox_vector[i] = (cost_plus - cost_minus) / (2 * epsilon)

        gradients_approx['d' + key] = gradapprox_vector.reshape(value_shape)

    # Flatten both gradient sets in the same key order. Each vector is added
    # exactly once, so the first key is not duplicated.
    grad_keys = list(gradients_approx)
    theta = np.concatenate(
        [np.reshape(gradients[k], (-1, 1)) for k in grad_keys], axis=0)
    theta_approx = np.concatenate(
        [np.reshape(gradients_approx[k], (-1, 1)) for k in grad_keys], axis=0)

    numerator = np.linalg.norm(theta - theta_approx)
    denominator = np.linalg.norm(theta) + np.linalg.norm(theta_approx)
    # Both gradients exactly zero means they agree; avoid 0/0 producing nan.
    difference = numerator / denominator if denominator != 0 else 0.0

    # Exactly one verdict is printed.
    if difference > 2e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    return difference
def dictionary_to_vector(parameters):
    """
    Roll all our parameters dictionary into a single vector satisfying our specific required shape.

    Arguments:
    parameters -- python dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    theta -- column vector of shape (n, 1) holding every parameter, flattened
             and stacked in the key order above
    keys -- list of length n; keys[i] names the parameter that theta[i] came from
    """
    keys = []
    pieces = []
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
        # flatten parameter
        new_vector = np.reshape(parameters[key], (-1, 1))
        keys.extend([key] * new_vector.shape[0])
        pieces.append(new_vector)
    # Single concatenation: every block appears exactly once in theta.
    theta = np.concatenate(pieces, axis=0)
    return theta, keys
def vector_to_dictionary(theta, layer_dims=(2, 10, 5, 1)):
    """
    Unroll all our parameters dictionary from a single vector satisfying our specific required shape.

    Arguments:
    theta -- vector holding W1, b1, W2, b2, ... flattened and stacked in that
             order (the layout produced by dictionary_to_vector)
    layer_dims -- sizes of the layers, input layer first; the default
                  (2, 10, 5, 1) reproduces the fixed 91-parameter layout

    Returns:
    parameters -- python dictionary with "W{l}" of shape
                  (layer_dims[l], layer_dims[l-1]) and "b{l}" of shape
                  (layer_dims[l], 1) for l = 1 .. len(layer_dims) - 1
    """
    parameters = {}
    offset = 0  # position in theta of the next unread component
    for layer in range(1, len(layer_dims)):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]

        w_size = n_out * n_in
        parameters["W" + str(layer)] = theta[offset:offset + w_size].reshape((n_out, n_in))
        offset += w_size

        parameters["b" + str(layer)] = theta[offset:offset + n_out].reshape((n_out, 1))
        offset += n_out
    return parameters
def gradients_to_vector(gradients):
    """
    Roll all our gradients dictionary into a single vector satisfying our specific required shape.

    Arguments:
    gradients -- python dictionary containing "dW1", "db1", "dW2", "db2", "dW3", "db3"
                 (any other entries are ignored)

    Returns:
    theta -- column vector of shape (n, 1) holding every listed gradient,
             flattened and stacked in the key order above
    """
    pieces = [
        np.reshape(gradients[key], (-1, 1))  # flatten gradient
        for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]
    ]
    # Single concatenation: every block appears exactly once in theta.
    theta = np.concatenate(pieces, axis=0)
    return theta
# GRADED FUNCTION: gradient_check_n
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Check backward-propagation gradients against two-sided finite differences
    of the cost, working on the flattened parameter vector.

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- dictionary with the gradients of the cost with respect to the
                 parameters ("dW1", "db1", ..., "db3")
    X -- input data, forwarded to func_L_model_forward
    Y -- true labels, forwarded to func_compute_cost
    epsilon -- tiny shift applied to each parameter to approximate its gradient

    Returns:
    difference -- relative difference between the approximated gradient and the
                  backward propagation gradient:
                  ||grad - gradapprox|| / (||grad|| + ||gradapprox||)
    """
    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox, one parameter component at a time.
    for i in range(num_parameters):
        # Cost with component i shifted by +epsilon.
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] += epsilon
        AL, _ = func_L_model_forward(X, vector_to_dictionary(thetaplus))
        J_plus[i] = func_compute_cost(AL, Y)

        # Cost with component i shifted by -epsilon.
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] -= epsilon
        AL, _ = func_L_model_forward(X, vector_to_dictionary(thetaminus))
        J_minus[i] = func_compute_cost(AL, Y)

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    # Both gradients exactly zero means they agree; avoid 0/0 producing nan.
    difference = numerator / denominator if denominator != 0 else 0.0

    # Exactly one verdict is printed.
    if difference > 2e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    return difference
overwrite dnn model
def func_L_layer_dnn_model(X, Y, layer_dims, lr=0.001, num_epochs=10000, print_cost=False):
    """
    Train an L-layer network with full-batch gradient descent and run both
    gradient checks once, on the final epoch.

    :param X: input data, forwarded to func_L_model_forward
    :param Y: true labels, forwarded to func_compute_cost
    :param layer_dims: python array (list) containing the dimensions of each layer in our network
    :param lr: learning rate
    :param num_epochs: number of gradient-descent iterations
    :param print_cost: if True, print the cost every 100 epochs
    :return: (parameters, costs) -- the learned parameters and the list of
             costs recorded every 100 epochs
    """
    costs = []
    # Parameter initialization
    parameters = func_L_layers_initialize_parameters(layer_dims)

    for epoch in range(num_epochs):
        # Forward propagation
        AL, caches = func_L_model_forward(X, parameters)
        # Compute the cost
        cost = func_compute_cost(AL, Y)
        # Backward propagation
        grads = func_L_model_backward(AL, Y, caches)

        if epoch == num_epochs - 1:
            # Gradient checking is slow (two forward passes per scalar
            # parameter), so it runs only once, and before the update so that
            # grads still belong to the current parameters.
            func_nd_gradient_check(parameters, grads, X, Y)
            # NOTE: called with its defaults, gradient_check_n only supports
            # the [2, 10, 5, 1] architecture.
            gradient_check_n(parameters, grads, X, Y)

        # Update the parameters
        parameters = func_update_parameters(parameters, grads, lr)

        # Report and record the cost every 100 epochs
        if epoch % 100 == 0:
            if print_cost:
                print('cost after epoch {}: {}'.format(epoch, np.squeeze(cost)))
            costs.append(cost)

    return parameters, costs
# Load the train/test split (load_dataset is defined outside this section).
train_X, train_Y, test_X, test_Y = load_dataset()
# Layer sizes: input size taken from train_X's first dimension, then 10, 5 and 1 units.
layer_dims = [train_X.shape[0], 10, 5, 1]
# Learning rate passed to func_L_layer_dnn_model.
lr = 0.1
# Number of training epochs.
num_epochs = 1500
[2, 10, 5, 1]
# Train the network; print_cost is not set in this section -- presumably
# defined earlier in the notebook (TODO confirm).
parameters, costs = func_L_layer_dnn_model(train_X, train_Y, layer_dims, lr, num_epochs, print_cost)
# Evaluate on the training split (func_predict is defined outside this section).
print('on the train set:')
pred_train = func_predict(train_X, train_Y, parameters)
# Evaluate on the test split.
print('on the test set:')
pred_test = func_predict(test_X, test_Y, parameters)
# Label the current figure; costs are recorded once every 100 epochs.
plt.xlabel('iterations (per hundreds)')
plt.title("Learning rate =" + str(lr))
on the train set:
acc: 0.99
on the test set:
acc: 0.94
# Print the name and shape of every learned parameter.
for name in parameters:
    print(name, parameters[name].shape)
# Title for the decision-boundary figure.
plt.title("Model with np.sqrt(layer_dims(layer-1)) initialization")
# Handle to the current axes (not used again within this section).
axes = plt.gca()
# Plot the decision boundary over the training data; the lambda transposes x
# before handing it to func_predict_dec (both helpers are defined outside this section).
func_plot_decision_boundary(lambda x: func_predict_dec(parameters, x.T), train_X, train_Y)
- 梯度检验很慢,因为每一个w的分量都要计算plus和minus,所以不用每一个epoch都梯度检验,只需要某些epoch进行检验梯度计算是否正确即可
- 梯度检验不适用于dropout,先检验是否正确,再加dropout