Table of Contents
- Momentum
  - Initialization
  - Updating the parameters
- Adam
  - Initialization
  - Updating the parameters
Besides plain gradient descent, there are several other general-purpose optimization algorithms, all of which tend to perform better than it. This post only records the Momentum and Adam algorithms encountered in the programming assignments of Andrew Ng's deep learning course, and only gives brief code. For the underlying theory, see 深度学习优化算法解析(Momentum, RMSProp, Adam), which explains the three optimizers from Ng's course in more detail.
Momentum
Initialization
import numpy as np

def initialize_velocity(parameters):
    """
    Initializes the velocity as a python dictionary with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
        parameters['W' + str(l)] = Wl
        parameters['b' + str(l)] = bl

    Returns:
    v -- python dictionary containing the current velocity.
        v['dW' + str(l)] = velocity of dWl
        v['db' + str(l)] = velocity of dbl
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}

    # Initialize velocity
    for l in range(L):
        ### START CODE HERE ### (approx. 2 lines)
        v['dW' + str(l + 1)] = np.zeros(np.shape(parameters['W' + str(l + 1)]))
        v['db' + str(l + 1)] = np.zeros(np.shape(parameters['b' + str(l + 1)]))
        ### END CODE HERE ###

    return v
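As a quick sanity check, here is a minimal usage sketch. The two-layer parameters dictionary below is hypothetical, only meant to illustrate the expected key names and shapes:

import numpy as np

# Hypothetical two-layer network parameters (illustration only)
parameters = {
    'W1': np.random.randn(2, 3), 'b1': np.zeros((2, 1)),
    'W2': np.random.randn(1, 2), 'b2': np.zeros((1, 1)),
}

v = initialize_velocity(parameters)
print(v['dW1'].shape, v['db2'].shape)  # (2, 3) (1, 1) -- zero arrays matching each parameter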
Updating the parameters
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """
    Update parameters using Momentum

    Arguments:
    parameters -- python dictionary containing your parameters:
        parameters['W' + str(l)] = Wl
        parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameter:
        grads['dW' + str(l)] = dWl
        grads['db' + str(l)] = dbl
    v -- python dictionary containing the current velocity:
        v['dW' + str(l)] = ...
        v['db' + str(l)] = ...
    beta -- the momentum hyperparameter, scalar
    learning_rate -- the learning rate, scalar

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- python dictionary containing your updated velocities
    """
    L = len(parameters) // 2  # number of layers in the neural networks

    # Momentum update for each parameter
    for l in range(L):
        ### START CODE HERE ### (approx. 4 lines)
        # compute velocities
        v['dW' + str(l + 1)] = beta * v['dW' + str(l + 1)] + (1 - beta) * grads['dW' + str(l + 1)]
        v['db' + str(l + 1)] = beta * v['db' + str(l + 1)] + (1 - beta) * grads['db' + str(l + 1)]
        # update parameters
        parameters['W' + str(l + 1)] += -learning_rate * v['dW' + str(l + 1)]
        parameters['b' + str(l + 1)] += -learning_rate * v['db' + str(l + 1)]
        ### END CODE HERE ###

    return parameters, v
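In equation form, the loop above performs, for each layer l (writing α for learning_rate and β for beta):

$$v_{dW^{[l]}} = \beta\, v_{dW^{[l]}} + (1-\beta)\, dW^{[l]}, \qquad W^{[l]} = W^{[l]} - \alpha\, v_{dW^{[l]}}$$
$$v_{db^{[l]}} = \beta\, v_{db^{[l]}} + (1-\beta)\, db^{[l]}, \qquad b^{[l]} = b^{[l]} - \alpha\, v_{db^{[l]}}$$

With β = 0 this reduces to plain gradient descent; a common choice in practice is β ≈ 0.9.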
Adam
Initialization
def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
        parameters["W" + str(l)] = Wl
        parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
        v["dW" + str(l)] = ...
        v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
        s["dW" + str(l)] = ...
        s["db" + str(l)] = ...
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v = {}
    s = {}

    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        ### START CODE HERE ### (approx. 4 lines)
        v['dW' + str(l + 1)] = np.zeros(np.shape(parameters['W' + str(l + 1)]))
        v['db' + str(l + 1)] = np.zeros(np.shape(parameters['b' + str(l + 1)]))
        s['dW' + str(l + 1)] = np.zeros(np.shape(parameters['W' + str(l + 1)]))
        s['db' + str(l + 1)] = np.zeros(np.shape(parameters['b' + str(l + 1)]))
        ### END CODE HERE ###

    return v, s
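Here v will hold the exponentially weighted average of the gradients (first moment) and s the exponentially weighted average of the squared gradients (second moment). Both start at zero, which is why the update step below applies a bias correction. A minimal sketch, reusing the hypothetical parameters dictionary from the Momentum example above:

v, s = initialize_adam(parameters)
print(sorted(v.keys()))          # ['dW1', 'dW2', 'db1', 'db2']
print(np.allclose(s['dW1'], 0))  # True -- both moments start as zero arrays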
Updating the parameters
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Update parameters using Adam

    Arguments:
    parameters -- python dictionary containing your parameters:
        parameters['W' + str(l)] = Wl
        parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameter:
        grads['dW' + str(l)] = dWl
        grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- number of Adam update steps taken so far (used for bias correction)
    learning_rate -- the learning rate, scalar
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    L = len(parameters) // 2  # number of layers in the neural networks
    v_corrected = {}          # Initializing first moment estimate, python dictionary
    s_corrected = {}          # Initializing second moment estimate, python dictionary

    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients
        ### START CODE HERE ### (approx. 2 lines)
        v['dW' + str(l + 1)] = beta1 * v['dW' + str(l + 1)] + (1 - beta1) * grads['dW' + str(l + 1)]
        v['db' + str(l + 1)] = beta1 * v['db' + str(l + 1)] + (1 - beta1) * grads['db' + str(l + 1)]
        ### END CODE HERE ###

        # Bias-corrected first moment estimate
        ### START CODE HERE ### (approx. 2 lines)
        v_corrected['dW' + str(l + 1)] = v['dW' + str(l + 1)] / (1 - beta1 ** t)
        v_corrected['db' + str(l + 1)] = v['db' + str(l + 1)] / (1 - beta1 ** t)
        ### END CODE HERE ###

        # Moving average of the squared gradients
        ### START CODE HERE ### (approx. 2 lines)
        s['dW' + str(l + 1)] = beta2 * s['dW' + str(l + 1)] + (1 - beta2) * np.square(grads['dW' + str(l + 1)])
        s['db' + str(l + 1)] = beta2 * s['db' + str(l + 1)] + (1 - beta2) * np.square(grads['db' + str(l + 1)])
        ### END CODE HERE ###

        # Bias-corrected second moment estimate
        ### START CODE HERE ### (approx. 2 lines)
        s_corrected['dW' + str(l + 1)] = s['dW' + str(l + 1)] / (1 - beta2 ** t)
        s_corrected['db' + str(l + 1)] = s['db' + str(l + 1)] / (1 - beta2 ** t)
        ### END CODE HERE ###

        # Update parameters, using the bias-corrected second moment in the denominator
        ### START CODE HERE ### (approx. 2 lines)
        parameters['W' + str(l + 1)] += -learning_rate * v_corrected['dW' + str(l + 1)] / (np.sqrt(s_corrected['dW' + str(l + 1)]) + epsilon)
        parameters['b' + str(l + 1)] += -learning_rate * v_corrected['db' + str(l + 1)] / (np.sqrt(s_corrected['db' + str(l + 1)]) + epsilon)
        ### END CODE HERE ###

    return parameters, v, s
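Written out, each iteration of the loop implements the Adam update for layer l (with α for learning_rate and t counting the number of update steps taken so far):

$$v_{dW^{[l]}} = \beta_1\, v_{dW^{[l]}} + (1-\beta_1)\, dW^{[l]}, \qquad \hat{v}_{dW^{[l]}} = \frac{v_{dW^{[l]}}}{1-\beta_1^{\,t}}$$
$$s_{dW^{[l]}} = \beta_2\, s_{dW^{[l]}} + (1-\beta_2)\, \big(dW^{[l]}\big)^2, \qquad \hat{s}_{dW^{[l]}} = \frac{s_{dW^{[l]}}}{1-\beta_2^{\,t}}$$
$$W^{[l]} = W^{[l]} - \alpha\, \frac{\hat{v}_{dW^{[l]}}}{\sqrt{\hat{s}_{dW^{[l]}}} + \varepsilon}$$

with the analogous updates for b^{[l]}. Note that the denominator uses the bias-corrected second moment (s_corrected in the code); the bias correction matters mostly in the first few iterations, when v and s are still close to their zero initialization.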