cs231n作业2 双层神经网络



1、Forward: 先计算score,再根据score计算loss

def loss(self, X, y=None, reg=0.0):
    """Compute the loss and gradients for a two-layer fully connected net.

    The forward pass is affine -> ReLU -> affine, followed by a softmax
    cross-entropy loss with L2 regularization on both weight matrices.

    Args:
        X: Input data of shape (N, D).
        y: Integer class labels, one per row of X. If None, only the class
            scores are computed and returned.
        reg: L2 regularization strength.

    Returns:
        If y is None, the score matrix of shape (N, C).
        Otherwise a tuple (loss, grads) where loss is a scalar and grads maps
        'W1', 'b1', 'W2', 'b2' to gradients shaped like the matching entries
        of self.params.
    """
    # Unpack variables from the params dictionary.
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N = X.shape[0]

    # Forward pass: hidden activations, then class scores.
    h1 = np.maximum(0, np.dot(X, W1) + b1)
    scores = np.dot(h1, W2) + b2
    if y is None:
        return scores

    y = np.asarray(y)
    rows = np.arange(N)

    # Shift each row by its maximum before exponentiating: the softmax is
    # unchanged, but np.exp can no longer overflow for large scores.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_s = np.exp(shifted)
    sum_exp = np.sum(exp_s, axis=1, keepdims=True)
    probs = exp_s / sum_exp

    # Mean cross-entropy plus L2 penalty; np.sum keeps the result a scalar.
    data_loss = np.sum(np.log(sum_exp[:, 0]) - shifted[rows, y]) / N
    reg_loss = 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)
    loss = data_loss + reg_loss

    # Backward pass: gradient of the loss w.r.t. the scores is
    # (softmax probabilities - one-hot labels) / N.
    grads = {}
    dscores = probs
    dscores[rows, y] -= 1
    dscores /= N

    grads['W2'] = np.dot(h1.T, dscores) + reg * W2
    grads['b2'] = np.sum(dscores, axis=0)

    # Back through the ReLU: gradient flows only where the unit was active.
    dh1 = np.dot(dscores, W2.T)
    dh1_relu = (h1 > 0) * dh1
    grads['W1'] = np.dot(X.T, dh1_relu) + reg * W1
    grads['b1'] = np.sum(dh1_relu, axis=0)

    return loss, grads

2、训练函数 (迭代过程:forward -> backward -> update -> forward -> backward -> update ……)

def train(self, X, y, X_val, y_val,
          learning_rate=1e-3, learning_rate_decay=0.95,
          reg=5e-6, num_iters=100,
          batch_size=200, verbose=False):
    """Train the network with minibatch stochastic gradient descent.

    Each iteration samples a minibatch (with replacement), calls self.loss to
    get the loss and gradients, and applies a plain SGD update to
    self.params. Once per epoch the accuracy on the current minibatch and on
    the validation set is recorded and the learning rate is decayed.

    Args:
        X: Training data; the first axis indexes samples.
        y: Training labels, indexable in parallel with X.
        X_val: Validation data passed to self.predict.
        y_val: Validation labels compared against self.predict(X_val).
        learning_rate: Initial step size.
        learning_rate_decay: Factor applied to the learning rate once per epoch.
        reg: Regularization strength forwarded to self.loss.
        num_iters: Number of SGD iterations.
        batch_size: Number of samples drawn per iteration.
        verbose: If true, print the loss every 100 iterations.

    Returns:
        A dict with 'loss_history', 'train_acc_history' and 'val_acc_history'.

    Note:
        Relies on self.predict, which is defined outside this block.
    """
    num_train = X.shape[0]
    # Integer division keeps the epoch length an int on Python 3 as well, so
    # the modulo check below fires on exact iteration counts.
    iterations_per_epoch = max(num_train // batch_size, 1)

    loss_history = []
    train_acc_history = []
    val_acc_history = []

    for it in range(num_iters):
        # Sample a minibatch with replacement.
        mask = np.random.choice(num_train, batch_size, replace=True)
        X_batch = X[mask]
        y_batch = y[mask]

        # Compute loss and gradients using the current minibatch.
        batch_loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
        loss_history.append(batch_loss)

        # Vanilla SGD step, applied in place to every parameter.
        for name in ('W1', 'b1', 'W2', 'b2'):
            self.params[name] += -learning_rate * grads[name]

        if verbose and it % 100 == 0:
            print('iteration %d / %d: loss %f' % (it, num_iters, batch_loss))

        # Every epoch, check train and val accuracy and decay learning rate.
        if it % iterations_per_epoch == 0:
            train_acc = (self.predict(X_batch) == y_batch).mean()
            val_acc = (self.predict(X_val) == y_val).mean()
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # Shrink the step size as training progresses.
            learning_rate *= learning_rate_decay

    return {
        'loss_history': loss_history,
        'train_acc_history': train_acc_history,
        'val_acc_history': val_acc_history,
    }





def affine_forward(x, w, b):
    """Forward pass of an affine (fully connected) layer.

    Args:
        x: Input array whose first axis indexes samples; each sample is
            flattened to a row before the matrix product.
        w: Weight matrix applied to the flattened rows.
        b: Bias added to every output row.

    Returns:
        A tuple (out, cache) where out = x_flat.dot(w) + b and cache is
        (x, w, b) for use in affine_backward.
    """
    num_samples = x.shape[0]
    x_rows = x.reshape(num_samples, -1)  # flatten each sample into a row
    out = np.dot(x_rows, w) + b
    cache = (x, w, b)  # the output itself is not needed for the backward pass
    return out, cache


def affine_backward(dout, cache):
    """Backward pass of an affine layer.

    Args:
        dout: Upstream gradient with one row per sample.
        cache: The (x, w, b) tuple produced by affine_forward.

    Returns:
        A tuple (dx, dw, db): dx has the shape of x, dw the shape of w, and
        db is the column-wise sum of dout with a leading axis of length 1.
    """
    x, w, b = cache
    dx = np.reshape(np.dot(dout, w.T), x.shape)
    x_rows = x.reshape(x.shape[0], -1)
    dw = np.dot(x_rows.T, dout)
    db = np.sum(dout, axis=0, keepdims=True)
    return dx, dw, db


def relu_forward(x):
    """Forward pass of a ReLU: elementwise max(0, x).

    Returns:
        A tuple (out, cache) where cache is the input x.
    """
    out = np.maximum(0, x)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    """Backward pass of a ReLU.

    Args:
        dout: Upstream gradient, same shape as the ReLU input.
        cache: The input x saved by relu_forward.

    Returns:
        dx: dout where the input was positive, zero elsewhere.
    """
    x = cache
    # The gradient passes through only where the unit was active.
    dx = dout * (x > 0)
    return dx


def affine_relu_forward(x, w, b):
    """Apply affine_forward followed by relu_forward.

    Args:
        x: Input passed to affine_forward.
        w: Weights passed to affine_forward.
        b: Biases passed to affine_forward.

    Returns:
        A tuple (out, cache): out is the ReLU output and cache is the pair
        (affine cache, ReLU cache) consumed by affine_relu_backward.
    """
    pre_activation, affine_cache = affine_forward(x, w, b)
    activated, activation_cache = relu_forward(pre_activation)
    return activated, (affine_cache, activation_cache)


def affine_relu_backward(dout, cache):
    """Backpropagate through the ReLU and then the affine layer.

    Args:
        dout: Upstream gradient w.r.t. the output of affine_relu_forward.
        cache: The (affine cache, ReLU cache) pair from affine_relu_forward.

    Returns:
        A tuple (dx, dw, db) as produced by affine_backward.
    """
    affine_cache, activation_cache = cache
    d_pre_activation = relu_backward(dout, activation_cache)
    grad_x, grad_w, grad_b = affine_backward(d_pre_activation, affine_cache)
    return grad_x, grad_w, grad_b


class FullyConnectedNet(object):
    """Fully connected network with an arbitrary number of hidden layers.

    Each hidden layer is affine -> [batchnorm] -> ReLU -> [dropout]; the last
    layer is a plain affine layer whose scores feed softmax_loss. Learnable
    parameters live in self.params under the keys 'W1', 'b1', ... and, when
    batch normalization is enabled, 'gamma1', 'beta1', ... for the hidden
    layers.

    Note:
        loss() relies on the layer functions (affine_forward,
        batchnorm_forward, dropout_forward, ...) and on softmax_loss, which
        are defined outside this class.
    """

    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        """Initialize the network parameters.

        Args:
            hidden_dims: Sequence with the size of each hidden layer.
            input_dim: Size of a flattened input sample.
            num_classes: Number of output classes.
            dropout: Dropout parameter 'p'; dropout is used only if > 0.
            use_batchnorm: Whether to insert batch normalization layers.
            reg: L2 regularization strength.
            weight_scale: Standard deviation of the random weight init.
            dtype: NumPy dtype all parameters are cast to.
            seed: If not None, stored in the dropout parameters.
        """
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # Sizes of every layer boundary: input, hidden layers, output.
        # list() lets callers pass a tuple as well as a list.
        layers_dims = [input_dim] + list(hidden_dims) + [num_classes]
        for i in range(self.num_layers):
            idx = str(i + 1)
            fan_in, fan_out = layers_dims[i], layers_dims[i + 1]
            self.params['W' + idx] = weight_scale * np.random.randn(fan_in, fan_out)
            self.params['b' + idx] = np.zeros((1, fan_out))
            # The final (output) layer gets no batch normalization.
            if self.use_batchnorm and i < len(hidden_dims):
                self.params['gamma' + idx] = np.ones((1, fan_out))
                self.params['beta' + idx] = np.zeros((1, fan_out))

        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # One parameter dict per batchnorm layer (it holds the running stats).
        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype.
        for k, v in list(self.params.items()):
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        """Compute scores, or loss and gradients, for a minibatch.

        Args:
            X: Input data; cast to self.dtype before use.
            y: Labels. If None the network runs in test mode and only the
                scores are returned.

        Returns:
            If y is None, the class scores.
            Otherwise a tuple (loss, grads) where grads has one entry per key
            of self.params.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Dropout and batchnorm behave differently in train and test mode.
        if self.dropout_param is not None:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                # The layer reads the 'mode' key, so that is the key to set.
                bn_param['mode'] = mode

        affine_caches, bn_caches, relu_caches, dropout_caches = {}, {}, {}, {}
        out = {0: X}  # out[i] is the input to layer i (0-based)

        # Forward pass through the hidden layers.
        for i in range(self.num_layers - 1):
            idx = str(i + 1)
            w, b = self.params['W' + idx], self.params['b' + idx]
            if self.use_batchnorm:
                gamma, beta = self.params['gamma' + idx], self.params['beta' + idx]
                h, affine_caches[i] = affine_forward(out[i], w, b)
                normed, bn_caches[i] = batchnorm_forward(h, gamma, beta, self.bn_params[i])
                out[i + 1], relu_caches[i] = relu_forward(normed)
            else:
                out[i + 1], relu_caches[i] = affine_relu_forward(out[i], w, b)
            if self.use_dropout:
                out[i + 1], dropout_caches[i] = dropout_forward(out[i + 1], self.dropout_param)

        # Final affine layer produces the class scores.
        last = str(self.num_layers)
        W, b = self.params['W' + last], self.params['b' + last]
        scores, final_cache = affine_forward(out[self.num_layers - 1], W, b)

        if mode == 'test':
            return scores

        grads = {}
        data_loss, dscores = softmax_loss(scores, y)
        reg_loss = 0
        for i in range(self.num_layers):
            W_i = self.params['W' + str(i + 1)]
            reg_loss += 0.5 * self.reg * np.sum(W_i * W_i)
        loss = data_loss + reg_loss

        # Backward pass: start at the output layer and walk back to the input.
        t = self.num_layers - 1
        dout = {}
        dout[t], grads['W' + last], grads['b' + last] = affine_backward(dscores, final_cache)
        for layer in range(t - 1, -1, -1):
            idx = str(layer + 1)
            upstream = dout[layer + 1]
            if self.use_dropout:
                upstream = dropout_backward(upstream, dropout_caches[layer])
            if self.use_batchnorm:
                dbn = relu_backward(upstream, relu_caches[layer])
                dh, grads['gamma' + idx], grads['beta' + idx] = batchnorm_backward(
                    dbn, bn_caches[layer])
                dout[layer], grads['W' + idx], grads['b' + idx] = affine_backward(
                    dh, affine_caches[layer])
            else:
                dout[layer], grads['W' + idx], grads['b' + idx] = affine_relu_backward(
                    upstream, relu_caches[layer])

        # Add the regularization gradient contribution.
        for i in range(self.num_layers):
            key = 'W' + str(i + 1)
            grads[key] += self.reg * self.params[key]

        return loss, grads



  1. SGD
  2. Momentum
  3. Nesterov Momentum
  4. RMSProp and Adam





def batchnorm_forward(x, gamma, beta, bn_param):
    """Forward pass of batch normalization.

    In 'train' mode the batch mean and variance normalize x and are blended
    into the running statistics with the given momentum. In 'test' mode the
    stored running statistics are used instead.

    Args:
        x: Two-dimensional data array; statistics are taken over axis 0.
        gamma: Scale applied to the normalized data.
        beta: Shift added after scaling.
        bn_param: Dict with key 'mode' ('train' or 'test') and optional keys
            'eps' (default 1e-5), 'momentum' (default 0.9), 'running_mean'
            and 'running_var'. The running statistics are written back into
            this dict.

    Returns:
        A tuple (out, cache). cache is None in test mode; in train mode it is
        the tuple consumed by batchnorm_backward.

    Raises:
        ValueError: If bn_param['mode'] is neither 'train' nor 'test'.
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    _, num_features = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(num_features, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(num_features, dtype=x.dtype))

    if mode not in ('train', 'test'):
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    cache = None
    if mode == 'train':
        batch_mean = np.mean(x, axis=0, keepdims=True)
        batch_var = np.var(x, axis=0, keepdims=True)
        x_hat = (x - batch_mean) / np.sqrt(batch_var + eps)
        cache = (x_hat, gamma, beta, batch_mean, batch_var, x, eps)
        # Exponential moving average of the batch statistics, for test time.
        running_mean = momentum * running_mean + (1 - momentum) * batch_mean
        running_var = momentum * running_var + (1 - momentum) * batch_var
    else:
        # Test time: normalize with the accumulated running statistics.
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
    out = gamma * x_hat + beta

    # Store the (possibly updated) running statistics back into bn_param.
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache


def batchnorm_backward(dout, cache):
    """Backward pass of batch normalization (train mode).

    Args:
        dout: Upstream gradient, same shape as the forward input x.
        cache: Tuple saved by batchnorm_forward in train mode.

    Returns:
        A tuple (dx, dgamma, dbeta); dgamma and dbeta are sums over axis 0
        with that axis kept.
    """
    x_hat, gamma, beta, mu, var, x, eps = cache
    batch_size, _ = x.shape

    inv_std = 1.0 / np.sqrt(var + eps)
    centered = x - mu
    dx_hat = dout * gamma

    # Gradients flowing into the batch variance and the batch mean.
    dvar = -0.5 * np.sum(dx_hat * centered, axis=0, keepdims=True) * inv_std**3
    dmu = (-1.0 * np.sum(dx_hat * inv_std, axis=0, keepdims=True)
           - 2.0 * dvar * np.mean(centered, axis=0, keepdims=True))

    # x affects the output directly, through the variance and through the mean.
    direct_term = dx_hat * inv_std
    variance_term = 2.0 / batch_size * dvar * centered
    dx = direct_term + variance_term + 1.0 / batch_size * dmu

    dgamma = np.sum(dout * x_hat, axis=0, keepdims=True)
    dbeta = np.sum(dout, axis=0, keepdims=True)
    return dx, dgamma, dbeta

Batch Normalization解决的一个重要问题就是梯度饱和。



def dropout_forward(x, dropout_param):
    """Forward pass of (inverted) dropout.

    Args:
        x: Input array.
        dropout_param: Dict with keys 'p' and 'mode' ('train' or 'test') and
            an optional 'seed'. In train mode each element is kept when a
            uniform random draw is below p, and kept values are divided by p.

    Returns:
        A tuple (out, cache) where cache is (dropout_param, mask); mask is
        None in test mode.
    """
    keep_prob = dropout_param['p']
    mode = dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    scaled_mask, result = None, None
    if mode == 'train':
        # Dividing by p here means test time can pass the input through
        # unchanged.
        kept = np.random.rand(*x.shape) < keep_prob
        scaled_mask = kept / keep_prob
        result = x * scaled_mask
    elif mode == 'test':
        result = x

    cache = (dropout_param, scaled_mask)
    result = result.astype(x.dtype, copy=False)
    return result, cache


def dropout_backward(dout, cache):
    """Backward pass of (inverted) dropout.

    Args:
        dout: Upstream gradient.
        cache: The (dropout_param, mask) tuple from dropout_forward.

    Returns:
        dout multiplied by the mask in train mode, dout unchanged in test
        mode, and None for any other mode.
    """
    dropout_param, scaled_mask = cache
    mode = dropout_param['mode']
    if mode == 'train':
        return dout * scaled_mask
    if mode == 'test':
        return dout
    return None



def conv_forward_naive(x, w, b, conv_param):
    """Naive (loop-based) forward pass of a convolutional layer.

    Args:
        x: Input of shape (N, C, H, W).
        w: Filters of shape (F, C, HH, WW).
        b: Biases, one per filter.
        conv_param: Dict with keys 'stride' and 'pad' (zero padding applied
            to both spatial axes).

    Returns:
        A tuple (out, cache) where out has shape (N, F, H_out, W_out) with
        H_out = 1 + (H + 2 * pad - HH) // stride (likewise for W_out), and
        cache is (x, w, b, conv_param).
    """
    stride, pad = conv_param['stride'], conv_param['pad']
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape

    # Zero-pad only the two spatial axes.
    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')

    # Floor division keeps the output sizes integral on Python 3 as well.
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):                 # nth image
        for f in range(F):             # fth filter
            for i in range(H_out):
                top = i * stride
                for j in range(W_out):
                    left = j * stride
                    window = x_padded[n, :, top:top + HH, left:left + WW]
                    # Elementwise product of window and filter, summed.
                    out[n, f, i, j] = np.sum(window * w[f]) + b[f]

    cache = (x, w, b, conv_param)
    return out, cache


def conv_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a convolutional layer.

    Args:
        dout: Upstream gradient with the shape of the forward output.
        cache: The (x, w, b, conv_param) tuple from conv_forward_naive.

    Returns:
        A tuple (dx, dw, db) with the shapes of x, w and b respectively.
    """
    x, w, b, conv_param = cache
    stride, pad = conv_param['stride'], conv_param['pad']
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride

    dw = np.zeros_like(w)
    db = np.zeros_like(b)

    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
    dx_padded = np.zeros_like(x_padded)

    for n in range(N):                 # nth image
        for f in range(F):             # fth filter
            for i in range(H_out):
                top = i * stride
                for j in range(W_out):
                    left = j * stride
                    grad = dout[n, f, i, j]
                    window = x_padded[n, :, top:top + HH, left:left + WW]
                    db[f] += grad
                    dw[f] += window * grad
                    # Windows overlap, so contributions must accumulate.
                    dx_padded[n, :, top:top + HH, left:left + WW] += w[f] * grad

    # Strip the padding to recover the gradient w.r.t. the original input.
    dx = dx_padded[:, :, pad:pad + H, pad:pad + W]
    return dx, dw, db


def max_pool_forward_naive(x, pool_param):
    """Naive (loop-based) forward pass of a max pooling layer.

    Args:
        x: Input of shape (N, C, H, W).
        pool_param: Dict with keys 'pool_height', 'pool_width' and 'stride'.

    Returns:
        A tuple (out, cache) where out has shape (N, C, H_out, W_out) with
        H_out = 1 + (H - pool_height) // stride (likewise for W_out), and
        cache is (x, pool_param).
    """
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape

    # Floor division keeps the output sizes integral on Python 3 as well.
    H_out = 1 + (H - HH) // s
    W_out = 1 + (W - WW) // s

    out = np.zeros((N, C, H_out, W_out))
    for n in range(N):
        for c in range(C):
            for i in range(H_out):
                for j in range(W_out):
                    window = x[n, c, i * s:i * s + HH, j * s:j * s + WW]
                    out[n, c, i, j] = np.max(window)

    cache = (x, pool_param)
    return out, cache


def max_pool_backward_naive(dout, cache):
    """Naive (loop-based) backward pass of a max pooling layer.

    Args:
        dout: Upstream gradient with the shape of the forward output.
        cache: The (x, pool_param) tuple from max_pool_forward_naive.

    Returns:
        dx: Gradient w.r.t. x, same shape as x. Each upstream value is routed
        to every position in its window that equals the window maximum.
    """
    x, pool_param = cache
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_out = 1 + (H - HH) // s
    W_out = 1 + (W - WW) // s

    dx = np.zeros_like(x)
    for n in range(N):
        for c in range(C):
            for i in range(H_out):
                for j in range(W_out):
                    rows = slice(i * s, i * s + HH)
                    cols = slice(j * s, j * s + WW)
                    window = x[n, c, rows, cols]
                    # Recompute the max so (window == peak) marks where it was.
                    peak = np.max(window)
                    # Accumulate rather than assign: with stride smaller than
                    # the pool size the windows overlap, and a plain
                    # assignment would overwrite earlier contributions.
                    dx[n, c, rows, cols] += (window == peak) * dout[n, c, i, j]
    return dx






