cs231n Assignment 2: Two-Layer Neural Network

Two-Layer Neural Network

We use a ReLU activation for the hidden layer and a softmax classifier on top. The scores and loss of the two-layer network are

s = W2 · max(0, W1·x + b1) + b2
Li = −log( e^{s_{yi}} / Σj e^{sj} ),  L = (1/N) Σi Li + 0.5·reg·(‖W1‖² + ‖W2‖²)
Steps:
1. Loss (forward pass) and gradient (backward pass) computation

Forward: compute the scores, then compute the loss from the scores.
Backward: compute the gradients with respect to W2, b2, W1, and b1 in turn.

def loss(self, X, y=None, reg=0.0):
    # Unpack variables from the params dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.shape

    # Compute the forward pass
    scores = None
    h1 = np.maximum(0, np.dot(X, W1) + b1)   # (5, 10)
    scores = np.dot(h1, W2) + b2             # (5, 3)
    if y is None:
        return scores

    # Compute the loss
    loss = None
    exp_S = np.exp(scores)                   # (5, 3)
    sum_exp_S = np.sum(exp_S, axis=1)
    sum_exp_S = sum_exp_S.reshape(-1, 1)     # (5, 1)
    loss = np.sum(-scores[range(N), list(y)]) + np.sum(np.log(sum_exp_S))
    loss = loss / N + 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)

    # Backward pass: compute gradients
    grads = {}
    # ---------------------------------
    dscores = np.zeros(scores.shape)
    dscores[range(N), list(y)] = -1
    dscores += exp_S / sum_exp_S             # (5, 3)
    dscores /= N
    grads['W2'] = np.dot(h1.T, dscores)
    grads['W2'] += reg * W2
    grads['b2'] = np.sum(dscores, axis=0)
    # ---------------------------------
    dh1 = np.dot(dscores, W2.T)              # (5, 10)
    dh1_ReLU = (h1 > 0) * dh1                # ReLU gate: only pass gradient where h1 > 0
    grads['W1'] = X.T.dot(dh1_ReLU) + reg * W1
    grads['b1'] = np.sum(dh1_ReLU, axis=0)
    # ---------------------------------
    return loss, grads
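To verify the analytic gradients, the usual next step is a numeric gradient check. The sketch below assumes the assignment's TwoLayerNet(input_size, hidden_size, output_size, std) constructor; the toy sizes and data are made up for illustration.

import numpy as np

def rel_error(x, y):
    # Relative error, as used in the assignment notebooks
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

np.random.seed(0)
net = TwoLayerNet(input_size=4, hidden_size=10, output_size=3, std=1e-1)
X = 10 * np.random.randn(5, 4)
y = np.array([0, 1, 2, 2, 1])

loss, grads = net.loss(X, y, reg=0.05)

# Numeric gradient for W1 via central differences
h = 1e-5
W1 = net.params['W1']
dW1_num = np.zeros_like(W1)
for idx in np.ndindex(*W1.shape):
    old = W1[idx]
    W1[idx] = old + h
    lp, _ = net.loss(X, y, reg=0.05)
    W1[idx] = old - h
    lm, _ = net.loss(X, y, reg=0.05)
    W1[idx] = old
    dW1_num[idx] = (lp - lm) / (2 * h)

print('W1 relative error:', rel_error(grads['W1'], dW1_num))  # should be ~1e-8 or smaller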

2. Training function (the iteration loop: forward → backward → update → forward → backward → update ...)

def train(self, X, y, X_val, y_val,
          learning_rate=1e-3, learning_rate_decay=0.95,
          reg=5e-6, num_iters=100,
          batch_size=200, verbose=False):
    num_train = X.shape[0]
    iterations_per_epoch = max(num_train // batch_size, 1)

    # Use SGD to optimize the parameters in self.model
    loss_history = []
    train_acc_history = []
    val_acc_history = []

    for it in range(num_iters):
        X_batch = None
        y_batch = None
        # Sample a minibatch (with replacement)
        mask = np.random.choice(num_train, batch_size, replace=True)
        X_batch = X[mask]
        y_batch = y[mask]

        # Compute loss and gradients using the current minibatch
        loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
        loss_history.append(loss)

        # Vanilla SGD parameter update
        self.params['W1'] += -learning_rate * grads['W1']
        self.params['b1'] += -learning_rate * grads['b1']
        self.params['W2'] += -learning_rate * grads['W2']
        self.params['b2'] += -learning_rate * grads['b2']

        if verbose and it % 100 == 0:
            print('iteration %d / %d: loss %f' % (it, num_iters, loss))

        # Every epoch, check train and val accuracy and decay the learning rate.
        if it % iterations_per_epoch == 0:
            # Check accuracy
            train_acc = (self.predict(X_batch) == y_batch).mean()
            val_acc = (self.predict(X_val) == y_val).mean()
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # Decay the learning rate
            learning_rate *= learning_rate_decay

    return {
        'loss_history': loss_history,
        'train_acc_history': train_acc_history,
        'val_acc_history': val_acc_history,
    }
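As a usage sketch, training on the same toy data as above and plotting the loss history is a quick sanity check (this relies on the predict function from step 3 below; the plotting calls are standard matplotlib).

import matplotlib.pyplot as plt

stats = net.train(X, y, X, y,   # reuse the toy data as "validation"
                  learning_rate=1e-1, reg=5e-6,
                  num_iters=100, verbose=False)

print('Final training loss:', stats['loss_history'][-1])
plt.plot(stats['loss_history'])
plt.xlabel('iteration')
plt.ylabel('training loss')
plt.title('Training loss history')
plt.show()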

3. Prediction function (sketched below)
4. Training the parameters / tuning the hyperparameters (sketched below)
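The post leaves these two parts without code; a minimal sketch of both follows. predict simply takes the argmax of the class scores, and the hyperparameter grid, together with names like X_train, X_val and best_net, is illustrative (they refer to the CIFAR-10 splits prepared in the assignment notebook, not to anything defined here).

def predict(self, X):
    # Intended as a method of TwoLayerNet: forward pass with the current
    # parameters, then take the highest-scoring class for each example.
    h1 = np.maximum(0, X.dot(self.params['W1']) + self.params['b1'])
    scores = h1.dot(self.params['W2']) + self.params['b2']
    return np.argmax(scores, axis=1)

# Simple grid search over hidden size and learning rate (values are arbitrary).
best_net, best_val_acc = None, -1.0
for hidden_size in [50, 100, 150]:
    for lr in [1e-3, 5e-4]:
        net = TwoLayerNet(input_size, hidden_size, num_classes)
        net.train(X_train, y_train, X_val, y_val,
                  num_iters=1000, batch_size=200,
                  learning_rate=lr, learning_rate_decay=0.95, reg=0.25)
        val_acc = (net.predict(X_val) == y_val).mean()
        if val_acc > best_val_acc:
            best_val_acc, best_net = val_acc, net
print('best validation accuracy:', best_val_acc)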

Convolutional Neural Networks for Visual Recognition

Multi-Layer Fully Connected Neural Network

Two basic layers:

def affine_forward(x, w, b):
    out = None
    N = x.shape[0]
    x_new = x.reshape(N, -1)        # flatten each example into a row vector
    out = np.dot(x_new, w) + b
    cache = (x, w, b)               # no need to cache out
    return out, cache


def affine_backward(dout, cache):
    x, w, b = cache
    dx, dw, db = None, None, None
    dx = np.dot(dout, w.T)
    dx = np.reshape(dx, x.shape)
    x_new = x.reshape(x.shape[0], -1)
    dw = np.dot(x_new.T, dout)
    db = np.sum(dout, axis=0, keepdims=True)
    return dx, dw, db


def relu_forward(x):
    out = None
    out = np.maximum(0, x)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    dx, x = None, cache
    dx = dout * (x > 0)             # pass the gradient only where the input was positive
    return dx

Build a "sandwich" layer (affine followed by ReLU):

def affine_relu_forward(x, w, b):
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache


def affine_relu_backward(dout, cache):
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db
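A quick way to validate the sandwich layer is a central-difference check on one of its gradients. The sketch below only uses the functions defined above; shapes are arbitrary.

np.random.seed(0)
x = np.random.randn(4, 5)
w = np.random.randn(5, 3)
b = np.random.randn(3)
dout = np.random.randn(4, 3)

out, cache = affine_relu_forward(x, w, b)
dx, dw, db = affine_relu_backward(dout, cache)

# Numeric gradient with respect to w
h = 1e-5
dw_num = np.zeros_like(w)
for idx in np.ndindex(*w.shape):
    old = w[idx]
    w[idx] = old + h
    pos, _ = affine_relu_forward(x, w, b)
    w[idx] = old - h
    neg, _ = affine_relu_forward(x, w, b)
    w[idx] = old
    dw_num[idx] = np.sum((pos - neg) * dout) / (2 * h)

print('dw relative error:', np.max(np.abs(dw - dw_num) /
      np.maximum(1e-8, np.abs(dw) + np.abs(dw_num))))  # expect ~1e-8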

FullyConnectedNet:

class FullyConnectedNet(object):
    def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
                 dropout=0, use_batchnorm=False, reg=0.0,
                 weight_scale=1e-2, dtype=np.float32, seed=None):
        self.use_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # layers_dims stores the size of every layer
        layers_dims = [input_dim] + hidden_dims + [num_classes]
        for i in range(self.num_layers):
            self.params['W' + str(i + 1)] = weight_scale * np.random.randn(layers_dims[i], layers_dims[i + 1])
            self.params['b' + str(i + 1)] = np.zeros((1, layers_dims[i + 1]))
            if self.use_batchnorm and i < len(hidden_dims):
                # the last layer does not get batchnorm
                self.params['gamma' + str(i + 1)] = np.ones((1, layers_dims[i + 1]))
                self.params['beta' + str(i + 1)] = np.zeros((1, layers_dims[i + 1]))

        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        self.bn_params = []
        if self.use_batchnorm:
            self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'
        if self.dropout_param is not None:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        scores = None
        h, cache1, cache2, cache3, cache4, bn, out = {}, {}, {}, {}, {}, {}, {}
        out[0] = X  # store the output of every layer; by convention X is out[0]

        # Forward pass: compute loss
        for i in range(self.num_layers - 1):
            # fetch this layer's parameters
            w, b = self.params['W' + str(i + 1)], self.params['b' + str(i + 1)]
            if self.use_batchnorm:
                gamma, beta = self.params['gamma' + str(i + 1)], self.params['beta' + str(i + 1)]
                h[i], cache1[i] = affine_forward(out[i], w, b)
                bn[i], cache2[i] = batchnorm_forward(h[i], gamma, beta, self.bn_params[i])
                out[i + 1], cache3[i] = relu_forward(bn[i])
                if self.use_dropout:
                    out[i + 1], cache4[i] = dropout_forward(out[i + 1], self.dropout_param)
            else:
                out[i + 1], cache3[i] = affine_relu_forward(out[i], w, b)
                if self.use_dropout:
                    out[i + 1], cache4[i] = dropout_forward(out[i + 1], self.dropout_param)

        W, b = self.params['W' + str(self.num_layers)], self.params['b' + str(self.num_layers)]
        scores, cache = affine_forward(out[self.num_layers - 1], W, b)  # last (affine-only) layer

        if mode == 'test':
            return scores

        loss, grads = 0.0, {}
        data_loss, dscores = softmax_loss(scores, y)
        reg_loss = 0
        for i in range(self.num_layers):
            reg_loss += 0.5 * self.reg * np.sum(self.params['W' + str(i + 1)] * self.params['W' + str(i + 1)])
        loss = data_loss + reg_loss

        # Backward pass: compute gradients
        dout, dbn, dh, ddrop = {}, {}, {}, {}
        t = self.num_layers - 1
        # this cache is the one returned by the last affine_forward above
        dout[t], grads['W' + str(t + 1)], grads['b' + str(t + 1)] = affine_backward(dscores, cache)
        for i in range(t):
            if self.use_batchnorm:
                if self.use_dropout:
                    dout[t - i] = dropout_backward(dout[t - i], cache4[t - 1 - i])
                dbn[t - 1 - i] = relu_backward(dout[t - i], cache3[t - 1 - i])
                dh[t - 1 - i], grads['gamma' + str(t - i)], grads['beta' + str(t - i)] = batchnorm_backward(dbn[t - 1 - i], cache2[t - 1 - i])
                dout[t - 1 - i], grads['W' + str(t - i)], grads['b' + str(t - i)] = affine_backward(dh[t - 1 - i], cache1[t - 1 - i])
            else:
                if self.use_dropout:
                    dout[t - i] = dropout_backward(dout[t - i], cache4[t - 1 - i])
                dout[t - 1 - i], grads['W' + str(t - i)], grads['b' + str(t - i)] = affine_relu_backward(dout[t - i], cache3[t - 1 - i])

        # Add the regularization gradient contribution
        for i in range(self.num_layers):
            grads['W' + str(i + 1)] += self.reg * self.params['W' + str(i + 1)]
        return loss, grads
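As a usage sketch: build a small three-layer net on random data and run one loss/gradient evaluation. This assumes softmax_loss from the assignment's layers.py (the post does not reproduce it); the sizes are arbitrary.

np.random.seed(0)
N, D, C = 10, 15, 10
X = np.random.randn(N, D).astype(np.float64)
y = np.random.randint(C, size=N)

model = FullyConnectedNet([20, 30], input_dim=D, num_classes=C,
                          dropout=0.5, use_batchnorm=True, reg=0.1,
                          weight_scale=5e-2, dtype=np.float64, seed=123)
loss, grads = model.loss(X, y)
print('initial loss:', loss)
print('gradient keys:', sorted(grads.keys()))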

Use the Solver to optimize the neural network.

The parameters are then updated with one of the following rules (a sketch of two of them follows the list):

  1. SGD
  2. Momentum
  3. Nesterov
  4. RMSProp and Adam
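Below is a sketch of two of these rules, following the update(w, dw, config) -> (next_w, config) interface that the assignment's optim.py uses. The default hyperparameter values here are common choices, not necessarily the assignment's defaults.

def sgd_momentum(w, dw, config=None):
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.get('velocity', np.zeros_like(w))
    v = config['momentum'] * v - config['learning_rate'] * dw   # accumulate velocity
    next_w = w + v
    config['velocity'] = v
    return next_w, config


def adam(w, dw, config=None):
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)
    config['t'] += 1
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dw
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * dw ** 2
    mt = config['m'] / (1 - config['beta1'] ** config['t'])     # bias correction
    vt = config['v'] / (1 - config['beta2'] ** config['t'])
    next_w = w - config['learning_rate'] * mt / (np.sqrt(vt) + config['epsilon'])
    return next_w, config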

Batch Normalization


BN layer forward pass (over a minibatch of m examples, per feature dimension):

μ = (1/m) Σᵢ xᵢ
σ² = (1/m) Σᵢ (xᵢ − μ)²
x̂ᵢ = (xᵢ − μ) / √(σ² + ε)
yᵢ = γ·x̂ᵢ + β

BN layer backward pass (with doutᵢ = ∂L/∂yᵢ, matching the code below):

∂L/∂γ = Σᵢ doutᵢ·x̂ᵢ,  ∂L/∂β = Σᵢ doutᵢ
∂L/∂x̂ᵢ = doutᵢ·γ
∂L/∂σ² = Σᵢ ∂L/∂x̂ᵢ · (xᵢ − μ) · (−1/2)(σ² + ε)^(−3/2)
∂L/∂μ = −Σᵢ ∂L/∂x̂ᵢ / √(σ² + ε) − ∂L/∂σ² · (2/m) Σᵢ (xᵢ − μ)
∂L/∂xᵢ = ∂L/∂x̂ᵢ / √(σ² + ε) + ∂L/∂σ² · 2(xᵢ − μ)/m + ∂L/∂μ / m

def batchnorm_forward(x, gamma, beta, bn_param):
    mode = bn_param['mode']          # train and test behave differently
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        sample_mean = np.mean(x, axis=0, keepdims=True)                 # [1, D]
        sample_var = np.var(x, axis=0, keepdims=True)                   # [1, D]
        x_normalized = (x - sample_mean) / np.sqrt(sample_var + eps)    # [N, D]
        out = gamma * x_normalized + beta
        cache = (x_normalized, gamma, beta, sample_mean, sample_var, x, eps)
        # Keep exponential moving averages for use at test time
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var
    elif mode == 'test':
        # At test time, normalize with the running statistics
        x_normalized = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_normalized + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache


def batchnorm_backward(dout, cache):
    dx, dgamma, dbeta = None, None, None
    x_normalized, gamma, beta, sample_mean, sample_var, x, eps = cache
    N, D = x.shape
    dx_normalized = dout * gamma                          # [N, D]
    x_mu = x - sample_mean                                # [N, D]
    sample_std_inv = 1.0 / np.sqrt(sample_var + eps)      # [1, D]
    dsample_var = -0.5 * np.sum(dx_normalized * x_mu, axis=0, keepdims=True) * sample_std_inv**3
    dsample_mean = -1.0 * np.sum(dx_normalized * sample_std_inv, axis=0, keepdims=True) - \
                   2.0 * dsample_var * np.mean(x_mu, axis=0, keepdims=True)
    dx1 = dx_normalized * sample_std_inv
    dx2 = 2.0 / N * dsample_var * x_mu
    dx = dx1 + dx2 + 1.0 / N * dsample_mean
    dgamma = np.sum(dout * x_normalized, axis=0, keepdims=True)
    dbeta = np.sum(dout, axis=0, keepdims=True)
    return dx, dgamma, dbeta
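A quick sanity check (a sketch using only the function above): in train mode, with gamma = 1 and beta = 0, each output feature should have roughly zero mean and unit standard deviation, even for a deliberately shifted and scaled input.

np.random.seed(1)
x = 10 + 4 * np.random.randn(200, 3)            # shifted, scaled input
gamma, beta = np.ones((1, 3)), np.zeros((1, 3))
bn_param = {'mode': 'train'}
out, _ = batchnorm_forward(x, gamma, beta, bn_param)
print(out.mean(axis=0))   # ~0 for every feature
print(out.std(axis=0))    # ~1 for every feature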

One important problem that Batch Normalization addresses is gradient saturation: by keeping each layer's inputs normalized, activations are less likely to land in the saturated regions of the nonlinearity, so gradients do not vanish as easily.

Dropout

During training, neurons in each layer are randomly dropped with a certain probability.
This helps prevent overfitting. Dropout can also be understood as a regularization operation: on every training pass it forces some features to zero, which improves the sparsity of the network's representation.

def dropout_forward(x, dropout_param):
    p, mode = dropout_param['p'], dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask = None
    out = None
    if mode == 'train':
        # Inverted dropout: dividing by p here means the test-time pass
        # can simply return x unchanged.
        mask = (np.random.rand(*x.shape) < p) / p
        out = x * mask
    elif mode == 'test':
        out = x

    cache = (dropout_param, mask)
    out = out.astype(x.dtype, copy=False)
    return out, cache


def dropout_backward(dout, cache):
    dropout_param, mask = cache
    mode = dropout_param['mode']
    dx = None
    if mode == 'train':
        dx = dout * mask
    elif mode == 'test':
        dx = dout
    return dx
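Because of the division by p (inverted dropout), the expected activation is preserved, so train-mode and test-mode outputs have roughly the same mean. A small sketch to confirm, with arbitrary sizes:

np.random.seed(2)
x = np.random.randn(500, 500) + 10
out_train, _ = dropout_forward(x, {'mode': 'train', 'p': 0.5})
out_test, _ = dropout_forward(x, {'mode': 'test', 'p': 0.5})
print(x.mean(), out_train.mean(), out_test.mean())   # all roughly equal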

Convolutional Neural Networks

Forward and backward passes of the convolutional layer

With an H×W input, an HH×WW filter, zero padding pad and stride stride, the output spatial size is H_out = 1 + (H + 2·pad − HH)/stride (and analogously for W_out). The backward pass routes each output gradient back to the input window it came from, accumulating contributions where windows overlap.

def conv_forward_naive(x, w, b, conv_param):
    stride, pad = conv_param['stride'], conv_param['pad']
    N, C, H, W = x.shape
    F, C, HH, WW = w.shape
    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')  # zero padding
    H_new = 1 + (H + 2 * pad - HH) // stride
    W_new = 1 + (W + 2 * pad - WW) // stride
    s = stride
    out = np.zeros((N, F, H_new, W_new))

    for i in range(N):            # ith image
        for f in range(F):        # fth filter
            for j in range(H_new):
                for k in range(W_new):
                    # elementwise multiply the window by the filter, then sum
                    out[i, f, j, k] = np.sum(x_padded[i, :, j*s:HH+j*s, k*s:WW+k*s] * w[f]) + b[f]

    cache = (x, w, b, conv_param)
    return out, cache


def conv_backward_naive(dout, cache):
    x, w, b, conv_param = cache
    pad = conv_param['pad']
    stride = conv_param['stride']
    F, C, HH, WW = w.shape
    N, C, H, W = x.shape
    H_new = 1 + (H + 2 * pad - HH) // stride
    W_new = 1 + (W + 2 * pad - WW) // stride

    dx = np.zeros_like(x)
    dw = np.zeros_like(w)
    db = np.zeros_like(b)

    s = stride
    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
    dx_padded = np.pad(dx, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')

    for i in range(N):            # ith image
        for f in range(F):        # fth filter
            for j in range(H_new):
                for k in range(W_new):
                    window = x_padded[i, :, j*s:HH+j*s, k*s:WW+k*s]
                    db[f] += dout[i, f, j, k]
                    dw[f] += window * dout[i, f, j, k]
                    # accumulate with += because neighbouring windows overlap
                    dx_padded[i, :, j*s:HH+j*s, k*s:WW+k*s] += w[f] * dout[i, f, j, k]

    # Unpad
    dx = dx_padded[:, :, pad:pad+H, pad:pad+W]
    return dx, dw, db
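A shape check makes the output-size formula concrete (the sizes below are arbitrary): with H = W = 32, filter size 7, pad = 3 and stride = 2, each spatial output dimension is 1 + (32 + 2·3 − 7) // 2 = 16.

x = np.random.randn(2, 3, 32, 32)
w = np.random.randn(5, 3, 7, 7)
b = np.random.randn(5)
out, _ = conv_forward_naive(x, w, b, {'stride': 2, 'pad': 3})
print(out.shape)   # (2, 5, 16, 16)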

Pooling layer

def max_pool_forward_naive(x, pool_param):
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_new = 1 + (H - HH) // s
    W_new = 1 + (W - WW) // s
    out = np.zeros((N, C, H_new, W_new))

    for i in range(N):
        for j in range(C):
            for k in range(H_new):
                for l in range(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    out[i, j, k, l] = np.max(window)

    cache = (x, pool_param)
    return out, cache


def max_pool_backward_naive(dout, cache):
    x, pool_param = cache
    HH, WW = pool_param['pool_height'], pool_param['pool_width']
    s = pool_param['stride']
    N, C, H, W = x.shape
    H_new = 1 + (H - HH) // s
    W_new = 1 + (W - WW) // s
    dx = np.zeros_like(x)

    for i in range(N):
        for j in range(C):
            for k in range(H_new):
                for l in range(W_new):
                    window = x[i, j, k*s:HH+k*s, l*s:WW+l*s]
                    # record the max of the window; positions equal to it receive the gradient
                    m = np.max(window)
                    dx[i, j, k*s:HH+k*s, l*s:WW+l*s] = (window == m) * dout[i, j, k, l]

    return dx
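Similarly for pooling (a sketch with arbitrary sizes): 2×2 max pooling with stride 2 halves each spatial dimension.

x = np.random.randn(2, 3, 8, 8)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, _ = max_pool_forward_naive(x, pool_param)
print(out.shape)   # (2, 3, 4, 4)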
