




  1. 介绍
  2. 先决条件
  3. 导入我们的库
  4. 激活函数及求导
  5. 神经网络类
  6. 初始化权重和偏差
  7. 正向传播
  8. 成本函数
  9. 反向传播
  10. 预测新数据集的标签




我假设你已经知道神经网络是什么,以及它们是如何学习的。如果你熟悉Python以及numpy这类库,本文会很容易理解。另外,还需要较好地掌握线性代数和微积分,才能轻松理解正向传播和反向传播部分。此外,我强烈建议您观看Andrew Ng在Coursera上的课程视频(https://www.coursera.org/ ; https://www.deeplearning.ai/ )。



# Importing the librariesimport numpy as npimport matplotlib.pyplot as pltimport pandas as pdimport warningsimport timewarnings.filterwarnings('ignore');import osimport sys

我们将使用pandas导入和清理我们的数据集。Numpy 是执行矩阵代数和复杂计算的最重要的库。




def sigmoid(z):
    """Return the element-wise sigmoid of z."""
    return 1. / (1 + np.exp(-z))


def sigmoid_prime(z):
    """Return the element-wise derivative of the sigmoid function at z."""
    s = sigmoid(z)
    return s * (1 - s)


def ReLU(z):
    """Return the element-wise ReLU of z (z where z > 0, zero elsewhere)."""
    return z * (z > 0)


def ReLU_prime(z):
    """Return the element-wise derivative of the ReLU function.

    The comparison is ``>=``, so the value returned at z == 0 is 1.
    """
    return 1 * (z >= 0)


def lReLU(z):
    """Return the element-wise leaky ReLU of z, with slope 1/100 for negative z."""
    return np.maximum(z / 100, z)


def lReLU_prime(z):
    """Return the element-wise derivative of the leaky ReLU function.

    The result is 1 where z >= 0 and 1/100 where z < 0.
    """
    # np.where builds a float result directly. Writing 1/100 into the integer
    # array produced by `1*(z>=0)` truncated the leak slope to 0.
    return np.where(np.asarray(z) >= 0, 1.0, 1 / 100)


def tanh(z):
    """Return the element-wise hyperbolic tangent of z."""
    return np.tanh(z)


def tanh_prime(z):
    """Return the element-wise derivative of the tanh function at z."""
    return 1 - tanh(z) ** 2


# A dictionary of our activation functions
PHI = {'sigmoid': sigmoid, 'relu': ReLU, 'lrelu': lReLU, 'tanh': tanh}

# A dictionary containing the derivatives of our activation functions
PHI_PRIME = {'sigmoid': sigmoid_prime, 'relu': ReLU_prime, 'lrelu': lReLU_prime, 'tanh': tanh_prime}



我们有ReLU,即“线性整流函数”。我们将主要使用这个激活函数。注意:ReLU在点0处不可导,需要人为约定该点的导数值;本文约定其为0(上面代码中的 ReLU_prime 使用 z>=0 进行判断,因此在0处实际返回1)。


我们还有一个ReLU的扩展版本叫做Leaky ReLU。它的工作原理与ReLU类似,可以在某些数据集上提供更好的结果(不一定是全部)。






  1. 每层神经元的数量
  2. 每一层要使用的激活函数
  3. 我们的特征矩阵(X的每一行是一个特征,每一列是一个样本)。
  4. 特征矩阵对应的标签(y是行向量)
  5. 初始化权重和偏差的方法
  6. 要使用的损失函数


class NeuralNet :
 """ This is a class for making Artificial Neural Networks. L2 and Dropout are the
 default regularization methods implemented in this class. It takes the following parameters:
 1. layers : A python list containing the number of neurons in each layer (including the output layer). Eg - [64,32,16,16,1]
 2. X : Matrix of features with rows as features and columns as different examples.
 3. y : Numpy array containing the outputs of the corresponding examples.
 4. ac_funcs : A python list containing the activation function of each layer. Eg - ['relu','relu','lrelu','tanh','sigmoid']
 5. init_method : Method to initialize weights of the network. Can be 'gaussian','random','zeros'.
 6. loss_func : Name of the loss function. Marked by the author as currently not implemented.
 7. W : Weights of a pretrained neural network with same architecture.
 8. B : Biases of a pretrained neural network with same architecture.
 """


 def __init__(self, layers, X, y, ac_funcs, init_method='gaussian', loss_func='b_ce', W=np.array([]), B=np.array([])) : """ Initialize the network. """ # Store the layers of the network self.layers = layers # ---- self.W = None self.B = None # Store the number of examples in the dataset as m self.m = X.shape[1] # Store the full layer list as n self.n = [X.shape[0], *layers] # Save the dataset self.X = X # Save coresponding output self.y = y # List to store the cost of the model calculated during training self.cost = [] # Stores the accuracy obtained on the test set. self.acc = 0 # Activation function of each layer self.ac_funcs = ac_funcs self.loss = loss_func # Inittialize the weights by provided methods if not provided.

我们将使用' self.m'来存储数据集中示例的数量。' self.n '将存储每层中神经元数量的信息。' self.ac_funcs '是每层的激活函数的python列表。' self.cost '将在我们训练网络时存储成本函数的记录值。' self.acc '将在训练后的数据集上存储记录的精度。在初始化网络的所有变量之后,让我们进一步初始化网络的权重和偏差。



 # Inittialize the weights by provided methods if not provided. if len(W) and len(B) : self.W = W self.B = B else :  if init_method=='gaussian':  self.W = [np.random.randn(self.n[nl], self.n[nl-1]) for nl in range(1,len(self.n))] self.B = [np.zeros((nl,1), 'float32') for nl in self.layers] elif init_method == 'random': self.W = [np.random.rand(self.n[nl], self.n[nl-1]) for nl in range(1,len(self.n))] self.B = [np.random.rand(nl,1) for nl in self.layers] elif init_method == 'zeros': self.W = [np.zeros((self.n[nl], self.n[nl-1]), 'float32') for nl in range(1,len(self.n))] self.B = [np.zeros((nl,1), 'float32') for nl in self.layers]















 def _feedForward(self, keep_prob):
     """ Forward pass over the stored training set self.X.

     Returns (z, a): the list of pre-activations and the list of activations,
     one entry per layer. Inverted dropout with keep probability keep_prob is
     applied to the hidden layers after the first one; the first layer and the
     output layer are never dropped.
     """
     z = []
     a = []
     z.append(np.dot(self.W[0], self.X) + self.B[0])
     a.append(PHI[self.ac_funcs[0]](z[-1]))
     output_layer = len(self.layers) - 1
     for l in range(1,len(self.layers)):
         z.append(np.dot(self.W[l], a[-1]) + self.B[l])
         _a = PHI[self.ac_funcs[l]](z[l])
         if l < output_layer :
             # One keep/drop decision per neuron (broadcast over all examples),
             # rescaled by 1/keep_prob. Dropping the output layer would turn the
             # prediction into 0 or a/keep_prob, so it is excluded.
             _a = ((np.random.rand(_a.shape[0],1) < keep_prob)*_a)/keep_prob
         a.append(_a)
     return z,a




 def _cost_func(self, a, _lambda): """ Binary Cross Entropy Cost Function """ return ( (-1/self.m)*np.sum(np.nan_to_num(self.y*np.log(a) + (1-self.y)*np.log(1-a))) + (_lambda/(2*self.m))*np.sum([np.sum(i**2) for i in self.W]) ) def _cost_derivative(self, a) :  """ The derivative of cost w.r.t z """ return (a-self.y) 

我们在成本函数中加入了L2正则化项。lambda参数称为“惩罚参数”,它能防止权重值过快增大,从而使模型具有更好的泛化能力。这里,'a'包含输出层的激活值。我们还定义了函数_cost_derivative,用来计算成本函数对输出层加权输入z的导数(即 a − y);稍后会在反向传播中用到它。





 def startTraining(self, epochs, alpha, _lambda, keep_prob=0.5, interval=100):
     """ Start training the neural network. It takes the following parameters :
     1. epochs : Number of epochs for which you want to train the network.
     2. alpha : The learning rate of your network.
     3. _lambda : L2 regularization parameter or the penalization parameter.
     4. keep_prob : Dropout regularization parameter. The probability of keeping a neuron active. Eg - 0.8 means about 20% of the neurons are deactivated.
     5. interval : The interval between updates of cost and accuracy (the reporting step is not part of this excerpt).
     """
     if not 0 < keep_prob <= 1 :
         # keep_prob is used as a divisor in the forward pass
         raise ValueError(f'keep_prob must be in (0, 1], got {keep_prob}')
     # Wall-clock start, for timing the run
     start = time.time()
     for i in range(epochs+1) :
         z,a = self._feedForward(keep_prob)
         delta = self._cost_derivative(a[-1])
         # NOTE(review): the dropout mask used in the forward pass is not
         # re-applied to delta below - confirm whether that is intended.
         for l in range(1,len(z)) :
             delta_w = np.dot(delta, a[-l-1].T) + (_lambda)*self.W[-l]
             delta_b = np.sum(delta, axis=1, keepdims=True)
             # Propagate delta with the old weights, before they are updated below
             delta = np.dot(self.W[-l].T, delta)*PHI_PRIME[self.ac_funcs[-l-1]](z[-l-1])
             self.W[-l] = self.W[-l] - (alpha/self.m)*delta_w
             self.B[-l] = self.B[-l] - (alpha/self.m)*delta_b
         # First layer: its input is the feature matrix itself
         delta_w = np.dot(delta, self.X.T ) + (_lambda)*self.W[0]
         delta_b = np.sum(delta, axis=1, keepdims=True)
         self.W[0] = self.W[0] - (alpha/self.m)*delta_w
         self.B[0] = self.B[0] - (alpha/self.m)*delta_b



重要提示:此处一个常见的严重错误,是在更新权重和偏差之后才计算(向前一层传播)delta。这样做可能导致严重的梯度消失/爆炸问题。




 def predict(self, X_test) :
     """ Predict the labels for a new dataset.

     X_test has features as rows and examples as columns. Returns the
     activations of the last layer (a probability when that layer is a
     sigmoid). No dropout is applied here.
     """
     a = PHI[self.ac_funcs[0]](np.dot(self.W[0], X_test) + self.B[0])
     for l in range(1,len(self.layers)):
         a = PHI[self.ac_funcs[l]](np.dot(self.W[l], a) + self.B[l])
     return a




# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import time
warnings.filterwarnings('ignore')
import os
import sys

# Importing our dataset
os.chdir("C:/Users/Hilak/Desktop/INTERESTS/Machine Learning A-Z Template Folder/Part 3 - Classification/Section 14 - Logistic Regression")
training_set = pd.read_csv("Social_Network_Ads.csv")

# Splitting our dataset into matrix of features and output values.
X = training_set.iloc[:, 1:4].values
y = training_set.iloc[:, 4].values

# Encoding our object features.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le_x = LabelEncoder()
X[:,0] = le_x.fit_transform(X[:,0])
# NOTE(review): `categorical_features` is not accepted by recent scikit-learn
# releases (ColumnTransformer is the replacement) - confirm the pinned version.
ohe = OneHotEncoder(categorical_features = [0])
X = ohe.fit_transform(X).toarray()

# Performing Feature scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X[:,2:4] = ss.fit_transform(X[:, 2:4])

# Splitting the dataset into train and test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
# The network expects features as rows and examples as columns
X_train = X_train.T
X_test = X_test.T

# # Alternate Dataset for test purposes.

def sigmoid(z) :
    """ Returns the element wise sigmoid function.
    """
    return 1./(1 + np.exp(-z))

def sigmoid_prime(z) :
    """ Derivative of the sigmoid, evaluated element wise: s(z) * (1 - s(z)).
    """
    s = sigmoid(z)
    return s*(1-s)

def ReLU(z) :
    """ Element wise ReLU: keeps z where it is strictly positive, zero elsewhere.
    """
    is_positive = z > 0
    return z*is_positive

def ReLU_prime(z) :
    """ Element wise derivative of the ReLU: 1 where z >= 0, 0 elsewhere.
    """
    non_negative = z >= 0
    return non_negative*1

def lReLU(z) :
    """ Element wise leaky ReLU: the larger of z and z/100.
    """
    leak = z/100
    return np.maximum(leak, z)

def lReLU_prime(z) :
    """ Returns the derivative of the leaky ReLU function.

    The result is 1 where z >= 0 and 1/100 where z < 0.
    """
    # np.where builds a float result directly. Writing 1/100 into the integer
    # array produced by `1*(z>=0)` truncated the leak slope to 0.
    return np.where(np.asarray(z) >= 0, 1.0, 1/100)

def tanh(z) :
    """ Element wise hyperbolic tangent of z, delegated to numpy.
    """
    squashed = np.tanh(z)
    return squashed

def tanh_prime(z) : 
    """ Returns the derivative of the tanh function.
    """
    return (1 - np.tanh(z)**2)

# A dictionary of our activation functions
# Lookup table: activation name -> activation function
PHI = dict(sigmoid=sigmoid, relu=ReLU, lrelu=lReLU, tanh=tanh)

# Lookup table: activation name -> derivative of that activation function
PHI_PRIME = dict(sigmoid=sigmoid_prime, relu=ReLU_prime, lrelu=lReLU_prime, tanh=tanh_prime)

class NeuralNet : 
    """ This is a class for making Artificial Neural Networks. L2 and Dropout are the 
    default regularization methods implemented in this class. It takes the following parameters: 
    1. layers : A python list containing the number of neurons in each layer (including the output layer). Eg - [64,32,16,16,1] 
    2. X : Matrix of features with rows as features and columns as different examples. 
    3. y : Numpy array containing the outputs of the corresponding examples. 
    4. ac_funcs : A python list containing the activation function of each layer. Eg - ['relu','relu','lrelu','tanh','sigmoid'] 
    5. init_method : Method to initialize weights of the network. Can be 'gaussian','random','zeros'. 
    6. loss_func : Name of the loss. 'b_ce' and 'c_ce' select how accuracy is computed during training. 
    7. W : Weights of a pretrained neural network with same architecture. 
    8. B : Biases of a pretrained neural network with same architecture. 
    """

    def __init__(self, layers, X, y, ac_funcs, init_method='gaussian', loss_func='b_ce', W=np.array([]), B=np.array([])) :
        """ Initialize the network.

        Raises ValueError when ac_funcs does not hold one entry per layer, or
        when no pretrained W/B are given and init_method is not recognised.
        """
        if len(ac_funcs) != len(layers) :
            raise ValueError(f'Expected one activation per layer: got {len(ac_funcs)} activations for {len(layers)} layers')
        # Store the layers of the network
        self.layers = layers
        self.W = None
        self.B = None
        # Number of examples in the dataset (columns of X)
        self.m = X.shape[1]
        # Full layer list: input size followed by every layer size
        self.n = [X.shape[0], *layers]
        self.X = X
        self.y = y
        # Cost values recorded during training, one per reporting interval
        self.cost = []
        # Most recent accuracy computed on self.X during training
        self.acc = 0
        self.ac_funcs = ac_funcs
        self.loss = loss_func
        
        # Initialize the weights by the chosen method unless pretrained ones were provided.
        if len(W) and len(B) :
            self.W = W
            self.B = B
        elif init_method == 'gaussian' : 
            self.W = [np.random.randn(self.n[nl], self.n[nl-1]) for nl in range(1,len(self.n))]
            self.B = [np.zeros((nl,1), 'float32') for nl in self.layers]
        elif init_method == 'random' :
            self.W = [np.random.rand(self.n[nl], self.n[nl-1]) for nl in range(1,len(self.n))]
            self.B = [np.random.rand(nl,1) for nl in self.layers]
        elif init_method == 'zeros' :
            self.W = [np.zeros((self.n[nl], self.n[nl-1]), 'float32') for nl in range(1,len(self.n))]
            self.B = [np.zeros((nl,1), 'float32') for nl in self.layers]
        else :
            # Fail loudly instead of leaving self.W / self.B as None
            raise ValueError(f"Unknown init_method {init_method!r}; expected 'gaussian', 'random' or 'zeros'")
    
    def startTraining(self, epochs, alpha, _lambda, keep_prob=0.5, interval=100):
        """ Start training the neural network. It takes the following parameters : 
        1. epochs : Number of epochs for which you want to train the network. 
        2. alpha : The learning rate of your network. 
        3. _lambda : L2 regularization parameter or the penalization parameter. 
        4. keep_prob : Dropout regularization parameter. The probability of keeping a neuron active. Eg - 0.8 means about 20% of the neurons are deactivated. 
        5. interval : The interval between updates of cost and accuracy. 

        Raises ValueError when keep_prob is outside (0, 1].
        """
        if not 0 < keep_prob <= 1 :
            # keep_prob is used as a divisor in the forward pass
            raise ValueError(f'keep_prob must be in (0, 1], got {keep_prob}')
        start = time.time()
        for i in range(epochs+1) : 
            z,a = self._feedForward(keep_prob)
            delta = self._cost_derivative(a[-1])
            # NOTE(review): the dropout mask used in the forward pass is not
            # re-applied to delta below - confirm whether that is intended.
            for l in range(1,len(z)) : 
                delta_w = np.dot(delta, a[-l-1].T) + (_lambda)*self.W[-l]
                delta_b = np.sum(delta, axis=1, keepdims=True)
                # Propagate delta with the old weights, before they are updated below
                delta = np.dot(self.W[-l].T, delta)*PHI_PRIME[self.ac_funcs[-l-1]](z[-l-1])
                self.W[-l] = self.W[-l] - (alpha/self.m)*delta_w
                self.B[-l] = self.B[-l] - (alpha/self.m)*delta_b
            # First layer: its input is the feature matrix itself
            delta_w = np.dot(delta, self.X.T ) + (_lambda)*self.W[0]
            delta_b = np.sum(delta, axis=1, keepdims=True)
            self.W[0] = self.W[0] - (alpha/self.m)*delta_w
            self.B[0] = self.B[0] - (alpha/self.m)*delta_b
            
            if not i%interval :
                aa = self.predict(self.X)
                # The cost is recorded for every loss name, so the report below
                # never references an undefined cost_val.
                cost_val = self._cost_func(a[-1], _lambda)
                self.cost.append(cost_val)
                if self.loss == 'b_ce':
                    aa = aa > 0.5
                    self.acc = np.sum(aa == self.y) / self.m
                elif self.loss == 'c_ce':
                    aa = np.argmax(aa, axis = 0)
                    yy = np.argmax(self.y, axis = 0)
                    self.acc = np.sum(aa==yy)/(self.m)
                print(f'Epoch[{i}] : Cost = {cost_val:.2f} ; Acc = {(self.acc*100):.2f}% ; Time Taken = {(time.time()-start):.2f}s')
        return None
    
    def predict(self, X_test) :
        """ Predict the labels for a new dataset.

        X_test has features as rows and examples as columns. Returns the
        activations of the last layer (a probability when that layer is a
        sigmoid). No dropout is applied here.
        """
        a = PHI[self.ac_funcs[0]](np.dot(self.W[0], X_test) + self.B[0])
        for l in range(1,len(self.layers)):
            a = PHI[self.ac_funcs[l]](np.dot(self.W[l], a) + self.B[l])
        return a
    
    def _feedForward(self, keep_prob):
        """ Forward pass over the stored training set self.X.

        Returns (z, a): the list of pre-activations and the list of activations,
        one entry per layer. Inverted dropout with keep probability keep_prob is
        applied to the hidden layers after the first one; the first layer and
        the output layer are never dropped.
        """
        z = []
        a = []
        z.append(np.dot(self.W[0], self.X) + self.B[0])
        a.append(PHI[self.ac_funcs[0]](z[-1]))
        output_layer = len(self.layers) - 1
        for l in range(1,len(self.layers)):
            z.append(np.dot(self.W[l], a[-1]) + self.B[l])
            _a = PHI[self.ac_funcs[l]](z[l])
            if l < output_layer :
                # One keep/drop decision per neuron (broadcast over all examples),
                # rescaled by 1/keep_prob. Dropping the output layer would turn
                # the prediction into 0 or a/keep_prob, so it is excluded.
                _a = ((np.random.rand(_a.shape[0],1) < keep_prob)*_a)/keep_prob
            a.append(_a)
        return z,a
    
    def _cost_func(self, a, _lambda):
        """ Binary Cross Entropy Cost Function with an L2 penalty.

        a is the output-layer activation and _lambda the L2 parameter.
        NaN / infinite terms produced by log(0) are replaced by np.nan_to_num.
        """
        cross_entropy = (-1/self.m)*np.sum(np.nan_to_num(self.y*np.log(a) + (1-self.y)*np.log(1-a)))
        l2_penalty = (_lambda/(2*self.m))*np.sum([np.sum(i**2) for i in self.W])
        return ( cross_entropy + l2_penalty )
    
    def _cost_derivative(self, a) : 
        """ The derivative of cost w.r.t z """
        return (a-self.y)
    
    @property
    def summary(self) :
        """ Tuple of (cost history, last accuracy, weights, biases). """
        return self.cost, self.acc, self.W,self.B
    
    def __repr__(self) : 
        """ Short description of the architecture, for debugging. """
        return f'NeuralNet(layers={self.layers}, ac_funcs={self.ac_funcs}, loss={self.loss!r})'

# Initializing our neural network: two ReLU hidden layers (32 and 16 neurons)
# and a single sigmoid output neuron.
neural_net_sigmoid = NeuralNet([32,16,1], X_train, y_train, ac_funcs = ['relu','relu','sigmoid'])

# Starting the training of our network: 5000 epochs, learning rate 0.01,
# L2 parameter 0.2, keep_prob 0.5, cost/accuracy reported every 100 epochs.
neural_net_sigmoid.startTraining(5000, 0.01, 0.2, 0.5, 100)

# Predicting on new dataset using our trained network.
preds = neural_net_sigmoid.predict(X_test)
# Threshold the predicted values at 0.5 to get boolean labels
preds = preds > 0.5
# Percentage of test examples whose thresholded prediction equals the label
acc = (sum(sum(preds == y_test)) / y_test.size)*100

# Accuracy (metric of evaluation) obtained by the network.
print(f'Test set Accuracy ( r-t-s ) : {acc}%')

# Plotting the recorded cost values. summary[0] is the cost history, which
# holds one value per reporting interval, so the x axis counts reports rather
# than epochs.
sigmoid_summary = neural_net_sigmoid.summary
# NOTE(review): a label is passed but plt.legend() is never called, so the
# label is not drawn - confirm whether a legend is wanted.
plt.plot(range(len(sigmoid_summary[0])), sigmoid_summary[0], label='Sigmoid Cost')
plt.title('Cost')
plt.show()

# Comparing our results with the library keras.
from keras.models import Sequential
from keras.layers import Dense

X_train, X_test = X_train.T, X_test.T

classifier = Sequential()
classifier.add(Dense(input_dim=4, units = 32, kernel_initializer="uniform




