零基础机器学习|线性回归

一、数据收集和预处理

1、读取可视化

使用pandas库导入csv（逗号相隔开的文件）pd.read_csv()

import pandas as pd
# Placeholder path: replace it with the location of your local CSV file.
# In an ordinary string literal every backslash of a Windows path has to be
# doubled.
df_ads = pd.read_csv("D:\\...\\advertising.csv")

使用head()读出数据表格的前5行

df_ads.head()

2、相关分析

导入画图工具库 matplotlib 和统计学可视化库 seaborn 画出热力图

import matplotlib.pyplot as plt
import seaborn as sns
# Heat map of the pairwise correlation matrix of all columns;
# annot=True writes each coefficient inside its cell.
sns.heatmap(df_ads.corr(), cmap="YlGnBu", annot=True)
plt.show()

3、散点图

aspect = 1 长宽相同

# One scatter plot per advertising channel (x) against sales (y).
sns.pairplot(
    df_ads,
    x_vars=["wechat", "weibo", "others"],
    y_vars="sales",
    height=4,
    aspect=1,
    kind="scatter",
)
plt.show()

4、预处理

1)导入数据

X = np.array(df_ads)
y = np.array(df_ads.sales)

2)删除标签

axis = 0 按行操作（np.delete 会删除整行）
axis = 1 按列操作（np.delete 会删除整列；标签是一列，所以这里用 axis = 1）

X = np.delete(数据, 标签个数 , axis = 1)

3)数据矩阵化

注意区分向量（1 阶张量）和矩阵（2 阶张量）
数据导入进来时是一维数组（向量），但是我们需要它是二维矩阵，方便做矩阵乘法计算
用reshape的方法把向量转化成矩阵

# reshape turns each 1-D vector of length n into an (n, 1) column matrix.
X = X.reshape((len(X), 1))
y = y.reshape((len(y), 1))

二、拆分训练集和测试集

使用sklearn.model_selection 中的函数

from sklearn.model_selection import train_test_split

拆分处理, test_size 比例是0.2 ，random_state每次运行得到相同的训练和测试数据

X_train, X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2,random_state = 0)

三、数据归一化

自定义函数完成公式
$\frac{X-\min(X)}{\max(X) - \min(X)}$

def scaler(train, test):
    """Min-max scale ``train`` and ``test`` using statistics of ``train`` only.

    Each column is mapped with ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are taken along axis 0 of ``train``. The same training statistics
    are applied to ``test``, so test values may fall outside ``[0, 1]``.

    Args:
        train: NumPy array holding the training data.
        test: Array (or plain list) with the same column layout as ``train``.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple of new arrays; the arguments
        themselves are left untouched.
    """
    min_value = train.min(axis=0)
    gap = train.max(axis=0) - min_value
    # Build new arrays instead of using in-place ``-=`` / ``/=``: the caller's
    # data stays intact and integer inputs are promoted to float by the
    # true division instead of raising a casting error.
    return (train - min_value) / gap, (test - min_value) / gap

四、损失函数

自定义一下函数计算
${\frac{1}{2N} \sum_{(x,y)\in D}(y-h(x))^2}$

def loss_fuction(X, y, weight, bias):
    """Return the halved mean squared error of the line ``weight * X + bias``.

    Implements ``1 / (2N) * sum((h(x) - y) ** 2)`` with
    ``h(x) = weight * x + bias`` and ``N = len(X)``.

    Args:
        X: Feature array.
        y: Label array; assumed to have the same shape as ``X`` so that the
            subtraction is element-wise.
        weight: Slope of the line.
        bias: Intercept of the line.

    Returns:
        The value of the loss function as a scalar.
    """
    y_hat = weight * X + bias
    loss = y_hat - y
    cost = np.sum(loss ** 2) / (2 * len(X))
    return cost

五、梯度下降

使用以下公式计算损失函数对权重 w 的梯度（即偏导数），其中 $h(x)=w*x+b$。这一步和微积分联系比较紧密，可以好好看看相关数学
${\frac{1}{N}\sum_{i=1}^N(h(x^{(i)})-y^{(i)})*x^{(i)}}$
更新权重（$\alpha$ 为学习速率）
$w-{\frac{\alpha}{N}\sum_{i=1}^N(h(x^{(i)})-y^{(i)})*x^{(i)}}$

def graient_descent(X, y, weight, bias, lr, iteration):
    """Fit ``weight * X + bias`` to ``y`` by batch gradient descent.

    Args:
        X: Feature array.
        y: Label array; assumed to have the same shape as ``X``.
        weight: Initial slope.
        bias: Initial intercept.
        lr: Learning rate (step size).
        iteration: Number of update steps to run.

    Returns:
        ``(l_history, w_history, b_history)``: three 1-D arrays of length
        ``iteration`` holding the loss, weight and bias after every step.
    """
    l_history = np.zeros(iteration)
    w_history = np.zeros(iteration)
    b_history = np.zeros(iteration)
    n_samples = len(X)
    for i in range(iteration):
        y_hat = weight * X + bias
        loss = y_hat - y
        # np.sum reduces both gradients to scalars, so weight and bias stay
        # scalar and fit into the 1-D history arrays.
        derivation_weight = np.sum(X * loss) / n_samples
        derivation_bias = np.sum(loss) / n_samples
        # New value = old value - learning rate * gradient.
        weight = weight - lr * derivation_weight
        bias = bias - lr * derivation_bias
        l_history[i] = loss_fuction(X, y, weight, bias)
        w_history[i] = weight
        b_history[i] = bias
    return l_history, w_history, b_history

六、调用封装线性回归的函数

自定义线性回归函数，包含梯度下降，梯度下降中包含损失函数，并计算预测的准确率

def linear_regression(X, y, weight, alpha, iterations):
    """Train by gradient descent, report the result and return the history.

    Args:
        X: Feature matrix passed on to ``gradient_descent``.
        y: Label array.
        weight: Initial weight vector.
        alpha: Learning rate.
        iterations: Number of gradient-descent steps.

    Returns:
        ``(loss_history, weight_history)`` exactly as produced by
        ``gradient_descent``.
    """
    loss_history, weight_history = gradient_descent(X, y, weight, alpha, iterations)
    print("训练最终损失:", loss_history[-1])
    # X.dot(w) is 1-D; give it the shape of y before subtracting, otherwise a
    # column-shaped y would broadcast the difference into an (n, n) matrix.
    y_pred = X.dot(weight_history[-1]).reshape(y.shape)
    # 100 minus the mean absolute error in percent.
    # NOTE(review): only reads as a percentage if y is scaled to [0, 1] - confirm at the call site.
    training_acc = 100 - np.mean(np.abs(y_pred - y)) * 100
    print("线性回归训练准确率: {:.2f}%".format(training_acc))
    return loss_history, weight_history

七、定义迭代次数，学习率，初始权重（优化，调参）

iterations = 300; # 迭代300次
alpha = 0.15; #学习速率设为0.15
weight = np.array([0.5,1,1,1]) # 权重向量，w[0] = bias
#计算一下初始值的损失
print ('当前损失：',loss_function(X_train, y_train, weight))

八、损失函数图像观察

观察随着迭代次数的增加，损失的数值是否减少

plt.plot(loss_history,'g--',label = 'Loss Curve')
plt.xlabel('Iteration')
plt.ylabel('loss')
plt.legend()
plt.show()

九、预测未来

输入X对应特征的数据，预测y销售量
提前保留没有被更改的X原始数据

def min_max_gap(train):
    """Return the column-wise minimum, maximum and their difference.

    Args:
        train: NumPy array; the statistics are taken along axis 0.

    Returns:
        ``(min_val, max_val, gap)`` with ``gap = max_val - min_val``.
    """
    min_val = train.min(axis=0)
    # Same axis as the minimum, so both describe the same columns.
    max_val = train.max(axis=0)
    gap = max_val - min_val
    return min_val, max_val, gap
y_min , y_max,y_gap = min_max_gap(y_train)

提前定义最大最小函数，方便后续还原归一化后的数据

X_train_original = X_train.copy()

注意要将归一化后的数据还原

X_plan = [250,50,50]  # feature values to predict sales for
# Scale the new sample with the statistics of the unscaled training features.
# A copy is passed so X_train_original is still unscaled for the next
# prediction, and the scaled training set is discarded so X_train is not
# overwritten.
_, X_plan = scaler(X_train_original.copy(), X_plan)
X_plan = np.append([1], X_plan)  # prepend the dummy feature x0 = 1
y_plan = np.dot(weight_history[-1], X_plan)
# Undo the label scaling: y_gap is max - min of y_train, y_min its minimum.
y_value = y_plan*y_gap + y_min
print ("预计商品销售额： ",y_value, "千元")

十、完整代码

import numpy as np #导入NumPy数学工具箱
import pandas as pd #导入Pandas数据处理工具箱
df_ads = pd.read_csv('F:\\MachineLearning_testData\\advertising.csv')
df_ads.head()
X = np.array(df_ads)  # feature set, initially holding every column
X = np.delete(X, [3], axis = 1)  # drop column 3, which holds the label
y = np.array(df_ads.sales)  # label set: the sales amount
print ("张量X的阶:",X.ndim)
print ("张量X的形状:", X.shape)
print (X)
# reshape turns the label vector into a column matrix; -1 lets NumPy infer
# the row count, i.e. len(y).
y = y.reshape(-1,1)
print ("张量y的形状:", y.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0)
def scaler(train, test):
    """Min-max scale ``train`` and ``test`` using statistics of ``train`` only.

    Each column is mapped with ``(x - min) / (max - min)``, where ``min`` and
    ``max`` are taken along axis 0 of ``train``; the same training statistics
    are then applied to ``test``.

    Args:
        train: NumPy array holding the training data.
        test: Array (or plain list) with the same column layout as ``train``.

    Returns:
        A ``(train_scaled, test_scaled)`` tuple of new arrays; the arguments
        themselves are left untouched.
    """
    min_val = train.min(axis=0)  # training-set minimum
    gap = train.max(axis=0) - min_val  # training-set maximum minus minimum
    # Build new arrays rather than using in-place ``-=`` / ``/=`` so the
    # caller's data is not modified and integer input is promoted to float.
    return (train - min_val) / gap, (test - min_val) / gap


def min_max_gap(train):
    """Return the column-wise minimum, maximum and their difference.

    The values are used later to undo the scaling of a prediction.
    """
    min_val = train.min(axis=0)  # training-set minimum
    max_val = train.max(axis=0)  # training-set maximum
    gap = max_val - min_val
    return min_val, max_val, gap


y_min, y_max, y_gap = min_max_gap(y_train)
# Keep a copy of the training features taken before scaling; it is needed
# later to scale the sample that is to be predicted.
X_train_original = X_train.copy()
X_train, X_test = scaler(X_train, X_test)  # scale the features
y_train, y_test = scaler(y_train, y_test)  # scale the labels as well
# Prepend a column of ones (x0) to each feature matrix so the bias can be
# handled as weight w[0] in the dot product.
x0_train = np.ones((len(X_train), 1))
x0_test = np.ones((len(X_test), 1))
X_train = np.concatenate((x0_train, X_train), axis=1)
X_test = np.concatenate((x0_test, X_test), axis=1)
print ("张量X的形状:", X_train.shape)
print (X_train)
def loss_function(X, y, W):
    """Return the halved mean squared error of the linear model ``X . W``.

    Implements ``1 / (2N) * sum((h(x) - y) ** 2)`` where ``h(x)`` is the dot
    product of a sample with the weight vector and ``N = len(X)``.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels as a column matrix with one row per sample.
        W: 1-D weight vector with one entry per column of ``X``.

    Returns:
        The loss as a scalar.
    """
    # h(x) = w_0*x_0 + w_1*x_1 + ... for every sample at once.
    y_hat = X.dot(W.T)
    # Turn the 1-D prediction into a column so it lines up with y.
    loss = y_hat.reshape((len(y_hat), 1)) - y
    cost = np.sum(loss ** 2) / (2 * len(X))
    return cost
def gradient_descent(X, y, W, lr, iterations):
    """Run batch gradient descent on the linear model ``X . W``.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels as a column matrix with one row per sample.
        W: Initial 1-D weight vector.
        lr: Learning rate.
        iterations: Number of update steps.

    Returns:
        ``(l_history, W_history)``: the loss after every step (1-D array of
        length ``iterations``) and the weight vector after every step
        (array of shape ``(iterations, len(W))``).
    """
    l_history = np.zeros(iterations)
    W_history = np.zeros((iterations, len(W)))
    for i in range(iterations):
        y_hat = X.dot(W.T)  # vectorised hypothesis function
        loss = y_hat.reshape((len(y_hat), 1)) - y  # prediction minus truth
        derivative_W = X.T.dot(loss) / len(X)  # gradient for every weight
        derivative_W = derivative_W.reshape(len(W))  # column -> 1-D like W
        W = W - lr * derivative_W
        l_history[i] = loss_function(X, y, W)
        W_history[i] = W
    return l_history, W_history
# Initial values of the hyper-parameters.
iterations = 300  # number of gradient-descent steps
alpha = 0.15  # learning rate
weight = np.array([0.5, 1, 1, 1])  # weight vector, w[0] is the bias
# Loss of the initial weights, before any training.
initial_loss = loss_function(X_train, y_train, weight)
print ('当前损失：',initial_loss)
# 定义线性回归模型
def linear_regression(X, y, weight, alpha, iterations):
    """Train by gradient descent, report the result and return the history.

    Args:
        X: Feature matrix passed on to ``gradient_descent``.
        y: Label array.
        weight: Initial weight vector.
        alpha: Learning rate.
        iterations: Number of gradient-descent steps.

    Returns:
        ``(loss_history, weight_history)`` exactly as produced by
        ``gradient_descent``.
    """
    loss_history, weight_history = gradient_descent(X, y, weight, alpha, iterations)
    print("训练最终损失:", loss_history[-1])
    # X.dot(w) is 1-D; give it the shape of y before subtracting, otherwise a
    # column-shaped y would broadcast the difference into an (n, n) matrix.
    y_pred = X.dot(weight_history[-1]).reshape(y.shape)
    # 100 minus the mean absolute error in percent.
    # NOTE(review): only reads as a percentage if y is scaled to [0, 1] - confirm at the call site.
    training_acc = 100 - np.mean(np.abs(y_pred - y)) * 100
    print("线性回归训练准确率: {:.2f}%".format(training_acc))
    return loss_history, weight_history
# Train the model with the linear_regression function defined above.
loss_history, weight_history = linear_regression(X_train, y_train,weight, alpha, iterations)
print("权重历史记录：", weight_history)
print("损失历史记录：", loss_history)
X_plan = [250,50,50]  # feature values to predict sales for
# Scale the new sample with the statistics of the unscaled training features.
# A copy is passed so X_train_original is still unscaled for the next
# prediction, and the scaled training set is discarded so X_train (which
# carries the extra x0 column) is not overwritten.
_, X_plan = scaler(X_train_original.copy(), X_plan)
X_plan = np.append([1], X_plan)  # prepend the dummy feature x0 = 1
y_plan = np.dot(weight_history[-1], X_plan)  # [-1]: weights after the last step
# Undo the label scaling to get the prediction in the original sales units:
# y_gap is max - min of y_train, y_min is its minimum.
y_value = y_plan*y_gap + y_min
print ("预计商品销售额： ",y_value, "千元")