流程
- 导入需要的包
- 引入文件,查看内容
- 数据处理
- 调用模型准备训练
- 输出结果
导入需要的包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
引入文件,查看内容
train = pd.read_csv('train.csv')
print('The shape of training data:', train.shape)
train.head()
test = pd.read_csv('test.csv')
print('The shape of testing data:', test.shape)
test.head()
数据处理
删除没有用的列
train.drop('LotFrontage', axis=1, inplace=True)
test.drop('LotFrontage', axis=1, inplace=True)
区分数字特征和字符特征
#分离数字特征和类别特征
num_features = []
cate_features = []
for col in test.columns:if test[col].dtype == 'object':cate_features.append(col)else:num_features.append(col)
print('number of numeric features:', len(num_features))
print('number of categorical features:', len(cate_features))
去除特殊的值
#处理掉右下的明显异常值
train = train.drop(train[(train['TotalBsmtSF']>6000) & (train['SalePrice']<200000)].index)
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<200000)].index)
查看训练集中各特征的数据缺失个数
print('The shape of training data:', train.shape)
train_missing = train.isnull().sum()
train_missing = train_missing.drop(train_missing[train_missing==0].index).sort_values(ascending=False)
train_missing
查看测试集中各特征的数据缺失个数
#查看测试集中各特征的数据缺失个数
print('The shape of testing data:', test.shape)
test_missing = test.isnull().sum()
test_missing = test_missing.drop(test_missing[test_missing==0].index).sort_values(ascending=False)
test_missing
根据特征说明文档,以下特征缺失代表没有,所以直接补充为’None’就可以了:
none_lists = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType1','BsmtFinType2', 'BsmtCond', 'BsmtExposure', 'BsmtQual', 'MasVnrType']
for col in none_lists:train[col] = train[col].fillna('None')test[col] = test[col].fillna('None')
补充出现频率最高的一类
most_lists = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType', 'KitchenQual', 'Electrical']
for col in most_lists:train[col] = train[col].fillna(train[col].mode()[0])test[col] = test[col].fillna(train[col].mode()[0]) #注意这里补充的是训练集中出现最多的类别
删除掉多余的特征
train['Functional'] = train['Functional'].fillna('Typ')
test['Functional'] = test['Functional'].fillna('Typ')train.drop('Utilities', axis=1, inplace=True)
test.drop('Utilities', axis=1, inplace=True)
数字特征处理
补零,对可能为零的特征,缺失值全部补零
zero_lists = ['GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageCars', 'GarageArea','TotalBsmtSF']
for col in zero_lists:train[col] = train[col].fillna(0)test[col] = test[col].fillna(0)
最后检查下是否还存在缺失值:
查看训练集是否有空
train.isnull().sum().any()
查看测试集是否有空
test.isnull().sum().any()
从存放类别特征的列表去掉
#从存放类别特征的列表去掉'Utilities'
cate_features.remove('Utilities')
print('The number of categorical features:', len(cate_features))
from sklearn.preprocessing import LabelEncoder
for col in cate_features:train[col] = train[col].astype(str)test[col] = test[col].astype(str)
le_features = ['Street', 'Alley', 'LotShape', 'LandContour', 'LandSlope', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir','KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence']
for col in le_features:encoder = LabelEncoder()value_train = set(train[col].unique())value_test = set(test[col].unique())value_list = list(value_train | value_test)encoder.fit(value_list)train[col] = encoder.transform(train[col])test[col] = encoder.transform(test[col])
把数据放一块处理
all_data = pd.concat((train.drop('SalePrice', axis=1), test)).reset_index(drop=True)
all_data = pd.get_dummies(all_data, drop_first=True) #注意独热编码生成的时候要去掉一个维度,保证剩下的变量都是相互独立的
all_data.shape
划分数据集
trainset = all_data[:1460]
traincy = pd.read_csv('train.csv')
y=traincy['SalePrice']
testset = all_data[1458:]
print('The shape of training data:', trainset.shape)
print('The shape of testing data:', testset.shape)
调用模型
linear_model = LinearRegression()
linear_model.fit(trainset, y)
预测数据
line_pre = linear_model.predict(testset)
输出结果
test = pd.read_csv('test.csv')
# print(test.shape,line_pre.shape)
we = pd.DataFrame({'Id': test['Id'], 'SalePrice': line_pre})
we.to_csv('House_Price_submissionMyself.csv', index=False)