kaggle(需要魔法才能访问):https://www.kaggle.com/
需要下载的数据集:melb_data.csv、train.csv(已放在资源里面)
First Machine Learning Model
Selecting Data for Modeling
# Selecting Data for Modeling
import pandas as pd

melbourne_file_path = r"G:\Kaggle\Datasets\archive\melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path)

# List every column in the dataset via the DataFrame's `columns` attribute.
melbourne_data.columns

# dropna drops missing values (think of "na" as "not available").
# Dropping along the row axis removes each row that contains a missing value.
melbourne_data = melbourne_data.dropna(axis="index")

# Selecting the prediction target: the column we want to predict, called y.
y = melbourne_data["Price"]

# Choosing "features": the columns fed into the model (and later used to
# make predictions).
melbourne_features = ["Rooms", "Bathroom", "Landsize", "Lattitude", "Longtitude"]

# X holds only the feature columns.
X = melbourne_data[melbourne_features]

# Summary statistics for the features.
X.describe()
| | Rooms | Bathroom | Landsize | Lattitude | Longtitude |
---|---|---|---|---|---|
count | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 | 6196.000000 |
mean | 2.931407 | 1.576340 | 471.006940 | -37.807904 | 144.990201 |
std | 0.971079 | 0.711362 | 897.449881 | 0.075850 | 0.099165 |
min | 1.000000 | 1.000000 | 0.000000 | -38.164920 | 144.542370 |
25% | 2.000000 | 1.000000 | 152.000000 | -37.855438 | 144.926198 |
50% | 3.000000 | 1.000000 | 373.000000 | -37.802250 | 144.995800 |
75% | 4.000000 | 2.000000 | 628.000000 | -37.758200 | 145.052700 |
max | 8.000000 | 8.000000 | 37000.000000 | -37.457090 | 145.526350 |
# Preview the first five rows of the features.
X.head(n=5)
| | Rooms | Bathroom | Landsize | Lattitude | Longtitude |
---|---|---|---|---|---|
1 | 2 | 1.0 | 156.0 | -37.8079 | 144.9934 |
2 | 3 | 2.0 | 134.0 | -37.8093 | 144.9944 |
4 | 4 | 1.0 | 120.0 | -37.8072 | 144.9941 |
6 | 3 | 2.0 | 245.0 | -37.8024 | 144.9993 |
7 | 2 | 1.0 | 256.0 | -37.8060 | 144.9954 |
Building Your Model
# Building Your Model
# The steps to build and use a model are:
#   1. Define
#   2. Fit
#   3. Predict
#   4. Evaluate
from sklearn.tree import DecisionTreeRegressor

# Define the model (seeded with random_state=1).
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model to the features X and the target y.
melbourne_model.fit(X, y)
DecisionTreeRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(random_state=1)
# Show the first five houses followed by the model's predictions for them.
# One print call with a newline separator emits the same four lines of output.
print(
    "Making predictions for the following 5 houses:",
    X.head(),
    "The predictions are",
    melbourne_model.predict(X.head()),
    sep="\n",
)
Making predictions for the following 5 houses:
Rooms Bathroom Landsize Lattitude Longtitude
1 2 1.0 156.0 -37.8079 144.9934
2 3 2.0 134.0 -37.8093 144.9944
4 4 1.0 120.0 -37.8072 144.9941
6 3 2.0 245.0 -37.8024 144.9993
7 2 1.0 256.0 -37.8060 144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
Exercise First Machine Learning Model
# Exercise: First Machine Learning Model
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

iowa_file_path = r"G:\Kaggle\Datasets\archive\train.csv"
home_data = pd.read_csv(iowa_file_path)

# Step 1: Specify the prediction target.
print(home_data.columns)
y = home_data["SalePrice"]

# Step 2: Create X from the chosen feature columns.
feature_names = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
X = home_data[feature_names]

# Review the data: glance at X before building the model to check that it
# looks sensible.
print(X.describe())
print(X.head())

# Step 3: Specify and fit the model.
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(X, y)

# Step 4: Make predictions (on the same rows the model was fitted on).
predictions = iowa_model.predict(X)
print(predictions)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street','Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig','LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType','HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd','RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType','MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating','HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF','LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath','HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual','TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType','GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual','GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC','Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType','SaleCondition', 'SalePrice'],dtype='object')
LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 10516.828082 1971.267808 1162.626712 346.992466 1.565068
std 9981.264932 30.202904 386.587738 436.528436 0.550916
min 1300.000000 1872.000000 334.000000 0.000000 0.000000
25% 7553.500000 1954.000000 882.000000 0.000000 1.000000
50% 9478.500000 1973.000000 1087.000000 0.000000 2.000000
75% 11601.500000 2000.000000 1391.250000 728.000000 2.000000
max 215245.000000 2010.000000 4692.000000 2065.000000 3.000000

BedroomAbvGr TotRmsAbvGrd
count 1460.000000 1460.000000
mean 2.866438 6.517808
std 0.815778 1.625393
min 0.000000 2.000000
25% 2.000000 5.000000
50% 3.000000 6.000000
75% 3.000000 7.000000
max 8.000000 14.000000

LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr \
0 8450 2003 856 854 2 3
1 9600 1976 1262 0 2 3
2 11250 2001 920 866 2 3
3 9550 1915 961 756 1 3
4 14260 2000 1145 1053 2 4

TotRmsAbvGrd
0 8
1 6
2 6
3 7
4 9
[208500. 181500. 223500. ... 266500. 142125. 147500.]
# Exercise: First Machine Learning Model
# Step 1: Specify the prediction target.
print(home_data.columns)
y = home_data["SalePrice"]
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street','Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig','LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType','HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd','RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType','MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1','BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating','HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF','LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath','HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual','TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType','GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual','GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC','Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType','SaleCondition', 'SalePrice'],dtype='object')
# Step 2: Create X from the chosen feature columns.
feature_names = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
X = home_data[feature_names]

# Review the data: glance at X before building the model to check that it
# looks sensible. Summary statistics first, then the first rows.
print(X.describe(), X.head(), sep="\n")
LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath \
count 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 10516.828082 1971.267808 1162.626712 346.992466 1.565068
std 9981.264932 30.202904 386.587738 436.528436 0.550916
min 1300.000000 1872.000000 334.000000 0.000000 0.000000
25% 7553.500000 1954.000000 882.000000 0.000000 1.000000
50% 9478.500000 1973.000000 1087.000000 0.000000 2.000000
75% 11601.500000 2000.000000 1391.250000 728.000000 2.000000
max 215245.000000 2010.000000 4692.000000 2065.000000 3.000000

BedroomAbvGr TotRmsAbvGrd
count 1460.000000 1460.000000
mean 2.866438 6.517808
std 0.815778 1.625393
min 0.000000 2.000000
25% 2.000000 5.000000
50% 3.000000 6.000000
75% 3.000000 7.000000
max 8.000000 14.000000

LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr \
0 8450 2003 856 854 2 3
1 9600 1976 1262 0 2 3
2 11250 2001 920 866 2 3
3 9550 1915 961 756 1 3
4 14260 2000 1145 1053 2 4

TotRmsAbvGrd
0 8
1 6
2 6
3 7
4 9
# Step 3: Specify and fit the model.
from sklearn.tree import DecisionTreeRegressor

# Specify the model (seeded with random_state=1).
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit the model to the features X and the target y.
iowa_model.fit(X, y)
DecisionTreeRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(random_state=1)
# Step 4: Make predictions for the same rows the model was fitted on,
# then display them.
predictions = iowa_model.predict(X)

print(predictions)
[208500. 181500. 223500. ... 266500. 142125. 147500.]
Model Validation 模型验证
# Model Validation
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Load the data.
melbourne_file_path = r"G:\Kaggle\Datasets\archive\melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path)

# Drop every row that contains a missing value (in any column, not only Price).
filtered_melbourne_data = melbourne_data.dropna(axis="index")

# Choose the target and the features.
y = filtered_melbourne_data["Price"]
melbourne_features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "BuildingArea",
    "YearBuilt",
    "Lattitude",
    "Longtitude",
]
X = filtered_melbourne_data[melbourne_features]

# Define the model (no random_state is given here).
melbourne_model = DecisionTreeRegressor()

# Fit the model on all filtered rows.
melbourne_model.fit(X, y)
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
# Calculate the mean absolute error on the same data the model was fitted on
# (an in-sample score).
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)
434.71594577146544
# scikit-learn provides train_test_split to break the data into two pieces.
# One piece is used as training data to fit the model; the other is held out
# as validation data for computing mean_absolute_error.
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define the model and fit it on the training portion only.
melbourne_model = DecisionTreeRegressor()
melbourne_model.fit(train_X, train_y)

# Predict prices for the validation rows and report the validation MAE.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=val_predictions))
263704.735958683
Exercise: Model Validation
# Exercise: Model Validation
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
iowa_file_path = r"G:\Kaggle\Datasets\archive\train.csv"

home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[feature_columns]

# Specify Model
iowa_model = DecisionTreeRegressor()
# Fit Model
iowa_model.fit(X, y)

# Compare predictions for the first rows with their actual targets; these are
# in-sample predictions because the model was fitted on the same rows.
print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())
First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]
# Exercise: Model Validation
# Step 1: Split the data into training and validation parts.
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Step 2: Specify the model, then fit it with the training data only.
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)
DecisionTreeRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(random_state=1)
# Step 3: Make predictions with the validation data.
# Predict for every validation observation.
val_predictions = iowa_model.predict(val_X)

# Show the first few validation rows (their feature values), followed by the
# first few actual prices from the validation data.
print(val_X.head(), val_y.head(), sep="\n")
LotArea YearBuilt 1stFlrSF 2ndFlrSF FullBath BedroomAbvGr \
258 12435 2001 963 829 2 3
267 8400 1939 1052 720 2 4
288 9819 1967 900 0 1 3
649 1936 1970 630 0 1 1
1233 12160 1959 1188 0 1 3

TotRmsAbvGrd
258 7
267 8
288 5
649 3
1233 6
258 231500
267 179500
288 122000
649 84500
1233 142000
Name: SalePrice, dtype: int64
# Step 4: Calculate the Mean Absolute Error in Validation Data
from sklearn.metrics import mean_absolute_error

val_mae = mean_absolute_error(val_y, val_predictions)
# Print the score; with the print commented out, running this cell showed nothing.
print(val_mae)
29652.931506849316
Underfitting and Overfitting
# Underfitting and Overfitting
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor


def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to max_leaf_nodes.

    A DecisionTreeRegressor (random_state=0) is fitted on train_X/train_y,
    used to predict val_X, and scored against val_y with mean_absolute_error.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
melbourne_file_path = r"G:\Kaggle\Datasets\archive\melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path)
# Filter rows with missing values
filtered_melbourne_data = melbourne_data.dropna(axis=0)
# Choose target and features
y = filtered_melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]

# Split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# Compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d formats the MAE as an integer.
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))
Max leaf nodes: 5 Mean Absolute Error: 347380
Max leaf nodes: 50 Mean Absolute Error: 258171
Max leaf nodes: 500 Mean Absolute Error: 243495
Max leaf nodes: 5000 Mean Absolute Error: 255575
# Code used earlier to load the data
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
iowa_file_path = r"G:\Kaggle\Datasets\archive\train.csv"
home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
# Actual values go first, then predictions: (y_true, y_pred).
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))
Validation MAE: 29,653
Exercise: Underfitting and Overfitting
# Exercise: Underfitting and Overfitting
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to max_leaf_nodes.

    A DecisionTreeRegressor (random_state=0) is fitted on train_X/train_y,
    used to predict val_X, and scored against val_y with mean_absolute_error.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae
# Step 1: Compare Different Tree Sizes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Map every candidate tree size to the validation MAE it achieves.
scores = {
    leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y)
    for leaf_size in candidate_max_leaf_nodes
}

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500):
# the key whose MAE is smallest.
best_tree_size = min(scores, key=scores.get)
# Step 2: Fit Model Using All Data
# Build the final tree with the best size found above.
final_model = DecisionTreeRegressor(
    max_leaf_nodes=best_tree_size,
    random_state=1,
)

# Fit on the full X and y rather than on the training split only.
final_model.fit(X, y)