    • 引言
    • 数据集介绍
    • 环境准备
    • 数据预处理
      • 1. 导入相应的分析库和数据加载
      • 2. 数据探索
      • 3. 异常值处理
        • 3.1 缺失值处理
        • 3.2 异常值处理
    • 模型训练
      • 1. 数据标签的0/1化
      • 2. 数据的归一化
      • 3. 划分数据集
      • 4. 训练模型
      • 5. 模型评估
      • 6. 特征重要性
    • 结论





  • 非挥发性酸(fixed acidity)
  • 挥发性酸(volatile acidity)
  • 柠檬酸(citric acid)
  • 残糖(residual sugar)
  • 氯化物(chlorides)
  • 游离二氧化硫(free sulfur dioxide)
  • 总二氧化硫(total sulfur dioxide)
  • 密度(density)
  • 酸碱度(pH)
  • 硫酸盐(sulphates)
  • 酒精(alcohol)
  • 葡萄酒质量(quality,0-10)



  • pandas
  • numpy
  • scikit-learn
  • matplotlib
  • seaborn
  • plotly
  • termcolor


pip install pandas numpy scikit-learn matplotlib seaborn plotly termcolor


1. 导入相应的分析库和数据加载


import warnings # For warning handling# Third-party imports
import pandas as pd # For data processing, CSV file I/O
import numpy as np # For numerical operations and mathematical functions
import matplotlib.pyplot as plt # For data visualization
import seaborn as sns # For statistical graphics
import plotly.express as px # For interactive plotting
from sklearn.model_selection import train_test_split # For data splitting for machine learning
from sklearn.preprocessing import MinMaxScaler, StandardScaler # For feature standardization
from sklearn.metrics import accuracy_score # For model evaluation
from termcolor import colored # For colored text printing
from sklearn.ensemble import RandomForestClassifier # For random forest classifier model# For warning handling
warnings.filterwarnings('ignore') # For ignoring warnings


# Load the white-wine quality dataset into the module-level DataFrame `data`.
# The path is relative, so the CSV is looked up in the current working directory.
filePath = "winequality-white.csv"
try:
    # The file is read with ';' as the field separator.
    data = pd.read_csv(filePath, sep=';')
except FileNotFoundError:
    print(colored("ERROR: File not found!", "red", attrs=['reverse']))
except Exception as e:
    # Best-effort reporting for any other read/parse failure.
    print(colored(f"ERROR: {e}", "red", attrs=['reverse']))
else:
    # Only reached when the read succeeded.
    print(colored("THE DATASET LOADED SUCCESSFULLY...", "green", attrs=['reverse']))
# NOTE(review): on failure `data` stays undefined, so every later step that
# reads `data` will raise NameError.


2. 数据探索


# Preview the first rows of the dataset, printing each row as its own section.
dataset_rows = data.head(7)  # head() returns 5 rows by default; request 7 here
print(colored('As you can see, the first 7 rows in the dataset:\n', 'green', attrs=['reverse']))
for index, row in dataset_rows.iterrows():
    # `index + 1` turns the row label into a 1-based counter
    # (assumes the default integer index produced by read_csv).
    print(colored(f"Row {index + 1}:", "white", attrs=['reverse']))
    # Content of the current row (one value per column).
    print(row)
    # Visual separator between rows.
    print("--------------------------------------")


# Report the size of the dataset: raw shape first, then derived counts.
print("The shape =", data.shape)

num_rows, num_cols = data.shape
# All columns except one are counted as features.
num_features = num_cols - 1
# Total number of cells in the table.
num_data = num_rows * num_cols

# Emit the four summary lines with a single call, one per line.
print(
    f"Number of Rows: {num_rows}",
    f"Number of Columns: {num_cols}",
    f"Number of Features: {num_features}",
    f"Number of All Data: {num_data}",
    sep="\n",
)

# Confirmation that this step ran to the end.
print(
    colored(
        "The task has been completed without any errors....",
        "green",
        attrs=['reverse'],
    )
)


# 查看数据集的信息




# Count plot: number of samples for each value of the 'quality' column.
sns.catplot(data=data, x='quality', kind='count')
# Title and y-axis label for the figure.
plt.title('Distribution of Wine Quality')
plt.ylabel('Count')
# Display the plot; when run as a plain script the figure is not rendered
# without this call.
plt.show()


3. 异常值处理

3.1 缺失值处理
# Count missing entries per column, then report the grand total.
null_counts = data.isnull().sum(axis=0)
print(
    colored(
        f"Totally, there are {null_counts.sum()} null values in the dataset.",
        "green",
        attrs=['reverse'],
    )
)


3.2 异常值处理
# Strip plot of every column, used to eyeball outliers before filtering.
plt.figure(figsize=(22, 11))
# One jittered strip of red points per DataFrame column.
sns.stripplot(data=data, color="red", jitter=0.2, size=5)
# NOTE(review): the axis labels below are generic placeholders.
plt.xlabel("X-axis label")
plt.ylabel("Y-axis label")
# Show the plot; when run as a plain script the figure is not rendered
# without this call.
plt.show()


# Drop outliers based on the 'total sulfur dioxide' column.
# Shape before filtering.
print("Before Removing the outliers", data.shape)

# Keep only rows whose total sulfur dioxide is strictly below 160
# (rows at or above 160 are removed).
data = data.loc[data['total sulfur dioxide'] < 160]

# Shape after filtering.
print("After Removing the outliers", data.shape)


# Strip plot of every column of the current `data`, for visual inspection.
plt.figure(figsize=(22, 11))
# One jittered strip of red points per DataFrame column.
sns.stripplot(data=data, color="red", jitter=0.2, size=5)
# NOTE(review): the axis labels below are generic placeholders.
plt.xlabel("X-axis label")
plt.ylabel("Y-axis label")
# Show the plot; when run as a plain script the figure is not rendered
# without this call.
plt.show()



1. 数据标签的0/1化

# Separate the feature matrix from the target and binarize the target.
# X: every column except 'quality'.
X = data.drop(columns='quality')

# Y: 1 where quality is greater than or equal to 5, otherwise 0.
Y = (data['quality'] >= 5).astype('int64')

# Sanity check: both must have the same number of rows.
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)


2. 数据的归一化

# Rescale the features with standardization (StandardScaler).
# fit_transform returns a NumPy array, so X no longer carries column names
# after this step.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting on the
# training split only.
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

# Alternative kept for reference (disabled): min-max rescaling.
# min_max_scaler = MinMaxScaler()
# X = min_max_scaler.fit_transform(X)

3. 划分数据集


# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=44,
)

# Report the shape of each part to verify the split.
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of Y_train: {Y_train.shape}",
    f"Shape of Y_test: {Y_test.shape}",
    sep="\n",
)


4. 训练模型


# Grid search over random-forest hyper-parameters, recording the accuracy of
# every fitted configuration on both the training and the testing split.
scoreListRF_Train = []
scoreListRF_Test = []

# Range notes left by the author (meaning of the two pairs not stated):
#   max_dep      ----------> (1, 5),(1, 10)
#   rand_state   ----------> (1, 35),(1, 50)
#   n_est        ----------> (1, 30),(1, 30)

# Iterate over different values of max_depth
for max_dep in range(1, 5):
    # Iterate over different values of random_state
    for rand_state in range(1, 20):
        # Iterate over different values of n_estimators
        for n_est in range(1, 15):
            # Build and fit a model for this combination.
            Model = RandomForestClassifier(
                n_estimators=n_est,
                random_state=rand_state,
                max_depth=max_dep,
            )
            Model.fit(X_train, Y_train)
            # Store accuracy on the training and on the testing data.
            scoreListRF_Train.append(Model.score(X_train, Y_train))
            scoreListRF_Test.append(Model.score(X_test, Y_test))

# NOTE(review): after the loops `Model` is the LAST configuration fitted
# (max_depth=4, random_state=19, n_estimators=14), not necessarily the one
# that produced the best score reported below.
# NOTE(review): the best testing accuracy is picked using the test split
# itself, so it is an optimistic estimate.
RF_Accuracy_Train = max(scoreListRF_Train)
RF_Accuracy_Test = max(scoreListRF_Test)

# Print the best accuracies achieved
print(f"Random Forest best accuracy (Training): {RF_Accuracy_Train*100:.2f}%")
print(f"Random Forest best accuracy (Testing): {RF_Accuracy_Test*100:.2f}%")
# Success message once the whole grid has been fitted.
print(colored("The Random Forest model has been trained successfully", "green", attrs=['reverse']))


5. 模型评估


from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Predict labels for the test set with the current `Model`.
y_pred = Model.predict(X_test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report for the test predictions.
print(classification_report(Y_test, y_pred))

# Confusion matrix for the test predictions.
print(confusion_matrix(Y_test, y_pred))


6. 特征重要性


import matplotlib.pyplot as plt
import seaborn as sns

# Importance score of each feature as reported by the fitted model.
feature_importances = Model.feature_importances_

# Feature names are every column except the target. Selecting by name instead
# of by position keeps the labels right even if 'quality' is not the last
# column; the order matches the columns X was built from.
feature_names = [column for column in data.columns if column != 'quality']

# Pair each feature name with its importance.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances,
})

# Most important features first.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance')
# Render the figure; when run as a plain script it is not displayed
# without this call.
plt.show()








