1.导入数据。
2.清洗数据,将缺失值或“NAN”替换为“无”,并将文本数据转换为数值型数据。
3.使用聚类算法(如KMeans)对数据进行聚类,并计算样本到簇中心的平均距离以确定最佳的簇数量。
4.对数据进行PCA降维,以便在三维空间中可视化聚类结果。
5.使用Matplotlib绘制三维散点图。
pip install mlxtend
import pandas as pd
import numpy as np
from sklearn import preprocessing
from mlxtend.preprocessing import TransactionEncoder
import re
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
inputfile_1= "./生枣仁主治病症.xlsx"
df = pd.read_excel(inputfile_1)
df['方名'] = df['方名'].str.strip()
df.head()
数据清洗:将缺失值或'NAN'替换为'无
cols = ['药1', '药2', '药3', '药4']
df[cols] = df[cols].fillna('无')
df.drop_duplicates(inplace=True)
df.head()
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# 文本数据转换为数值型数据
for col in ['药1', '药2', '药3', '药4']:df[col] = label_encoder.fit_transform(df[col])
from sklearn.metrics import pairwise_distances_argmin_min
distances = []
K = range(2, 10)
for k in K:kmeans = KMeans(n_clusters=k, random_state=42).fit(df.drop(columns=['方名']))labels = kmeans.labels_cluster_centers = kmeans.cluster_centers_# 计算样本到簇中心的平均距离distances_to_centers = pairwise_distances_argmin_min(df.drop(columns=['方名']), cluster_centers)[1]avg_distance = distances_to_centers.mean()distances.append(avg_distance)# 找到最佳的簇数量(基于样本到簇中心的平均距离)
optimal_k = distances.index(min(distances)) + 2 # 因为K的范围是从2开始的
print(f"Optimal number of clusters: {optimal_k}")kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(df.drop(columns=['方名']))
df['Cluster'] = clusterspca = PCA(n_components=3)
X_pca = pca.fit_transform(df.drop(columns=['方名', '方名']))fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in range(optimal_k):ax.scatter(X_pca[df['Cluster'] == i, 0], X_pca[df['Cluster'] == i, 1], X_pca[df['Cluster'] == i, 2], label=f'Cluster {i}')
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')
ax.legend()
plt.title('3D Scatter Plot of Clusters')
plt.show()
# 绘制簇数量与平均距离的折线图
plt.figure()
plt.plot(K, distances, 'bx-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Average Distance to Cluster Centers')
plt.title('Average Distance to Cluster Centers vs. Number of Clusters')
plt.xticks(K)
plt.show()