All code blocks were run and debugged in a Jupyter Notebook, and they build on one another in sequence.
The detailed parameters of every function used in these code blocks can be looked up in the scikit-learn API documentation; here I only note what each line of code does. Readers are encouraged to experiment with the parameters. Get hands-on!
1. The K-Means Algorithm
# Imports
import numpy as np
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
Ⅰ. K-Means
from sklearn.datasets import make_blobs

# Specify five center points; five clusters will be generated around them
blob_centers = np.array([[0.2,2.3],[-1.5,2.3],[-2.8,1.8],[-2.8,2.8],[-2.8,1.3]])
blob_std = np.array([0.4,0.3,0.1,0.1,0.1])  # spread (standard deviation) of each cluster
# 2000 samples, centered on the points above, scattered around each center with the given std
X, y = make_blobs(n_samples=2000, centers=blob_centers, cluster_std=blob_std, random_state=7)
# Plot the data
def plot_clusters(X, y=None):
    plt.scatter(X[:, 0], X[:, 1], c=y, s=1)
    plt.xlabel("$x_1$", fontsize=14)
    plt.ylabel("$x_2$", fontsize=14, rotation=0)
plt.figure(figsize=(8, 4))
plot_clusters(X)
plt.show()
① Decision Boundary
from sklearn.cluster import KMeans  # import

k = 5  # number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)  # instantiate
y_pred = kmeans.fit_predict(X)  # fit the model and get the predicted labels
y_pred  # 2000 samples in total; each one is assigned to one of the five clusters: 0, 1, 2, 3, 4
"""
array([0, 4, 1, ..., 2, 1, 4])
"""
kmeans.labels_  # the predicted labels are also available as an attribute
"""
array([0, 4, 1, ..., 2, 1, 4])
"""
Clearly, fit_predict(X) and kmeans.labels_ return identical results.
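A quick sanity check (my addition, not in the original):
np.array_equal(y_pred, kmeans.labels_)  # expected: True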
kmeans.cluster_centers_  # the five current cluster centers
"""
array([[-2.80037642,  1.30082566],
       [ 0.20876306,  2.25551336],
       [-2.79290307,  2.79641063],
       [-1.46679593,  2.28585348],
       [-2.80389616,  1.80117999]])
"""
X_new = np.array([[0,2],[3,2],[-3,3],[-3,2.5]])  # construct 4 arbitrary test points
kmeans.predict(X_new)  # predict which cluster each test point falls into
"""
array([1, 1, 2, 2])
"""
kmeans.transform(X_new)  # distances from each of the 4 test points to each of the 5 centroids
"""
array([[2.88633901, 0.32995317, 2.9042344 , 1.49439034, 2.81093633],
       [5.84236351, 2.80290755, 5.84739223, 4.4759332 , 5.80730058],
       [1.71086031, 3.29399768, 0.29040966, 1.69136631, 1.21475352],
       [1.21567622, 3.21806371, 0.36159148, 1.54808703, 0.72581411]])
"""
# Plot the raw data
def plot_data(X):
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)

# Plot the centroids
def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    if weights is not None:
        centroids = centroids[weights > weights.max() / 10]
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='o', s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=50, linewidths=50,
                color=cross_color, zorder=11, alpha=1)

# Plot the decision boundaries
def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    # Build a grid covering the data range
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])  # predicted cluster for every grid point
    Z = Z.reshape(xx.shape)  # reshape back to the grid
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), cmap="Pastel2")  # filled (colored) contour regions
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]), linewidths=1, colors='k')  # black contour lines
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom='off')
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft='off')
plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, X)
plt.show()
② Algorithm Workflow
# Only the results of the first 3 iterations are shown here
kmeans_iter1 = KMeans(n_clusters=5, init='random', n_init=1, max_iter=1, random_state=1)  # three models, stopped after 1, 2 and 3 iterations
kmeans_iter2 = KMeans(n_clusters=5, init='random', n_init=1, max_iter=2, random_state=1)
kmeans_iter3 = KMeans(n_clusters=5, init='random', n_init=1, max_iter=3, random_state=1)
kmeans_iter1.fit(X)  # fit
kmeans_iter2.fit(X)
kmeans_iter3.fit(X)
"""
KMeans(init='random', max_iter=3, n_clusters=5, n_init=1, random_state=1)
"""
plt.figure(figsize=(12,8))  # figure size

plt.subplot(321)  # 3 rows, 2 columns, panel 1
plot_data(X)  # scatter the raw data
plot_centroids(kmeans_iter1.cluster_centers_, circle_color='r', cross_color='k')  # centroids after iteration 1
plt.title('Update cluster_centers')

plt.subplot(322)  # panel 2
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)  # boundaries after iteration 1
plt.title('Label')

plt.subplot(323)  # panel 3
plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter2.cluster_centers_)  # updated centroids

plt.subplot(324)  # panel 4
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)  # boundaries after iteration 2

plt.subplot(325)  # panel 5
plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False, show_ylabels=False)
plot_centroids(kmeans_iter3.cluster_centers_)  # updated centroids

plt.subplot(326)  # panel 6
plot_decision_boundaries(kmeans_iter3, X, show_xlabels=False, show_ylabels=False)  # boundaries after iteration 3
plt.show()
The left column shows the centroids being initialized and then updated; the right column shows the cluster assignments made by K-Means, after which the centroids are re-determined.
③ Unstable Results
def plot_clusterer_comparison(c1, c2, X):
    c1.fit(X)  # fit two different K-Means models
    c2.fit(X)
    plt.figure(figsize=(12,4))
    plt.subplot(121)  # 1 row, 2 columns, panel 1
    plot_decision_boundaries(c1, X)  # decision boundaries
    plt.subplot(122)  # panel 2
    plot_decision_boundaries(c2, X)  # decision boundaries

c1 = KMeans(n_clusters=5, init='random', n_init=1, random_state=10)  # two instances with random initial centroids
c2 = KMeans(n_clusters=5, init='random', n_init=1, random_state=14)
plot_clusterer_comparison(c1, c2, X)
④ Evaluation Metrics
1. Inertia
# Distances from every sample to each of the five centroids
kmeans.transform(X)
"""
array([[0.11146795, 3.04611916, 1.45402521, 1.54944305, 0.46779778],
       [0.51431557, 3.11541584, 0.99002955, 1.48612753, 0.07122059],
       [3.76340605, 1.32016676, 4.09069201, 2.67154781, 3.81713488],
       ...,
       [1.42865797, 3.04886464, 0.06769209, 1.40795651, 0.92830156],
       [3.23385668, 0.14895409, 3.05913478, 1.71125   , 3.10300136],
       [0.67518173, 2.8625311 , 0.85434589, 1.21678483, 0.22700281]])
"""
# Find, for each sample, which centroid is closest; this corresponds to the distances above
kmeans.labels_
"""
array([0, 4, 1, ..., 2, 1, 4])
"""
The inertia metric: the sum of squared distances from each sample to its closest centroid; the smaller it is, the better the clustering.
X_dist = kmeans.transform(X)
# Use the labels to pick out, for each sample, its distance to its own (closest) centroid
X_dist[np.arange(len(X_dist)), kmeans.labels_]
"""
array([0.11146795, 0.07122059, 1.32016676, ..., 0.06769209, 0.14895409,0.22700281])
"""
np.sum(X_dist[np.arange(len(X_dist)), kmeans.labels_]**2)  # square and sum
"""
211.59853725816856
"""
kmeans.inertia_  # the built-in attribute
"""
211.59853725816828
"""
Clearly, inertia_ is just the sum, over all samples, of the squared distance to the nearest of the five centroids.
kmeans.score(X)  # score() returns the negative inertia: scikit-learn's convention is that a greater score is always better
"""
-211.5985372581683
"""
c1.inertia_
"""
236.80956211186657
"""
c2.inertia_
"""
211.60832621558367
"""
Comparing by inertia_, c2 has the lower value, so by this metric c2's clustering is the better of the two.
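This instability is exactly what the n_init parameter addresses: scikit-learn can run K-Means several times from different random initializations and keep the lowest-inertia solution. A small sketch (my addition; c3 is a hypothetical name):
c3 = KMeans(n_clusters=5, init='random', n_init=10, random_state=10)  # 10 random restarts, keep the best
c3.fit(X)
c3.inertia_  # should be no larger than the inertia of either single-run model above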
2. Silhouette Coefficient
a_i: the mean distance from sample i to all other samples in its own cluster. The smaller a_i is, the more sample i belongs in that cluster; a_i is called the intra-cluster dissimilarity of sample i.
b_i: for each other cluster C_j, compute b_ij, the mean distance from sample i to all samples of C_j, which measures how dissimilar sample i is from cluster C_j. The inter-cluster dissimilarity of sample i is then defined as b_i = min{b_i1, b_i2, …, b_ik}.
The silhouette coefficient of sample i is s_i = (b_i - a_i) / max(a_i, b_i):
if s_i is close to 1, sample i is clustered appropriately;
if s_i is close to -1, sample i should rather be assigned to a different cluster;
if s_i is near 0, sample i lies on the boundary between two clusters.
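A minimal sketch (my addition) showing how these definitions map to code: sklearn.metrics.silhouette_samples returns the per-sample s_i, and the overall silhouette score used below is simply their mean.
from sklearn.metrics import silhouette_samples, silhouette_score
s_i = silhouette_samples(X, kmeans.labels_)  # one s_i = (b_i - a_i) / max(a_i, b_i) per sample
np.isclose(s_i.mean(), silhouette_score(X, kmeans.labels_))  # expected: True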
⑤ Choosing K
1. Inertia
Try many values of K in turn, score each one, and plot the scores. Look for the elbow (the point where the slope changes sharply); the K there is usually a reasonable choice, but not guaranteed!
kmeans_per_k = [KMeans(n_clusters=k).fit(X) for k in range(1,10)]  # fit a model for each k from 1 to 9
inertias = [model.inertia_ for model in kmeans_per_k]  # evaluate each model by its inertia_
plt.figure(figsize=(8,4))  # figure size
plt.plot(range(1,10), inertias, 'bo-')
plt.axis([1, 9.5, 0, 1300])  # x and y axis ranges
plt.show()
The elbow sits at K=4, suggesting K=4 works well. But the true number of clusters here is 5, so this method is only a rough guide.
2. Silhouette Coefficient
from sklearn.metrics import silhouette_score
silhouette_score(X,kmeans.labels_)
"""
0.655517642572828
"""
kmeans_per_k  # note: the bare KMeans() in the output below is k=8, scikit-learn's default, so the repr omits it
"""
[KMeans(n_clusters=1),
 KMeans(n_clusters=2),
 KMeans(n_clusters=3),
 KMeans(n_clusters=4),
 KMeans(n_clusters=5),
 KMeans(n_clusters=6),
 KMeans(n_clusters=7),
 KMeans(),
 KMeans(n_clusters=9)]
"""
silhouette_scores = [silhouette_score(X,model.labels_) for model in kmeans_per_k[1:]]
silhouette_scores
"""
[0.5966442557582528,
 0.5723900247411775,
 0.688531617595759,
 0.655517642572828,
 0.601878677912387,
 0.6071325093726307,
 0.561411737095725,
 0.5661946395774896]
"""
plt.figure(figsize=(8,4))
plt.plot(range(2,10),silhouette_scores,'bo-')
plt.show()
Pick the K whose score is closest to 1: that again suggests K=4. But the true K here is 5, so this method too is only a rough guide.
⑥ Problems with K-Means
X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)
X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))
X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)
X2 = X2 + [6, -8]
X = np.r_[X1, X2]
y = np.r_[y1, y2]
plot_data(X)

kmeans_good = KMeans(n_clusters=3, init=np.array([[-1.5,2.5],[0.5,0],[4,0]]), n_init=1, random_state=14)  # cheating: the initial centers are chosen from the known answer
kmeans_bad = KMeans(n_clusters=3, random_state=14)
kmeans_good.fit(X)  # fit
kmeans_bad.fit(X)
"""
KMeans(n_clusters=3, random_state=14)
"""
plt.figure(figsize=(10,4))  # figure size

plt.subplot(121)  # 1 row, 2 columns, panel 1
plot_decision_boundaries(kmeans_good, X)  # decision boundaries
plt.title('Good - inertia = {}'.format(kmeans_good.inertia_))

plt.subplot(122)  # panel 2
plot_decision_boundaries(kmeans_bad, X)  # decision boundaries
plt.title('Bad - inertia = {}'.format(kmeans_bad.inertia_))
As the results show, the left plot has the larger inertia yet is clearly the better clustering. This again demonstrates that the K-selection heuristics and evaluation metrics above are only references and do not guarantee the best result.
2. Application: Image Segmentation
Treat the image background as one cluster and the foreground as another cluster; this amounts to segmenting the image.
#ladybug.png
from matplotlib.image import imread
image = imread(r'G:\Juptyer_workspace\study\data\cat.png')  # path to the image file
image.shape
"""
(321, 287, 3)
"""
X = image.reshape(-1,3)  # -1 infers the dimension: the product of image.shape divided by 3
X.shape  # one row per pixel, each pixel carrying 3 features (RGB)
"""
(92127, 3)
"""
kmeans = KMeans(n_clusters=8, random_state=14).fit(X)  # fit with n_clusters=8 clusters
kmeans.cluster_centers_  # the cluster centers: 8 rows, one per cluster; 3 columns, one per RGB channel
"""
array([[0.4471733 , 0.350066  , 0.30473077],
       [0.9084078 , 0.77313846, 0.70459783],
       [0.6072408 , 0.4768775 , 0.42346856],
       [0.78779787, 0.62332743, 0.55133986],
       [0.27133766, 0.19734937, 0.17845926],
       [0.6265253 , 0.61260563, 0.6179434 ],
       [0.7396835 , 0.7428063 , 0.76036733],
       [0.86409616, 0.86019826, 0.8708797 ]], dtype=float32)
"""
# Look up each pixel's centroid via its label: every pixel is replaced by its cluster-center color,
# then the result is reshaped back to the original image shape so it can be displayed
segmented_img = kmeans.cluster_centers_[kmeans.labels_].reshape(image.shape)
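To quickly inspect this single 8-color result before looping over several values of K below, a small display sketch (my addition):
plt.imshow(segmented_img)  # every pixel now carries its cluster-center color
plt.axis('off')
plt.show()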
segmented_imgs = []
n_colors = (10, 8, 6, 4, 2)  # five values of K, i.e. how many colors to keep
for n_cluster in n_colors:
    kmeans = KMeans(n_clusters=n_cluster, random_state=42).fit(X)
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))
plt.figure(figsize=(10,5))
plt.subplot(231)
plt.imshow(image)
plt.title('Original image')

for idx, n_clusters in enumerate(n_colors):
    plt.subplot(232 + idx)
    plt.imshow(segmented_imgs[idx])
    plt.title('{} colors'.format(n_clusters))
3. Semi-Supervised Learning
Cluster the training set into 50 clusters, then for each cluster find the image closest to its centroid; these are called the representative images.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)
X_digits.shape  # 1797 digit images, each 8x8 pixels flattened into 64 features
"""
(1797, 64)
"""
X_train.shape  # 1347 training samples
"""
(1347, 64)
"""
y_train.shape
"""
(1347,)
"""
from sklearn.linear_model import LogisticRegression  # import logistic regression

n_labeled = 50  # suppose only 50 labeled samples are available
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
log_reg.score(X_test, y_test)  # baseline: train logistic regression directly on those 50 labels
"""
0.8266666666666667
"""
k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
X_digits_dist.shape  # distance from each of the 1347 training samples to each of the 50 centroids
"""
(1347, 50)
"""
representative_digits_idx = np.argmin(X_digits_dist, axis=0)  # for each cluster, the index of the sample closest to its centroid
representative_digits_idx.shape  # 1347 samples, 50 clusters: one closest sample per cluster
"""
(50,)
"""
X_representative_digits = X_train[representative_digits_idx]  # the 50 samples closest to the 50 centroids
Plot these representative images and label them by hand:
plt.figure(figsize=(8, 2))
for index, X_representative_digit in enumerate(X_representative_digits):
    plt.subplot(k // 10, 10, index + 1)
    plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
    plt.axis('off')
plt.show()
y_representative_digits = np.array([
    0, 1, 3, 2, 9, 6, 4, 6, 9, 5,
    1, 2, 9, 5, 2, 7, 2, 1, 8, 6,
    3, 1, 5, 4, 5, 4, 0, 3, 2, 6,
    1, 7, 7, 9, 1, 8, 6, 5, 4, 8,
    5, 3, 3, 6, 7, 9, 7, 8, 4, 9])  # hand-assigned labels for the 50 representatives, one per cluster, each closest to its centroid
We now have a dataset with only 50 labeled instances, but each of them is the representative image of its cluster rather than a completely random sample.
Let's see whether performance improves:
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)
"""
0.8955555555555555
"""
Now propagate each label to all the other instances in the same cluster:
y_train_propagated = np.empty(len(X_train), dtype=np.int32)
for i in range(k):
    y_train_propagated[kmeans.labels_ == i] = y_representative_digits[i]

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train_propagated)
log_reg.score(X_test, y_test)
"""
0.8966666666666667
"""
Now keep only the instances closest to their centroid, the closest 20% within each cluster:
percentile_closest = 20
X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]  # each sample's distance to its own centroid

for i in range(k):
    in_cluster = (kmeans.labels_ == i)  # samples belonging to cluster i
    cluster_dist = X_cluster_dist[in_cluster]  # their distances to the centroid
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)  # 20th-percentile cutoff distance
    above_cutoff = (X_cluster_dist > cutoff_distance)  # True where a sample lies beyond the cutoff
    X_cluster_dist[in_cluster & above_cutoff] = -1  # mark those samples as excluded
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
log_reg.score(X_test, y_test)
"""
0.92177777777777778
"""
4. The DBSCAN Algorithm
# Imports
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)
X
"""
array([[-0.02137124,  0.40618608],
       [ 0.97670045, -0.45832306],
       [ 0.90405882, -0.37651952],
       ...,
       [ 1.66258462, -0.3079193 ],
       [-0.94355873,  0.3278936 ],
       [ 0.79419406,  0.60777171]])
"""
y  # two classes
"""
array([1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ...,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
"""
plt.plot(X[:,0],X[:,1],'b.')
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.05, min_samples=5)  # radius (eps) 0.05; a core point needs at least min_samples=5 points within that radius
dbscan.fit(X)  # fit
dbscan.labels_[:10]  # look at the first 10 labels; -1 marks outliers
"""
array([ 0, 2, -1, -1, 1, 0, 0, 0, 2, 5], dtype=int64)
"""
dbscan.core_sample_indices_[:10]  # first 10 entries: indices of the core samples
"""
array([ 0, 4, 5, 6, 7, 8, 10, 11, 12, 13], dtype=int64)
"""
np.unique(dbscan.labels_)  # how many clusters were found; -1 is the noise "cluster"
"""
array([-1, 0, 1, 2, 3, 4, 5, 6], dtype=int64)
"""
Comparison:
dbscan2 = DBSCAN(eps=0.2, min_samples=5)  # radius 0.2, same minimum of 5 samples
dbscan2.fit(X)  # fit

# Plotting helper
def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):
    core_mask = np.zeros_like(dbscan.labels_, dtype=bool)
    core_mask[dbscan.core_sample_indices_] = True
    anomalies_mask = dbscan.labels_ == -1
    non_core_mask = ~(core_mask | anomalies_mask)
    cores = dbscan.components_
    anomalies = X[anomalies_mask]
    non_cores = X[non_core_mask]
    plt.scatter(cores[:, 0], cores[:, 1],
                c=dbscan.labels_[core_mask], marker='o', s=size, cmap="Paired")
    plt.scatter(cores[:, 0], cores[:, 1], marker='*', s=20, c=dbscan.labels_[core_mask])
    plt.scatter(anomalies[:, 0], anomalies[:, 1], c="r", marker="x", s=100)
    plt.scatter(non_cores[:, 0], non_cores[:, 1], c=dbscan.labels_[non_core_mask], marker=".")
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom='off')
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft='off')
    plt.title("eps={:.2f}, min_samples={}".format(dbscan.eps, dbscan.min_samples), fontsize=14)
plt.figure(figsize=(9, 3.2))
plt.subplot(121)
plot_dbscan(dbscan, X, size=100)
plt.subplot(122)
plot_dbscan(dbscan2, X, size=600, show_ylabels=False)
plt.show()
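Note that DBSCAN has no predict() method for new samples. A common workaround, sketched here under my own assumptions (the classifier choice and the X_new query points are illustrative, not from the original), is to train a k-nearest-neighbors classifier on the core samples and their cluster labels:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
# components_ holds the core samples; labels_[core_sample_indices_] are their cluster ids
knn.fit(dbscan2.components_, dbscan2.labels_[dbscan2.core_sample_indices_])
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])  # hypothetical query points
knn.predict(X_new)  # predicted cluster for each query point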