在构成圆形的30000个随机样本点上,设置7个簇,使用K-Means算法聚类
"""K-Means clustering of random 2-D points with k-means++ seeding.

The script draws random points inside a disc, clusters them with Lloyd's
algorithm (seeded either by k-means++ or by plain random choice) and plots
the result with matplotlib.
"""
from math import pi, sin, cos
from collections import namedtuple
from random import random, choice
from copy import copy
import numpy as np
import sys

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: the clustering functions work without matplotlib,
    # only print_eps() needs it (and reports the missing dependency itself).
    plt = None

try:
    # psyco is an optional Python 2 JIT; silently skipped when unavailable.
    import psyco
    psyco.full()
except ImportError:
    pass

# Sentinel "infinite" squared distance used when searching for a minimum.
FLOAT_MAX = 1e100


class Point:
    """A 2-D point (or centroid) tagged with the index of its cluster."""

    __slots__ = ["x", "y", "group"]

    def __init__(self, x=0.0, y=0.0, group=1):
        self.x, self.y, self.group = x, y, group

    def __repr__(self):
        return "Point(x=%r, y=%r, group=%r)" % (self.x, self.y, self.group)


def generate_points(npoints, radius):
    """Return ``npoints`` random points lying inside a disc of ``radius``.

    The radius and the angle are both drawn uniformly, so the points are
    denser near the centre: this is *not* a uniform 2-D distribution.
    """
    points = []
    for _ in range(npoints):
        # Draw the radius first, then the angle (keeps the RNG call order).
        r = random() * radius
        ang = random() * 2 * pi
        points.append(Point(r * cos(ang), r * sin(ang)))
    return points


def _sqr_distance(a, b):
    """Squared Euclidean distance between two points."""
    return (a.x - b.x) ** 2 + (a.y - b.y) ** 2


def nearest_cluster_center(point, cluster_centers):
    """Find the cluster centre closest to ``point``.

    Returns a ``(index, squared_distance)`` tuple. When ``cluster_centers``
    is empty (or every centre is at least ``FLOAT_MAX`` away) the point's
    current ``group`` and ``FLOAT_MAX`` are returned. Ties go to the centre
    with the lowest index.
    """
    min_index = point.group
    min_dist = FLOAT_MAX
    for i, cc in enumerate(cluster_centers):
        d = _sqr_distance(cc, point)
        if d < min_dist:
            min_dist = d
            min_index = i
    return (min_index, min_dist)


def kpp(points, cluster_centers):
    """Seed ``cluster_centers`` in place using k-means++.

    The first centre is a random point; each further centre is drawn with
    probability proportional to the squared distance from a point to its
    nearest already-chosen centre. Afterwards every point's ``group`` is set
    to the index of its nearest centre.
    """
    cluster_centers[0] = copy(choice(points))
    # nearest[j]: squared distance from points[j] to its closest chosen
    # centre. It is refined incrementally with the newest centre only,
    # instead of rescanning every chosen centre for every point.
    nearest = [FLOAT_MAX] * len(points)
    for i in range(1, len(cluster_centers)):
        newest = cluster_centers[i - 1]
        total = 0
        for j, p in enumerate(points):
            dist = _sqr_distance(newest, p)
            if dist < nearest[j]:
                nearest[j] = dist
            total += nearest[j]
        # Roulette-wheel selection weighted by squared distance.
        target = total * random()
        for j, dist in enumerate(nearest):
            target -= dist
            if target > 0:
                continue
            cluster_centers[i] = copy(points[j])
            break
        else:
            # Rounding left the wheel unresolved; fall back to the last point
            # rather than leaving the centre uninitialised.
            cluster_centers[i] = copy(points[-1])
    for p in points:
        p.group = nearest_cluster_center(p, cluster_centers)[0]


def _update_centers(points, cluster_centers):
    """Move every centre to the mean of the points assigned to it.

    A cluster with no members keeps its previous position instead of
    triggering a division by zero.
    """
    k = len(cluster_centers)
    sum_x = [0.0] * k
    sum_y = [0.0] * k
    counts = [0] * k
    for p in points:
        g = p.group
        counts[g] += 1
        sum_x[g] += p.x
        sum_y[g] += p.y
    for cc, sx, sy, n in zip(cluster_centers, sum_x, sum_y, counts):
        if n:
            cc.x = sx / n
            cc.y = sy / n


def _assign_points(points, cluster_centers):
    """Assign each point to its nearest centre; return how many changed."""
    changed = 0
    for p in points:
        min_i = nearest_cluster_center(p, cluster_centers)[0]
        if min_i != p.group:
            changed += 1
            p.group = min_i
    return changed


def _refine(points, cluster_centers):
    """Run Lloyd iterations until at most ``len(points) >> 10`` points move.

    Expects every point's ``group`` to already be a valid centre index.
    On return each centre's ``group`` holds its own index.
    """
    # Stop once roughly 99.9% of the points keep their cluster.
    lenpts10 = len(points) >> 10
    while True:
        _update_centers(points, cluster_centers)
        if _assign_points(points, cluster_centers) <= lenpts10:
            break
    for i, cc in enumerate(cluster_centers):
        cc.group = i
    return cluster_centers


def lloyd(points, nclusters):
    """Cluster ``points`` into ``nclusters`` groups, seeded by k-means++.

    Each point's ``group`` is updated in place; the list of centroids is
    returned, with ``centroid.group`` equal to the centroid's index.
    """
    cluster_centers = [Point() for _ in range(nclusters)]
    kpp(points, cluster_centers)
    return _refine(points, cluster_centers)


def kmeans(points, nclusters):
    """Cluster ``points`` into ``nclusters`` groups from a random seeding.

    Same contract as :func:`lloyd`, but the initial centres are plain random
    picks from ``points`` rather than a k-means++ selection.
    """
    cluster_centers = [copy(choice(points)) for _ in range(nclusters)]
    # Points start with an arbitrary group, so assign them to the freshly
    # chosen centres before the first centroid update.
    _assign_points(points, cluster_centers)
    return _refine(points, cluster_centers)


def print_eps(points, cluster_centers):
    """Print each centroid and show a scatter plot of the clustering.

    Raises ImportError when matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib is required to plot the clusters")
    # One random RGB colour per cluster, plus one spare row.
    c = np.random.rand(len(cluster_centers) + 1, 3)
    for i, cc in enumerate(cluster_centers):
        print(cc.x, cc.y, cc.group)
        members = [p for p in points if p.group == i]
        plot_x = [p.x for p in members]
        plot_y = [p.y for p in members]
        plt.scatter(plot_x, plot_y, color=c[i], s=1)
        # Centroid markers take their colours from the far end of the table.
        plt.scatter(cc.x, cc.y, color=c[len(c) - i - 1], marker='^', s=50)
    plt.show()


def main():
    """Cluster 30000 random points in a radius-10 disc into 7 groups."""
    npoints = 30000
    k = 7  # number of clusters
    points = generate_points(npoints, 10)
    cluster_centers = lloyd(points, k)
    # cluster_centers = kmeans(points, k)
    print_eps(points, cluster_centers)


if __name__ == "__main__":
    main()
4.752567342690781 -4.334890736846155 0
-1.3573754262792683 6.986665631867061 1
-0.15969173199245487 2.506935816564994 2
-6.2690747305026315 0.7728991240509486 3
-2.713392739613833 -5.828585413438269 4
5.771515155816214 2.91539616422218 5
0.07082712668661448 -0.7504142150100154 6