k-means聚类算法
K-Means是一种无监督算法,其目标是将数据进行分类。分类个数要求已知。
k-means流程
- 随机确定K个点作为质心、
- 找到离每个点最近的质心,将这个点分配到这个质心代表的簇里
- 再对每个簇进行计算,以点簇的均值点作为新的质心
- 如果新的质心和上一轮的不一样,则迭代进行2-3步骤,直到质心位置稳定
本文目标
- 在golang中实现k-means算法。
- 使用matplotlib绘制聚类散点图。
- 尝试并行处理。
- 与sklearn结果对比。
执行结果
左侧:输出
中间:本文效果
右侧:sklearn效果
Q: 为什么样本一样,结果不同?
A: 两方面,首先算法的结束方法中阈值不同,然后是初始k均值点选择不同。
确定数据结构
我把kMeans封装成了对象,此外还有Point。实现如下
type (Point struct {X, Y float64}KMeans struct {points []Pointk intdistanceFunc distanceFuncavgPoints []Point}distanceFunc func(p1, p2 Point) float64
)
生成随机样本
func generateRandomPoints(n int) []tool.Point {points := make([]tool.Point, n)for i := 0; i < n; i++ {points[i] = tool.Point{X: rand.Float64() * 100,Y: rand.Float64() * 100,}}return points
}
确定距离度量函数
distance := func(p1, p2 tool.Point) float64 {return math.Sqrt(math.Pow(p1.X-p2.X, 2) + math.Pow(p1.Y-p2.Y, 2))}
初始化kmeans对象
func (kMeans *KMeans) Init(k int, points []Point, distanceFunc distanceFunc) {kMeans.k = kkMeans.points = pointskMeans.distanceFunc = distanceFunckMeans.avgPoints = make([]Point, kMeans.k)
}
func (kMeans *KMeans) initializeAvgPoints() {copy(kMeans.avgPoints, kMeans.points[:kMeans.k])
}
确定算法大致流程
func (kMeans *KMeans) Do(checkFunc func(oldCentroids, newCentroids []Point) bool) ([][]Point, int) {kMeans.initializeAvgPoints()var (clusters [][]PointtmpAvgPoints []Pointcount int)for {// 获取聚类含有哪些点clusters = kMeans.computeDistanceToAvgPoints()// 更新聚类中心tmpAvgPoints = kMeans.updateAvgPoints(clusters)// 检查质心位置的变化if checkFunc(kMeans.avgPoints, tmpAvgPoints) {break}// 更新质心位置kMeans.avgPoints = tmpAvgPointscount++}return clusters, count
}
计算聚类(尝试并发)
Q: 为什么可以并发?
A: 因为计算聚类时,对每个点的运算的独立的,依赖的数据不会在计算时修改。
func (kMeans *KMeans) computeDistanceToAvgPoints() [][]Point {type BakPoint struct {i intp Point}clusters := make([][]Point, len(kMeans.avgPoints))resultCh := make(chan BakPoint, len(kMeans.points))for _, point := range kMeans.points {computeMinDistanceForSignlePoint := func(point Point, avgPoints []Point, distanceFunc func(p1, p2 Point) float64, ch chan BakPoint) {minDistance := struct {d float64i int}{d: math.MaxFloat64,i: -1,}for i, avgPoint := range avgPoints {if d := distanceFunc(avgPoint, point); d < minDistance.d {minDistance.d = dminDistance.i = i}}ch <- BakPoint{p: point, i: minDistance.i}}go computeMinDistanceForSignlePoint(point, kMeans.avgPoints, kMeans.distanceFunc, resultCh)}for i := 0; i < len(kMeans.points); i++ {result := <-resultChclusters[result.i] = append(clusters[result.i], result.p)}close(resultCh)return clusters
}
更新K均值点
func (kMeans *KMeans) updateAvgPoints(clusters [][]Point) []Point {centroids := make([]Point, kMeans.k)for i, cluster := range clusters {sumX, sumY := 0.0, 0.0for _, point := range cluster {sumX += point.XsumY += point.Y}centroids[i].X = sumX / float64(len(cluster))centroids[i].Y = sumY / float64(len(cluster))}return centroids
}
使用matplotlib绘制图像
import matplotlib.pyplot as pltdef readFromFile():clusters = []current_cluster = []with open('points.txt', 'r') as file:for line in file:if line.strip() == "----":if current_cluster:clusters.append(current_cluster)current_cluster = []else:x, y = map(float, line.split())current_cluster.append((x, y))if current_cluster:clusters.append(current_cluster)return clustersclusters = readFromFile()for cluster in clusters:X = [point[0] for point in cluster]Y = [point[1] for point in cluster]plt.scatter(X, Y, marker='.')plt.xlabel('X')
plt.ylabel('Y')
plt.title('Clustered Scatter Plot')
plt.show()
使用sklearn计算
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeanspoints = []
with open('rawpoints.txt', 'r') as file:for line in file:x, y = map(float, line.split())points.append([x, y])X = np.array(points)n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='.')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=10, color='red', label='Centroids')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('K-Means Clustering')
plt.legend()
plt.show()
完整代码
项目结构如下
KMeans-golang
│ go.mod
│ main.go
│ raw.py
│ show.py
└─toolkmeans.go
main.go
package mainimport ("fmt""k-means/tool""math""math/rand""os""os/exec""time"
)func generateRandomPoints(n int) []tool.Point {points := make([]tool.Point, n)for i := 0; i < n; i++ {points[i] = tool.Point{X: rand.Float64() * 100,Y: rand.Float64() * 100,}}return points
}func wrapper(name string, fun func()) {start := time.Now()fun()elapsed := time.Since(start)fmt.Printf("%s 函数执行时间:%s\n", name, elapsed)fmt.Printf("%s 函数执行时间(纳秒):%dns\n", name, elapsed.Nanoseconds())
}func writeToFile(points [][]tool.Point) {file, err := os.Create("points.txt")if err != nil {fmt.Println("Failed to create file:", err)return}defer file.Close()for _, row := range points {for _, p := range row {_, err := fmt.Fprintf(file, "%f %f\n", p.X, p.Y)if err != nil {fmt.Println("Failed to write to file:", err)return}}_, err := fmt.Fprintf(file, "----\n")if err != nil {fmt.Println("Failed to write to file:", err)return}}fmt.Println("Data written to file successfully.")
}
func writeToFile2(points []tool.Point) {file, err := os.Create("rawpoints.txt")if err != nil {fmt.Println("Failed to create file:", err)return}defer file.Close()for _, p := range points {_, err := fmt.Fprintf(file, "%f %f\n", p.X, p.Y)if err != nil {fmt.Println("Failed to write to file:", err)return}}fmt.Println("Data written to file successfully.")
}
func main() {rand.Seed(time.Now().UnixNano())// 样本数据var points []tool.Pointwrapper("生成样本", func() {points = generateRandomPoints(100)})k := 5distance := func(p1, p2 tool.Point) float64 {return math.Sqrt(math.Pow(p1.X-p2.X, 2) + math.Pow(p1.Y-p2.Y, 2))}kMeansObj := new(tool.KMeans)kMeansObj.Init(k, points, distance)var (finalClusters [][]tool.Pointcount int)wrapper("执行算法", func() {finalClusters, count = kMeansObj.Do(func(oldCentroids, newCentroids []tool.Point) bool {epsilon := 0.000001for i := 0; i < len(oldCentroids); i++ {if distance(oldCentroids[i], newCentroids[i]) > epsilon {return false}}return true})})fmt.Println("count: ", count)wrapper("写入文件", func() {writeToFile2(points)writeToFile(finalClusters)})go func() {command := exec.Command("C:\\Projects\\PycharmProjects\\deelLearn\\venv\\Scripts\\python.exe", "raw.py")command.Run()command.Wait()}()command := exec.Command("C:\\Projects\\PycharmProjects\\deelLearn\\venv\\Scripts\\python.exe", "show.py")command.Run()command.Wait()
}
raw.py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeanspoints = []
with open('rawpoints.txt', 'r') as file:for line in file:x, y = map(float, line.split())points.append([x, y])X = np.array(points)n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='.')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=10, color='red', label='Centroids')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('K-Means Clustering')
plt.legend()
plt.show()
show.py
import matplotlib.pyplot as pltdef readFromFile():clusters = []current_cluster = []with open('points.txt', 'r') as file:for line in file:if line.strip() == "----":if current_cluster:clusters.append(current_cluster)current_cluster = []else:x, y = map(float, line.split())current_cluster.append((x, y))if current_cluster:clusters.append(current_cluster)return clustersclusters = readFromFile()for cluster in clusters:X = [point[0] for point in cluster]Y = [point[1] for point in cluster]plt.scatter(X, Y, marker='.')plt.xlabel('X')
plt.ylabel('Y')
plt.title('Clustered Scatter Plot')
plt.show()
tool/kmeans.go
package toolimport "math"type (Point struct {X, Y float64}KMeans struct {points []Pointk intdistanceFunc distanceFuncavgPoints []Point}distanceFunc func(p1, p2 Point) float64
)func (kMeans *KMeans) Init(k int, points []Point, distanceFunc distanceFunc) {kMeans.k = kkMeans.points = pointskMeans.distanceFunc = distanceFunckMeans.avgPoints = make([]Point, kMeans.k)
}func (kMeans *KMeans) Do(checkFunc func(oldCentroids, newCentroids []Point) bool) ([][]Point, int) {kMeans.initializeAvgPoints()var (clusters [][]PointtmpAvgPoints []Pointcount int)for {// 获取聚类含有哪些点clusters = kMeans.computeDistanceToAvgPoints()// 更新聚类中心tmpAvgPoints = kMeans.updateAvgPoints(clusters)// 检查质心位置的变化if checkFunc(kMeans.avgPoints, tmpAvgPoints) {break}// 更新质心位置kMeans.avgPoints = tmpAvgPointscount++}return clusters, count
}func (kMeans *KMeans) initializeAvgPoints() {copy(kMeans.avgPoints, kMeans.points[:kMeans.k])
}func (kMeans *KMeans) computeDistanceToAvgPoints() [][]Point {type BakPoint struct {i intp Point}clusters := make([][]Point, len(kMeans.avgPoints))resultCh := make(chan BakPoint, len(kMeans.points))for _, point := range kMeans.points {computeMinDistanceForSignlePoint := func(point Point, avgPoints []Point, distanceFunc func(p1, p2 Point) float64, ch chan BakPoint) {minDistance := struct {d float64i int}{d: math.MaxFloat64,i: -1,}for i, avgPoint := range avgPoints {if d := distanceFunc(avgPoint, point); d < minDistance.d {minDistance.d = dminDistance.i = i}}ch <- BakPoint{p: point, i: minDistance.i}}go computeMinDistanceForSignlePoint(point, kMeans.avgPoints, kMeans.distanceFunc, resultCh)}for i := 0; i < len(kMeans.points); i++ {result := <-resultChclusters[result.i] = append(clusters[result.i], result.p)}close(resultCh)return clusters
}func (kMeans *KMeans) updateAvgPoints(clusters [][]Point) []Point {centroids := make([]Point, kMeans.k)for i, cluster := range clusters {sumX, sumY := 0.0, 0.0for _, point := range cluster {sumX += point.XsumY += point.Y}centroids[i].X = sumX / float64(len(cluster))centroids[i].Y = sumY / float64(len(cluster))}return centroids
}