当前位置: 首页 > news >正文

Go语言机器学习实战:聚类算法与无监督学习

Go语言机器学习实战:聚类算法与无监督学习

无监督学习是机器学习的重要分支,它从无标签数据中发现模式和结构。聚类算法是无监督学习的核心,本文将深入探讨如何使用Go语言实现常见的聚类算法。

一、聚类算法概述

聚类是将数据点分组的过程,使得同一组内的数据点相似度较高,而不同组之间的数据点相似度较低。常见的聚类算法包括:

  • K-Means:基于距离的划分方法,简单高效
  • 层次聚类:构建层次化的聚类树
  • DBSCAN:基于密度的聚类,能发现任意形状的簇
  • 高斯混合模型:概率模型,考虑数据的分布

二、K-Means聚类实现

2.1 算法原理

K-Means算法的核心思想是:

  1. 随机选择K个初始质心
  2. 将每个数据点分配到最近的质心
  3. 重新计算每个簇的质心
  4. 重复步骤2-3,直到质心不再变化(收敛)或达到最大迭代次数

2.2 Go语言实现

package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

// Point is one sample, described by a vector of numeric features.
type Point struct {
	Features []float64
}

// KMeans is a K-Means clustering model.
type KMeans struct {
	K         int     // number of clusters requested
	Centroids []Point // cluster centres; populated by fit
	MaxIter   int     // maximum number of assign/update rounds
}

// NewKMeans returns an unfitted model that will look for k clusters and run
// at most maxIter iterations.
func NewKMeans(k, maxIter int) *KMeans {
	return &KMeans{K: k, MaxIter: maxIter}
}

// distance returns the Euclidean distance between p1 and p2. It walks p1's
// features, so p2 must have at least as many features as p1.
func (km *KMeans) distance(p1, p2 Point) float64 {
	var sum float64
	for i, v := range p1.Features {
		d := v - p2.Features[i]
		sum += d * d
	}
	return math.Sqrt(sum)
}

// fit runs K-Means on points and stores the resulting centres in
// km.Centroids.
//
// Initial centroids are sampled without replacement, so they are distinct
// samples whenever K <= len(points); only when K exceeds the number of points
// are the surplus centroids drawn with repetition. If points is empty or K is
// not positive there is nothing to fit: Centroids is set to nil and fit
// returns. All points are expected to have the same number of features.
func (km *KMeans) fit(points []Point) {
	if len(points) == 0 || km.K <= 0 {
		km.Centroids = nil
		return
	}

	// A private generator avoids reseeding the process-wide source.
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Random initialisation of the centroids.
	order := rng.Perm(len(points))
	km.Centroids = make([]Point, km.K)
	for i := range km.Centroids {
		var idx int
		if i < len(order) {
			idx = order[i]
		} else {
			idx = rng.Intn(len(points))
		}
		// Copy the features so the centroid never aliases caller-owned data.
		km.Centroids[i] = Point{Features: append([]float64(nil), points[idx].Features...)}
	}

	for iter := 0; iter < km.MaxIter; iter++ {
		// Assignment step: each point joins the cluster of its nearest centroid.
		clusters := make([][]Point, len(km.Centroids))
		for _, p := range points {
			c := km.predict(p)
			clusters[c] = append(clusters[c], p)
		}

		// Update step: move each centroid to the mean of its members.
		prev := make([]Point, len(km.Centroids))
		copy(prev, km.Centroids)
		for i, cluster := range clusters {
			if len(cluster) == 0 {
				continue // an empty cluster keeps its previous centroid
			}
			mean := make([]float64, len(cluster[0].Features))
			for _, p := range cluster {
				for j, v := range p.Features {
					mean[j] += v
				}
			}
			for j := range mean {
				mean[j] /= float64(len(cluster))
			}
			km.Centroids[i] = Point{Features: mean}
		}

		// Convergence check: stop once no centroid moved more than 1e-6.
		converged := true
		for i := range km.Centroids {
			if km.distance(km.Centroids[i], prev[i]) > 1e-6 {
				converged = false
				break
			}
		}
		if converged {
			fmt.Printf("算法在第%d次迭代收敛\n", iter+1)
			break
		}
	}
}

// predict returns the index of the centroid closest to point. Ties go to the
// lowest index. It returns 0 when the model has no centroids.
func (km *KMeans) predict(point Point) int {
	minDist := math.MaxFloat64
	clusterIdx := 0
	for i, centroid := range km.Centroids {
		if dist := km.distance(point, centroid); dist < minDist {
			minDist = dist
			clusterIdx = i
		}
	}
	return clusterIdx
}

2.3 使用示例

func main() { // 生成模拟数据 points := []Point{ {Features: []float64{1, 2}}, {Features: []float64{2, 1}}, {Features: []float64{2, 3}}, {Features: []float64{8, 7}}, {Features: []float64{9, 8}}, {Features: []float64{7, 9}}, {Features: []float64{15, 16}}, {Features: []float64{16, 15}}, {Features: []float64{17, 17}}, } kmeans := NewKMeans(3, 100) kmeans.fit(points) fmt.Println("质心坐标:") for i, centroid := range kmeans.Centroids { fmt.Printf("簇%d: %v\n", i, centroid.Features) } // 预测新数据点 testPoint := Point{Features: []float64{10, 10}} cluster := kmeans.predict(testPoint) fmt.Printf("数据点(10, 10)属于簇%d\n", cluster) }

三、DBSCAN聚类实现

3.1 算法原理

DBSCAN(Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法:

  • 核心点:在指定半径ε内有至少MinPts个邻居
  • 边界点:在ε内邻居数少于MinPts,但属于某个核心点的邻域
  • 噪声点:既不是核心点也不是边界点

3.2 Go语言实现

type DBSCAN struct { Epsilon float64 MinPts int labels []int } func NewDBSCAN(epsilon float64, minPts int) *DBSCAN { return &DBSCAN{ Epsilon: epsilon, MinPts: minPts, } } func (d *DBSCAN) fit(points []Point) { n := len(points) d.labels = make([]int, n) for i := range d.labels { d.labels[i] = -1 // -1表示未访问 } clusterID := 0 for i := 0; i < n; i++ { if d.labels[i] != -1 { continue } neighbors := d.rangeQuery(points, i) if len(neighbors) < d.MinPts { d.labels[i] = 0 // 标记为噪声 continue } // 扩展簇 d.expandCluster(points, i, neighbors, clusterID) clusterID++ } } func (d *DBSCAN) rangeQuery(points []Point, idx int) []int { neighbors := []int{} for i := 0; i < len(points); i++ { if i == idx { continue } dist := d.distance(points[idx], points[i]) if dist <= d.Epsilon { neighbors = append(neighbors, i) } } return neighbors } func (d *DBSCAN) expandCluster(points []Point, idx int, neighbors []int, clusterID int) { d.labels[idx] = clusterID queue := neighbors for len(queue) > 0 { current := queue[0] queue = queue[1:] if d.labels[current] == 0 { d.labels[current] = clusterID } if d.labels[current] != -1 { continue } d.labels[current] = clusterID currentNeighbors := d.rangeQuery(points, current) if len(currentNeighbors) >= d.MinPts { queue = append(queue, currentNeighbors...) } } } func (d *DBSCAN) distance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

四、层次聚类

4.1 算法原理

层次聚类有两种策略:

  • 凝聚式:从单个点开始,逐步合并相似的簇
  • 分裂式:从整个数据集开始,逐步分裂

4.2 Go语言实现

type HierarchicalClustering struct { linkage string // "single", "complete", "average" } func NewHierarchicalClustering(linkage string) *HierarchicalClustering { return &HierarchicalClustering{linkage: linkage} } func (hc *HierarchicalClustering) fit(points []Point) [][]Point { // 初始化每个点为一个簇 clusters := make([][]Point, len(points)) for i, point := range points { clusters[i] = []Point{point} } for len(clusters) > 1 { minDist := math.MaxFloat64 mergeI, mergeJ := 0, 1 // 找到距离最近的两个簇 for i := 0; i < len(clusters); i++ { for j := i + 1; j < len(clusters); j++ { dist := hc.clusterDistance(clusters[i], clusters[j]) if dist < minDist { minDist = dist mergeI, mergeJ = i, j } } } // 合并两个簇 merged := append(clusters[mergeI], clusters[mergeJ]...) clusters = append(clusters[:mergeJ], clusters[mergeJ+1:]...) clusters[mergeI] = merged } return clusters } func (hc *HierarchicalClustering) clusterDistance(c1, c2 []Point) float64 { switch hc.linkage { case "single": return hc.singleLinkage(c1, c2) case "complete": return hc.completeLinkage(c1, c2) case "average": return hc.averageLinkage(c1, c2) default: return hc.singleLinkage(c1, c2) } } func (hc *HierarchicalClustering) singleLinkage(c1, c2 []Point) float64 { minDist := math.MaxFloat64 for _, p1 := range c1 { for _, p2 := range c2 { dist := hc.pointDistance(p1, p2) if dist < minDist { minDist = dist } } } return minDist } func (hc *HierarchicalClustering) completeLinkage(c1, c2 []Point) float64 { maxDist := 0.0 for _, p1 := range c1 { for _, p2 := range c2 { dist := hc.pointDistance(p1, p2) if dist > maxDist { maxDist = dist } } } return maxDist } func (hc *HierarchicalClustering) averageLinkage(c1, c2 []Point) float64 { var sumDist float64 count := 0 for _, p1 := range c1 { for _, p2 := range c2 { sumDist += hc.pointDistance(p1, p2) count++ } } return sumDist / float64(count) } func (hc *HierarchicalClustering) pointDistance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += 
math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

五、高斯混合模型(GMM)

5.1 算法原理

GMM假设数据来自多个高斯分布的混合,使用EM算法进行参数估计。

5.2 Go语言实现

// minStdDev is the smallest standard deviation fit will assign to a
// component. Without a floor, a component that collapses onto identical
// values gets StdDev 0 and its density becomes NaN on the next iteration.
const minStdDev = 1e-6

// Gaussian is a one-dimensional normal distribution.
type Gaussian struct {
	Mean   float64
	StdDev float64
}

// pdf returns the probability density of the distribution at x. StdDev must
// be positive.
func (g *Gaussian) pdf(x float64) float64 {
	d := x - g.Mean
	return math.Exp(-(d*d)/(2*g.StdDev*g.StdDev)) / (g.StdDev * math.Sqrt(2*math.Pi))
}

// GMM is a one-dimensional Gaussian mixture model.
type GMM struct {
	Gaussians []Gaussian // mixture components
	Weights   []float64  // mixing weight of each component
}

// NewGMM returns a mixture with k zero-valued components; call fit to
// estimate them.
func NewGMM(k int) *GMM {
	return &GMM{
		Gaussians: make([]Gaussian, k),
		Weights:   make([]float64, k),
	}
}

// fit estimates the mixture parameters from data by running maxIter rounds of
// expectation-maximisation.
//
// Component i starts at Mean = data[i*n/k] with StdDev 1 and weight 1/k. fit
// does nothing when data is empty or the model has no components.
func (gmm *GMM) fit(data []float64, maxIter int) {
	n := len(data)
	k := len(gmm.Gaussians)
	if n == 0 || k == 0 {
		return
	}

	// Initialisation.
	for i := range gmm.Gaussians {
		gmm.Gaussians[i] = Gaussian{Mean: data[i*n/k], StdDev: 1.0}
		gmm.Weights[i] = 1.0 / float64(k)
	}

	// responsibilities[i][j] is the posterior probability that data[j] was
	// generated by component i. Every cell is overwritten in each E-step, so
	// the matrix is allocated once.
	responsibilities := make([][]float64, k)
	for i := range responsibilities {
		responsibilities[i] = make([]float64, n)
	}

	for iter := 0; iter < maxIter; iter++ {
		// E-step: compute the posterior probabilities.
		for j, x := range data {
			var sum float64
			for i := 0; i < k; i++ {
				responsibilities[i][j] = gmm.Weights[i] * gmm.Gaussians[i].pdf(x)
				sum += responsibilities[i][j]
			}
			if sum > 0 {
				for i := 0; i < k; i++ {
					responsibilities[i][j] /= sum
				}
				continue
			}
			// Every density underflowed to zero (x is far from all
			// components). Dividing would give NaN, so share the point
			// equally instead.
			for i := 0; i < k; i++ {
				responsibilities[i][j] = 1.0 / float64(k)
			}
		}

		// M-step: re-estimate the parameters.
		for i := 0; i < k; i++ {
			var weightSum, meanSum float64
			for j, x := range data {
				weightSum += responsibilities[i][j]
				meanSum += responsibilities[i][j] * x
			}
			gmm.Weights[i] = weightSum / float64(n)
			if weightSum == 0 {
				// The component owns no data: keep its previous mean and
				// spread rather than dividing by zero.
				continue
			}

			mean := meanSum / weightSum
			var varSum float64
			for j, x := range data {
				d := x - mean
				varSum += responsibilities[i][j] * d * d
			}
			gmm.Gaussians[i].Mean = mean
			gmm.Gaussians[i].StdDev = math.Max(math.Sqrt(varSum/weightSum), minStdDev)
		}
	}
}

六、聚类评估指标

func SilhouetteScore(points []Point, labels []int) float64 { n := len(points) silhouette := make([]float64, n) for i := 0; i < n; i++ { // 计算a(i):同一簇内其他点的平均距离 var a float64 sameCluster := []Point{} for j := 0; j < n; j++ { if i != j && labels[j] == labels[i] { sameCluster = append(sameCluster, points[j]) } } if len(sameCluster) > 0 { for _, p := range sameCluster { a += distance(points[i], p) } a /= float64(len(sameCluster)) } // 计算b(i):最近簇的平均距离 b := math.MaxFloat64 clusters := make(map[int][]Point) for j := 0; j < n; j++ { if j != i { clusters[labels[j]] = append(clusters[labels[j]], points[j]) } } for _, cluster := range clusters { if len(cluster) > 0 { var distSum float64 for _, p := range cluster { distSum += distance(points[i], p) } avgDist := distSum / float64(len(cluster)) if avgDist < b { b = avgDist } } } silhouette[i] = (b - a) / math.Max(a, b) } var score float64 for _, s := range silhouette { score += s } return score / float64(n) } func distance(p1, p2 Point) float64 { var sum float64 for i := 0; i < len(p1.Features); i++ { sum += math.Pow(p1.Features[i]-p2.Features[i], 2) } return math.Sqrt(sum) }

七、总结

本文介绍了四种经典的聚类算法及其Go语言实现:

  1. K-Means:简单高效,适合大规模数据
  2. DBSCAN:基于密度,能发现任意形状的簇
  3. 层次聚类:构建层次化结构,无需指定K值
  4. 高斯混合模型:概率模型,考虑数据分布

每种算法都有其适用场景,选择合适的聚类算法需要考虑数据特性和业务需求。Go语言的高性能特性使其成为处理大规模数据聚类的理想选择。

http://www.rkmt.cn/news/1424179.html

相关文章:

  • 豆包优化怎么选才稳妥?细数企业高频踩坑问题,三家服务商实测参考 - 玖叁鹿
  • GaiaNet Chat从零上手:去中心化AI聊天应用实战指南
  • 基于树莓派Zero与Fusion 360的复古掌机DIY全流程指南
  • 2026年济南宣传片拍摄/山东宣传片制作榜单:企业影视制作与创意视觉深度推荐 - 品牌企业推荐师(官方)
  • 基于树莓派Zero W打造GTA风格车载FM发射器:硬件改造与Python控制
  • 2026年物联网GEO优化公司哪家好?“全意图”占领AI心智 - GEO优化
  • 2026兰州生活水箱厂家TOP5排行:兰州不锈钢水箱、兰州水箱、兰州消防水箱、无负压设备、消防稳压供水设备、消防稳压设备选择指南 - 优质品牌商家
  • 基于ESP32与多传感器融合的智能家庭健身系统设计与实现
  • 专业级GPU内存检测工具MemtestCL:构建计算设备健康保障体系
  • 成都茶楼装修技术解析:成都店铺装修设计/成都店面装修/成都民宿装修/成都火锅店装修/成都美容院装修/成都舞蹈室装修/选择指南 - 优质品牌商家
  • Legacy iOS Kit终极指南:如何让旧iPhone/iPad重获新生?
  • 课程排期总出错?教师调度总延迟?Lindy自动化系统上线后故障率下降92%,关键配置参数首次公开
  • 基于Arduino的听障辅助眼镜DIY:声音转振动触觉提示系统
  • 神经体积渲染全解析:从NeRF原理到产业落地
  • 基于EZ-Robot的R2-D2智能改造:多传感器融合与集中控制实践
  • 九大网盘直链下载终极指南:告别限速,一键获取真实下载地址
  • 2026现阶段,四川老人开裆裤直销工厂优选:金阑亭以专业实力守护卧床尊严 - 2026年企业资讯
  • 别再手动拉人了!用代码自动管理企微外部群、发消息的技术秘诀
  • 基于树莓派Zero 2W与RetroPie打造便携式复古游戏机全攻略
  • 2026高效400平方压滤机出租服务商推荐榜:地基工程泥浆处理/地铁盾构泥浆脱水/城市生活污水处理/尾矿库泥浆脱水/选择指南 - 优质品牌商家
  • MATLAB语音识别教学实验包:带录音功能、DTW/HMM双算法演示与实时波形对比
  • AP-0316 语音模块实测效果与能力边界展示
  • 城通网盘直链解析终极指南:3分钟告别下载烦恼
  • 2026年儿童陪睡毛绒玩具推荐:五家优选品牌深度解析 - 科技焦点
  • STM32摊贩定位监控套件:BDS定位+OLED报警+机智云远程调参(含可烧录hex与毕设文档)
  • 苏州防水补漏公司 TOP1|屋面卫生间渗漏修缮靠谱推荐 - 吉修匠
  • 基于WSN算法及3D位移协同预警模型的卫生填埋场动态监测智能管控系统方案【附数据】
  • Kruskal与Prim:最小生成树双雄对决
  • 2026年商家小程序外卖怎么找骑手
  • 别再暴力刷新了!用ScriptableObject和事件驱动重构Unity背包系统,性能提升实测