github.com/schollz/clusters@v0.0.0-20221201012527-c6c68863636f/kmeans_estimator.go (about) 1 package clusters 2 3 import ( 4 "math" 5 "math/rand" 6 "time" 7 8 "gonum.org/v1/gonum/floats" 9 ) 10 11 type kmeansEstimator struct { 12 iterations, number, max int 13 14 // variables keeping count of changes of points' membership every iteration. User as a stopping condition. 15 changes, oldchanges, counter, threshold int 16 17 distance DistanceFunc 18 19 a, b []int 20 21 // slices holding values of centroids of each clusters 22 m, n [][]float64 23 24 // dataset 25 d [][]float64 26 } 27 28 // Implementation of cluster number estimator using gap statistic 29 // ("Estimating the number of clusters in a data set via the gap statistic", Tibshirani et al.) with k-means++ as 30 // clustering algorithm 31 func KMeansEstimator(iterations, clusters int, distance DistanceFunc) (Estimator, error) { 32 if iterations < 1 { 33 return nil, errZeroIterations 34 } 35 36 if clusters < 2 { 37 return nil, errOneCluster 38 } 39 40 var d DistanceFunc 41 { 42 if distance != nil { 43 d = distance 44 } else { 45 d = EuclideanDistance 46 } 47 } 48 49 return &kmeansEstimator{ 50 iterations: iterations, 51 max: clusters, 52 distance: d, 53 }, nil 54 } 55 56 func (c *kmeansEstimator) Estimate(data [][]float64) (int, error) { 57 if len(data) == 0 { 58 return 0, errEmptySet 59 } 60 61 var ( 62 estimated = 0 63 size = len(data) 64 bounds = bounds(data) 65 wks = make([]float64, c.max) 66 wkbs = make([]float64, c.max) 67 sk = make([]float64, c.max) 68 one = make([]float64, c.max) 69 bwkbs = make([]float64, c.max) 70 ) 71 72 for i := 0; i < c.max; i++ { 73 c.number = i + 1 74 75 c.learn(data) 76 77 wks[i] = math.Log(c.wk(c.d, c.m, c.a)) 78 79 for j := 0; j < c.max; j++ { 80 c.learn(c.buildRandomizedSet(size, bounds)) 81 82 bwkbs[j] = math.Log(c.wk(c.d, c.m, c.a)) 83 one[j] = 1 84 } 85 86 wkbs[i] = floats.Sum(bwkbs) / float64(c.max) 87 88 floats.Scale(wkbs[i], one) 89 floats.Sub(bwkbs, one) 90 floats.Mul(bwkbs, bwkbs) 91 92 sk[i] = math.Sqrt(floats.Sum(bwkbs) / float64(c.max)) 93 } 94 95 floats.Scale(math.Sqrt(1+(1/float64(c.max))), sk) 96 97 for i := 0; i < c.max-1; i++ { 98 if wkbs[i] >= wkbs[i+1]-sk[i+1] { 99 estimated = i + 1 100 break 101 } 102 } 103 104 return estimated, nil 105 } 106 107 // private 108 func (c *kmeansEstimator) learn(data [][]float64) { 109 c.d = data 110 111 c.a = make([]int, len(data)) 112 c.b = make([]int, c.number) 113 114 c.counter = 0 115 c.threshold = changesThreshold 116 c.changes = 0 117 c.oldchanges = 0 118 119 c.initializeMeansWithData() 120 121 for i := 0; i < c.iterations && c.counter != c.threshold; i++ { 122 c.run() 123 c.check() 124 } 125 } 126 127 func (c *kmeansEstimator) initializeMeansWithData() { 128 c.m = make([][]float64, c.number) 129 c.n = make([][]float64, c.number) 130 131 rand.Seed(time.Now().UTC().Unix()) 132 133 var ( 134 k int 135 s, t, l, f float64 136 d []float64 = make([]float64, len(c.d)) 137 ) 138 139 c.m[0] = c.d[rand.Intn(len(c.d)-1)] 140 141 for i := 1; i < c.number; i++ { 142 s = 0 143 t = 0 144 for j := 0; j < len(c.d); j++ { 145 146 l = c.distance(c.m[0], c.d[j]) 147 for g := 1; g < i; g++ { 148 if f = c.distance(c.m[g], c.d[j]); f < l { 149 l = f 150 } 151 } 152 153 d[j] = math.Pow(l, 2) 154 s += d[j] 155 } 156 157 t = rand.Float64() * s 158 k = 0 159 for s = d[0]; s < t; s += d[k] { 160 k++ 161 } 162 163 c.m[i] = c.d[k] 164 } 165 166 for i := 0; i < c.number; i++ { 167 c.n[i] = make([]float64, len(c.m[0])) 168 } 169 } 170 171 func (c *kmeansEstimator) run() { 172 var ( 173 l, k, n int = len(c.m[0]), 0, 0 174 m, d float64 175 ) 176 177 for i := 0; i < c.number; i++ { 178 c.b[i] = 0 179 } 180 181 for i := 0; i < len(c.d); i++ { 182 m = c.distance(c.d[i], c.m[0]) 183 n = 0 184 185 for j := 1; j < c.number; j++ { 186 if d = c.distance(c.d[i], c.m[j]); d < m { 187 m = d 188 n = j 189 } 190 } 191 192 k = n + 1 193 194 if c.a[i] != k { 195 c.changes++ 196 } 197 198 c.a[i] = k 199 c.b[n]++ 200 201 floats.Add(c.n[n], c.d[i]) 202 } 203 204 for i := 0; i < c.number; i++ { 205 floats.Scale(1/float64(c.b[i]), c.n[i]) 206 207 for j := 0; j < l; j++ { 208 c.m[i][j] = c.n[i][j] 209 c.n[i][j] = 0 210 } 211 } 212 } 213 214 func (c *kmeansEstimator) check() { 215 if c.changes == c.oldchanges { 216 c.counter++ 217 } 218 219 c.oldchanges = c.changes 220 } 221 222 func (c *kmeansEstimator) wk(data [][]float64, centroids [][]float64, mapping []int) float64 { 223 var ( 224 l = float64(2 * len(data[0])) 225 wk = make([]float64, len(centroids)) 226 ) 227 228 for i := 0; i < len(mapping); i++ { 229 wk[mapping[i]-1] += EuclideanDistanceSquared(centroids[mapping[i]-1], data[i]) / l 230 } 231 232 return floats.Sum(wk) 233 } 234 235 func (c *kmeansEstimator) buildRandomizedSet(size int, bounds []*[2]float64) [][]float64 { 236 var ( 237 l = len(bounds) 238 r = make([][]float64, size) 239 ) 240 241 for i := 0; i < size; i++ { 242 r[i] = make([]float64, l) 243 244 for j := 0; j < l; j++ { 245 r[i][j] = uniform(bounds[j]) 246 } 247 } 248 249 return r 250 }