github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/aggexec/algos/kmeans/elkans/clusterer.go (about) 1 // Copyright 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package elkans 16 17 import ( 18 "github.com/matrixorigin/matrixone/pkg/common/moerr" 19 "github.com/matrixorigin/matrixone/pkg/logutil" 20 "github.com/matrixorigin/matrixone/pkg/sql/colexec/aggexec/algos/kmeans" 21 "github.com/matrixorigin/matrixone/pkg/vectorize/moarray" 22 "gonum.org/v1/gonum/mat" 23 "math" 24 "math/rand" 25 "sync" 26 ) 27 28 // ElkanClusterer is an improved kmeans algorithm which using the triangle inequality to reduce the number of 29 // distance calculations. As quoted from the paper: 30 // "The main contribution of this paper is an optimized version of the standard k-means method, with which the number 31 // of distance computations is in practice closer to `n` than to `nke`, where n is the number of vectors, k is the number 32 // of centroids, and e is the number of iterations needed until convergence." 33 // 34 // However, during each iteration of the algorithm, the lower bounds l(x, c) are updated for all points x and centers c. 35 // These updates take O(nk) time, so the complexity of the algorithm remains at least O(nke), even though the number 36 // of distance calculations is roughly O(n) only. 37 // NOTE that, distance calculation is very expensive for higher dimension vectors. 38 // 39 // Ref Paper: https://cdn.aaai.org/ICML/2003/ICML03-022.pdf 40 type ElkanClusterer struct { 41 42 // for each of the n vectors, we keep track of the following data 43 vectorList []*mat.VecDense 44 vectorMetas []vectorMeta 45 assignments []int 46 47 // for each of the k centroids, we keep track of the following data 48 centroids []*mat.VecDense 49 halfInterCentroidDistMatrix [][]float64 50 minHalfInterCentroidDist []float64 51 52 // thresholds 53 maxIterations int // e in paper 54 deltaThreshold float64 // used for early convergence. we are not using it right now. 55 56 // counts 57 clusterCnt int // k in paper 58 vectorCnt int // n in paper 59 60 distFn kmeans.DistanceFunction 61 initType kmeans.InitType 62 rand *rand.Rand 63 normalize bool 64 } 65 66 // vectorMeta holds required information for Elkan's kmeans pruning. 67 // lower is at-least distance of a vector to each of the k centroids. Thus, there are k values for each data point. 68 // upper is at-most (maximum possible) distance of a vector to its currently assigned or "closest" centroid. 69 // Hence, there's only one value for each data point. 70 // recompute is a flag to indicate if the distance to centroids needs to be recomputed. if false, 71 // the algorithm will rely on the 'upper' bound as an approximation instead of computing the exact distance. 72 type vectorMeta struct { 73 lower []float64 74 upper float64 75 recompute bool 76 } 77 78 var _ kmeans.Clusterer = new(ElkanClusterer) 79 80 func NewKMeans(vectors [][]float64, clusterCnt, 81 maxIterations int, deltaThreshold float64, 82 distanceType kmeans.DistanceType, initType kmeans.InitType, 83 normalize bool, 84 ) (kmeans.Clusterer, error) { 85 86 err := validateArgs(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType) 87 if err != nil { 88 return nil, err 89 } 90 91 gonumVectors, err := moarray.ToGonumVectors[float64](vectors...) 92 if err != nil { 93 return nil, err 94 } 95 96 assignments := make([]int, len(vectors)) 97 var metas = make([]vectorMeta, len(vectors)) 98 for i := range metas { 99 metas[i] = vectorMeta{ 100 lower: make([]float64, clusterCnt), 101 upper: 0, 102 recompute: true, 103 } 104 } 105 106 centroidDist := make([][]float64, clusterCnt) 107 for i := range centroidDist { 108 centroidDist[i] = make([]float64, clusterCnt) 109 } 110 minCentroidDist := make([]float64, clusterCnt) 111 112 distanceFunction, err := resolveDistanceFn(distanceType) 113 if err != nil { 114 return nil, err 115 } 116 117 return &ElkanClusterer{ 118 maxIterations: maxIterations, 119 deltaThreshold: deltaThreshold, 120 121 vectorList: gonumVectors, 122 assignments: assignments, 123 vectorMetas: metas, 124 125 //centroids will be initialized by InitCentroids() 126 halfInterCentroidDistMatrix: centroidDist, 127 minHalfInterCentroidDist: minCentroidDist, 128 129 distFn: distanceFunction, 130 initType: initType, 131 clusterCnt: clusterCnt, 132 vectorCnt: len(vectors), 133 134 rand: rand.New(rand.NewSource(kmeans.DefaultRandSeed)), 135 normalize: normalize, 136 }, nil 137 } 138 139 // InitCentroids initializes the centroids using initialization algorithms like random or kmeans++. 140 func (km *ElkanClusterer) InitCentroids() error { 141 var initializer Initializer 142 switch km.initType { 143 case kmeans.Random: 144 initializer = NewRandomInitializer() 145 case kmeans.KmeansPlusPlus: 146 initializer = NewKMeansPlusPlusInitializer(km.distFn) 147 default: 148 initializer = NewRandomInitializer() 149 } 150 km.centroids = initializer.InitCentroids(km.vectorList, km.clusterCnt) 151 return nil 152 } 153 154 // Cluster returns the final centroids and the error if any. 155 func (km *ElkanClusterer) Cluster() ([][]float64, error) { 156 if km.normalize { 157 moarray.NormalizeGonumVectors(km.vectorList) 158 } 159 160 if km.vectorCnt == km.clusterCnt { 161 return moarray.ToMoArrays[float64](km.vectorList) 162 } 163 164 err := km.InitCentroids() // step 0.1 165 if err != nil { 166 return nil, err 167 } 168 169 km.initBounds() // step 0.2 170 171 res, err := km.elkansCluster() 172 if err != nil { 173 return nil, err 174 } 175 176 return moarray.ToMoArrays[float64](res) 177 } 178 179 func (km *ElkanClusterer) elkansCluster() ([]*mat.VecDense, error) { 180 181 for iter := 0; ; iter++ { 182 km.computeCentroidDistances() // step 1 183 184 changes := km.assignData() // step 2 and 3 185 186 newCentroids := km.recalculateCentroids() // step 4 187 188 km.updateBounds(newCentroids) // step 5 and 6 189 190 km.centroids = newCentroids // step 7 191 192 logutil.Debugf("kmeans iter=%d, changes=%d", iter, changes) 193 if iter != 0 && km.isConverged(iter, changes) { 194 break 195 } 196 } 197 return km.centroids, nil 198 } 199 200 func validateArgs(vectorList [][]float64, clusterCnt, 201 maxIterations int, deltaThreshold float64, 202 distanceType kmeans.DistanceType, initType kmeans.InitType) error { 203 if len(vectorList) == 0 || len(vectorList[0]) == 0 { 204 return moerr.NewInternalErrorNoCtx("input vectors is empty") 205 } 206 if clusterCnt > len(vectorList) { 207 return moerr.NewInternalErrorNoCtx("cluster count is larger than vector count %d > %d", clusterCnt, len(vectorList)) 208 } 209 if maxIterations < 0 { 210 return moerr.NewInternalErrorNoCtx("max iteration is out of bounds (must be >= 0)") 211 } 212 if deltaThreshold <= 0.0 || deltaThreshold >= 1.0 { 213 return moerr.NewInternalErrorNoCtx("delta threshold is out of bounds (must be > 0.0 and < 1.0)") 214 } 215 if distanceType > 2 { 216 return moerr.NewInternalErrorNoCtx("distance type is not supported") 217 } 218 if initType > 1 { 219 return moerr.NewInternalErrorNoCtx("init type is not supported") 220 } 221 222 // We need to validate that all vectors have the same dimension. 223 // This is already done by moarray.ToGonumVectors, so skipping it here. 224 225 if (clusterCnt * clusterCnt) > math.MaxInt { 226 return moerr.NewInternalErrorNoCtx("cluster count is too large for int*int") 227 } 228 229 return nil 230 } 231 232 // initBounds initializes the lower bounds, upper bound and assignment for each vector. 233 func (km *ElkanClusterer) initBounds() { 234 // step 0.2 235 // Set the lower bound l(x, c)=0 for each point x and center c. 236 // Assign each x to its closest initial center c(x)=min{ d(x, c) }, using Lemma 1 to avoid 237 // redundant distance calculations. Each time d(x, c) is computed, set l(x, c)=d(x, c). 238 // Assign upper bounds u(x)=min_c d(x, c). 239 for x := range km.vectorList { 240 minDist := math.MaxFloat64 241 closestCenter := 0 242 for c := range km.centroids { 243 dist := km.distFn(km.vectorList[x], km.centroids[c]) 244 km.vectorMetas[x].lower[c] = dist 245 if dist < minDist { 246 minDist = dist 247 closestCenter = c 248 } 249 } 250 251 km.vectorMetas[x].upper = minDist 252 km.assignments[x] = closestCenter 253 } 254 } 255 256 // computeCentroidDistances computes the centroid distances and the min centroid distances. 257 // NOTE: here we are save 0.5 of centroid distance to avoid 0.5 multiplication in step 3(iii) and 3.b. 258 func (km *ElkanClusterer) computeCentroidDistances() { 259 260 // step 1.a 261 // For all centers c and c', compute 0.5 x d(c, c'). 262 var wg sync.WaitGroup 263 for r := 0; r < km.clusterCnt; r++ { 264 for c := r + 1; c < km.clusterCnt; c++ { 265 wg.Add(1) 266 go func(i, j int) { 267 defer wg.Done() 268 dist := 0.5 * km.distFn(km.centroids[i], km.centroids[j]) 269 km.halfInterCentroidDistMatrix[i][j] = dist 270 km.halfInterCentroidDistMatrix[j][i] = dist 271 }(r, c) 272 } 273 } 274 wg.Wait() 275 276 // step 1.b 277 // For all centers c, compute s(c)=0.5 x min{d(c, c') | c'!= c}. 278 for i := 0; i < km.clusterCnt; i++ { 279 currMinDist := math.MaxFloat64 280 for j := 0; j < km.clusterCnt; j++ { 281 if i == j { 282 continue 283 } 284 currMinDist = math.Min(currMinDist, km.halfInterCentroidDistMatrix[i][j]) 285 } 286 km.minHalfInterCentroidDist[i] = currMinDist 287 } 288 } 289 290 // assignData assigns each vector to the nearest centroid. 291 // This is the place where most of the "distance computation skipping" happens. 292 func (km *ElkanClusterer) assignData() int { 293 294 changes := 0 295 296 for currVector := range km.vectorList { 297 298 // step 2 299 // u(x) <= s(c(x)) 300 if km.vectorMetas[currVector].upper <= km.minHalfInterCentroidDist[km.assignments[currVector]] { 301 continue 302 } 303 304 for c := range km.centroids { // c is nextPossibleCentroidIdx 305 // step 3 306 // For all remaining points x and centers c such that 307 // (i) c != c(x) and 308 // (ii) u(x)>l(x, c) and 309 // (iii) u(x)> 0.5 x d(c(x), c) 310 if c != km.assignments[currVector] && 311 km.vectorMetas[currVector].upper > km.vectorMetas[currVector].lower[c] && 312 km.vectorMetas[currVector].upper > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { 313 314 //step 3.a - Bounds update 315 // If r(x) then compute d(x, c(x)) and assign r(x)= false. 316 var dxcx float64 317 if km.vectorMetas[currVector].recompute { 318 km.vectorMetas[currVector].recompute = false 319 320 dxcx = km.distFn(km.vectorList[currVector], km.centroids[km.assignments[currVector]]) 321 km.vectorMetas[currVector].upper = dxcx 322 km.vectorMetas[currVector].lower[km.assignments[currVector]] = dxcx 323 324 if km.vectorMetas[currVector].upper <= km.vectorMetas[currVector].lower[c] { 325 continue // Pruned by triangle inequality on lower bound. 326 } 327 328 if km.vectorMetas[currVector].upper <= km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { 329 continue // Pruned by triangle inequality on cluster distances. 330 } 331 332 } else { 333 dxcx = km.vectorMetas[currVector].upper // Otherwise, d(x, c(x))=u(x). 334 } 335 336 //step 3.b - Update 337 // If d(x, c(x))>l(x, c) or d(x, c(x))> 0.5 d(c(x), c) then 338 // Compute d(x, c) 339 // If d(x, c)<d(x, c(x)) then assign c(x)=c. 340 if dxcx > km.vectorMetas[currVector].lower[c] || 341 dxcx > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { 342 343 dxc := km.distFn(km.vectorList[currVector], km.centroids[c]) // d(x,c) in the paper 344 km.vectorMetas[currVector].lower[c] = dxc 345 if dxc < dxcx { 346 km.vectorMetas[currVector].upper = dxc 347 km.assignments[currVector] = c 348 changes++ 349 } 350 } 351 } 352 } 353 } 354 return changes 355 } 356 357 // recalculateCentroids calculates the new mean centroids based on the new assignments. 358 func (km *ElkanClusterer) recalculateCentroids() []*mat.VecDense { 359 membersCount := make([]int64, km.clusterCnt) 360 361 newCentroids := make([]*mat.VecDense, km.clusterCnt) 362 for c := range newCentroids { 363 newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), nil) 364 } 365 366 // sum of all the members of the cluster 367 for x, vec := range km.vectorList { 368 cx := km.assignments[x] 369 membersCount[cx]++ 370 newCentroids[cx].AddVec(newCentroids[cx], vec) 371 } 372 373 // means of the clusters = sum of all the members of the cluster / number of members in the cluster 374 for c := range newCentroids { 375 if membersCount[c] == 0 { 376 // pick a vector randomly from existing vectors as the new centroid 377 //newCentroids[c] = km.vectorList[km.rand.Intn(km.vectorCnt)] 378 379 //// if the cluster is empty, reinitialize it to a random vector, since you can't find the mean of an empty set 380 randVector := make([]float64, km.vectorList[0].Len()) 381 for l := range randVector { 382 randVector[l] = km.rand.Float64() 383 } 384 newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), randVector) 385 386 // normalize the random vector 387 if km.normalize { 388 moarray.NormalizeGonumVector(newCentroids[c]) 389 } 390 } else { 391 // find the mean of the cluster members 392 // note: we don't need to normalize here, since the vectors are already normalized 393 newCentroids[c].ScaleVec(1.0/float64(membersCount[c]), newCentroids[c]) 394 } 395 396 } 397 398 return newCentroids 399 } 400 401 // updateBounds updates the lower and upper bounds for each vector. 402 func (km *ElkanClusterer) updateBounds(newCentroid []*mat.VecDense) { 403 404 // compute the centroid shift distance matrix once. 405 // d(c', m(c')) in the paper 406 centroidShiftDist := make([]float64, km.clusterCnt) 407 var wg sync.WaitGroup 408 for c := 0; c < km.clusterCnt; c++ { 409 wg.Add(1) 410 go func(cIdx int) { 411 defer wg.Done() 412 centroidShiftDist[cIdx] = km.distFn(km.centroids[cIdx], newCentroid[cIdx]) 413 logutil.Debugf("centroidShiftDist[%d]=%f", cIdx, centroidShiftDist[cIdx]) 414 }(c) 415 } 416 wg.Wait() 417 418 // step 5 419 //For each point x and center c, assign 420 // l(x, c)= max{ l(x, c)-d(c, m(c)), 0 } 421 for x := range km.vectorList { 422 for c := range km.centroids { 423 shift := km.vectorMetas[x].lower[c] - centroidShiftDist[c] 424 km.vectorMetas[x].lower[c] = math.Max(shift, 0) 425 } 426 427 // step 6 428 // For each point x, assign 429 // u(x)= u(x) + d(m(c(x)), c(x)) 430 // r(x)= true 431 cx := km.assignments[x] 432 km.vectorMetas[x].upper += centroidShiftDist[cx] 433 km.vectorMetas[x].recompute = true 434 } 435 } 436 437 // isConverged checks if the algorithm has converged. 438 func (km *ElkanClusterer) isConverged(iter int, changes int) bool { 439 if iter == km.maxIterations || changes == 0 { 440 return true 441 } 442 // NOTE: we are not using deltaThreshold right now. 443 //if changes < int(float64(km.vectorCnt)*km.deltaThreshold) { 444 // return true 445 //} 446 return false 447 } 448 449 // SSE returns the sum of squared errors. 450 func (km *ElkanClusterer) SSE() float64 { 451 sse := 0.0 452 for i := range km.vectorList { 453 distErr := km.distFn(km.vectorList[i], km.centroids[km.assignments[i]]) 454 sse += math.Pow(distErr, 2) 455 } 456 return sse 457 }