github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/aggexec/algos/kmeans/elkans/clusterer.go (about)

     1  // Copyright 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package elkans
    16  
    17  import (
    18  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    19  	"github.com/matrixorigin/matrixone/pkg/logutil"
    20  	"github.com/matrixorigin/matrixone/pkg/sql/colexec/aggexec/algos/kmeans"
    21  	"github.com/matrixorigin/matrixone/pkg/vectorize/moarray"
    22  	"gonum.org/v1/gonum/mat"
    23  	"math"
    24  	"math/rand"
    25  	"sync"
    26  )
    27  
    28  // ElkanClusterer is an improved kmeans algorithm which using the triangle inequality to reduce the number of
    29  // distance calculations. As quoted from the paper:
    30  // "The main contribution of this paper is an optimized version of the standard k-means method, with which the number
    31  // of distance computations is in practice closer to `n` than to `nke`, where n is the number of vectors, k is the number
    32  // of centroids, and e is the number of iterations needed until convergence."
    33  //
    34  // However, during each iteration of the algorithm, the lower bounds l(x, c) are updated for all points x and centers c.
    35  // These updates take O(nk) time, so the complexity of the algorithm remains at least O(nke), even though the number
    36  // of distance calculations is roughly O(n) only.
    37  // NOTE that, distance calculation is very expensive for higher dimension vectors.
    38  //
    39  // Ref Paper: https://cdn.aaai.org/ICML/2003/ICML03-022.pdf
type ElkanClusterer struct {

	// for each of the n vectors, we keep track of the following data:
	// the vector itself, its pruning bounds (vectorMeta), and the index
	// of the centroid it is currently assigned to.
	vectorList  []*mat.VecDense
	vectorMetas []vectorMeta
	assignments []int

	// for each of the k centroids, we keep track of the following data:
	// the centroid vector, the matrix of 0.5*d(c, c') for every centroid
	// pair (step 1.a of the paper), and s(c)=0.5*min{d(c, c') | c' != c}
	// per centroid (step 1.b), both used for triangle-inequality pruning.
	centroids                   []*mat.VecDense
	halfInterCentroidDistMatrix [][]float64
	minHalfInterCentroidDist    []float64

	// thresholds
	maxIterations  int     // e in paper; hard cap on the number of iterations
	deltaThreshold float64 // used for early convergence. we are not using it right now.

	// counts
	clusterCnt int // k in paper
	vectorCnt  int // n in paper

	distFn    kmeans.DistanceFunction // distance metric resolved from kmeans.DistanceType
	initType  kmeans.InitType         // centroid seeding strategy (random or kmeans++)
	rand      *rand.Rand              // seeded with kmeans.DefaultRandSeed, so runs are deterministic
	normalize bool                    // if true, input vectors are normalized before clustering
}
    65  
// vectorMeta holds required information for Elkan's kmeans pruning.
// lower is at-least distance of a vector to each of the k centroids. Thus, there are k values for each data point.
// upper is at-most (maximum possible) distance of a vector to its currently assigned or "closest" centroid.
// Hence, there's only one value for each data point.
// recompute is a flag to indicate if the distance to centroids needs to be recomputed. if false,
// the algorithm will rely on the 'upper' bound as an approximation instead of computing the exact distance.
type vectorMeta struct {
	lower     []float64 // l(x, c) in the paper: one lower bound per centroid
	upper     float64   // u(x) in the paper: upper bound on distance to assigned centroid
	recompute bool      // r(x) in the paper: true means upper is stale and must be refreshed
}
    77  
    78  var _ kmeans.Clusterer = new(ElkanClusterer)
    79  
    80  func NewKMeans(vectors [][]float64, clusterCnt,
    81  	maxIterations int, deltaThreshold float64,
    82  	distanceType kmeans.DistanceType, initType kmeans.InitType,
    83  	normalize bool,
    84  ) (kmeans.Clusterer, error) {
    85  
    86  	err := validateArgs(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType)
    87  	if err != nil {
    88  		return nil, err
    89  	}
    90  
    91  	gonumVectors, err := moarray.ToGonumVectors[float64](vectors...)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  
    96  	assignments := make([]int, len(vectors))
    97  	var metas = make([]vectorMeta, len(vectors))
    98  	for i := range metas {
    99  		metas[i] = vectorMeta{
   100  			lower:     make([]float64, clusterCnt),
   101  			upper:     0,
   102  			recompute: true,
   103  		}
   104  	}
   105  
   106  	centroidDist := make([][]float64, clusterCnt)
   107  	for i := range centroidDist {
   108  		centroidDist[i] = make([]float64, clusterCnt)
   109  	}
   110  	minCentroidDist := make([]float64, clusterCnt)
   111  
   112  	distanceFunction, err := resolveDistanceFn(distanceType)
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  
   117  	return &ElkanClusterer{
   118  		maxIterations:  maxIterations,
   119  		deltaThreshold: deltaThreshold,
   120  
   121  		vectorList:  gonumVectors,
   122  		assignments: assignments,
   123  		vectorMetas: metas,
   124  
   125  		//centroids will be initialized by InitCentroids()
   126  		halfInterCentroidDistMatrix: centroidDist,
   127  		minHalfInterCentroidDist:    minCentroidDist,
   128  
   129  		distFn:     distanceFunction,
   130  		initType:   initType,
   131  		clusterCnt: clusterCnt,
   132  		vectorCnt:  len(vectors),
   133  
   134  		rand:      rand.New(rand.NewSource(kmeans.DefaultRandSeed)),
   135  		normalize: normalize,
   136  	}, nil
   137  }
   138  
   139  // InitCentroids initializes the centroids using initialization algorithms like random or kmeans++.
   140  func (km *ElkanClusterer) InitCentroids() error {
   141  	var initializer Initializer
   142  	switch km.initType {
   143  	case kmeans.Random:
   144  		initializer = NewRandomInitializer()
   145  	case kmeans.KmeansPlusPlus:
   146  		initializer = NewKMeansPlusPlusInitializer(km.distFn)
   147  	default:
   148  		initializer = NewRandomInitializer()
   149  	}
   150  	km.centroids = initializer.InitCentroids(km.vectorList, km.clusterCnt)
   151  	return nil
   152  }
   153  
   154  // Cluster returns the final centroids and the error if any.
   155  func (km *ElkanClusterer) Cluster() ([][]float64, error) {
   156  	if km.normalize {
   157  		moarray.NormalizeGonumVectors(km.vectorList)
   158  	}
   159  
   160  	if km.vectorCnt == km.clusterCnt {
   161  		return moarray.ToMoArrays[float64](km.vectorList)
   162  	}
   163  
   164  	err := km.InitCentroids() // step 0.1
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  
   169  	km.initBounds() // step 0.2
   170  
   171  	res, err := km.elkansCluster()
   172  	if err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	return moarray.ToMoArrays[float64](res)
   177  }
   178  
   179  func (km *ElkanClusterer) elkansCluster() ([]*mat.VecDense, error) {
   180  
   181  	for iter := 0; ; iter++ {
   182  		km.computeCentroidDistances() // step 1
   183  
   184  		changes := km.assignData() // step 2 and 3
   185  
   186  		newCentroids := km.recalculateCentroids() // step 4
   187  
   188  		km.updateBounds(newCentroids) // step 5 and 6
   189  
   190  		km.centroids = newCentroids // step 7
   191  
   192  		logutil.Debugf("kmeans iter=%d, changes=%d", iter, changes)
   193  		if iter != 0 && km.isConverged(iter, changes) {
   194  			break
   195  		}
   196  	}
   197  	return km.centroids, nil
   198  }
   199  
   200  func validateArgs(vectorList [][]float64, clusterCnt,
   201  	maxIterations int, deltaThreshold float64,
   202  	distanceType kmeans.DistanceType, initType kmeans.InitType) error {
   203  	if len(vectorList) == 0 || len(vectorList[0]) == 0 {
   204  		return moerr.NewInternalErrorNoCtx("input vectors is empty")
   205  	}
   206  	if clusterCnt > len(vectorList) {
   207  		return moerr.NewInternalErrorNoCtx("cluster count is larger than vector count %d > %d", clusterCnt, len(vectorList))
   208  	}
   209  	if maxIterations < 0 {
   210  		return moerr.NewInternalErrorNoCtx("max iteration is out of bounds (must be >= 0)")
   211  	}
   212  	if deltaThreshold <= 0.0 || deltaThreshold >= 1.0 {
   213  		return moerr.NewInternalErrorNoCtx("delta threshold is out of bounds (must be > 0.0 and < 1.0)")
   214  	}
   215  	if distanceType > 2 {
   216  		return moerr.NewInternalErrorNoCtx("distance type is not supported")
   217  	}
   218  	if initType > 1 {
   219  		return moerr.NewInternalErrorNoCtx("init type is not supported")
   220  	}
   221  
   222  	// We need to validate that all vectors have the same dimension.
   223  	// This is already done by moarray.ToGonumVectors, so skipping it here.
   224  
   225  	if (clusterCnt * clusterCnt) > math.MaxInt {
   226  		return moerr.NewInternalErrorNoCtx("cluster count is too large for int*int")
   227  	}
   228  
   229  	return nil
   230  }
   231  
   232  // initBounds initializes the lower bounds, upper bound and assignment for each vector.
   233  func (km *ElkanClusterer) initBounds() {
   234  	// step 0.2
   235  	// Set the lower bound l(x, c)=0 for each point x and center c.
   236  	// Assign each x to its closest initial center c(x)=min{ d(x, c) }, using Lemma 1 to avoid
   237  	// redundant distance calculations. Each time d(x, c) is computed, set l(x, c)=d(x, c).
   238  	// Assign upper bounds u(x)=min_c d(x, c).
   239  	for x := range km.vectorList {
   240  		minDist := math.MaxFloat64
   241  		closestCenter := 0
   242  		for c := range km.centroids {
   243  			dist := km.distFn(km.vectorList[x], km.centroids[c])
   244  			km.vectorMetas[x].lower[c] = dist
   245  			if dist < minDist {
   246  				minDist = dist
   247  				closestCenter = c
   248  			}
   249  		}
   250  
   251  		km.vectorMetas[x].upper = minDist
   252  		km.assignments[x] = closestCenter
   253  	}
   254  }
   255  
   256  // computeCentroidDistances computes the centroid distances and the min centroid distances.
   257  // NOTE: here we are save 0.5 of centroid distance to avoid 0.5 multiplication in step 3(iii) and 3.b.
   258  func (km *ElkanClusterer) computeCentroidDistances() {
   259  
   260  	// step 1.a
   261  	// For all centers c and c', compute 0.5 x d(c, c').
   262  	var wg sync.WaitGroup
   263  	for r := 0; r < km.clusterCnt; r++ {
   264  		for c := r + 1; c < km.clusterCnt; c++ {
   265  			wg.Add(1)
   266  			go func(i, j int) {
   267  				defer wg.Done()
   268  				dist := 0.5 * km.distFn(km.centroids[i], km.centroids[j])
   269  				km.halfInterCentroidDistMatrix[i][j] = dist
   270  				km.halfInterCentroidDistMatrix[j][i] = dist
   271  			}(r, c)
   272  		}
   273  	}
   274  	wg.Wait()
   275  
   276  	// step 1.b
   277  	//  For all centers c, compute s(c)=0.5 x min{d(c, c') | c'!= c}.
   278  	for i := 0; i < km.clusterCnt; i++ {
   279  		currMinDist := math.MaxFloat64
   280  		for j := 0; j < km.clusterCnt; j++ {
   281  			if i == j {
   282  				continue
   283  			}
   284  			currMinDist = math.Min(currMinDist, km.halfInterCentroidDistMatrix[i][j])
   285  		}
   286  		km.minHalfInterCentroidDist[i] = currMinDist
   287  	}
   288  }
   289  
// assignData assigns each vector to the nearest centroid.
// This is the place where most of the "distance computation skipping" happens.
// It returns the number of vectors whose assignment changed in this pass.
func (km *ElkanClusterer) assignData() int {

	changes := 0

	for currVector := range km.vectorList {

		// step 2
		// u(x) <= s(c(x))
		// If the upper bound is already within half the distance from c(x) to its
		// nearest other centroid, Lemma 1 guarantees no centroid can be closer:
		// skip this vector entirely.
		if km.vectorMetas[currVector].upper <= km.minHalfInterCentroidDist[km.assignments[currVector]] {
			continue
		}

		for c := range km.centroids { // c is nextPossibleCentroidIdx
			// step 3
			// For all remaining points x and centers c such that
			// (i) c != c(x) and
			// (ii) u(x)>l(x, c) and
			// (iii) u(x)> 0.5 x d(c(x), c)
			// only such a c could possibly beat the current assignment.
			if c != km.assignments[currVector] &&
				km.vectorMetas[currVector].upper > km.vectorMetas[currVector].lower[c] &&
				km.vectorMetas[currVector].upper > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] {

				//step 3.a - Bounds update
				// If r(x) then compute d(x, c(x)) and assign r(x)= false.
				// The exact distance tightens both u(x) and l(x, c(x)); conditions
				// (ii) and (iii) are then re-checked against the tightened u(x).
				var dxcx float64
				if km.vectorMetas[currVector].recompute {
					km.vectorMetas[currVector].recompute = false

					dxcx = km.distFn(km.vectorList[currVector], km.centroids[km.assignments[currVector]])
					km.vectorMetas[currVector].upper = dxcx
					km.vectorMetas[currVector].lower[km.assignments[currVector]] = dxcx

					if km.vectorMetas[currVector].upper <= km.vectorMetas[currVector].lower[c] {
						continue // Pruned by triangle inequality on lower bound.
					}

					if km.vectorMetas[currVector].upper <= km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] {
						continue // Pruned by triangle inequality on cluster distances.
					}

				} else {
					dxcx = km.vectorMetas[currVector].upper //  Otherwise, d(x, c(x))=u(x).
				}

				//step 3.b - Update
				// If d(x, c(x))>l(x, c) or d(x, c(x))> 0.5 d(c(x), c) then
				// Compute d(x, c)
				// If d(x, c)<d(x, c(x)) then assign c(x)=c.
				if dxcx > km.vectorMetas[currVector].lower[c] ||
					dxcx > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] {

					dxc := km.distFn(km.vectorList[currVector], km.centroids[c]) // d(x,c) in the paper
					km.vectorMetas[currVector].lower[c] = dxc                    // exact d(x,c) also refreshes l(x,c)
					if dxc < dxcx {
						km.vectorMetas[currVector].upper = dxc
						km.assignments[currVector] = c
						changes++
					}
				}
			}
		}
	}
	return changes
}
   356  
   357  // recalculateCentroids calculates the new mean centroids based on the new assignments.
   358  func (km *ElkanClusterer) recalculateCentroids() []*mat.VecDense {
   359  	membersCount := make([]int64, km.clusterCnt)
   360  
   361  	newCentroids := make([]*mat.VecDense, km.clusterCnt)
   362  	for c := range newCentroids {
   363  		newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), nil)
   364  	}
   365  
   366  	// sum of all the members of the cluster
   367  	for x, vec := range km.vectorList {
   368  		cx := km.assignments[x]
   369  		membersCount[cx]++
   370  		newCentroids[cx].AddVec(newCentroids[cx], vec)
   371  	}
   372  
   373  	// means of the clusters = sum of all the members of the cluster / number of members in the cluster
   374  	for c := range newCentroids {
   375  		if membersCount[c] == 0 {
   376  			// pick a vector randomly from existing vectors as the new centroid
   377  			//newCentroids[c] = km.vectorList[km.rand.Intn(km.vectorCnt)]
   378  
   379  			//// if the cluster is empty, reinitialize it to a random vector, since you can't find the mean of an empty set
   380  			randVector := make([]float64, km.vectorList[0].Len())
   381  			for l := range randVector {
   382  				randVector[l] = km.rand.Float64()
   383  			}
   384  			newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), randVector)
   385  
   386  			// normalize the random vector
   387  			if km.normalize {
   388  				moarray.NormalizeGonumVector(newCentroids[c])
   389  			}
   390  		} else {
   391  			// find the mean of the cluster members
   392  			// note: we don't need to normalize here, since the vectors are already normalized
   393  			newCentroids[c].ScaleVec(1.0/float64(membersCount[c]), newCentroids[c])
   394  		}
   395  
   396  	}
   397  
   398  	return newCentroids
   399  }
   400  
   401  // updateBounds updates the lower and upper bounds for each vector.
   402  func (km *ElkanClusterer) updateBounds(newCentroid []*mat.VecDense) {
   403  
   404  	// compute the centroid shift distance matrix once.
   405  	// d(c', m(c')) in the paper
   406  	centroidShiftDist := make([]float64, km.clusterCnt)
   407  	var wg sync.WaitGroup
   408  	for c := 0; c < km.clusterCnt; c++ {
   409  		wg.Add(1)
   410  		go func(cIdx int) {
   411  			defer wg.Done()
   412  			centroidShiftDist[cIdx] = km.distFn(km.centroids[cIdx], newCentroid[cIdx])
   413  			logutil.Debugf("centroidShiftDist[%d]=%f", cIdx, centroidShiftDist[cIdx])
   414  		}(c)
   415  	}
   416  	wg.Wait()
   417  
   418  	// step 5
   419  	//For each point x and center c, assign
   420  	// l(x, c)= max{ l(x, c)-d(c, m(c)), 0 }
   421  	for x := range km.vectorList {
   422  		for c := range km.centroids {
   423  			shift := km.vectorMetas[x].lower[c] - centroidShiftDist[c]
   424  			km.vectorMetas[x].lower[c] = math.Max(shift, 0)
   425  		}
   426  
   427  		// step 6
   428  		// For each point x, assign
   429  		// u(x)= u(x) + d(m(c(x)), c(x))
   430  		// r(x)= true
   431  		cx := km.assignments[x]
   432  		km.vectorMetas[x].upper += centroidShiftDist[cx]
   433  		km.vectorMetas[x].recompute = true
   434  	}
   435  }
   436  
   437  // isConverged checks if the algorithm has converged.
   438  func (km *ElkanClusterer) isConverged(iter int, changes int) bool {
   439  	if iter == km.maxIterations || changes == 0 {
   440  		return true
   441  	}
   442  	// NOTE: we are not using deltaThreshold right now.
   443  	//if changes < int(float64(km.vectorCnt)*km.deltaThreshold) {
   444  	//	return true
   445  	//}
   446  	return false
   447  }
   448  
   449  // SSE returns the sum of squared errors.
   450  func (km *ElkanClusterer) SSE() float64 {
   451  	sse := 0.0
   452  	for i := range km.vectorList {
   453  		distErr := km.distFn(km.vectorList[i], km.centroids[km.assignments[i]])
   454  		sse += math.Pow(distErr, 2)
   455  	}
   456  	return sse
   457  }