github.com/schollz/clusters@v0.0.0-20221201012527-c6c68863636f/kmeans_estimator.go (about)

     1  package clusters
     2  
     3  import (
     4  	"math"
     5  	"math/rand"
     6  	"time"
     7  
     8  	"gonum.org/v1/gonum/floats"
     9  )
    10  
    11  type kmeansEstimator struct {
    12  	iterations, number, max int
    13  
    14  	// variables keeping count of changes of points' membership every iteration. User as a stopping condition.
    15  	changes, oldchanges, counter, threshold int
    16  
    17  	distance DistanceFunc
    18  
    19  	a, b []int
    20  
    21  	// slices holding values of centroids of each clusters
    22  	m, n [][]float64
    23  
    24  	// dataset
    25  	d [][]float64
    26  }
    27  
    28  // Implementation of cluster number estimator using gap statistic
    29  // ("Estimating the number of clusters in a data set via the gap statistic", Tibshirani et al.) with k-means++ as
    30  // clustering algorithm
    31  func KMeansEstimator(iterations, clusters int, distance DistanceFunc) (Estimator, error) {
    32  	if iterations < 1 {
    33  		return nil, errZeroIterations
    34  	}
    35  
    36  	if clusters < 2 {
    37  		return nil, errOneCluster
    38  	}
    39  
    40  	var d DistanceFunc
    41  	{
    42  		if distance != nil {
    43  			d = distance
    44  		} else {
    45  			d = EuclideanDistance
    46  		}
    47  	}
    48  
    49  	return &kmeansEstimator{
    50  		iterations: iterations,
    51  		max:        clusters,
    52  		distance:   d,
    53  	}, nil
    54  }
    55  
    56  func (c *kmeansEstimator) Estimate(data [][]float64) (int, error) {
    57  	if len(data) == 0 {
    58  		return 0, errEmptySet
    59  	}
    60  
    61  	var (
    62  		estimated = 0
    63  		size      = len(data)
    64  		bounds    = bounds(data)
    65  		wks       = make([]float64, c.max)
    66  		wkbs      = make([]float64, c.max)
    67  		sk        = make([]float64, c.max)
    68  		one       = make([]float64, c.max)
    69  		bwkbs     = make([]float64, c.max)
    70  	)
    71  
    72  	for i := 0; i < c.max; i++ {
    73  		c.number = i + 1
    74  
    75  		c.learn(data)
    76  
    77  		wks[i] = math.Log(c.wk(c.d, c.m, c.a))
    78  
    79  		for j := 0; j < c.max; j++ {
    80  			c.learn(c.buildRandomizedSet(size, bounds))
    81  
    82  			bwkbs[j] = math.Log(c.wk(c.d, c.m, c.a))
    83  			one[j] = 1
    84  		}
    85  
    86  		wkbs[i] = floats.Sum(bwkbs) / float64(c.max)
    87  
    88  		floats.Scale(wkbs[i], one)
    89  		floats.Sub(bwkbs, one)
    90  		floats.Mul(bwkbs, bwkbs)
    91  
    92  		sk[i] = math.Sqrt(floats.Sum(bwkbs) / float64(c.max))
    93  	}
    94  
    95  	floats.Scale(math.Sqrt(1+(1/float64(c.max))), sk)
    96  
    97  	for i := 0; i < c.max-1; i++ {
    98  		if wkbs[i] >= wkbs[i+1]-sk[i+1] {
    99  			estimated = i + 1
   100  			break
   101  		}
   102  	}
   103  
   104  	return estimated, nil
   105  }
   106  
   107  // private
   108  func (c *kmeansEstimator) learn(data [][]float64) {
   109  	c.d = data
   110  
   111  	c.a = make([]int, len(data))
   112  	c.b = make([]int, c.number)
   113  
   114  	c.counter = 0
   115  	c.threshold = changesThreshold
   116  	c.changes = 0
   117  	c.oldchanges = 0
   118  
   119  	c.initializeMeansWithData()
   120  
   121  	for i := 0; i < c.iterations && c.counter != c.threshold; i++ {
   122  		c.run()
   123  		c.check()
   124  	}
   125  }
   126  
   127  func (c *kmeansEstimator) initializeMeansWithData() {
   128  	c.m = make([][]float64, c.number)
   129  	c.n = make([][]float64, c.number)
   130  
   131  	rand.Seed(time.Now().UTC().Unix())
   132  
   133  	var (
   134  		k          int
   135  		s, t, l, f float64
   136  		d          []float64 = make([]float64, len(c.d))
   137  	)
   138  
   139  	c.m[0] = c.d[rand.Intn(len(c.d)-1)]
   140  
   141  	for i := 1; i < c.number; i++ {
   142  		s = 0
   143  		t = 0
   144  		for j := 0; j < len(c.d); j++ {
   145  
   146  			l = c.distance(c.m[0], c.d[j])
   147  			for g := 1; g < i; g++ {
   148  				if f = c.distance(c.m[g], c.d[j]); f < l {
   149  					l = f
   150  				}
   151  			}
   152  
   153  			d[j] = math.Pow(l, 2)
   154  			s += d[j]
   155  		}
   156  
   157  		t = rand.Float64() * s
   158  		k = 0
   159  		for s = d[0]; s < t; s += d[k] {
   160  			k++
   161  		}
   162  
   163  		c.m[i] = c.d[k]
   164  	}
   165  
   166  	for i := 0; i < c.number; i++ {
   167  		c.n[i] = make([]float64, len(c.m[0]))
   168  	}
   169  }
   170  
   171  func (c *kmeansEstimator) run() {
   172  	var (
   173  		l, k, n int = len(c.m[0]), 0, 0
   174  		m, d    float64
   175  	)
   176  
   177  	for i := 0; i < c.number; i++ {
   178  		c.b[i] = 0
   179  	}
   180  
   181  	for i := 0; i < len(c.d); i++ {
   182  		m = c.distance(c.d[i], c.m[0])
   183  		n = 0
   184  
   185  		for j := 1; j < c.number; j++ {
   186  			if d = c.distance(c.d[i], c.m[j]); d < m {
   187  				m = d
   188  				n = j
   189  			}
   190  		}
   191  
   192  		k = n + 1
   193  
   194  		if c.a[i] != k {
   195  			c.changes++
   196  		}
   197  
   198  		c.a[i] = k
   199  		c.b[n]++
   200  
   201  		floats.Add(c.n[n], c.d[i])
   202  	}
   203  
   204  	for i := 0; i < c.number; i++ {
   205  		floats.Scale(1/float64(c.b[i]), c.n[i])
   206  
   207  		for j := 0; j < l; j++ {
   208  			c.m[i][j] = c.n[i][j]
   209  			c.n[i][j] = 0
   210  		}
   211  	}
   212  }
   213  
   214  func (c *kmeansEstimator) check() {
   215  	if c.changes == c.oldchanges {
   216  		c.counter++
   217  	}
   218  
   219  	c.oldchanges = c.changes
   220  }
   221  
   222  func (c *kmeansEstimator) wk(data [][]float64, centroids [][]float64, mapping []int) float64 {
   223  	var (
   224  		l  = float64(2 * len(data[0]))
   225  		wk = make([]float64, len(centroids))
   226  	)
   227  
   228  	for i := 0; i < len(mapping); i++ {
   229  		wk[mapping[i]-1] += EuclideanDistanceSquared(centroids[mapping[i]-1], data[i]) / l
   230  	}
   231  
   232  	return floats.Sum(wk)
   233  }
   234  
   235  func (c *kmeansEstimator) buildRandomizedSet(size int, bounds []*[2]float64) [][]float64 {
   236  	var (
   237  		l = len(bounds)
   238  		r = make([][]float64, size)
   239  	)
   240  
   241  	for i := 0; i < size; i++ {
   242  		r[i] = make([]float64, l)
   243  
   244  		for j := 0; j < l; j++ {
   245  			r[i][j] = uniform(bounds[j])
   246  		}
   247  	}
   248  
   249  	return r
   250  }