github.com/schollz/clusters@v0.0.0-20221201012527-c6c68863636f/clusters.go (about) 1 // Package clusters provides abstract definitions of clusterers as well as 2 // their implementations. 3 package clusters 4 5 import ( 6 "math" 7 ) 8 9 // DistanceFunc represents a function for measuring distance 10 // between n-dimensional vectors. 11 type DistanceFunc func([]float64, []float64) float64 12 13 // Online represents parameters important for online learning in 14 // clustering algorithms. 15 type Online struct { 16 Alpha float64 17 Dimension int 18 } 19 20 // HCEvent represents the intermediate result of computation of hard clustering algorithm 21 // and are transmitted periodically to the caller during online learning 22 type HCEvent struct { 23 Cluster int 24 Observation []float64 25 } 26 27 // Clusterer defines the operation of learning 28 // common for all algorithms 29 type Clusterer interface { 30 Learn([][]float64) error 31 } 32 33 // HardClusterer defines a set of operations for hard clustering algorithms 34 type HardClusterer interface { 35 36 // Sizes returns sizes of respective clusters 37 Sizes() []int 38 39 // Guesses returns mapping from data point indices to cluster numbers. Clusters' numbering begins at 1. 40 Guesses() []int 41 42 // Predict returns number of cluster to which the observation would be assigned 43 Predict(observation []float64) int 44 45 // IsOnline tells the algorithm supports online learning 46 IsOnline() bool 47 48 // WithOnline configures the algorithms for online learning with given parameters 49 WithOnline(Online) HardClusterer 50 51 // Online begins the process of online training of an algorithm. Observations are sent on the observations channel, 52 // once no more are expected an empty struct needs to be sent on done channel. Caller receives intermediate results of computation via 53 // the returned channel. 54 Online(observations chan []float64, done chan struct{}) chan *HCEvent 55 56 // Implement common operation 57 Clusterer 58 } 59 60 // Estimator defines a computation used to determine an optimal number of clusters in the dataset 61 type Estimator interface { 62 63 // Estimate provides an expected number of clusters in the dataset 64 Estimate([][]float64) (int, error) 65 } 66 67 // Importer defines an operation of importing the dataset from an external file 68 type Importer interface { 69 70 // Import fetches the data from a file, start and end arguments allow user 71 // to specify the span of data columns to be imported (inclusively) 72 Import(file string, start, end int) ([][]float64, error) 73 } 74 75 var ( 76 // EuclideanDistance is one of the common distance measurement 77 EuclideanDistance = func(a, b []float64) float64 { 78 var ( 79 s, t float64 80 ) 81 82 for i, _ := range a { 83 t = a[i] - b[i] 84 s += t * t 85 } 86 87 return math.Sqrt(s) 88 } 89 90 // EuclideanDistanceSquared is one of the common distance measurement 91 EuclideanDistanceSquared = func(a, b []float64) float64 { 92 var ( 93 s, t float64 94 ) 95 96 for i, _ := range a { 97 t = a[i] - b[i] 98 s += t * t 99 } 100 101 return s 102 } 103 104 // Manhattan distance captures the distance between two points by 105 // aggregating the pairwise absolute difference between each variable 106 ManhattanDistance = func(a, b []float64) float64 { 107 var ( 108 s float64 109 ) 110 111 for i, _ := range a { 112 s += math.Abs(a[i]-b[i]) 113 } 114 115 return s 116 } 117 )