github.com/schollz/clusters@v0.0.0-20221201012527-c6c68863636f/optics.go (about)

     1  package clusters
     2  
     3  import (
     4  	"math"
     5  	"sync"
     6  )
     7  
     8  type steepDownArea struct {
     9  	start, end int
    10  	mib        float64
    11  }
    12  
    13  type clusterJob struct {
    14  	a, b, n int
    15  }
    16  
    17  type opticsClusterer struct {
    18  	minpts, workers int
    19  	eps, xi, x      float64
    20  
    21  	distance DistanceFunc
    22  
    23  	// slices holding the cluster mapping and sizes. Access is synchronized to avoid read during computation.
    24  	mu   sync.RWMutex
    25  	a, b []int
    26  
    27  	// variables used for concurrent computation of nearest neighbours and producing final mapping
    28  	l, s, o, f, g int
    29  	j             chan *rangeJob
    30  	c             chan *clusterJob
    31  	m             *sync.Mutex
    32  	w             *sync.WaitGroup
    33  	r             *[]int
    34  	p             []float64
    35  
    36  	// visited points
    37  	v []bool
    38  
    39  	// reachability distances
    40  	re []*pItem
    41  
    42  	// ordered list of points wrt. reachability distance
    43  	so []int
    44  
    45  	// dataset
    46  	d [][]float64
    47  }
    48  
    49  // Implementation of OPTICS algorithm with concurrent nearest neighbour computation. The number of goroutines acting concurrently
    50  // is controlled via workers argument. Passing 0 will result in this number being chosen arbitrarily.
    51  func OPTICS(minpts int, eps, xi float64, workers int, distance DistanceFunc) (HardClusterer, error) {
    52  	if minpts < 1 {
    53  		return nil, errZeroMinpts
    54  	}
    55  
    56  	if workers < 0 {
    57  		return nil, errZeroWorkers
    58  	}
    59  
    60  	if eps <= 0 {
    61  		return nil, errZeroEpsilon
    62  	}
    63  
    64  	if xi <= 0 {
    65  		return nil, errZeroXi
    66  	}
    67  
    68  	var d DistanceFunc
    69  	{
    70  		if distance != nil {
    71  			d = distance
    72  		} else {
    73  			d = EuclideanDistance
    74  		}
    75  	}
    76  
    77  	return &opticsClusterer{
    78  		minpts:   minpts,
    79  		workers:  workers,
    80  		eps:      eps,
    81  		xi:       xi,
    82  		x:        1 - xi,
    83  		distance: d,
    84  	}, nil
    85  }
    86  
    87  func (c *opticsClusterer) IsOnline() bool {
    88  	return false
    89  }
    90  
    91  func (c *opticsClusterer) WithOnline(o Online) HardClusterer {
    92  	return c
    93  }
    94  
    95  func (c *opticsClusterer) Learn(data [][]float64) error {
    96  	if len(data) == 0 {
    97  		return errEmptySet
    98  	}
    99  
   100  	c.mu.Lock()
   101  
   102  	c.l = len(data)
   103  	c.s = c.numWorkers()
   104  	c.o = c.s - 1
   105  	c.f = c.l / c.s
   106  
   107  	c.d = data
   108  
   109  	c.v = make([]bool, c.l)
   110  	c.re = make([]*pItem, c.l)
   111  	c.so = make([]int, 0, c.l)
   112  	c.a = make([]int, c.l)
   113  	c.b = make([]int, 0)
   114  
   115  	c.startNearestWorkers()
   116  
   117  	c.run()
   118  
   119  	c.endNearestWorkers()
   120  
   121  	c.v = nil
   122  	c.p = nil
   123  	c.r = nil
   124  
   125  	c.startClusterWorkers()
   126  
   127  	c.extract()
   128  
   129  	c.endClusterWorkers()
   130  
   131  	c.re = nil
   132  	c.so = nil
   133  
   134  	c.mu.Unlock()
   135  
   136  	return nil
   137  }
   138  
   139  func (c *opticsClusterer) Sizes() []int {
   140  	c.mu.RLock()
   141  	defer c.mu.RUnlock()
   142  
   143  	return c.b
   144  }
   145  
   146  func (c *opticsClusterer) Guesses() []int {
   147  	c.mu.RLock()
   148  	defer c.mu.RUnlock()
   149  
   150  	return c.a
   151  }
   152  
   153  func (c *opticsClusterer) Predict(p []float64) int {
   154  	var (
   155  		l int
   156  		d float64
   157  		m float64 = c.distance(p, c.d[0])
   158  	)
   159  
   160  	for i := 1; i < len(c.d); i++ {
   161  		if d = c.distance(p, c.d[i]); d < m {
   162  			m = d
   163  			l = i
   164  		}
   165  	}
   166  
   167  	return c.a[l]
   168  }
   169  
   170  func (c *opticsClusterer) Online(observations chan []float64, done chan struct{}) chan *HCEvent {
   171  	return nil
   172  }
   173  
   174  func (c *opticsClusterer) run() {
   175  	var (
   176  		l       int
   177  		d       float64
   178  		ns, nss []int = make([]int, 0), make([]int, 0)
   179  		q       priorityQueue
   180  		p       *pItem
   181  	)
   182  
   183  	for i := 0; i < c.l; i++ {
   184  		if c.v[i] {
   185  			continue
   186  		}
   187  
   188  		c.nearest(i, &l, &ns)
   189  
   190  		c.v[i] = true
   191  
   192  		c.so = append(c.so, i)
   193  
   194  		if d = c.coreDistance(i, l, ns); d != 0 {
   195  			q = newPriorityQueue(l)
   196  
   197  			c.update(i, d, l, ns, &q)
   198  
   199  			for q.NotEmpty() {
   200  				p = q.Pop().(*pItem)
   201  
   202  				c.nearest(p.v, &l, &nss)
   203  
   204  				c.v[p.v] = true
   205  
   206  				c.so = append(c.so, p.v)
   207  
   208  				if d = c.coreDistance(p.v, l, nss); d != 0 {
   209  					c.update(p.v, d, l, nss, &q)
   210  				}
   211  			}
   212  		}
   213  	}
   214  }
   215  
   216  func (c *opticsClusterer) coreDistance(p int, l int, r []int) float64 {
   217  	if l < c.minpts {
   218  		return 0
   219  	}
   220  
   221  	var d, m float64 = 0, c.distance(c.d[p], c.d[r[0]])
   222  
   223  	for i := 1; i < l; i++ {
   224  		if d = c.distance(c.d[p], c.d[r[i]]); d > m {
   225  			m = d
   226  		}
   227  	}
   228  
   229  	return m
   230  }
   231  
   232  func (c *opticsClusterer) update(p int, d float64, l int, r []int, q *priorityQueue) {
   233  	for i := 0; i < l; i++ {
   234  		if !c.v[r[i]] {
   235  			m := math.Max(d, c.distance(c.d[p], c.d[r[i]]))
   236  
   237  			if c.re[r[i]] == nil {
   238  				item := &pItem{
   239  					v: r[i],
   240  					p: m,
   241  				}
   242  
   243  				c.re[r[i]] = item
   244  
   245  				q.Push(item)
   246  			} else if m < c.re[r[i]].p {
   247  				q.Update(c.re[r[i]], c.re[r[i]].v, m)
   248  			}
   249  		}
   250  	}
   251  }
   252  
   253  func (c *opticsClusterer) extract() {
   254  	var (
   255  		i, k, e, ue, cs, ce, s, p int = 0, 1, 0, 0, 0, 0, 0, 0
   256  		mib, d                    float64
   257  		areas                     []*steepDownArea = make([]*steepDownArea, 0)
   258  		clusters                  map[int]bool     = make(map[int]bool)
   259  	)
   260  
   261  	// first and last points are not assigned the reachability distance, so exclude them from calculations
   262  	c.so = c.so[1 : len(c.so)-2]
   263  	c.g = len(c.so) - 2
   264  
   265  	for i < len(c.so)-1 {
   266  		mib = math.Max(mib, c.re[c.so[i]].p)
   267  
   268  		if c.isSteepDown(i, &e) {
   269  			as := areas[:0]
   270  			for j := 0; j < len(areas); j++ {
   271  				if c.re[c.so[areas[j].start]].p*c.x < mib {
   272  					continue
   273  				}
   274  
   275  				as = append(as, &steepDownArea{
   276  					start: areas[j].start,
   277  					end:   areas[j].end,
   278  					mib:   math.Max(areas[j].mib, mib),
   279  				})
   280  			}
   281  			areas = as
   282  
   283  			areas = append(areas, &steepDownArea{
   284  				start: i,
   285  				end:   e,
   286  			})
   287  
   288  			i = e + 1
   289  			mib = c.re[c.so[i]].p
   290  		} else if c.isSteepUp(i, &e) {
   291  			ue = e + 1
   292  
   293  			as := areas[:0]
   294  			for j := 0; j < len(areas); j++ {
   295  				if c.re[c.so[areas[j].start]].p*c.x < mib {
   296  					continue
   297  				}
   298  
   299  				as = append(as, &steepDownArea{
   300  					start: areas[j].start,
   301  					end:   areas[j].end,
   302  					mib:   math.Max(areas[j].mib, mib),
   303  				})
   304  			}
   305  			areas = as
   306  
   307  			for j := 0; j < len(areas); j++ {
   308  				if c.re[c.so[ue]].p*c.x < areas[j].mib {
   309  					continue
   310  				}
   311  
   312  				d = (c.re[c.so[areas[j].start]].p - c.re[c.so[ue]].p) / c.re[c.so[areas[j].start]].p
   313  
   314  				if math.Abs(d) <= c.xi {
   315  					cs = areas[j].start
   316  					ce = ue
   317  				} else if d > c.xi {
   318  					for k := areas[j].end; k > areas[j].end; k-- {
   319  						if math.Abs((c.re[c.so[k]].p-c.re[c.so[ue]].p)/c.re[c.so[k]].p) <= c.xi {
   320  							cs = k
   321  							break
   322  						}
   323  					}
   324  					ce = ue
   325  				} else {
   326  					cs = areas[j].start
   327  					for k := i; k < e; k++ {
   328  						if math.Abs((c.re[c.so[k]].p-c.re[c.so[i]].p)/c.re[c.so[k]].p) <= c.xi {
   329  							ce = k
   330  							break
   331  						}
   332  					}
   333  				}
   334  
   335  				p = ce - cs
   336  
   337  				if p < c.minpts {
   338  					continue
   339  				}
   340  
   341  				s = ce + cs
   342  
   343  				if !clusters[s] {
   344  					clusters[s] = true
   345  
   346  					c.b = append(c.b, 0)
   347  					c.b[k] = p
   348  
   349  					c.w.Add(1)
   350  
   351  					c.c <- &clusterJob{
   352  						a: cs,
   353  						b: ce,
   354  						n: k,
   355  					}
   356  
   357  					k++
   358  				}
   359  			}
   360  
   361  			i = ue
   362  			mib = c.re[c.so[i]].p
   363  		} else {
   364  			i++
   365  		}
   366  	}
   367  
   368  	c.w.Wait()
   369  }
   370  
   371  func (c *opticsClusterer) isSteepDown(i int, e *int) bool {
   372  	if c.re[c.so[i]].p*c.x > c.re[c.so[i+1]].p {
   373  		return false
   374  	}
   375  
   376  	var counter, j int = 0, i
   377  
   378  	*e = j
   379  
   380  	for {
   381  		if j > c.g {
   382  			break
   383  		}
   384  
   385  		if c.re[c.so[j]].p < c.re[c.so[j+1]].p {
   386  			break
   387  		}
   388  
   389  		if c.re[c.so[j]].p*c.x <= c.re[c.so[j+1]].p {
   390  			*e = j
   391  			counter = 0
   392  		} else {
   393  			counter++
   394  		}
   395  
   396  		if counter > c.minpts {
   397  			break
   398  		}
   399  
   400  		j++
   401  	}
   402  
   403  	return *e != i
   404  }
   405  
   406  func (c *opticsClusterer) isSteepUp(i int, e *int) bool {
   407  	if c.re[c.so[i]].p > c.re[c.so[i+1]].p*c.x {
   408  		return false
   409  	}
   410  
   411  	var counter, j int = 0, i
   412  
   413  	*e = j
   414  
   415  	for {
   416  		if j > c.g {
   417  			break
   418  		}
   419  
   420  		if c.re[c.so[j]].p > c.re[c.so[j+1]].p {
   421  			break
   422  		}
   423  
   424  		if c.re[c.so[j]].p <= c.re[c.so[j+1]].p*c.x {
   425  			*e = j
   426  			counter = 0
   427  		} else {
   428  			counter++
   429  		}
   430  
   431  		if counter > c.minpts {
   432  			break
   433  		}
   434  
   435  		j++
   436  	}
   437  
   438  	return *e != i
   439  }
   440  
   441  func (c *opticsClusterer) startClusterWorkers() {
   442  	c.c = make(chan *clusterJob, c.l)
   443  
   444  	c.w = &sync.WaitGroup{}
   445  
   446  	for i := 0; i < c.s; i++ {
   447  		go c.clusterWorker()
   448  	}
   449  }
   450  
   451  func (c *opticsClusterer) endClusterWorkers() {
   452  	close(c.c)
   453  
   454  	c.c = nil
   455  
   456  	c.w = nil
   457  }
   458  
   459  func (c *opticsClusterer) clusterWorker() {
   460  	for j := range c.c {
   461  		for i := j.a; i < j.b; i++ {
   462  			c.a[i] = j.n
   463  		}
   464  
   465  		c.w.Done()
   466  	}
   467  }
   468  
   469  /* Divide work among c.s workers, where c.s is determined
   470   * by the size of the data. This is based on an assumption that neighbour points of p
   471   * are located in relatively small subsection of the input data, so the dataset can be scanned
   472   * concurrently without blocking a big number of goroutines trying to write to r */
   473  func (c *opticsClusterer) nearest(p int, l *int, r *[]int) {
   474  	var b int
   475  
   476  	*r = (*r)[:0]
   477  
   478  	c.p = c.d[p]
   479  	c.r = r
   480  
   481  	c.w.Add(c.s)
   482  
   483  	for i := 0; i < c.l; i += c.f {
   484  		if c.l-i <= c.f {
   485  			b = c.l - 1
   486  		} else {
   487  			b = i + c.f
   488  		}
   489  
   490  		c.j <- &rangeJob{
   491  			a: i,
   492  			b: b,
   493  		}
   494  	}
   495  
   496  	c.w.Wait()
   497  
   498  	*l = len(*r)
   499  }
   500  
   501  func (c *opticsClusterer) startNearestWorkers() {
   502  	c.j = make(chan *rangeJob, c.l)
   503  
   504  	c.m = &sync.Mutex{}
   505  	c.w = &sync.WaitGroup{}
   506  
   507  	for i := 0; i < c.s; i++ {
   508  		go c.nearestWorker()
   509  	}
   510  }
   511  
   512  func (c *opticsClusterer) endNearestWorkers() {
   513  	close(c.j)
   514  
   515  	c.j = nil
   516  
   517  	c.m = nil
   518  	c.w = nil
   519  }
   520  
   521  func (c *opticsClusterer) nearestWorker() {
   522  	for j := range c.j {
   523  		for i := j.a; i < j.b; i++ {
   524  			if c.distance(c.p, c.d[i]) < c.eps {
   525  				c.m.Lock()
   526  				*c.r = append(*c.r, i)
   527  				c.m.Unlock()
   528  			}
   529  		}
   530  
   531  		c.w.Done()
   532  	}
   533  }
   534  
   535  func (c *opticsClusterer) numWorkers() int {
   536  	var b int
   537  
   538  	if c.l < 1000 {
   539  		b = 1
   540  	} else if c.l < 10000 {
   541  		b = 10
   542  	} else if c.l < 100000 {
   543  		b = 100
   544  	} else {
   545  		b = 1000
   546  	}
   547  
   548  	if c.workers == 0 {
   549  		return b
   550  	}
   551  
   552  	if c.workers < b {
   553  		return c.workers
   554  	}
   555  
   556  	return b
   557  }