github.com/schollz/clusters@v0.0.0-20221201012527-c6c68863636f/optics.go (about) 1 package clusters 2 3 import ( 4 "math" 5 "sync" 6 ) 7 8 type steepDownArea struct { 9 start, end int 10 mib float64 11 } 12 13 type clusterJob struct { 14 a, b, n int 15 } 16 17 type opticsClusterer struct { 18 minpts, workers int 19 eps, xi, x float64 20 21 distance DistanceFunc 22 23 // slices holding the cluster mapping and sizes. Access is synchronized to avoid read during computation. 24 mu sync.RWMutex 25 a, b []int 26 27 // variables used for concurrent computation of nearest neighbours and producing final mapping 28 l, s, o, f, g int 29 j chan *rangeJob 30 c chan *clusterJob 31 m *sync.Mutex 32 w *sync.WaitGroup 33 r *[]int 34 p []float64 35 36 // visited points 37 v []bool 38 39 // reachability distances 40 re []*pItem 41 42 // ordered list of points wrt. reachability distance 43 so []int 44 45 // dataset 46 d [][]float64 47 } 48 49 // Implementation of OPTICS algorithm with concurrent nearest neighbour computation. The number of goroutines acting concurrently 50 // is controlled via workers argument. Passing 0 will result in this number being chosen arbitrarily. 51 func OPTICS(minpts int, eps, xi float64, workers int, distance DistanceFunc) (HardClusterer, error) { 52 if minpts < 1 { 53 return nil, errZeroMinpts 54 } 55 56 if workers < 0 { 57 return nil, errZeroWorkers 58 } 59 60 if eps <= 0 { 61 return nil, errZeroEpsilon 62 } 63 64 if xi <= 0 { 65 return nil, errZeroXi 66 } 67 68 var d DistanceFunc 69 { 70 if distance != nil { 71 d = distance 72 } else { 73 d = EuclideanDistance 74 } 75 } 76 77 return &opticsClusterer{ 78 minpts: minpts, 79 workers: workers, 80 eps: eps, 81 xi: xi, 82 x: 1 - xi, 83 distance: d, 84 }, nil 85 } 86 87 func (c *opticsClusterer) IsOnline() bool { 88 return false 89 } 90 91 func (c *opticsClusterer) WithOnline(o Online) HardClusterer { 92 return c 93 } 94 95 func (c *opticsClusterer) Learn(data [][]float64) error { 96 if len(data) == 0 { 97 return errEmptySet 98 } 99 100 c.mu.Lock() 101 102 c.l = len(data) 103 c.s = c.numWorkers() 104 c.o = c.s - 1 105 c.f = c.l / c.s 106 107 c.d = data 108 109 c.v = make([]bool, c.l) 110 c.re = make([]*pItem, c.l) 111 c.so = make([]int, 0, c.l) 112 c.a = make([]int, c.l) 113 c.b = make([]int, 0) 114 115 c.startNearestWorkers() 116 117 c.run() 118 119 c.endNearestWorkers() 120 121 c.v = nil 122 c.p = nil 123 c.r = nil 124 125 c.startClusterWorkers() 126 127 c.extract() 128 129 c.endClusterWorkers() 130 131 c.re = nil 132 c.so = nil 133 134 c.mu.Unlock() 135 136 return nil 137 } 138 139 func (c *opticsClusterer) Sizes() []int { 140 c.mu.RLock() 141 defer c.mu.RUnlock() 142 143 return c.b 144 } 145 146 func (c *opticsClusterer) Guesses() []int { 147 c.mu.RLock() 148 defer c.mu.RUnlock() 149 150 return c.a 151 } 152 153 func (c *opticsClusterer) Predict(p []float64) int { 154 var ( 155 l int 156 d float64 157 m float64 = c.distance(p, c.d[0]) 158 ) 159 160 for i := 1; i < len(c.d); i++ { 161 if d = c.distance(p, c.d[i]); d < m { 162 m = d 163 l = i 164 } 165 } 166 167 return c.a[l] 168 } 169 170 func (c *opticsClusterer) Online(observations chan []float64, done chan struct{}) chan *HCEvent { 171 return nil 172 } 173 174 func (c *opticsClusterer) run() { 175 var ( 176 l int 177 d float64 178 ns, nss []int = make([]int, 0), make([]int, 0) 179 q priorityQueue 180 p *pItem 181 ) 182 183 for i := 0; i < c.l; i++ { 184 if c.v[i] { 185 continue 186 } 187 188 c.nearest(i, &l, &ns) 189 190 c.v[i] = true 191 192 c.so = append(c.so, i) 193 194 if d = c.coreDistance(i, l, ns); d != 0 { 195 q = newPriorityQueue(l) 196 197 c.update(i, d, l, ns, &q) 198 199 for q.NotEmpty() { 200 p = q.Pop().(*pItem) 201 202 c.nearest(p.v, &l, &nss) 203 204 c.v[p.v] = true 205 206 c.so = append(c.so, p.v) 207 208 if d = c.coreDistance(p.v, l, nss); d != 0 { 209 c.update(p.v, d, l, nss, &q) 210 } 211 } 212 } 213 } 214 } 215 216 func (c *opticsClusterer) coreDistance(p int, l int, r []int) float64 { 217 if l < c.minpts { 218 return 0 219 } 220 221 var d, m float64 = 0, c.distance(c.d[p], c.d[r[0]]) 222 223 for i := 1; i < l; i++ { 224 if d = c.distance(c.d[p], c.d[r[i]]); d > m { 225 m = d 226 } 227 } 228 229 return m 230 } 231 232 func (c *opticsClusterer) update(p int, d float64, l int, r []int, q *priorityQueue) { 233 for i := 0; i < l; i++ { 234 if !c.v[r[i]] { 235 m := math.Max(d, c.distance(c.d[p], c.d[r[i]])) 236 237 if c.re[r[i]] == nil { 238 item := &pItem{ 239 v: r[i], 240 p: m, 241 } 242 243 c.re[r[i]] = item 244 245 q.Push(item) 246 } else if m < c.re[r[i]].p { 247 q.Update(c.re[r[i]], c.re[r[i]].v, m) 248 } 249 } 250 } 251 } 252 253 func (c *opticsClusterer) extract() { 254 var ( 255 i, k, e, ue, cs, ce, s, p int = 0, 1, 0, 0, 0, 0, 0, 0 256 mib, d float64 257 areas []*steepDownArea = make([]*steepDownArea, 0) 258 clusters map[int]bool = make(map[int]bool) 259 ) 260 261 // first and last points are not assigned the reachability distance, so exclude them from calculations 262 c.so = c.so[1 : len(c.so)-2] 263 c.g = len(c.so) - 2 264 265 for i < len(c.so)-1 { 266 mib = math.Max(mib, c.re[c.so[i]].p) 267 268 if c.isSteepDown(i, &e) { 269 as := areas[:0] 270 for j := 0; j < len(areas); j++ { 271 if c.re[c.so[areas[j].start]].p*c.x < mib { 272 continue 273 } 274 275 as = append(as, &steepDownArea{ 276 start: areas[j].start, 277 end: areas[j].end, 278 mib: math.Max(areas[j].mib, mib), 279 }) 280 } 281 areas = as 282 283 areas = append(areas, &steepDownArea{ 284 start: i, 285 end: e, 286 }) 287 288 i = e + 1 289 mib = c.re[c.so[i]].p 290 } else if c.isSteepUp(i, &e) { 291 ue = e + 1 292 293 as := areas[:0] 294 for j := 0; j < len(areas); j++ { 295 if c.re[c.so[areas[j].start]].p*c.x < mib { 296 continue 297 } 298 299 as = append(as, &steepDownArea{ 300 start: areas[j].start, 301 end: areas[j].end, 302 mib: math.Max(areas[j].mib, mib), 303 }) 304 } 305 areas = as 306 307 for j := 0; j < len(areas); j++ { 308 if c.re[c.so[ue]].p*c.x < areas[j].mib { 309 continue 310 } 311 312 d = (c.re[c.so[areas[j].start]].p - c.re[c.so[ue]].p) / c.re[c.so[areas[j].start]].p 313 314 if math.Abs(d) <= c.xi { 315 cs = areas[j].start 316 ce = ue 317 } else if d > c.xi { 318 for k := areas[j].end; k > areas[j].end; k-- { 319 if math.Abs((c.re[c.so[k]].p-c.re[c.so[ue]].p)/c.re[c.so[k]].p) <= c.xi { 320 cs = k 321 break 322 } 323 } 324 ce = ue 325 } else { 326 cs = areas[j].start 327 for k := i; k < e; k++ { 328 if math.Abs((c.re[c.so[k]].p-c.re[c.so[i]].p)/c.re[c.so[k]].p) <= c.xi { 329 ce = k 330 break 331 } 332 } 333 } 334 335 p = ce - cs 336 337 if p < c.minpts { 338 continue 339 } 340 341 s = ce + cs 342 343 if !clusters[s] { 344 clusters[s] = true 345 346 c.b = append(c.b, 0) 347 c.b[k] = p 348 349 c.w.Add(1) 350 351 c.c <- &clusterJob{ 352 a: cs, 353 b: ce, 354 n: k, 355 } 356 357 k++ 358 } 359 } 360 361 i = ue 362 mib = c.re[c.so[i]].p 363 } else { 364 i++ 365 } 366 } 367 368 c.w.Wait() 369 } 370 371 func (c *opticsClusterer) isSteepDown(i int, e *int) bool { 372 if c.re[c.so[i]].p*c.x > c.re[c.so[i+1]].p { 373 return false 374 } 375 376 var counter, j int = 0, i 377 378 *e = j 379 380 for { 381 if j > c.g { 382 break 383 } 384 385 if c.re[c.so[j]].p < c.re[c.so[j+1]].p { 386 break 387 } 388 389 if c.re[c.so[j]].p*c.x <= c.re[c.so[j+1]].p { 390 *e = j 391 counter = 0 392 } else { 393 counter++ 394 } 395 396 if counter > c.minpts { 397 break 398 } 399 400 j++ 401 } 402 403 return *e != i 404 } 405 406 func (c *opticsClusterer) isSteepUp(i int, e *int) bool { 407 if c.re[c.so[i]].p > c.re[c.so[i+1]].p*c.x { 408 return false 409 } 410 411 var counter, j int = 0, i 412 413 *e = j 414 415 for { 416 if j > c.g { 417 break 418 } 419 420 if c.re[c.so[j]].p > c.re[c.so[j+1]].p { 421 break 422 } 423 424 if c.re[c.so[j]].p <= c.re[c.so[j+1]].p*c.x { 425 *e = j 426 counter = 0 427 } else { 428 counter++ 429 } 430 431 if counter > c.minpts { 432 break 433 } 434 435 j++ 436 } 437 438 return *e != i 439 } 440 441 func (c *opticsClusterer) startClusterWorkers() { 442 c.c = make(chan *clusterJob, c.l) 443 444 c.w = &sync.WaitGroup{} 445 446 for i := 0; i < c.s; i++ { 447 go c.clusterWorker() 448 } 449 } 450 451 func (c *opticsClusterer) endClusterWorkers() { 452 close(c.c) 453 454 c.c = nil 455 456 c.w = nil 457 } 458 459 func (c *opticsClusterer) clusterWorker() { 460 for j := range c.c { 461 for i := j.a; i < j.b; i++ { 462 c.a[i] = j.n 463 } 464 465 c.w.Done() 466 } 467 } 468 469 /* Divide work among c.s workers, where c.s is determined 470 * by the size of the data. This is based on an assumption that neighbour points of p 471 * are located in relatively small subsection of the input data, so the dataset can be scanned 472 * concurrently without blocking a big number of goroutines trying to write to r */ 473 func (c *opticsClusterer) nearest(p int, l *int, r *[]int) { 474 var b int 475 476 *r = (*r)[:0] 477 478 c.p = c.d[p] 479 c.r = r 480 481 c.w.Add(c.s) 482 483 for i := 0; i < c.l; i += c.f { 484 if c.l-i <= c.f { 485 b = c.l - 1 486 } else { 487 b = i + c.f 488 } 489 490 c.j <- &rangeJob{ 491 a: i, 492 b: b, 493 } 494 } 495 496 c.w.Wait() 497 498 *l = len(*r) 499 } 500 501 func (c *opticsClusterer) startNearestWorkers() { 502 c.j = make(chan *rangeJob, c.l) 503 504 c.m = &sync.Mutex{} 505 c.w = &sync.WaitGroup{} 506 507 for i := 0; i < c.s; i++ { 508 go c.nearestWorker() 509 } 510 } 511 512 func (c *opticsClusterer) endNearestWorkers() { 513 close(c.j) 514 515 c.j = nil 516 517 c.m = nil 518 c.w = nil 519 } 520 521 func (c *opticsClusterer) nearestWorker() { 522 for j := range c.j { 523 for i := j.a; i < j.b; i++ { 524 if c.distance(c.p, c.d[i]) < c.eps { 525 c.m.Lock() 526 *c.r = append(*c.r, i) 527 c.m.Unlock() 528 } 529 } 530 531 c.w.Done() 532 } 533 } 534 535 func (c *opticsClusterer) numWorkers() int { 536 var b int 537 538 if c.l < 1000 { 539 b = 1 540 } else if c.l < 10000 { 541 b = 10 542 } else if c.l < 100000 { 543 b = 100 544 } else { 545 b = 1000 546 } 547 548 if c.workers == 0 { 549 return b 550 } 551 552 if c.workers < b { 553 return c.workers 554 } 555 556 return b 557 }