github.com/thanos-io/thanos@v0.32.5/pkg/dedup/iter.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package dedup

import (
	"math"

	"github.com/prometheus/prometheus/model/histogram"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"

	"github.com/thanos-io/thanos/pkg/store/storepb"
)

type dedupSeriesSet struct {
	set       storage.SeriesSet
	isCounter bool

	replicas []storage.Series
	// Pushed down series. Currently, they are being handled in a specific way.
	// In the future, we might want to relax this and handle these depending
	// on what function has been passed.
	pushedDown []storage.Series

	lset labels.Labels
	peek storage.Series
	ok   bool

	f               string
	pushdownEnabled bool
}

// isCounter deduces whether a counter metric has been passed. There must be
// a better way to deduce this.
func isCounter(f string) bool {
	return f == "increase" || f == "rate" || f == "irate" || f == "resets"
}

// NewOverlapSplit splits overlapping chunks into separate series entries, so the existing algorithm can work as usual.
// We cannot do this in dedup.SeriesSet as it already iterates over samples.
// TODO(bwplotka): Remove when we move to per chunk deduplication code.
// We expect non-duplicated series with chunks sorted by min time (possibly overlapping).
func NewOverlapSplit(set storepb.SeriesSet) storepb.SeriesSet {
	return &overlapSplitSet{set: set, ok: true}
}

type overlapSplitSet struct {
	ok  bool
	set storepb.SeriesSet

	currLabels labels.Labels
	currI      int
	replicas   [][]storepb.AggrChunk
}

func (o *overlapSplitSet) Next() bool {
	if !o.ok {
		return false
	}

	o.currI++
	if o.currI < len(o.replicas) {
		return true
	}

	o.currI = 0
	o.replicas = o.replicas[:0]
	o.replicas = append(o.replicas, nil)

	o.ok = o.set.Next()
	if !o.ok {
		return false
	}

	var chunks []storepb.AggrChunk
	o.currLabels, chunks = o.set.At()
	if len(chunks) == 0 {
		return true
	}

	o.replicas[0] = append(o.replicas[0], chunks[0])

chunksLoop:
	for i := 1; i < len(chunks); i++ {
		currMinTime := chunks[i].MinTime
		for ri := range o.replicas {
			if len(o.replicas[ri]) == 0 || o.replicas[ri][len(o.replicas[ri])-1].MaxTime < currMinTime {
				o.replicas[ri] = append(o.replicas[ri], chunks[i])
				continue chunksLoop
			}
		}
		o.replicas = append(o.replicas, []storepb.AggrChunk{chunks[i]}) // Not found, add to a new "fake" series.
	}
	return true
}

func (o *overlapSplitSet) At() (labels.Labels, []storepb.AggrChunk) {
	return o.currLabels, o.replicas[o.currI]
}

func (o *overlapSplitSet) Err() error {
	return o.set.Err()
}

// NewSeriesSet returns a seriesSet that deduplicates the same series.
// The series in the series set are expected to be sorted by all labels.
func NewSeriesSet(set storage.SeriesSet, f string, pushdownEnabled bool) storage.SeriesSet {
	// TODO: remove dependency on knowing whether it is a counter.
	s := &dedupSeriesSet{pushdownEnabled: pushdownEnabled, set: set, isCounter: isCounter(f), f: f}
	s.ok = s.set.Next()
	if s.ok {
		s.peek = s.set.At()
	}
	return s
}
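// Illustrative sketch (not part of the original file): deduplicating a sorted series
// set for a "rate" query. "replicatedSet" is a hypothetical storage.SeriesSet whose
// series are sorted by all labels and whose replica labels have already been dropped
// upstream, so duplicated series share identical label sets.
//
//	deduped := NewSeriesSet(replicatedSet, "rate", false)
//	for deduped.Next() {
//		s := deduped.At()
//		_ = s.Labels() // each label set appears exactly once
//	}
//	if err := deduped.Err(); err != nil {
//		// handle error
//	}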
// trimPushdownMarker trims the pushdown marker from the given labels.
// Returns true if there was a pushdown marker.
func trimPushdownMarker(lbls labels.Labels) (labels.Labels, bool) {
	return labels.NewBuilder(lbls).Del(PushdownMarker.Name).Labels(), lbls.Has(PushdownMarker.Name)
}

func (s *dedupSeriesSet) Next() bool {
	if !s.ok {
		return false
	}
	// Reset both because they might have some leftovers.
	if s.pushdownEnabled {
		s.pushedDown = s.pushedDown[:0]
	}
	s.replicas = s.replicas[:0]

	// Set the label set we are currently gathering to the peek element.
	s.lset = s.peek.Labels()

	pushedDown := false
	if s.pushdownEnabled {
		s.lset, pushedDown = trimPushdownMarker(s.lset)
	}
	if pushedDown {
		s.pushedDown = append(s.pushedDown[:0], s.peek)
	} else {
		s.replicas = append(s.replicas[:0], s.peek)
	}
	return s.next()
}

func (s *dedupSeriesSet) next() bool {
	// Peek the next series to see whether it's a replica for the current series.
	s.ok = s.set.Next()
	if !s.ok {
		// There's no next series, the current replicas are the last element.
		return len(s.replicas) > 0 || len(s.pushedDown) > 0
	}
	s.peek = s.set.At()
	nextLset := s.peek.Labels()

	var pushedDown bool
	if s.pushdownEnabled {
		nextLset, pushedDown = trimPushdownMarker(nextLset)
	}

	// If the label set modulo the replica label is equal to the current label set,
	// look for more replicas; otherwise the series is complete.
	if !labels.Equal(s.lset, nextLset) {
		return true
	}

	if pushedDown {
		s.pushedDown = append(s.pushedDown, s.peek)
	} else {
		s.replicas = append(s.replicas, s.peek)
	}

	return s.next()
}

func (s *dedupSeriesSet) At() storage.Series {
	if len(s.replicas) == 1 && len(s.pushedDown) == 0 {
		return seriesWithLabels{Series: s.replicas[0], lset: s.lset}
	}
	if len(s.replicas) == 0 && len(s.pushedDown) == 1 {
		return seriesWithLabels{Series: s.pushedDown[0], lset: s.lset}
	}
	// Clients may store the series, so we must make a copy of the slice before advancing.
	repl := make([]storage.Series, len(s.replicas))
	copy(repl, s.replicas)

	var pushedDown []storage.Series
	if s.pushdownEnabled {
		pushedDown = make([]storage.Series, len(s.pushedDown))
		copy(pushedDown, s.pushedDown)
	}

	return newDedupSeries(s.lset, repl, pushedDown, s.f)
}

func (s *dedupSeriesSet) Err() error {
	return s.set.Err()
}

func (s *dedupSeriesSet) Warnings() storage.Warnings {
	return s.set.Warnings()
}

type seriesWithLabels struct {
	storage.Series
	lset labels.Labels
}

func (s seriesWithLabels) Labels() labels.Labels { return s.lset }

type dedupSeries struct {
	lset       labels.Labels
	replicas   []storage.Series
	pushedDown []storage.Series

	isCounter bool
	f         string
}

func newDedupSeries(lset labels.Labels, replicas []storage.Series, pushedDown []storage.Series, f string) *dedupSeries {
	return &dedupSeries{lset: lset, isCounter: isCounter(f), replicas: replicas, pushedDown: pushedDown, f: f}
}

func (s *dedupSeries) Labels() labels.Labels {
	return s.lset
}
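// Illustrative sketch (not part of the original file; r1 and r2 are hypothetical
// replica series with identical label sets): a dedupSeries only stores its replicas,
// the actual merging happens lazily when Iterator is called.
//
//	s := newDedupSeries(lset, []storage.Series{r1, r2}, nil, "rate")
//	it := s.Iterator(nil)
//	for it.Next() != chunkenc.ValNone {
//		t, v := it.At() // one deduplicated stream (float samples assumed)
//		_, _ = t, v
//	}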
// pushdownIterator creates an iterator that handles
// all pushed down series.
func (s *dedupSeries) pushdownIterator(_ chunkenc.Iterator) chunkenc.Iterator {
	var pushedDownIterator adjustableSeriesIterator
	if s.isCounter {
		pushedDownIterator = &counterErrAdjustSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
	} else {
		pushedDownIterator = noopAdjustableSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
	}

	for _, o := range s.pushedDown[1:] {
		var replicaIterator adjustableSeriesIterator

		if s.isCounter {
			replicaIterator = &counterErrAdjustSeriesIterator{Iterator: o.Iterator(nil)}
		} else {
			replicaIterator = noopAdjustableSeriesIterator{Iterator: o.Iterator(nil)}
		}

		pushedDownIterator = noopAdjustableSeriesIterator{newPushdownSeriesIterator(pushedDownIterator, replicaIterator, s.f)}
	}

	return pushedDownIterator
}

// allSeriesIterator creates an iterator over all series - pushed down
// and regular replicas.
func (s *dedupSeries) allSeriesIterator(_ chunkenc.Iterator) chunkenc.Iterator {
	var replicasIterator, pushedDownIterator adjustableSeriesIterator
	if len(s.replicas) != 0 {
		if s.isCounter {
			replicasIterator = &counterErrAdjustSeriesIterator{Iterator: s.replicas[0].Iterator(nil)}
		} else {
			replicasIterator = noopAdjustableSeriesIterator{Iterator: s.replicas[0].Iterator(nil)}
		}

		for _, o := range s.replicas[1:] {
			var replicaIter adjustableSeriesIterator
			if s.isCounter {
				replicaIter = &counterErrAdjustSeriesIterator{Iterator: o.Iterator(nil)}
			} else {
				replicaIter = noopAdjustableSeriesIterator{Iterator: o.Iterator(nil)}
			}
			replicasIterator = newDedupSeriesIterator(replicasIterator, replicaIter)
		}
	}

	if len(s.pushedDown) != 0 {
		if s.isCounter {
			pushedDownIterator = &counterErrAdjustSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
		} else {
			pushedDownIterator = noopAdjustableSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
		}

		for _, o := range s.pushedDown[1:] {
			var replicaIter adjustableSeriesIterator
			if s.isCounter {
				replicaIter = &counterErrAdjustSeriesIterator{Iterator: o.Iterator(nil)}
			} else {
				replicaIter = noopAdjustableSeriesIterator{Iterator: o.Iterator(nil)}
			}
			pushedDownIterator = newDedupSeriesIterator(pushedDownIterator, replicaIter)
		}
	}

	if replicasIterator == nil {
		return pushedDownIterator
	}
	if pushedDownIterator == nil {
		return replicasIterator
	}
	return newDedupSeriesIterator(pushedDownIterator, replicasIterator)
}

func (s *dedupSeries) Iterator(_ chunkenc.Iterator) chunkenc.Iterator {
	// The "group" function needs a regular iterator over all series. Behavior is
	// identical whether a series was pushed down or not.
	if s.f == "group" {
		return s.allSeriesIterator(nil)
	}
	// If there are no replicas then jump straight to constructing an iterator
	// for pushed down series.
	if len(s.replicas) == 0 {
		return s.pushdownIterator(nil)
	}

	// Finally, if we have both then construct a tree out of them.
	// Pushed down series have their own special iterator.
	// We deduplicate everything in the end.
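	// Illustration (hypothetical case with replicas r1..r3 and pushed down series p1, p2)
	// of the tree built below:
	//
	//	dedup(dedup(dedup(r1, r2), r3), pushdown(p1, p2))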
	var it adjustableSeriesIterator
	if s.isCounter {
		it = &counterErrAdjustSeriesIterator{Iterator: s.replicas[0].Iterator(nil)}
	} else {
		it = noopAdjustableSeriesIterator{Iterator: s.replicas[0].Iterator(nil)}
	}

	for _, o := range s.replicas[1:] {
		var replicaIter adjustableSeriesIterator
		if s.isCounter {
			replicaIter = &counterErrAdjustSeriesIterator{Iterator: o.Iterator(nil)}
		} else {
			replicaIter = noopAdjustableSeriesIterator{Iterator: o.Iterator(nil)}
		}
		it = newDedupSeriesIterator(it, replicaIter)
	}

	if len(s.pushedDown) == 0 {
		return it
	}

	// Join all of the pushed down iterators into one.
	var pushedDownIterator adjustableSeriesIterator
	if s.isCounter {
		pushedDownIterator = &counterErrAdjustSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
	} else {
		pushedDownIterator = noopAdjustableSeriesIterator{Iterator: s.pushedDown[0].Iterator(nil)}
	}

	for _, o := range s.pushedDown[1:] {
		var replicaIterator adjustableSeriesIterator

		if s.isCounter {
			replicaIterator = &counterErrAdjustSeriesIterator{Iterator: o.Iterator(nil)}
		} else {
			replicaIterator = noopAdjustableSeriesIterator{Iterator: o.Iterator(nil)}
		}

		pushedDownIterator = noopAdjustableSeriesIterator{newPushdownSeriesIterator(pushedDownIterator, replicaIterator, s.f)}
	}

	return newDedupSeriesIterator(it, pushedDownIterator)
}

// adjustableSeriesIterator iterates over the data of a time series and allows adjusting the current value
// based on the last iterated value.
type adjustableSeriesIterator interface {
	chunkenc.Iterator

	// adjustAtValue allows the implementation to adjust the value if needed, knowing the last value. This is used by the counter
	// implementation, which can adjust for an obsolete counter value.
	adjustAtValue(lastFloatValue float64)
}

type noopAdjustableSeriesIterator struct {
	chunkenc.Iterator
}

func (it noopAdjustableSeriesIterator) adjustAtValue(float64) {}
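// Illustration of the adjustAtValue contract (hypothetical numbers, not part of the
// original file): when the dedup iterator switches replicas, it passes the last float
// value it emitted. The counter-aware implementation below accumulates an offset so
// the counter never appears to go down, while the no-op implementation above leaves
// gauge values untouched.
//
//	it := &counterErrAdjustSeriesIterator{Iterator: replicaIter} // replicaIter is at 40, next sample is 45
//	it.adjustAtValue(47)                                         // errAdjust = 47 - 40 = 7
//	_, v := it.At()                                              // v = 40 + 7 = 47; the next sample reads 52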
// counterErrAdjustSeriesIterator is an adjustableSeriesIterator used when we deduplicate counters.
// It makes sure we always adjust for the latest seen last counter value for all replicas.
// Let's consider the following example:
//
//	Replica 1 counter scrapes: 20 30 40 NaN - 0 5
//	Replica 2 counter scrapes: 25 35 45 NaN - 2
//
// Now for downsampling purposes we are accounting for the resets (rewriting the sample values),
// so our replicas before going to the dedup iterator look like this:
//
//	Replica 1 counter total: 20 30 40 - - 40 45
//	Replica 2 counter total: 25 35 45 - - 47
//
// Now if at any point we switch our focus from replica 2 to replica 1, we will see a lower value than the
// previous one, which will trigger a false positive counter reset in PromQL.
//
// We mitigate this by allowing adjustAtValue to be invoked, which adjusts the value in case the last seen
// value is larger than the current one. (A counter cannot go down.)
//
// This is to mitigate https://github.com/thanos-io/thanos/issues/2401.
// TODO(bwplotka): Find better deduplication algorithm that does not require knowledge if the given
// series is counter or not: https://github.com/thanos-io/thanos/issues/2547.
type counterErrAdjustSeriesIterator struct {
	chunkenc.Iterator

	errAdjust float64
}

func (it *counterErrAdjustSeriesIterator) adjustAtValue(lastFloatValue float64) {
	_, v := it.At()
	if lastFloatValue > v {
		// This replica has an obsolete value (it did not see the correct "end" of the counter value before the app restart). Adjust.
		it.errAdjust += lastFloatValue - v
	}
}

func (it *counterErrAdjustSeriesIterator) At() (int64, float64) {
	t, v := it.Iterator.At()
	return t, v + it.errAdjust
}

type dedupSeriesIterator struct {
	a, b adjustableSeriesIterator

	aval, bval chunkenc.ValueType

	// TODO(bwplotka): Don't base on LastT, but on detected scrape interval. This will allow us to be more
	// responsive to gaps: https://github.com/thanos-io/thanos/issues/981, let's do it in next PR.
	lastT    int64
	lastIter chunkenc.Iterator

	penA, penB int64
	useA       bool
}

func newDedupSeriesIterator(a, b adjustableSeriesIterator) *dedupSeriesIterator {
	return &dedupSeriesIterator{
		a:        a,
		b:        b,
		lastT:    math.MinInt64,
		lastIter: a,
		aval:     a.Next(),
		bval:     b.Next(),
	}
}

func (it *dedupSeriesIterator) Next() chunkenc.ValueType {
	lastFloatVal, isFloatVal := it.lastFloatVal()
	lastUseA := it.useA
	defer func() {
		if it.useA != lastUseA && isFloatVal {
			// We switched replicas.
			// Ensure values are correct based on the value before At.
			// TODO(rabenhorst): Investigate if we also need to implement adjusting histograms here.
			it.adjustAtValue(lastFloatVal)
		}
	}()

	// Advance both iterators to at least the next highest timestamp plus the potential penalty.
	if it.aval != chunkenc.ValNone {
		it.aval = it.a.Seek(it.lastT + 1 + it.penA)
	}
	if it.bval != chunkenc.ValNone {
		it.bval = it.b.Seek(it.lastT + 1 + it.penB)
	}

	// Handle basic cases where one iterator is exhausted before the other.
	if it.aval == chunkenc.ValNone {
		it.useA = false
		if it.bval != chunkenc.ValNone {
			it.lastT = it.b.AtT()
			it.lastIter = it.b
			it.penB = 0
		}
		return it.bval
	}
	if it.bval == chunkenc.ValNone {
		it.useA = true
		it.lastT = it.a.AtT()
		it.lastIter = it.a
		it.penA = 0
		return it.aval
	}
	// General case where both iterators still have data. We pick the one
	// with the smaller timestamp.
	// The applied penalty may already have skipped samples that would have
	// resulted in an exaggerated sampling frequency.
	ta := it.a.AtT()
	tb := it.b.AtT()

	it.useA = ta <= tb

	// For the series we didn't pick, add a penalty twice as high as the delta of the last two
	// samples to the next seek against it.
	// This ensures that we don't pick a sample too close, which would increase the overall
	// sample frequency. It also guards against clock drift and inaccuracies during
	// timestamp assignment.
	// If we don't know a delta yet, we pick 5000 as a constant, which is based on the knowledge
	// that timestamps are in milliseconds and sampling intervals are typically multiple seconds long.
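	// Worked example (hypothetical 15s scrape interval, timestamps in milliseconds):
	// both replicas start around t=1000 with samples every 15000 ms.
	//
	//	1st Next(): a is at 1000, b at 1002 -> use a, penB = initialPenalty = 5000, lastT = 1000.
	//	2nd Next(): a seeks to 1001 -> 16000, b seeks to 1000+1+5000 -> 16002;
	//	            use a, penB = 2*(16000-1000) = 30000, lastT = 16000.
	//	3rd Next(): a seeks to 16001 -> 31000, b seeks to 16000+1+30000 = 46001 -> 46002;
	//	            b's near-duplicate samples at ~16002 and ~31002 are never emitted.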
	const initialPenalty = 5000

	if it.useA {
		if it.lastT != math.MinInt64 {
			it.penB = 2 * (ta - it.lastT)
		} else {
			it.penB = initialPenalty
		}
		it.penA = 0
		it.lastT = ta
		it.lastIter = it.a

		return it.aval
	}
	if it.lastT != math.MinInt64 {
		it.penA = 2 * (tb - it.lastT)
	} else {
		it.penA = initialPenalty
	}
	it.penB = 0
	it.lastT = tb
	it.lastIter = it.b
	return it.bval
}

func (it *dedupSeriesIterator) lastFloatVal() (float64, bool) {
	if it.useA && it.aval == chunkenc.ValFloat {
		_, v := it.lastIter.At()
		return v, true
	}
	if !it.useA && it.bval == chunkenc.ValFloat {
		_, v := it.lastIter.At()
		return v, true
	}
	return 0, false
}

func (it *dedupSeriesIterator) adjustAtValue(lastFloatValue float64) {
	if it.aval == chunkenc.ValFloat {
		it.a.adjustAtValue(lastFloatValue)
	}
	if it.bval == chunkenc.ValFloat {
		it.b.adjustAtValue(lastFloatValue)
	}
}

func (it *dedupSeriesIterator) Seek(t int64) chunkenc.ValueType {
	// Don't use the underlying Seek, but iterate using Next so as not to miss gaps.
	for {
		ts := it.AtT()
		if ts >= t {
			if it.useA {
				return it.a.Seek(ts)
			}
			return it.b.Seek(ts)
		}
		if it.Next() == chunkenc.ValNone {
			return chunkenc.ValNone
		}
	}
}

func (it *dedupSeriesIterator) At() (int64, float64) {
	return it.lastIter.At()
}

func (it *dedupSeriesIterator) AtHistogram() (int64, *histogram.Histogram) {
	return it.lastIter.AtHistogram()
}

func (it *dedupSeriesIterator) AtFloatHistogram() (int64, *histogram.FloatHistogram) {
	return it.lastIter.AtFloatHistogram()
}

func (it *dedupSeriesIterator) AtT() int64 {
	var t int64
	if it.useA {
		t = it.a.AtT()
	} else {
		t = it.b.AtT()
	}
	return t
}

func (it *dedupSeriesIterator) Err() error {
	if it.a.Err() != nil {
		return it.a.Err()
	}
	return it.b.Err()
}

// boundedSeriesIterator wraps a series iterator and ensures that it only emits
// samples within a fixed time range.
type boundedSeriesIterator struct {
	it         chunkenc.Iterator
	mint, maxt int64
}

// NewBoundedSeriesIterator creates a boundedSeriesIterator for the given time range.
func NewBoundedSeriesIterator(it chunkenc.Iterator, mint, maxt int64) *boundedSeriesIterator {
	return &boundedSeriesIterator{it: it, mint: mint, maxt: maxt}
}

func (it *boundedSeriesIterator) Seek(t int64) chunkenc.ValueType {
	if t > it.maxt {
		return chunkenc.ValNone
	}
	if t < it.mint {
		t = it.mint
	}
	return it.it.Seek(t)
}

func (it *boundedSeriesIterator) At() (t int64, v float64) {
	return it.it.At()
}

func (it *boundedSeriesIterator) AtHistogram() (int64, *histogram.Histogram) {
	return it.it.AtHistogram()
}

func (it *boundedSeriesIterator) AtFloatHistogram() (int64, *histogram.FloatHistogram) {
	return it.it.AtFloatHistogram()
}

func (it *boundedSeriesIterator) AtT() int64 {
	return it.it.AtT()
}

func (it *boundedSeriesIterator) Next() chunkenc.ValueType {
	valueType := it.it.Next()
	if valueType == chunkenc.ValNone {
		return chunkenc.ValNone
	}
	t := it.it.AtT()

	// Advance the iterator if we are before the valid interval.
	if t < it.mint {
		if it.Seek(it.mint) == chunkenc.ValNone {
			return chunkenc.ValNone
		}
		t = it.it.AtT()
	}
	// Once we have passed the valid interval, there is no going back.
	if t <= it.maxt {
		return valueType
	}

	return chunkenc.ValNone
}

func (it *boundedSeriesIterator) Err() error {
	return it.it.Err()
}
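// Illustrative sketch (not part of the original file): bounding an arbitrary iterator
// to a query range. "rawIter" and the millisecond bounds are hypothetical.
//
//	bounded := NewBoundedSeriesIterator(rawIter, 1000, 60000)
//	for typ := bounded.Next(); typ != chunkenc.ValNone; typ = bounded.Next() {
//		t, v := bounded.At() // only samples with 1000 <= t <= 60000 (float samples assumed)
//		_, _ = t, v
//	}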