github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/series/buffer.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package series

import (
	"errors"
	"fmt"
	"sort"
	"sync/atomic"
	"time"

	"github.com/m3db/m3/src/dbnode/encoding"
	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/dbnode/x/xio"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/context"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	"github.com/m3db/m3/src/x/pool"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/cespare/xxhash/v2"
	"go.uber.org/zap"
)

const (
	errBucketMapCacheNotInSync    = "bucket map keys do not match sorted keys cache"
	errBucketMapCacheNotInSyncFmt = errBucketMapCacheNotInSync + ", blockStart: %d"
	errTimestampFormat            = time.RFC822Z
)

var (
	timeZero           time.Time
	errIncompleteMerge = errors.New("bucket merge did not result in only one encoder")
	errTooManyEncoders = xerrors.NewInvalidParamsError(errors.New("too many encoders per block"))
)

const (
	bucketsCacheSize = 2
	// optimizedTimesArraySize is the size of the internal array for the
	// optimizedTimes struct. Since the size of this array determines the
	// effectiveness of minimizing heap allocations, usage of this struct and/or
	// changing this const should only be done after considering its current
	// use cases:
	// 1) The number of buckets that will be removed within a tick due to that
	//    block being recently flushed
	// 2) The number of buckets that contain ColdWrites within a cold flush
	//    cycle
	// TODO(juchan): revisit this after ColdWrites usage to see if this number
	// is sane.
	optimizedTimesArraySize = 8
	writableBucketVersion   = 0
)

type databaseBuffer interface {
	MoveTo(
		buffer databaseBuffer,
		nsCtx namespace.Context,
	) error

	Write(
		ctx context.Context,
		id ident.ID,
		timestamp xtime.UnixNano,
		value float64,
		unit xtime.Unit,
		annotation []byte,
		wOpts WriteOptions,
	) (bool, WriteType, error)

	Snapshot(
		ctx context.Context,
		blockStart xtime.UnixNano,
		metadata persist.Metadata,
		persistFn persist.DataFn,
		nsCtx namespace.Context,
	) (SnapshotResult, error)

	WarmFlush(
		ctx context.Context,
		blockStart xtime.UnixNano,
		metadata persist.Metadata,
		persistFn persist.DataFn,
		nsCtx namespace.Context,
	) (FlushOutcome, error)

	ReadEncoded(
		ctx context.Context,
		start, end xtime.UnixNano,
		nsCtx namespace.Context,
	) ([][]xio.BlockReader, error)

	FetchBlocksForColdFlush(
		ctx context.Context,
		start xtime.UnixNano,
		version int,
		nsCtx namespace.Context,
	) (block.FetchBlockResult, error)

	FetchBlocks(
		ctx context.Context,
		starts []xtime.UnixNano,
		nsCtx namespace.Context,
	) []block.FetchBlockResult

	FetchBlocksMetadata(
		ctx context.Context,
		start, end xtime.UnixNano,
		opts FetchBlocksMetadataOptions,
	) (block.FetchBlockMetadataResults, error)

	IsEmpty() bool

	MarkNonEmptyBlocks(nonEmptyBlockStarts map[xtime.UnixNano]struct{})

	ColdFlushBlockStarts(blockStates map[xtime.UnixNano]BlockState) OptimizedTimes

	Stats() bufferStats

	Tick(versions ShardBlockStateSnapshot, nsCtx namespace.Context) bufferTickResult

	Load(bl block.DatabaseBlock, writeType WriteType)

	Reset(opts databaseBufferResetOptions)
}

type databaseBufferResetOptions struct {
	BlockRetriever QueryableBlockRetriever
	Options        Options
}

type bufferStats struct {
	wiredBlocks int
}

type bufferTickResult struct {
	mergedOutOfOrderBlocks int
	evictedBucketTimes     OptimizedTimes
}

// OptimizedTimes is a struct that holds an unknown number of times. This is
// used to avoid heap allocations as much as possible by trying to not allocate
// a slice of times. To do this, `optimizedTimesArraySize` needs to be
// strategically sized such that for the vast majority of the time, the internal
// array can hold all the times required so that `slice` is nil.
//
// OptimizedTimes should only be interacted with via its helper functions - its
// fields should never be accessed or modified directly, which could cause an
// invalid state.
type OptimizedTimes struct {
	arrIdx int
	arr    [optimizedTimesArraySize]xtime.UnixNano
	slice  []xtime.UnixNano
}

// Add adds a time to this OptimizedTimes.
func (t *OptimizedTimes) Add(newTime xtime.UnixNano) {
	if t.arrIdx < cap(t.arr) {
		t.arr[t.arrIdx] = newTime
		t.arrIdx++
	} else {
		t.slice = append(t.slice, newTime)
	}
}

// Len returns the number of times in this OptimizedTimes.
func (t *OptimizedTimes) Len() int {
	return t.arrIdx + len(t.slice)
}

// Contains returns whether the target time is in this OptimizedTimes.
func (t *OptimizedTimes) Contains(target xtime.UnixNano) bool {
	for i := 0; i < t.arrIdx; i++ {
		if t.arr[i].Equal(target) {
			return true
		}
	}
	for _, tt := range t.slice {
		if tt.Equal(target) {
			return true
		}
	}
	return false
}

// ForEach runs the given function for each time in this OptimizedTimes.
func (t *OptimizedTimes) ForEach(fn func(t xtime.UnixNano)) {
	for i, tNano := range t.arr {
		if i >= t.arrIdx {
			break
		}
		fn(tNano)
	}
	for _, tNano := range t.slice {
		fn(tNano)
	}
}

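// For illustration, a typical use of OptimizedTimes accumulates bucket
// times and then iterates over them, staying allocation-free as long as no
// more than optimizedTimesArraySize times are added (given some blockStart
// of type xtime.UnixNano):
//
//	var times OptimizedTimes
//	times.Add(blockStart)
//	if times.Contains(blockStart) {
//		times.ForEach(func(t xtime.UnixNano) {
//			// process each accumulated time t
//		})
//	}
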
type dbBuffer struct {
	opts  Options
	nowFn clock.NowFn

	// bucketsMap is a map from a block start to its corresponding bucket
	// versions.
	bucketsMap map[xtime.UnixNano]*BufferBucketVersions
	// Cache of buckets to avoid map lookup of above.
	bucketVersionsCache [bucketsCacheSize]*BufferBucketVersions
	// This is an in order slice of the block starts in the bucketsMap.
	// We maintain this to avoid sorting the map keys adhoc when we want to
	// perform operations in chronological order.
	inOrderBlockStarts []xtime.UnixNano
	bucketVersionsPool *BufferBucketVersionsPool
	bucketPool         *BufferBucketPool
	blockRetriever     QueryableBlockRetriever
}

// NB(prateek): databaseBuffer.Reset(...) must be called upon the returned
// object prior to use.
func newDatabaseBuffer() databaseBuffer {
	b := &dbBuffer{
		bucketsMap:         make(map[xtime.UnixNano]*BufferBucketVersions),
		inOrderBlockStarts: make([]xtime.UnixNano, 0, bucketsCacheSize),
	}
	return b
}

func (b *dbBuffer) Reset(opts databaseBufferResetOptions) {
	b.opts = opts.Options
	b.nowFn = opts.Options.ClockOptions().NowFn()
	b.bucketPool = opts.Options.BufferBucketPool()
	b.bucketVersionsPool = opts.Options.BufferBucketVersionsPool()
	b.blockRetriever = opts.BlockRetriever
}

func (b *dbBuffer) MoveTo(
	buffer databaseBuffer,
	nsCtx namespace.Context,
) error {
	blockSize := b.opts.RetentionOptions().BlockSize()
	for _, buckets := range b.bucketsMap {
		for _, bucket := range buckets.buckets {
			// Load any existing blocks.
			for _, block := range bucket.loadedBlocks {
				// Load block.
				buffer.Load(block, bucket.writeType)
			}

			// Load encoders.
			for _, elem := range bucket.encoders {
				if elem.encoder.Len() == 0 {
					// No data.
					continue
				}
				// Take ownership of the encoder.
				segment := elem.encoder.Discard()
				// Create block and load into new buffer.
				block := b.opts.DatabaseBlockOptions().DatabaseBlockPool().Get()
				block.Reset(bucket.start, blockSize, segment, nsCtx)
				// Load block.
				buffer.Load(block, bucket.writeType)
			}
		}
	}

	return nil
}

func (b *dbBuffer) Write(
	ctx context.Context,
	id ident.ID,
	timestamp xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
	wOpts WriteOptions,
) (bool, WriteType, error) {
	var (
		ropts        = b.opts.RetentionOptions()
		bufferPast   = ropts.BufferPast()
		bufferFuture = ropts.BufferFuture()
		now          = xtime.ToUnixNano(b.nowFn())
		pastLimit    = now.Add(-1 * bufferPast).Truncate(time.Second)
		futureLimit  = now.Add(bufferFuture).Truncate(time.Second)
		blockSize    = ropts.BlockSize()
		blockStart   = timestamp.Truncate(blockSize)
		writeType    WriteType
	)

	switch {
	case wOpts.BootstrapWrite:
		exists, err := b.blockRetriever.IsBlockRetrievable(blockStart)
		if err != nil {
			return false, writeType, err
		}
		// Bootstrap writes are allowed to be outside of time boundaries
		// and determined as cold or warm writes depending on whether
		// the block is retrievable or not.
		if !exists {
			writeType = WarmWrite
		} else {
			writeType = ColdWrite
		}

	case timestamp.Before(pastLimit):
		writeType = ColdWrite
		if !b.opts.ColdWritesEnabled() {
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in past: "+
					"id=%s, off_by=%s, timestamp=%s, past_limit=%s, "+
					"timestamp_unix_nanos=%d, past_limit_unix_nanos=%d",
					id.Bytes(), pastLimit.Sub(timestamp).String(),
					timestamp.Format(errTimestampFormat),
					pastLimit.Format(errTimestampFormat),
					timestamp, pastLimit))
		}

	case !futureLimit.After(timestamp):
		writeType = ColdWrite
		if !b.opts.ColdWritesEnabled() {
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in future: "+
					"id=%s, off_by=%s, timestamp=%s, future_limit=%s, "+
					"timestamp_unix_nanos=%d, future_limit_unix_nanos=%d",
					id.Bytes(), timestamp.Sub(futureLimit).String(),
					timestamp.Format(errTimestampFormat),
					futureLimit.Format(errTimestampFormat),
					timestamp, futureLimit))
		}

	default:
		writeType = WarmWrite
	}

	if writeType == ColdWrite {
		retentionLimit := now.Add(-ropts.RetentionPeriod())
		if wOpts.BootstrapWrite {
			// NB(r): Allow bootstrapping to write to blocks that are
			// still in retention.
			retentionLimit = retentionLimit.Truncate(blockSize)
		}
		if retentionLimit.After(timestamp) {
			if wOpts.SkipOutOfRetention {
				// Allow for datapoint to be skipped since caller does not
				// want writes out of retention to fail.
				return false, writeType, nil
			}
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in past and out of retention: "+
					"id=%s, off_by=%s, timestamp=%s, retention_past_limit=%s, "+
					"timestamp_unix_nanos=%d, retention_past_limit_unix_nanos=%d",
					id.Bytes(), retentionLimit.Sub(timestamp).String(),
					timestamp.Format(errTimestampFormat),
					retentionLimit.Format(errTimestampFormat),
					timestamp, retentionLimit))
		}

		futureRetentionLimit := now.Add(ropts.FutureRetentionPeriod())
		if !futureRetentionLimit.After(timestamp) {
			if wOpts.SkipOutOfRetention {
				// Allow for datapoint to be skipped since caller does not
				// want writes out of retention to fail.
				return false, writeType, nil
			}
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in future and out of retention: "+
					"id=%s, off_by=%s, timestamp=%s, retention_future_limit=%s, "+
					"timestamp_unix_nanos=%d, retention_future_limit_unix_nanos=%d",
					id.Bytes(), timestamp.Sub(futureRetentionLimit).String(),
					timestamp.Format(errTimestampFormat),
					futureRetentionLimit.Format(errTimestampFormat),
					timestamp, futureRetentionLimit))
		}

		b.opts.Stats().IncColdWrites()
	}

	buckets := b.bucketVersionsAtCreate(blockStart)
	b.putBucketVersionsInCache(buckets)

	if wOpts.TruncateType == TypeBlock {
		timestamp = blockStart
	}

	if wOpts.TransformOptions.ForceValueEnabled {
		value = wOpts.TransformOptions.ForceValue
	}

	ok, err := buckets.write(timestamp, value, unit, annotation, writeType, wOpts.SchemaDesc)
	return ok, writeType, err
}

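// For illustration, with BufferPast = 10m, BufferFuture = 2m, and
// now = 12:00, Write classifies timestamps roughly as follows (assuming
// cold writes are enabled and the point is within retention):
//
//	timestamp < 11:50          -> ColdWrite (before pastLimit)
//	11:50 <= timestamp < 12:02 -> WarmWrite
//	timestamp >= 12:02         -> ColdWrite (at or after futureLimit)
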
func (b *dbBuffer) IsEmpty() bool {
	// A buffer can only be empty if there are no buckets in its map, since
	// buckets are only created when a write for a new block start is done, and
	// buckets are removed from the map when they are evicted from memory.
	return len(b.bucketsMap) == 0
}

func (b *dbBuffer) MarkNonEmptyBlocks(nonEmptyBlockStarts map[xtime.UnixNano]struct{}) {
	for blockStart, bv := range b.bucketsMap {
		if _, ok := nonEmptyBlockStarts[blockStart]; !ok {
			if !bv.streamsEmpty() {
				nonEmptyBlockStarts[blockStart] = struct{}{}
			}
		}
	}
}

func (b *dbBuffer) ColdFlushBlockStarts(blockStates map[xtime.UnixNano]BlockState) OptimizedTimes {
	var times OptimizedTimes

	for t, bucketVersions := range b.bucketsMap {
		for _, bucket := range bucketVersions.buckets {
			if bucket.writeType == ColdWrite &&
				// We need to cold flush this bucket if it either:
				// 1) Has new cold writes that need to be flushed, or
				// 2) Has a bucket version higher than what has been
				//    successfully flushed. This can happen if a cold flush was
				//    attempted, changing this bucket version, but failed to
				//    completely finish (which is what the shard block state
				//    signifies). In this case, we need to try to flush this
				//    bucket again.
				(bucket.version == writableBucketVersion ||
					blockStates[bucket.start].ColdVersion < bucket.version) {
				times.Add(t)
				break
			}
		}
	}

	return times
}

func (b *dbBuffer) Stats() bufferStats {
	return bufferStats{
		wiredBlocks: len(b.bucketsMap),
	}
}

func (b *dbBuffer) Tick(blockStates ShardBlockStateSnapshot, nsCtx namespace.Context) bufferTickResult {
	mergedOutOfOrder := 0
	var evictedBucketTimes OptimizedTimes
	for tNano, buckets := range b.bucketsMap {
		// The blockStates map is never written to after creation, so this
		// read access is safe. Since this version map is a snapshot of the
		// versions, the real block flush versions may be higher. This is okay
		// here because it's safe to:
		// 1) not remove a bucket that's actually retrievable, or
		// 2) remove a lower versioned bucket.
		// Retrievable and higher versioned buckets will be left to be
		// collected in the next tick.
		blockStateSnapshot, bootstrapped := blockStates.UnwrapValue()
		// Only use block state snapshot information to make eviction decisions
		// if the block state has been properly bootstrapped already.
		if bootstrapped {
			blockState := blockStateSnapshot.Snapshot[tNano]
			if coldVersion := blockState.ColdVersion; blockState.WarmRetrievable || coldVersion > 0 {
				if blockState.WarmRetrievable {
					// Buckets for WarmWrites that are retrievable will only be version 1, since
					// they only get successfully persisted once.
					buckets.removeBucketsUpToVersion(WarmWrite, 1)
				}
				if coldVersion > 0 {
					buckets.removeBucketsUpToVersion(ColdWrite, coldVersion)
				}

				if buckets.streamsLen() == 0 {
					// All underlying buckets have been flushed successfully, so we can
					// just remove the buckets from the bucketsMap.
					b.removeBucketVersionsAt(tNano)
					// Pass which bucket got evicted from the buffer to the series.
					// Data gets read in order of precedence: buffer -> cache -> disk.
					// After a bucket gets removed from the buffer, data from the cache
					// will be served. However, since data just got persisted to disk,
					// the cached block is now stale. To correct this, we can either:
					// 1) evict the stale block from cache so that new data will
					//    be retrieved from disk, or
					// 2) merge the new data into the cached block.
					// It's unclear whether recently flushed data would frequently be
					// read soon afterward, so we're choosing (1) here, since it has a
					// simpler implementation (just removing from a map).
					evictedBucketTimes.Add(tNano)
					continue
				}
			}
		}

		buckets.recordActiveEncoders()

		// Once we've evicted all eligible buckets, we merge duplicate encoders
		// in the remaining ones to try and reclaim memory.
		merges, err := buckets.merge(WarmWrite, nsCtx)
		if err != nil {
			log := b.opts.InstrumentOptions().Logger()
			log.Error("buffer merge encode error", zap.Error(err))
		}
		if merges > 0 {
			mergedOutOfOrder++
		}
	}
	return bufferTickResult{
		mergedOutOfOrderBlocks: mergedOutOfOrder,
		evictedBucketTimes:     evictedBucketTimes,
	}
}

func (b *dbBuffer) Load(bl block.DatabaseBlock, writeType WriteType) {
	var (
		blockStart = bl.StartTime()
		buckets    = b.bucketVersionsAtCreate(blockStart)
		bucket     = buckets.writableBucketCreate(writeType)
	)
	bucket.loadedBlocks = append(bucket.loadedBlocks, bl)
}

func (b *dbBuffer) Snapshot(
	ctx context.Context,
	blockStart xtime.UnixNano,
	metadata persist.Metadata,
	persistFn persist.DataFn,
	nsCtx namespace.Context,
) (SnapshotResult, error) {
	var (
		start  = b.nowFn()
		result SnapshotResult
	)

	buckets, exists := b.bucketVersionsAt(blockStart)
	if !exists {
		return result, nil
	}

	// Snapshot must take both cold and warm writes because cold flushes don't
	// happen for the current block (since cold flushes can't happen before a
	// warm flush has happened).
	streams, err := buckets.mergeToStreams(ctx, streamsOptions{filterWriteType: false, nsCtx: nsCtx})
	if err != nil {
		return result, err
	}

	afterMergeByBucket := b.nowFn()
	result.Stats.TimeMergeByBucket = afterMergeByBucket.Sub(start)

	var (
		numStreams         = len(streams)
		mergeAcrossBuckets = numStreams != 1
		segment            ts.Segment
	)
	if !mergeAcrossBuckets {
		segment, err = streams[0].Segment()
		if err != nil {
			return result, err
		}
	} else {
		// We may need to merge again here because the regular merge method does
		// not merge warm and cold buckets or buckets that have different versions.
		sr := make([]xio.SegmentReader, 0, numStreams)
		for _, stream := range streams {
			sr = append(sr, stream)
		}

		bopts := b.opts.DatabaseBlockOptions()
		encoder := bopts.EncoderPool().Get()
		encoder.Reset(blockStart, bopts.DatabaseBlockAllocSize(), nsCtx.Schema)
		iter := b.opts.MultiReaderIteratorPool().Get()
		var encoderClosed bool
		defer func() {
			if !encoderClosed {
				encoder.Close()
			}
			iter.Close()
		}()
		iter.Reset(sr, blockStart, b.opts.RetentionOptions().BlockSize(), nsCtx.Schema)

		for iter.Next() {
			dp, unit, annotation := iter.Current()
			if err := encoder.Encode(dp, unit, annotation); err != nil {
				return result, err
			}
		}
		if err := iter.Err(); err != nil {
			return result, err
		}

		segment = encoder.Discard()
		defer segment.Finalize()
		encoderClosed = true
	}

	afterMergeAcrossBuckets := b.nowFn()
	result.Stats.TimeMergeAcrossBuckets = afterMergeAcrossBuckets.Sub(afterMergeByBucket)

	if segment.Len() == 0 {
		// Don't write out series with no data.
		return result, nil
	}

	checksum := segment.CalculateChecksum()

	afterChecksum := b.nowFn()
	result.Stats.TimeChecksum = afterChecksum.Sub(afterMergeAcrossBuckets)

	if err := persistFn(metadata, segment, checksum); err != nil {
		return result, err
	}

	result.Stats.TimePersist = b.nowFn().Sub(afterChecksum)

	result.Persist = true
	return result, nil
}

func (b *dbBuffer) WarmFlush(
	ctx context.Context,
	blockStart xtime.UnixNano,
	metadata persist.Metadata,
	persistFn persist.DataFn,
	nsCtx namespace.Context,
) (FlushOutcome, error) {
	buckets, exists := b.bucketVersionsAt(blockStart)
	if !exists {
		return FlushOutcomeBlockDoesNotExist, nil
	}

	// Flush only deals with WarmWrites. ColdWrites get persisted to disk via
	// the compaction cycle.
	streams, err := buckets.mergeToStreams(ctx, streamsOptions{filterWriteType: true, writeType: WarmWrite, nsCtx: nsCtx})
	if err != nil {
		return FlushOutcomeErr, err
	}

	var (
		stream xio.SegmentReader
		ok     bool
	)
	if numStreams := len(streams); numStreams == 1 {
		stream = streams[0]
		ok = true
	} else {
		// In the majority of cases, there will only be one stream to persist
		// here. Only when a previous flush fails midway through a shard will
		// there be buckets for previous versions. In this case, we need to try
		// to flush them again, so we merge them together to one stream and
		// persist it.
		encoder, _, err := mergeStreamsToEncoder(blockStart, streams, b.opts, nsCtx)
		if err != nil {
			return FlushOutcomeErr, err
		}

		stream, ok = encoder.Stream(ctx)
		encoder.Close()
	}

	if !ok {
		// Don't write out series with no data.
		return FlushOutcomeBlockDoesNotExist, nil
	}

	segment, err := stream.Segment()
	if err != nil {
		return FlushOutcomeErr, err
	}

	if segment.Len() == 0 {
		// An empty segment is equivalent to no stream, i.e. the data does not exist.
		return FlushOutcomeBlockDoesNotExist, nil
	}

	checksum := segment.CalculateChecksum()
	err = persistFn(metadata, segment, checksum)
	if err != nil {
		return FlushOutcomeErr, err
	}

	if bucket, exists := buckets.writableBucket(WarmWrite); exists {
		// WarmFlushes only happen once per block, so it makes sense to always
		// set this to 1.
		bucket.version = 1
	}

	return FlushOutcomeFlushedToDisk, nil
}

func (b *dbBuffer) ReadEncoded(
	ctx context.Context,
	start xtime.UnixNano,
	end xtime.UnixNano,
	nsCtx namespace.Context,
) ([][]xio.BlockReader, error) {
	var (
		blockSize = b.opts.RetentionOptions().BlockSize()
		// TODO(r): pool these results arrays
		res [][]xio.BlockReader
	)

	for _, blockStart := range b.inOrderBlockStarts {
		blockStart := blockStart
		if !blockStart.Before(end) || !start.Before(blockStart.Add(blockSize)) {
			continue
		}

		bv, exists := b.bucketVersionsAt(blockStart)
		if !exists {
			// Invariant violated. This means the keys in the bucket map do
			// not match the sorted keys cache, which should never happen.
			instrument.EmitAndLogInvariantViolation(
				b.opts.InstrumentOptions(), func(l *zap.Logger) {
					l.Error(errBucketMapCacheNotInSync, zap.Int64("blockStart", int64(blockStart)))
				})
			return nil, instrument.InvariantErrorf(
				errBucketMapCacheNotInSyncFmt, blockStart)
		}

		if streams := bv.streams(ctx, streamsOptions{filterWriteType: false}); len(streams) > 0 {
			res = append(res, streams)
		}

		// NB(r): Store the last read time, should not set this when
		// calling FetchBlocks as a read is differentiated from
		// a FetchBlocks call. One is initiated by an external
		// entity and the other is used for streaming blocks between
		// the storage nodes. This distinction is important as this
		// data is important for use with understanding access patterns, etc.
		bv.setLastRead(b.nowFn())
	}

	return res, nil
}

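// For illustration, the overlap filter in ReadEncoded above keeps a block
// [blockStart, blockStart+blockSize) only when it intersects the query
// range [start, end):
//
//	overlaps := blockStart.Before(end) && start.Before(blockStart.Add(blockSize))
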
func (b *dbBuffer) FetchBlocksForColdFlush(
	ctx context.Context,
	start xtime.UnixNano,
	version int,
	nsCtx namespace.Context,
) (block.FetchBlockResult, error) {
	res := b.fetchBlocks(ctx, []xtime.UnixNano{start},
		streamsOptions{filterWriteType: true, writeType: ColdWrite, nsCtx: nsCtx})
	if len(res) == 0 {
		// The lifecycle of calling this function is preceded by first checking
		// which blocks have cold data that have not yet been flushed.
		// If we don't get data here, it means that it has since fallen out of
		// retention and has been evicted.
		return block.FetchBlockResult{}, nil
	}
	if len(res) != 1 {
		// Must be only one result if anything at all, since fetchBlocks returns
		// one result per block start.
		return block.FetchBlockResult{}, fmt.Errorf("fetchBlocks did not return just one block for block start %s", start)
	}

	result := res[0]

	buckets, exists := b.bucketVersionsAt(start)
	if !exists {
		return block.FetchBlockResult{}, fmt.Errorf("buckets do not exist with block start %s", start)
	}
	if bucket, exists := buckets.writableBucket(ColdWrite); exists {
		// Update the version of the writable bucket (effectively making it not
		// writable). This marks this bucket as attempted to be flushed,
		// although it is only actually written to disk successfully at the
		// shard level after every series has completed the flush process.
		// The tick following a successful flush to disk will remove this bucket
		// from memory.
		bucket.version = version
	}
	// No-op if the writable bucket doesn't exist.
	// This function should only get called for blocks that we know need to be
	// cold flushed. However, buckets whose cold flush was attempted but failed
	// also need to be cold flushed again; such buckets will have a
	// non-writable version.

	return result, nil
}

func (b *dbBuffer) FetchBlocks(
	ctx context.Context,
	starts []xtime.UnixNano,
	nsCtx namespace.Context,
) []block.FetchBlockResult {
	return b.fetchBlocks(ctx, starts, streamsOptions{filterWriteType: false, nsCtx: nsCtx})
}

func (b *dbBuffer) fetchBlocks(
	ctx context.Context,
	starts []xtime.UnixNano,
	sOpts streamsOptions,
) []block.FetchBlockResult {
	var res []block.FetchBlockResult

	for _, start := range starts {
		buckets, ok := b.bucketVersionsAt(start)
		if !ok {
			continue
		}

		streams := buckets.streams(ctx, sOpts)
		if len(streams) > 0 {
			result := block.NewFetchBlockResult(
				start,
				streams,
				nil,
			)
			result.FirstWrite = buckets.firstWrite(sOpts)
			res = append(res, result)
		}
	}

	// Result should be sorted in ascending order.
	sort.Slice(res, func(i, j int) bool { return res[i].Start.Before(res[j].Start) })

	return res
}

func (b *dbBuffer) FetchBlocksMetadata(
	ctx context.Context,
	start, end xtime.UnixNano,
	opts FetchBlocksMetadataOptions,
) (block.FetchBlockMetadataResults, error) {
	blockSize := b.opts.RetentionOptions().BlockSize()
	res := b.opts.FetchBlockMetadataResultsPool().Get()

	for _, blockStart := range b.inOrderBlockStarts {
		blockStart := blockStart
		if !blockStart.Before(end) || !start.Before(blockStart.Add(blockSize)) {
			continue
		}

		bv, exists := b.bucketVersionsAt(blockStart)
		if !exists {
			// Invariant violated. This means the keys in the bucket map do
			// not match the sorted keys cache, which should never happen.
			instrument.EmitAndLogInvariantViolation(
				b.opts.InstrumentOptions(), func(l *zap.Logger) {
					l.Error(errBucketMapCacheNotInSync, zap.Int64("blockStart", int64(blockStart)))
				})
			return nil, instrument.InvariantErrorf(errBucketMapCacheNotInSyncFmt, blockStart)
		}

		size := int64(bv.streamsLen())
		// If we have no data in this bucket, skip early without appending it to the result.
		if size == 0 {
			continue
		}
		var resultSize int64
		if opts.IncludeSizes {
			resultSize = size
		}
		var resultLastRead xtime.UnixNano
		if opts.IncludeLastRead {
			resultLastRead = bv.lastRead()
		}

		var (
			checksum *uint32
			err      error
		)
		if opts.IncludeChecksums {
			// Checksum calculations are best effort since we can't calculate one if there
			// are multiple streams without performing an expensive merge.
			checksum, err = bv.checksumIfSingleStream(ctx)
			if err != nil {
				return nil, err
			}
		}
		res.Add(block.FetchBlockMetadataResult{
			Start:    bv.start,
			Size:     resultSize,
			LastRead: resultLastRead,
			Checksum: checksum,
		})
	}

	return res, nil
}

func (b *dbBuffer) bucketVersionsAt(
	t xtime.UnixNano,
) (*BufferBucketVersions, bool) {
	// First check LRU cache.
	for _, buckets := range b.bucketVersionsCache {
		if buckets == nil {
			continue
		}
		if buckets.start.Equal(t) {
			return buckets, true
		}
	}

	// Then check the map.
	if buckets, exists := b.bucketsMap[t]; exists {
		return buckets, true
	}

	return nil, false
}

func (b *dbBuffer) bucketVersionsAtCreate(
	t xtime.UnixNano,
) *BufferBucketVersions {
	if buckets, exists := b.bucketVersionsAt(t); exists {
		return buckets
	}

	buckets := b.bucketVersionsPool.Get()
	buckets.resetTo(t, b.opts, b.bucketPool)
	b.bucketsMap[t] = buckets
	b.inOrderBlockStartsAdd(t)

	return buckets
}

func (b *dbBuffer) putBucketVersionsInCache(newBuckets *BufferBucketVersions) {
	replaceIdx := bucketsCacheSize - 1
	for i, buckets := range b.bucketVersionsCache {
		// Check if we have the same pointer in cache.
		if buckets == newBuckets {
			replaceIdx = i
		}
	}

	for i := replaceIdx; i > 0; i-- {
		b.bucketVersionsCache[i] = b.bucketVersionsCache[i-1]
	}

	b.bucketVersionsCache[0] = newBuckets
}

func (b *dbBuffer) removeBucketVersionsInCache(oldBuckets *BufferBucketVersions) {
	nilIdx := -1
	for i, buckets := range b.bucketVersionsCache {
		if buckets == oldBuckets {
			nilIdx = i
		}
	}
	if nilIdx == -1 {
		return
	}

	for i := nilIdx; i < bucketsCacheSize-1; i++ {
		b.bucketVersionsCache[i] = b.bucketVersionsCache[i+1]
	}

	b.bucketVersionsCache[bucketsCacheSize-1] = nil
}

func (b *dbBuffer) removeBucketVersionsAt(blockStart xtime.UnixNano) {
	buckets, exists := b.bucketVersionsAt(blockStart)
	if !exists {
		return
	}
	delete(b.bucketsMap, blockStart)
	b.removeBucketVersionsInCache(buckets)
	b.inOrderBlockStartsRemove(blockStart)
	// nil out pointers.
	buckets.resetTo(0, nil, nil)
	b.bucketVersionsPool.Put(buckets)
}

func (b *dbBuffer) inOrderBlockStartsAdd(newTime xtime.UnixNano) {
	starts := b.inOrderBlockStarts
	idx := len(starts)
	// There shouldn't be that many starts here, so just linear search through.
	for i, t := range starts {
		if t.After(newTime) {
			idx = i
			break
		}
	}
	// Insert new time without allocating a new slice.
	b.inOrderBlockStarts = append(starts, 0) //nolint
	// Update to the new slice.
	starts = b.inOrderBlockStarts
	copy(starts[idx+1:], starts[idx:])
	starts[idx] = newTime
}

func (b *dbBuffer) inOrderBlockStartsRemove(removeTime xtime.UnixNano) {
	starts := b.inOrderBlockStarts
	// There shouldn't be that many starts here, so just linear search through.
	for i, t := range starts {
		if t.Equal(removeTime) {
			b.inOrderBlockStarts = append(starts[:i], starts[i+1:]...)
			return
		}
	}
}

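// For illustration, inOrderBlockStartsAdd above uses the standard
// append-and-shift idiom to insert at the sorted position without
// allocating when the slice has spare capacity:
//
//	s = append(s, 0)         // grow by one, reusing capacity if possible
//	copy(s[idx+1:], s[idx:]) // shift the tail right by one
//	s[idx] = newTime         // write the new element into the gap
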
// BufferBucketVersions is a container for different versions of buffer buckets.
// Bucket versions are how the buffer separates writes that have been written
// to disk as a fileset and writes that have not. The bucket with a version of
// `writableBucketVersion` is the bucket that all writes go into (and is thus
// the bucket version that has not yet been persisted). After a bucket gets
// persisted, its version gets set to a version that the shard passes down to it
// (since the shard knows what has been fully persisted to disk).
type BufferBucketVersions struct {
	buckets           []*BufferBucket
	start             xtime.UnixNano
	opts              Options
	lastReadUnixNanos int64
	bucketPool        *BufferBucketPool
}

func (b *BufferBucketVersions) resetTo(
	start xtime.UnixNano,
	opts Options,
	bucketPool *BufferBucketPool,
) {
	// nil all elements so that they get GC'd.
	for i := range b.buckets {
		b.buckets[i] = nil
	}
	b.buckets = b.buckets[:0]
	b.start = start
	b.opts = opts
	atomic.StoreInt64(&b.lastReadUnixNanos, 0)
	b.bucketPool = bucketPool
}

// streams returns all the streams for this BufferBucketVersions.
func (b *BufferBucketVersions) streams(ctx context.Context, opts streamsOptions) []xio.BlockReader {
	var res []xio.BlockReader
	for _, bucket := range b.buckets {
		if opts.filterWriteType && bucket.writeType != opts.writeType {
			continue
		}
		res = append(res, bucket.streams(ctx)...)
	}

	return res
}

func (b *BufferBucketVersions) firstWrite(opts streamsOptions) xtime.UnixNano {
	var res xtime.UnixNano
	for _, bucket := range b.buckets {
		if opts.filterWriteType && bucket.writeType != opts.writeType {
			continue
		}
		// Get the earliest valid first write time.
		if res == 0 ||
			(bucket.firstWrite.Before(res) && bucket.firstWrite != 0) {
			res = bucket.firstWrite
		}
	}
	return res
}

func (b *BufferBucketVersions) streamsEmpty() bool {
	for _, bucket := range b.buckets {
		if !bucket.streamsEmpty() {
			return false
		}
	}
	return true
}

func (b *BufferBucketVersions) streamsLen() int {
	res := 0
	for _, bucket := range b.buckets {
		res += bucket.streamsLen()
	}
	return res
}

func (b *BufferBucketVersions) checksumIfSingleStream(ctx context.Context) (*uint32, error) {
	if len(b.buckets) != 1 {
		return nil, nil
	}
	return b.buckets[0].checksumIfSingleStream(ctx)
}

func (b *BufferBucketVersions) write(
	timestamp xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
	writeType WriteType,
	schema namespace.SchemaDescr,
) (bool, error) {
	return b.writableBucketCreate(writeType).write(timestamp, value, unit, annotation, schema)
}

func (b *BufferBucketVersions) merge(writeType WriteType, nsCtx namespace.Context) (int, error) {
	res := 0
	for _, bucket := range b.buckets {
		// Only makes sense to merge buckets that are writable.
		if bucket.version == writableBucketVersion && writeType == bucket.writeType {
			merges, err := bucket.merge(nsCtx)
			if err != nil {
				return 0, err
			}
			res += merges
		}
	}

	return res, nil
}

func (b *BufferBucketVersions) removeBucketsUpToVersion(
	writeType WriteType,
	version int,
) {
	// Avoid allocating a new backing array.
	nonEvictedBuckets := b.buckets[:0]

	for _, bucket := range b.buckets {
		bVersion := bucket.version
		if bucket.writeType == writeType && bVersion != writableBucketVersion &&
			bVersion <= version {
			// We no longer need to keep any version which is equal to
			// or less than the retrievable version, since that means
			// that the version has successfully persisted to disk.
			// Bucket gets reset before use.
			b.bucketPool.Put(bucket)
			continue
		}

		nonEvictedBuckets = append(nonEvictedBuckets, bucket)
	}

	b.buckets = nonEvictedBuckets
}

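// For illustration, removeBucketsUpToVersion above filters in place with
// the buckets[:0] idiom, which reuses the existing backing array rather
// than allocating a new one; the same pattern in miniature:
//
//	kept := items[:0]
//	for _, item := range items {
//		if keep(item) {
//			kept = append(kept, item)
//		}
//	}
//	items = kept
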
func (b *BufferBucketVersions) setLastRead(value time.Time) {
	atomic.StoreInt64(&b.lastReadUnixNanos, value.UnixNano())
}

func (b *BufferBucketVersions) lastRead() xtime.UnixNano {
	return xtime.UnixNano(atomic.LoadInt64(&b.lastReadUnixNanos))
}

func (b *BufferBucketVersions) writableBucket(writeType WriteType) (*BufferBucket, bool) {
	for _, bucket := range b.buckets {
		if bucket.version == writableBucketVersion && bucket.writeType == writeType {
			return bucket, true
		}
	}

	return nil, false
}

func (b *BufferBucketVersions) writableBucketCreate(writeType WriteType) *BufferBucket {
	bucket, exists := b.writableBucket(writeType)

	if exists {
		return bucket
	}

	newBucket := b.bucketPool.Get()
	newBucket.resetTo(b.start, writeType, b.opts)
	b.buckets = append(b.buckets, newBucket)
	return newBucket
}

// mergeToStreams merges each buffer bucket version's streams into one, then
// returns a single stream for each buffer bucket version.
func (b *BufferBucketVersions) mergeToStreams(ctx context.Context, opts streamsOptions) ([]xio.SegmentReader, error) {
	buckets := b.buckets
	res := make([]xio.SegmentReader, 0, len(buckets))

	for _, bucket := range buckets {
		if opts.filterWriteType && bucket.writeType != opts.writeType {
			continue
		}
		stream, ok, err := bucket.mergeToStream(ctx, opts.nsCtx)
		if err != nil {
			return nil, err
		}
		if !ok {
			continue
		}
		res = append(res, stream)
	}

	return res, nil
}

func (b *BufferBucketVersions) recordActiveEncoders() {
	var numActiveEncoders int
	for _, bucket := range b.buckets {
		if bucket.version == writableBucketVersion {
			numActiveEncoders += len(bucket.encoders)
		}
	}
	b.opts.Stats().RecordEncodersPerBlock(numActiveEncoders)
}

type streamsOptions struct {
	filterWriteType bool
	writeType       WriteType
	nsCtx           namespace.Context
}

// BufferBucket is a specific version of a bucket of encoders, which is where
// writes are ultimately stored before they are persisted to disk as a fileset.
// See comment for BufferBucketVersions for more detail on bucket versions.
type BufferBucket struct {
	opts         Options
	start        xtime.UnixNano
	encoders     []inOrderEncoder
	loadedBlocks []block.DatabaseBlock
	version      int
	writeType    WriteType
	firstWrite   xtime.UnixNano
}

type inOrderEncoder struct {
	encoder     encoding.Encoder
	lastWriteAt xtime.UnixNano
}

func (b *BufferBucket) resetTo(
	start xtime.UnixNano,
	writeType WriteType,
	opts Options,
) {
	// Close the old context if we're resetting for use.
	b.reset()
	b.opts = opts
	b.start = start
	bopts := b.opts.DatabaseBlockOptions()
	encoder := bopts.EncoderPool().Get()
	encoder.Reset(start, bopts.DatabaseBlockAllocSize(), nil)
	b.encoders = append(b.encoders, inOrderEncoder{
		encoder: encoder,
	})
	b.loadedBlocks = nil
	// We would only ever create a bucket for it to be writable.
	b.version = writableBucketVersion
	b.writeType = writeType
	b.firstWrite = 0
}

func (b *BufferBucket) reset() {
	b.resetEncoders()
	b.resetLoadedBlocks()
}

func (b *BufferBucket) write(
	timestamp xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
	schema namespace.SchemaDescr,
) (bool, error) {
	datapoint := ts.Datapoint{
		TimestampNanos: timestamp,
		Value:          value,
	}

	// Find the correct encoder to write to.
	idx := -1
	for i := range b.encoders {
		lastWriteAt := b.encoders[i].lastWriteAt
		if timestamp.Equal(lastWriteAt) {
			lastDatapoint, err := b.encoders[i].encoder.LastEncoded()
			if err != nil {
				return false, err
			}
			lastAnnotationChecksum, err := b.encoders[i].encoder.LastAnnotationChecksum()
			if err != nil {
				return false, err
			}

			if lastDatapoint.Value == value && lastAnnotationChecksum == xxhash.Sum64(annotation) {
				// No-op since it matches the current value. Propagates up to
				// callers that no value was written.
				return false, nil
			}
			continue
		}

		if timestamp.After(lastWriteAt) {
			idx = i
			break
		}
	}

	var err error
	defer func() {
		nowFn := b.opts.ClockOptions().NowFn()
		if err == nil && b.firstWrite == 0 {
			b.firstWrite = xtime.ToUnixNano(nowFn())
		}
	}()

	// Upsert/last-write-wins semantics.
	// NB(r): We push datapoints with the same timestamp but differing
	// value into a new encoder later in the stack of in order encoders
	// since an encoder is immutable.
	// The encoders pushed later will surface their values first.
	if idx != -1 {
		err = b.writeToEncoderIndex(idx, datapoint, unit, annotation, schema)
		return err == nil, err
	}

	// Need a new encoder, we didn't find an encoder to write to.
	maxEncoders := b.opts.RuntimeOptionsManager().Get().EncodersPerBlockLimit()
	if maxEncoders != 0 && len(b.encoders) >= int(maxEncoders) {
		b.opts.Stats().IncEncoderLimitWriteRejected()
		return false, errTooManyEncoders
	}

	b.opts.Stats().IncCreatedEncoders()
	bopts := b.opts.DatabaseBlockOptions()
	blockSize := b.opts.RetentionOptions().BlockSize()
	blockAllocSize := bopts.DatabaseBlockAllocSize()

	encoder := b.opts.EncoderPool().Get()
	encoder.Reset(timestamp.Truncate(blockSize), blockAllocSize, schema)

	b.encoders = append(b.encoders, inOrderEncoder{
		encoder:     encoder,
		lastWriteAt: timestamp,
	})

	idx = len(b.encoders) - 1
	err = b.writeToEncoderIndex(idx, datapoint, unit, annotation, schema)
	if err != nil {
		encoder.Close()
		b.encoders = b.encoders[:idx]
		return false, err
	}
	return true, nil
}

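// For illustration of the upsert semantics above: two writes at the same
// timestamp with different values land in separate encoders (encoders are
// immutable), and the value written last wins on read, since encoders
// pushed later surface their values first:
//
//	bucket.write(ts, 1.0, xtime.Second, nil, nil) // goes to an existing encoder
//	bucket.write(ts, 2.0, xtime.Second, nil, nil) // new encoder; 2.0 wins on read
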
func (b *BufferBucket) writeToEncoderIndex(
	idx int,
	datapoint ts.Datapoint,
	unit xtime.Unit,
	annotation []byte,
	schema namespace.SchemaDescr,
) error {
	b.encoders[idx].encoder.SetSchema(schema)
	err := b.encoders[idx].encoder.Encode(datapoint, unit, annotation)
	if err != nil {
		return err
	}

	b.encoders[idx].lastWriteAt = datapoint.TimestampNanos
	return nil
}

func (b *BufferBucket) streams(ctx context.Context) []xio.BlockReader {
	streams := make([]xio.BlockReader, 0, len(b.loadedBlocks)+len(b.encoders))
	for _, bl := range b.loadedBlocks {
		if bl.Len() == 0 {
			continue
		}
		if s, err := bl.Stream(ctx); err == nil && s.IsNotEmpty() {
			// NB(r): block stream method will register the stream closer already
			streams = append(streams, s)
		}
	}
	for i := range b.encoders {
		start := b.start
		if s, ok := b.encoders[i].encoder.Stream(ctx); ok {
			br := xio.BlockReader{
				SegmentReader: s,
				Start:         start,
				BlockSize:     b.opts.RetentionOptions().BlockSize(),
			}
			ctx.RegisterFinalizer(s)
			streams = append(streams, br)
		}
	}

	return streams
}

func (b *BufferBucket) streamsEmpty() bool {
	for i := range b.loadedBlocks {
		if !b.loadedBlocks[i].Empty() {
			return false
		}
	}
	for i := range b.encoders {
		if !b.encoders[i].encoder.Empty() {
			return false
		}
	}
	return true
}

func (b *BufferBucket) streamsLen() int {
	length := 0
	for i := range b.loadedBlocks {
		length += b.loadedBlocks[i].Len()
	}
	for i := range b.encoders {
		length += b.encoders[i].encoder.Len()
	}
	return length
}

func (b *BufferBucket) checksumIfSingleStream(ctx context.Context) (*uint32, error) {
	if b.hasJustSingleEncoder() {
		enc := b.encoders[0].encoder
		stream, ok := enc.Stream(ctx)
		if !ok {
			return nil, nil
		}

		segment, err := stream.Segment()
		if err != nil {
			return nil, err
		}

		if segment.Len() == 0 {
			return nil, nil
		}

		checksum := segment.CalculateChecksum()
		return &checksum, nil
	}

	if b.hasJustSingleLoadedBlock() {
		checksum, err := b.loadedBlocks[0].Checksum()
		if err != nil {
			return nil, err
		}
		return &checksum, nil
	}

	return nil, nil
}

func (b *BufferBucket) resetEncoders() {
	var zeroed inOrderEncoder
	for i := range b.encoders {
		// Close the encoders when this bucket resets.
		encoder := b.encoders[i].encoder
		encoder.Close()
		b.encoders[i] = zeroed
	}
	b.encoders = b.encoders[:0]
}

func (b *BufferBucket) resetLoadedBlocks() {
	for i := range b.loadedBlocks {
		bl := b.loadedBlocks[i]
		bl.Close()
	}
	b.loadedBlocks = nil
}

func (b *BufferBucket) needsMerge() bool {
	return !(b.hasJustSingleEncoder() || b.hasJustSingleLoadedBlock())
}

func (b *BufferBucket) hasJustSingleEncoder() bool {
	return len(b.encoders) == 1 && len(b.loadedBlocks) == 0
}

func (b *BufferBucket) hasJustSingleLoadedBlock() bool {
	encodersEmpty := len(b.encoders) == 0 ||
		(len(b.encoders) == 1 && b.encoders[0].encoder.Len() == 0)
	return encodersEmpty && len(b.loadedBlocks) == 1
}

func (b *BufferBucket) merge(nsCtx namespace.Context) (int, error) {
	if !b.needsMerge() {
		// Save unnecessary work.
		return 0, nil
	}

	var (
		start   = b.start
		readers = make([]xio.SegmentReader, 0, len(b.encoders)+len(b.loadedBlocks))
		streams = make([]xio.SegmentReader, 0, len(b.encoders))
		ctx     = b.opts.ContextPool().Get()
		merges  = 0
	)
	defer func() {
		ctx.Close()
		// NB(r): Only need to close the mutable encoder streams as
		// the context we created for reading the loaded blocks
		// will close those streams when it is closed.
		for _, stream := range streams {
			stream.Finalize()
		}
	}()

	// Rank loaded blocks as data that has appeared before data that
	// arrived locally in the buffer.
	for i := range b.loadedBlocks {
		block, err := b.loadedBlocks[i].Stream(ctx)
		if err == nil && block.SegmentReader != nil {
			merges++
			readers = append(readers, block.SegmentReader)
		}
	}

	for i := range b.encoders {
		if s, ok := b.encoders[i].encoder.Stream(ctx); ok {
			merges++
			readers = append(readers, s)
			streams = append(streams, s)
		}
	}

	encoder, lastWriteAt, err := mergeStreamsToEncoder(start, readers, b.opts, nsCtx)
	if err != nil {
		return 0, err
	}

	b.resetEncoders()
	b.resetLoadedBlocks()

	b.encoders = append(b.encoders, inOrderEncoder{
		encoder:     encoder,
		lastWriteAt: lastWriteAt,
	})

	return merges, nil
}

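// Callers of mergeStreamsToEncoder below own the returned encoder and must
// close it when appropriate; for illustration, a typical call site:
//
//	encoder, lastWriteAt, err := mergeStreamsToEncoder(blockStart, readers, opts, nsCtx)
//	if err != nil {
//		return err
//	}
//	defer encoder.Close()
//	// ... stream or discard the merged segment, use lastWriteAt ...
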
// mergeStreamsToEncoder merges streams to an encoder and returns the last
// write time. It is the responsibility of the caller to close the returned
// encoder when appropriate.
func mergeStreamsToEncoder(
	blockStart xtime.UnixNano,
	streams []xio.SegmentReader,
	opts Options,
	nsCtx namespace.Context,
) (encoding.Encoder, xtime.UnixNano, error) {
	bopts := opts.DatabaseBlockOptions()
	encoder := opts.EncoderPool().Get()
	encoder.Reset(blockStart, bopts.DatabaseBlockAllocSize(), nsCtx.Schema)
	iter := opts.MultiReaderIteratorPool().Get()
	defer iter.Close()

	var lastWriteAt xtime.UnixNano
	iter.Reset(streams, blockStart, opts.RetentionOptions().BlockSize(), nsCtx.Schema)
	for iter.Next() {
		dp, unit, annotation := iter.Current()
		if err := encoder.Encode(dp, unit, annotation); err != nil {
			encoder.Close()
			return nil, 0, err
		}
		lastWriteAt = dp.TimestampNanos
	}
	if err := iter.Err(); err != nil {
		encoder.Close()
		return nil, 0, err
	}

	return encoder, lastWriteAt, nil
}

// mergeToStream merges all streams in this BufferBucket into one stream and
// returns it.
func (b *BufferBucket) mergeToStream(ctx context.Context, nsCtx namespace.Context) (xio.SegmentReader, bool, error) {
	if b.hasJustSingleEncoder() {
		b.resetLoadedBlocks()
		// Already merged as a single encoder.
		stream, ok := b.encoders[0].encoder.Stream(ctx)
		if !ok {
			return nil, false, nil
		}
		ctx.RegisterFinalizer(stream)
		return stream, true, nil
	}

	if b.hasJustSingleLoadedBlock() {
		// Need to reset encoders but do not want to finalize the block as we
		// are passing ownership of it to the caller.
		b.resetEncoders()
		stream, err := b.loadedBlocks[0].Stream(ctx)
		if err != nil {
			return nil, false, err
		}
		return stream, true, nil
	}

	_, err := b.merge(nsCtx)
	if err != nil {
		b.resetEncoders()
		b.resetLoadedBlocks()
		return nil, false, err
	}

	// After a successful merge, encoders and loaded blocks will be
	// reset, and the merged encoder appended as the only encoder in the
	// bucket.
	if !b.hasJustSingleEncoder() {
		return nil, false, errIncompleteMerge
	}

	stream, ok := b.encoders[0].encoder.Stream(ctx)
	if !ok {
		return nil, false, nil
	}
	ctx.RegisterFinalizer(stream)
	return stream, true, nil
}

// BufferBucketVersionsPool provides a pool for BufferBucketVersions.
type BufferBucketVersionsPool struct {
	pool pool.ObjectPool
}

// NewBufferBucketVersionsPool creates a new BufferBucketVersionsPool.
func NewBufferBucketVersionsPool(opts pool.ObjectPoolOptions) *BufferBucketVersionsPool {
	p := &BufferBucketVersionsPool{pool: pool.NewObjectPool(opts)}
	p.pool.Init(func() interface{} {
		return &BufferBucketVersions{}
	})
	return p
}

// Get gets a BufferBucketVersions from the pool.
func (p *BufferBucketVersionsPool) Get() *BufferBucketVersions {
	return p.pool.Get().(*BufferBucketVersions)
}

// Put puts a BufferBucketVersions back into the pool.
func (p *BufferBucketVersionsPool) Put(buckets *BufferBucketVersions) {
	p.pool.Put(buckets)
}

// BufferBucketPool provides a pool for BufferBuckets.
type BufferBucketPool struct {
	pool pool.ObjectPool
}

// NewBufferBucketPool creates a new BufferBucketPool.
func NewBufferBucketPool(opts pool.ObjectPoolOptions) *BufferBucketPool {
	p := &BufferBucketPool{pool: pool.NewObjectPool(opts)}
	p.pool.Init(func() interface{} {
		return &BufferBucket{}
	})
	return p
}

// Get gets a BufferBucket from the pool.
func (p *BufferBucketPool) Get() *BufferBucket {
	return p.pool.Get().(*BufferBucket)
}

// Put puts a BufferBucket back into the pool.
func (p *BufferBucketPool) Put(bucket *BufferBucket) {
	p.pool.Put(bucket)
}
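
// For illustration, wiring the two pools together; a minimal sketch, where
// the zero-value pool options are an assumption (callers typically size
// these via configuration):
//
//	bucketPool := NewBufferBucketPool(pool.NewObjectPoolOptions())
//	versionsPool := NewBufferBucketVersionsPool(pool.NewObjectPoolOptions())
//
//	buckets := versionsPool.Get()
//	buckets.resetTo(blockStart, opts, bucketPool)
//	// ... use buckets ...
//	buckets.resetTo(0, nil, nil) // nil out pointers before returning to pool
//	versionsPool.Put(buckets)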