github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/series/buffer.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package series
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"sort"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	"github.com/m3db/m3/src/dbnode/encoding"
    31  	"github.com/m3db/m3/src/dbnode/namespace"
    32  	"github.com/m3db/m3/src/dbnode/persist"
    33  	"github.com/m3db/m3/src/dbnode/storage/block"
    34  	"github.com/m3db/m3/src/dbnode/ts"
    35  	"github.com/m3db/m3/src/dbnode/x/xio"
    36  	"github.com/m3db/m3/src/x/clock"
    37  	"github.com/m3db/m3/src/x/context"
    38  	xerrors "github.com/m3db/m3/src/x/errors"
    39  	"github.com/m3db/m3/src/x/ident"
    40  	"github.com/m3db/m3/src/x/instrument"
    41  	"github.com/m3db/m3/src/x/pool"
    42  	xtime "github.com/m3db/m3/src/x/time"
    43  
    44  	"github.com/cespare/xxhash/v2"
    45  	"go.uber.org/zap"
    46  )
    47  
const (
	// errBucketMapCacheNotInSync is logged/raised when the keys of
	// bucketsMap diverge from the inOrderBlockStarts sorted-keys cache,
	// which should never happen.
	errBucketMapCacheNotInSync    = "bucket map keys do not match sorted keys cache"
	errBucketMapCacheNotInSyncFmt = errBucketMapCacheNotInSync + ", blockStart: %d"
	// errTimestampFormat is the time layout used to render timestamps in
	// write-rejection error messages.
	errTimestampFormat            = time.RFC822Z
)
    53  
var (
	// timeZero is the zero time.Time value (kept for comparisons elsewhere
	// in this package).
	timeZero           time.Time
	// errIncompleteMerge indicates a bucket merge finished with more than
	// one encoder remaining.
	errIncompleteMerge = errors.New("bucket merge did not result in only one encoder")
	// errTooManyEncoders indicates a block accumulated more encoders than
	// allowed; surfaced to callers as an invalid-params error.
	errTooManyEncoders = xerrors.NewInvalidParamsError(errors.New("too many encoders per block"))
)
    59  
const (
	// bucketsCacheSize is the size of the fixed-size cache of
	// BufferBucketVersions kept on dbBuffer to avoid map lookups.
	bucketsCacheSize = 2
	// optimizedTimesArraySize is the size of the internal array for the
	// optimizedTimes struct. Since the size of this array determines the
	// effectiveness of minimizing heap allocations, usage of this struct and/or
	// changing this const should only be done after considering its current
	// use cases:
	// 1) The number of buckets that will be removed within a tick due to that
	//    block being recently flushed
	// 2) The number of buckets that contain ColdWrites within a cold flush
	//    cycle
	// TODO(juchan): revisit this after ColdWrites usage to see if this number
	// is sane.
	optimizedTimesArraySize = 8
	// writableBucketVersion is the version assigned to buckets that have not
	// yet been flushed; flushed buckets are assigned versions > 0.
	writableBucketVersion   = 0
)
    76  
// databaseBuffer is the in-memory buffer for a single series, holding
// recent writes in per-block-start buckets until they are flushed or
// evicted.
type databaseBuffer interface {
	// MoveTo transfers all loaded blocks and encoded data from this buffer
	// into the given buffer.
	MoveTo(
		buffer databaseBuffer,
		nsCtx namespace.Context,
	) error

	// Write writes a datapoint into the buffer, returning whether the
	// write was accepted and the resolved write type (warm/cold).
	Write(
		ctx context.Context,
		id ident.ID,
		timestamp xtime.UnixNano,
		value float64,
		unit xtime.Unit,
		annotation []byte,
		wOpts WriteOptions,
	) (bool, WriteType, error)

	// Snapshot persists a merged view of all data (warm and cold) for the
	// given block start via persistFn.
	Snapshot(
		ctx context.Context,
		blockStart xtime.UnixNano,
		metadata persist.Metadata,
		persistFn persist.DataFn,
		nsCtx namespace.Context,
	) (SnapshotResult, error)

	// WarmFlush persists the warm-write data for the given block start via
	// persistFn and reports the flush outcome.
	WarmFlush(
		ctx context.Context,
		blockStart xtime.UnixNano,
		metadata persist.Metadata,
		persistFn persist.DataFn,
		nsCtx namespace.Context,
	) (FlushOutcome, error)

	// ReadEncoded returns the block readers for buffered data overlapping
	// [start, end).
	ReadEncoded(
		ctx context.Context,
		start, end xtime.UnixNano,
		nsCtx namespace.Context,
	) ([][]xio.BlockReader, error)

	// FetchBlocksForColdFlush returns the cold-write data for the given
	// block start and marks the writable cold bucket with the given version.
	FetchBlocksForColdFlush(
		ctx context.Context,
		start xtime.UnixNano,
		version int,
		nsCtx namespace.Context,
	) (block.FetchBlockResult, error)

	// FetchBlocks returns the buffered data for the given block starts.
	FetchBlocks(
		ctx context.Context,
		starts []xtime.UnixNano,
		nsCtx namespace.Context,
	) []block.FetchBlockResult

	// FetchBlocksMetadata returns metadata (sizes, checksums, last read)
	// for buffered blocks overlapping [start, end).
	FetchBlocksMetadata(
		ctx context.Context,
		start, end xtime.UnixNano,
		opts FetchBlocksMetadataOptions,
	) (block.FetchBlockMetadataResults, error)

	// IsEmpty returns whether the buffer holds any buckets at all.
	IsEmpty() bool

	// MarkNonEmptyBlocks adds to the given set every buffered block start
	// that has non-empty streams.
	MarkNonEmptyBlocks(nonEmptyBlockStarts map[xtime.UnixNano]struct{})

	// ColdFlushBlockStarts returns the block starts that have cold data
	// needing to be flushed, given the current shard block states.
	ColdFlushBlockStarts(blockStates map[xtime.UnixNano]BlockState) OptimizedTimes

	// Stats returns current buffer statistics.
	Stats() bufferStats

	// Tick performs periodic maintenance: evicts flushed buckets and merges
	// out-of-order encoders.
	Tick(versions ShardBlockStateSnapshot, nsCtx namespace.Context) bufferTickResult

	// Load loads an existing database block into the buffer under the given
	// write type.
	Load(bl block.DatabaseBlock, writeType WriteType)

	// Reset resets the buffer for reuse with the given options.
	Reset(opts databaseBufferResetOptions)
}
   148  
// databaseBufferResetOptions are the options used to reset a databaseBuffer
// for (re)use.
type databaseBufferResetOptions struct {
	// BlockRetriever is consulted to determine whether blocks are already
	// retrievable (used when classifying bootstrap writes).
	BlockRetriever QueryableBlockRetriever
	// Options are the series options to configure the buffer with.
	Options        Options
}
   153  
// bufferStats are point-in-time statistics about a databaseBuffer.
type bufferStats struct {
	// wiredBlocks is the number of block starts with in-memory buckets.
	wiredBlocks int
}
   157  
// bufferTickResult is the result of a buffer Tick.
type bufferTickResult struct {
	// mergedOutOfOrderBlocks is the number of blocks that had out-of-order
	// encoders merged this tick.
	mergedOutOfOrderBlocks int
	// evictedBucketTimes are the block starts whose buckets were evicted
	// from the buffer this tick.
	evictedBucketTimes     OptimizedTimes
}
   162  
// OptimizedTimes is a struct that holds an unknown number of times. This is
// used to avoid heap allocations as much as possible by trying to not allocate
// a slice of times. To do this, `optimizedTimesArraySize` needs to be
// strategically sized such that for the vast majority of the time, the internal
// array can hold all the times required so that `slice` is nil.
//
// OptimizedTimes should only be interacted with via its helper functions - its
// fields should never be accessed or modified directly, which could cause an
// invalid state.
type OptimizedTimes struct {
	// arrIdx is the number of valid entries currently stored in arr.
	arrIdx int
	// arr holds the first optimizedTimesArraySize times without allocating.
	arr    [optimizedTimesArraySize]xtime.UnixNano
	// slice holds any overflow once arr is full; nil in the common case.
	slice  []xtime.UnixNano
}
   177  
   178  // Add adds a time to this OptimizedTimes.
   179  func (t *OptimizedTimes) Add(newTime xtime.UnixNano) {
   180  	if t.arrIdx < cap(t.arr) {
   181  		t.arr[t.arrIdx] = newTime
   182  		t.arrIdx++
   183  	} else {
   184  		t.slice = append(t.slice, newTime)
   185  	}
   186  }
   187  
   188  // Len returns the number of times in this OptimizedTimes.
   189  func (t *OptimizedTimes) Len() int {
   190  	return t.arrIdx + len(t.slice)
   191  }
   192  
   193  // Contains returns whether the target time is in this OptimizedTimes.
   194  func (t *OptimizedTimes) Contains(target xtime.UnixNano) bool {
   195  	for i := 0; i < t.arrIdx; i++ {
   196  		if t.arr[i].Equal(target) {
   197  			return true
   198  		}
   199  	}
   200  	for _, tt := range t.slice {
   201  		if tt.Equal(target) {
   202  			return true
   203  		}
   204  	}
   205  	return false
   206  }
   207  
   208  // ForEach runs the given function for each time in this OptimizedTimes.
   209  func (t *OptimizedTimes) ForEach(fn func(t xtime.UnixNano)) {
   210  	for i, tNano := range t.arr {
   211  		if i >= t.arrIdx {
   212  			break
   213  		}
   214  		fn(tNano)
   215  	}
   216  	for _, tNano := range t.slice {
   217  		fn(tNano)
   218  	}
   219  }
   220  
// dbBuffer is the default databaseBuffer implementation, holding buffered
// series data in per-block-start bucket versions.
type dbBuffer struct {
	opts  Options
	nowFn clock.NowFn

	// bucketsMap is a map from a block start to its corresponding bucket
	// versions.
	bucketsMap map[xtime.UnixNano]*BufferBucketVersions
	// Cache of buckets to avoid map lookup of above.
	bucketVersionsCache [bucketsCacheSize]*BufferBucketVersions
	// This is an in order slice of the block starts in the bucketsMap.
	// We maintain this to avoid sorting the map keys adhoc when we want to
	// perform operations in chronological order.
	inOrderBlockStarts []xtime.UnixNano
	bucketVersionsPool *BufferBucketVersionsPool
	bucketPool         *BufferBucketPool
	// blockRetriever answers whether a block is already retrievable from
	// disk; used to classify bootstrap writes as warm or cold.
	blockRetriever QueryableBlockRetriever
}
   238  
   239  // NB(prateek): databaseBuffer.Reset(...) must be called upon the returned
   240  // object prior to use.
   241  func newDatabaseBuffer() databaseBuffer {
   242  	b := &dbBuffer{
   243  		bucketsMap:         make(map[xtime.UnixNano]*BufferBucketVersions),
   244  		inOrderBlockStarts: make([]xtime.UnixNano, 0, bucketsCacheSize),
   245  	}
   246  	return b
   247  }
   248  
   249  func (b *dbBuffer) Reset(opts databaseBufferResetOptions) {
   250  	b.opts = opts.Options
   251  	b.nowFn = opts.Options.ClockOptions().NowFn()
   252  	b.bucketPool = opts.Options.BufferBucketPool()
   253  	b.bucketVersionsPool = opts.Options.BufferBucketVersionsPool()
   254  	b.blockRetriever = opts.BlockRetriever
   255  }
   256  
   257  func (b *dbBuffer) MoveTo(
   258  	buffer databaseBuffer,
   259  	nsCtx namespace.Context,
   260  ) error {
   261  	blockSize := b.opts.RetentionOptions().BlockSize()
   262  	for _, buckets := range b.bucketsMap {
   263  		for _, bucket := range buckets.buckets {
   264  			// Load any existing blocks.
   265  			for _, block := range bucket.loadedBlocks {
   266  				// Load block.
   267  				buffer.Load(block, bucket.writeType)
   268  			}
   269  
   270  			// Load encoders.
   271  			for _, elem := range bucket.encoders {
   272  				if elem.encoder.Len() == 0 {
   273  					// No data.
   274  					continue
   275  				}
   276  				// Take ownership of the encoder.
   277  				segment := elem.encoder.Discard()
   278  				// Create block and load into new buffer.
   279  				block := b.opts.DatabaseBlockOptions().DatabaseBlockPool().Get()
   280  				block.Reset(bucket.start, blockSize, segment, nsCtx)
   281  				// Load block.
   282  				buffer.Load(block, bucket.writeType)
   283  			}
   284  		}
   285  	}
   286  
   287  	return nil
   288  }
   289  
// Write writes a datapoint into the buffer. The write is classified as warm
// or cold: bootstrap writes are cold iff the target block is already
// retrievable; otherwise writes outside the [pastLimit, futureLimit) window
// are cold (rejected entirely if cold writes are disabled). Cold writes are
// additionally validated against the retention period. Returns whether the
// datapoint was written, the resolved write type, and any error.
func (b *dbBuffer) Write(
	ctx context.Context,
	id ident.ID,
	timestamp xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
	wOpts WriteOptions,
) (bool, WriteType, error) {
	var (
		ropts        = b.opts.RetentionOptions()
		bufferPast   = ropts.BufferPast()
		bufferFuture = ropts.BufferFuture()
		now          = xtime.ToUnixNano(b.nowFn())
		// Limits are truncated to second precision before comparison.
		pastLimit   = now.Add(-1 * bufferPast).Truncate(time.Second)
		futureLimit = now.Add(bufferFuture).Truncate(time.Second)
		blockSize   = ropts.BlockSize()
		blockStart  = timestamp.Truncate(blockSize)
		writeType   WriteType
	)

	// Classify the write as warm or cold.
	switch {
	case wOpts.BootstrapWrite:
		exists, err := b.blockRetriever.IsBlockRetrievable(blockStart)
		if err != nil {
			return false, writeType, err
		}
		// Bootstrap writes are allowed to be outside of time boundaries
		// and determined as cold or warm writes depending on whether
		// the block is retrievable or not.
		if !exists {
			writeType = WarmWrite
		} else {
			writeType = ColdWrite
		}

	case timestamp.Before(pastLimit):
		writeType = ColdWrite
		if !b.opts.ColdWritesEnabled() {
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in past: "+
					"id=%s, off_by=%s, timestamp=%s, past_limit=%s, "+
					"timestamp_unix_nanos=%d, past_limit_unix_nanos=%d",
					id.Bytes(), pastLimit.Sub(timestamp).String(),
					timestamp.Format(errTimestampFormat),
					pastLimit.Format(errTimestampFormat),
					timestamp, pastLimit))
		}

	case !futureLimit.After(timestamp):
		writeType = ColdWrite
		if !b.opts.ColdWritesEnabled() {
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in future: "+
					"id=%s, off_by=%s, timestamp=%s, future_limit=%s, "+
					"timestamp_unix_nanos=%d, future_limit_unix_nanos=%d",
					id.Bytes(), timestamp.Sub(futureLimit).String(),
					timestamp.Format(errTimestampFormat),
					futureLimit.Format(errTimestampFormat),
					timestamp, futureLimit))
		}

	default:
		writeType = WarmWrite

	}

	// Cold writes must additionally fall inside the retention window
	// (optionally skipped rather than failed via SkipOutOfRetention).
	if writeType == ColdWrite {
		retentionLimit := now.Add(-ropts.RetentionPeriod())
		if wOpts.BootstrapWrite {
			// NB(r): Allow bootstrapping to write to blocks that are
			// still in retention.
			retentionLimit = retentionLimit.Truncate(blockSize)
		}
		if retentionLimit.After(timestamp) {
			if wOpts.SkipOutOfRetention {
				// Allow for datapoint to be skipped since caller does not
				// want writes out of retention to fail.
				return false, writeType, nil
			}
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in past and out of retention: "+
					"id=%s, off_by=%s, timestamp=%s, retention_past_limit=%s, "+
					"timestamp_unix_nanos=%d, retention_past_limit_unix_nanos=%d",
					id.Bytes(), retentionLimit.Sub(timestamp).String(),
					timestamp.Format(errTimestampFormat),
					retentionLimit.Format(errTimestampFormat),
					timestamp, retentionLimit))
		}

		futureRetentionLimit := now.Add(ropts.FutureRetentionPeriod())
		if !futureRetentionLimit.After(timestamp) {
			if wOpts.SkipOutOfRetention {
				// Allow for datapoint to be skipped since caller does not
				// want writes out of retention to fail.
				return false, writeType, nil
			}
			return false, writeType, xerrors.NewInvalidParamsError(
				fmt.Errorf("datapoint too far in future and out of retention: "+
					"id=%s, off_by=%s, timestamp=%s, retention_future_limit=%s, "+
					"timestamp_unix_nanos=%d, retention_future_limit_unix_nanos=%d",
					id.Bytes(), timestamp.Sub(futureRetentionLimit).String(),
					timestamp.Format(errTimestampFormat),
					futureRetentionLimit.Format(errTimestampFormat),
					timestamp, futureRetentionLimit))
		}

		b.opts.Stats().IncColdWrites()
	}

	// Resolve (creating if necessary) the bucket versions for this block
	// start and cache them for subsequent lookups.
	buckets := b.bucketVersionsAtCreate(blockStart)
	b.putBucketVersionsInCache(buckets)

	if wOpts.TruncateType == TypeBlock {
		timestamp = blockStart
	}

	if wOpts.TransformOptions.ForceValueEnabled {
		value = wOpts.TransformOptions.ForceValue
	}

	ok, err := buckets.write(timestamp, value, unit, annotation, writeType, wOpts.SchemaDesc)
	return ok, writeType, err
}
   414  
   415  func (b *dbBuffer) IsEmpty() bool {
   416  	// A buffer can only be empty if there are no buckets in its map, since
   417  	// buckets are only created when a write for a new block start is done, and
   418  	// buckets are removed from the map when they are evicted from memory.
   419  	return len(b.bucketsMap) == 0
   420  }
   421  
   422  func (b *dbBuffer) MarkNonEmptyBlocks(nonEmptyBlockStarts map[xtime.UnixNano]struct{}) {
   423  	for blockStart, bv := range b.bucketsMap {
   424  		if _, ok := nonEmptyBlockStarts[blockStart]; !ok {
   425  			if !bv.streamsEmpty() {
   426  				nonEmptyBlockStarts[blockStart] = struct{}{}
   427  			}
   428  		}
   429  	}
   430  }
   431  
// ColdFlushBlockStarts returns the set of block starts that have cold-write
// buckets needing a (re-)flush, given the successfully flushed versions in
// blockStates.
func (b *dbBuffer) ColdFlushBlockStarts(blockStates map[xtime.UnixNano]BlockState) OptimizedTimes {
	var times OptimizedTimes

	for t, bucketVersions := range b.bucketsMap {
		for _, bucket := range bucketVersions.buckets {
			if bucket.writeType == ColdWrite &&
				// We need to cold flush this bucket if it either:
				// 1) Has new cold writes that need to be flushed, or
				// 2) This bucket version is higher than what has been
				//    successfully flushed. This can happen if a cold flush was
				//    attempted, changing this bucket version, but fails to
				//    completely finish (which is what the shard block state
				//    signifies). In this case, we need to try to flush this
				//    bucket again.
				(bucket.version == writableBucketVersion ||
					blockStates[bucket.start].ColdVersion < bucket.version) {
				// One matching bucket is enough to require a flush for this
				// block start; no need to inspect the rest.
				times.Add(t)
				break
			}
		}
	}

	return times
}
   456  
   457  func (b *dbBuffer) Stats() bufferStats {
   458  	return bufferStats{
   459  		wiredBlocks: len(b.bucketsMap),
   460  	}
   461  }
   462  
// Tick performs periodic maintenance for each buffered block start: it
// removes buckets whose data has been successfully persisted (per the given
// block state snapshot, only when that state is bootstrapped), evicts block
// starts whose buckets are all drained, and merges duplicate warm-write
// encoders in the remaining buckets to reclaim memory. It returns the number
// of blocks with merged out-of-order encoders and the evicted bucket times.
func (b *dbBuffer) Tick(blockStates ShardBlockStateSnapshot, nsCtx namespace.Context) bufferTickResult {
	mergedOutOfOrder := 0
	var evictedBucketTimes OptimizedTimes
	for tNano, buckets := range b.bucketsMap {
		// The blockStates map is never written to after creation, so this
		// read access is safe. Since this version map is a snapshot of the
		// versions, the real block flush versions may be higher. This is okay
		// here because it's safe to:
		// 1) not remove a bucket that's actually retrievable, or
		// 2) remove a lower versioned bucket.
		// Retrievable and higher versioned buckets will be left to be
		// collected in the next tick.
		blockStateSnapshot, bootstrapped := blockStates.UnwrapValue()
		// Only use block state snapshot information to make eviction decisions if the block state
		// has been properly bootstrapped already.
		if bootstrapped {
			blockState := blockStateSnapshot.Snapshot[tNano]
			if coldVersion := blockState.ColdVersion; blockState.WarmRetrievable || coldVersion > 0 {
				if blockState.WarmRetrievable {
					// Buckets for WarmWrites that are retrievable will only be version 1, since
					// they only get successfully persisted once.
					buckets.removeBucketsUpToVersion(WarmWrite, 1)
				}
				if coldVersion > 0 {
					buckets.removeBucketsUpToVersion(ColdWrite, coldVersion)
				}

				if buckets.streamsLen() == 0 {
					// All underlying buckets have been flushed successfully, so we can
					// just remove the buckets from the bucketsMap.
					b.removeBucketVersionsAt(tNano)
					// Pass which bucket got evicted from the buffer to the series.
					// Data gets read in order of precedence: buffer -> cache -> disk.
					// After a bucket gets removed from the buffer, data from the cache
					// will be served. However, since data just got persisted to disk,
					// the cached block is now stale. To correct this, we can either:
					// 1) evict the stale block from cache so that new data will
					//    be retrieved from disk, or
					// 2) merge the new data into the cached block.
					// It's unclear whether recently flushed data would frequently be
					// read soon afterward, so we're choosing (1) here, since it has a
					// simpler implementation (just removing from a map).
					evictedBucketTimes.Add(tNano)
					continue
				}
			}
		}

		buckets.recordActiveEncoders()

		// Once we've evicted all eligible buckets, we merge duplicate encoders
		// in the remaining ones to try and reclaim memory.
		merges, err := buckets.merge(WarmWrite, nsCtx)
		if err != nil {
			// Merge failures are logged rather than propagated; the tick
			// continues with the remaining block starts.
			log := b.opts.InstrumentOptions().Logger()
			log.Error("buffer merge encode error", zap.Error(err))
		}
		if merges > 0 {
			mergedOutOfOrder++
		}
	}
	return bufferTickResult{
		mergedOutOfOrderBlocks: mergedOutOfOrder,
		evictedBucketTimes:     evictedBucketTimes,
	}
}
   529  
   530  func (b *dbBuffer) Load(bl block.DatabaseBlock, writeType WriteType) {
   531  	var (
   532  		blockStart = bl.StartTime()
   533  		buckets    = b.bucketVersionsAtCreate(blockStart)
   534  		bucket     = buckets.writableBucketCreate(writeType)
   535  	)
   536  	bucket.loadedBlocks = append(bucket.loadedBlocks, bl)
   537  }
   538  
// Snapshot merges all data (both warm and cold) for the given block start
// into a single segment and persists it via persistFn. Timings for each
// phase (per-bucket merge, cross-bucket merge, checksum, persist) are
// recorded in the returned SnapshotResult. Series with no data are skipped
// without error.
func (b *dbBuffer) Snapshot(
	ctx context.Context,
	blockStart xtime.UnixNano,
	metadata persist.Metadata,
	persistFn persist.DataFn,
	nsCtx namespace.Context,
) (SnapshotResult, error) {
	var (
		start  = b.nowFn()
		result SnapshotResult
	)

	buckets, exists := b.bucketVersionsAt(blockStart)
	if !exists {
		// Nothing buffered for this block start.
		return result, nil
	}

	// Snapshot must take both cold and warm writes because cold flushes don't
	// happen for the current block (since cold flushes can't happen before a
	// warm flush has happened).
	streams, err := buckets.mergeToStreams(ctx, streamsOptions{filterWriteType: false, nsCtx: nsCtx})
	if err != nil {
		return result, err
	}

	afterMergeByBucket := b.nowFn()
	result.Stats.TimeMergeByBucket = afterMergeByBucket.Sub(start)

	var (
		numStreams         = len(streams)
		mergeAcrossBuckets = numStreams != 1
		segment            ts.Segment
	)
	if !mergeAcrossBuckets {
		segment, err = streams[0].Segment()
		if err != nil {
			return result, err
		}
	} else {
		// We may need to merge again here because the regular merge method does
		// not merge warm and cold buckets or buckets that have different versions.
		sr := make([]xio.SegmentReader, 0, numStreams)
		for _, stream := range streams {
			sr = append(sr, stream)
		}

		bopts := b.opts.DatabaseBlockOptions()
		encoder := bopts.EncoderPool().Get()
		encoder.Reset(blockStart, bopts.DatabaseBlockAllocSize(), nsCtx.Schema)
		iter := b.opts.MultiReaderIteratorPool().Get()
		// Track whether ownership of the encoder's buffer was transferred
		// via Discard so we don't double-free it in the defer.
		var encoderClosed bool
		defer func() {
			if !encoderClosed {
				encoder.Close()
			}
			iter.Close()
		}()
		iter.Reset(sr, blockStart, b.opts.RetentionOptions().BlockSize(), nsCtx.Schema)

		// Re-encode all datapoints from the merged streams into one segment.
		for iter.Next() {
			dp, unit, annotation := iter.Current()
			if err := encoder.Encode(dp, unit, annotation); err != nil {
				return result, err
			}
		}
		if err := iter.Err(); err != nil {
			return result, err
		}

		// Discard transfers ownership of the buffer to segment; finalize it
		// once persisted (or on early return below).
		segment = encoder.Discard()
		defer segment.Finalize()
		encoderClosed = true
	}

	afterMergeAcrossBuckets := b.nowFn()
	result.Stats.TimeMergeAcrossBuckets = afterMergeAcrossBuckets.Sub(afterMergeByBucket)

	if segment.Len() == 0 {
		// Don't write out series with no data.
		return result, nil
	}

	checksum := segment.CalculateChecksum()

	afterChecksum := b.nowFn()
	result.Stats.TimeChecksum = afterChecksum.Sub(afterMergeAcrossBuckets)

	if err := persistFn(metadata, segment, checksum); err != nil {
		return result, err
	}

	result.Stats.TimePersist = b.nowFn().Sub(afterChecksum)

	result.Persist = true
	return result, nil
}
   635  
// WarmFlush merges and persists the warm-write data for the given block
// start via persistFn. Cold writes are intentionally excluded — they are
// persisted by the cold flush (compaction) cycle. On success the writable
// warm bucket's version is set to 1 so a later Tick can evict it once the
// flush is visible in the shard block state.
func (b *dbBuffer) WarmFlush(
	ctx context.Context,
	blockStart xtime.UnixNano,
	metadata persist.Metadata,
	persistFn persist.DataFn,
	nsCtx namespace.Context,
) (FlushOutcome, error) {
	buckets, exists := b.bucketVersionsAt(blockStart)
	if !exists {
		return FlushOutcomeBlockDoesNotExist, nil
	}

	// Flush only deals with WarmWrites. ColdWrites get persisted to disk via
	// the compaction cycle.
	streams, err := buckets.mergeToStreams(ctx, streamsOptions{filterWriteType: true, writeType: WarmWrite, nsCtx: nsCtx})
	if err != nil {
		return FlushOutcomeErr, err
	}

	var (
		stream xio.SegmentReader
		ok     bool
	)
	if numStreams := len(streams); numStreams == 1 {
		stream = streams[0]
		ok = true
	} else {
		// In the majority of cases, there will only be one stream to persist
		// here. Only when a previous flush fails midway through a shard will
		// there be buckets for previous versions. In this case, we need to try
		// to flush them again, so we merge them together to one stream and
		// persist it.
		encoder, _, err := mergeStreamsToEncoder(blockStart, streams, b.opts, nsCtx)
		if err != nil {
			return FlushOutcomeErr, err
		}

		stream, ok = encoder.Stream(ctx)
		encoder.Close()
	}

	if !ok {
		// Don't write out series with no data.
		return FlushOutcomeBlockDoesNotExist, nil
	}

	segment, err := stream.Segment()
	if err != nil {
		return FlushOutcomeErr, err
	}

	if segment.Len() == 0 {
		// Empty segment is equivalent to no stream, i.e data does not exist.
		return FlushOutcomeBlockDoesNotExist, nil
	}

	checksum := segment.CalculateChecksum()
	err = persistFn(metadata, segment, checksum)
	if err != nil {
		return FlushOutcomeErr, err
	}

	if bucket, exists := buckets.writableBucket(WarmWrite); exists {
		// WarmFlushes only happen once per block, so it makes sense to always
		// set this to 1.
		bucket.version = 1
	}

	return FlushOutcomeFlushedToDisk, nil
}
   706  
// ReadEncoded returns, for each buffered block start overlapping [start, end),
// the block readers for that block's streams, in chronological order. As a
// side effect it records the last-read time on each matched bucket version.
// It returns an invariant-violation error if the sorted block start cache is
// out of sync with the buckets map.
func (b *dbBuffer) ReadEncoded(
	ctx context.Context,
	start xtime.UnixNano,
	end xtime.UnixNano,
	nsCtx namespace.Context,
) ([][]xio.BlockReader, error) {
	var (
		blockSize = b.opts.RetentionOptions().BlockSize()
		// TODO(r): pool these results arrays
		res [][]xio.BlockReader
	)

	// inOrderBlockStarts is maintained sorted so results come back in
	// chronological order without sorting here.
	for _, blockStart := range b.inOrderBlockStarts {
		blockStart := blockStart
		// Skip blocks that don't overlap the requested [start, end) range.
		if !blockStart.Before(end) || !start.Before(blockStart.Add(blockSize)) {
			continue
		}

		bv, exists := b.bucketVersionsAt(blockStart)
		if !exists {
			// Invariant violated. This means the keys in the bucket map does
			// not match the sorted keys cache, which should never happen.
			instrument.EmitAndLogInvariantViolation(
				b.opts.InstrumentOptions(), func(l *zap.Logger) {
					l.Error(errBucketMapCacheNotInSync, zap.Int64("blockStart", int64(blockStart)))
				})
			return nil, instrument.InvariantErrorf(
				errBucketMapCacheNotInSyncFmt, blockStart)
		}

		if streams := bv.streams(ctx, streamsOptions{filterWriteType: false}); len(streams) > 0 {
			res = append(res, streams)
		}

		// NB(r): Store the last read time, should not set this when
		// calling FetchBlocks as a read is differentiated from
		// a FetchBlocks call. One is initiated by an external
		// entity and the other is used for streaming blocks between
		// the storage nodes. This distinction is important as this
		// data is important for use with understanding access patterns, etc.
		bv.setLastRead(b.nowFn())
	}

	return res, nil
}
   752  
// FetchBlocksForColdFlush returns the cold-write data for the given block
// start and stamps the writable cold bucket with the given version, which
// marks it as flush-attempted (no longer writable). The bucket is only
// removed from memory by a later Tick once the shard-level flush succeeds.
func (b *dbBuffer) FetchBlocksForColdFlush(
	ctx context.Context,
	start xtime.UnixNano,
	version int,
	nsCtx namespace.Context,
) (block.FetchBlockResult, error) {
	res := b.fetchBlocks(ctx, []xtime.UnixNano{start},
		streamsOptions{filterWriteType: true, writeType: ColdWrite, nsCtx: nsCtx})
	if len(res) == 0 {
		// The lifecycle of calling this function is preceded by first checking
		// which blocks have cold data that have not yet been flushed.
		// If we don't get data here, it means that it has since fallen out of
		// retention and has been evicted.
		return block.FetchBlockResult{}, nil
	}
	if len(res) != 1 {
		// Must be only one result if anything at all, since fetchBlocks returns
		// one result per block start.
		return block.FetchBlockResult{}, fmt.Errorf("fetchBlocks did not return just one block for block start %s", start)
	}

	result := res[0]

	buckets, exists := b.bucketVersionsAt(start)
	if !exists {
		return block.FetchBlockResult{}, fmt.Errorf("buckets do not exist with block start %s", start)
	}
	if bucket, exists := buckets.writableBucket(ColdWrite); exists {
		// Update the version of the writable bucket (effectively making it not
		// writable). This marks this bucket as attempted to be flushed,
		// although it is only actually written to disk successfully at the
		// shard level after every series has completed the flush process.
		// The tick following a successful flush to disk will remove this bucket
		// from memory.
		bucket.version = version
	}
	// No-op if the writable bucket doesn't exist.
	// This function should only get called for blocks that we know need to be
	// cold flushed. However, buckets that get attempted to be cold flushed and
	// fail need to get cold flushed as well. These kinds of buckets will have
	// a non-writable version.

	return result, nil
}
   797  
   798  func (b *dbBuffer) FetchBlocks(
   799  	ctx context.Context,
   800  	starts []xtime.UnixNano,
   801  	nsCtx namespace.Context,
   802  ) []block.FetchBlockResult {
   803  	return b.fetchBlocks(ctx, starts, streamsOptions{filterWriteType: false, nsCtx: nsCtx})
   804  }
   805  
   806  func (b *dbBuffer) fetchBlocks(
   807  	ctx context.Context,
   808  	starts []xtime.UnixNano,
   809  	sOpts streamsOptions,
   810  ) []block.FetchBlockResult {
   811  	var res []block.FetchBlockResult
   812  
   813  	for _, start := range starts {
   814  		buckets, ok := b.bucketVersionsAt(start)
   815  		if !ok {
   816  			continue
   817  		}
   818  
   819  		streams := buckets.streams(ctx, sOpts)
   820  		if len(streams) > 0 {
   821  			result := block.NewFetchBlockResult(
   822  				start,
   823  				streams,
   824  				nil,
   825  			)
   826  			result.FirstWrite = buckets.firstWrite(sOpts)
   827  			res = append(res, result)
   828  		}
   829  	}
   830  
   831  	// Result should be sorted in ascending order.
   832  	sort.Slice(res, func(i, j int) bool { return res[i].Start.Before(res[j].Start) })
   833  
   834  	return res
   835  }
   836  
// FetchBlocksMetadata returns metadata for every buffered block whose
// [blockStart, blockStart+blockSize) window overlaps the [start, end) query
// range, including size, last read time, and checksum per opts.
func (b *dbBuffer) FetchBlocksMetadata(
	ctx context.Context,
	start, end xtime.UnixNano,
	opts FetchBlocksMetadataOptions,
) (block.FetchBlockMetadataResults, error) {
	blockSize := b.opts.RetentionOptions().BlockSize()
	res := b.opts.FetchBlockMetadataResultsPool().Get()

	for _, blockStart := range b.inOrderBlockStarts {
		// Capture the range variable for use in the logging closure below.
		blockStart := blockStart
		// Skip blocks that do not overlap the [start, end) range.
		if !blockStart.Before(end) || !start.Before(blockStart.Add(blockSize)) {
			continue
		}

		bv, exists := b.bucketVersionsAt(blockStart)
		if !exists {
			// Invariant violated. This means the keys in the bucket map does
			// not match the sorted keys cache, which should never happen.
			instrument.EmitAndLogInvariantViolation(
				b.opts.InstrumentOptions(), func(l *zap.Logger) {
					l.Error(errBucketMapCacheNotInSync, zap.Int64("blockStart", int64(blockStart)))
				})
			return nil, instrument.InvariantErrorf(errBucketMapCacheNotInSyncFmt, blockStart)
		}

		size := int64(bv.streamsLen())
		// If we have no data in this bucket, skip early without appending it to the result.
		if size == 0 {
			continue
		}
		var resultSize int64
		if opts.IncludeSizes {
			resultSize = size
		}
		var resultLastRead xtime.UnixNano
		if opts.IncludeLastRead {
			resultLastRead = bv.lastRead()
		}

		var (
			checksum *uint32
			err      error
		)
		if opts.IncludeChecksums {
			// Checksum calculations are best effort since we can't calculate one if there
			// are multiple streams without performing an expensive merge.
			checksum, err = bv.checksumIfSingleStream(ctx)
			if err != nil {
				return nil, err
			}
		}
		res.Add(block.FetchBlockMetadataResult{
			Start:    bv.start,
			Size:     resultSize,
			LastRead: resultLastRead,
			Checksum: checksum,
		})
	}

	return res, nil
}
   898  
   899  func (b *dbBuffer) bucketVersionsAt(
   900  	t xtime.UnixNano,
   901  ) (*BufferBucketVersions, bool) {
   902  	// First check LRU cache.
   903  	for _, buckets := range b.bucketVersionsCache {
   904  		if buckets == nil {
   905  			continue
   906  		}
   907  		if buckets.start.Equal(t) {
   908  			return buckets, true
   909  		}
   910  	}
   911  
   912  	// Then check the map.
   913  	if buckets, exists := b.bucketsMap[t]; exists {
   914  		return buckets, true
   915  	}
   916  
   917  	return nil, false
   918  }
   919  
   920  func (b *dbBuffer) bucketVersionsAtCreate(
   921  	t xtime.UnixNano,
   922  ) *BufferBucketVersions {
   923  	if buckets, exists := b.bucketVersionsAt(t); exists {
   924  		return buckets
   925  	}
   926  
   927  	buckets := b.bucketVersionsPool.Get()
   928  	buckets.resetTo(t, b.opts, b.bucketPool)
   929  	b.bucketsMap[t] = buckets
   930  	b.inOrderBlockStartsAdd(t)
   931  
   932  	return buckets
   933  }
   934  
   935  func (b *dbBuffer) putBucketVersionsInCache(newBuckets *BufferBucketVersions) {
   936  	replaceIdx := bucketsCacheSize - 1
   937  	for i, buckets := range b.bucketVersionsCache {
   938  		// Check if we have the same pointer in cache.
   939  		if buckets == newBuckets {
   940  			replaceIdx = i
   941  		}
   942  	}
   943  
   944  	for i := replaceIdx; i > 0; i-- {
   945  		b.bucketVersionsCache[i] = b.bucketVersionsCache[i-1]
   946  	}
   947  
   948  	b.bucketVersionsCache[0] = newBuckets
   949  }
   950  
   951  func (b *dbBuffer) removeBucketVersionsInCache(oldBuckets *BufferBucketVersions) {
   952  	nilIdx := -1
   953  	for i, buckets := range b.bucketVersionsCache {
   954  		if buckets == oldBuckets {
   955  			nilIdx = i
   956  		}
   957  	}
   958  	if nilIdx == -1 {
   959  		return
   960  	}
   961  
   962  	for i := nilIdx; i < bucketsCacheSize-1; i++ {
   963  		b.bucketVersionsCache[i] = b.bucketVersionsCache[i+1]
   964  	}
   965  
   966  	b.bucketVersionsCache[bucketsCacheSize-1] = nil
   967  }
   968  
   969  func (b *dbBuffer) removeBucketVersionsAt(blockStart xtime.UnixNano) {
   970  	buckets, exists := b.bucketVersionsAt(blockStart)
   971  	if !exists {
   972  		return
   973  	}
   974  	delete(b.bucketsMap, blockStart)
   975  	b.removeBucketVersionsInCache(buckets)
   976  	b.inOrderBlockStartsRemove(blockStart)
   977  	// nil out pointers.
   978  	buckets.resetTo(0, nil, nil)
   979  	b.bucketVersionsPool.Put(buckets)
   980  }
   981  
   982  func (b *dbBuffer) inOrderBlockStartsAdd(newTime xtime.UnixNano) {
   983  	starts := b.inOrderBlockStarts
   984  	idx := len(starts)
   985  	// There shouldn't be that many starts here, so just linear search through.
   986  	for i, t := range starts {
   987  		if t.After(newTime) {
   988  			idx = i
   989  			break
   990  		}
   991  	}
   992  	// Insert new time without allocating new slice.
   993  	b.inOrderBlockStarts = append(starts, 0) //nolint
   994  	// Update to new slice
   995  	starts = b.inOrderBlockStarts
   996  	copy(starts[idx+1:], starts[idx:])
   997  	starts[idx] = newTime
   998  }
   999  
  1000  func (b *dbBuffer) inOrderBlockStartsRemove(removeTime xtime.UnixNano) {
  1001  	starts := b.inOrderBlockStarts
  1002  	// There shouldn't be that many starts here, so just linear search through.
  1003  	for i, t := range starts {
  1004  		if t.Equal(removeTime) {
  1005  			b.inOrderBlockStarts = append(starts[:i], starts[i+1:]...)
  1006  			return
  1007  		}
  1008  	}
  1009  }
  1010  
// BufferBucketVersions is a container for different versions of buffer buckets.
// Bucket versions are how the buffer separates writes that have been written
// to disk as a fileset and writes that have not. The bucket with a version of
// `writableBucketVersion` is the bucket that all writes go into (as thus is the
// bucket version that have not yet been persisted). After a bucket gets
// persisted, its version gets set to a version that the shard passes down to it
// (since the shard knows what has been fully persisted to disk).
type BufferBucketVersions struct {
	// buckets holds the bucket versions: the writable ones plus those whose
	// flush has already been attempted (non-writable versions).
	buckets []*BufferBucket
	// start is the block start time shared by all buckets in this container.
	start xtime.UnixNano
	opts  Options
	// lastReadUnixNanos is accessed atomically; see setLastRead/lastRead.
	lastReadUnixNanos int64
	bucketPool        *BufferBucketPool
}
  1025  
  1026  func (b *BufferBucketVersions) resetTo(
  1027  	start xtime.UnixNano,
  1028  	opts Options,
  1029  	bucketPool *BufferBucketPool,
  1030  ) {
  1031  	// nil all elements so that they get GC'd.
  1032  	for i := range b.buckets {
  1033  		b.buckets[i] = nil
  1034  	}
  1035  	b.buckets = b.buckets[:0]
  1036  	b.start = start
  1037  	b.opts = opts
  1038  	atomic.StoreInt64(&b.lastReadUnixNanos, 0)
  1039  	b.bucketPool = bucketPool
  1040  }
  1041  
  1042  // streams returns all the streams for this BufferBucketVersions.
  1043  func (b *BufferBucketVersions) streams(ctx context.Context, opts streamsOptions) []xio.BlockReader {
  1044  	var res []xio.BlockReader
  1045  	for _, bucket := range b.buckets {
  1046  		if opts.filterWriteType && bucket.writeType != opts.writeType {
  1047  			continue
  1048  		}
  1049  		res = append(res, bucket.streams(ctx)...)
  1050  	}
  1051  
  1052  	return res
  1053  }
  1054  
  1055  func (b *BufferBucketVersions) firstWrite(opts streamsOptions) xtime.UnixNano {
  1056  	var res xtime.UnixNano
  1057  	for _, bucket := range b.buckets {
  1058  		if opts.filterWriteType && bucket.writeType != opts.writeType {
  1059  			continue
  1060  		}
  1061  		// Get the earliest valid first write time.
  1062  		if res == 0 ||
  1063  			(bucket.firstWrite.Before(res) && bucket.firstWrite != 0) {
  1064  			res = bucket.firstWrite
  1065  		}
  1066  	}
  1067  	return res
  1068  }
  1069  
  1070  func (b *BufferBucketVersions) streamsEmpty() bool {
  1071  	for _, bucket := range b.buckets {
  1072  		if !bucket.streamsEmpty() {
  1073  			return false
  1074  		}
  1075  	}
  1076  	return true
  1077  }
  1078  
  1079  func (b *BufferBucketVersions) streamsLen() int {
  1080  	res := 0
  1081  	for _, bucket := range b.buckets {
  1082  		res += bucket.streamsLen()
  1083  	}
  1084  	return res
  1085  }
  1086  
  1087  func (b *BufferBucketVersions) checksumIfSingleStream(ctx context.Context) (*uint32, error) {
  1088  	if len(b.buckets) != 1 {
  1089  		return nil, nil
  1090  	}
  1091  	return b.buckets[0].checksumIfSingleStream(ctx)
  1092  }
  1093  
  1094  func (b *BufferBucketVersions) write(
  1095  	timestamp xtime.UnixNano,
  1096  	value float64,
  1097  	unit xtime.Unit,
  1098  	annotation []byte,
  1099  	writeType WriteType,
  1100  	schema namespace.SchemaDescr,
  1101  ) (bool, error) {
  1102  	return b.writableBucketCreate(writeType).write(timestamp, value, unit, annotation, schema)
  1103  }
  1104  
  1105  func (b *BufferBucketVersions) merge(writeType WriteType, nsCtx namespace.Context) (int, error) {
  1106  	res := 0
  1107  	for _, bucket := range b.buckets {
  1108  		// Only makes sense to merge buckets that are writable.
  1109  		if bucket.version == writableBucketVersion && writeType == bucket.writeType {
  1110  			merges, err := bucket.merge(nsCtx)
  1111  			if err != nil {
  1112  				return 0, err
  1113  			}
  1114  			res += merges
  1115  		}
  1116  	}
  1117  
  1118  	return res, nil
  1119  }
  1120  
  1121  func (b *BufferBucketVersions) removeBucketsUpToVersion(
  1122  	writeType WriteType,
  1123  	version int,
  1124  ) {
  1125  	// Avoid allocating a new backing array.
  1126  	nonEvictedBuckets := b.buckets[:0]
  1127  
  1128  	for _, bucket := range b.buckets {
  1129  		bVersion := bucket.version
  1130  		if bucket.writeType == writeType && bVersion != writableBucketVersion &&
  1131  			bVersion <= version {
  1132  			// We no longer need to keep any version which is equal to
  1133  			// or less than the retrievable version, since that means
  1134  			// that the version has successfully persisted to disk.
  1135  			// Bucket gets reset before use.
  1136  			b.bucketPool.Put(bucket)
  1137  			continue
  1138  		}
  1139  
  1140  		nonEvictedBuckets = append(nonEvictedBuckets, bucket)
  1141  	}
  1142  
  1143  	b.buckets = nonEvictedBuckets
  1144  }
  1145  
// setLastRead atomically records value as the most recent read time.
func (b *BufferBucketVersions) setLastRead(value time.Time) {
	atomic.StoreInt64(&b.lastReadUnixNanos, value.UnixNano())
}
  1149  
// lastRead atomically loads the most recent read time.
func (b *BufferBucketVersions) lastRead() xtime.UnixNano {
	return xtime.UnixNano(atomic.LoadInt64(&b.lastReadUnixNanos))
}
  1153  
  1154  func (b *BufferBucketVersions) writableBucket(writeType WriteType) (*BufferBucket, bool) {
  1155  	for _, bucket := range b.buckets {
  1156  		if bucket.version == writableBucketVersion && bucket.writeType == writeType {
  1157  			return bucket, true
  1158  		}
  1159  	}
  1160  
  1161  	return nil, false
  1162  }
  1163  
  1164  func (b *BufferBucketVersions) writableBucketCreate(writeType WriteType) *BufferBucket {
  1165  	bucket, exists := b.writableBucket(writeType)
  1166  
  1167  	if exists {
  1168  		return bucket
  1169  	}
  1170  
  1171  	newBucket := b.bucketPool.Get()
  1172  	newBucket.resetTo(b.start, writeType, b.opts)
  1173  	b.buckets = append(b.buckets, newBucket)
  1174  	return newBucket
  1175  }
  1176  
  1177  // mergeToStreams merges each buffer bucket version's streams into one, then
  1178  // returns a single stream for each buffer bucket version.
  1179  func (b *BufferBucketVersions) mergeToStreams(ctx context.Context, opts streamsOptions) ([]xio.SegmentReader, error) {
  1180  	buckets := b.buckets
  1181  	res := make([]xio.SegmentReader, 0, len(buckets))
  1182  
  1183  	for _, bucket := range buckets {
  1184  		if opts.filterWriteType && bucket.writeType != opts.writeType {
  1185  			continue
  1186  		}
  1187  		stream, ok, err := bucket.mergeToStream(ctx, opts.nsCtx)
  1188  		if err != nil {
  1189  			return nil, err
  1190  		}
  1191  		if !ok {
  1192  			continue
  1193  		}
  1194  		res = append(res, stream)
  1195  	}
  1196  
  1197  	return res, nil
  1198  }
  1199  
  1200  func (b *BufferBucketVersions) recordActiveEncoders() {
  1201  	var numActiveEncoders int
  1202  	for _, bucket := range b.buckets {
  1203  		if bucket.version == writableBucketVersion {
  1204  			numActiveEncoders += len(bucket.encoders)
  1205  		}
  1206  	}
  1207  	b.opts.Stats().RecordEncodersPerBlock(numActiveEncoders)
  1208  }
  1209  
// streamsOptions controls which buckets are visited when collecting or
// merging streams.
type streamsOptions struct {
	// filterWriteType, when true, restricts operations to buckets whose
	// write type equals writeType; when false all buckets are visited.
	filterWriteType bool
	writeType       WriteType
	nsCtx           namespace.Context
}
  1215  
// BufferBucket is a specific version of a bucket of encoders, which is where
// writes are ultimately stored before they are persisted to disk as a fileset.
// See comment for BufferBucketVersions for more detail on bucket versions.
type BufferBucket struct {
	opts  Options
	start xtime.UnixNano
	// encoders holds the in-order encoders receiving writes to this bucket.
	encoders []inOrderEncoder
	// loadedBlocks holds blocks loaded into the bucket rather than written
	// through its encoders.
	loadedBlocks []block.DatabaseBlock
	// version is writableBucketVersion while the bucket accepts writes; it
	// is set to the flush version once a flush of this bucket is attempted.
	version   int
	writeType WriteType
	// firstWrite is the wall-clock time of the first successful write.
	firstWrite xtime.UnixNano
}
  1228  
// inOrderEncoder pairs an encoder with the timestamp of the last datapoint
// written to it; within a bucket, each encoder only ever receives writes in
// ascending time order.
type inOrderEncoder struct {
	encoder     encoding.Encoder
	lastWriteAt xtime.UnixNano
}
  1233  
  1234  func (b *BufferBucket) resetTo(
  1235  	start xtime.UnixNano,
  1236  	writeType WriteType,
  1237  	opts Options,
  1238  ) {
  1239  	// Close the old context if we're resetting for use.
  1240  	b.reset()
  1241  	b.opts = opts
  1242  	b.start = start
  1243  	bopts := b.opts.DatabaseBlockOptions()
  1244  	encoder := bopts.EncoderPool().Get()
  1245  	encoder.Reset(start, bopts.DatabaseBlockAllocSize(), nil)
  1246  	b.encoders = append(b.encoders, inOrderEncoder{
  1247  		encoder: encoder,
  1248  	})
  1249  	b.loadedBlocks = nil
  1250  	// We would only ever create a bucket for it to be writable.
  1251  	b.version = writableBucketVersion
  1252  	b.writeType = writeType
  1253  	b.firstWrite = 0
  1254  }
  1255  
// reset releases all encoders and loaded blocks held by this bucket.
func (b *BufferBucket) reset() {
	b.resetEncoders()
	b.resetLoadedBlocks()
}
  1260  
// write encodes a single datapoint into this bucket and reports whether a
// value was actually written. Rewriting the same timestamp with identical
// value and annotation is a no-op; a same-timestamp write with different
// data goes into a later encoder so that last-write-wins on read.
func (b *BufferBucket) write(
	timestamp xtime.UnixNano,
	value float64,
	unit xtime.Unit,
	annotation []byte,
	schema namespace.SchemaDescr,
) (bool, error) {
	datapoint := ts.Datapoint{
		TimestampNanos: timestamp,
		Value:          value,
	}

	// Find the correct encoder to write to
	idx := -1
	for i := range b.encoders {
		lastWriteAt := b.encoders[i].lastWriteAt
		if timestamp.Equal(lastWriteAt) {
			lastDatapoint, err := b.encoders[i].encoder.LastEncoded()
			if err != nil {
				return false, err
			}
			lastAnnotationChecksum, err := b.encoders[i].encoder.LastAnnotationChecksum()
			if err != nil {
				return false, err
			}

			if lastDatapoint.Value == value && lastAnnotationChecksum == xxhash.Sum64(annotation) {
				// No-op since matches the current value. Propagates up to callers that
				// no value was written.
				return false, nil
			}
			// Same timestamp, different data: this encoder cannot take the
			// write, so keep looking at later encoders.
			continue
		}

		if timestamp.After(lastWriteAt) {
			idx = i
			break
		}
	}

	var err error
	defer func() {
		nowFn := b.opts.ClockOptions().NowFn()
		// Record the wall-clock time of the first successful write.
		if err == nil && b.firstWrite == 0 {
			b.firstWrite = xtime.ToUnixNano(nowFn())
		}
	}()

	// Upsert/last-write-wins semantics.
	// NB(r): We push datapoints with the same timestamp but differing
	// value into a new encoder later in the stack of in order encoders
	// since an encoder is immutable.
	// The encoders pushed later will surface their values first.
	if idx != -1 {
		err = b.writeToEncoderIndex(idx, datapoint, unit, annotation, schema)
		return err == nil, err
	}

	// Need a new encoder, we didn't find an encoder to write to.
	maxEncoders := b.opts.RuntimeOptionsManager().Get().EncodersPerBlockLimit()
	if maxEncoders != 0 && len(b.encoders) >= int(maxEncoders) {
		b.opts.Stats().IncEncoderLimitWriteRejected()
		return false, errTooManyEncoders
	}

	b.opts.Stats().IncCreatedEncoders()
	bopts := b.opts.DatabaseBlockOptions()
	blockSize := b.opts.RetentionOptions().BlockSize()
	blockAllocSize := bopts.DatabaseBlockAllocSize()

	encoder := b.opts.EncoderPool().Get()
	encoder.Reset(timestamp.Truncate(blockSize), blockAllocSize, schema)

	b.encoders = append(b.encoders, inOrderEncoder{
		encoder:     encoder,
		lastWriteAt: timestamp,
	})

	idx = len(b.encoders) - 1
	err = b.writeToEncoderIndex(idx, datapoint, unit, annotation, schema)
	if err != nil {
		// Roll back: close the fresh encoder and drop it from the slice.
		encoder.Close()
		b.encoders = b.encoders[:idx]
		return false, err
	}
	return true, nil
}
  1348  
  1349  func (b *BufferBucket) writeToEncoderIndex(
  1350  	idx int,
  1351  	datapoint ts.Datapoint,
  1352  	unit xtime.Unit,
  1353  	annotation []byte,
  1354  	schema namespace.SchemaDescr,
  1355  ) error {
  1356  	b.encoders[idx].encoder.SetSchema(schema)
  1357  	err := b.encoders[idx].encoder.Encode(datapoint, unit, annotation)
  1358  	if err != nil {
  1359  		return err
  1360  	}
  1361  
  1362  	b.encoders[idx].lastWriteAt = datapoint.TimestampNanos
  1363  	return nil
  1364  }
  1365  
  1366  func (b *BufferBucket) streams(ctx context.Context) []xio.BlockReader {
  1367  	streams := make([]xio.BlockReader, 0, len(b.loadedBlocks)+len(b.encoders))
  1368  	for _, bl := range b.loadedBlocks {
  1369  		if bl.Len() == 0 {
  1370  			continue
  1371  		}
  1372  		if s, err := bl.Stream(ctx); err == nil && s.IsNotEmpty() {
  1373  			// NB(r): block stream method will register the stream closer already
  1374  			streams = append(streams, s)
  1375  		}
  1376  	}
  1377  	for i := range b.encoders {
  1378  		start := b.start
  1379  		if s, ok := b.encoders[i].encoder.Stream(ctx); ok {
  1380  			br := xio.BlockReader{
  1381  				SegmentReader: s,
  1382  				Start:         start,
  1383  				BlockSize:     b.opts.RetentionOptions().BlockSize(),
  1384  			}
  1385  			ctx.RegisterFinalizer(s)
  1386  			streams = append(streams, br)
  1387  		}
  1388  	}
  1389  
  1390  	return streams
  1391  }
  1392  
  1393  func (b *BufferBucket) streamsEmpty() bool {
  1394  	for i := range b.loadedBlocks {
  1395  		if !b.loadedBlocks[i].Empty() {
  1396  			return false
  1397  		}
  1398  	}
  1399  	for i := range b.encoders {
  1400  		if !b.encoders[i].encoder.Empty() {
  1401  			return false
  1402  		}
  1403  	}
  1404  	return true
  1405  }
  1406  
  1407  func (b *BufferBucket) streamsLen() int {
  1408  	length := 0
  1409  	for i := range b.loadedBlocks {
  1410  		length += b.loadedBlocks[i].Len()
  1411  	}
  1412  	for i := range b.encoders {
  1413  		length += b.encoders[i].encoder.Len()
  1414  	}
  1415  	return length
  1416  }
  1417  
  1418  func (b *BufferBucket) checksumIfSingleStream(ctx context.Context) (*uint32, error) {
  1419  	if b.hasJustSingleEncoder() {
  1420  		enc := b.encoders[0].encoder
  1421  		stream, ok := enc.Stream(ctx)
  1422  		if !ok {
  1423  			return nil, nil
  1424  		}
  1425  
  1426  		segment, err := stream.Segment()
  1427  		if err != nil {
  1428  			return nil, err
  1429  		}
  1430  
  1431  		if segment.Len() == 0 {
  1432  			return nil, nil
  1433  		}
  1434  
  1435  		checksum := segment.CalculateChecksum()
  1436  		return &checksum, nil
  1437  	}
  1438  
  1439  	if b.hasJustSingleLoadedBlock() {
  1440  		checksum, err := b.loadedBlocks[0].Checksum()
  1441  		if err != nil {
  1442  			return nil, err
  1443  		}
  1444  		return &checksum, nil
  1445  	}
  1446  
  1447  	return nil, nil
  1448  }
  1449  
  1450  func (b *BufferBucket) resetEncoders() {
  1451  	var zeroed inOrderEncoder
  1452  	for i := range b.encoders {
  1453  		// Register when this bucket resets we close the encoder.
  1454  		encoder := b.encoders[i].encoder
  1455  		encoder.Close()
  1456  		b.encoders[i] = zeroed
  1457  	}
  1458  	b.encoders = b.encoders[:0]
  1459  }
  1460  
  1461  func (b *BufferBucket) resetLoadedBlocks() {
  1462  	for i := range b.loadedBlocks {
  1463  		bl := b.loadedBlocks[i]
  1464  		bl.Close()
  1465  	}
  1466  	b.loadedBlocks = nil
  1467  }
  1468  
  1469  func (b *BufferBucket) needsMerge() bool {
  1470  	return !(b.hasJustSingleEncoder() || b.hasJustSingleLoadedBlock())
  1471  }
  1472  
  1473  func (b *BufferBucket) hasJustSingleEncoder() bool {
  1474  	return len(b.encoders) == 1 && len(b.loadedBlocks) == 0
  1475  }
  1476  
  1477  func (b *BufferBucket) hasJustSingleLoadedBlock() bool {
  1478  	encodersEmpty := len(b.encoders) == 0 ||
  1479  		(len(b.encoders) == 1 && b.encoders[0].encoder.Len() == 0)
  1480  	return encodersEmpty && len(b.loadedBlocks) == 1
  1481  }
  1482  
// merge collapses all of this bucket's loaded blocks and encoders into a
// single encoder, returning the number of streams that were merged. It is a
// no-op when the bucket already holds just one stream.
func (b *BufferBucket) merge(nsCtx namespace.Context) (int, error) {
	if !b.needsMerge() {
		// Save unnecessary work
		return 0, nil
	}

	var (
		start   = b.start
		readers = make([]xio.SegmentReader, 0, len(b.encoders)+len(b.loadedBlocks))
		streams = make([]xio.SegmentReader, 0, len(b.encoders))
		ctx     = b.opts.ContextPool().Get()
		merges  = 0
	)
	defer func() {
		ctx.Close()
		// NB(r): Only need to close the mutable encoder streams as
		// the context we created for reading the loaded blocks
		// will close those streams when it is closed.
		for _, stream := range streams {
			stream.Finalize()
		}
	}()

	// Rank loaded blocks as data that has appeared before data that
	// arrived locally in the buffer
	for i := range b.loadedBlocks {
		block, err := b.loadedBlocks[i].Stream(ctx)
		if err == nil && block.SegmentReader != nil {
			merges++
			readers = append(readers, block.SegmentReader)
		}
	}

	for i := range b.encoders {
		if s, ok := b.encoders[i].encoder.Stream(ctx); ok {
			merges++
			readers = append(readers, s)
			// Tracked separately so the deferred cleanup can finalize them.
			streams = append(streams, s)
		}
	}

	encoder, lastWriteAt, err := mergeStreamsToEncoder(start, readers, b.opts, nsCtx)
	if err != nil {
		return 0, err
	}

	// Swap all previous state for the single merged encoder.
	b.resetEncoders()
	b.resetLoadedBlocks()

	b.encoders = append(b.encoders, inOrderEncoder{
		encoder:     encoder,
		lastWriteAt: lastWriteAt,
	})

	return merges, nil
}
  1539  
  1540  // mergeStreamsToEncoder merges streams to an encoder and returns the last
  1541  // write time. It is the responsibility of the caller to close the returned
  1542  // encoder when appropriate.
  1543  func mergeStreamsToEncoder(
  1544  	blockStart xtime.UnixNano,
  1545  	streams []xio.SegmentReader,
  1546  	opts Options,
  1547  	nsCtx namespace.Context,
  1548  ) (encoding.Encoder, xtime.UnixNano, error) {
  1549  	bopts := opts.DatabaseBlockOptions()
  1550  	encoder := opts.EncoderPool().Get()
  1551  	encoder.Reset(blockStart, bopts.DatabaseBlockAllocSize(), nsCtx.Schema)
  1552  	iter := opts.MultiReaderIteratorPool().Get()
  1553  	defer iter.Close()
  1554  
  1555  	var lastWriteAt xtime.UnixNano
  1556  	iter.Reset(streams, blockStart, opts.RetentionOptions().BlockSize(), nsCtx.Schema)
  1557  	for iter.Next() {
  1558  		dp, unit, annotation := iter.Current()
  1559  		if err := encoder.Encode(dp, unit, annotation); err != nil {
  1560  			encoder.Close()
  1561  			return nil, 0, err
  1562  		}
  1563  		lastWriteAt = dp.TimestampNanos
  1564  	}
  1565  	if err := iter.Err(); err != nil {
  1566  		encoder.Close()
  1567  		return nil, 0, err
  1568  	}
  1569  
  1570  	return encoder, lastWriteAt, nil
  1571  }
  1572  
// mergeToStream merges all streams in this BufferBucket into one stream and
// returns it. The boolean result reports whether any stream was produced
// (an empty bucket yields none).
func (b *BufferBucket) mergeToStream(ctx context.Context, nsCtx namespace.Context) (xio.SegmentReader, bool, error) {
	if b.hasJustSingleEncoder() {
		b.resetLoadedBlocks()
		// Already merged as a single encoder.
		stream, ok := b.encoders[0].encoder.Stream(ctx)
		if !ok {
			return nil, false, nil
		}
		ctx.RegisterFinalizer(stream)
		return stream, true, nil
	}

	if b.hasJustSingleLoadedBlock() {
		// Need to reset encoders but do not want to finalize the block as we
		// are passing ownership of it to the caller.
		b.resetEncoders()
		stream, err := b.loadedBlocks[0].Stream(ctx)
		if err != nil {
			return nil, false, err
		}
		return stream, true, nil
	}

	// Multiple streams present: merge them down to a single encoder first.
	_, err := b.merge(nsCtx)
	if err != nil {
		b.resetEncoders()
		b.resetLoadedBlocks()
		return nil, false, err
	}

	// After a successful merge, encoders and loaded blocks will be
	// reset, and the merged encoder appended as the only encoder in the
	// bucket.
	if !b.hasJustSingleEncoder() {
		return nil, false, errIncompleteMerge
	}

	stream, ok := b.encoders[0].encoder.Stream(ctx)
	if !ok {
		return nil, false, nil
	}
	ctx.RegisterFinalizer(stream)
	return stream, true, nil
}
  1619  
// BufferBucketVersionsPool provides a pool for BufferBucketVersions.
type BufferBucketVersionsPool struct {
	pool pool.ObjectPool // underlying generic object pool
}
  1624  
  1625  // NewBufferBucketVersionsPool creates a new BufferBucketVersionsPool.
  1626  func NewBufferBucketVersionsPool(opts pool.ObjectPoolOptions) *BufferBucketVersionsPool {
  1627  	p := &BufferBucketVersionsPool{pool: pool.NewObjectPool(opts)}
  1628  	p.pool.Init(func() interface{} {
  1629  		return &BufferBucketVersions{}
  1630  	})
  1631  	return p
  1632  }
  1633  
// Get gets a BufferBucketVersions from the pool. The type assertion is safe
// because the pool is seeded exclusively with *BufferBucketVersions.
func (p *BufferBucketVersionsPool) Get() *BufferBucketVersions {
	return p.pool.Get().(*BufferBucketVersions)
}
  1638  
// Put puts a BufferBucketVersions back into the pool. Callers reset the
// value before reuse (see resetTo).
func (p *BufferBucketVersionsPool) Put(buckets *BufferBucketVersions) {
	p.pool.Put(buckets)
}
  1643  
// BufferBucketPool provides a pool for BufferBuckets.
type BufferBucketPool struct {
	pool pool.ObjectPool // underlying generic object pool
}
  1648  
  1649  // NewBufferBucketPool creates a new BufferBucketPool.
  1650  func NewBufferBucketPool(opts pool.ObjectPoolOptions) *BufferBucketPool {
  1651  	p := &BufferBucketPool{pool: pool.NewObjectPool(opts)}
  1652  	p.pool.Init(func() interface{} {
  1653  		return &BufferBucket{}
  1654  	})
  1655  	return p
  1656  }
  1657  
// Get gets a BufferBucket from the pool. The type assertion is safe because
// the pool is seeded exclusively with *BufferBucket.
func (p *BufferBucketPool) Get() *BufferBucket {
	return p.pool.Get().(*BufferBucket)
}
  1662  
// Put puts a BufferBucket back into the pool. Callers reset the bucket
// before reuse (see resetTo).
func (p *BufferBucketPool) Put(bucket *BufferBucket) {
	p.pool.Put(bucket)
}