github.com/m3db/m3@v1.5.0/src/dbnode/storage/shard.go

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package storage
    22  
    23  import (
    24  	"container/list"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"math"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/m3db/m3/src/dbnode/generated/proto/pagetoken"
    33  	"github.com/m3db/m3/src/dbnode/namespace"
    34  	"github.com/m3db/m3/src/dbnode/persist"
    35  	"github.com/m3db/m3/src/dbnode/persist/fs"
    36  	"github.com/m3db/m3/src/dbnode/retention"
    37  	"github.com/m3db/m3/src/dbnode/runtime"
    38  	"github.com/m3db/m3/src/dbnode/storage/block"
    39  	"github.com/m3db/m3/src/dbnode/storage/bootstrap"
    40  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    41  	"github.com/m3db/m3/src/dbnode/storage/index"
    42  	"github.com/m3db/m3/src/dbnode/storage/index/convert"
    43  	"github.com/m3db/m3/src/dbnode/storage/repair"
    44  	"github.com/m3db/m3/src/dbnode/storage/series"
    45  	"github.com/m3db/m3/src/dbnode/tracepoint"
    46  	"github.com/m3db/m3/src/dbnode/ts"
    47  	"github.com/m3db/m3/src/dbnode/ts/writes"
    48  	"github.com/m3db/m3/src/dbnode/x/xio"
    49  	"github.com/m3db/m3/src/m3ninx/doc"
    50  	"github.com/m3db/m3/src/x/checked"
    51  	"github.com/m3db/m3/src/x/clock"
    52  	"github.com/m3db/m3/src/x/context"
    53  	xerrors "github.com/m3db/m3/src/x/errors"
    54  	"github.com/m3db/m3/src/x/ident"
    55  	"github.com/m3db/m3/src/x/instrument"
    56  	xresource "github.com/m3db/m3/src/x/resource"
    57  	xtime "github.com/m3db/m3/src/x/time"
    58  
    59  	"github.com/gogo/protobuf/proto"
    60  	"github.com/opentracing/opentracing-go/log"
    61  	"github.com/uber-go/tally"
    62  	"go.uber.org/zap"
    63  )
    64  
    65  const (
    66  	shardIterateBatchPercent = 0.01
    67  	shardIterateBatchMinSize = 16
    68  )
    69  
    70  var (
    71  	errShardEntryNotFound         = errors.New("shard entry not found")
    72  	errShardNotOpen               = errors.New("shard is not open")
    73  	errShardAlreadyTicking        = errors.New("shard is already ticking")
    74  	errShardClosingTickTerminated = errors.New("shard is closing, terminating tick")
    75  	errShardInvalidPageToken      = errors.New("shard could not unmarshal page token")
    76  	errShardIsNotBootstrapped     = errors.New("shard is not bootstrapped")
    77  	errShardAlreadyBootstrapped   = errors.New("shard is already bootstrapped")
    78  	errFlushStateIsNotInitialized = errors.New("shard flush state is not initialized")
    79  	errTriedToLoadNilSeries       = errors.New("tried to load nil series into shard")
    80  
    81  	// ErrDatabaseLoadLimitHit is the error returned when the database load limit
    82  	// is hit or exceeded.
    83  	ErrDatabaseLoadLimitHit = errors.New("error loading series, database load limit hit")
    84  
    85  	emptyDoc = doc.Metadata{}
    86  )
    87  
    88  type filesetsFn func(
    89  	filePathPrefix string,
    90  	namespace ident.ID,
    91  	shardID uint32,
    92  ) (fs.FileSetFilesSlice, error)
    93  
    94  type filesetPathsBeforeFn func(
    95  	filePathPrefix string,
    96  	namespace ident.ID,
    97  	shardID uint32,
    98  	t xtime.UnixNano,
    99  ) ([]string, error)
   100  
   101  type tickPolicy int
   102  
   103  const (
   104  	tickPolicyRegular tickPolicy = iota
   105  	tickPolicyCloseShard
   106  )
   107  
   108  type dbShardState int
   109  
   110  const (
   111  	dbShardStateOpen dbShardState = iota
   112  	dbShardStateClosing
   113  )
   114  
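         // dbShard is the in-memory representation of a single shard of a namespace.
         // It owns the shard's series lookup map and insertion-ordered list of
         // entries, the async insert queue, per-block flush state, and the hooks the
         // series use for block retrieval and reverse indexing.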
   115  type dbShard struct {
   116  	sync.RWMutex
   117  	block.DatabaseBlockRetriever
   118  	opts                     Options
   119  	seriesOpts               series.Options
   120  	nowFn                    clock.NowFn
   121  	state                    dbShardState
   122  	namespace                namespace.Metadata
   123  	seriesBlockRetriever     series.QueryableBlockRetriever
   124  	seriesOnRetrieveBlock    block.OnRetrieveBlock
   125  	namespaceReaderMgr       databaseNamespaceReaderManager
   126  	increasingIndex          increasingIndex
   127  	seriesPool               series.DatabaseSeriesPool
   128  	reverseIndex             NamespaceIndex
   129  	insertQueue              *dbShardInsertQueue
   130  	lookup                   *shardMap
   131  	list                     *list.List
   132  	bootstrapState           BootstrapState
   133  	newMergerFn              fs.NewMergerFn
   134  	newFSMergeWithMemFn      newFSMergeWithMemFn
   135  	filesetsFn               filesetsFn
   136  	filesetPathsBeforeFn     filesetPathsBeforeFn
   137  	deleteFilesFn            deleteFilesFn
   138  	snapshotFilesFn          snapshotFilesFn
   139  	newReaderFn              fs.NewReaderFn
   140  	sleepFn                  func(time.Duration)
   141  	identifierPool           ident.Pool
   142  	contextPool              context.Pool
   143  	flushState               shardFlushState
   144  	tickWg                   *sync.WaitGroup
   145  	runtimeOptsListenClosers []xresource.SimpleCloser
   146  	currRuntimeOptions       dbShardRuntimeOptions
   147  	logger                   *zap.Logger
   148  	metrics                  dbShardMetrics
   149  	tileAggregator           TileAggregator
   150  	ticking                  bool
   151  	shard                    uint32
   152  	coldWritesEnabled        bool
   153  	indexEnabled             bool
   154  
   155  	entryMetrics *EntryMetrics
   156  }
   157  
    158  // NB(r): dbShardRuntimeOptions does not contain its own
    159  // mutex as some of the variables are needed on each write,
    160  // which already acquires at least a read lock on the shard
    161  // mutex, so to keep lock acquisitions to a minimum these
    162  // are protected under the same shard mutex.
   163  type dbShardRuntimeOptions struct {
   164  	writeNewSeriesAsync      bool
   165  	tickSleepSeriesBatchSize int
   166  	tickSleepPerSeries       time.Duration
   167  }
   168  
   169  type dbShardMetrics struct {
   170  	create                              tally.Counter
   171  	close                               tally.Counter
   172  	closeStart                          tally.Counter
   173  	closeLatency                        tally.Timer
   174  	seriesTicked                        tally.Gauge
   175  	insertAsyncInsertErrors             tally.Counter
   176  	insertAsyncWriteInternalErrors      tally.Counter
   177  	insertAsyncWriteInvalidParamsErrors tally.Counter
   178  	insertAsyncIndexErrors              tally.Counter
   179  	snapshotTotalLatency                tally.Timer
   180  	snapshotPrepareLatency              tally.Timer
   181  	snapshotMergeByBucketLatency        tally.Timer
   182  	snapshotMergeAcrossBucketsLatency   tally.Timer
   183  	snapshotChecksumLatency             tally.Timer
   184  	snapshotPersistLatency              tally.Timer
   185  	snapshotCloseLatency                tally.Timer
   186  
   187  	purgeUnexpectedRefCount tally.Counter
   188  }
   189  
   190  func newDatabaseShardMetrics(shardID uint32, scope tally.Scope) dbShardMetrics {
   191  	const insertErrorName = "insert-async.errors"
   192  	snapshotScope := scope.SubScope("snapshot")
   193  	return dbShardMetrics{
   194  		create:       scope.Counter("create"),
   195  		close:        scope.Counter("close"),
   196  		closeStart:   scope.Counter("close-start"),
   197  		closeLatency: scope.Timer("close-latency"),
   198  		seriesTicked: scope.Tagged(map[string]string{
   199  			"shard": fmt.Sprintf("%d", shardID),
   200  		}).Gauge("series-ticked"),
   201  		insertAsyncInsertErrors: scope.Tagged(map[string]string{
   202  			"error_type":    "insert-series",
   203  			"suberror_type": "shard-entry-insert-error",
   204  		}).Counter(insertErrorName),
   205  		insertAsyncWriteInternalErrors: scope.Tagged(map[string]string{
   206  			"error_type":    "write-value",
   207  			"suberror_type": "internal-error",
   208  		}).Counter(insertErrorName),
   209  		insertAsyncWriteInvalidParamsErrors: scope.Tagged(map[string]string{
   210  			"error_type":    "write-value",
   211  			"suberror_type": "invalid-params-error",
   212  		}).Counter(insertErrorName),
   213  		insertAsyncIndexErrors: scope.Tagged(map[string]string{
   214  			"error_type":    "reverse-index",
   215  			"suberror_type": "write-batch-error",
   216  		}).Counter(insertErrorName),
   217  		snapshotTotalLatency:              snapshotScope.Timer("total-latency"),
   218  		snapshotPrepareLatency:            snapshotScope.Timer("prepare-latency"),
   219  		snapshotMergeByBucketLatency:      snapshotScope.Timer("merge-by-bucket-latency"),
   220  		snapshotMergeAcrossBucketsLatency: snapshotScope.Timer("merge-across-buckets-latency"),
   221  		snapshotChecksumLatency:           snapshotScope.Timer("checksum-latency"),
   222  		snapshotPersistLatency:            snapshotScope.Timer("persist-latency"),
   223  		snapshotCloseLatency:              snapshotScope.Timer("close-latency"),
   224  		purgeUnexpectedRefCount:           scope.Counter("purge-unexpected-ref-count"),
   225  	}
   226  }
   227  
   228  type dbShardEntryWorkFn func(entry *Entry) bool
   229  
   230  type dbShardEntryBatchWorkFn func(entries []*Entry) bool
   231  
   232  type shardListElement *list.Element
   233  
   234  type shardFlushState struct {
   235  	sync.RWMutex
   236  	statesByTime map[xtime.UnixNano]fileOpState
   237  	initialized  bool
   238  }
   239  
   240  func newShardFlushState() shardFlushState {
   241  	return shardFlushState{
   242  		statesByTime: make(map[xtime.UnixNano]fileOpState),
   243  	}
   244  }
   245  
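         // newDatabaseShard constructs a dbShard, wires up its insert queue and
         // runtime options listeners, starts the insert queue, and marks the shard
         // as bootstrapped immediately when needsBootstrap is false.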
   246  func newDatabaseShard(
   247  	namespaceMetadata namespace.Metadata,
   248  	shard uint32,
   249  	blockRetriever block.DatabaseBlockRetriever,
   250  	namespaceReaderMgr databaseNamespaceReaderManager,
   251  	increasingIndex increasingIndex,
   252  	reverseIndex NamespaceIndex,
   253  	needsBootstrap bool,
   254  	opts Options,
   255  	seriesOpts series.Options,
   256  ) databaseShard {
   257  	scope := opts.InstrumentOptions().MetricsScope().
   258  		SubScope("dbshard")
   259  
   260  	s := &dbShard{
   261  		opts:                 opts,
   262  		seriesOpts:           seriesOpts,
   263  		nowFn:                opts.ClockOptions().NowFn(),
   264  		state:                dbShardStateOpen,
   265  		namespace:            namespaceMetadata,
   266  		shard:                shard,
   267  		namespaceReaderMgr:   namespaceReaderMgr,
   268  		increasingIndex:      increasingIndex,
   269  		seriesPool:           opts.DatabaseSeriesPool(),
   270  		reverseIndex:         reverseIndex,
   271  		lookup:               newShardMap(shardMapOptions{}),
   272  		list:                 list.New(),
   273  		newMergerFn:          fs.NewMerger,
   274  		newFSMergeWithMemFn:  newFSMergeWithMem,
   275  		filesetsFn:           fs.DataFiles,
   276  		filesetPathsBeforeFn: fs.DataFileSetsBefore,
   277  		deleteFilesFn:        fs.DeleteFiles,
   278  		snapshotFilesFn:      fs.SnapshotFiles,
   279  		sleepFn:              time.Sleep,
   280  		newReaderFn:          fs.NewReader,
   281  		identifierPool:       opts.IdentifierPool(),
   282  		contextPool:          opts.ContextPool(),
   283  		flushState:           newShardFlushState(),
   284  		tickWg:               &sync.WaitGroup{},
   285  		coldWritesEnabled:    namespaceMetadata.Options().ColdWritesEnabled(),
   286  		indexEnabled:         namespaceMetadata.Options().IndexOptions().Enabled(),
   287  		logger:               opts.InstrumentOptions().Logger(),
   288  		metrics:              newDatabaseShardMetrics(shard, scope),
   289  		tileAggregator:       opts.TileAggregator(),
   290  		entryMetrics:         NewEntryMetrics(scope.SubScope("entries")),
   291  	}
   292  	s.insertQueue = newDatabaseShardInsertQueue(s.insertSeriesBatch,
   293  		s.nowFn, opts.CoreFn(), scope, opts.InstrumentOptions().Logger())
   294  
   295  	registerRuntimeOptionsListener := func(listener runtime.OptionsListener) {
   296  		elem := opts.RuntimeOptionsManager().RegisterListener(listener)
   297  		s.runtimeOptsListenClosers = append(s.runtimeOptsListenClosers, elem)
   298  	}
   299  	registerRuntimeOptionsListener(s)
   300  	registerRuntimeOptionsListener(s.insertQueue)
   301  
   302  	// Start the insert queue after registering runtime options listeners
   303  	// that may immediately fire with values
   304  	s.insertQueue.Start()
   305  
   306  	if !needsBootstrap {
   307  		s.bootstrapState = Bootstrapped
   308  	}
   309  
   310  	if blockRetriever != nil {
   311  		s.setBlockRetriever(blockRetriever)
   312  	}
   313  
   314  	s.metrics.create.Inc(1)
   315  
   316  	return s
   317  }
   318  
   319  func (s *dbShard) setBlockRetriever(retriever block.DatabaseBlockRetriever) {
    320  	// If using the block retriever then set the block retriever field,
    321  	// and set both the series block retriever and the on-retrieve-block
    322  	// callback to the shard itself.
   323  	s.DatabaseBlockRetriever = retriever
   324  	s.seriesBlockRetriever = s
   325  	s.seriesOnRetrieveBlock = s
   326  }
   327  
   328  func (s *dbShard) SetRuntimeOptions(value runtime.Options) {
   329  	s.Lock()
   330  	s.currRuntimeOptions = dbShardRuntimeOptions{
   331  		writeNewSeriesAsync:      value.WriteNewSeriesAsync(),
   332  		tickSleepSeriesBatchSize: value.TickSeriesBatchSize(),
   333  		tickSleepPerSeries:       value.TickPerSeriesSleepDuration(),
   334  	}
   335  	s.Unlock()
   336  }
   337  
   338  func (s *dbShard) ID() uint32 {
   339  	return s.shard
   340  }
   341  
   342  func (s *dbShard) NumSeries() int64 {
   343  	s.RLock()
   344  	n := s.list.Len()
   345  	s.RUnlock()
   346  	return int64(n)
   347  }
   348  
   349  // Stream implements series.QueryableBlockRetriever
   350  func (s *dbShard) Stream(
   351  	ctx context.Context,
   352  	id ident.ID,
   353  	blockStart xtime.UnixNano,
   354  	onRetrieve block.OnRetrieveBlock,
   355  	nsCtx namespace.Context,
   356  ) (xio.BlockReader, error) {
   357  	return s.DatabaseBlockRetriever.Stream(ctx, s.shard, id,
   358  		blockStart, onRetrieve, nsCtx)
   359  }
   360  
   361  // IsBlockRetrievable implements series.QueryableBlockRetriever
   362  func (s *dbShard) IsBlockRetrievable(blockStart xtime.UnixNano) (bool, error) {
   363  	return s.hasWarmFlushed(blockStart)
   364  }
   365  
   366  func (s *dbShard) hasWarmFlushed(blockStart xtime.UnixNano) (bool, error) {
   367  	flushState, err := s.FlushState(blockStart)
   368  	if err != nil {
   369  		return false, err
   370  	}
   371  	return s.warmStatusIsRetrievable(flushState.WarmStatus), nil
   372  }
   373  
   374  func (s *dbShard) warmStatusIsRetrievable(status warmStatus) bool {
   375  	if !statusIsRetrievable(status.DataFlushed) {
   376  		return false
   377  	}
   378  
   379  	// If the index is disabled, then we only are tracking data flushing.
   380  	// Otherwise, warm status requires both data and index flushed.
   381  	if !s.indexEnabled {
   382  		return true
   383  	}
   384  
   385  	return statusIsRetrievable(status.IndexFlushed)
   386  }
   387  
   388  func statusIsRetrievable(status fileOpStatus) bool {
   389  	switch status {
   390  	case fileOpNotStarted, fileOpInProgress, fileOpFailed:
   391  		return false
   392  	case fileOpSuccess:
   393  		return true
   394  	}
   395  	panic(fmt.Errorf("shard queried is retrievable with bad flush state %d",
   396  		status))
   397  }
   398  
   399  // RetrievableBlockColdVersion implements series.QueryableBlockRetriever
   400  func (s *dbShard) RetrievableBlockColdVersion(blockStart xtime.UnixNano) (int, error) {
   401  	flushState, err := s.FlushState(blockStart)
   402  	if err != nil {
   403  		return -1, err
   404  	}
   405  	return flushState.ColdVersionFlushed, nil
   406  }
   407  
   408  // BlockStatesSnapshot implements series.QueryableBlockRetriever
   409  func (s *dbShard) BlockStatesSnapshot() series.ShardBlockStateSnapshot {
   410  	s.RLock()
   411  	snapshots := s.blockStatesSnapshotWithRLock()
   412  	s.RUnlock()
   413  
   414  	return snapshots
   415  }
   416  
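         // blockStatesSnapshotWithRLock builds a point-in-time view of the per-block
         // flush states while the caller already holds the shard read lock. The
         // snapshot is only marked as bootstrapped once the shard is bootstrapped and
         // the shard flush states have been initialized; it reports cold versions via
         // ColdVersionRetrievable so eviction decisions never run ahead of what is
         // actually retrievable.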
   417  func (s *dbShard) blockStatesSnapshotWithRLock() series.ShardBlockStateSnapshot {
   418  	bootstrapped := s.bootstrapState == Bootstrapped
   419  	if !bootstrapped {
   420  		// Needs to be bootstrapped.
   421  		return series.NewShardBlockStateSnapshot(false, series.BootstrappedBlockStateSnapshot{})
   422  	}
   423  
   424  	s.flushState.RLock()
   425  	defer s.flushState.RUnlock()
   426  	if !s.flushState.initialized {
   427  		// Also needs to have the shard flush states initialized.
   428  		return series.NewShardBlockStateSnapshot(false, series.BootstrappedBlockStateSnapshot{})
   429  	}
   430  
   431  	snapshot := make(map[xtime.UnixNano]series.BlockState, len(s.flushState.statesByTime))
   432  	for time, state := range s.flushState.statesByTime {
   433  		snapshot[time] = series.BlockState{
   434  			WarmRetrievable: s.warmStatusIsRetrievable(state.WarmStatus),
   435  			// Use ColdVersionRetrievable instead of ColdVersionFlushed since the snapshot
   436  			// will be used to make eviction decisions and we don't want to evict data before
   437  			// it is retrievable.
   438  			ColdVersion: state.ColdVersionRetrievable,
   439  		}
   440  	}
   441  
   442  	return series.NewShardBlockStateSnapshot(true, series.BootstrappedBlockStateSnapshot{
   443  		Snapshot: snapshot,
   444  	})
   445  }
   446  
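         // OnRetrieveBlock is called when a block has been retrieved from disk for a
         // series in this shard. If the series is still resident in memory the block
         // is handed straight to it; otherwise a new shard entry is created and
         // enqueued on the insert queue together with the retrieved block so it can
         // be cached once the insert completes.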
   447  func (s *dbShard) OnRetrieveBlock(
   448  	id ident.ID,
   449  	tags ident.TagIterator,
   450  	startTime xtime.UnixNano,
   451  	segment ts.Segment,
   452  	nsCtx namespace.Context,
   453  ) {
   454  	s.RLock()
   455  	entry, err := s.lookupEntryWithLock(id)
   456  	if entry != nil {
   457  		entry.IncrementReaderWriterCount()
   458  		defer entry.DecrementReaderWriterCount()
   459  	}
   460  	s.RUnlock()
   461  
   462  	if err != nil && err != errShardEntryNotFound {
   463  		return // Likely closing
   464  	}
   465  
   466  	if entry != nil {
   467  		entry.Series.OnRetrieveBlock(id, tags, startTime, segment, nsCtx)
   468  		return
   469  	}
   470  
   471  	entry, err = s.newShardEntry(id, convert.NewTagsIterMetadataResolver(tags))
   472  	if err != nil {
   473  		// should never happen
   474  		instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(),
   475  			func(logger *zap.Logger) {
   476  				logger.Error("unable to create shardEntry from retrieved block data",
   477  					zap.Stringer("id", id),
   478  					zap.Time("startTime", startTime.ToTime()),
   479  					zap.Error(err))
   480  			})
   481  		return
   482  	}
   483  
    484  	// NB(r): No need to specify that the series needs to be indexed as it
    485  	// would have already been indexed when it was written.
   486  	copiedID := entry.Series.ID()
   487  	copiedTagsIter := s.identifierPool.TagsIterator()
   488  	copiedTagsIter.ResetFields(entry.Series.Metadata().Fields)
   489  	s.insertQueue.Insert(dbShardInsert{
   490  		entry: entry,
   491  		opts: dbShardInsertAsyncOptions{
   492  			// NB(r): Caching blocks should not be considered for
   493  			// new series insert rate limit.
   494  			skipRateLimit:            true,
   495  			hasPendingRetrievedBlock: true,
   496  			pendingRetrievedBlock: dbShardPendingRetrievedBlock{
   497  				id:      copiedID,
   498  				tags:    copiedTagsIter,
   499  				start:   startTime,
   500  				segment: segment,
   501  				nsCtx:   nsCtx,
   502  			},
   503  		},
   504  	})
   505  }
   506  
   507  func (s *dbShard) OnEvictedFromWiredList(id ident.ID, blockStart xtime.UnixNano) {
   508  	s.RLock()
   509  	entry, err := s.lookupEntryWithLock(id)
   510  	s.RUnlock()
   511  
   512  	if err != nil && err != errShardEntryNotFound {
   513  		return // Shard is probably closing
   514  	}
   515  
   516  	if entry == nil {
    517  		// It's counter-intuitive that this can ever occur because the series should
   518  		// always exist if it has any active blocks, and if we've reached this point
   519  		// then the WiredList had a reference to a block that should still be in the
   520  		// series, and thus the series should exist. The reason this can occur is that
   521  		// even though the WiredList controls the lifecycle of blocks retrieved from
   522  		// disk, those blocks can still be removed from the series if they've completely
   523  		// fallen out of the retention period. In that case, the series tick will still
   524  		// remove the block, and then the shard tick can remove the series. At that point,
   525  		// it's possible for the WiredList to have a reference to an expired block for a
   526  		// series that is no longer in the shard.
   527  		return
   528  	}
   529  
   530  	entry.Series.OnEvictedFromWiredList(id, blockStart)
   531  }
   532  
   533  func (s *dbShard) forEachShardEntry(entryFn dbShardEntryWorkFn) {
   534  	s.forEachShardEntryBatch(func(currEntries []*Entry) bool {
   535  		for _, entry := range currEntries {
   536  			if continueForEach := entryFn(entry); !continueForEach {
   537  				return false
   538  			}
   539  		}
   540  		return true
   541  	})
   542  }
   543  
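         // iterateBatchSize returns how many series entries to process per batch
         // while iterating the shard: roughly shardIterateBatchPercent (1%) of the
         // series in the shard, with a floor of shardIterateBatchMinSize. For
         // example, a shard holding 100,000 series is iterated in batches of 1,000,
         // while a shard holding 500 series uses the minimum batch size of 16.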
   544  func iterateBatchSize(elemsLen int) int {
   545  	if elemsLen < shardIterateBatchMinSize {
   546  		return shardIterateBatchMinSize
   547  	}
   548  	t := math.Ceil(shardIterateBatchPercent * float64(elemsLen))
   549  	return int(math.Max(shardIterateBatchMinSize, t))
   550  }
   551  
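         // forEachShardEntryBatch iterates all series entries in the shard in
         // batches. For each batch it takes the shard read lock, increments the
         // reader/writer count on every entry in the batch (and on the element that
         // will start the next batch) so they cannot be purged from under the
         // caller, releases the lock, and only then invokes entriesBatchFn.
         // Returning false from entriesBatchFn stops iteration early.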
   552  func (s *dbShard) forEachShardEntryBatch(entriesBatchFn dbShardEntryBatchWorkFn) {
   553  	// NB(r): consider using a lockless list for ticking.
   554  	s.RLock()
   555  	elemsLen := s.list.Len()
   556  	s.RUnlock()
   557  
   558  	batchSize := iterateBatchSize(elemsLen)
   559  	decRefElem := func(e *list.Element) {
   560  		if e == nil {
   561  			return
   562  		}
   563  		e.Value.(*Entry).DecrementReaderWriterCount()
   564  	}
   565  
   566  	var (
   567  		currEntries = make([]*Entry, 0, batchSize)
   568  		first       = true
   569  		nextElem    *list.Element
   570  	)
   571  	for nextElem != nil || first {
   572  		s.RLock()
   573  		// NB(prateek): release held reference on the next element pointer now
   574  		// that we have the read lock and are guaranteed it cannot be changed
   575  		// from under us.
   576  		decRefElem(nextElem)
   577  
   578  		// lazily pull from the head of the list at first
   579  		if first {
   580  			nextElem = s.list.Front()
   581  			first = false
   582  		}
   583  
   584  		elem := nextElem
   585  		for ticked := 0; ticked < batchSize && elem != nil; ticked++ {
   586  			nextElem = elem.Next()
   587  			entry := elem.Value.(*Entry)
   588  			entry.IncrementReaderWriterCount()
   589  			currEntries = append(currEntries, entry)
   590  			elem = nextElem
   591  		}
   592  
   593  		// NB(prateek): inc a reference to the next element while we have a lock,
   594  		// to guarantee the element pointer cannot be changed from under us.
   595  		if nextElem != nil {
   596  			nextElem.Value.(*Entry).IncrementReaderWriterCount()
   597  		}
   598  		s.RUnlock()
   599  
   600  		continueExecution := entriesBatchFn(currEntries)
   601  		for i := range currEntries {
   602  			currEntries[i].DecrementReaderWriterCount()
   603  			currEntries[i] = nil
   604  		}
   605  		currEntries = currEntries[:0]
   606  		if !continueExecution {
   607  			decRefElem(nextElem)
   608  			return
   609  		}
   610  	}
   611  }
   612  
   613  func (s *dbShard) IsBootstrapped() bool {
   614  	return s.BootstrapState() == Bootstrapped
   615  }
   616  
   617  func (s *dbShard) Close() error {
   618  	s.Lock()
   619  	if s.state != dbShardStateOpen {
   620  		s.Unlock()
   621  		return errShardNotOpen
   622  	}
   623  	s.state = dbShardStateClosing
   624  	s.Unlock()
   625  
   626  	s.insertQueue.Stop()
   627  
   628  	for _, closer := range s.runtimeOptsListenClosers {
   629  		closer.Close()
   630  	}
   631  
   632  	s.metrics.closeStart.Inc(1)
   633  	stopwatch := s.metrics.closeLatency.Start()
   634  	defer func() {
   635  		s.metrics.close.Inc(1)
   636  		stopwatch.Stop()
   637  	}()
   638  
   639  	// NB(prateek): wait till any existing ticks are finished. In the usual
   640  	// case, no other ticks are running, and tickWg count is at 0, so the
   641  	// call to Wait() will return immediately.
   642  	// In the case when there is an existing Tick running, the count for
   643  	// tickWg will be > 0, and we'll wait until it's reset to zero, which
   644  	// will happen because earlier in this function we set the shard state
   645  	// to dbShardStateClosing, which triggers an early termination of
   646  	// any active ticks.
   647  	s.tickWg.Wait()
   648  
   649  	// NB(r): Asynchronously we purge expired series to ensure pressure on the
   650  	// GC is not placed all at one time.  If the deadline is too low and still
   651  	// causes the GC to impact performance when closing shards the deadline
   652  	// should be increased.
   653  	cancellable := context.NewNoOpCanncellable()
   654  	_, err := s.tickAndExpire(cancellable, tickPolicyCloseShard, namespace.Context{})
   655  	return err
   656  }
   657  
   658  func (s *dbShard) Closed() bool {
   659  	return s.isClosing()
   660  }
   661  
   662  func (s *dbShard) isClosing() bool {
   663  	s.RLock()
   664  	closing := s.isClosingWithLock()
   665  	s.RUnlock()
   666  	return closing
   667  }
   668  
   669  func (s *dbShard) isClosingWithLock() bool {
   670  	return s.state == dbShardStateClosing
   671  }
   672  
   673  func (s *dbShard) Tick(c context.Cancellable, startTime xtime.UnixNano, nsCtx namespace.Context) (tickResult, error) {
   674  	s.removeAnyFlushStatesTooEarly(startTime)
   675  	return s.tickAndExpire(c, tickPolicyRegular, nsCtx)
   676  }
   677  
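         // tickAndExpire runs a single tick over every series in the shard, sleeping
         // between batches according to the runtime tick-sleep options, and purges
         // series whose datapoints have all expired. Under tickPolicyCloseShard every
         // series is treated as expired so resources are returned to their pools as
         // part of closing the shard.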
   678  func (s *dbShard) tickAndExpire(
   679  	c context.Cancellable,
   680  	policy tickPolicy,
   681  	nsCtx namespace.Context,
   682  ) (tickResult, error) {
   683  	s.Lock()
   684  	// ensure only one tick can execute at a time
   685  	if s.ticking {
   686  		s.Unlock()
   687  		// i.e. we were previously ticking
   688  		return tickResult{}, errShardAlreadyTicking
   689  	}
   690  
   691  	// NB(prateek): we bail out early if the shard is closing,
   692  	// unless it's the final tick issued during the Close(). This
   693  	// final tick is required to release resources back to our pools.
   694  	if policy != tickPolicyCloseShard && s.isClosingWithLock() {
   695  		s.Unlock()
   696  		return tickResult{}, errShardClosingTickTerminated
   697  	}
   698  
   699  	// enable Close() to track the lifecycle of the tick
   700  	s.ticking = true
   701  	s.tickWg.Add(1)
   702  	s.Unlock()
   703  
   704  	// reset ticking state
   705  	defer func() {
   706  		s.Lock()
   707  		s.ticking = false
   708  		s.tickWg.Done()
   709  		s.Unlock()
   710  		s.metrics.seriesTicked.Update(0.0) // reset external visibility
   711  	}()
   712  
   713  	var (
   714  		r                             tickResult
   715  		terminatedTickingDueToClosing bool
   716  		i                             int
   717  		slept                         time.Duration
   718  		expired                       []*Entry
   719  	)
   720  	s.RLock()
   721  	tickSleepBatch := s.currRuntimeOptions.tickSleepSeriesBatchSize
   722  	tickSleepPerSeries := s.currRuntimeOptions.tickSleepPerSeries
   723  	// Use blockStatesSnapshotWithRLock here to prevent nested read locks.
   724  	// Nested read locks will cause deadlocks if there is write lock attempt in
   725  	// between the nested read locks, since the write lock attempt will block
   726  	// future read lock attempts.
   727  	blockStates := s.blockStatesSnapshotWithRLock()
   728  	s.RUnlock()
   729  	s.forEachShardEntryBatch(func(currEntries []*Entry) bool {
   730  		// re-using `expired` to amortize allocs, still need to reset it
   731  		// to be safe for re-use.
   732  		for i := range expired {
   733  			expired[i] = nil
   734  		}
   735  		expired = expired[:0]
   736  		for _, entry := range currEntries {
   737  			if i > 0 && i%tickSleepBatch == 0 {
   738  				// NB(xichen): if the tick is cancelled, we bail out immediately.
   739  				// The cancellation check is performed on every batch of entries
   740  				// instead of every entry to reduce load.
   741  				if c.IsCancelled() {
   742  					return false
   743  				}
   744  				// NB(prateek): Also bail out early if the shard is closing,
   745  				// unless it's the final tick issued during the Close(). This
   746  				// final tick is required to release resources back to our pools.
   747  				if policy != tickPolicyCloseShard && s.isClosing() {
   748  					terminatedTickingDueToClosing = true
   749  					return false
   750  				}
   751  				// Expose shard level Tick() progress externally.
   752  				s.metrics.seriesTicked.Update(float64(i))
   753  				// Throttle the tick
   754  				sleepFor := time.Duration(tickSleepBatch) * tickSleepPerSeries
   755  				s.sleepFn(sleepFor)
   756  				slept += sleepFor
   757  			}
   758  
   759  			var (
   760  				result series.TickResult
   761  				err    error
   762  			)
   763  			switch policy {
   764  			case tickPolicyRegular:
   765  				result, err = entry.Series.Tick(blockStates, nsCtx)
   766  			case tickPolicyCloseShard:
   767  				err = series.ErrSeriesAllDatapointsExpired
   768  			}
   769  			if err == series.ErrSeriesAllDatapointsExpired {
   770  				expired = append(expired, entry)
   771  				r.expiredSeries++
   772  			} else {
   773  				r.activeSeries++
   774  				if err != nil {
   775  					r.errors++
   776  				}
   777  			}
   778  			r.activeBlocks += result.ActiveBlocks
   779  			r.wiredBlocks += result.WiredBlocks
   780  			r.unwiredBlocks += result.UnwiredBlocks
   781  			r.pendingMergeBlocks += result.PendingMergeBlocks
   782  			r.madeExpiredBlocks += result.MadeExpiredBlocks
   783  			r.madeUnwiredBlocks += result.MadeUnwiredBlocks
   784  			r.mergedOutOfOrderBlocks += result.MergedOutOfOrderBlocks
   785  			r.evictedBuckets += result.EvictedBuckets
   786  			i++
   787  		}
   788  
   789  		// Purge any series requiring purging.
   790  		if len(expired) > 0 {
   791  			s.purgeExpiredSeries(expired)
   792  			for i := range expired {
   793  				expired[i] = nil
   794  			}
   795  			expired = expired[:0]
   796  		}
   797  		// Continue.
   798  		return true
   799  	})
   800  
   801  	if terminatedTickingDueToClosing {
   802  		return tickResult{}, errShardClosingTickTerminated
   803  	}
   804  
   805  	return r, nil
   806  }
   807  
   808  // NB(prateek): purgeExpiredSeries requires that all entries passed to it have at least one reader/writer,
   809  // i.e. have a readWriteCount of at least 1.
   810  // Currently, this function is only called by the lambda inside `tickAndExpire`'s `forEachShardEntryBatch`
    811  // call. This satisfies the contract that all entries it operates upon are guaranteed to have a
   812  // readerWriterEntryCount of at least 1, by virtue of the implementation of `forEachShardEntryBatch`.
   813  func (s *dbShard) purgeExpiredSeries(expiredEntries []*Entry) {
   814  	// Remove all expired series from lookup and list.
   815  	s.Lock()
   816  	for _, entry := range expiredEntries {
    817  		// Only purge series after they've been GCed from the index, so that these happen in order
   818  		// and there is no raciness around GCing something from the index when the series has already
   819  		// been removed from memory.
   820  		if s.indexEnabled && !entry.IndexGarbageCollected.Load() {
   821  			continue
   822  		}
   823  
   824  		series := entry.Series
   825  		id := series.ID()
   826  		elem, exists := s.lookup.Get(id)
   827  		if !exists {
   828  			continue
   829  		}
   830  
   831  		count := entry.ReaderWriterCount()
   832  		// The contract requires all entries to have count >= 1.
   833  		if count < 1 {
   834  			s.metrics.purgeUnexpectedRefCount.Inc(1)
   835  			instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) {
   836  				l.Error("purgeExpiredSeries encountered invalid series read/write count",
   837  					zap.Stringer("namespace", s.namespace.ID()),
   838  					zap.Uint32("shard", s.ID()),
   839  					zap.Stringer("series", series.ID()),
   840  					zap.Int32("readerWriterCount", count))
   841  			})
   842  			continue
   843  		}
   844  		// If this series is currently being written to or read from, we don't
    845  		// remove it, to ensure a consistent view of the series for other users.
   846  		if count > 1 {
   847  			continue
   848  		}
   849  		// If there have been datapoints written to the series since its
   850  		// last empty check, we don't remove it.
   851  		if !series.IsEmpty() {
   852  			continue
   853  		}
   854  
   855  		// NB(xichen): if we get here, we are guaranteed that there can be
   856  		// no more reads/writes to this series while the lock is held, so it's
   857  		// safe to remove it.
   858  		series.Close()
   859  		s.list.Remove(elem)
   860  		s.lookup.Delete(id)
   861  	}
   862  	s.Unlock()
   863  }
   864  
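         // WriteTagged writes a datapoint for the series identified by id and also
         // reverse indexes it using the tags resolved by tagResolver.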
   865  func (s *dbShard) WriteTagged(
   866  	ctx context.Context,
   867  	id ident.ID,
   868  	tagResolver convert.TagMetadataResolver,
   869  	timestamp xtime.UnixNano,
   870  	value float64,
   871  	unit xtime.Unit,
   872  	annotation []byte,
   873  	wOpts series.WriteOptions,
   874  ) (SeriesWrite, error) {
   875  	return s.writeAndIndex(ctx, id, tagResolver, timestamp,
   876  		value, unit, annotation, wOpts, true)
   877  }
   878  
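         // Write writes a datapoint for the series identified by id without touching
         // the reverse index. A minimal caller-side usage sketch (hypothetical values;
         // assumes the caller already holds a context and the shard):
         //
         //	seriesWrite, err := shard.Write(ctx, ident.StringID("cpu.user"),
         //		xtime.ToUnixNano(time.Now()), 0.42, xtime.Second, nil,
         //		series.WriteOptions{})
         //	if err != nil {
         //		return err
         //	}
         //	_ = seriesWrite.WasWritten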
   879  func (s *dbShard) Write(
   880  	ctx context.Context,
   881  	id ident.ID,
   882  	timestamp xtime.UnixNano,
   883  	value float64,
   884  	unit xtime.Unit,
   885  	annotation []byte,
   886  	wOpts series.WriteOptions,
   887  ) (SeriesWrite, error) {
   888  	return s.writeAndIndex(ctx, id, convert.EmptyTagMetadataResolver, timestamp,
   889  		value, unit, annotation, wOpts, false)
   890  }
   891  
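         // writeAndIndex resolves (or creates) the series entry for id and then
         // applies the write. When the series does not exist yet and new series are
         // written synchronously, the insert is enqueued and waited on before writing
         // directly to the series; when new series are written asynchronously, both
         // the insert and the write are queued and the annotation is cloned because
         // its lifetime outlives the caller. The returned SeriesWrite carries the
         // commit log metadata plus any pending reverse index insert.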
   892  func (s *dbShard) writeAndIndex(
   893  	ctx context.Context,
   894  	id ident.ID,
   895  	tagResolver convert.TagMetadataResolver,
   896  	timestamp xtime.UnixNano,
   897  	value float64,
   898  	unit xtime.Unit,
   899  	annotation []byte,
   900  	wOpts series.WriteOptions,
   901  	shouldReverseIndex bool,
   902  ) (SeriesWrite, error) {
   903  	// Prepare write
   904  	entry, opts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id)
   905  	if err != nil {
   906  		return SeriesWrite{}, err
   907  	}
   908  
   909  	writable := entry != nil
   910  
   911  	// If no entry and we are not writing new series asynchronously.
   912  	if !writable && !opts.WriteNewSeriesAsync {
   913  		// Avoid double lookup by enqueueing insert immediately.
   914  		result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{
   915  			hasPendingIndexing: shouldReverseIndex,
   916  			pendingIndex: dbShardPendingIndex{
   917  				timestamp:  timestamp,
   918  				enqueuedAt: s.nowFn(),
   919  			},
   920  		})
   921  		if err != nil {
   922  			return SeriesWrite{}, err
   923  		}
   924  
   925  		// Wait for the insert to be batched together and inserted
   926  		result.wg.Wait()
   927  
   928  		// Retrieve the inserted entry
   929  		entry, err = s.writableSeries(id, tagResolver)
   930  		if err != nil {
   931  			return SeriesWrite{}, err
   932  		}
   933  		writable = true
   934  
   935  		// NB(r): We just indexed this series if shouldReverseIndex was true
   936  		shouldReverseIndex = false
   937  	}
   938  
   939  	var (
   940  		commitLogSeriesID          ident.ID
   941  		commitLogSeriesUniqueIndex uint64
   942  		needsIndex                 bool
   943  		pendingIndexInsert         writes.PendingIndexInsert
   944  		// Err on the side of caution and always write to the commitlog if writing
   945  		// async, since there is no information about whether the write succeeded
   946  		// or not.
   947  		wasWritten = true
   948  	)
   949  	if writable {
   950  		// Perform write. No need to copy the annotation here because we're using it
    951  		// synchronously and all downstream code will copy anything they need to maintain
   952  		// a reference to.
   953  		wasWritten, _, err = entry.Series.Write(ctx, timestamp, value, unit, annotation, wOpts)
   954  		// Load series metadata before decrementing the writer count
   955  		// to ensure this metadata is snapshotted at a consistent state
   956  		// NB(r): We explicitly do not place the series ID back into a
   957  		// pool as high frequency users of series IDs such
   958  		// as the commit log need to use the reference without the
   959  		// overhead of ownership tracking. This makes taking a ref here safe.
   960  		commitLogSeriesID = entry.Series.ID()
   961  		commitLogSeriesUniqueIndex = entry.Index
   962  		if err == nil && shouldReverseIndex {
   963  			if entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) {
   964  				if !opts.WriteNewSeriesAsync {
   965  					return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled")
   966  				}
   967  				needsIndex = true
   968  				pendingIndexInsert = s.pendingIndexInsert(entry, timestamp)
   969  			}
   970  		}
   971  		// release the reference we got on entry from `writableSeries`
   972  		entry.DecrementReaderWriterCount()
   973  		if err != nil {
   974  			return SeriesWrite{}, err
   975  		}
   976  	} else {
   977  		// This is an asynchronous insert and write which means we need to clone the annotation
   978  		// because its lifecycle in the commit log is independent of the calling function.
   979  		var annotationClone checked.Bytes
   980  		if len(annotation) != 0 {
   981  			annotationClone = s.opts.BytesPool().Get(len(annotation))
   982  			// IncRef here so we can write the bytes in, but don't DecRef because the queue is about
   983  			// to take ownership and will DecRef when its done.
   984  			annotationClone.IncRef()
   985  			annotationClone.AppendAll(annotation)
   986  		}
   987  
   988  		result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{
   989  			hasPendingWrite: true,
   990  			pendingWrite: dbShardPendingWrite{
   991  				timestamp:  timestamp,
   992  				value:      value,
   993  				unit:       unit,
   994  				annotation: annotationClone,
   995  				opts:       wOpts,
   996  			},
   997  		})
   998  		if err != nil {
   999  			return SeriesWrite{}, err
  1000  		}
  1001  
  1002  		if shouldReverseIndex {
  1003  			if !opts.WriteNewSeriesAsync {
  1004  				return SeriesWrite{}, fmt.Errorf("to index async need write new series to be enabled")
  1005  			}
  1006  			needsIndex = true
  1007  			pendingIndexInsert = s.pendingIndexInsert(result.entry, timestamp)
  1008  		}
  1009  
  1010  		// NB(r): Make sure to use the copied ID which will eventually
   1011  		// be set to the newly inserted series ID.
  1012  		// The `id` var here is volatile after the context is closed
  1013  		// and adding ownership tracking to use it in the commit log
  1014  		// (i.e. registering a dependency on the context) is too expensive.
  1015  		commitLogSeriesID = result.copiedID
  1016  		commitLogSeriesUniqueIndex = result.entry.Index
  1017  	}
  1018  
  1019  	// Return metadata useful for writing to commit log and indexing.
  1020  	return SeriesWrite{
  1021  		Series: ts.Series{
  1022  			UniqueIndex: commitLogSeriesUniqueIndex,
  1023  			Namespace:   s.namespace.ID(),
  1024  			ID:          commitLogSeriesID,
  1025  			Shard:       s.shard,
  1026  		},
  1027  		WasWritten:         wasWritten,
  1028  		NeedsIndex:         needsIndex,
  1029  		PendingIndexInsert: pendingIndexInsert,
  1030  	}, nil
  1031  }
  1032  
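         // SeriesRefResolver returns a resolver for a series reference used during
         // bootstrapping. If the series already exists its entry is returned directly
         // with the reader/writer count incremented; otherwise a new entry is created
         // and enqueued (without rate limiting) and the returned resolver waits for
         // the queued insert before resolving the reference.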
  1033  func (s *dbShard) SeriesRefResolver(
  1034  	id ident.ID,
  1035  	tags ident.TagIterator,
  1036  ) (bootstrap.SeriesRefResolver, error) {
  1037  	// Try retrieve existing series.
  1038  	entry, err := s.retrieveWritableSeriesAndIncrementReaderWriterCount(id)
  1039  	if err != nil {
  1040  		return nil, err
  1041  	}
  1042  
  1043  	if entry != nil {
  1044  		// The read/write ref is already incremented.
  1045  		return entry, nil
  1046  	}
  1047  
  1048  	entry, err = s.newShardEntry(id, convert.NewTagsIterMetadataResolver(tags))
  1049  	if err != nil {
  1050  		return nil, err
  1051  	}
  1052  
  1053  	// Increment ref count to avoid expiration of the new entry just after adding it to the queue.
  1054  	// It is possible that this entry does not end up as the one in the shard. Therefore, the resolver
  1055  	// for this specific entry is responsible for closing, and there should always be one resolver
  1056  	// responsible for the one that DOES end up in the shard.
  1057  	entry.IncrementReaderWriterCount()
  1058  
  1059  	wg, err := s.insertQueue.Insert(dbShardInsert{
  1060  		entry: entry,
  1061  		opts: dbShardInsertAsyncOptions{
   1062  			// skipRateLimit is true since this method is used by bootstrapping
  1063  			// and should not be rate limited.
  1064  			skipRateLimit: true,
  1065  			// do not release entry ref during async write, because entry ref will be released when
  1066  			// ReleaseRef() is called on bootstrap.SeriesRefResolver.
  1067  			releaseEntryRef: false,
  1068  		},
  1069  	})
  1070  	if err != nil {
  1071  		return nil, err
  1072  	}
  1073  
  1074  	// Series will wait for the result to be batched together and inserted.
  1075  	return NewSeriesResolver(
  1076  		wg,
  1077  		entry,
  1078  		s.retrieveWritableSeriesAndIncrementReaderWriterCount), nil
  1079  }
  1080  
  1081  func (s *dbShard) ReadEncoded(
  1082  	ctx context.Context,
  1083  	id ident.ID,
  1084  	start, end xtime.UnixNano,
  1085  	nsCtx namespace.Context,
  1086  ) (series.BlockReaderIter, error) {
  1087  	s.RLock()
  1088  	entry, err := s.lookupEntryWithLock(id)
  1089  	if entry != nil {
  1090  		// NB(r): Ensure readers have consistent view of this series, do
  1091  		// not expire the series while being read from.
  1092  		entry.IncrementReaderWriterCount()
  1093  		defer entry.DecrementReaderWriterCount()
  1094  	}
  1095  	s.RUnlock()
  1096  
  1097  	if err == errShardEntryNotFound {
  1098  		switch s.opts.SeriesCachePolicy() {
  1099  		case series.CacheAll:
  1100  			// No-op, would be in memory if cached
  1101  			return nil, nil
  1102  		}
  1103  	} else if err != nil {
  1104  		return nil, err
  1105  	}
  1106  
  1107  	if entry != nil {
  1108  		return entry.Series.ReadEncoded(ctx, start, end, nsCtx)
  1109  	}
  1110  
  1111  	retriever := s.seriesBlockRetriever
  1112  	onRetrieve := s.seriesOnRetrieveBlock
  1113  	opts := s.seriesOpts
  1114  	reader := series.NewReaderUsingRetriever(id, retriever, onRetrieve, nil, opts)
  1115  	return reader.ReadEncoded(ctx, start, end, nsCtx)
  1116  }
  1117  
  1118  // lookupEntryWithLock returns the entry for a given id while holding a read lock or a write lock.
  1119  func (s *dbShard) lookupEntryWithLock(id ident.ID) (*Entry, error) {
  1120  	if s.state != dbShardStateOpen {
  1121  		// NB(r): Return an invalid params error here so any upstream
  1122  		// callers will not retry this operation
  1123  		return nil, xerrors.NewInvalidParamsError(errShardNotOpen)
  1124  	}
  1125  	elem, exists := s.lookup.Get(id)
  1126  	if !exists {
  1127  		return nil, errShardEntryNotFound
  1128  	}
  1129  	return elem.Value.(*Entry), nil
  1130  }
  1131  
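         // writableSeries returns the entry for id with its reader/writer count
         // incremented, creating the series via a batched async insert and retrying
         // the lookup until the entry is visible in the shard map.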
  1132  func (s *dbShard) writableSeries(id ident.ID, tagResolver convert.TagMetadataResolver) (*Entry, error) {
  1133  	for {
  1134  		entry, err := s.retrieveWritableSeriesAndIncrementReaderWriterCount(id)
  1135  		if entry != nil {
  1136  			return entry, nil
  1137  		}
  1138  		if err != nil {
  1139  			return nil, err
  1140  		}
  1141  
  1142  		// Not inserted, attempt a batched insert
  1143  		result, err := s.insertSeriesAsyncBatched(id, tagResolver, dbShardInsertAsyncOptions{})
  1144  		if err != nil {
  1145  			return nil, err
  1146  		}
  1147  
  1148  		// Wait for the insert attempt
  1149  		result.wg.Wait()
  1150  	}
  1151  }
  1152  
  1153  // WritableSeriesOptions defines writable series options.
  1154  type WritableSeriesOptions struct {
  1155  	// WriteNewSeriesAsync specifies if the series should be async written.
  1156  	WriteNewSeriesAsync bool
  1157  }
  1158  
  1159  // TryRetrieveSeriesAndIncrementReaderWriterCount attempts to retrieve a writable series.
  1160  // This increments the reader/writer count and so should be decremented when the series
  1161  // is no longer held.
  1162  func (s *dbShard) TryRetrieveSeriesAndIncrementReaderWriterCount(id ident.ID) (
  1163  	*Entry,
  1164  	WritableSeriesOptions,
  1165  	error,
  1166  ) {
  1167  	s.RLock()
  1168  	opts := WritableSeriesOptions{
  1169  		WriteNewSeriesAsync: s.currRuntimeOptions.writeNewSeriesAsync,
  1170  	}
  1171  	if entry, err := s.lookupEntryWithLock(id); err == nil {
  1172  		entry.IncrementReaderWriterCount()
  1173  		s.RUnlock()
  1174  		return entry, opts, nil
  1175  	} else if err != errShardEntryNotFound {
  1176  		s.RUnlock()
  1177  		return nil, opts, err
  1178  	}
  1179  	s.RUnlock()
  1180  	return nil, opts, nil
  1181  }
  1182  
  1183  func (s *dbShard) retrieveWritableSeriesAndIncrementReaderWriterCount(id ident.ID) (*Entry, error) {
  1184  	entry, _, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id)
  1185  	return entry, err
  1186  }
  1187  
  1188  func (s *dbShard) newShardEntry(
  1189  	id ident.ID,
  1190  	tagResolver convert.TagMetadataResolver,
  1191  ) (*Entry, error) {
  1192  	// NB(r): As documented in storage/series.DatabaseSeries the series IDs
  1193  	// and metadata are garbage collected, hence we cast the ID to a BytesID
  1194  	// that can't be finalized.
  1195  	// Since series are purged so infrequently the overhead of not releasing
  1196  	// back an ID and metadata to a pool is amortized over a long period of
  1197  	// time.
  1198  	// Also of note, when a series is indexed in multiple index segments it is
   1199  	// worth keeping the metadata around so it can be referenced multiple times
   1200  	// without creating a new array of []doc.Field for all the tags each time.
  1201  	// Hence this stays on the storage/series.DatabaseSeries for when it needs
  1202  	// to be re-indexed.
  1203  	var (
  1204  		seriesMetadata doc.Metadata
  1205  		err            error
  1206  	)
  1207  
  1208  	seriesMetadata, err = tagResolver.Resolve(id)
  1209  	if err != nil {
  1210  		return nil, err
  1211  	}
  1212  
  1213  	// Use the same bytes as the series metadata for the ID.
  1214  	seriesID := ident.BytesID(seriesMetadata.ID)
  1215  
  1216  	uniqueIndex := s.increasingIndex.nextIndex()
  1217  	newSeries := s.seriesPool.Get()
  1218  	newSeries.Reset(series.DatabaseSeriesOptions{
  1219  		ID:                     seriesID,
  1220  		Metadata:               seriesMetadata,
  1221  		UniqueIndex:            uniqueIndex,
  1222  		BlockRetriever:         s.seriesBlockRetriever,
  1223  		OnRetrieveBlock:        s.seriesOnRetrieveBlock,
  1224  		OnEvictedFromWiredList: s,
  1225  		Options:                s.seriesOpts,
  1226  	})
  1227  	return NewEntry(NewEntryOptions{
  1228  		Shard:        s,
  1229  		Series:       newSeries,
  1230  		Index:        uniqueIndex,
  1231  		IndexWriter:  s.reverseIndex,
  1232  		NowFn:        s.nowFn,
  1233  		EntryMetrics: s.entryMetrics,
  1234  	}), nil
  1235  }
  1236  
  1237  type insertAsyncResult struct {
  1238  	wg       *sync.WaitGroup
  1239  	copiedID ident.ID
  1240  	// entry is not guaranteed to be the final entry
  1241  	// inserted into the shard map in case there is already
  1242  	// an existing entry waiting in the insert queue
  1243  	entry *Entry
  1244  }
  1245  
  1246  func (s *dbShard) pendingIndexInsert(
  1247  	entry *Entry,
  1248  	timestamp xtime.UnixNano,
  1249  ) writes.PendingIndexInsert {
  1250  	// inc a ref on the entry to ensure it's valid until the queue acts upon it.
  1251  	entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(timestamp))
  1252  	return writes.PendingIndexInsert{
  1253  		Entry: index.WriteBatchEntry{
  1254  			Timestamp:     timestamp,
  1255  			OnIndexSeries: entry,
  1256  			EnqueuedAt:    s.nowFn(),
  1257  		},
  1258  		Document: entry.Series.Metadata(),
  1259  	}
  1260  }
  1261  
  1262  func (s *dbShard) insertSeriesForIndexingAsyncBatched(
  1263  	entry *Entry,
  1264  	timestamp xtime.UnixNano,
  1265  	async bool,
  1266  ) error {
  1267  	indexBlockStart := s.reverseIndex.BlockStartForWriteTime(timestamp)
  1268  	// inc a ref on the entry to ensure it's valid until the queue acts upon it.
  1269  	entry.OnIndexPrepare(indexBlockStart)
  1270  	wg, err := s.insertQueue.Insert(dbShardInsert{
  1271  		entry: entry,
  1272  		opts: dbShardInsertAsyncOptions{
  1273  			// NB(r): Just indexing, should not be considered for new
  1274  			// series insert rate limiting.
  1275  			skipRateLimit:      true,
  1276  			hasPendingIndexing: true,
  1277  			pendingIndex: dbShardPendingIndex{
  1278  				timestamp:  timestamp,
  1279  				enqueuedAt: s.nowFn(),
  1280  			},
  1281  			// indicate we already have inc'd the entry's ref count, so we can correctly
  1282  			// handle the ref counting semantics in `insertSeriesBatch`.
  1283  			releaseEntryRef: true,
  1284  		},
  1285  	})
  1286  	// i.e. unable to enqueue into shard insert queue
  1287  	if err != nil {
   1288  		entry.OnIndexFinalize(indexBlockStart) // release any references we've held for indexing
  1289  		return err
  1290  	}
  1291  
  1292  	// if operating in async mode, we're done
  1293  	if async {
  1294  		return nil
  1295  	}
  1296  
   1297  	// if indexing in sync mode, wait till we're done and ensure we have indexed the entry
  1298  	wg.Wait()
  1299  	if !entry.IndexedForBlockStart(indexBlockStart) {
  1300  		// i.e. indexing failed
  1301  		return fmt.Errorf("internal error: unable to index series")
  1302  	}
  1303  
  1304  	return nil
  1305  }
  1306  
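         // insertSeriesAsyncBatched creates a new shard entry for id and enqueues it
         // on the shard insert queue, returning a wait group that is signalled once
         // the batched insert has been processed. The returned entry is not
         // necessarily the one that ends up in the shard map if a concurrent insert
         // for the same ID wins the race.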
  1307  func (s *dbShard) insertSeriesAsyncBatched(
  1308  	id ident.ID,
  1309  	tagResolver convert.TagMetadataResolver,
  1310  	opts dbShardInsertAsyncOptions,
  1311  ) (insertAsyncResult, error) {
  1312  	entry, err := s.newShardEntry(id, tagResolver)
  1313  	if err != nil {
  1314  		return insertAsyncResult{}, err
  1315  	}
  1316  
  1317  	wg, err := s.insertQueue.Insert(dbShardInsert{
  1318  		entry: entry,
  1319  		opts:  opts,
  1320  	})
  1321  	return insertAsyncResult{
  1322  		wg: wg,
  1323  		// Make sure to return the copied ID from the new series.
  1324  		copiedID: entry.Series.ID(),
  1325  		entry:    entry,
  1326  	}, err
  1327  }
  1328  
  1329  type insertSyncType uint8
  1330  
  1331  // nolint: varcheck, unused
  1332  const (
  1333  	insertSync insertSyncType = iota
  1334  	insertSyncIncReaderWriterCount
  1335  )
  1336  
  1337  type insertSyncOptions struct {
  1338  	insertType      insertSyncType
  1339  	hasPendingIndex bool
  1340  	pendingIndex    dbShardPendingIndex
  1341  }
  1342  
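         // insertSeriesSync inserts a new series entry while holding the shard write
         // lock, returning the existing entry instead if one was inserted
         // concurrently. Any requested indexing is still enqueued asynchronously on
         // the insert queue after the entry is in place.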
  1343  func (s *dbShard) insertSeriesSync(
  1344  	id ident.ID,
  1345  	tagResolver convert.TagMetadataResolver,
  1346  	opts insertSyncOptions,
  1347  ) (*Entry, error) {
  1348  	// NB(r): Create new shard entry outside of write lock to reduce
  1349  	// time using write lock.
  1350  	newEntry, err := s.newShardEntry(id, tagResolver)
  1351  	if err != nil {
  1352  		// should never happen
  1353  		instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(),
  1354  			func(logger *zap.Logger) {
  1355  				logger.Error("insertSeriesSync error creating shard entry",
  1356  					zap.String("id", id.String()),
  1357  					zap.Error(err))
  1358  			})
  1359  		return nil, err
  1360  	}
  1361  
  1362  	s.Lock()
  1363  	unlocked := false
  1364  	defer func() {
  1365  		if !unlocked {
  1366  			s.Unlock()
  1367  		}
  1368  	}()
  1369  
  1370  	existingEntry, err := s.lookupEntryWithLock(id)
  1371  	if err != nil && err != errShardEntryNotFound {
  1372  		// Shard not taking inserts likely.
  1373  		return nil, err
  1374  	}
  1375  	if existingEntry != nil {
  1376  		// Already inserted, likely a race.
  1377  		return existingEntry, nil
  1378  	}
  1379  
  1380  	s.insertNewShardEntryWithLock(newEntry)
  1381  
  1382  	// Track unlocking.
  1383  	unlocked = true
  1384  	s.Unlock()
  1385  
  1386  	// Be sure to enqueue for indexing if requires a pending index.
  1387  	if opts.hasPendingIndex {
  1388  		if _, err := s.insertQueue.Insert(dbShardInsert{
  1389  			entry: newEntry,
  1390  			opts: dbShardInsertAsyncOptions{
  1391  				// NB(r): Just indexing, should not be considered for new
  1392  				// series insert rate limiting.
  1393  				skipRateLimit:      true,
  1394  				hasPendingIndexing: opts.hasPendingIndex,
  1395  				pendingIndex:       opts.pendingIndex,
  1396  			},
  1397  		}); err != nil {
  1398  			return nil, err
  1399  		}
  1400  	}
  1401  
  1402  	// Check if we're making a modification to this entry, be sure
  1403  	// to increment the writer count so it's visible when we release
  1404  	// the lock.
  1405  	if opts.insertType == insertSyncIncReaderWriterCount {
  1406  		newEntry.IncrementReaderWriterCount()
  1407  	}
  1408  
  1409  	return newEntry, nil
  1410  }
  1411  
  1412  func (s *dbShard) insertNewShardEntryWithLock(entry *Entry) {
  1413  	// Set the lookup value, we use the copied ID and since it is GC'd
  1414  	// we explicitly set it with options to not copy the key and not to
  1415  	// finalize it.
  1416  	copiedID := entry.Series.ID()
  1417  	listElem := s.list.PushBack(entry)
  1418  	s.lookup.SetUnsafe(copiedID, listElem, shardMapSetUnsafeOptions{
  1419  		NoCopyKey:     true,
  1420  		NoFinalizeKey: true,
  1421  	})
  1422  	entry.SetInsertTime(s.nowFn())
  1423  }
  1424  
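         // insertSeriesBatch is the callback invoked by the shard insert queue with a
         // batch of pending inserts. While holding the shard write lock it inserts
         // entries that are still missing (re-pointing inserts that lost a race at
         // the entry already in the map) and increments reader/writer counts for any
         // pending work; the pending writes, retrieved blocks and reverse index
         // writes are then applied outside the lock.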
  1425  func (s *dbShard) insertSeriesBatch(inserts []dbShardInsert) error {
  1426  	var (
  1427  		anyPendingAction   = false
  1428  		numPendingIndexing = 0
  1429  	)
  1430  
  1431  	s.Lock()
  1432  	for i := range inserts {
  1433  		// If we are going to write to this entry then increment the
  1434  		// writer count so it does not look empty immediately after
  1435  		// we release the write lock.
  1436  		hasPendingWrite := inserts[i].opts.hasPendingWrite
  1437  		hasPendingIndexing := inserts[i].opts.hasPendingIndexing
  1438  		hasPendingRetrievedBlock := inserts[i].opts.hasPendingRetrievedBlock
  1439  		anyPendingAction = anyPendingAction || hasPendingWrite ||
  1440  			hasPendingRetrievedBlock || hasPendingIndexing
  1441  
  1442  		if hasPendingIndexing {
  1443  			numPendingIndexing++
  1444  		}
  1445  
  1446  		// We don't need to inc the entry ref count if we already have a ref on the
  1447  		// entry, so check if that's the case.
  1448  		if inserts[i].opts.releaseEntryRef {
  1449  			// No need to inc a ref on the entry; we were given a writable entry as input.
  1450  			continue
  1451  		}
  1452  
  1453  		// i.e. we don't have a ref on the provided entry, so we check whether an entry
  1454  		// was created for the same ID between the operation being enqueued in the shard
  1455  		// insert queue and this function executing.
  1456  		entry, err := s.lookupEntryWithLock(inserts[i].entry.Series.ID())
  1457  		if entry != nil {
  1458  			// Already exists so update the entry we're pointed at for this insert.
  1459  			inserts[i].entry = entry
  1460  		}
  1461  
  1462  		if hasPendingIndexing || hasPendingWrite || hasPendingRetrievedBlock {
  1463  			// We're definitely writing a value, ensure that the pending write is
  1464  			// visible before we release the lookup write lock.
  1465  			inserts[i].entry.IncrementReaderWriterCount()
  1466  			// also indicate that we have a ref count on this entry for this operation.
  1467  			inserts[i].opts.releaseEntryRef = true
  1468  		}
  1469  
  1470  		if err == nil {
  1471  			// Already inserted.
  1472  			continue
  1473  		}
  1474  
  1475  		if err != errShardEntryNotFound {
  1476  			// Shard is not taking inserts.
  1477  			s.Unlock()
  1478  			// FOLLOWUP(prateek): is this an existing bug? why don't we need to release any ref's we've inc'd
  1479  			// on entries in the loop before this point, i.e. in range [0, i). Otherwise, how are those entries
  1480  			// going to get cleaned up?
  1481  			s.metrics.insertAsyncInsertErrors.Inc(int64(len(inserts) - i))
  1482  			return err
  1483  		}
  1484  
  1485  		// Insert still pending, perform the insert
  1486  		entry = inserts[i].entry
  1487  		s.insertNewShardEntryWithLock(entry)
  1488  	}
  1489  	s.Unlock()
  1490  
  1491  	if !anyPendingAction {
  1492  		return nil
  1493  	}
  1494  
  1495  	// Perform any indexing, pending writes or pending retrieved blocks outside of lock
  1496  	ctx := s.contextPool.Get()
  1497  	// TODO(prateek): pool this type
  1498  	indexBlockSize := s.namespace.Options().IndexOptions().BlockSize()
  1499  	indexBatch := index.NewWriteBatch(index.WriteBatchOptions{
  1500  		InitialCapacity: numPendingIndexing,
  1501  		IndexBlockSize:  indexBlockSize,
  1502  	})
  1503  	for i := range inserts {
  1504  		var (
  1505  			entry           = inserts[i].entry
  1506  			releaseEntryRef = inserts[i].opts.releaseEntryRef
  1507  			err             error
  1508  		)
  1509  
  1510  		if inserts[i].opts.hasPendingWrite {
  1511  			write := inserts[i].opts.pendingWrite
  1512  			var annotationBytes []byte
  1513  			if write.annotation != nil {
  1514  				annotationBytes = write.annotation.Bytes()
  1515  			}
  1516  			// NB: Ignore the `wasWritten` return argument here since this is an async
  1517  			// operation and there is nothing further to do with this value.
  1518  			// TODO: Consider propagating the `wasWritten` argument back to the caller
  1519  			// using waitgroup (or otherwise) in the future.
  1520  			_, _, err = entry.Series.Write(ctx, write.timestamp, write.value,
  1521  				write.unit, annotationBytes, write.opts)
  1522  			if err != nil {
  1523  				if xerrors.IsInvalidParams(err) {
  1524  					s.metrics.insertAsyncWriteInvalidParamsErrors.Inc(1)
  1525  				} else {
  1526  					s.metrics.insertAsyncWriteInternalErrors.Inc(1)
  1527  					s.logger.Error("error with async insert write", zap.Error(err))
  1528  				}
  1529  			}
  1530  
  1531  			if write.annotation != nil {
  1532  				// Now that we've performed the write, we can finalize the annotation because
  1533  				// we're done with it and all the code from the series downwards has copied any
  1534  				// data that it required.
  1535  				write.annotation.DecRef()
  1536  				write.annotation.Finalize()
  1537  			}
  1538  		}
  1539  
  1540  		if inserts[i].opts.hasPendingIndexing {
  1541  			pendingIndex := inserts[i].opts.pendingIndex
  1542  			// Increment the ref on the entry, as the original one was transferred to
  1543  			// this method (insertSeriesBatch) via the `releaseEntryRef` mechanism.
  1544  			entry.OnIndexPrepare(s.reverseIndex.BlockStartForWriteTime(pendingIndex.timestamp))
  1545  
  1546  			writeBatchEntry := index.WriteBatchEntry{
  1547  				Timestamp:     pendingIndex.timestamp,
  1548  				OnIndexSeries: entry,
  1549  				EnqueuedAt:    pendingIndex.enqueuedAt,
  1550  			}
  1551  
  1552  			indexBatch.Append(writeBatchEntry, entry.Series.Metadata())
  1553  		}
  1554  
  1555  		if inserts[i].opts.hasPendingRetrievedBlock {
  1556  			block := inserts[i].opts.pendingRetrievedBlock
  1557  			entry.Series.OnRetrieveBlock(block.id, block.tags, block.start, block.segment, block.nsCtx)
  1558  		}
  1559  
  1560  		// Entries in the shard insert queue are one of:
  1561  		// - new entries
  1562  		// - existing entries that we've taken a ref on (marked as releaseEntryRef)
  1563  		if releaseEntryRef {
  1564  			entry.DecrementReaderWriterCount()
  1565  		}
  1566  	}
  1567  
  1568  	var err error
  1569  	// index all requested entries in batch.
  1570  	if n := indexBatch.Len(); n > 0 {
  1571  		err = s.reverseIndex.WriteBatch(indexBatch)
  1572  		if err != nil {
  1573  			s.metrics.insertAsyncIndexErrors.Inc(int64(n))
  1574  		}
  1575  	}
  1576  
  1577  	// Avoid goroutine spinning up to close this context
  1578  	ctx.BlockingClose()
  1579  
  1580  	return err
  1581  }
  1582  
  1583  func (s *dbShard) FetchBlocks(
  1584  	ctx context.Context,
  1585  	id ident.ID,
  1586  	starts []xtime.UnixNano,
  1587  	nsCtx namespace.Context,
  1588  ) ([]block.FetchBlockResult, error) {
  1589  	s.RLock()
  1590  	entry, err := s.lookupEntryWithLock(id)
  1591  	if entry != nil {
  1592  		// NB(r): Ensure readers have consistent view of this series, do
  1593  		// not expire the series while being read from.
  1594  		entry.IncrementReaderWriterCount()
  1595  		defer entry.DecrementReaderWriterCount()
  1596  	}
  1597  	s.RUnlock()
  1598  
  1599  	if err == errShardEntryNotFound {
  1600  		switch s.opts.SeriesCachePolicy() {
  1601  		case series.CacheAll:
  1602  			// No-op, would be in memory if cached
  1603  			return nil, nil
  1604  		}
  1605  	} else if err != nil {
  1606  		return nil, err
  1607  	}
  1608  
  1609  	if entry != nil {
  1610  		return entry.Series.FetchBlocks(ctx, starts, nsCtx)
  1611  	}
  1612  
  1613  	retriever := s.seriesBlockRetriever
  1614  	onRetrieve := s.seriesOnRetrieveBlock
  1615  	opts := s.seriesOpts
  1616  	// Nil for onRead callback because we don't want peer bootstrapping to impact
  1617  	// the behavior of the LRU
  1618  	var onReadCb block.OnReadBlock
  1619  	reader := series.NewReaderUsingRetriever(id, retriever, onRetrieve, onReadCb, opts)
  1620  	return reader.FetchBlocks(ctx, starts, nsCtx)
  1621  }
  1622  
  1623  func (s *dbShard) FetchBlocksForColdFlush(
  1624  	ctx context.Context,
  1625  	seriesID ident.ID,
  1626  	start xtime.UnixNano,
  1627  	version int,
  1628  	nsCtx namespace.Context,
  1629  ) (block.FetchBlockResult, error) {
  1630  	s.RLock()
  1631  	entry, err := s.lookupEntryWithLock(seriesID)
  1632  	s.RUnlock()
  1633  	if entry == nil || err != nil {
  1634  		return block.FetchBlockResult{}, err
  1635  	}
  1636  
  1637  	return entry.Series.FetchBlocksForColdFlush(ctx, start, version, nsCtx)
  1638  }
  1639  
  1640  func (s *dbShard) fetchActiveBlocksMetadata(
  1641  	ctx context.Context,
  1642  	start, end xtime.UnixNano,
  1643  	limit int64,
  1644  	indexCursor int64,
  1645  	opts series.FetchBlocksMetadataOptions,
  1646  ) (block.FetchBlocksMetadataResults, *int64, error) {
  1647  	var (
  1648  		res             = s.opts.FetchBlocksMetadataResultsPool().Get()
  1649  		fetchCtx        = s.contextPool.Get()
  1650  		nextIndexCursor *int64
  1651  	)
  1652  
  1653  	var loopErr error
  1654  	s.forEachShardEntry(func(entry *Entry) bool {
  1655  		// Break out of the iteration loop once we've accumulated enough entries.
  1656  		if int64(len(res.Results())) >= limit {
  1657  			next := int64(entry.Index)
  1658  			nextIndexCursor = &next
  1659  			return false
  1660  		}
  1661  
  1662  		// Fast forward past indexes lower than page token
  1663  		if int64(entry.Index) < indexCursor {
  1664  			return true
  1665  		}
  1666  
  1667  		// Use a context here that we finalize immediately so the stream
  1668  		// readers can be returned to the pool after we finish fetching the
  1669  		// metadata for this series.
  1670  		// NB(r): Use a pooled context for pooled finalizers/closers, but
  1671  		// reuse it so we don't need to put and get from the pool each iteration.
  1672  		fetchCtx.Reset()
  1673  		metadata, err := entry.Series.FetchBlocksMetadata(ctx, start, end, opts)
  1674  		fetchCtx.BlockingCloseReset()
  1675  		if err != nil {
  1676  			loopErr = err
  1677  			return false
  1678  		}
  1679  
  1680  		// If the blocksMetadata is empty, the series has no data within the specified
  1681  		// time range, so we don't return it to the client.
  1682  		if len(metadata.Blocks.Results()) == 0 {
  1683  			metadata.Blocks.Close()
  1684  			return true
  1685  		}
  1686  
  1687  		// Otherwise add it to the result which takes care of closing the metadata
  1688  		res.Add(metadata)
  1689  
  1690  		return true
  1691  	})
  1692  
  1693  	return res, nextIndexCursor, loopErr
  1694  }
  1695  
  1696  func (s *dbShard) FetchBlocksMetadataV2(
  1697  	ctx context.Context,
  1698  	start, end xtime.UnixNano,
  1699  	limit int64,
  1700  	encodedPageToken PageToken,
  1701  	opts block.FetchBlocksMetadataOptions,
  1702  ) (block.FetchBlocksMetadataResults, PageToken, error) {
  1703  	token := new(pagetoken.PageToken)
  1704  	if encodedPageToken != nil {
  1705  		if err := proto.Unmarshal(encodedPageToken, token); err != nil {
  1706  			return nil, nil, xerrors.NewInvalidParamsError(errShardInvalidPageToken)
  1707  		}
  1708  	} else {
  1709  		// NB(bodu): Allow callers to specify that they only want results from disk.
  1710  		if opts.OnlyDisk {
  1711  			token.FlushedSeriesPhase = &pagetoken.PageToken_FlushedSeriesPhase{}
  1712  		}
  1713  	}
  1714  
  1715  	// NB(r): If returning mixed in memory and disk results, then we return anything
  1716  	// that's mutable in memory first then all disk results.
  1717  	// We work backwards so we don't hit race conditions with blocks
  1718  	// being flushed and potentially missed between paginations. Working
  1719  	// backwards means that we might duplicate metadata sent back switching
  1720  	// between active phase and flushed phase, but that's better than missing
  1721  	// data working in the opposite direction. De-duping which block time ranges
  1722  	// were actually sent is also difficult as it's not always a consistent view
  1723  	// across async pagination.
  1724  	// Duplicating the metadata sent back means that consumers get a consistent
  1725  	// view of the world if they merge all the results together.
  1726  	// In the future we should consider the lifecycle of fileset files rather
  1727  	// than directly working with them here while filesystem cleanup manager
  1728  	// could delete them mid-read, on linux this is ok as it's just an unlink
  1729  	// and we'll finish our read cleanly. If there's a race between us thinking
  1730  	// the file is accessible and us opening a reader to it then this will bubble
  1731  	// an error to the client which will be retried.
  1732  	var (
  1733  		activePhase  = token.ActiveSeriesPhase
  1734  		flushedPhase = token.FlushedSeriesPhase
  1735  	)
  1736  	if flushedPhase == nil {
  1737  		// If the first phase has started, or no phases have started, then return
  1738  		// active series metadata until we find a block start time that we have
  1739  		// fileset files for.
  1740  		indexCursor := int64(0)
  1741  		if activePhase != nil {
  1742  			indexCursor = activePhase.IndexCursor
  1743  		}
  1744  		// We do not include cached blocks because we'll send metadata for
  1745  		// those blocks when we send metadata directly from the flushed files.
  1746  		seriesFetchBlocksMetadataOpts := series.FetchBlocksMetadataOptions{
  1747  			FetchBlocksMetadataOptions: opts,
  1748  		}
  1749  		result, nextIndexCursor, err := s.fetchActiveBlocksMetadata(ctx, start, end,
  1750  			limit, indexCursor, seriesFetchBlocksMetadataOpts)
  1751  		if err != nil {
  1752  			return nil, nil, err
  1753  		}
  1754  
  1755  		// Encode the next page token.
  1756  		if nextIndexCursor == nil {
  1757  			// Next phase, no more results from active series.
  1758  			token = &pagetoken.PageToken{
  1759  				FlushedSeriesPhase: &pagetoken.PageToken_FlushedSeriesPhase{},
  1760  			}
  1761  		} else {
  1762  			// This phase is still active.
  1763  			token = &pagetoken.PageToken{
  1764  				ActiveSeriesPhase: &pagetoken.PageToken_ActiveSeriesPhase{
  1765  					IndexCursor: *nextIndexCursor,
  1766  				},
  1767  			}
  1768  		}
  1769  
  1770  		data, err := proto.Marshal(token)
  1771  		if err != nil {
  1772  			return nil, nil, err
  1773  		}
  1774  
  1775  		return result, PageToken(data), nil
  1776  	}
  1777  
  1778  	// Must be in the second phase, start with checking the latest possible
  1779  	// flushed block and work backwards.
  1780  	var (
  1781  		result    = s.opts.FetchBlocksMetadataResultsPool().Get()
  1782  		ropts     = s.namespace.Options().RetentionOptions()
  1783  		blockSize = ropts.BlockSize()
  1784  		// Subtract one block size because all fetch requests are exclusive on the end side.
  1785  		blockStart      = end.Truncate(blockSize).Add(-1 * blockSize)
  1786  		now             = xtime.ToUnixNano(s.nowFn())
  1787  		tokenBlockStart xtime.UnixNano
  1788  		numResults      int64
  1789  	)
  1790  	if flushedPhase.CurrBlockStartUnixNanos > 0 {
  1791  		tokenBlockStart = xtime.UnixNano(flushedPhase.CurrBlockStartUnixNanos)
  1792  		blockStart = tokenBlockStart
  1793  	}
  1794  
  1795  	// Work backwards while in requested range and not before retention.
  1796  	for !blockStart.Before(start) &&
  1797  		!blockStart.Before(retention.FlushTimeStart(ropts, now)) {
  1798  		exists, err := s.namespaceReaderMgr.filesetExistsAt(s.shard, blockStart)
  1799  		if err != nil {
  1800  			return nil, nil, err
  1801  		}
  1802  		if !exists {
  1803  			// No fileset files here.
  1804  			blockStart = blockStart.Add(-1 * blockSize)
  1805  			continue
  1806  		}
  1807  
  1808  		var pos readerPosition
  1809  		if !tokenBlockStart.IsZero() {
  1810  			// We were previously seeking through a previous block, so validate that this
  1811  			// is the correct one we found; otherwise the file just went missing.
  1812  			if !blockStart.Equal(tokenBlockStart) {
  1813  				return nil, nil, fmt.Errorf(
  1814  					"was reading block at %v but next available block is: %v",
  1815  					tokenBlockStart, blockStart)
  1816  			}
  1817  
  1818  			// If we move onto the next block, we do not need to check that it matches
  1819  			// the token's block start on the next iteration.
  1820  			tokenBlockStart = 0
  1821  
  1822  			pos.metadataIdx = int(flushedPhase.CurrBlockEntryIdx)
  1823  			pos.volume = int(flushedPhase.Volume)
  1824  		}
  1825  
  1826  		// Open a reader at this position, potentially from cache.
  1827  		reader, err := s.namespaceReaderMgr.get(s.shard, blockStart, pos)
  1828  		if err != nil {
  1829  			return nil, nil, err
  1830  		}
  1831  
  1832  		for numResults < limit {
  1833  			id, tags, size, checksum, err := reader.ReadMetadata()
  1834  			if err == io.EOF {
  1835  				// Clean end of volume, we can break now.
  1836  				if err := reader.Close(); err != nil {
  1837  					return nil, nil, fmt.Errorf(
  1838  						"could not close metadata reader for block %v: %v",
  1839  						blockStart, err)
  1840  				}
  1841  				break
  1842  			}
  1843  			if err != nil {
  1844  				// Best effort to close the reader on a read error.
  1845  				if err := reader.Close(); err != nil {
  1846  					s.logger.Error("could not close reader on unexpected err", zap.Error(err))
  1847  				}
  1848  				return nil, nil, fmt.Errorf(
  1849  					"could not read metadata for block %v: %v",
  1850  					blockStart, err)
  1851  			}
  1852  
  1853  			blockResult := s.opts.FetchBlockMetadataResultsPool().Get()
  1854  			value := block.FetchBlockMetadataResult{
  1855  				Start: blockStart,
  1856  			}
  1857  			if opts.IncludeSizes {
  1858  				value.Size = int64(size)
  1859  			}
  1860  			if opts.IncludeChecksums {
  1861  				v := checksum
  1862  				value.Checksum = &v
  1863  			}
  1864  			blockResult.Add(value)
  1865  
  1866  			numResults++
  1867  			result.Add(block.NewFetchBlocksMetadataResult(id, tags,
  1868  				blockResult))
  1869  		}
  1870  
  1871  		endPos := int64(reader.MetadataRead())
  1872  		// This volume may be different from the one initially requested,
  1873  		// e.g. if there was a compaction between the last call and this
  1874  		// one, so be sure to update the state of the pageToken. If this is not
  1875  		// updated, the request would have to start from the beginning since it
  1876  		// would be requesting a stale volume, which could result in an infinite
  1877  		// loop of requests that never complete.
  1878  		volume := int64(reader.Status().Volume)
  1879  
  1880  		// Return the reader to the cache. Since this is effectively putting
  1881  		// the reader into a shared pool, don't use the reader after this call.
  1882  		err = s.namespaceReaderMgr.put(reader)
  1883  		if err != nil {
  1884  			return nil, nil, err
  1885  		}
  1886  
  1887  		if numResults >= limit {
  1888  			// We hit the limit, return results with page token.
  1889  			token = &pagetoken.PageToken{
  1890  				FlushedSeriesPhase: &pagetoken.PageToken_FlushedSeriesPhase{
  1891  					CurrBlockStartUnixNanos: int64(blockStart),
  1892  					CurrBlockEntryIdx:       endPos,
  1893  					Volume:                  volume,
  1894  				},
  1895  			}
  1896  			data, err := proto.Marshal(token)
  1897  			if err != nil {
  1898  				return nil, nil, err
  1899  			}
  1900  			return result, data, nil
  1901  		}
  1902  
  1903  		// Otherwise we move on to the previous block.
  1904  		blockStart = blockStart.Add(-1 * blockSize)
  1905  	}
  1906  
  1907  	// No more results if we fall through.
  1908  	return result, nil, nil
  1909  }
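
// exampleFetchAllBlockMetadataPages is an illustrative sketch (not part of the
// original code, and the name is hypothetical) of how the paginated
// FetchBlocksMetadataV2 API above is meant to be consumed: pass a nil page token
// on the first call and keep feeding the returned token back in until it is nil,
// which covers both the active-series and flushed-series phases.
func exampleFetchAllBlockMetadataPages(
	ctx context.Context,
	s *dbShard,
	start, end xtime.UnixNano,
	limit int64,
	opts block.FetchBlocksMetadataOptions,
) ([]block.FetchBlocksMetadataResults, error) {
	var (
		pages     []block.FetchBlocksMetadataResults
		pageToken PageToken
	)
	for {
		res, nextToken, err := s.FetchBlocksMetadataV2(ctx, start, end, limit, pageToken, opts)
		if err != nil {
			return nil, err
		}
		pages = append(pages, res)
		if nextToken == nil {
			// No more results in either the active or flushed phase.
			return pages, nil
		}
		pageToken = nextToken
	}
}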
  1910  
  1911  func (s *dbShard) PrepareBootstrap(ctx context.Context) error {
  1912  	ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.ShardPrepareBootstrap)
  1913  	defer span.Finish()
  1914  
  1915  	if sampled {
  1916  		span.LogFields(log.Int("shard", int(s.shard)))
  1917  	}
  1918  
  1919  	// Iterate flushed time ranges to determine which blocks are retrievable.
  1920  	// NB(r): This must be done before bootstrap since during bootstrapping
  1921  	// series will load blocks into series with series.LoadBlock(...) which
  1922  	// needs to ask the shard whether certain time windows have been flushed or
  1923  	// not.
  1924  	s.initializeFlushStates()
  1925  	return nil
  1926  }
  1927  
  1928  func (s *dbShard) initializeFlushStates() {
  1929  	s.flushState.RLock()
  1930  	initialized := s.flushState.initialized
  1931  	s.flushState.RUnlock()
  1932  	if initialized {
  1933  		return
  1934  	}
  1935  
  1936  	defer func() {
  1937  		s.flushState.Lock()
  1938  		s.flushState.initialized = true
  1939  		s.flushState.Unlock()
  1940  	}()
  1941  
  1942  	s.UpdateFlushStates()
  1944  }
  1945  
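// UpdateFlushStates refreshes the shard's flush states from the info files on
// disk: it marks warm data (and, when indexing is enabled, warm index) flush
// states as successful and advances the cold flushed/retrievable versions to
// the highest volume index found for each block start.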
  1946  func (s *dbShard) UpdateFlushStates() {
  1947  	fsOpts := s.opts.CommitLogOptions().FilesystemOptions()
  1948  	readInfoFilesResults := fs.ReadInfoFiles(fsOpts.FilePathPrefix(), s.namespace.ID(), s.shard,
  1949  		fsOpts.InfoReaderBufferSize(), fsOpts.DecodingOptions(), persist.FileSetFlushType)
  1950  
  1951  	for _, result := range readInfoFilesResults {
  1952  		if err := result.Err.Error(); err != nil {
  1953  			s.logger.Error("unable to read info files in shard bootstrap",
  1954  				zap.Uint32("shard", s.ID()),
  1955  				zap.Stringer("namespace", s.namespace.ID()),
  1956  				zap.String("filepath", result.Err.Filepath()),
  1957  				zap.Error(err))
  1958  			continue
  1959  		}
  1960  
  1961  		info := result.Info
  1962  		at := xtime.UnixNano(info.BlockStart)
  1963  		currState := s.flushStateNoBootstrapCheck(at)
  1964  
  1965  		if currState.WarmStatus.DataFlushed != fileOpSuccess {
  1966  			s.markWarmDataFlushStateSuccess(at)
  1967  		}
  1968  
  1969  		// Cold version needs to get bootstrapped so that the 1:1 relationship
  1970  		// between volume number and cold version is maintained and the volume
  1971  		// numbers / flush versions remain monotonically increasing.
  1972  		//
  1973  		// Note that there can be multiple info files for the same block, for
  1974  		// example if the database didn't get to clean up compacted filesets
  1975  		// before terminating.
  1976  		if currState.ColdVersionRetrievable < info.VolumeIndex {
  1977  			s.setFlushStateColdVersionRetrievable(at, info.VolumeIndex)
  1978  			s.setFlushStateColdVersionFlushed(at, info.VolumeIndex)
  1979  		}
  1980  	}
  1981  
  1982  	// Populate index flush state only if enabled.
  1983  	if !s.indexEnabled {
  1984  		return
  1985  	}
  1986  
  1987  	blockSize := s.namespace.Options().RetentionOptions().BlockSize()
  1988  	indexBlockSize := s.namespace.Options().IndexOptions().BlockSize()
  1989  
  1990  	indexFlushedBlockStarts := s.reverseIndex.WarmFlushBlockStarts()
  1991  	for _, blockStart := range indexFlushedBlockStarts {
  1992  		// Index block size is wider than data block size, so we want to set all data blockStarts
  1993  		// within the range of a given index blockStart
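		// For example (hypothetical sizes): with a 2h data block size and a 4h index
		// block size, an index blockStart of 10:00 covers the 10:00 and 12:00 data
		// blockStarts, and both are marked as index-flushed below.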
  1994  		blockEnd := blockStart.Add(indexBlockSize)
  1995  		for at := blockStart; at < blockEnd; at = at.Add(blockSize) {
  1996  			currState := s.flushStateNoBootstrapCheck(at)
  1997  			if currState.WarmStatus.IndexFlushed != fileOpSuccess {
  1998  				s.markWarmIndexFlushStateSuccess(at)
  1999  			}
  2000  		}
  2001  	}
  2002  }
  2003  
  2004  func (s *dbShard) Bootstrap(
  2005  	ctx context.Context,
  2006  	nsCtx namespace.Context,
  2007  ) error {
  2008  	ctx, span, sampled := ctx.StartSampledTraceSpan(tracepoint.ShardBootstrap)
  2009  	defer span.Finish()
  2010  
  2011  	if sampled {
  2012  		span.LogFields(log.Int("shard", int(s.shard)))
  2013  	}
  2014  
  2015  	s.Lock()
  2016  	if s.bootstrapState == Bootstrapped {
  2017  		s.Unlock()
  2018  		return errShardAlreadyBootstrapped
  2019  	}
  2020  	if s.bootstrapState == Bootstrapping {
  2021  		s.Unlock()
  2022  		return errShardIsBootstrapping
  2023  	}
  2024  	s.bootstrapState = Bootstrapping
  2025  	s.Unlock()
  2026  
  2027  	multiErr := xerrors.NewMultiError()
  2028  
  2029  	// Initialize the flush states if we haven't called prepare bootstrap.
  2030  	if err := s.PrepareBootstrap(ctx); err != nil {
  2031  		multiErr = multiErr.Add(err)
  2032  	}
  2033  
  2034  	// Now that this shard has finished bootstrapping, attempt to cache all of its seekers. Cannot call
  2035  	// this earlier as block lease verification will fail due to the shards not being bootstrapped
  2036  	// (and as a result no leases can be verified since the flush state is not yet known).
  2037  	if err := s.cacheShardIndices(); err != nil {
  2038  		multiErr = multiErr.Add(err)
  2039  	}
  2040  
  2041  	// Move any bootstrap buffers into position for reading.
  2042  	s.forEachShardEntry(func(entry *Entry) bool {
  2043  		if err := entry.Series.Bootstrap(nsCtx); err != nil {
  2044  			multiErr = multiErr.Add(err)
  2045  		}
  2046  		return true
  2047  	})
  2048  
  2049  	s.Lock()
  2050  	s.bootstrapState = Bootstrapped
  2051  	s.Unlock()
  2052  
  2053  	return multiErr.FinalError()
  2054  }
  2055  
  2056  func (s *dbShard) LoadBlocks(
  2057  	seriesToLoad *result.Map,
  2058  ) error {
  2059  	if seriesToLoad == nil {
  2060  		return errTriedToLoadNilSeries
  2061  	}
  2062  
  2063  	s.Lock()
  2064  	// Don't allow loads until the shard is bootstrapped because the shard flush states need to be
  2065  	// bootstrapped in order to safely load blocks. This also keeps things simpler to reason about.
  2066  	if s.bootstrapState != Bootstrapped {
  2067  		s.Unlock()
  2068  		return errShardIsNotBootstrapped
  2069  	}
  2070  	s.Unlock()
  2071  
  2072  	memTracker := s.opts.MemoryTracker()
  2073  	estimatedSize := result.EstimateMapBytesSize(seriesToLoad)
  2074  	ok := memTracker.IncNumLoadedBytes(estimatedSize)
  2075  	if !ok {
  2076  		return ErrDatabaseLoadLimitHit
  2077  	}
  2078  
  2079  	multiErr := xerrors.NewMultiError()
  2080  	for _, elem := range seriesToLoad.Iter() {
  2081  		dbBlocks := elem.Value()
  2082  		id := dbBlocks.ID
  2083  		tags := dbBlocks.Tags
  2084  
  2085  		canFinalizeTagsAll := true
  2086  		for _, block := range dbBlocks.Blocks.AllBlocks() {
  2087  			result, err := s.loadBlock(id, tags, block)
  2088  			if err != nil {
  2089  				multiErr = multiErr.Add(err)
  2090  			}
  2091  
  2092  			canFinalizeTagsAll = canFinalizeTagsAll && result.canFinalizeTags
  2093  		}
  2094  
  2095  		if canFinalizeTagsAll {
  2096  			tags.Finalize()
  2097  		}
  2098  	}
  2099  
  2100  	return multiErr.FinalError()
  2101  }
  2102  
  2103  type loadBlockResult struct {
  2104  	canFinalizeTags bool
  2105  }
  2106  
  2107  func (s *dbShard) loadBlock(
  2108  	id ident.ID,
  2109  	tags ident.Tags,
  2110  	block block.DatabaseBlock,
  2111  ) (loadBlockResult, error) {
  2112  	var (
  2113  		timestamp = block.StartTime()
  2114  		result    loadBlockResult
  2115  	)
  2116  
  2117  	// First lookup if series already exists.
  2118  	entry, shardOpts, err := s.TryRetrieveSeriesAndIncrementReaderWriterCount(id)
  2119  	if err != nil && err != errShardEntryNotFound {
  2120  		return result, err
  2121  	}
  2122  	if entry == nil {
  2123  		// Synchronously insert to avoid waiting for the insert queue which could potentially
  2124  		// delay the insert.
  2125  		entry, err = s.insertSeriesSync(id, convert.NewTagsMetadataResolver(tags),
  2126  			insertSyncOptions{
  2127  				// NB(r): Because insertSyncIncReaderWriterCount is used here we
  2128  				// don't need to explicitly increment the reader/writer count and it
  2129  				// will happen while the write lock is held so that it can't immediately
  2130  				// be expired.
  2131  				insertType:      insertSyncIncReaderWriterCount,
  2132  				hasPendingIndex: s.reverseIndex != nil,
  2133  				pendingIndex: dbShardPendingIndex{
  2134  					timestamp:  timestamp,
  2135  					enqueuedAt: s.nowFn(),
  2136  				},
  2137  			})
  2138  		if err != nil {
  2139  			return result, err
  2140  		}
  2141  	} else {
  2142  		// No longer needed as we found the series and we don't require
  2143  		// the tags for insertion.
  2144  		// FOLLOWUP(r): Audit places that keep refs to the ID from a
  2145  		// bootstrap result. newShardEntry copies it, but some of the
  2146  		// bootstrapped blocks keep refs to the ID via seriesID when using
  2147  		// certain series cache policies, so for now these IDs will be
  2148  		// garbage collected.
  2149  		result.canFinalizeTags = true
  2150  	}
  2151  
  2152  	// Always decrement the reader writer count.
  2153  	defer entry.DecrementReaderWriterCount()
  2154  
  2155  	// NB(rartoul): The data being loaded is not part of the bootstrap process, so it needs to
  2156  	// be loaded as a cold write because the load could be happening concurrently with
  2157  	// other processes like the flush (as opposed to bootstrap which cannot happen
  2158  	// concurrently with a flush) and there is no way to know if this series/block
  2159  	// combination has been warm flushed or not yet since updating the shard block state
  2160  	// doesn't happen until the entire flush completes.
  2161  	//
  2162  	// As a result the only safe operation is to load the block as a cold write which
  2163  	// ensures that the data will eventually be flushed and merged with the existing data
  2164  	// on disk in the two scenarios where the Load() API is used (cold writes and repairs).
  2165  	if err := entry.Series.LoadBlock(block, series.ColdWrite); err != nil {
  2166  		return result, err
  2167  	}
  2168  	// Cannot close blocks once done as series takes ref to them.
  2169  
  2170  	// Check if needs to be reverse indexed.
  2171  	if s.reverseIndex != nil &&
  2172  		entry.NeedsIndexUpdate(s.reverseIndex.BlockStartForWriteTime(timestamp)) {
  2173  		err = s.insertSeriesForIndexingAsyncBatched(entry, timestamp,
  2174  			shardOpts.WriteNewSeriesAsync)
  2175  		if err != nil {
  2176  			return result, err
  2177  		}
  2178  	}
  2179  
  2180  	return result, nil
  2181  }
  2182  
  2183  func (s *dbShard) cacheShardIndices() error {
  2184  	retriever := s.DatabaseBlockRetriever
  2185  	// May be nil depending on the caching policy.
  2186  	if retriever == nil {
  2187  		return nil
  2188  	}
  2189  
  2190  	s.logger.Debug("caching shard indices", zap.Uint32("shard", s.ID()))
  2191  	if err := retriever.CacheShardIndices([]uint32{s.ID()}); err != nil {
  2192  		s.logger.Error("caching shard indices error",
  2193  			zap.Uint32("shard", s.ID()),
  2194  			zap.Error(err))
  2195  		return err
  2196  	}
  2197  
  2198  	s.logger.Debug("caching shard indices completed successfully",
  2199  		zap.Uint32("shard", s.ID()))
  2200  	return nil
  2201  }
  2202  
  2203  func (s *dbShard) WarmFlush(
  2204  	blockStart xtime.UnixNano,
  2205  	flushPreparer persist.FlushPreparer,
  2206  	nsCtx namespace.Context,
  2207  ) error {
  2208  	// We don't flush data when the shard is still bootstrapping
  2209  	s.RLock()
  2210  	if s.bootstrapState != Bootstrapped {
  2211  		s.RUnlock()
  2212  		return errShardNotBootstrappedToFlush
  2213  	}
  2214  	s.RUnlock()
  2215  
  2216  	prepareOpts := persist.DataPrepareOptions{
  2217  		NamespaceMetadata: s.namespace,
  2218  		Shard:             s.ID(),
  2219  		BlockStart:        blockStart,
  2220  		// Volume index is always 0 for warm flushes because a warm flush must
  2221  		// happen before any cold flushes can.
  2222  		VolumeIndex: 0,
  2223  		// We explicitly set delete if exists to false here as we track which
  2224  		// filesets exist at bootstrap time so we should never encounter a time
  2225  		// where a fileset already exists when we attempt to flush unless there
  2226  		// is a bug in the code.
  2227  		DeleteIfExists: false,
  2228  		FileSetType:    persist.FileSetFlushType,
  2229  	}
  2230  	prepared, err := flushPreparer.PrepareData(prepareOpts)
  2231  	if err != nil {
  2232  		return err
  2233  	}
  2234  
  2235  	var multiErr xerrors.MultiError
  2236  	flushCtx := s.contextPool.Get() // From pool so finalizers are from pool.
  2237  
  2238  	flushResult := dbShardFlushResult{}
  2239  	s.forEachShardEntry(func(entry *Entry) bool {
  2240  		curr := entry.Series
  2241  		// Use a temporary context here so the stream readers can be returned to
  2242  		// the pool after we finish flushing the series.
  2243  		flushCtx.Reset()
  2244  		flushOutcome, err := curr.WarmFlush(flushCtx, blockStart, prepared.Persist, nsCtx)
  2245  		// Use BlockingCloseReset so context doesn't get returned to the pool.
  2246  		flushCtx.BlockingCloseReset()
  2247  
  2248  		if err != nil {
  2249  			multiErr = multiErr.Add(err)
  2250  			// If we encounter an error when persisting a series, don't continue as
  2251  			// the file on disk could be in a corrupt state.
  2252  			return false
  2253  		}
  2254  
  2255  		flushResult.update(flushOutcome)
  2256  
  2257  		return true
  2258  	})
  2259  
  2260  	s.logFlushResult(flushResult)
  2261  
  2262  	if err := prepared.Close(); err != nil {
  2263  		multiErr = multiErr.Add(err)
  2264  	}
  2265  
  2266  	return s.markWarmDataFlushStateSuccessOrError(blockStart, multiErr.FinalError())
  2267  }
  2268  
  2269  func (s *dbShard) ColdFlush(
  2270  	flushPreparer persist.FlushPreparer,
  2271  	resources coldFlushReusableResources,
  2272  	nsCtx namespace.Context,
  2273  	onFlushSeries persist.OnFlushSeries,
  2274  ) (ShardColdFlush, error) {
  2275  	// We don't flush data when the shard is still bootstrapping.
  2276  	s.RLock()
  2277  	if s.bootstrapState != Bootstrapped {
  2278  		s.RUnlock()
  2279  		return shardColdFlush{}, errShardNotBootstrappedToFlush
  2280  	}
  2281  	// Use blockStatesSnapshotWithRLock to avoid having to re-acquire read lock.
  2282  	blockStates := s.blockStatesSnapshotWithRLock()
  2283  	s.RUnlock()
  2284  
  2285  	resources.reset()
  2286  	var (
  2287  		multiErr           xerrors.MultiError
  2288  		dirtySeries        = resources.dirtySeries
  2289  		dirtySeriesToWrite = resources.dirtySeriesToWrite
  2290  		idElementPool      = resources.idElementPool
  2291  	)
  2292  
  2293  	blockStatesSnapshot, bootstrapped := blockStates.UnwrapValue()
  2294  	if !bootstrapped {
  2295  		return shardColdFlush{}, errFlushStateIsNotInitialized
  2296  	}
  2297  
  2298  	var (
  2299  		// forEachShardEntry should not execute in parallel, but protect with a lock anyway out of paranoia.
  2300  		loopErrLock sync.Mutex
  2301  		loopErr     error
  2302  	)
  2303  	// First, loop through all series to capture data on which blocks have dirty
  2304  	// series and add them to the resources for further processing.
  2305  	s.forEachShardEntry(func(entry *Entry) bool {
  2306  		curr := entry.Series
  2307  		seriesMetadata := curr.Metadata()
  2308  		blockStarts := curr.ColdFlushBlockStarts(blockStatesSnapshot)
  2309  		blockStarts.ForEach(func(t xtime.UnixNano) {
  2310  			// Cold flushes can only happen on blockStarts that have been
  2311  			// warm flushed, because warm flush logic does not currently
  2312  			// perform any merging logic.
  2313  			hasWarmFlushed, err := s.hasWarmFlushed(t)
  2314  			if err != nil {
  2315  				loopErrLock.Lock()
  2316  				loopErr = err
  2317  				loopErrLock.Unlock()
  2318  				return
  2319  			}
  2320  			if !hasWarmFlushed {
  2321  				return
  2322  			}
  2323  
  2324  			seriesList := dirtySeriesToWrite[t]
  2325  			if seriesList == nil {
  2326  				seriesList = newIDList(idElementPool)
  2327  				dirtySeriesToWrite[t] = seriesList
  2328  			}
  2329  			element := seriesList.PushBack(seriesMetadata)
  2330  
  2331  			dirtySeries.Set(idAndBlockStart{
  2332  				blockStart: t,
  2333  				id:         seriesMetadata.ID,
  2334  			}, element)
  2335  		})
  2336  
  2337  		return true
  2338  	})
  2339  	if loopErr != nil {
  2340  		return shardColdFlush{}, loopErr
  2341  	}
  2342  
  2343  	if dirtySeries.Len() == 0 {
  2344  		// Early exit if there is nothing dirty to merge. dirtySeriesToWrite
  2345  		// may be non-empty when dirtySeries is empty because we purposely
  2346  		// leave empty seriesLists in the dirtySeriesToWrite map to avoid having
  2347  		// to reallocate them in subsequent usages of the shared resource.
  2348  		return shardColdFlush{}, nil
  2349  	}
  2350  
  2351  	flush := shardColdFlush{
  2352  		shard:   s,
  2353  		doneFns: make([]shardColdFlushDone, 0, len(dirtySeriesToWrite)),
  2354  	}
  2355  	merger := s.newMergerFn(resources.fsReader, s.opts.DatabaseBlockOptions().DatabaseBlockAllocSize(),
  2356  		s.opts.SegmentReaderPool(), s.opts.MultiReaderIteratorPool(),
  2357  		s.opts.IdentifierPool(), s.opts.EncoderPool(), s.opts.ContextPool(),
  2358  		s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix(), s.namespace.Options())
  2359  	mergeWithMem := s.newFSMergeWithMemFn(s, s, dirtySeries, dirtySeriesToWrite)
  2360  	// Loop through each block that we know has ColdWrites. Since each block
  2361  	// has its own fileset, if we encounter an error while trying to persist
  2362  	// a block, we continue to try persisting other blocks.
  2363  	for startTime := range dirtySeriesToWrite {
  2364  		coldVersion, err := s.RetrievableBlockColdVersion(startTime)
  2365  		if err != nil {
  2366  			multiErr = multiErr.Add(err)
  2367  			continue
  2368  		}
  2369  
  2370  		fsID := fs.FileSetFileIdentifier{
  2371  			Namespace:   s.namespace.ID(),
  2372  			Shard:       s.ID(),
  2373  			BlockStart:  startTime,
  2374  			VolumeIndex: coldVersion,
  2375  		}
  2376  
  2377  		nextVersion := coldVersion + 1
  2378  		close, err := merger.Merge(fsID, mergeWithMem, nextVersion, flushPreparer, nsCtx,
  2379  			onFlushSeries)
  2380  		if err != nil {
  2381  			multiErr = multiErr.Add(err)
  2382  			continue
  2383  		}
  2384  		flush.doneFns = append(flush.doneFns, shardColdFlushDone{
  2385  			startTime:   startTime,
  2386  			nextVersion: nextVersion,
  2387  			close:       close,
  2388  		})
  2389  	}
  2390  	return flush, multiErr.FinalError()
  2391  }
  2392  
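// FilterBlocksNeedSnapshot returns, in the original order, the subset of the
// given blockStarts for which at least one series in the shard reports a
// non-empty block; it returns nil if the shard is not yet bootstrapped. For
// example, given blockStarts [t0, t1, t2] and non-empty blocks at {t1, t3},
// only t1 is returned.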
  2393  func (s *dbShard) FilterBlocksNeedSnapshot(blockStarts []xtime.UnixNano) []xtime.UnixNano {
  2394  	if !s.IsBootstrapped() {
  2395  		return nil
  2396  	}
  2397  
  2398  	needs := map[xtime.UnixNano]struct{}{}
  2399  	s.forEachShardEntry(func(entry *Entry) bool {
  2400  		entry.Series.MarkNonEmptyBlocks(needs)
  2401  		if len(needs) < len(blockStarts) {
  2402  			return true
  2403  		}
  2404  		// Note: entry.Series might have non-empty blocks that are not contained in blockStarts.
  2405  		// This prevents usage of len(needs) < len(blockStarts) as an early-exit criterion.
  2406  		for _, bl := range blockStarts {
  2407  			if _, ok := needs[bl]; !ok {
  2408  				return true
  2409  			}
  2410  		}
  2411  		return false
  2412  	})
  2413  
  2414  	// Note: doing this to keep original ordering. Not sure if that matters though.
  2415  	filtered := make([]xtime.UnixNano, 0, len(needs))
  2416  	for _, bl := range blockStarts {
  2417  		if _, ok := needs[bl]; ok {
  2418  			filtered = append(filtered, bl)
  2419  		}
  2420  	}
  2421  	return filtered
  2422  }
  2423  
  2424  func (s *dbShard) Snapshot(
  2425  	blockStart xtime.UnixNano,
  2426  	snapshotTime xtime.UnixNano,
  2427  	snapshotPreparer persist.SnapshotPreparer,
  2428  	nsCtx namespace.Context,
  2429  ) (ShardSnapshotResult, error) {
  2430  	// We don't snapshot data when the shard is still bootstrapping
  2431  	if !s.IsBootstrapped() {
  2432  		return ShardSnapshotResult{}, errShardNotBootstrappedToSnapshot
  2433  	}
  2434  
  2435  	// Record per-shard snapshot latency, not many shards so safe
  2436  	// to use a timer.
  2437  	totalTimer := s.metrics.snapshotTotalLatency.Start()
  2438  	defer totalTimer.Stop()
  2439  
  2440  	prepareOpts := persist.DataPrepareOptions{
  2441  		NamespaceMetadata: s.namespace,
  2442  		Shard:             s.ID(),
  2443  		BlockStart:        blockStart,
  2444  		FileSetType:       persist.FileSetSnapshotType,
  2445  		// We explicitly set delete if exists to false here as we do not
  2446  		// expect a collision: snapshot files are appended with a monotonically
  2447  		// increasing number to avoid collisions, so there would have to be a
  2448  		// competing process to cause one.
  2449  		DeleteIfExists: false,
  2450  		Snapshot: persist.DataPrepareSnapshotOptions{
  2451  			SnapshotTime: snapshotTime,
  2452  		},
  2453  	}
  2454  	prepareTimer := s.metrics.snapshotPrepareLatency.Start()
  2455  	prepared, err := snapshotPreparer.PrepareData(prepareOpts)
  2456  	prepareTimer.Stop()
  2457  	if err != nil {
  2458  		return ShardSnapshotResult{}, err
  2459  	}
  2460  
  2461  	var (
  2462  		snapshotCtx = s.contextPool.Get()
  2463  		persist     int
  2464  		stats       series.SnapshotResultStats
  2465  		multiErr    xerrors.MultiError
  2466  	)
  2467  	s.forEachShardEntry(func(entry *Entry) bool {
  2468  		series := entry.Series
  2469  		// Use a temporary context here so the stream readers can be returned to
  2470  		// the pool after we finish snapshotting the series.
  2471  		snapshotCtx.Reset()
  2472  		result, err := series.Snapshot(snapshotCtx, blockStart, prepared.Persist, nsCtx)
  2473  		snapshotCtx.BlockingCloseReset()
  2474  
  2475  		if err != nil {
  2476  			multiErr = multiErr.Add(err)
  2477  			// If we encounter an error when persisting a series, don't continue as
  2478  			// the file on disk could be in a corrupt state.
  2479  			return false
  2480  		}
  2481  
  2482  		if result.Persist {
  2483  			persist++
  2484  		}
  2485  
  2486  		// Add snapshot result to cumulative result.
  2487  		stats.Add(result.Stats)
  2488  		return true
  2489  	})
  2490  
  2491  	// Emit cumulative snapshot result timings.
  2492  	if multiErr.NumErrors() == 0 {
  2493  		s.metrics.snapshotMergeByBucketLatency.Record(stats.TimeMergeByBucket)
  2494  		s.metrics.snapshotMergeAcrossBucketsLatency.Record(stats.TimeMergeAcrossBuckets)
  2495  		s.metrics.snapshotChecksumLatency.Record(stats.TimeChecksum)
  2496  		s.metrics.snapshotPersistLatency.Record(stats.TimePersist)
  2497  	}
  2498  
  2499  	closeTimer := s.metrics.snapshotCloseLatency.Start()
  2500  	multiErr = multiErr.Add(prepared.Close())
  2501  	closeTimer.Stop()
  2502  
  2503  	if err := multiErr.FinalError(); err != nil {
  2504  		return ShardSnapshotResult{}, err
  2505  	}
  2506  
  2507  	return ShardSnapshotResult{
  2508  		SeriesPersist: persist,
  2509  	}, nil
  2510  }
  2511  
  2512  func (s *dbShard) FlushState(blockStart xtime.UnixNano) (fileOpState, error) {
  2513  	s.flushState.RLock()
  2514  	initialized := s.flushState.initialized
  2515  	state := s.flushStateWithRLock(blockStart)
  2516  	s.flushState.RUnlock()
  2517  
  2518  	if !initialized {
  2519  		return fileOpState{}, errFlushStateIsNotInitialized
  2520  	}
  2521  
  2522  	return state, nil
  2523  }
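
// exampleIsWarmDataFlushed is an illustrative sketch (not part of the original
// code, and the name is hypothetical) of how FlushState above is typically
// consulted to gate work on warm-flush completion, similar to the hasWarmFlushed
// check used by ColdFlush.
func exampleIsWarmDataFlushed(s *dbShard, blockStart xtime.UnixNano) (bool, error) {
	state, err := s.FlushState(blockStart)
	if err != nil {
		// Flush states are not initialized until the shard prepares for bootstrap.
		return false, err
	}
	return state.WarmStatus.DataFlushed == fileOpSuccess, nil
}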
  2524  
  2525  func (s *dbShard) flushStateNoBootstrapCheck(blockStart xtime.UnixNano) fileOpState {
  2526  	s.flushState.RLock()
  2527  	check := s.flushStateWithRLock(blockStart)
  2528  	s.flushState.RUnlock()
  2529  	return check
  2530  }
  2531  
  2532  func (s *dbShard) flushStateWithRLock(blockStart xtime.UnixNano) fileOpState {
  2533  	state, ok := s.flushState.statesByTime[blockStart]
  2534  	if !ok {
  2535  		return fileOpState{WarmStatus: warmStatus{
  2536  			DataFlushed:  fileOpNotStarted,
  2537  			IndexFlushed: fileOpNotStarted,
  2538  		}}
  2539  	}
  2540  	return state
  2541  }
  2542  
  2543  func (s *dbShard) markWarmDataFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) error {
  2544  	// Track flush state for block state
  2545  	if err == nil {
  2546  		s.markWarmDataFlushStateSuccess(blockStart)
  2547  	} else {
  2548  		s.markWarmDataFlushStateFail(blockStart)
  2549  	}
  2550  	return err
  2551  }
  2552  
  2553  func (s *dbShard) markWarmDataFlushStateSuccess(blockStart xtime.UnixNano) {
  2554  	s.flushState.Lock()
  2555  	state := s.flushState.statesByTime[blockStart]
  2556  	state.WarmStatus.DataFlushed = fileOpSuccess
  2557  	s.flushState.statesByTime[blockStart] = state
  2558  	s.flushState.Unlock()
  2559  }
  2560  
  2561  func (s *dbShard) markWarmDataFlushStateFail(blockStart xtime.UnixNano) {
  2562  	s.flushState.Lock()
  2563  	state := s.flushState.statesByTime[blockStart]
  2564  	state.WarmStatus.DataFlushed = fileOpFailed
  2565  	state.NumFailures++
  2566  	s.flushState.statesByTime[blockStart] = state
  2567  	s.flushState.Unlock()
  2568  }
  2569  
  2570  // MarkWarmIndexFlushStateSuccessOrError marks the blockStart as
  2571  // success or fail based on the provided err.
  2572  func (s *dbShard) MarkWarmIndexFlushStateSuccessOrError(blockStart xtime.UnixNano, err error) {
  2573  	// Track flush state for block state
  2574  	if err == nil {
  2575  		s.markWarmIndexFlushStateSuccess(blockStart)
  2576  	} else {
  2577  		s.markWarmIndexFlushStateFail(blockStart)
  2578  	}
  2579  }
  2580  
  2581  func (s *dbShard) markWarmIndexFlushStateSuccess(blockStart xtime.UnixNano) {
  2582  	s.flushState.Lock()
  2583  	state := s.flushState.statesByTime[blockStart]
  2584  	state.WarmStatus.IndexFlushed = fileOpSuccess
  2585  	s.flushState.statesByTime[blockStart] = state
  2586  	s.flushState.Unlock()
  2587  }
  2588  
  2589  func (s *dbShard) markWarmIndexFlushStateFail(blockStart xtime.UnixNano) {
  2590  	s.flushState.Lock()
  2591  	state := s.flushState.statesByTime[blockStart]
  2592  	state.WarmStatus.IndexFlushed = fileOpFailed
  2593  	state.NumFailures++
  2594  	s.flushState.statesByTime[blockStart] = state
  2595  	s.flushState.Unlock()
  2596  }
  2597  
  2598  func (s *dbShard) setFlushStateColdVersionRetrievable(blockStart xtime.UnixNano, version int) {
  2599  	s.flushState.Lock()
  2600  	state := s.flushState.statesByTime[blockStart]
  2601  	state.ColdVersionRetrievable = version
  2602  	s.flushState.statesByTime[blockStart] = state
  2603  	s.flushState.Unlock()
  2604  }
  2605  
  2606  func (s *dbShard) setFlushStateColdVersionFlushed(blockStart xtime.UnixNano, version int) {
  2607  	s.flushState.Lock()
  2608  	state := s.flushState.statesByTime[blockStart]
  2609  	state.ColdVersionFlushed = version
  2610  	s.flushState.statesByTime[blockStart] = state
  2611  	s.flushState.Unlock()
  2612  }
  2613  
  2614  func (s *dbShard) removeAnyFlushStatesTooEarly(startTime xtime.UnixNano) {
  2615  	s.flushState.Lock()
  2616  	earliestFlush := retention.FlushTimeStart(s.namespace.Options().RetentionOptions(), startTime)
  2617  	for t := range s.flushState.statesByTime {
  2618  		if t.Before(earliestFlush) {
  2619  			delete(s.flushState.statesByTime, t)
  2620  		}
  2621  	}
  2622  	s.flushState.Unlock()
  2623  }
  2624  
  2625  func (s *dbShard) CleanupExpiredFileSets(earliestToRetain xtime.UnixNano) error {
  2626  	filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix()
  2627  	expired, err := s.filesetPathsBeforeFn(filePathPrefix, s.namespace.ID(), s.ID(), earliestToRetain)
  2628  	if err != nil {
  2629  		return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v",
  2630  			filePathPrefix, s.namespace.ID(), s.ID(), err)
  2631  	}
  2632  
  2633  	return s.deleteFilesFn(expired)
  2634  }
  2635  
  2636  func (s *dbShard) CleanupCompactedFileSets() error {
  2637  	filePathPrefix := s.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix()
  2638  	filesets, err := s.filesetsFn(filePathPrefix, s.namespace.ID(), s.ID())
  2639  	if err != nil {
  2640  		return fmt.Errorf("encountered errors when getting fileset files for prefix %s namespace %s shard %d: %v",
  2641  			filePathPrefix, s.namespace.ID(), s.ID(), err)
  2642  	}
  2643  
  2644  	// Get a snapshot of all states here to prevent constantly getting/releasing
  2645  	// locks in a tight loop below. This snapshot won't become stale halfway
  2646  	// through this because flushing and cleanup never happen in parallel.
  2647  	blockStates := s.BlockStatesSnapshot()
  2648  	blockStatesSnapshot, bootstrapped := blockStates.UnwrapValue()
  2649  	if !bootstrapped {
  2650  		return errShardIsNotBootstrapped
  2651  	}
  2652  
  2653  	toDelete := fs.FileSetFilesSlice(make([]fs.FileSetFile, 0, len(filesets)))
  2654  	for _, datafile := range filesets {
  2655  		fileID := datafile.ID
  2656  		blockState := blockStatesSnapshot.Snapshot[fileID.BlockStart]
  2657  		if fileID.VolumeIndex < blockState.ColdVersion {
  2658  			toDelete = append(toDelete, datafile)
  2659  		}
  2660  	}
  2661  
  2662  	return s.deleteFilesFn(toDelete.Filepaths())
  2663  }
  2664  
  2665  func (s *dbShard) Repair(
  2666  	ctx context.Context,
  2667  	nsCtx namespace.Context,
  2668  	nsMeta namespace.Metadata,
  2669  	tr xtime.Range,
  2670  	repairer databaseShardRepairer,
  2671  ) (repair.MetadataComparisonResult, error) {
  2672  	return repairer.Repair(ctx, nsCtx, nsMeta, tr, s)
  2673  }
  2674  
  2675  func (s *dbShard) AggregateTiles(
  2676  	ctx context.Context,
  2677  	sourceNs, targetNs Namespace,
  2678  	shardID uint32,
  2679  	onFlushSeries persist.OnFlushSeries,
  2680  	opts AggregateTilesOptions,
  2681  ) (int64, error) {
  2682  	var multiErr xerrors.MultiError
  2683  
  2684  	processedTileCount, nextVolume, err := s.tileAggregator.AggregateTiles(
  2685  		ctx, sourceNs, targetNs, shardID, onFlushSeries, opts)
  2686  	if err != nil {
  2687  		// NB: cannot return on the error here, must finish writing.
  2688  		multiErr = multiErr.Add(err)
  2689  	} else {
  2690  		// Notify all block leasers that a new volume for the namespace/shard/blockstart
  2691  		// has been created. This will block until all leasers have relinquished their
  2692  		// leases.
  2693  		// NB: markWarmFlushStateSuccess=true because there are no flushes happening in this
  2694  		// flow, and we need to set WarmStatus to fileOpSuccess explicitly in order to make
  2695  		// the new blocks readable.
  2696  		if err = s.finishWriting(opts.Start, nextVolume, true); err != nil {
  2697  			multiErr = multiErr.Add(err)
  2698  		}
  2699  	}
  2700  
  2701  	if err := multiErr.FinalError(); err != nil {
  2702  		return 0, err
  2703  	}
  2704  
  2705  	s.logger.Debug("finished aggregating tiles",
  2706  		zap.Uint32("shard", s.ID()),
  2707  		zap.Int64("processedTiles", processedTileCount))
  2708  
  2709  	return processedTileCount, nil
  2710  }
  2711  
  2712  func (s *dbShard) BootstrapState() BootstrapState {
  2713  	s.RLock()
  2714  	bs := s.bootstrapState
  2715  	s.RUnlock()
  2716  	return bs
  2717  }
  2718  
  2719  func (s *dbShard) DocRef(id ident.ID) (doc.Metadata, bool, error) {
  2720  	s.RLock()
  2721  	defer s.RUnlock()
  2722  
  2723  	entry, err := s.lookupEntryWithLock(id)
  2724  	if err == nil {
  2725  		return entry.Series.Metadata(), true, nil
  2726  	}
  2727  	if err == errShardEntryNotFound {
  2728  		return emptyDoc, false, nil
  2729  	}
  2730  	return emptyDoc, false, err
  2731  }
  2732  
  2733  func (s *dbShard) LatestVolume(blockStart xtime.UnixNano) (int, error) {
  2734  	return s.namespaceReaderMgr.latestVolume(s.shard, blockStart)
  2735  }
  2736  
  2737  func (s *dbShard) OpenStreamingReader(blockStart xtime.UnixNano) (fs.DataFileSetReader, error) {
  2738  	latestVolume, err := s.LatestVolume(blockStart)
  2739  	if err != nil {
  2740  		return nil, err
  2741  	}
  2742  
  2743  	reader, err := s.newReaderFn(s.opts.BytesPool(), s.opts.CommitLogOptions().FilesystemOptions())
  2744  	if err != nil {
  2745  		return nil, err
  2746  	}
  2747  
  2748  	openOpts := fs.DataReaderOpenOptions{
  2749  		Identifier: fs.FileSetFileIdentifier{
  2750  			Namespace:   s.namespace.ID(),
  2751  			Shard:       s.ID(),
  2752  			BlockStart:  blockStart,
  2753  			VolumeIndex: latestVolume,
  2754  		},
  2755  		FileSetType:      persist.FileSetFlushType,
  2756  		StreamingEnabled: true,
  2757  	}
  2758  
  2759  	if err := reader.Open(openOpts); err != nil {
  2760  		return nil, err
  2761  	}
  2762  
  2763  	return reader, nil
  2764  }
  2765  
  2766  func (s *dbShard) logFlushResult(r dbShardFlushResult) {
  2767  	s.logger.Debug("shard flush outcome",
  2768  		zap.Uint32("shard", s.ID()),
  2769  		zap.Int64("numBlockDoesNotExist", r.numBlockDoesNotExist),
  2770  	)
  2771  }
  2772  
  2773  func (s *dbShard) finishWriting(
  2774  	blockStart xtime.UnixNano,
  2775  	nextVersion int,
  2776  	markWarmFlushStateSuccess bool,
  2777  ) error {
  2778  	if markWarmFlushStateSuccess {
  2779  		s.markWarmDataFlushStateSuccess(blockStart)
  2780  		s.markWarmIndexFlushStateSuccess(blockStart)
  2781  	}
  2782  
  2783  	// After writing the full block successfully update the ColdVersionFlushed number. This will
  2784  	// allow the SeekerManager to open a lease on the latest version of the fileset files because
  2785  	// the BlockLeaseVerifier will check the ColdVersionFlushed value, but the buffer only looks at
  2786  	// ColdVersionRetrievable so a concurrent tick will not yet cause the blocks in memory to be
  2787  	// evicted (which is the desired behavior because we haven't updated the open leases yet which
  2788  	// means the newly written data is not available for querying via the SeekerManager yet.)
  2789  	s.setFlushStateColdVersionFlushed(blockStart, nextVersion)
  2790  
  2791  	// Notify all block leasers that a new volume for the namespace/shard/blockstart
  2792  	// has been created. This will block until all leasers have relinquished their
  2793  	// leases.
  2794  	_, err := s.opts.BlockLeaseManager().UpdateOpenLeases(block.LeaseDescriptor{
  2795  		Namespace:  s.namespace.ID(),
  2796  		Shard:      s.ID(),
  2797  		BlockStart: blockStart,
  2798  	}, block.LeaseState{Volume: nextVersion})
  2799  	// After writing the full block successfully **and** propagating the new lease to the
  2800  	// BlockLeaseManager, update the ColdVersionRetrievable in the flush state. Once this function
  2801  	// completes concurrent ticks will be able to evict the data from memory that was just flushed
  2802  	// (which is now safe to do since the SeekerManager has been notified of the presence of new
  2803  	// files).
  2804  	//
  2805  	// NB(rartoul): Ideally the ColdVersionRetrievable would only be updated if the call to UpdateOpenLeases
  2806  	// succeeded, but that would allow the ColdVersionRetrievable and ColdVersionFlushed numbers to drift
  2807  	// which would increase the complexity of the code to address a situation that is probably not
  2808  	// recoverable (failure to UpdateOpenLeases is an invariant violated error).
  2809  	s.setFlushStateColdVersionRetrievable(blockStart, nextVersion)
  2810  	if err != nil {
  2811  		instrument.EmitAndLogInvariantViolation(s.opts.InstrumentOptions(), func(l *zap.Logger) {
  2812  			l.With(
  2813  				zap.String("namespace", s.namespace.ID().String()),
  2814  				zap.Uint32("shard", s.ID()),
  2815  				zap.Time("blockStart", blockStart.ToTime()),
  2816  				zap.Int("nextVersion", nextVersion),
  2817  				zap.Error(err),
  2818  			).Error("failed to update open leases after updating flush state cold version")
  2819  		})
  2820  		return err
  2821  	}
  2822  	return nil
  2823  }
  2824  
  2825  type shardColdFlushDone struct {
  2826  	startTime   xtime.UnixNano
  2827  	nextVersion int
  2828  	close       persist.DataCloser
  2829  }
  2830  
  2831  type shardColdFlush struct {
  2832  	shard   *dbShard
  2833  	doneFns []shardColdFlushDone
  2834  }
  2835  
  2836  func (s shardColdFlush) Done() error {
  2837  	multiErr := xerrors.NewMultiError()
  2838  	for _, done := range s.doneFns {
  2839  		startTime := done.startTime
  2840  		nextVersion := done.nextVersion
  2841  
  2842  		if err := done.close(); err != nil {
  2843  			multiErr = multiErr.Add(err)
  2844  			continue
  2845  		}
  2846  
  2847  		err := s.shard.finishWriting(startTime, nextVersion, false)
  2848  		if err != nil {
  2849  			multiErr = multiErr.Add(err)
  2850  		}
  2851  	}
  2852  	return multiErr.FinalError()
  2853  }
  2854  
  2855  // dbShardFlushResult is a helper struct for keeping track of the result of flushing all the
  2856  // series in the shard.
  2857  type dbShardFlushResult struct {
  2858  	numBlockDoesNotExist int64
  2859  }
  2860  
  2861  func (r *dbShardFlushResult) update(u series.FlushOutcome) {
  2862  	if u == series.FlushOutcomeBlockDoesNotExist {
  2863  		r.numBlockDoesNotExist++
  2864  	}
  2865  }