github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/index.go (about)

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package storage
    22  
    23  import (
    24  	"bytes"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"math"
    29  	goruntime "runtime"
    30  	"sort"
    31  	"strconv"
    32  	"sync"
    33  	"time"
    34  
    35  	"github.com/m3db/m3/src/dbnode/namespace"
    36  	"github.com/m3db/m3/src/dbnode/persist"
    37  	"github.com/m3db/m3/src/dbnode/persist/fs"
    38  	"github.com/m3db/m3/src/dbnode/retention"
    39  	"github.com/m3db/m3/src/dbnode/runtime"
    40  	"github.com/m3db/m3/src/dbnode/sharding"
    41  	"github.com/m3db/m3/src/dbnode/storage/block"
    42  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    43  	m3dberrors "github.com/m3db/m3/src/dbnode/storage/errors"
    44  	"github.com/m3db/m3/src/dbnode/storage/index"
    45  	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
    46  	"github.com/m3db/m3/src/dbnode/storage/index/convert"
    47  	"github.com/m3db/m3/src/dbnode/storage/limits"
    48  	"github.com/m3db/m3/src/dbnode/storage/limits/permits"
    49  	"github.com/m3db/m3/src/dbnode/storage/series"
    50  	"github.com/m3db/m3/src/dbnode/tracepoint"
    51  	"github.com/m3db/m3/src/dbnode/ts/writes"
    52  	"github.com/m3db/m3/src/m3ninx/doc"
    53  	"github.com/m3db/m3/src/m3ninx/idx"
    54  	m3ninxindex "github.com/m3db/m3/src/m3ninx/index"
    55  	"github.com/m3db/m3/src/m3ninx/index/segment"
    56  	"github.com/m3db/m3/src/m3ninx/index/segment/builder"
    57  	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
    58  	"github.com/m3db/m3/src/m3ninx/x"
    59  	"github.com/m3db/m3/src/x/clock"
    60  	"github.com/m3db/m3/src/x/context"
    61  	xerrors "github.com/m3db/m3/src/x/errors"
    62  	"github.com/m3db/m3/src/x/ident"
    63  	"github.com/m3db/m3/src/x/instrument"
    64  	xopentracing "github.com/m3db/m3/src/x/opentracing"
    65  	xresource "github.com/m3db/m3/src/x/resource"
    66  	xtime "github.com/m3db/m3/src/x/time"
    67  
    68  	"github.com/m3db/bitset"
    69  	"github.com/opentracing/opentracing-go"
    70  	opentracinglog "github.com/opentracing/opentracing-go/log"
    71  	"github.com/uber-go/tally"
    72  	"go.uber.org/atomic"
    73  	"go.uber.org/zap"
    74  )
    75  
// Sentinel errors used by the namespace index. They are package-private;
// compare against them by identity (or with errors.Is).
var (
	// errDbIndexAlreadyClosed: the index has already been closed.
	errDbIndexAlreadyClosed               = errors.New("database index has already been closed")
	// errDbIndexUnableToWriteClosed is returned by the write paths
	// (WriteBatch/WritePending) when the index is no longer open.
	errDbIndexUnableToWriteClosed         = errors.New("unable to write to database index, already closed")
	// errDbIndexUnableToQueryClosed: query attempted on a closed index.
	errDbIndexUnableToQueryClosed         = errors.New("unable to query database index, already closed")
	// errDbIndexUnableToFlushClosed: flush attempted on a closed index.
	errDbIndexUnableToFlushClosed         = errors.New("unable to flush database index, already closed")
	// errDbIndexUnableToCleanupClosed: cleanup attempted on a closed index.
	errDbIndexUnableToCleanupClosed       = errors.New("unable to cleanup database index, already closed")
	// errDbIndexTerminatingTickCancellation: a tick was cut short by cancellation.
	errDbIndexTerminatingTickCancellation = errors.New("terminating tick early due to cancellation")
	// errDbIndexIsBootstrapping: a bootstrap is already in progress.
	errDbIndexIsBootstrapping             = errors.New("index is already bootstrapping")
	// errDbIndexDoNotIndexSeries marks batch entries whose document matched
	// every configured "do not index" field (see writeBatches).
	errDbIndexDoNotIndexSeries            = errors.New("series matched do not index fields")
)
    86  
const (
	// defaultFlushReadDataBlocksBatchSize is a batch size used by the flush
	// path (usage is outside this section of the file — presumably the number
	// of data blocks read per batch; confirm at the call site).
	defaultFlushReadDataBlocksBatchSize = int64(4096)
	// nsIndexReportStatsInterval is how often reportStatsUntilClosed
	// invokes reportStats.
	nsIndexReportStatsInterval          = 10 * time.Second

	// defaultFlushDocsBatchSize is a batch size used by the flush path
	// (usage is outside this section of the file — presumably the number of
	// documents per batch; confirm at the call site).
	defaultFlushDocsBatchSize = 8192
)
    93  
// allQuery is a package-level query instance built once via idx.NewAllQuery()
// so it can be reused rather than constructed per call.
var allQuery = idx.NewAllQuery()
    95  
// nsIndex is the index for a single namespace. Mutable state lives in the
// embedded `state` field and is guarded by its RWMutex; every other field is
// set by the constructor and then only read.
// nolint: maligned
type nsIndex struct {
	// state holds all lock-protected mutable state (see nsIndexState).
	state nsIndexState

	// all the vars below this line are not modified past the ctor
	// and don't require a lock when being accessed.
	nowFn                 clock.NowFn
	blockSize             time.Duration
	retentionPeriod       time.Duration
	futureRetentionPeriod time.Duration
	bufferPast            time.Duration
	bufferFuture          time.Duration
	coldWritesEnabled     bool

	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager
	// The three function fields below default to the fs package
	// implementations in the constructor.
	indexFilesetsBeforeFn   indexFilesetsBeforeFn
	deleteFilesFn           deleteFilesFn
	readIndexInfoFilesFn    readIndexInfoFilesFn

	// newBlockFn allocates index blocks; injectable for tests.
	newBlockFn            index.NewBlockFn
	logger                *zap.Logger
	opts                  Options
	nsMetadata            namespace.Metadata
	// Handles returned when registering this index as a runtime options
	// listener in the constructor.
	runtimeOptsListener   xresource.SimpleCloser
	runtimeNsOptsListener xresource.SimpleCloser

	resultsPool          index.QueryResultsPool
	aggregateResultsPool index.AggregateResultsPool

	permitsManager permits.Manager

	// queriesWg tracks outstanding queries to ensure
	// we wait for all queries to complete before actually closing
	// blocks and other cleanup tasks on index close
	queriesWg sync.WaitGroup

	metrics nsIndexMetrics

	// forwardIndexDice determines if an incoming index write should be dual
	// written to the next block.
	forwardIndexDice forwardIndexDice

	// doNotIndexWithFields lists field name/value pairs; a document that
	// matches all of them is not indexed (see writeBatches).
	doNotIndexWithFields []doc.Field

	// activeBlock is allocated in the constructor with
	// index.BlockOptions{ActiveBlock: true} at block start 0.
	activeBlock index.Block
}
   142  
// nsIndexState is the mutable portion of nsIndex. The embedded RWMutex
// guards every field declared in this struct.
type nsIndexState struct {
	sync.RWMutex // NB: guards all variables in this struct

	closed         bool
	// closeCh is received from by reportStatsUntilClosed to stop its loop.
	closeCh        chan struct{}
	bootstrapState BootstrapState

	runtimeOpts nsIndexRuntimeOptions

	// insertQueue receives batches from WriteBatch/WritePending.
	insertQueue namespaceIndexInsertQueue

	// NB: `latestBlock` v `blocksByTime`: blocksByTime contains all the blocks known to `nsIndex`.
	// `latestBlock` refers to the block with greatest StartTime within blocksByTime. We do this
	// to skip accessing the map blocksByTime in the vast majority of write/query requests. It's
	// lazily updated, so it can point to an older element until a Tick()/write rotates it.
	blocksByTime map[xtime.UnixNano]index.Block
	latestBlock  index.Block

	// NB: `blocksDescOrderImmutable` contains the blocks from the map `blocksByTime`
	// (paired with their block starts) in reverse chronological order. This is used
	// at query time to enforce determinism about results returned.
	// NB(r): Reference to this slice can be safely taken for iteration purposes
	// for Query(..) since it is rebuilt each time and immutable once built.
	blocksDescOrderImmutable []blockAndBlockStart

	// shardsFilterID is set every time the shards change to correctly
	// only return IDs that this node owns.
	shardsFilterID func(ident.ID) bool

	// shardFilteredForID is set every time the shards change to correctly
	// only return IDs that this node owns, and the shard responsible for that ID.
	shardFilteredForID func(id ident.ID) (uint32, bool)

	// shardsAssigned is the set of shard IDs currently assigned
	// (presumably maintained by AssignShardSet — defined outside this section).
	shardsAssigned map[uint32]struct{}
}
   178  
// blockAndBlockStart pairs an index block with its block start time; it is
// the element type of nsIndexState.blocksDescOrderImmutable.
type blockAndBlockStart struct {
	block      index.Block
	blockStart xtime.UnixNano
}
   183  
// nsIndexRuntimeOptions groups index settings that are read on the hot
// write/query paths.
//
// NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables
// are needed for each index write which already at least acquires read lock from
// nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected
// under the same nsIndex mutex.
type nsIndexRuntimeOptions struct {
	// insertMode is read by WriteBatch to decide whether to wait for the
	// insert to complete before returning.
	insertMode          index.InsertMode
	maxQuerySeriesLimit int64
	maxQueryDocsLimit   int64
}
   193  
// indexFilesetsBeforeFn lists index filesets for namespace nsID under dir
// that are older than exclusiveTime. The constructor wires this to
// fs.IndexFileSetsBefore.
//
// NB(prateek): the returned filesets are strictly before the given time, i.e. they
// live in the period (-infinity, exclusiveTime).
type indexFilesetsBeforeFn func(dir string,
	nsID ident.ID,
	exclusiveTime xtime.UnixNano,
) ([]string, error)
   200  
// readIndexInfoFilesFn reads index info files according to opts. The
// constructor wires this to fs.ReadIndexInfoFiles.
type readIndexInfoFilesFn func(opts fs.ReadIndexInfoFilesOptions) []fs.ReadIndexInfoFileResult
   202  
// newNamespaceIndexOpts bundles every input of newNamespaceIndexWithOptions
// so that the production and test constructors can share one code path.
type newNamespaceIndexOpts struct {
	md                      namespace.Metadata
	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager
	shardSet                sharding.ShardSet
	opts                    Options
	// newIndexQueueFn builds the insert queue; tests may override it.
	newIndexQueueFn         newNamespaceIndexInsertQueueFn
	// newBlockFn builds index blocks; tests may override it.
	newBlockFn              index.NewBlockFn
}
   211  
// execBlockQueryFn executes a query against the given block whilst tracking state.
// Errors are not returned directly; the signature carries an
// *asyncQueryExecState, which exposes addErr for recording them
// (implementations live outside this section — confirm there).
type execBlockQueryFn func(
	ctx context.Context,
	block index.Block,
	permit permits.Permit,
	iter index.ResultIterator,
	opts index.QueryOptions,
	state *asyncQueryExecState,
	results index.BaseResults,
	logFields []opentracinglog.Field,
)
   223  
// newBlockIterFn returns a new ResultIterator for the query against the
// given block, or an error if the iterator could not be created.
type newBlockIterFn func(
	ctx context.Context,
	block index.Block,
	query index.Query,
	results index.BaseResults,
) (index.ResultIterator, error)
   231  
// asyncQueryExecState tracks the async execution errors for a query.
// The embedded RWMutex guards multiErr; waitCount is an atomic counter and
// is accessed without taking the lock.
type asyncQueryExecState struct {
	sync.RWMutex
	// multiErr accumulates errors added through addErr.
	multiErr  xerrors.MultiError
	// waitCount is the running total passed to incWaited.
	waitCount atomic.Uint64
}
   238  
   239  func (s *asyncQueryExecState) hasErr() bool {
   240  	s.RLock()
   241  	defer s.RUnlock()
   242  	return s.multiErr.NumErrors() > 0
   243  }
   244  
   245  func (s *asyncQueryExecState) addErr(err error) {
   246  	s.Lock()
   247  	s.multiErr = s.multiErr.Add(err)
   248  	s.Unlock()
   249  }
   250  
   251  func (s *asyncQueryExecState) incWaited(i int) {
   252  	s.waitCount.Add(uint64(i))
   253  }
   254  
   255  func (s *asyncQueryExecState) waited() int {
   256  	return int(s.waitCount.Load())
   257  }
   258  
   259  // newNamespaceIndex returns a new namespaceIndex for the provided namespace.
   260  func newNamespaceIndex(
   261  	nsMD namespace.Metadata,
   262  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   263  	shardSet sharding.ShardSet,
   264  	opts Options,
   265  ) (NamespaceIndex, error) {
   266  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   267  		md:                      nsMD,
   268  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   269  		shardSet:                shardSet,
   270  		opts:                    opts,
   271  		newIndexQueueFn:         newNamespaceIndexInsertQueue,
   272  		newBlockFn:              index.NewBlock,
   273  	})
   274  }
   275  
   276  // newNamespaceIndexWithInsertQueueFn is a ctor used in tests to override the insert queue.
   277  func newNamespaceIndexWithInsertQueueFn(
   278  	nsMD namespace.Metadata,
   279  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   280  	shardSet sharding.ShardSet,
   281  	newIndexQueueFn newNamespaceIndexInsertQueueFn,
   282  	opts Options,
   283  ) (NamespaceIndex, error) {
   284  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   285  		md:                      nsMD,
   286  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   287  		shardSet:                shardSet,
   288  		opts:                    opts,
   289  		newIndexQueueFn:         newIndexQueueFn,
   290  		newBlockFn:              index.NewBlock,
   291  	})
   292  }
   293  
   294  // newNamespaceIndexWithNewBlockFn is a ctor used in tests to inject blocks.
   295  func newNamespaceIndexWithNewBlockFn(
   296  	nsMD namespace.Metadata,
   297  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   298  	shardSet sharding.ShardSet,
   299  	newBlockFn index.NewBlockFn,
   300  	opts Options,
   301  ) (NamespaceIndex, error) {
   302  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   303  		md:                      nsMD,
   304  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   305  		shardSet:                shardSet,
   306  		opts:                    opts,
   307  		newIndexQueueFn:         newNamespaceIndexInsertQueue,
   308  		newBlockFn:              newBlockFn,
   309  	})
   310  }
   311  
   312  // newNamespaceIndexWithOptions returns a new namespaceIndex with the provided configuration options.
   313  func newNamespaceIndexWithOptions(
   314  	newIndexOpts newNamespaceIndexOpts,
   315  ) (NamespaceIndex, error) {
   316  	var (
   317  		nsMD            = newIndexOpts.md
   318  		shardSet        = newIndexOpts.shardSet
   319  		indexOpts       = newIndexOpts.opts.IndexOptions()
   320  		instrumentOpts  = newIndexOpts.opts.InstrumentOptions()
   321  		newIndexQueueFn = newIndexOpts.newIndexQueueFn
   322  		newBlockFn      = newIndexOpts.newBlockFn
   323  		coreFn          = newIndexOpts.opts.CoreFn()
   324  		runtimeOptsMgr  = newIndexOpts.opts.RuntimeOptionsManager()
   325  	)
   326  	if err := indexOpts.Validate(); err != nil {
   327  		return nil, err
   328  	}
   329  
   330  	scope := instrumentOpts.MetricsScope().
   331  		SubScope("dbindex").
   332  		Tagged(map[string]string{
   333  			"namespace": nsMD.ID().String(),
   334  		})
   335  	instrumentOpts = instrumentOpts.SetMetricsScope(scope)
   336  	indexOpts = indexOpts.SetInstrumentOptions(instrumentOpts)
   337  
   338  	nowFn := indexOpts.ClockOptions().NowFn()
   339  	logger := indexOpts.InstrumentOptions().Logger()
   340  
   341  	var doNotIndexWithFields []doc.Field
   342  	if m := newIndexOpts.opts.DoNotIndexWithFieldsMap(); m != nil && len(m) != 0 {
   343  		for k, v := range m {
   344  			doNotIndexWithFields = append(doNotIndexWithFields, doc.Field{
   345  				Name:  []byte(k),
   346  				Value: []byte(v),
   347  			})
   348  		}
   349  	}
   350  
   351  	idx := &nsIndex{
   352  		state: nsIndexState{
   353  			closeCh: make(chan struct{}),
   354  			runtimeOpts: nsIndexRuntimeOptions{
   355  				insertMode: indexOpts.InsertMode(), // FOLLOWUP(prateek): wire to allow this to be tweaked at runtime
   356  			},
   357  			blocksByTime:   make(map[xtime.UnixNano]index.Block),
   358  			shardsAssigned: make(map[uint32]struct{}),
   359  		},
   360  
   361  		nowFn:                 nowFn,
   362  		blockSize:             nsMD.Options().IndexOptions().BlockSize(),
   363  		retentionPeriod:       nsMD.Options().RetentionOptions().RetentionPeriod(),
   364  		futureRetentionPeriod: nsMD.Options().RetentionOptions().FutureRetentionPeriod(),
   365  		bufferPast:            nsMD.Options().RetentionOptions().BufferPast(),
   366  		bufferFuture:          nsMD.Options().RetentionOptions().BufferFuture(),
   367  		coldWritesEnabled:     nsMD.Options().ColdWritesEnabled(),
   368  
   369  		namespaceRuntimeOptsMgr: newIndexOpts.namespaceRuntimeOptsMgr,
   370  		indexFilesetsBeforeFn:   fs.IndexFileSetsBefore,
   371  		readIndexInfoFilesFn:    fs.ReadIndexInfoFiles,
   372  		deleteFilesFn:           fs.DeleteFiles,
   373  
   374  		newBlockFn: newBlockFn,
   375  		opts:       newIndexOpts.opts,
   376  		logger:     logger,
   377  		nsMetadata: nsMD,
   378  
   379  		resultsPool:          indexOpts.QueryResultsPool(),
   380  		aggregateResultsPool: indexOpts.AggregateResultsPool(),
   381  
   382  		permitsManager: newIndexOpts.opts.PermitsOptions().IndexQueryPermitsManager(),
   383  		metrics:        newNamespaceIndexMetrics(indexOpts, instrumentOpts),
   384  
   385  		doNotIndexWithFields: doNotIndexWithFields,
   386  	}
   387  
   388  	activeBlock, err := idx.newBlockFn(xtime.UnixNano(0), idx.nsMetadata,
   389  		index.BlockOptions{ActiveBlock: true}, idx.namespaceRuntimeOptsMgr,
   390  		idx.opts.IndexOptions())
   391  	if err != nil {
   392  		return nil, idx.unableToAllocBlockInvariantError(err)
   393  	}
   394  
   395  	idx.activeBlock = activeBlock
   396  
   397  	// Assign shard set upfront.
   398  	idx.AssignShardSet(shardSet)
   399  
   400  	idx.runtimeOptsListener = runtimeOptsMgr.RegisterListener(idx)
   401  	idx.runtimeNsOptsListener = idx.namespaceRuntimeOptsMgr.RegisterListener(idx)
   402  
   403  	// set up forward index dice.
   404  	dice, err := newForwardIndexDice(newIndexOpts.opts)
   405  	if err != nil {
   406  		return nil, err
   407  	}
   408  
   409  	if dice.enabled {
   410  		logger.Info("namespace forward indexing configured",
   411  			zap.Stringer("namespace", nsMD.ID()),
   412  			zap.Bool("enabled", dice.enabled),
   413  			zap.Duration("threshold", dice.forwardIndexThreshold),
   414  			zap.Float64("rate", dice.forwardIndexDice.Rate()))
   415  	} else {
   416  		idxOpts := newIndexOpts.opts.IndexOptions()
   417  		logger.Info("namespace forward indexing not enabled",
   418  			zap.Stringer("namespace", nsMD.ID()),
   419  			zap.Bool("enabled", false),
   420  			zap.Float64("threshold", idxOpts.ForwardIndexThreshold()),
   421  			zap.Float64("probability", idxOpts.ForwardIndexProbability()))
   422  	}
   423  
   424  	idx.forwardIndexDice = dice
   425  
   426  	// allocate indexing queue and start it up.
   427  	queue := newIndexQueueFn(idx.writeBatches, nsMD, nowFn, coreFn, scope)
   428  	if err := queue.Start(); err != nil {
   429  		return nil, err
   430  	}
   431  	idx.state.insertQueue = queue
   432  
   433  	// allocate the current block to ensure we're able to index as soon as we return
   434  	currentBlock := xtime.ToUnixNano(nowFn()).Truncate(idx.blockSize)
   435  	idx.state.RLock()
   436  	_, err = idx.ensureBlockPresentWithRLock(currentBlock)
   437  	idx.state.RUnlock()
   438  	if err != nil {
   439  		return nil, err
   440  	}
   441  
   442  	// Report stats
   443  	go idx.reportStatsUntilClosed()
   444  
   445  	return idx, nil
   446  }
   447  
// SetRuntimeOptions is intentionally a no-op: the supplied runtime options
// are ignored. The method exists so that nsIndex can be registered as a
// listener with the runtime options manager (see the constructor).
func (i *nsIndex) SetRuntimeOptions(runtime.Options) {
}
   450  
   451  func (i *nsIndex) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptions) {
   452  	// We don't like to log from every single index segment that has
   453  	// settings updated so we log the changes here.
   454  	i.logger.Info("set namespace runtime index options",
   455  		zap.Stringer("namespace", i.nsMetadata.ID()),
   456  		zap.Any("writeIndexingPerCPUConcurrency", opts.WriteIndexingPerCPUConcurrency()),
   457  		zap.Any("flushIndexingPerCPUConcurrency", opts.FlushIndexingPerCPUConcurrency()))
   458  }
   459  
   460  func (i *nsIndex) reportStatsUntilClosed() {
   461  	ticker := time.NewTicker(nsIndexReportStatsInterval)
   462  	defer ticker.Stop()
   463  
   464  	for {
   465  		select {
   466  		case <-ticker.C:
   467  			err := i.reportStats()
   468  			if err != nil {
   469  				i.logger.Warn("could not report index stats", zap.Error(err))
   470  			}
   471  		case <-i.state.closeCh:
   472  			return
   473  		}
   474  	}
   475  }
   476  
// nsIndexCompactionLevelStats accumulates, for one compaction size level,
// how many segments fell in that level and their summed size as tallied by
// reportStats.
type nsIndexCompactionLevelStats struct {
	numSegments  int64
	numTotalDocs int64
}
   481  
   482  func (i *nsIndex) reportStats() error {
   483  	i.state.RLock()
   484  	defer i.state.RUnlock()
   485  
   486  	foregroundLevels := i.metrics.blockMetrics.ForegroundSegments.Levels
   487  	foregroundLevelStats := make([]nsIndexCompactionLevelStats, len(foregroundLevels))
   488  
   489  	backgroundLevels := i.metrics.blockMetrics.BackgroundSegments.Levels
   490  	backgroundLevelStats := make([]nsIndexCompactionLevelStats, len(backgroundLevels))
   491  
   492  	flushedLevels := i.metrics.blockMetrics.FlushedSegments.Levels
   493  	flushedLevelStats := make([]nsIndexCompactionLevelStats, len(flushedLevels))
   494  
   495  	minIndexConcurrency := 0
   496  	maxIndexConcurrency := 0
   497  	sumIndexConcurrency := 0
   498  	numIndexingStats := 0
   499  	reporter := index.NewBlockStatsReporter(
   500  		func(s index.BlockSegmentStats) {
   501  			var (
   502  				levels     []nsIndexBlocksSegmentsLevelMetrics
   503  				levelStats []nsIndexCompactionLevelStats
   504  			)
   505  			switch s.Type {
   506  			case index.ActiveForegroundSegment:
   507  				levels = foregroundLevels
   508  				levelStats = foregroundLevelStats
   509  			case index.ActiveBackgroundSegment:
   510  				levels = backgroundLevels
   511  				levelStats = backgroundLevelStats
   512  			case index.FlushedSegment:
   513  				levels = flushedLevels
   514  				levelStats = flushedLevelStats
   515  			}
   516  
   517  			for i, l := range levels {
   518  				contained := s.Size >= l.MinSizeInclusive && s.Size < l.MaxSizeExclusive
   519  				if !contained {
   520  					continue
   521  				}
   522  
   523  				l.SegmentsAge.Record(s.Age)
   524  				levelStats[i].numSegments++
   525  				levelStats[i].numTotalDocs += s.Size
   526  
   527  				break
   528  			}
   529  		},
   530  		func(s index.BlockIndexingStats) {
   531  			first := numIndexingStats == 0
   532  			numIndexingStats++
   533  
   534  			if first {
   535  				minIndexConcurrency = s.IndexConcurrency
   536  				maxIndexConcurrency = s.IndexConcurrency
   537  				sumIndexConcurrency = s.IndexConcurrency
   538  				return
   539  			}
   540  
   541  			if v := s.IndexConcurrency; v < minIndexConcurrency {
   542  				minIndexConcurrency = v
   543  			}
   544  			if v := s.IndexConcurrency; v > maxIndexConcurrency {
   545  				maxIndexConcurrency = v
   546  			}
   547  			sumIndexConcurrency += s.IndexConcurrency
   548  		})
   549  
   550  	// iterate known blocks in a defined order of time (newest first)
   551  	// for debug log ordering
   552  	for _, b := range i.state.blocksDescOrderImmutable {
   553  		err := b.block.Stats(reporter)
   554  		if err == index.ErrUnableReportStatsBlockClosed {
   555  			// Closed blocks are temporarily in the list still
   556  			continue
   557  		}
   558  		if err != nil {
   559  			return err
   560  		}
   561  	}
   562  	// Active block should always be open.
   563  	if err := i.activeBlock.Stats(reporter); err != nil {
   564  		return err
   565  	}
   566  
   567  	// Update level stats.
   568  	for _, elem := range []struct {
   569  		levels     []nsIndexBlocksSegmentsLevelMetrics
   570  		levelStats []nsIndexCompactionLevelStats
   571  	}{
   572  		{foregroundLevels, foregroundLevelStats},
   573  		{backgroundLevels, backgroundLevelStats},
   574  	} {
   575  		for i, v := range elem.levelStats {
   576  			elem.levels[i].NumSegments.Update(float64(v.numSegments))
   577  			elem.levels[i].NumTotalDocs.Update(float64(v.numTotalDocs))
   578  		}
   579  	}
   580  
   581  	// Update the indexing stats.
   582  	i.metrics.indexingConcurrencyMin.Update(float64(minIndexConcurrency))
   583  	i.metrics.indexingConcurrencyMax.Update(float64(maxIndexConcurrency))
   584  	avgIndexConcurrency := float64(sumIndexConcurrency) / float64(numIndexingStats)
   585  	i.metrics.indexingConcurrencyAvg.Update(avgIndexConcurrency)
   586  
   587  	return nil
   588  }
   589  
   590  func (i *nsIndex) BlockStartForWriteTime(writeTime xtime.UnixNano) xtime.UnixNano {
   591  	return writeTime.Truncate(i.blockSize)
   592  }
   593  
   594  func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, error) {
   595  	result, err := i.ensureBlockPresent(blockStart)
   596  	if err != nil {
   597  		return nil, err
   598  	}
   599  	return result.block, nil
   600  }
   601  
   602  // NB(prateek): including the call chains leading to this point:
   603  //
   604  // - For new entry (previously unseen in the shard):
   605  //     shard.WriteTagged()
   606  //       => shard.insertSeriesAsyncBatched()
   607  //       => shardInsertQueue.Insert()
   608  //       => shard.writeBatch()
   609  //       => index.WriteBatch()
   610  //       => indexQueue.Insert()
   611  //       => index.writeBatch()
   612  //
   613  // - For entry which exists in the shard, but needs indexing (either past
   614  //   the TTL or the last indexing hasn't happened/failed):
   615  //      shard.WriteTagged()
   616  //        => shard.insertSeriesForIndexingAsyncBatched()
   617  //        => shardInsertQueue.Insert()
   618  //        => shard.writeBatch()
   619  //        => index.Write()
   620  //        => indexQueue.Insert()
   621  //      	=> index.writeBatch()
   622  
   623  func (i *nsIndex) WriteBatch(
   624  	batch *index.WriteBatch,
   625  ) error {
   626  	// Filter anything with a pending index out before acquiring lock.
   627  	batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize()
   628  	if !batch.PendingAny() {
   629  		return nil
   630  	}
   631  
   632  	i.state.RLock()
   633  	if !i.isOpenWithRLock() {
   634  		i.state.RUnlock()
   635  		i.metrics.insertAfterClose.Inc(1)
   636  		err := errDbIndexUnableToWriteClosed
   637  		batch.MarkUnmarkedEntriesError(err)
   638  		return err
   639  	}
   640  
   641  	// NB(prateek): retrieving insertMode here while we have the RLock.
   642  	insertMode := i.state.runtimeOpts.insertMode
   643  	wg, err := i.state.insertQueue.InsertBatch(batch)
   644  
   645  	// release the lock because we don't need it past this point.
   646  	i.state.RUnlock()
   647  
   648  	// if we're unable to index, we still have to finalize the reference we hold.
   649  	if err != nil {
   650  		batch.MarkUnmarkedEntriesError(err)
   651  		return err
   652  	}
   653  	// once the write has been queued in the indexInsertQueue, it assumes
   654  	// responsibility for calling the resource hooks.
   655  
   656  	// wait/terminate depending on if we are indexing synchronously or not.
   657  	if insertMode != index.InsertAsync {
   658  		wg.Wait()
   659  
   660  		// Re-sort the batch by initial enqueue order
   661  		if numErrs := batch.NumErrs(); numErrs > 0 {
   662  			// Restore the sort order from when enqueued for the caller.
   663  			batch.SortByEnqueued()
   664  			return fmt.Errorf("check batch: %d insert errors", numErrs)
   665  		}
   666  	}
   667  
   668  	return nil
   669  }
   670  
   671  func (i *nsIndex) WritePending(
   672  	pending []writes.PendingIndexInsert,
   673  ) error {
   674  	// Filter anything with a pending index out before acquiring lock.
   675  	incoming := pending
   676  	pending = pending[:0]
   677  	for j := range incoming {
   678  		t := i.BlockStartForWriteTime(incoming[j].Entry.Timestamp)
   679  		if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) {
   680  			continue
   681  		}
   682  		// Continue to add this element.
   683  		pending = append(pending, incoming[j])
   684  	}
   685  	if len(pending) == 0 {
   686  		return nil
   687  	}
   688  
   689  	i.state.RLock()
   690  	if !i.isOpenWithRLock() {
   691  		i.state.RUnlock()
   692  		i.metrics.insertAfterClose.Inc(1)
   693  		return errDbIndexUnableToWriteClosed
   694  	}
   695  	_, err := i.state.insertQueue.InsertPending(pending)
   696  	// release the lock because we don't need it past this point.
   697  	i.state.RUnlock()
   698  
   699  	return err
   700  }
   701  
// writeBatches is called by the indexInsertQueue (it is passed to the queue
// constructor as the batch callback).
//
// It validates every entry in batch, marking rejected entries with an error,
// optionally appends forward-index entries for the next block, and then
// hands the remaining unmarked entries to writeBatchForBlockStart grouped by
// block start. If the index is closed it returns immediately without marking
// anything.
func (i *nsIndex) writeBatches(
	batch *index.WriteBatch,
) {
	// NB(prateek): we use a read lock to guard against mutation of the
	// indexBlocks, mutations within the underlying blocks are guarded
	// by primitives internal to it.
	i.state.RLock()
	if !i.isOpenWithRLock() {
		i.state.RUnlock()
		// NB(prateek): deliberately skip calling any of the `OnIndexFinalize` methods
		// on the provided inserts to terminate quicker during shutdown.
		return
	}
	var (
		now                        = xtime.ToUnixNano(i.nowFn())
		blockSize                  = i.blockSize
		// Acceptance window for warm writes: (now - bufferPast, now + bufferFuture).
		futureLimit                = now.Add(1 * i.bufferFuture)
		pastLimit                  = now.Add(-1 * i.bufferPast)
		// Entries at or before this time are rejected as out of retention.
		earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now)
		batchOptions               = batch.Options()
		forwardIndexDice           = i.forwardIndexDice
		forwardIndexEnabled        = forwardIndexDice.enabled
		// Counters used for the metrics emitted at the end.
		total                      int
		notSkipped                 int
		forwardIndexHits           int
		forwardIndexMiss           int

		forwardIndexBatch *index.WriteBatch
	)
	// NB(r): Release lock early to avoid writing batches impacting ticking
	// speed, etc.
	// Sometimes foreground compaction can take a long time during heavy inserts.
	// Each lookup to ensureBlockPresent checks that index is still open, etc.
	i.state.RUnlock()

	if forwardIndexEnabled {
		// NB(arnikola): Don't initialize forward index batch if forward indexing
		// is not enabled.
		forwardIndexBatch = index.NewWriteBatch(batchOptions)
	}

	// Ensure timestamp is not too old/new based on retention policies and that
	// doc is valid. Add potential forward writes to the forwardWriteBatch.
	batch.ForEach(
		func(idx int, entry index.WriteBatchEntry,
			d doc.Metadata, _ index.WriteBatchEntryResult) {
			total++

			if len(i.doNotIndexWithFields) != 0 {
				// This feature rarely used, do not optimize and just do n*m checks.
				// The entry is dropped only if EVERY configured field is
				// present on the document with an equal value.
				drop := true
				for _, matchField := range i.doNotIndexWithFields {
					matchedField := false
					for _, actualField := range d.Fields {
						if bytes.Equal(actualField.Name, matchField.Name) {
							// Only the first document field with this name is compared.
							matchedField = bytes.Equal(actualField.Value, matchField.Value)
							break
						}
					}
					if !matchedField {
						drop = false
						break
					}
				}
				if drop {
					batch.MarkUnmarkedEntryError(errDbIndexDoNotIndexSeries, idx)
					return
				}
			}

			ts := entry.Timestamp
			// NB(bodu): Always check first to see if the write is within retention.
			if !ts.After(earliestBlockStartToRetain) {
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx)
				return
			}

			// Reject timestamps at or beyond now + bufferFuture.
			if !futureLimit.After(ts) {
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooFuture, idx)
				return
			}

			if ts.Before(pastLimit) && !i.coldWritesEnabled {
				// NB(bodu): We only mark entries as too far in the past if
				// cold writes are not enabled.
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx)
				return
			}

			if forwardIndexEnabled {
				if forwardIndexDice.roll(ts) {
					forwardIndexHits++
					// Start of the block following the one ts belongs to.
					forwardEntryTimestamp := ts.Truncate(blockSize).Add(blockSize)
					if entry.OnIndexSeries.NeedsIndexUpdate(forwardEntryTimestamp) {
						// entry is a by-value copy, so changing the timestamp
						// here does not affect the original batch entry.
						forwardIndexEntry := entry
						forwardIndexEntry.Timestamp = forwardEntryTimestamp
						t := i.BlockStartForWriteTime(forwardEntryTimestamp)
						forwardIndexEntry.OnIndexSeries.OnIndexPrepare(t)
						forwardIndexBatch.Append(forwardIndexEntry, d)
					}
				} else {
					forwardIndexMiss++
				}
			}

			notSkipped++
		})

	// Forward-index entries are appended only after the ForEach pass above
	// has finished iterating the original batch.
	if forwardIndexEnabled && forwardIndexBatch.Len() > 0 {
		i.metrics.forwardIndexCounter.Inc(int64(forwardIndexBatch.Len()))
		batch.AppendAll(forwardIndexBatch)
	}

	// Sort the inserts by which block they're applicable for, and do the inserts
	// for each block, making sure to not try to insert any entries already marked
	// with a result.
	batch.ForEachUnmarkedBatchByBlockStart(i.writeBatchForBlockStart)

	// Track index insertions.
	// Note: attemptTotal should = attemptSkip + attemptWrite.
	i.metrics.asyncInsertAttemptTotal.Inc(int64(total))
	i.metrics.asyncInsertAttemptSkip.Inc(int64(total - notSkipped))
	i.metrics.forwardIndexHits.Inc(int64(forwardIndexHits))
	i.metrics.forwardIndexMisses.Inc(int64(forwardIndexMiss))
}
   828  
   829  func (i *nsIndex) writeBatchForBlockStart(
   830  	blockStart xtime.UnixNano, batch *index.WriteBatch,
   831  ) {
   832  	// NB(r): Capture pending entries so we can emit the latencies
   833  	pending := batch.PendingEntries()
   834  	numPending := len(pending)
   835  
   836  	// Track attempted write.
   837  	// Note: attemptTotal should = attemptSkip + attemptWrite.
   838  	i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending))
   839  
   840  	// i.e. we have the block and the inserts, perform the writes.
   841  	result, err := i.activeBlock.WriteBatch(batch)
   842  
   843  	// Record the end to end indexing latency.
   844  	now := i.nowFn()
   845  	for idx := range pending {
   846  		took := now.Sub(pending[idx].EnqueuedAt)
   847  		i.metrics.insertEndToEndLatency.Record(took)
   848  	}
   849  
   850  	// NB: we don't need to do anything to the OnIndexSeries refs in `inserts` at this point,
   851  	// the index.Block WriteBatch assumes responsibility for calling the appropriate methods.
   852  	if n := result.NumSuccess; n > 0 {
   853  		i.metrics.asyncInsertSuccess.Inc(n)
   854  	}
   855  
   856  	// Record mutable segments count foreground/background if latest block.
   857  	if stats := result.MutableSegmentsStats; !stats.Empty() {
   858  		i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments))
   859  		i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs))
   860  		i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments))
   861  		i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs))
   862  	}
   863  
   864  	// Allow for duplicate write errors since due to re-indexing races
   865  	// we may try to re-index a series more than once.
   866  	if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil {
   867  		numErrors := numPending - int(result.NumSuccess)
   868  		if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok {
   869  			// If it was a batch partial error we know exactly how many failed
   870  			// after filtering out for duplicate ID errors.
   871  			numErrors = len(partialError.Errs())
   872  		}
   873  		i.metrics.asyncInsertErrors.Inc(int64(numErrors))
   874  		i.logger.Error("error writing to index block", zap.Error(err))
   875  	}
   876  }
   877  
   878  // Bootstrap bootstraps the index with the provided blocks.
   879  func (i *nsIndex) Bootstrap(
   880  	bootstrapResults result.IndexResults,
   881  ) error {
   882  	i.state.Lock()
   883  	if i.state.bootstrapState == Bootstrapping {
   884  		i.state.Unlock()
   885  		return errDbIndexIsBootstrapping
   886  	}
   887  	i.state.bootstrapState = Bootstrapping
   888  	i.state.Unlock()
   889  
   890  	i.state.RLock()
   891  	defer func() {
   892  		i.state.RUnlock()
   893  		i.state.Lock()
   894  		i.state.bootstrapState = Bootstrapped
   895  		i.state.Unlock()
   896  	}()
   897  
   898  	var multiErr xerrors.MultiError
   899  	for blockStart, blockResults := range bootstrapResults {
   900  		blockResult, err := i.ensureBlockPresentWithRLock(blockStart)
   901  		if err != nil { // should never happen
   902  			multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err))
   903  			continue
   904  		}
   905  		if err := blockResult.block.AddResults(blockResults); err != nil {
   906  			multiErr = multiErr.Add(err)
   907  		}
   908  	}
   909  
   910  	return multiErr.FinalError()
   911  }
   912  
   913  func (i *nsIndex) Bootstrapped() bool {
   914  	i.state.RLock()
   915  	result := i.state.bootstrapState == Bootstrapped
   916  	i.state.RUnlock()
   917  	return result
   918  }
   919  
   920  func (i *nsIndex) Tick(
   921  	c context.Cancellable,
   922  	startTime xtime.UnixNano,
   923  ) (namespaceIndexTickResult, error) {
   924  	var result namespaceIndexTickResult
   925  
   926  	// First collect blocks and acquire lock to remove those that need removing
   927  	// but then release lock so can Tick and do other expensive tasks
   928  	// such as notify of sealed blocks.
   929  	tickingBlocks, multiErr := i.tickingBlocks(startTime)
   930  
   931  	result.NumBlocks = int64(tickingBlocks.totalBlocks)
   932  	for _, block := range tickingBlocks.tickingBlocks {
   933  		if c.IsCancelled() {
   934  			multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation)
   935  			return result, multiErr.FinalError()
   936  		}
   937  
   938  		blockTickResult, tickErr := block.Tick(c)
   939  		multiErr = multiErr.Add(tickErr)
   940  		result.NumSegments += blockTickResult.NumSegments
   941  		result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
   942  		result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
   943  		result.NumTotalDocs += blockTickResult.NumDocs
   944  		result.FreeMmap += blockTickResult.FreeMmap
   945  	}
   946  
   947  	blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c)
   948  	multiErr = multiErr.Add(tickErr)
   949  	result.NumSegments += blockTickResult.NumSegments
   950  	result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
   951  	result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
   952  	result.NumTotalDocs += blockTickResult.NumDocs
   953  	result.FreeMmap += blockTickResult.FreeMmap
   954  
   955  	i.metrics.tick.Inc(1)
   956  
   957  	return result, multiErr.FinalError()
   958  }
   959  
// tickingBlocksResult is the set of blocks collected by nsIndex.tickingBlocks
// while holding the state lock, so that they can be ticked after the lock has
// been released.
//
// totalBlocks is the number of entries left in the index's blocksByTime map
// once blocks outside of retention have been dropped, activeBlock is the
// index's active block and tickingBlocks are the retained blocks to tick.
type tickingBlocksResult struct {
	totalBlocks   int
	activeBlock   index.Block
	tickingBlocks []index.Block
}
   965  
   966  func (i *nsIndex) tickingBlocks(
   967  	startTime xtime.UnixNano,
   968  ) (tickingBlocksResult, xerrors.MultiError) {
   969  	multiErr := xerrors.NewMultiError()
   970  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(
   971  		i.retentionPeriod, i.blockSize, startTime)
   972  
   973  	i.state.Lock()
   974  	activeBlock := i.activeBlock
   975  	tickingBlocks := make([]index.Block, 0, len(i.state.blocksByTime))
   976  	defer func() {
   977  		i.updateBlockStartsWithLock()
   978  		i.state.Unlock()
   979  	}()
   980  
   981  	for blockStart, block := range i.state.blocksByTime {
   982  		// Drop any blocks past the retention period.
   983  		if blockStart.Before(earliestBlockStartToRetain) {
   984  			multiErr = multiErr.Add(block.Close())
   985  			delete(i.state.blocksByTime, blockStart)
   986  			continue
   987  		}
   988  
   989  		// Tick any blocks we're going to retain, but don't tick inline here
   990  		// we'll do this out of the block.
   991  		tickingBlocks = append(tickingBlocks, block)
   992  
   993  		// Seal any blocks that are sealable while holding lock (seal is fast).
   994  		if !blockStart.After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() {
   995  			multiErr = multiErr.Add(block.Seal())
   996  		}
   997  	}
   998  
   999  	return tickingBlocksResult{
  1000  		totalBlocks:   len(i.state.blocksByTime),
  1001  		activeBlock:   activeBlock,
  1002  		tickingBlocks: tickingBlocks,
  1003  	}, multiErr
  1004  }
  1005  
  1006  func (i *nsIndex) WarmFlush(
  1007  	flush persist.IndexFlush,
  1008  	shards []databaseShard,
  1009  ) error {
  1010  	if len(shards) == 0 {
  1011  		// No-op if no shards currently owned.
  1012  		return nil
  1013  	}
  1014  
  1015  	flushable, err := i.flushableBlocks(shards, series.WarmWrite)
  1016  	if err != nil {
  1017  		return err
  1018  	}
  1019  
  1020  	// Determine the current flush indexing concurrency.
  1021  	namespaceRuntimeOpts := i.namespaceRuntimeOptsMgr.Get()
  1022  	perCPUFraction := namespaceRuntimeOpts.FlushIndexingPerCPUConcurrencyOrDefault()
  1023  	cpus := math.Ceil(perCPUFraction * float64(goruntime.GOMAXPROCS(0)))
  1024  	concurrency := int(math.Max(1, cpus))
  1025  
  1026  	builderOpts := i.opts.IndexOptions().SegmentBuilderOptions().
  1027  		SetConcurrency(concurrency)
  1028  
  1029  	builder, err := builder.NewBuilderFromDocuments(builderOpts)
  1030  	if err != nil {
  1031  		return err
  1032  	}
  1033  	defer builder.Close()
  1034  
  1035  	// Emit concurrency, then reset gauge to zero to show time
  1036  	// active during flushing broken down per namespace.
  1037  	i.metrics.flushIndexingConcurrency.Update(float64(concurrency))
  1038  	defer i.metrics.flushIndexingConcurrency.Update(0)
  1039  
  1040  	var evicted int
  1041  	for _, block := range flushable {
  1042  		immutableSegments, err := i.flushBlock(flush, block, shards, builder)
  1043  		if err != nil {
  1044  			return err
  1045  		}
  1046  		// Make a result that covers the entire time ranges for the
  1047  		// block for each shard
  1048  		fulfilled := result.NewShardTimeRangesFromRange(block.StartTime(), block.EndTime(),
  1049  			dbShards(shards).IDs()...)
  1050  
  1051  		// Add the results to the block.
  1052  		persistedSegments := make([]result.Segment, 0, len(immutableSegments))
  1053  		for _, elem := range immutableSegments {
  1054  			persistedSegment := result.NewSegment(elem, true)
  1055  			persistedSegments = append(persistedSegments, persistedSegment)
  1056  		}
  1057  		blockResult := result.NewIndexBlock(persistedSegments, fulfilled)
  1058  		results := result.NewIndexBlockByVolumeType(block.StartTime())
  1059  		results.SetBlock(idxpersist.DefaultIndexVolumeType, blockResult)
  1060  		if err := block.AddResults(results); err != nil {
  1061  			return err
  1062  		}
  1063  
  1064  		evicted++
  1065  
  1066  		// It's now safe to remove the mutable segments as anything the block
  1067  		// held is covered by the owned shards we just read
  1068  		if err := block.EvictMutableSegments(); err != nil {
  1069  			// deliberately choosing to not mark this as an error as we have successfully
  1070  			// flushed any mutable data.
  1071  			i.logger.Warn("encountered error while evicting mutable segments for index block",
  1072  				zap.Error(err),
  1073  				zap.Time("blockStart", block.StartTime().ToTime()),
  1074  			)
  1075  		}
  1076  
  1077  		for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) {
  1078  			for _, s := range shards {
  1079  				s.MarkWarmIndexFlushStateSuccessOrError(t, err)
  1080  			}
  1081  		}
  1082  	}
  1083  	i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted))
  1084  	return nil
  1085  }
  1086  
  1087  func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) {
  1088  	if len(shards) == 0 {
  1089  		// No-op if no shards currently owned.
  1090  		return func() error { return nil }, nil
  1091  	}
  1092  
  1093  	flushable, err := i.flushableBlocks(shards, series.ColdWrite)
  1094  	if err != nil {
  1095  		return nil, err
  1096  	}
  1097  	// We only rotate cold mutable segments in phase I of cold flushing.
  1098  	for _, block := range flushable {
  1099  		if err := block.RotateColdMutableSegments(); err != nil {
  1100  			return nil, err
  1101  		}
  1102  	}
  1103  	// We can't immediately evict cold mutable segments so we return a callback to do so
  1104  	// when cold flush finishes.
  1105  	return func() error {
  1106  		multiErr := xerrors.NewMultiError()
  1107  		for _, block := range flushable {
  1108  			multiErr = multiErr.Add(block.EvictColdMutableSegments())
  1109  		}
  1110  		return multiErr.FinalError()
  1111  	}, nil
  1112  }
  1113  
  1114  // WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk.
  1115  func (i *nsIndex) WarmFlushBlockStarts() []xtime.UnixNano {
  1116  	flushed := make([]xtime.UnixNano, 0)
  1117  	infoFiles := i.readInfoFilesAsMap()
  1118  
  1119  	for blockStart := range infoFiles {
  1120  		if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) {
  1121  			flushed = append(flushed, blockStart)
  1122  		}
  1123  	}
  1124  	return flushed
  1125  }
  1126  
  1127  // BackgroundCompact background compacts eligible segments.
  1128  func (i *nsIndex) BackgroundCompact() {
  1129  	if i.activeBlock != nil {
  1130  		i.activeBlock.BackgroundCompact()
  1131  	}
  1132  	for _, b := range i.state.blocksByTime {
  1133  		b.BackgroundCompact()
  1134  	}
  1135  }
  1136  
  1137  func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano][]fs.ReadIndexInfoFileResult {
  1138  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  1139  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  1140  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  1141  		Namespace:        i.nsMetadata.ID(),
  1142  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  1143  	})
  1144  	result := make(map[xtime.UnixNano][]fs.ReadIndexInfoFileResult)
  1145  	for _, infoFile := range infoFiles {
  1146  		t := xtime.UnixNano(infoFile.Info.BlockStart)
  1147  		files := result[t]
  1148  		result[t] = append(files, infoFile)
  1149  	}
  1150  	return result
  1151  }
  1152  
  1153  func (i *nsIndex) flushableBlocks(
  1154  	shards []databaseShard,
  1155  	flushType series.WriteType,
  1156  ) ([]index.Block, error) {
  1157  	i.state.RLock()
  1158  	defer i.state.RUnlock()
  1159  	if !i.isOpenWithRLock() {
  1160  		return nil, errDbIndexUnableToFlushClosed
  1161  	}
  1162  	// NB(bodu): We read index info files once here to avoid re-reading all of them
  1163  	// for each block.
  1164  	infoFiles := i.readInfoFilesAsMap()
  1165  	flushable := make([]index.Block, 0, len(i.state.blocksByTime))
  1166  
  1167  	now := xtime.ToUnixNano(i.nowFn())
  1168  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now)
  1169  	currentBlockStart := now.Truncate(i.blockSize)
  1170  	// Check for flushable blocks by iterating through all block starts w/in retention.
  1171  	for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) {
  1172  		blockResult, err := i.ensureBlockPresentWithRLock(blockStart)
  1173  		if err != nil {
  1174  			return nil, err
  1175  		}
  1176  
  1177  		canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart,
  1178  			blockResult.block, shards, flushType)
  1179  		if err != nil {
  1180  			return nil, err
  1181  		}
  1182  		if !canFlush {
  1183  			continue
  1184  		}
  1185  
  1186  		flushable = append(flushable, blockResult.block)
  1187  	}
  1188  	return flushable, nil
  1189  }
  1190  
  1191  func (i *nsIndex) canFlushBlockWithRLock(
  1192  	infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult,
  1193  	blockStart xtime.UnixNano,
  1194  	block index.Block,
  1195  	shards []databaseShard,
  1196  	flushType series.WriteType,
  1197  ) (bool, error) {
  1198  	switch flushType {
  1199  	case series.WarmWrite:
  1200  		// NB(bodu): We should always attempt to warm flush sealed blocks to disk if
  1201  		// there doesn't already exist data on disk. We're checking this instead of
  1202  		// `block.NeedsMutableSegmentsEvicted()` since bootstrap writes for cold block starts
  1203  		// get marked as warm writes if there doesn't already exist data on disk and need to
  1204  		// properly go through the warm flush lifecycle.
  1205  		if !block.IsSealed() || i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) {
  1206  			return false, nil
  1207  		}
  1208  	case series.ColdWrite:
  1209  		if !block.NeedsColdMutableSegmentsEvicted() {
  1210  			return false, nil
  1211  		}
  1212  	}
  1213  
  1214  	// Check all data files exist for the shards we own
  1215  	for _, shard := range shards {
  1216  		if !shard.IsBootstrapped() {
  1217  			i.logger.
  1218  				With(zap.Uint32("shard", shard.ID())).
  1219  				Debug("skipping index cold flush due to shard not bootstrapped yet")
  1220  			continue
  1221  		}
  1222  
  1223  		for _, t := range i.blockStartsFromIndexBlockStart(blockStart) {
  1224  			flushState, err := shard.FlushState(t)
  1225  			if err != nil {
  1226  				return false, err
  1227  			}
  1228  
  1229  			// Skip if the data flushing failed. Data flushing precedes index flushing.
  1230  			if flushState.WarmStatus.DataFlushed != fileOpSuccess {
  1231  				return false, nil
  1232  			}
  1233  		}
  1234  	}
  1235  
  1236  	return true, nil
  1237  }
  1238  
  1239  // blockStartsFromIndexBlockStart returns the possibly many blocksStarts that exist within
  1240  // a given index block (since index block size >= data block size)
  1241  func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano {
  1242  	start := blockStart
  1243  	end := blockStart.Add(i.blockSize)
  1244  	dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize()
  1245  	blockStarts := make([]xtime.UnixNano, 0)
  1246  	for t := start; t.Before(end); t = t.Add(dataBlockSize) {
  1247  		blockStarts = append(blockStarts, t)
  1248  	}
  1249  	return blockStarts
  1250  }
  1251  
  1252  func (i *nsIndex) hasIndexWarmFlushedToDisk(
  1253  	infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult,
  1254  	blockStart xtime.UnixNano,
  1255  ) bool {
  1256  	// NB(bodu): We consider the block to have been warm flushed if there are any
  1257  	// filesets on disk. This is consistent with the "has warm flushed" check in the db shard.
  1258  	// Shard block starts are marked as having warm flushed if an info file is successfully read from disk.
  1259  	f, ok := infoFiles[blockStart]
  1260  	if !ok {
  1261  		return false
  1262  	}
  1263  
  1264  	for _, fileInfo := range f {
  1265  		indexVolumeType := idxpersist.DefaultIndexVolumeType
  1266  		if fileInfo.Info.IndexVolumeType != nil {
  1267  			indexVolumeType = idxpersist.IndexVolumeType(fileInfo.Info.IndexVolumeType.Value)
  1268  		}
  1269  		match := fileInfo.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType
  1270  		if match {
  1271  			return true
  1272  		}
  1273  	}
  1274  	return false
  1275  }
  1276  
  1277  func (i *nsIndex) flushBlock(
  1278  	flush persist.IndexFlush,
  1279  	indexBlock index.Block,
  1280  	shards []databaseShard,
  1281  	builder segment.DocumentsBuilder,
  1282  ) ([]segment.Segment, error) {
  1283  	allShards := make(map[uint32]struct{})
  1284  	for _, shard := range shards {
  1285  		// Populate all shards
  1286  		allShards[shard.ID()] = struct{}{}
  1287  	}
  1288  
  1289  	volumeIndex, err := i.opts.IndexClaimsManager().ClaimNextIndexFileSetVolumeIndex(
  1290  		i.nsMetadata,
  1291  		indexBlock.StartTime(),
  1292  	)
  1293  	if err != nil {
  1294  		return nil, fmt.Errorf("failed to claim next index volume index: %w", err)
  1295  	}
  1296  
  1297  	preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{
  1298  		NamespaceMetadata: i.nsMetadata,
  1299  		BlockStart:        indexBlock.StartTime(),
  1300  		FileSetType:       persist.FileSetFlushType,
  1301  		Shards:            allShards,
  1302  		// NB(bodu): By default, we always write to the "default" index volume type.
  1303  		IndexVolumeType: idxpersist.DefaultIndexVolumeType,
  1304  		VolumeIndex:     volumeIndex,
  1305  	})
  1306  	if err != nil {
  1307  		return nil, err
  1308  	}
  1309  
  1310  	var closed bool
  1311  	defer func() {
  1312  		if !closed {
  1313  			segments, _ := preparedPersist.Close()
  1314  			// NB(r): Safe to for over a nil array so disregard error here.
  1315  			for _, segment := range segments {
  1316  				segment.Close()
  1317  			}
  1318  		}
  1319  	}()
  1320  
  1321  	// Flush a single block segment.
  1322  	if err := i.flushBlockSegment(preparedPersist, indexBlock, shards, builder); err != nil {
  1323  		return nil, err
  1324  	}
  1325  
  1326  	closed = true
  1327  
  1328  	// Now return the immutable segments
  1329  	return preparedPersist.Close()
  1330  }
  1331  
  1332  func (i *nsIndex) flushBlockSegment(
  1333  	preparedPersist persist.PreparedIndexPersist,
  1334  	indexBlock index.Block,
  1335  	shards []databaseShard,
  1336  	builder segment.DocumentsBuilder,
  1337  ) error {
  1338  	// Reset the builder
  1339  	builder.Reset()
  1340  
  1341  	var (
  1342  		batch     = m3ninxindex.Batch{AllowPartialUpdates: true}
  1343  		batchSize = defaultFlushDocsBatchSize
  1344  	)
  1345  	ctx := i.opts.ContextPool().Get()
  1346  	defer ctx.Close()
  1347  
  1348  	for _, shard := range shards {
  1349  		var (
  1350  			first     = true
  1351  			pageToken PageToken
  1352  		)
  1353  		for first || pageToken != nil {
  1354  			first = false
  1355  
  1356  			var (
  1357  				opts = block.FetchBlocksMetadataOptions{
  1358  					// NB(bodu): There is a lag between when data gets flushed
  1359  					// to disk and when it gets removed from memory during the next
  1360  					// Tick. In this case, the same series can exist both on disk
  1361  					// and in memory at the same time resulting in dupe series IDs.
  1362  					// Only read data from disk when flushing index segments.
  1363  					OnlyDisk: true,
  1364  				}
  1365  				limit   = defaultFlushReadDataBlocksBatchSize
  1366  				results block.FetchBlocksMetadataResults
  1367  				err     error
  1368  			)
  1369  			ctx.Reset()
  1370  			results, pageToken, err = shard.FetchBlocksMetadataV2(ctx,
  1371  				indexBlock.StartTime(), indexBlock.EndTime(),
  1372  				limit, pageToken, opts)
  1373  			if err != nil {
  1374  				return err
  1375  			}
  1376  
  1377  			// Reset docs batch before use.
  1378  			batch.Docs = batch.Docs[:0]
  1379  			for _, result := range results.Results() {
  1380  				doc, exists, err := shard.DocRef(result.ID)
  1381  				if err != nil {
  1382  					return err
  1383  				}
  1384  				if !exists {
  1385  					doc, err = convert.FromSeriesIDAndTagIter(result.ID, result.Tags)
  1386  					if err != nil {
  1387  						return err
  1388  					}
  1389  					i.metrics.flushDocsNew.Inc(1)
  1390  				} else {
  1391  					i.metrics.flushDocsCached.Inc(1)
  1392  				}
  1393  
  1394  				batch.Docs = append(batch.Docs, doc)
  1395  				if len(batch.Docs) < batchSize {
  1396  					continue
  1397  				}
  1398  
  1399  				err = i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch))
  1400  				if err != nil {
  1401  					return err
  1402  				}
  1403  
  1404  				// Reset docs after insertions.
  1405  				batch.Docs = batch.Docs[:0]
  1406  			}
  1407  
  1408  			// Add last batch if remaining.
  1409  			if len(batch.Docs) > 0 {
  1410  				err := i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch))
  1411  				if err != nil {
  1412  					return err
  1413  				}
  1414  			}
  1415  
  1416  			results.Close()
  1417  
  1418  			// Use BlockingCloseReset so that we can reuse the context without
  1419  			// it going back to the pool.
  1420  			ctx.BlockingCloseReset()
  1421  		}
  1422  	}
  1423  
  1424  	// Finally flush this segment
  1425  	return preparedPersist.Persist(builder)
  1426  }
  1427  
  1428  func (i *nsIndex) sanitizeAllowDuplicatesWriteError(err error) error {
  1429  	if err == nil {
  1430  		return nil
  1431  	}
  1432  
  1433  	// NB: dropping duplicate id error messages from logs as they're expected when we see
  1434  	// repeated inserts. as long as a block has an ID, it's not an error so we don't need
  1435  	// to pollute the logs with these messages.
  1436  	if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok {
  1437  		err = partialError.FilterDuplicateIDErrors()
  1438  	}
  1439  
  1440  	return err
  1441  }
  1442  
  1443  func (i *nsIndex) AssignShardSet(shardSet sharding.ShardSet) {
  1444  	// NB(r): Allocate the filter function once, it can be used outside
  1445  	// of locks as it depends on no internal state.
  1446  	set := bitset.NewBitSet(uint(shardSet.Max()))
  1447  	assigned := make(map[uint32]struct{})
  1448  	for _, shardID := range shardSet.AllIDs() {
  1449  		set.Set(uint(shardID))
  1450  		assigned[shardID] = struct{}{}
  1451  	}
  1452  
  1453  	i.state.Lock()
  1454  	i.state.shardsFilterID = func(id ident.ID) bool {
  1455  		// NB(r): Use a bitset for fast lookups.
  1456  		return set.Test(uint(shardSet.Lookup(id)))
  1457  	}
  1458  
  1459  	i.state.shardFilteredForID = func(id ident.ID) (uint32, bool) {
  1460  		shard := shardSet.Lookup(id)
  1461  		return shard, set.Test(uint(shard))
  1462  	}
  1463  
  1464  	i.state.shardsAssigned = assigned
  1465  	i.state.Unlock()
  1466  }
  1467  
  1468  func (i *nsIndex) shardsFilterID() func(id ident.ID) bool {
  1469  	i.state.RLock()
  1470  	v := i.state.shardsFilterID
  1471  	i.state.RUnlock()
  1472  	return v
  1473  }
  1474  
  1475  func (i *nsIndex) shardForID() func(id ident.ID) (uint32, bool) {
  1476  	i.state.RLock()
  1477  	v := i.state.shardFilteredForID
  1478  	i.state.RUnlock()
  1479  	return v
  1480  }
  1481  
  1482  func (i *nsIndex) Query(
  1483  	ctx context.Context,
  1484  	query index.Query,
  1485  	opts index.QueryOptions,
  1486  ) (index.QueryResult, error) {
  1487  	var logFields []opentracinglog.Field
  1488  	ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery)
  1489  	defer sp.Finish()
  1490  	if sampled {
  1491  		// Only allocate metadata such as query string if sampling trace.
  1492  		logFields = []opentracinglog.Field{
  1493  			opentracinglog.String("query", query.String()),
  1494  			opentracinglog.String("namespace", i.nsMetadata.ID().String()),
  1495  			opentracinglog.Int("seriesLimit", opts.SeriesLimit),
  1496  			opentracinglog.Int("docsLimit", opts.DocsLimit),
  1497  			xopentracing.Time("queryStart", opts.StartInclusive.ToTime()),
  1498  			xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()),
  1499  		}
  1500  		sp.LogFields(logFields...)
  1501  	}
  1502  
  1503  	// Get results and set the namespace ID and size limit.
  1504  	results := i.resultsPool.Get()
  1505  	results.Reset(i.nsMetadata.ID(), index.QueryResultsOptions{
  1506  		SizeLimit: opts.SeriesLimit,
  1507  		FilterID:  i.shardsFilterID(),
  1508  	})
  1509  	ctx.RegisterFinalizer(results)
  1510  	queryRes, err := i.query(ctx, query, results, opts, i.execBlockQueryFn,
  1511  		i.newBlockQueryIterFn, logFields)
  1512  	if err != nil {
  1513  		sp.LogFields(opentracinglog.Error(err))
  1514  		return index.QueryResult{}, err
  1515  	}
  1516  
  1517  	return index.QueryResult{
  1518  		Results:    results,
  1519  		Exhaustive: queryRes.exhaustive,
  1520  		Waited:     queryRes.waited,
  1521  	}, nil
  1522  }
  1523  
  1524  func (i *nsIndex) AggregateQuery(
  1525  	ctx context.Context,
  1526  	query index.Query,
  1527  	opts index.AggregationOptions,
  1528  ) (index.AggregateQueryResult, error) {
  1529  	id := i.nsMetadata.ID()
  1530  	logFields := []opentracinglog.Field{
  1531  		opentracinglog.String("query", query.String()),
  1532  		opentracinglog.String("namespace", id.String()),
  1533  		opentracinglog.Int("seriesLimit", opts.SeriesLimit),
  1534  		opentracinglog.Int("docsLimit", opts.DocsLimit),
  1535  		xopentracing.Time("queryStart", opts.StartInclusive.ToTime()),
  1536  		xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()),
  1537  	}
  1538  
  1539  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxAggregateQuery)
  1540  	sp.LogFields(logFields...)
  1541  	defer sp.Finish()
  1542  
  1543  	metrics := index.NewAggregateUsageMetrics(id, i.opts.InstrumentOptions())
  1544  	// Get results and set the filters, namespace ID and size limit.
  1545  	results := i.aggregateResultsPool.Get()
  1546  	aopts := index.AggregateResultsOptions{
  1547  		SizeLimit:             opts.SeriesLimit,
  1548  		DocsLimit:             opts.DocsLimit,
  1549  		FieldFilter:           opts.FieldFilter,
  1550  		Type:                  opts.Type,
  1551  		AggregateUsageMetrics: metrics,
  1552  	}
  1553  	ctx.RegisterFinalizer(results)
  1554  	// use appropriate fn to query underlying blocks.
  1555  	// use block.Aggregate() for querying and set the query if required.
  1556  	fn := i.execBlockAggregateQueryFn
  1557  	isAllQuery := query.Equal(allQuery)
  1558  	if !isAllQuery {
  1559  		if field, isFieldQuery := idx.FieldQuery(query.Query); isFieldQuery {
  1560  			aopts.FieldFilter = aopts.FieldFilter.AddIfMissing(field)
  1561  		} else {
  1562  			// Need to actually restrict whether we should return a term or not
  1563  			// based on running the actual query to resolve a postings list and
  1564  			// then seeing if that intersects the aggregated term postings list
  1565  			// at all.
  1566  			aopts.RestrictByQuery = &query
  1567  		}
  1568  	}
  1569  	aopts.FieldFilter = aopts.FieldFilter.SortAndDedupe()
  1570  	results.Reset(id, aopts)
  1571  	queryRes, err := i.query(ctx, query, results, opts.QueryOptions, fn,
  1572  		i.newBlockAggregatorIterFn, logFields)
  1573  	if err != nil {
  1574  		return index.AggregateQueryResult{}, err
  1575  	}
  1576  	return index.AggregateQueryResult{
  1577  		Results:    results,
  1578  		Exhaustive: queryRes.exhaustive,
  1579  		Waited:     queryRes.waited,
  1580  	}, nil
  1581  }
  1582  
// queryResult is the internal outcome of running a query against the index,
// its fields are surfaced to callers as the Exhaustive and Waited fields of
// the public query results.
//
// exhaustive being false makes query fail with a limit exceeded error when the
// query options require exhaustive results.
// NOTE(review): waited presumably counts how often the query had to wait while
// executing; it is populated outside of this type, confirm against
// queryWithSpan.
type queryResult struct {
	exhaustive bool
	waited     int
}
  1587  
  1588  func (i *nsIndex) query(
  1589  	ctx context.Context,
  1590  	query index.Query,
  1591  	results index.BaseResults,
  1592  	opts index.QueryOptions,
  1593  	execBlockFn execBlockQueryFn,
  1594  	newBlockIterFn newBlockIterFn,
  1595  	logFields []opentracinglog.Field,
  1596  ) (queryResult, error) {
  1597  	ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper)
  1598  	sp.LogFields(logFields...)
  1599  	defer sp.Finish()
  1600  	if sampled {
  1601  		// Only log fields if sampled.
  1602  		sp.LogFields(logFields...)
  1603  	}
  1604  
  1605  	queryRes, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn,
  1606  		newBlockIterFn, sp, logFields)
  1607  	if err != nil {
  1608  		sp.LogFields(opentracinglog.Error(err))
  1609  
  1610  		if queryRes.exhaustive {
  1611  			i.metrics.queryExhaustiveInternalError.Inc(1)
  1612  		} else {
  1613  			i.metrics.queryNonExhaustiveInternalError.Inc(1)
  1614  		}
  1615  		return queryRes, err
  1616  	}
  1617  
  1618  	if queryRes.exhaustive {
  1619  		i.metrics.queryExhaustiveSuccess.Inc(1)
  1620  		return queryRes, nil
  1621  	}
  1622  
  1623  	// If require exhaustive but not, return error.
  1624  	if opts.RequireExhaustive {
  1625  		seriesCount := results.Size()
  1626  		docsCount := results.TotalDocsCount()
  1627  		if opts.SeriesLimitExceeded(seriesCount) {
  1628  			i.metrics.queryNonExhaustiveSeriesLimitError.Inc(1)
  1629  		} else if opts.DocsLimitExceeded(docsCount) {
  1630  			i.metrics.queryNonExhaustiveDocsLimitError.Inc(1)
  1631  		} else {
  1632  			i.metrics.queryNonExhaustiveLimitError.Inc(1)
  1633  		}
  1634  
  1635  		// NB(r): Make sure error is not retried and returns as bad request.
  1636  		return queryRes, xerrors.NewInvalidParamsError(limits.NewQueryLimitExceededError(fmt.Sprintf(
  1637  			"query exceeded limit: require_exhaustive=%v, series_limit=%d, series_matched=%d, docs_limit=%d, docs_matched=%d",
  1638  			opts.RequireExhaustive,
  1639  			opts.SeriesLimit,
  1640  			seriesCount,
  1641  			opts.DocsLimit,
  1642  			docsCount,
  1643  		)))
  1644  	}
  1645  
  1646  	// Otherwise non-exhaustive but not required to be.
  1647  	i.metrics.queryNonExhaustiveSuccess.Inc(1)
  1648  	return queryRes, nil
  1649  }
  1650  
// blockIter is a composite type to hold various state about a block while iterating over the results.
type blockIter struct {
	// iter is the result iterator created for the block.
	iter           index.ResultIterator
	// iterCloser wraps iter in a safe closer so that Close may be called more than once.
	iterCloser     io.Closer
	// block is the index block that iter reads from.
	block          index.Block
	// waitTime accumulates the time spent waiting to acquire permits for this block.
	waitTime       time.Duration
	// processingTime accumulates the time spent executing the query against this block.
	processingTime time.Duration
}
  1659  
  1660  func (i *nsIndex) queryWithSpan(
  1661  	ctx context.Context,
  1662  	query index.Query,
  1663  	results index.BaseResults,
  1664  	opts index.QueryOptions,
  1665  	execBlockFn execBlockQueryFn,
  1666  	newBlockIterFn newBlockIterFn,
  1667  	span opentracing.Span,
  1668  	logFields []opentracinglog.Field,
  1669  ) (queryResult, error) {
  1670  	i.state.RLock()
  1671  	if !i.isOpenWithRLock() {
  1672  		i.state.RUnlock()
  1673  		return queryResult{}, errDbIndexUnableToQueryClosed
  1674  	}
  1675  
  1676  	// Track this as an inflight query that needs to finish
  1677  	// when the index is closed.
  1678  	i.queriesWg.Add(1)
  1679  	defer i.queriesWg.Done()
  1680  
  1681  	// Enact overrides for query options
  1682  	opts = i.overriddenOptsForQueryWithRLock(opts)
  1683  
  1684  	// Retrieve blocks to query, then we can release lock.
  1685  	// NB(r): Important not to block ticking, and other tasks by
  1686  	// holding the RLock during a query.
  1687  	qryRange := xtime.NewRanges(xtime.Range{
  1688  		Start: opts.StartInclusive,
  1689  		End:   opts.EndExclusive,
  1690  	})
  1691  	// NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's
  1692  	// immutable and we only create an iterator over it.
  1693  	blocks := newBlocksIterStackAlloc(i.activeBlock, i.state.blocksDescOrderImmutable, qryRange)
  1694  
  1695  	// Can now release the lock and execute the query without holding the lock.
  1696  	i.state.RUnlock()
  1697  
  1698  	var (
  1699  		// State contains concurrent mutable state for async execution below.
  1700  		state = &asyncQueryExecState{}
  1701  		wg    sync.WaitGroup
  1702  	)
  1703  	perms, err := i.permitsManager.NewPermits(ctx)
  1704  	if err != nil {
  1705  		return queryResult{}, err
  1706  	}
  1707  	defer perms.Close()
  1708  
  1709  	var blockIters []*blockIter
  1710  	for b, ok := blocks.Next(); ok; b, ok = b.Next() {
  1711  		block := b.Current()
  1712  		iter, err := newBlockIterFn(ctx, block, query, results)
  1713  		if err != nil {
  1714  			return queryResult{}, err
  1715  		}
  1716  		blockIters = append(blockIters, &blockIter{
  1717  			iter:       iter,
  1718  			iterCloser: x.NewSafeCloser(iter),
  1719  			block:      block,
  1720  		})
  1721  	}
  1722  
  1723  	defer func() {
  1724  		for _, iter := range blockIters {
  1725  			// safe to call Close multiple times, so it's fine to eagerly close in the loop below and here.
  1726  			_ = iter.iterCloser.Close()
  1727  		}
  1728  	}()
  1729  
  1730  	// queryCanceled returns true if the query has been canceled and the current iteration should terminate.
  1731  	queryCanceled := func() bool {
  1732  		return opts.LimitsExceeded(results.Size(), results.TotalDocsCount()) || state.hasErr()
  1733  	}
  1734  	// waitForPermit waits for a permit. returns non-nil if the permit was acquired and the wait time.
  1735  	waitForPermit := func() (permits.Permit, time.Duration) {
  1736  		// make sure the query hasn't been canceled before waiting for a permit.
  1737  		if queryCanceled() {
  1738  			return nil, 0
  1739  		}
  1740  
  1741  		startWait := time.Now()
  1742  		acquireResult, err := perms.Acquire(ctx)
  1743  		waitTime := time.Since(startWait)
  1744  		var success bool
  1745  		defer func() {
  1746  			// Note: ALWAYS release if we do not successfully return back
  1747  			// the permit and we checked one out.
  1748  			if !success && acquireResult.Permit != nil {
  1749  				perms.Release(acquireResult.Permit)
  1750  			}
  1751  		}()
  1752  		if acquireResult.Waited {
  1753  			// Potentially break an error if require no wait set.
  1754  			if err == nil && opts.RequireNoWait {
  1755  				// Fail iteration if request requires no waiting occurs.
  1756  				err = permits.ErrOperationWaitedOnRequireNoWait
  1757  			}
  1758  			state.incWaited(1)
  1759  		}
  1760  		if err != nil {
  1761  			state.addErr(err)
  1762  			return nil, waitTime
  1763  		}
  1764  
  1765  		// make sure the query hasn't been canceled while waiting for a permit.
  1766  		if queryCanceled() {
  1767  			return nil, waitTime
  1768  		}
  1769  
  1770  		success = true
  1771  		return acquireResult.Permit, waitTime
  1772  	}
  1773  
  1774  	// We're looping through all the blocks that we need to query and kicking
  1775  	// off parallel queries which are bounded by the permits maximum
  1776  	// concurrency. It's possible at this point that we've completed querying one or more blocks and already exhausted
  1777  	// the maximum number of results that we're allowed to return. If thats the case, there is no value in kicking off
  1778  	// more parallel queries, so we break out of the loop.
  1779  	for _, blockIter := range blockIters {
  1780  		// Capture for async query execution below.
  1781  		blockIter := blockIter
  1782  
  1783  		// acquire a permit before kicking off the goroutine to process the iterator. this limits the number of
  1784  		// concurrent goroutines to # of permits + large queries that needed multiple iterations to finish.
  1785  		permit, waitTime := waitForPermit()
  1786  		blockIter.waitTime += waitTime
  1787  		if permit == nil {
  1788  			break
  1789  		}
  1790  
  1791  		// must not reuse logField slice as the last field will be mutated by concurrent goroutines.
  1792  		blockLogFields := make([]opentracinglog.Field, 0, len(logFields)+1)
  1793  		blockLogFields = append(blockLogFields, logFields...)
  1794  
  1795  		wg.Add(1)
  1796  		// kick off a go routine to process the entire iterator.
  1797  		go func() {
  1798  			defer wg.Done()
  1799  			first := true
  1800  			for !blockIter.iter.Done() {
  1801  				// if this is not the first iteration of the iterator, need to acquire another permit.
  1802  				if !first {
  1803  					permit, waitTime = waitForPermit()
  1804  					blockIter.waitTime += waitTime
  1805  					if permit == nil {
  1806  						break
  1807  					}
  1808  				}
  1809  				blockLogFields = append(blockLogFields, xopentracing.Duration("permitWaitTime", waitTime))
  1810  				first = false
  1811  				startProcessing := time.Now()
  1812  				execBlockFn(ctx, blockIter.block, permit, blockIter.iter, opts, state, results, blockLogFields)
  1813  				processingTime := time.Since(startProcessing)
  1814  				blockIter.processingTime += processingTime
  1815  				permit.Use(int64(processingTime))
  1816  				perms.Release(permit)
  1817  			}
  1818  			if first {
  1819  				// this should never happen since a new iter cannot be Done, but just to be safe.
  1820  				perms.Release(permit)
  1821  			}
  1822  
  1823  			// close the iterator since it's no longer needed. it's safe to call Close multiple times, here and in the
  1824  			// defer when the function returns.
  1825  			if err := blockIter.iterCloser.Close(); err != nil {
  1826  				state.addErr(err)
  1827  			}
  1828  		}()
  1829  	}
  1830  
  1831  	// wait for all workers to finish. if the caller cancels the call, the workers will be interrupted and eventually
  1832  	// finish.
  1833  	wg.Wait()
  1834  
  1835  	i.metrics.loadedDocsPerQuery.RecordValue(float64(results.TotalDocsCount()))
  1836  
  1837  	exhaustive := opts.Exhaustive(results.Size(), results.TotalDocsCount())
  1838  	// ok to read state without lock since all parallel queries are done.
  1839  	multiErr := state.multiErr
  1840  	err = multiErr.FinalError()
  1841  
  1842  	return queryResult{
  1843  		exhaustive: exhaustive,
  1844  		waited:     state.waited(),
  1845  	}, err
  1846  }
  1847  
// newBlockQueryIterFn creates the result iterator for a document query against
// the given block by delegating to block.QueryIter. The results argument is
// not used.
func (i *nsIndex) newBlockQueryIterFn(
	ctx context.Context,
	block index.Block,
	query index.Query,
	_ index.BaseResults,
) (index.ResultIterator, error) {
	return block.QueryIter(ctx, query)
}
  1856  
  1857  //nolint: dupl
  1858  func (i *nsIndex) execBlockQueryFn(
  1859  	ctx context.Context,
  1860  	block index.Block,
  1861  	permit permits.Permit,
  1862  	iter index.ResultIterator,
  1863  	opts index.QueryOptions,
  1864  	state *asyncQueryExecState,
  1865  	results index.BaseResults,
  1866  	logFields []opentracinglog.Field,
  1867  ) {
  1868  	logFields = append(logFields,
  1869  		xopentracing.Time("blockStart", block.StartTime().ToTime()),
  1870  		xopentracing.Time("blockEnd", block.EndTime().ToTime()),
  1871  	)
  1872  
  1873  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockQuery)
  1874  	sp.LogFields(logFields...)
  1875  	defer sp.Finish()
  1876  
  1877  	docResults, ok := results.(index.DocumentResults)
  1878  	if !ok { // should never happen
  1879  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", results))
  1880  		return
  1881  	}
  1882  	queryIter, ok := iter.(index.QueryIterator)
  1883  	if !ok { // should never happen
  1884  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter))
  1885  		return
  1886  	}
  1887  
  1888  	deadline := time.Now().Add(time.Duration(permit.AllowedQuota()))
  1889  	err := block.QueryWithIter(ctx, opts, queryIter, docResults, deadline, logFields)
  1890  	if err == index.ErrUnableToQueryBlockClosed {
  1891  		// NB(r): Because we query this block outside of the results lock, it's
  1892  		// possible this block may get closed if it slides out of retention, in
  1893  		// that case those results are no longer considered valid and outside of
  1894  		// retention regardless, so this is a non-issue.
  1895  		err = nil
  1896  	}
  1897  
  1898  	if err != nil {
  1899  		sp.LogFields(opentracinglog.Error(err))
  1900  		state.addErr(err)
  1901  	}
  1902  }
  1903  
  1904  func (i *nsIndex) newBlockAggregatorIterFn(
  1905  	ctx context.Context,
  1906  	block index.Block,
  1907  	_ index.Query,
  1908  	results index.BaseResults,
  1909  ) (index.ResultIterator, error) {
  1910  	aggResults, ok := results.(index.AggregateResults)
  1911  	if !ok { // should never happen
  1912  		return nil, fmt.Errorf("unknown results type [%T] received during aggregation", results)
  1913  	}
  1914  	return block.AggregateIter(ctx, aggResults.AggregateResultsOptions())
  1915  }
  1916  
  1917  func (i *nsIndex) execBlockAggregateQueryFn(
  1918  	ctx context.Context,
  1919  	block index.Block,
  1920  	permit permits.Permit,
  1921  	iter index.ResultIterator,
  1922  	opts index.QueryOptions,
  1923  	state *asyncQueryExecState,
  1924  	results index.BaseResults,
  1925  	logFields []opentracinglog.Field,
  1926  ) {
  1927  	logFields = append(logFields,
  1928  		xopentracing.Time("blockStart", block.StartTime().ToTime()),
  1929  		xopentracing.Time("blockEnd", block.EndTime().ToTime()),
  1930  	)
  1931  
  1932  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockAggregateQuery)
  1933  	sp.LogFields(logFields...)
  1934  	defer sp.Finish()
  1935  
  1936  	aggResults, ok := results.(index.AggregateResults)
  1937  	if !ok { // should never happen
  1938  		state.addErr(fmt.Errorf("unknown results type [%T] received during aggregation", results))
  1939  		return
  1940  	}
  1941  	aggIter, ok := iter.(index.AggregateIterator)
  1942  	if !ok { // should never happen
  1943  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter))
  1944  		return
  1945  	}
  1946  
  1947  	deadline := time.Now().Add(time.Duration(permit.AllowedQuota()))
  1948  	err := block.AggregateWithIter(ctx, aggIter, opts, aggResults, deadline, logFields)
  1949  	if err == index.ErrUnableToQueryBlockClosed {
  1950  		// NB(r): Because we query this block outside of the results lock, it's
  1951  		// possible this block may get closed if it slides out of retention, in
  1952  		// that case those results are no longer considered valid and outside of
  1953  		// retention regardless, so this is a non-issue.
  1954  		err = nil
  1955  	}
  1956  
  1957  	if err != nil {
  1958  		sp.LogFields(opentracinglog.Error(err))
  1959  		state.addErr(err)
  1960  	}
  1961  }
  1962  
  1963  func (i *nsIndex) overriddenOptsForQueryWithRLock(
  1964  	opts index.QueryOptions,
  1965  ) index.QueryOptions {
  1966  	// Override query response limits if needed.
  1967  	if i.state.runtimeOpts.maxQuerySeriesLimit > 0 && (opts.SeriesLimit == 0 ||
  1968  		int64(opts.SeriesLimit) > i.state.runtimeOpts.maxQuerySeriesLimit) {
  1969  		i.logger.Debug("overriding query response series limit",
  1970  			zap.Int("requested", opts.SeriesLimit),
  1971  			zap.Int64("maxAllowed", i.state.runtimeOpts.maxQuerySeriesLimit)) // FOLLOWUP(prateek): log query too once it's serializable.
  1972  		opts.SeriesLimit = int(i.state.runtimeOpts.maxQuerySeriesLimit)
  1973  	}
  1974  	if i.state.runtimeOpts.maxQueryDocsLimit > 0 && (opts.DocsLimit == 0 ||
  1975  		int64(opts.DocsLimit) > i.state.runtimeOpts.maxQueryDocsLimit) {
  1976  		i.logger.Debug("overriding query response docs limit",
  1977  			zap.Int("requested", opts.DocsLimit),
  1978  			zap.Int64("maxAllowed", i.state.runtimeOpts.maxQueryDocsLimit)) // FOLLOWUP(prateek): log query too once it's serializable.
  1979  		opts.DocsLimit = int(i.state.runtimeOpts.maxQueryDocsLimit)
  1980  	}
  1981  	return opts
  1982  }
  1983  
// blockPresentResult is the result of looking up (or allocating) the index
// block for a block start.
type blockPresentResult struct {
	// block is the index block for the requested block start.
	block  index.Block
	// latest reports whether block is the index's current latest block.
	latest bool
}
  1988  
// ensureBlockPresent acquires the state read lock and returns the block for
// blockStart via ensureBlockPresentWithRLock. It returns
// errDbIndexUnableToWriteClosed if the index is closed.
func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (blockPresentResult, error) {
	i.state.RLock()
	defer i.state.RUnlock()
	if !i.isOpenWithRLock() {
		return blockPresentResult{}, errDbIndexUnableToWriteClosed
	}
	return i.ensureBlockPresentWithRLock(blockStart)
}
  1997  
  1998  func (i *nsIndex) isLatestBlockWithRLock(blockStart xtime.UnixNano) bool {
  1999  	return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart)
  2000  }
  2001  
  2002  // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified
  2003  // blockStart, allocating one if it does not. It returns the desired block, or
  2004  // error if it's unable to do so.
  2005  func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (blockPresentResult, error) {
  2006  	// check if the current latest block matches the required block, this
  2007  	// is the usual path and can short circuit the rest of the logic in this
  2008  	// function in most cases.
  2009  	if i.isLatestBlockWithRLock(blockStart) {
  2010  		return blockPresentResult{
  2011  			block:  i.state.latestBlock,
  2012  			latest: true,
  2013  		}, nil
  2014  	}
  2015  
  2016  	// check if exists in the map (this can happen if the latestBlock has not
  2017  	// been rotated yet).
  2018  	if block, ok := i.state.blocksByTime[blockStart]; ok {
  2019  		return blockPresentResult{block: block}, nil
  2020  	}
  2021  
  2022  	// i.e. block start does not exist, so we have to alloc.
  2023  	// we release the RLock (the function is called with this lock), and acquire
  2024  	// the write lock to do the extra allocation.
  2025  	i.state.RUnlock()
  2026  	i.state.Lock()
  2027  
  2028  	// need to guarantee all exit paths from the function leave with the RLock
  2029  	// so we release the write lock and re-acquire a read lock.
  2030  	defer func() {
  2031  		i.state.Unlock()
  2032  		i.state.RLock()
  2033  	}()
  2034  
  2035  	// re-check if exists in the map (another routine did the alloc)
  2036  	if block, ok := i.state.blocksByTime[blockStart]; ok {
  2037  		return blockPresentResult{
  2038  			block:  block,
  2039  			latest: i.isLatestBlockWithRLock(blockStart),
  2040  		}, nil
  2041  	}
  2042  
  2043  	// ok now we know for sure we have to alloc
  2044  	block, err := i.newBlockFn(blockStart, i.nsMetadata,
  2045  		index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions())
  2046  	if err != nil { // unable to allocate the block, should never happen.
  2047  		return blockPresentResult{}, i.unableToAllocBlockInvariantError(err)
  2048  	}
  2049  
  2050  	// NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent.
  2051  	// We need to seal cold blocks write away for cold writes.
  2052  	if !blockStart.After(i.lastSealableBlockStart(xtime.ToUnixNano(i.nowFn()))) {
  2053  		if err := block.Seal(); err != nil {
  2054  			return blockPresentResult{}, err
  2055  		}
  2056  	}
  2057  
  2058  	// add to tracked blocks map
  2059  	i.state.blocksByTime[blockStart] = block
  2060  
  2061  	// update ordered blockStarts slice, and latestBlock
  2062  	i.updateBlockStartsWithLock()
  2063  
  2064  	return blockPresentResult{
  2065  		block:  block,
  2066  		latest: i.isLatestBlockWithRLock(blockStart),
  2067  	}, nil
  2068  }
  2069  
  2070  func (i *nsIndex) lastSealableBlockStart(t xtime.UnixNano) xtime.UnixNano {
  2071  	return retention.FlushTimeEndForBlockSize(i.blockSize, t.Add(-i.bufferPast))
  2072  }
  2073  
  2074  func (i *nsIndex) updateBlockStartsWithLock() {
  2075  	// update ordered blockStarts slice
  2076  	var (
  2077  		latestBlockStart xtime.UnixNano
  2078  		latestBlock      index.Block
  2079  	)
  2080  
  2081  	blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1)
  2082  	for ts, block := range i.state.blocksByTime {
  2083  		if ts >= latestBlockStart {
  2084  			latestBlockStart = ts
  2085  			latestBlock = block
  2086  		}
  2087  		blocks = append(blocks, blockAndBlockStart{
  2088  			block:      block,
  2089  			blockStart: ts,
  2090  		})
  2091  	}
  2092  
  2093  	// order in desc order (i.e. reverse chronological)
  2094  	sort.Slice(blocks, func(i, j int) bool {
  2095  		return blocks[i].blockStart > blocks[j].blockStart
  2096  	})
  2097  
  2098  	// NB(r): Important not to modify this once set since we take reference
  2099  	// to this slice with an RLock, release with RUnlock and then loop over it
  2100  	// during query time so it must not be altered and stay immutable.
  2101  	// This is done to avoid allocating a copy of the slice at query time for
  2102  	// each query.
  2103  	i.state.blocksDescOrderImmutable = blocks
  2104  
  2105  	// rotate latestBlock
  2106  	i.state.latestBlock = latestBlock
  2107  }
  2108  
// isOpenWithRLock reports whether the index has not been closed. Reads
// i.state.closed, so the caller must hold the state lock.
func (i *nsIndex) isOpenWithRLock() bool {
	return !i.state.closed
}
  2112  
  2113  func (i *nsIndex) CleanupExpiredFileSets(t xtime.UnixNano) error {
  2114  	// we only expire data on drive that we don't hold a reference to, and is
  2115  	// past the expiration period. the earliest data we have to retain is given
  2116  	// by the following computation:
  2117  	//  Min(FIRST_EXPIRED_BLOCK, EARLIEST_RETAINED_BLOCK)
  2118  	i.state.RLock()
  2119  	defer i.state.RUnlock()
  2120  	if i.state.closed {
  2121  		return errDbIndexUnableToCleanupClosed
  2122  	}
  2123  
  2124  	// earliest block to retain based on retention period
  2125  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, t)
  2126  
  2127  	// now we loop through the blocks we hold, to ensure we don't delete any data for them.
  2128  	for t := range i.state.blocksByTime {
  2129  		if t.Before(earliestBlockStartToRetain) {
  2130  			earliestBlockStartToRetain = t
  2131  		}
  2132  	}
  2133  
  2134  	// know the earliest block to retain, find all blocks earlier than it
  2135  	var (
  2136  		pathPrefix = i.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix()
  2137  		nsID       = i.nsMetadata.ID()
  2138  	)
  2139  	filesets, err := i.indexFilesetsBeforeFn(pathPrefix, nsID, earliestBlockStartToRetain)
  2140  	if err != nil {
  2141  		return err
  2142  	}
  2143  
  2144  	// and delete them
  2145  	return i.deleteFilesFn(filesets)
  2146  }
  2147  
  2148  func (i *nsIndex) CleanupCorruptedFileSets() error {
  2149  	/*
  2150  	   Corrupted index filesets can be safely cleaned up if its not
  2151  	   the latest volume index per index volume type/block start combo.
  2152  
  2153  	   We are guaranteed not to be actively writing to an index fileset once
  2154  	   we're already writing to later volume indices.
  2155  	*/
  2156  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  2157  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  2158  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  2159  		Namespace:        i.nsMetadata.ID(),
  2160  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  2161  		IncludeCorrupted: true,
  2162  	})
  2163  
  2164  	if len(infoFiles) == 0 {
  2165  		return nil
  2166  	}
  2167  
  2168  	var (
  2169  		toDelete []string
  2170  		begin    = 0 // marks the beginning of a subslice that contains filesets with same block starts
  2171  	)
  2172  	// It's expected that info files are ordered by block start and volume index
  2173  	for j := range infoFiles {
  2174  		if infoFiles[begin].ID.BlockStart.Before(infoFiles[j].ID.BlockStart) {
  2175  			files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:j])
  2176  			if err != nil {
  2177  				return err
  2178  			}
  2179  			toDelete = append(toDelete, files...)
  2180  			begin = j
  2181  		} else if infoFiles[begin].ID.BlockStart.After(infoFiles[j].ID.BlockStart) {
  2182  			errorMessage := "filesets are expected to be ordered by block start"
  2183  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2184  				l.Error(errorMessage)
  2185  			})
  2186  			return instrument.InvariantErrorf(errorMessage)
  2187  		}
  2188  	}
  2189  
  2190  	// Process the volumes in the last block, which are not covered by the loop.
  2191  	files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:])
  2192  	if err != nil {
  2193  		return err
  2194  	}
  2195  	toDelete = append(toDelete, files...)
  2196  
  2197  	return i.deleteFilesFn(toDelete)
  2198  }
  2199  
  2200  func (i *nsIndex) getCorruptedVolumesForDeletion(filesets []fs.ReadIndexInfoFileResult) ([]string, error) {
  2201  	if len(filesets) <= 1 {
  2202  		return nil, nil
  2203  	}
  2204  
  2205  	// Check for invariants.
  2206  	for j := 1; j < len(filesets); j++ {
  2207  		if !filesets[j-1].ID.BlockStart.Equal(filesets[j].ID.BlockStart) {
  2208  			errorMessage := "all the filesets passed to this function should have the same block start"
  2209  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2210  				l.Error(errorMessage)
  2211  			})
  2212  			return nil, instrument.InvariantErrorf(errorMessage)
  2213  		} else if filesets[j-1].ID.VolumeIndex >= filesets[j].ID.VolumeIndex {
  2214  			errorMessage := "filesets should be ordered by volume index in increasing order"
  2215  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2216  				l.Error(errorMessage)
  2217  			})
  2218  			return nil, instrument.InvariantErrorf(errorMessage)
  2219  		}
  2220  	}
  2221  
  2222  	toDelete := make([]string, 0)
  2223  	hasMoreRecentVolumeOfType := make(map[idxpersist.IndexVolumeType]struct{})
  2224  	// Iterate filesets in reverse order to process higher volume indexes first.
  2225  	for j := len(filesets) - 1; j >= 0; j-- {
  2226  		f := filesets[j]
  2227  
  2228  		// NB: If the fileset info fields contains inconsistent information (e.g. block start inside
  2229  		// info file doesn't match the block start extracted from the filename), it means that info file
  2230  		// is missing or corrupted. Thus we cannot trust the information of this fileset
  2231  		// and we cannot be sure what's the actual volume type of it. However, a part of corrupted
  2232  		// fileset cleanup logic depends on knowing the volume type.
  2233  		//
  2234  		// Such fileset is deleted, except when it is the most recent volume in the block.
  2235  		//
  2236  		// The most recent volume is excluded because it is more likely to be actively written to.
  2237  		// If info file writes are not atomic, due to timing readers might observe the file
  2238  		// to be corrupted, even though at that moment the file is being written/re-written.
  2239  		if f.Corrupted && !f.ID.BlockStart.Equal(xtime.UnixNano(f.Info.BlockStart)) {
  2240  			if j != len(filesets)-1 {
  2241  				toDelete = append(toDelete, f.AbsoluteFilePaths...)
  2242  			}
  2243  			continue
  2244  		}
  2245  
  2246  		volType := idxpersist.DefaultIndexVolumeType
  2247  		if f.Info.IndexVolumeType != nil {
  2248  			volType = idxpersist.IndexVolumeType(f.Info.IndexVolumeType.Value)
  2249  		}
  2250  		// Delete corrupted filesets if there are more recent volumes with the same volume type.
  2251  		if _, ok := hasMoreRecentVolumeOfType[volType]; !ok {
  2252  			hasMoreRecentVolumeOfType[volType] = struct{}{}
  2253  		} else if f.Corrupted {
  2254  			toDelete = append(toDelete, f.AbsoluteFilePaths...)
  2255  		}
  2256  	}
  2257  	return toDelete, nil
  2258  }
  2259  
  2260  func (i *nsIndex) CleanupDuplicateFileSets(activeShards []uint32) error {
  2261  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  2262  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  2263  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  2264  		Namespace:        i.nsMetadata.ID(),
  2265  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  2266  	})
  2267  
  2268  	segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart := make(map[xtime.UnixNano]map[idxpersist.IndexVolumeType][]fs.Segments)
  2269  	for _, file := range infoFiles {
  2270  		seg := fs.NewSegments(file.Info, file.ID.VolumeIndex, file.AbsoluteFilePaths)
  2271  		blockStart := seg.BlockStart()
  2272  		segmentsOrderByVolumeIndexByVolumeType, ok := segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart]
  2273  		if !ok {
  2274  			segmentsOrderByVolumeIndexByVolumeType = make(map[idxpersist.IndexVolumeType][]fs.Segments)
  2275  			segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] = segmentsOrderByVolumeIndexByVolumeType
  2276  		}
  2277  
  2278  		volumeType := seg.VolumeType()
  2279  		if _, ok := segmentsOrderByVolumeIndexByVolumeType[volumeType]; !ok {
  2280  			segmentsOrderByVolumeIndexByVolumeType[volumeType] = make([]fs.Segments, 0)
  2281  		}
  2282  		segmentsOrderByVolumeIndexByVolumeType[volumeType] = append(segmentsOrderByVolumeIndexByVolumeType[volumeType], seg)
  2283  	}
  2284  
  2285  	// Ensure that segments are sorted by volume index.
  2286  	for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart {
  2287  		for _, segs := range segmentsOrderByVolumeIndexByVolumeType {
  2288  			sort.SliceStable(segs, func(i, j int) bool {
  2289  				return segs[i].VolumeIndex() < segs[j].VolumeIndex()
  2290  			})
  2291  		}
  2292  	}
  2293  
  2294  	multiErr := xerrors.NewMultiError()
  2295  	// Check for dupes and remove.
  2296  	filesToDelete := make([]string, 0)
  2297  	for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart {
  2298  		for _, segmentsOrderByVolumeIndex := range segmentsOrderByVolumeIndexByVolumeType {
  2299  			segmentsToKeep := make([]fs.Segments, 0)
  2300  			for _, seg := range segmentsOrderByVolumeIndex {
  2301  				for len(segmentsToKeep) > 0 {
  2302  					idx := len(segmentsToKeep) - 1
  2303  					if previous := segmentsToKeep[idx]; seg.ShardTimeRanges().IsSuperset(
  2304  						previous.ShardTimeRanges().FilterShards(activeShards)) {
  2305  						filesToDelete = append(filesToDelete, previous.AbsoluteFilePaths()...)
  2306  						segmentsToKeep = segmentsToKeep[:idx]
  2307  					} else {
  2308  						break
  2309  					}
  2310  				}
  2311  				segmentsToKeep = append(segmentsToKeep, seg)
  2312  			}
  2313  		}
  2314  	}
  2315  	multiErr = multiErr.Add(i.deleteFilesFn(filesToDelete))
  2316  	return multiErr.FinalError()
  2317  }
  2318  
// DebugMemorySegments writes the memory segments data of every block held by
// the index as index filesets, using filesystem options whose file path prefix
// is opts.OutputDirectory. It holds the state read lock for the whole call and
// returns errDbIndexAlreadyClosed if the index is closed; the first error
// encountered while reading or writing a segment aborts the call.
func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error {
	i.state.RLock()
	defer i.state.RUnlock()
	if i.state.closed {
		return errDbIndexAlreadyClosed
	}

	ctx := context.NewBackground()
	defer ctx.Close()

	// Create a new set of file system options to output to new directory.
	fsOpts := i.opts.CommitLogOptions().
		FilesystemOptions().
		SetFilePathPrefix(opts.OutputDirectory)

	for _, block := range i.state.blocksByTime {
		segmentsData, err := block.MemorySegmentsData(ctx)
		if err != nil {
			return err
		}

		// Each segment is written with its own writer, using the segment's
		// position in the slice as the volume index.
		for numSegment, segmentData := range segmentsData {
			indexWriter, err := fs.NewIndexWriter(fsOpts)
			if err != nil {
				return err
			}

			fileSetID := fs.FileSetFileIdentifier{
				FileSetContentType: persist.FileSetIndexContentType,
				Namespace:          i.nsMetadata.ID(),
				BlockStart:         block.StartTime(),
				VolumeIndex:        numSegment,
			}
			openOpts := fs.IndexWriterOpenOptions{
				Identifier:      fileSetID,
				BlockSize:       i.blockSize,
				FileSetType:     persist.FileSetFlushType,
				Shards:          i.state.shardsAssigned,
				IndexVolumeType: idxpersist.DefaultIndexVolumeType,
			}
			if err := indexWriter.Open(openOpts); err != nil {
				return err
			}

			segWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter(segmentData)
			if err != nil {
				return err
			}

			if err := indexWriter.WriteSegmentFileSet(segWriter); err != nil {
				return err
			}

			if err := indexWriter.Close(); err != nil {
				return err
			}
		}
	}

	return nil
}
  2380  
// Close marks the index as closed, stops the insert queue, closes the runtime
// options listeners, waits for inflight queries to finish and then closes every
// block (including the active block). It returns errDbIndexAlreadyClosed if the
// index was already closed, otherwise the combined error from stopping the
// insert queue and closing the blocks.
func (i *nsIndex) Close() error {
	i.state.Lock()
	if !i.isOpenWithRLock() {
		i.state.Unlock()
		return errDbIndexAlreadyClosed
	}

	i.state.closed = true
	close(i.state.closeCh)

	var multiErr xerrors.MultiError
	multiErr = multiErr.Add(i.state.insertQueue.Stop())

	// Collect the blocks to close while the lock is held; they are closed
	// further below, after the lock is released and queries have drained.
	blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1)
	for _, block := range i.state.blocksByTime {
		blocks = append(blocks, block)
	}
	blocks = append(blocks, i.activeBlock)

	// Drop the index's references to the blocks.
	i.activeBlock = nil
	i.state.latestBlock = nil
	i.state.blocksByTime = nil
	i.state.blocksDescOrderImmutable = nil

	if i.runtimeOptsListener != nil {
		i.runtimeOptsListener.Close()
		i.runtimeOptsListener = nil
	}

	if i.runtimeNsOptsListener != nil {
		i.runtimeNsOptsListener.Close()
		i.runtimeNsOptsListener = nil
	}

	// Can now unlock after collecting blocks to close and setting closed state.
	i.state.Unlock()

	// Wait for inflight queries to finish before closing blocks, do this
	// outside of lock in case an inflight query needs to acquire a read lock
	// to finish but can't acquire it because close was holding the lock waiting
	// for queries to drain first.
	i.queriesWg.Wait()

	for _, block := range blocks {
		multiErr = multiErr.Add(block.Close())
	}

	return multiErr.FinalError()
}
  2430  
  2431  func (i *nsIndex) unableToAllocBlockInvariantError(err error) error {
  2432  	ierr := fmt.Errorf("index unable to allocate block: %v", err)
  2433  	instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2434  		l.Error(ierr.Error())
  2435  	})
  2436  	return ierr
  2437  }
  2438  
// nsIndexMetrics is the set of tally metrics reported by a namespace index.
// All fields are created by newNamespaceIndexMetrics, which also defines the
// metric names and tags each field is exported under.
type nsIndexMetrics struct {
	tick tally.Counter

	// "index-attempt" counters, distinguished by their "stage" tag
	// (process, skip and write respectively).
	asyncInsertAttemptTotal tally.Counter
	asyncInsertAttemptSkip  tally.Counter
	asyncInsertAttemptWrite tally.Counter

	// Insert outcome, forward index, concurrency, flush and latest-block
	// metrics; blockMetrics holds the per-segment-type level metrics.
	asyncInsertSuccess               tally.Counter
	asyncInsertErrors                tally.Counter
	insertAfterClose                 tally.Counter
	queryAfterClose                  tally.Counter
	forwardIndexHits                 tally.Counter
	forwardIndexMisses               tally.Counter
	forwardIndexCounter              tally.Counter
	insertEndToEndLatency            tally.Timer
	blocksEvictedMutableSegments     tally.Counter
	blockMetrics                     nsIndexBlocksMetrics
	indexingConcurrencyMin           tally.Gauge
	indexingConcurrencyMax           tally.Gauge
	indexingConcurrencyAvg           tally.Gauge
	flushIndexingConcurrency         tally.Gauge
	flushDocsNew                     tally.Counter
	flushDocsCached                  tally.Counter
	latestBlockNumSegmentsForeground tally.Gauge
	latestBlockNumDocsForeground     tally.Gauge
	latestBlockNumSegmentsBackground tally.Gauge
	latestBlockNumDocsBackground     tally.Gauge

	// Query metrics: a "loaded-docs-per-query" histogram plus "query"
	// counters distinguished by their "exhaustive" and "result" tags.
	loadedDocsPerQuery                 tally.Histogram
	queryExhaustiveSuccess             tally.Counter
	queryExhaustiveInternalError       tally.Counter
	queryNonExhaustiveSuccess          tally.Counter
	queryNonExhaustiveInternalError    tally.Counter
	queryNonExhaustiveLimitError       tally.Counter
	queryNonExhaustiveSeriesLimitError tally.Counter
	queryNonExhaustiveDocsLimitError   tally.Counter
}
  2476  
  2477  func newNamespaceIndexMetrics(
  2478  	opts index.Options,
  2479  	iopts instrument.Options,
  2480  ) nsIndexMetrics {
  2481  	const (
  2482  		indexAttemptName         = "index-attempt"
  2483  		forwardIndexName         = "forward-index"
  2484  		indexingConcurrency      = "indexing-concurrency"
  2485  		flushIndexingConcurrency = "flush-indexing-concurrency"
  2486  	)
  2487  	scope := iopts.MetricsScope()
  2488  	blocksScope := scope.SubScope("blocks")
  2489  	m := nsIndexMetrics{
  2490  		tick: scope.Counter("index-tick"),
  2491  		asyncInsertAttemptTotal: scope.Tagged(map[string]string{
  2492  			"stage": "process",
  2493  		}).Counter(indexAttemptName),
  2494  		asyncInsertAttemptSkip: scope.Tagged(map[string]string{
  2495  			"stage": "skip",
  2496  		}).Counter(indexAttemptName),
  2497  		asyncInsertAttemptWrite: scope.Tagged(map[string]string{
  2498  			"stage": "write",
  2499  		}).Counter(indexAttemptName),
  2500  		asyncInsertSuccess: scope.Counter("index-success"),
  2501  		asyncInsertErrors: scope.Tagged(map[string]string{
  2502  			"error_type": "async-insert",
  2503  		}).Counter("index-error"),
  2504  		insertAfterClose: scope.Tagged(map[string]string{
  2505  			"error_type": "insert-closed",
  2506  		}).Counter("insert-after-close"),
  2507  		queryAfterClose: scope.Tagged(map[string]string{
  2508  			"error_type": "query-closed",
  2509  		}).Counter("query-after-error"),
  2510  		forwardIndexHits: scope.Tagged(map[string]string{
  2511  			"status": "hit",
  2512  		}).Counter(forwardIndexName),
  2513  		forwardIndexMisses: scope.Tagged(map[string]string{
  2514  			"status": "miss",
  2515  		}).Counter(forwardIndexName),
  2516  		forwardIndexCounter: scope.Tagged(map[string]string{
  2517  			"status": "count",
  2518  		}).Counter(forwardIndexName),
  2519  		insertEndToEndLatency: instrument.NewTimer(scope,
  2520  			"insert-end-to-end-latency", iopts.TimerOptions()),
  2521  		blocksEvictedMutableSegments: scope.Counter("blocks-evicted-mutable-segments"),
  2522  		blockMetrics:                 newNamespaceIndexBlocksMetrics(opts, blocksScope),
  2523  		indexingConcurrencyMin: scope.Tagged(map[string]string{
  2524  			"stat": "min",
  2525  		}).Gauge(indexingConcurrency),
  2526  		indexingConcurrencyMax: scope.Tagged(map[string]string{
  2527  			"stat": "max",
  2528  		}).Gauge(indexingConcurrency),
  2529  		indexingConcurrencyAvg: scope.Tagged(map[string]string{
  2530  			"stat": "avg",
  2531  		}).Gauge(indexingConcurrency),
  2532  		flushIndexingConcurrency: scope.Gauge(flushIndexingConcurrency),
  2533  		flushDocsNew: scope.Tagged(map[string]string{
  2534  			"status": "new",
  2535  		}).Counter("flush-docs"),
  2536  		flushDocsCached: scope.Tagged(map[string]string{
  2537  			"status": "cached",
  2538  		}).Counter("flush-docs"),
  2539  		latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{
  2540  			"segment_type": "foreground",
  2541  		}).Gauge("latest-block-num-segments"),
  2542  		latestBlockNumDocsForeground: scope.Tagged(map[string]string{
  2543  			"segment_type": "foreground",
  2544  		}).Gauge("latest-block-num-docs"),
  2545  		latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{
  2546  			"segment_type": "background",
  2547  		}).Gauge("latest-block-num-segments"),
  2548  		latestBlockNumDocsBackground: scope.Tagged(map[string]string{
  2549  			"segment_type": "background",
  2550  		}).Gauge("latest-block-num-docs"),
  2551  		loadedDocsPerQuery: scope.Histogram(
  2552  			"loaded-docs-per-query",
  2553  			tally.MustMakeExponentialValueBuckets(10, 2, 16),
  2554  		),
  2555  		queryExhaustiveSuccess: scope.Tagged(map[string]string{
  2556  			"exhaustive": "true",
  2557  			"result":     "success",
  2558  		}).Counter("query"),
  2559  		queryExhaustiveInternalError: scope.Tagged(map[string]string{
  2560  			"exhaustive": "true",
  2561  			"result":     "error_internal",
  2562  		}).Counter("query"),
  2563  		queryNonExhaustiveSuccess: scope.Tagged(map[string]string{
  2564  			"exhaustive": "false",
  2565  			"result":     "success",
  2566  		}).Counter("query"),
  2567  		queryNonExhaustiveInternalError: scope.Tagged(map[string]string{
  2568  			"exhaustive": "false",
  2569  			"result":     "error_internal",
  2570  		}).Counter("query"),
  2571  		queryNonExhaustiveLimitError: scope.Tagged(map[string]string{
  2572  			"exhaustive": "false",
  2573  			"result":     "error_require_exhaustive",
  2574  		}).Counter("query"),
  2575  		queryNonExhaustiveSeriesLimitError: scope.Tagged(map[string]string{
  2576  			"exhaustive": "false",
  2577  			"result":     "error_series_require_exhaustive",
  2578  		}).Counter("query"),
  2579  		queryNonExhaustiveDocsLimitError: scope.Tagged(map[string]string{
  2580  			"exhaustive": "false",
  2581  			"result":     "error_docs_require_exhaustive",
  2582  		}).Counter("query"),
  2583  	}
  2584  
  2585  	// Initialize gauges that should default to zero before
  2586  	// returning results so that they are exported with an
  2587  	// explicit zero value at process startup.
  2588  	m.flushIndexingConcurrency.Update(0)
  2589  
  2590  	return m
  2591  }
  2592  
// nsIndexBlocksMetrics groups the per-level segment metrics of index blocks
// by segment type. newNamespaceIndexBlocksMetrics creates the three fields
// under scopes tagged with "segment-type" set to "foreground", "background"
// and "flushed" respectively.
type nsIndexBlocksMetrics struct {
	ForegroundSegments nsIndexBlocksSegmentsMetrics
	BackgroundSegments nsIndexBlocksSegmentsMetrics
	FlushedSegments    nsIndexBlocksSegmentsMetrics
}
  2598  
  2599  func newNamespaceIndexBlocksMetrics(
  2600  	opts index.Options,
  2601  	scope tally.Scope,
  2602  ) nsIndexBlocksMetrics {
  2603  	return nsIndexBlocksMetrics{
  2604  		ForegroundSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2605  			opts.ForegroundCompactionPlannerOptions(),
  2606  			scope.Tagged(map[string]string{
  2607  				"segment-type": "foreground",
  2608  			})),
  2609  		BackgroundSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2610  			opts.BackgroundCompactionPlannerOptions(),
  2611  			scope.Tagged(map[string]string{
  2612  				"segment-type": "background",
  2613  			})),
  2614  		FlushedSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2615  			opts.BackgroundCompactionPlannerOptions(),
  2616  			scope.Tagged(map[string]string{
  2617  				"segment-type": "flushed",
  2618  			})),
  2619  	}
  2620  }
  2621  
// nsIndexBlocksSegmentsMetrics holds the segment metrics of one segment type.
// newNamespaceIndexBlocksSegmentsMetrics fills Levels with one entry per
// level of the compaction planner options, in the same order.
type nsIndexBlocksSegmentsMetrics struct {
	Levels []nsIndexBlocksSegmentsLevelMetrics
}
  2625  
// nsIndexBlocksSegmentsLevelMetrics holds the metrics of a single compaction
// level. MinSizeInclusive and MaxSizeExclusive are copied from the compaction
// level the metrics were created for; NumSegments, NumTotalDocs and
// SegmentsAge are created as the "num-segments" and "num-total-docs" gauges
// and the "segments-age" timer of that level's scope.
type nsIndexBlocksSegmentsLevelMetrics struct {
	MinSizeInclusive int64
	MaxSizeExclusive int64
	NumSegments      tally.Gauge
	NumTotalDocs     tally.Gauge
	SegmentsAge      tally.Timer
}
  2633  
  2634  func newNamespaceIndexBlocksSegmentsMetrics(
  2635  	compactionOpts compaction.PlannerOptions,
  2636  	scope tally.Scope,
  2637  ) nsIndexBlocksSegmentsMetrics {
  2638  	segmentLevelsScope := scope.SubScope("segment-levels")
  2639  	levels := make([]nsIndexBlocksSegmentsLevelMetrics, 0, len(compactionOpts.Levels))
  2640  	for _, level := range compactionOpts.Levels {
  2641  		subScope := segmentLevelsScope.Tagged(map[string]string{
  2642  			"level-min-size": strconv.Itoa(int(level.MinSizeInclusive)),
  2643  			"level-max-size": strconv.Itoa(int(level.MaxSizeExclusive)),
  2644  		})
  2645  		levels = append(levels, nsIndexBlocksSegmentsLevelMetrics{
  2646  			MinSizeInclusive: level.MinSizeInclusive,
  2647  			MaxSizeExclusive: level.MaxSizeExclusive,
  2648  			NumSegments:      subScope.Gauge("num-segments"),
  2649  			NumTotalDocs:     subScope.Gauge("num-total-docs"),
  2650  			SegmentsAge:      subScope.Timer("segments-age"),
  2651  		})
  2652  	}
  2653  
  2654  	return nsIndexBlocksSegmentsMetrics{
  2655  		Levels: levels,
  2656  	}
  2657  }
  2658  
  2659  type dbShards []databaseShard
  2660  
  2661  func (shards dbShards) IDs() []uint32 {
  2662  	ids := make([]uint32, 0, len(shards))
  2663  	for _, s := range shards {
  2664  		ids = append(ids, s.ID())
  2665  	}
  2666  	return ids
  2667  }
  2668  
// blocksIterStackAlloc is a stack allocated block iterator, ensuring no
// allocations per query.
//
// It is used by value: Next returns an advanced copy of the iterator rather
// than mutating it in place. idx identifies the current position: it starts
// at -2 (before the first element, see newBlocksIterStackAlloc), -1 refers to
// activeBlock, and any value >= 0 is an index into blocks. Blocks whose time
// range does not overlap queryRanges are skipped by Next.
type blocksIterStackAlloc struct {
	activeBlock index.Block
	blocks      []blockAndBlockStart
	queryRanges xtime.Ranges
	idx         int
}
  2677  
  2678  func newBlocksIterStackAlloc(
  2679  	activeBlock index.Block,
  2680  	blocks []blockAndBlockStart,
  2681  	queryRanges xtime.Ranges,
  2682  ) blocksIterStackAlloc {
  2683  	return blocksIterStackAlloc{
  2684  		activeBlock: activeBlock,
  2685  		blocks:      blocks,
  2686  		queryRanges: queryRanges,
  2687  		idx:         -2,
  2688  	}
  2689  }
  2690  
  2691  func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) {
  2692  	iter := i
  2693  
  2694  	for {
  2695  		iter.idx++
  2696  		if iter.idx == -1 {
  2697  			// This will return the active block.
  2698  			return iter, true
  2699  		}
  2700  
  2701  		// No more ranges to query, perform this second so that
  2702  		// the in memory block always returns results.
  2703  		if i.queryRanges.IsEmpty() {
  2704  			return iter, false
  2705  		}
  2706  
  2707  		if iter.idx >= len(i.blocks) {
  2708  			return iter, false
  2709  		}
  2710  
  2711  		block := i.blocks[iter.idx].block
  2712  
  2713  		// Ensure the block has data requested by the query.
  2714  		blockRange := xtime.Range{
  2715  			Start: block.StartTime(),
  2716  			End:   block.EndTime(),
  2717  		}
  2718  		if !i.queryRanges.Overlaps(blockRange) {
  2719  			continue
  2720  		}
  2721  
  2722  		// Remove this range from the query range.
  2723  		i.queryRanges.RemoveRange(blockRange)
  2724  
  2725  		return iter, true
  2726  	}
  2727  }
  2728  
  2729  func (i blocksIterStackAlloc) Current() index.Block {
  2730  	if i.idx == -1 {
  2731  		return i.activeBlock
  2732  	}
  2733  	return i.blocks[i.idx].block
  2734  }