github.com/m3db/m3@v1.5.0/src/dbnode/storage/index.go (about)

     1  // Copyright (c) 2020 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package storage
    22  
    23  import (
    24  	"bytes"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"math"
    29  	goruntime "runtime"
    30  	"sort"
    31  	"strconv"
    32  	"sync"
    33  	"time"
    34  
    35  	"github.com/m3db/m3/src/dbnode/namespace"
    36  	"github.com/m3db/m3/src/dbnode/persist"
    37  	"github.com/m3db/m3/src/dbnode/persist/fs"
    38  	"github.com/m3db/m3/src/dbnode/retention"
    39  	"github.com/m3db/m3/src/dbnode/runtime"
    40  	"github.com/m3db/m3/src/dbnode/sharding"
    41  	"github.com/m3db/m3/src/dbnode/storage/block"
    42  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    43  	m3dberrors "github.com/m3db/m3/src/dbnode/storage/errors"
    44  	"github.com/m3db/m3/src/dbnode/storage/index"
    45  	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
    46  	"github.com/m3db/m3/src/dbnode/storage/index/convert"
    47  	"github.com/m3db/m3/src/dbnode/storage/limits"
    48  	"github.com/m3db/m3/src/dbnode/storage/limits/permits"
    49  	"github.com/m3db/m3/src/dbnode/storage/series"
    50  	"github.com/m3db/m3/src/dbnode/tracepoint"
    51  	"github.com/m3db/m3/src/dbnode/ts/writes"
    52  	"github.com/m3db/m3/src/m3ninx/doc"
    53  	"github.com/m3db/m3/src/m3ninx/idx"
    54  	m3ninxindex "github.com/m3db/m3/src/m3ninx/index"
    55  	"github.com/m3db/m3/src/m3ninx/index/segment"
    56  	"github.com/m3db/m3/src/m3ninx/index/segment/builder"
    57  	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
    58  	"github.com/m3db/m3/src/m3ninx/x"
    59  	"github.com/m3db/m3/src/x/clock"
    60  	"github.com/m3db/m3/src/x/context"
    61  	xerrors "github.com/m3db/m3/src/x/errors"
    62  	"github.com/m3db/m3/src/x/ident"
    63  	"github.com/m3db/m3/src/x/instrument"
    64  	xopentracing "github.com/m3db/m3/src/x/opentracing"
    65  	xresource "github.com/m3db/m3/src/x/resource"
    66  	xtime "github.com/m3db/m3/src/x/time"
    67  
    68  	"github.com/m3db/bitset"
    69  	"github.com/opentracing/opentracing-go"
    70  	opentracinglog "github.com/opentracing/opentracing-go/log"
    71  	"github.com/uber-go/tally"
    72  	"go.uber.org/atomic"
    73  	"go.uber.org/zap"
    74  )
    75  
// Sentinel errors used by the namespace index. The "UnableTo...Closed"
// variants are returned when the corresponding operation is attempted on an
// index that is no longer open (e.g. WriteBatch and WritePending return
// errDbIndexUnableToWriteClosed).
var (
	errDbIndexAlreadyClosed               = errors.New("database index has already been closed")
	errDbIndexUnableToWriteClosed         = errors.New("unable to write to database index, already closed")
	errDbIndexUnableToQueryClosed         = errors.New("unable to query database index, already closed")
	errDbIndexUnableToFlushClosed         = errors.New("unable to flush database index, already closed")
	errDbIndexUnableToCleanupClosed       = errors.New("unable to cleanup database index, already closed")
	errDbIndexTerminatingTickCancellation = errors.New("terminating tick early due to cancellation")
	errDbIndexIsBootstrapping             = errors.New("index is already bootstrapping")
	// errDbIndexDoNotIndexSeries marks batch entries whose document matched
	// every configured "do not index" field and was therefore dropped.
	errDbIndexDoNotIndexSeries = errors.New("series matched do not index fields")
)
    86  
const (
	// NOTE(review): the flush batch size constants are not referenced in this
	// part of the file; presumably they size batches during index flush — confirm.
	defaultFlushReadDataBlocksBatchSize = int64(4096)
	// nsIndexReportStatsInterval is the period of the ticker that drives
	// reportStats from reportStatsUntilClosed.
	nsIndexReportStatsInterval = 10 * time.Second

	defaultFlushDocsBatchSize = 8192
)
    93  
// allQuery is a package-level query built once via idx.NewAllQuery.
// NOTE(review): presumably matches every document — confirm against idx.NewAllQuery.
var allQuery = idx.NewAllQuery()
    95  
// nsIndex holds the index state and configuration for a single namespace.
// It is constructed by newNamespaceIndexWithOptions and returned to callers
// as a NamespaceIndex.
// nolint: maligned
type nsIndex struct {
	// state holds all mutable data and carries its own RWMutex.
	state nsIndexState

	// all the vars below this line are not modified past the ctor
	// and don't require a lock when being accessed.
	nowFn                 clock.NowFn
	blockSize             time.Duration
	retentionPeriod       time.Duration
	futureRetentionPeriod time.Duration
	bufferPast            time.Duration
	bufferFuture          time.Duration
	coldWritesEnabled     bool

	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager
	indexFilesetsBeforeFn   indexFilesetsBeforeFn
	deleteFilesFn           deleteFilesFn
	readIndexInfoFilesFn    readIndexInfoFilesFn

	newBlockFn            index.NewBlockFn
	logger                *zap.Logger
	opts                  Options
	nsMetadata            namespace.Metadata
	runtimeOptsListener   xresource.SimpleCloser
	runtimeNsOptsListener xresource.SimpleCloser

	resultsPool          index.QueryResultsPool
	aggregateResultsPool index.AggregateResultsPool

	permitsManager permits.Manager

	// queriesWg tracks outstanding queries to ensure
	// we wait for all queries to complete before actually closing
	// blocks and other cleanup tasks on index close
	queriesWg sync.WaitGroup

	metrics nsIndexMetrics

	// forwardIndexDice determines if an incoming index write should be dual
	// written to the next block.
	forwardIndexDice forwardIndexDice

	// doNotIndexWithFields lists the name/value pairs that, when all present
	// on a document, cause writeBatches to drop the entry instead of indexing it.
	doNotIndexWithFields []doc.Field

	// activeBlock is allocated in the ctor with BlockOptions{ActiveBlock: true}
	// and a zero block start; it is kept separately from state.blocksByTime.
	activeBlock index.Block
}
   142  
// nsIndexState groups the mutable state of an nsIndex behind a single RWMutex.
type nsIndexState struct {
	sync.RWMutex // NB: guards all variables in this struct

	closed bool
	// closeCh is created in the ctor; reportStatsUntilClosed exits once a
	// receive on it succeeds.
	closeCh        chan struct{}
	bootstrapState BootstrapState

	runtimeOpts nsIndexRuntimeOptions

	insertQueue namespaceIndexInsertQueue

	// NB: `latestBlock` v `blocksByTime`: blocksByTime contains all the blocks known to `nsIndex`.
	// `latestBlock` refers to the block with greatest StartTime within blocksByTime. We do this
	// to skip accessing the map blocksByTime in the vast majority of write/query requests. It's
	// lazily updated, so it can point to an older element until a Tick()/write rotates it.
	blocksByTime map[xtime.UnixNano]index.Block
	latestBlock  index.Block

	// NB: `blocksDescOrderImmutable` contains the blocks from the map `blocksByTime`, paired
	// with their block starts, in reverse chronological order. This is used at query time to
	// enforce determinism about results returned.
	// NB(r): Reference to this slice can be safely taken for iteration purposes
	// for Query(..) since it is rebuilt each time and immutable once built.
	blocksDescOrderImmutable []blockAndBlockStart

	// shardsFilterID is set every time the shards change to correctly
	// only return IDs that this node owns.
	shardsFilterID func(ident.ID) bool

	// shardFilteredForID is set every time the shards change to correctly
	// only return IDs that this node owns, and the shard responsible for that ID.
	shardFilteredForID func(id ident.ID) (uint32, bool)

	shardsAssigned map[uint32]struct{}
}
   178  
// blockAndBlockStart pairs an index block with its block start time; it is the
// element type of nsIndexState.blocksDescOrderImmutable.
type blockAndBlockStart struct {
	block      index.Block
	blockStart xtime.UnixNano
}
   183  
// NB: nsIndexRuntimeOptions does not contain its own mutex as some of the variables
// are needed for each index write which already at least acquires read lock from
// nsIndex mutex, so to keep the lock acquisitions to a minimum these are protected
// under the same nsIndex mutex.
type nsIndexRuntimeOptions struct {
	// insertMode is read by WriteBatch (under the read lock) to decide whether
	// to wait for the queued insert to complete before returning.
	insertMode          index.InsertMode
	maxQuerySeriesLimit int64
	maxQueryDocsLimit   int64
}
   193  
// indexFilesetsBeforeFn lists index filesets for namespace nsID under dir; the
// ctor wires it to fs.IndexFileSetsBefore.
// NB(prateek): the returned filesets are strictly before the given time, i.e. they
// live in the period (-infinity, exclusiveTime).
type indexFilesetsBeforeFn func(dir string,
	nsID ident.ID,
	exclusiveTime xtime.UnixNano,
) ([]string, error)
   200  
// readIndexInfoFilesFn reads index info files for the given options; the ctor
// wires it to fs.ReadIndexInfoFiles.
type readIndexInfoFilesFn func(opts fs.ReadIndexInfoFilesOptions) []fs.ReadIndexInfoFileResult
   202  
// newNamespaceIndexOpts bundles the arguments of newNamespaceIndexWithOptions
// so that the test ctors can override individual pieces.
type newNamespaceIndexOpts struct {
	md                      namespace.Metadata
	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager
	shardSet                sharding.ShardSet
	opts                    Options
	// newIndexQueueFn builds the insert queue that is started by the ctor.
	newIndexQueueFn newNamespaceIndexInsertQueueFn
	// newBlockFn allocates index blocks (including the active block).
	newBlockFn index.NewBlockFn
}
   211  
// execBlockQueryFn executes a query against the given block whilst tracking state.
// It has no return value.
// NOTE(review): presumably failures are recorded on state (see
// asyncQueryExecState.addErr) — confirm against the implementations.
type execBlockQueryFn func(
	ctx context.Context,
	block index.Block,
	permit permits.Permit,
	iter index.ResultIterator,
	opts index.QueryOptions,
	state *asyncQueryExecState,
	results index.BaseResults,
	logFields []opentracinglog.Field,
)
   223  
// newBlockIterFn returns a new ResultIterator for the query against the given
// block, or an error if the iterator could not be created.
type newBlockIterFn func(
	ctx context.Context,
	block index.Block,
	query index.Query,
	results index.BaseResults,
) (index.ResultIterator, error)
   231  
// asyncQueryExecState tracks the async execution errors for a query.
type asyncQueryExecState struct {
	// The embedded RWMutex guards multiErr (see hasErr and addErr).
	sync.RWMutex
	multiErr xerrors.MultiError
	// waitCount is atomic and is accessed without taking the mutex
	// (see incWaited and waited).
	waitCount atomic.Uint64
}
   238  
   239  func (s *asyncQueryExecState) hasErr() bool {
   240  	s.RLock()
   241  	defer s.RUnlock()
   242  	return s.multiErr.NumErrors() > 0
   243  }
   244  
   245  func (s *asyncQueryExecState) addErr(err error) {
   246  	s.Lock()
   247  	s.multiErr = s.multiErr.Add(err)
   248  	s.Unlock()
   249  }
   250  
   251  func (s *asyncQueryExecState) incWaited(i int) {
   252  	s.waitCount.Add(uint64(i))
   253  }
   254  
   255  func (s *asyncQueryExecState) waited() int {
   256  	return int(s.waitCount.Load())
   257  }
   258  
   259  // newNamespaceIndex returns a new namespaceIndex for the provided namespace.
   260  func newNamespaceIndex(
   261  	nsMD namespace.Metadata,
   262  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   263  	shardSet sharding.ShardSet,
   264  	opts Options,
   265  ) (NamespaceIndex, error) {
   266  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   267  		md:                      nsMD,
   268  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   269  		shardSet:                shardSet,
   270  		opts:                    opts,
   271  		newIndexQueueFn:         newNamespaceIndexInsertQueue,
   272  		newBlockFn:              index.NewBlock,
   273  	})
   274  }
   275  
   276  // newNamespaceIndexWithInsertQueueFn is a ctor used in tests to override the insert queue.
   277  func newNamespaceIndexWithInsertQueueFn(
   278  	nsMD namespace.Metadata,
   279  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   280  	shardSet sharding.ShardSet,
   281  	newIndexQueueFn newNamespaceIndexInsertQueueFn,
   282  	opts Options,
   283  ) (NamespaceIndex, error) {
   284  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   285  		md:                      nsMD,
   286  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   287  		shardSet:                shardSet,
   288  		opts:                    opts,
   289  		newIndexQueueFn:         newIndexQueueFn,
   290  		newBlockFn:              index.NewBlock,
   291  	})
   292  }
   293  
   294  // newNamespaceIndexWithNewBlockFn is a ctor used in tests to inject blocks.
   295  func newNamespaceIndexWithNewBlockFn(
   296  	nsMD namespace.Metadata,
   297  	namespaceRuntimeOptsMgr namespace.RuntimeOptionsManager,
   298  	shardSet sharding.ShardSet,
   299  	newBlockFn index.NewBlockFn,
   300  	opts Options,
   301  ) (NamespaceIndex, error) {
   302  	return newNamespaceIndexWithOptions(newNamespaceIndexOpts{
   303  		md:                      nsMD,
   304  		namespaceRuntimeOptsMgr: namespaceRuntimeOptsMgr,
   305  		shardSet:                shardSet,
   306  		opts:                    opts,
   307  		newIndexQueueFn:         newNamespaceIndexInsertQueue,
   308  		newBlockFn:              newBlockFn,
   309  	})
   310  }
   311  
   312  // newNamespaceIndexWithOptions returns a new namespaceIndex with the provided configuration options.
   313  func newNamespaceIndexWithOptions(
   314  	newIndexOpts newNamespaceIndexOpts,
   315  ) (NamespaceIndex, error) {
   316  	var (
   317  		nsMD            = newIndexOpts.md
   318  		shardSet        = newIndexOpts.shardSet
   319  		indexOpts       = newIndexOpts.opts.IndexOptions()
   320  		instrumentOpts  = newIndexOpts.opts.InstrumentOptions()
   321  		newIndexQueueFn = newIndexOpts.newIndexQueueFn
   322  		newBlockFn      = newIndexOpts.newBlockFn
   323  		coreFn          = newIndexOpts.opts.CoreFn()
   324  		runtimeOptsMgr  = newIndexOpts.opts.RuntimeOptionsManager()
   325  	)
   326  	if err := indexOpts.Validate(); err != nil {
   327  		return nil, err
   328  	}
   329  
   330  	scope := instrumentOpts.MetricsScope().
   331  		SubScope("dbindex").
   332  		Tagged(map[string]string{
   333  			"namespace": nsMD.ID().String(),
   334  		})
   335  	instrumentOpts = instrumentOpts.SetMetricsScope(scope)
   336  	indexOpts = indexOpts.SetInstrumentOptions(instrumentOpts)
   337  
   338  	nowFn := indexOpts.ClockOptions().NowFn()
   339  	logger := indexOpts.InstrumentOptions().Logger()
   340  
   341  	var doNotIndexWithFields []doc.Field
   342  	if m := newIndexOpts.opts.DoNotIndexWithFieldsMap(); m != nil && len(m) != 0 {
   343  		for k, v := range m {
   344  			doNotIndexWithFields = append(doNotIndexWithFields, doc.Field{
   345  				Name:  []byte(k),
   346  				Value: []byte(v),
   347  			})
   348  		}
   349  	}
   350  
   351  	idx := &nsIndex{
   352  		state: nsIndexState{
   353  			closeCh: make(chan struct{}),
   354  			runtimeOpts: nsIndexRuntimeOptions{
   355  				insertMode: indexOpts.InsertMode(), // FOLLOWUP(prateek): wire to allow this to be tweaked at runtime
   356  			},
   357  			blocksByTime:   make(map[xtime.UnixNano]index.Block),
   358  			shardsAssigned: make(map[uint32]struct{}),
   359  		},
   360  
   361  		nowFn:                 nowFn,
   362  		blockSize:             nsMD.Options().IndexOptions().BlockSize(),
   363  		retentionPeriod:       nsMD.Options().RetentionOptions().RetentionPeriod(),
   364  		futureRetentionPeriod: nsMD.Options().RetentionOptions().FutureRetentionPeriod(),
   365  		bufferPast:            nsMD.Options().RetentionOptions().BufferPast(),
   366  		bufferFuture:          nsMD.Options().RetentionOptions().BufferFuture(),
   367  		coldWritesEnabled:     nsMD.Options().ColdWritesEnabled(),
   368  
   369  		namespaceRuntimeOptsMgr: newIndexOpts.namespaceRuntimeOptsMgr,
   370  		indexFilesetsBeforeFn:   fs.IndexFileSetsBefore,
   371  		readIndexInfoFilesFn:    fs.ReadIndexInfoFiles,
   372  		deleteFilesFn:           fs.DeleteFiles,
   373  
   374  		newBlockFn: newBlockFn,
   375  		opts:       newIndexOpts.opts,
   376  		logger:     logger,
   377  		nsMetadata: nsMD,
   378  
   379  		resultsPool:          indexOpts.QueryResultsPool(),
   380  		aggregateResultsPool: indexOpts.AggregateResultsPool(),
   381  
   382  		permitsManager: newIndexOpts.opts.PermitsOptions().IndexQueryPermitsManager(),
   383  		metrics:        newNamespaceIndexMetrics(indexOpts, instrumentOpts),
   384  
   385  		doNotIndexWithFields: doNotIndexWithFields,
   386  	}
   387  
   388  	activeBlock, err := idx.newBlockFn(xtime.UnixNano(0), idx.nsMetadata,
   389  		index.BlockOptions{ActiveBlock: true}, idx.namespaceRuntimeOptsMgr,
   390  		idx.opts.IndexOptions())
   391  	if err != nil {
   392  		return nil, idx.unableToAllocBlockInvariantError(err)
   393  	}
   394  
   395  	idx.activeBlock = activeBlock
   396  
   397  	// Assign shard set upfront.
   398  	idx.AssignShardSet(shardSet)
   399  
   400  	idx.runtimeOptsListener = runtimeOptsMgr.RegisterListener(idx)
   401  	idx.runtimeNsOptsListener = idx.namespaceRuntimeOptsMgr.RegisterListener(idx)
   402  
   403  	// set up forward index dice.
   404  	dice, err := newForwardIndexDice(newIndexOpts.opts)
   405  	if err != nil {
   406  		return nil, err
   407  	}
   408  
   409  	if dice.enabled {
   410  		logger.Info("namespace forward indexing configured",
   411  			zap.Stringer("namespace", nsMD.ID()),
   412  			zap.Bool("enabled", dice.enabled),
   413  			zap.Duration("threshold", dice.forwardIndexThreshold),
   414  			zap.Float64("rate", dice.forwardIndexDice.Rate()))
   415  	} else {
   416  		idxOpts := newIndexOpts.opts.IndexOptions()
   417  		logger.Info("namespace forward indexing not enabled",
   418  			zap.Stringer("namespace", nsMD.ID()),
   419  			zap.Bool("enabled", false),
   420  			zap.Float64("threshold", idxOpts.ForwardIndexThreshold()),
   421  			zap.Float64("probability", idxOpts.ForwardIndexProbability()))
   422  	}
   423  
   424  	idx.forwardIndexDice = dice
   425  
   426  	// allocate indexing queue and start it up.
   427  	queue := newIndexQueueFn(idx.writeBatches, nsMD, nowFn, coreFn, scope)
   428  	if err := queue.Start(); err != nil {
   429  		return nil, err
   430  	}
   431  	idx.state.insertQueue = queue
   432  
   433  	// allocate the current block to ensure we're able to index as soon as we return
   434  	currentBlock := xtime.ToUnixNano(nowFn()).Truncate(idx.blockSize)
   435  	idx.state.RLock()
   436  	_, err = idx.ensureBlockPresentWithRLock(currentBlock)
   437  	idx.state.RUnlock()
   438  	if err != nil {
   439  		return nil, err
   440  	}
   441  
   442  	// Report stats
   443  	go idx.reportStatsUntilClosed()
   444  
   445  	return idx, nil
   446  }
   447  
// SetRuntimeOptions is intentionally a no-op. The ctor registers the index as
// a listener on the runtime options manager, so the method must exist, but the
// index does not react to these options here.
func (i *nsIndex) SetRuntimeOptions(runtime.Options) {
}
   450  
   451  func (i *nsIndex) SetNamespaceRuntimeOptions(opts namespace.RuntimeOptions) {
   452  	// We don't like to log from every single index segment that has
   453  	// settings updated so we log the changes here.
   454  	i.logger.Info("set namespace runtime index options",
   455  		zap.Stringer("namespace", i.nsMetadata.ID()),
   456  		zap.Any("writeIndexingPerCPUConcurrency", opts.WriteIndexingPerCPUConcurrency()),
   457  		zap.Any("flushIndexingPerCPUConcurrency", opts.FlushIndexingPerCPUConcurrency()))
   458  }
   459  
   460  func (i *nsIndex) reportStatsUntilClosed() {
   461  	ticker := time.NewTicker(nsIndexReportStatsInterval)
   462  	defer ticker.Stop()
   463  
   464  	for {
   465  		select {
   466  		case <-ticker.C:
   467  			err := i.reportStats()
   468  			if err != nil {
   469  				i.logger.Warn("could not report index stats", zap.Error(err))
   470  			}
   471  		case <-i.state.closeCh:
   472  			return
   473  		}
   474  	}
   475  }
   476  
// nsIndexCompactionLevelStats accumulates, for one compaction level, the number
// of segments that fell into the level and the sum of their sizes during a
// single reportStats pass.
type nsIndexCompactionLevelStats struct {
	numSegments  int64
	numTotalDocs int64
}
   481  
   482  func (i *nsIndex) reportStats() error {
   483  	i.state.RLock()
   484  	defer i.state.RUnlock()
   485  
   486  	foregroundLevels := i.metrics.blockMetrics.ForegroundSegments.Levels
   487  	foregroundLevelStats := make([]nsIndexCompactionLevelStats, len(foregroundLevels))
   488  
   489  	backgroundLevels := i.metrics.blockMetrics.BackgroundSegments.Levels
   490  	backgroundLevelStats := make([]nsIndexCompactionLevelStats, len(backgroundLevels))
   491  
   492  	flushedLevels := i.metrics.blockMetrics.FlushedSegments.Levels
   493  	flushedLevelStats := make([]nsIndexCompactionLevelStats, len(flushedLevels))
   494  
   495  	minIndexConcurrency := 0
   496  	maxIndexConcurrency := 0
   497  	sumIndexConcurrency := 0
   498  	numIndexingStats := 0
   499  	reporter := index.NewBlockStatsReporter(
   500  		func(s index.BlockSegmentStats) {
   501  			var (
   502  				levels     []nsIndexBlocksSegmentsLevelMetrics
   503  				levelStats []nsIndexCompactionLevelStats
   504  			)
   505  			switch s.Type {
   506  			case index.ActiveForegroundSegment:
   507  				levels = foregroundLevels
   508  				levelStats = foregroundLevelStats
   509  			case index.ActiveBackgroundSegment:
   510  				levels = backgroundLevels
   511  				levelStats = backgroundLevelStats
   512  			case index.FlushedSegment:
   513  				levels = flushedLevels
   514  				levelStats = flushedLevelStats
   515  			}
   516  
   517  			for i, l := range levels {
   518  				contained := s.Size >= l.MinSizeInclusive && s.Size < l.MaxSizeExclusive
   519  				if !contained {
   520  					continue
   521  				}
   522  
   523  				l.SegmentsAge.Record(s.Age)
   524  				levelStats[i].numSegments++
   525  				levelStats[i].numTotalDocs += s.Size
   526  
   527  				break
   528  			}
   529  		},
   530  		func(s index.BlockIndexingStats) {
   531  			first := numIndexingStats == 0
   532  			numIndexingStats++
   533  
   534  			if first {
   535  				minIndexConcurrency = s.IndexConcurrency
   536  				maxIndexConcurrency = s.IndexConcurrency
   537  				sumIndexConcurrency = s.IndexConcurrency
   538  				return
   539  			}
   540  
   541  			if v := s.IndexConcurrency; v < minIndexConcurrency {
   542  				minIndexConcurrency = v
   543  			}
   544  			if v := s.IndexConcurrency; v > maxIndexConcurrency {
   545  				maxIndexConcurrency = v
   546  			}
   547  			sumIndexConcurrency += s.IndexConcurrency
   548  		})
   549  
   550  	// iterate known blocks in a defined order of time (newest first)
   551  	// for debug log ordering
   552  	for _, b := range i.state.blocksDescOrderImmutable {
   553  		err := b.block.Stats(reporter)
   554  		if err == index.ErrUnableReportStatsBlockClosed {
   555  			// Closed blocks are temporarily in the list still
   556  			continue
   557  		}
   558  		if err != nil {
   559  			return err
   560  		}
   561  	}
   562  	// Active block should always be open.
   563  	if err := i.activeBlock.Stats(reporter); err != nil {
   564  		return err
   565  	}
   566  
   567  	// Update level stats.
   568  	for _, elem := range []struct {
   569  		levels     []nsIndexBlocksSegmentsLevelMetrics
   570  		levelStats []nsIndexCompactionLevelStats
   571  	}{
   572  		{foregroundLevels, foregroundLevelStats},
   573  		{backgroundLevels, backgroundLevelStats},
   574  	} {
   575  		for i, v := range elem.levelStats {
   576  			elem.levels[i].NumSegments.Update(float64(v.numSegments))
   577  			elem.levels[i].NumTotalDocs.Update(float64(v.numTotalDocs))
   578  		}
   579  	}
   580  
   581  	// Update the indexing stats.
   582  	i.metrics.indexingConcurrencyMin.Update(float64(minIndexConcurrency))
   583  	i.metrics.indexingConcurrencyMax.Update(float64(maxIndexConcurrency))
   584  	avgIndexConcurrency := float64(sumIndexConcurrency) / float64(numIndexingStats)
   585  	i.metrics.indexingConcurrencyAvg.Update(avgIndexConcurrency)
   586  
   587  	return nil
   588  }
   589  
   590  func (i *nsIndex) BlockStartForWriteTime(writeTime xtime.UnixNano) xtime.UnixNano {
   591  	return writeTime.Truncate(i.blockSize)
   592  }
   593  
   594  func (i *nsIndex) BlockForBlockStart(blockStart xtime.UnixNano) (index.Block, error) {
   595  	result, err := i.ensureBlockPresent(blockStart)
   596  	if err != nil {
   597  		return nil, err
   598  	}
   599  	return result.block, nil
   600  }
   601  
// NB(prateek): including the call chains leading to this point:
//
// - For new entry (previously unseen in the shard):
//     shard.WriteTagged()
//       => shard.insertSeriesAsyncBatched()
//       => shardInsertQueue.Insert()
//       => shard.writeBatch()
//       => index.WriteBatch()
//       => indexQueue.Insert()
//       => index.writeBatches()
//
// - For entry which exists in the shard, but needs indexing (either past
//   the TTL or the last indexing hasn't happened/failed):
//      shard.WriteTagged()
//        => shard.insertSeriesForIndexingAsyncBatched()
//        => shardInsertQueue.Insert()
//        => shard.writeBatch()
//        => index.Write()
//        => indexQueue.Insert()
//        => index.writeBatches()
   622  
   623  func (i *nsIndex) WriteBatch(
   624  	batch *index.WriteBatch,
   625  ) error {
   626  	// Filter anything with a pending index out before acquiring lock.
   627  	batch.MarkUnmarkedIfAlreadyIndexedSuccessAndFinalize()
   628  	if !batch.PendingAny() {
   629  		return nil
   630  	}
   631  
   632  	i.state.RLock()
   633  	if !i.isOpenWithRLock() {
   634  		i.state.RUnlock()
   635  		i.metrics.insertAfterClose.Inc(1)
   636  		err := errDbIndexUnableToWriteClosed
   637  		batch.MarkUnmarkedEntriesError(err)
   638  		return err
   639  	}
   640  
   641  	// NB(prateek): retrieving insertMode here while we have the RLock.
   642  	insertMode := i.state.runtimeOpts.insertMode
   643  	wg, err := i.state.insertQueue.InsertBatch(batch)
   644  
   645  	// release the lock because we don't need it past this point.
   646  	i.state.RUnlock()
   647  
   648  	// if we're unable to index, we still have to finalize the reference we hold.
   649  	if err != nil {
   650  		batch.MarkUnmarkedEntriesError(err)
   651  		return err
   652  	}
   653  	// once the write has been queued in the indexInsertQueue, it assumes
   654  	// responsibility for calling the resource hooks.
   655  
   656  	// wait/terminate depending on if we are indexing synchronously or not.
   657  	if insertMode != index.InsertAsync {
   658  		wg.Wait()
   659  
   660  		// Re-sort the batch by initial enqueue order
   661  		if numErrs := batch.NumErrs(); numErrs > 0 {
   662  			// Restore the sort order from when enqueued for the caller.
   663  			batch.SortByEnqueued()
   664  			return fmt.Errorf("check batch: %d insert errors", numErrs)
   665  		}
   666  	}
   667  
   668  	return nil
   669  }
   670  
   671  func (i *nsIndex) WritePending(
   672  	pending []writes.PendingIndexInsert,
   673  ) error {
   674  	// Filter anything with a pending index out before acquiring lock.
   675  	incoming := pending
   676  	pending = pending[:0]
   677  	for j := range incoming {
   678  		t := i.BlockStartForWriteTime(incoming[j].Entry.Timestamp)
   679  		if incoming[j].Entry.OnIndexSeries.IfAlreadyIndexedMarkIndexSuccessAndFinalize(t) {
   680  			continue
   681  		}
   682  		// Continue to add this element.
   683  		pending = append(pending, incoming[j])
   684  	}
   685  	if len(pending) == 0 {
   686  		return nil
   687  	}
   688  
   689  	i.state.RLock()
   690  	if !i.isOpenWithRLock() {
   691  		i.state.RUnlock()
   692  		i.metrics.insertAfterClose.Inc(1)
   693  		return errDbIndexUnableToWriteClosed
   694  	}
   695  	_, err := i.state.insertQueue.InsertPending(pending)
   696  	// release the lock because we don't need it past this point.
   697  	i.state.RUnlock()
   698  
   699  	return err
   700  }
   701  
// writeBatches is the batch callback handed to the index insert queue by the
// ctor. For every entry in batch it:
//   - drops the entry (errDbIndexDoNotIndexSeries) when the document matches
//     all configured doNotIndexWithFields,
//   - rejects timestamps outside retention or the future/past buffers
//     (ErrTooPast / ErrTooFuture; the past buffer check is skipped when cold
//     writes are enabled),
//   - when forward indexing is enabled and the dice roll hits, appends a copy
//     of the entry targeting the next block.
//
// The remaining unmarked entries are then written per block start via
// writeBatchForBlockStart, and insert/forward-index counters are updated.
// If the index is already closed the batch is left untouched.
func (i *nsIndex) writeBatches(
	batch *index.WriteBatch,
) {
	// NB(prateek): we use a read lock to guard against mutation of the
	// indexBlocks, mutations within the underlying blocks are guarded
	// by primitives internal to it.
	i.state.RLock()
	if !i.isOpenWithRLock() {
		i.state.RUnlock()
		// NB(prateek): deliberately skip calling any of the `OnIndexFinalize` methods
		// on the provided inserts to terminate quicker during shutdown.
		return
	}
	var (
		now                        = xtime.ToUnixNano(i.nowFn())
		blockSize                  = i.blockSize
		futureLimit                = now.Add(1 * i.bufferFuture)
		pastLimit                  = now.Add(-1 * i.bufferPast)
		earliestBlockStartToRetain = retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now)
		batchOptions               = batch.Options()
		forwardIndexDice           = i.forwardIndexDice
		forwardIndexEnabled        = forwardIndexDice.enabled
		total                      int
		notSkipped                 int
		forwardIndexHits           int
		forwardIndexMiss           int

		forwardIndexBatch *index.WriteBatch
	)
	// NB(r): Release lock early to avoid writing batches impacting ticking
	// speed, etc.
	// Sometimes foreground compaction can take a long time during heavy inserts.
	// Each lookup to ensureBlockPresent checks that index is still open, etc.
	i.state.RUnlock()

	if forwardIndexEnabled {
		// NB(arnikola): Don't initialize forward index batch if forward indexing
		// is not enabled.
		forwardIndexBatch = index.NewWriteBatch(batchOptions)
	}

	// Ensure timestamp is not too old/new based on retention policies and that
	// doc is valid. Add potential forward writes to the forwardWriteBatch.
	batch.ForEach(
		func(idx int, entry index.WriteBatchEntry,
			d doc.Metadata, _ index.WriteBatchEntryResult) {
			total++

			if len(i.doNotIndexWithFields) != 0 {
				// This feature rarely used, do not optimize and just do n*m checks.
				// The entry is dropped only if every configured field is present
				// on the document with an equal value.
				drop := true
				for _, matchField := range i.doNotIndexWithFields {
					matchedField := false
					for _, actualField := range d.Fields {
						if bytes.Equal(actualField.Name, matchField.Name) {
							matchedField = bytes.Equal(actualField.Value, matchField.Value)
							break
						}
					}
					if !matchedField {
						drop = false
						break
					}
				}
				if drop {
					batch.MarkUnmarkedEntryError(errDbIndexDoNotIndexSeries, idx)
					return
				}
			}

			ts := entry.Timestamp
			// NB(bodu): Always check first to see if the write is within retention.
			if !ts.After(earliestBlockStartToRetain) {
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx)
				return
			}

			// Timestamps at or beyond now+bufferFuture are rejected.
			if !futureLimit.After(ts) {
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooFuture, idx)
				return
			}

			if ts.Before(pastLimit) && !i.coldWritesEnabled {
				// NB(bodu): We only mark entries as too far in the past if
				// cold writes are not enabled.
				batch.MarkUnmarkedEntryError(m3dberrors.ErrTooPast, idx)
				return
			}

			if forwardIndexEnabled {
				if forwardIndexDice.roll(ts) {
					forwardIndexHits++
					// The forward write targets the start of the next block.
					forwardEntryTimestamp := ts.Truncate(blockSize).Add(blockSize)
					if entry.OnIndexSeries.NeedsIndexUpdate(forwardEntryTimestamp) {
						forwardIndexEntry := entry
						forwardIndexEntry.Timestamp = forwardEntryTimestamp
						t := i.BlockStartForWriteTime(forwardEntryTimestamp)
						forwardIndexEntry.OnIndexSeries.OnIndexPrepare(t)
						forwardIndexBatch.Append(forwardIndexEntry, d)
					}
				} else {
					forwardIndexMiss++
				}
			}

			// Only entries that passed every check above count as not skipped.
			notSkipped++
		})

	if forwardIndexEnabled && forwardIndexBatch.Len() > 0 {
		i.metrics.forwardIndexCounter.Inc(int64(forwardIndexBatch.Len()))
		batch.AppendAll(forwardIndexBatch)
	}

	// Sort the inserts by which block they're applicable for, and do the inserts
	// for each block, making sure to not try to insert any entries already marked
	// with a result.
	batch.ForEachUnmarkedBatchByBlockStart(i.writeBatchForBlockStart)

	// Track index insertions.
	// Note: attemptTotal should = attemptSkip + attemptWrite.
	i.metrics.asyncInsertAttemptTotal.Inc(int64(total))
	i.metrics.asyncInsertAttemptSkip.Inc(int64(total - notSkipped))
	i.metrics.forwardIndexHits.Inc(int64(forwardIndexHits))
	i.metrics.forwardIndexMisses.Inc(int64(forwardIndexMiss))
}
   828  
   829  func (i *nsIndex) writeBatchForBlockStart(
   830  	blockStart xtime.UnixNano, batch *index.WriteBatch,
   831  ) {
   832  	// NB(r): Capture pending entries so we can emit the latencies
   833  	pending := batch.PendingEntries()
   834  	numPending := len(pending)
   835  
   836  	// Track attempted write.
   837  	// Note: attemptTotal should = attemptSkip + attemptWrite.
   838  	i.metrics.asyncInsertAttemptWrite.Inc(int64(numPending))
   839  
   840  	// i.e. we have the block and the inserts, perform the writes.
   841  	result, err := i.activeBlock.WriteBatch(batch)
   842  
   843  	// Record the end to end indexing latency.
   844  	now := i.nowFn()
   845  	for idx := range pending {
   846  		took := now.Sub(pending[idx].EnqueuedAt)
   847  		i.metrics.insertEndToEndLatency.Record(took)
   848  	}
   849  
   850  	// NB: we don't need to do anything to the OnIndexSeries refs in `inserts` at this point,
   851  	// the index.Block WriteBatch assumes responsibility for calling the appropriate methods.
   852  	if n := result.NumSuccess; n > 0 {
   853  		i.metrics.asyncInsertSuccess.Inc(n)
   854  	}
   855  
   856  	// Record mutable segments count foreground/background if latest block.
   857  	if stats := result.MutableSegmentsStats; !stats.Empty() {
   858  		i.metrics.latestBlockNumSegmentsForeground.Update(float64(stats.Foreground.NumSegments))
   859  		i.metrics.latestBlockNumDocsForeground.Update(float64(stats.Foreground.NumDocs))
   860  		i.metrics.latestBlockNumSegmentsBackground.Update(float64(stats.Background.NumSegments))
   861  		i.metrics.latestBlockNumDocsBackground.Update(float64(stats.Background.NumDocs))
   862  	}
   863  
   864  	// Allow for duplicate write errors since due to re-indexing races
   865  	// we may try to re-index a series more than once.
   866  	if err := i.sanitizeAllowDuplicatesWriteError(err); err != nil {
   867  		numErrors := numPending - int(result.NumSuccess)
   868  		if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok {
   869  			// If it was a batch partial error we know exactly how many failed
   870  			// after filtering out for duplicate ID errors.
   871  			numErrors = len(partialError.Errs())
   872  		}
   873  		i.metrics.asyncInsertErrors.Inc(int64(numErrors))
   874  		i.logger.Error("error writing to index block", zap.Error(err))
   875  	}
   876  }
   877  
   878  // Bootstrap bootstraps the index with the provided blocks.
   879  func (i *nsIndex) Bootstrap(
   880  	bootstrapResults result.IndexResults,
   881  ) error {
   882  	i.state.Lock()
   883  	if i.state.bootstrapState == Bootstrapping {
   884  		i.state.Unlock()
   885  		return errDbIndexIsBootstrapping
   886  	}
   887  	i.state.bootstrapState = Bootstrapping
   888  	i.state.Unlock()
   889  
   890  	i.state.RLock()
   891  	defer func() {
   892  		i.state.RUnlock()
   893  		i.state.Lock()
   894  		i.state.bootstrapState = Bootstrapped
   895  		i.state.Unlock()
   896  	}()
   897  
   898  	var multiErr xerrors.MultiError
   899  	for blockStart, blockResults := range bootstrapResults {
   900  		blockResult, err := i.ensureBlockPresentWithRLock(blockStart)
   901  		if err != nil { // should never happen
   902  			multiErr = multiErr.Add(i.unableToAllocBlockInvariantError(err))
   903  			continue
   904  		}
   905  		if err := blockResult.block.AddResults(blockResults); err != nil {
   906  			multiErr = multiErr.Add(err)
   907  		}
   908  	}
   909  
   910  	return multiErr.FinalError()
   911  }
   912  
   913  func (i *nsIndex) Bootstrapped() bool {
   914  	i.state.RLock()
   915  	result := i.state.bootstrapState == Bootstrapped
   916  	i.state.RUnlock()
   917  	return result
   918  }
   919  
   920  func (i *nsIndex) Tick(
   921  	c context.Cancellable,
   922  	startTime xtime.UnixNano,
   923  ) (namespaceIndexTickResult, error) {
   924  	var result namespaceIndexTickResult
   925  
   926  	// First collect blocks and acquire lock to remove those that need removing
   927  	// but then release lock so can Tick and do other expensive tasks
   928  	// such as notify of sealed blocks.
   929  	tickingBlocks, multiErr := i.tickingBlocks(startTime)
   930  
   931  	result.NumBlocks = int64(tickingBlocks.totalBlocks)
   932  	for _, block := range tickingBlocks.tickingBlocks {
   933  		if c.IsCancelled() {
   934  			multiErr = multiErr.Add(errDbIndexTerminatingTickCancellation)
   935  			return result, multiErr.FinalError()
   936  		}
   937  
   938  		blockTickResult, tickErr := block.Tick(c)
   939  		multiErr = multiErr.Add(tickErr)
   940  		result.NumSegments += blockTickResult.NumSegments
   941  		result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
   942  		result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
   943  		result.NumTotalDocs += blockTickResult.NumDocs
   944  		result.FreeMmap += blockTickResult.FreeMmap
   945  	}
   946  
   947  	blockTickResult, tickErr := tickingBlocks.activeBlock.Tick(c)
   948  	multiErr = multiErr.Add(tickErr)
   949  	result.NumSegments += blockTickResult.NumSegments
   950  	result.NumSegmentsBootstrapped += blockTickResult.NumSegmentsBootstrapped
   951  	result.NumSegmentsMutable += blockTickResult.NumSegmentsMutable
   952  	result.NumTotalDocs += blockTickResult.NumDocs
   953  	result.FreeMmap += blockTickResult.FreeMmap
   954  
   955  	i.metrics.tick.Inc(1)
   956  
   957  	return result, multiErr.FinalError()
   958  }
   959  
// tickingBlocksResult is the set of blocks collected by tickingBlocks.
//
// totalBlocks is the number of entries left in the index's blocksByTime map
// once blocks past retention have been dropped, activeBlock is the index's
// active block, and tickingBlocks are the retained blocks that still need to
// be ticked by the caller.
type tickingBlocksResult struct {
	totalBlocks   int
	activeBlock   index.Block
	tickingBlocks []index.Block
}
   965  
   966  func (i *nsIndex) tickingBlocks(
   967  	startTime xtime.UnixNano,
   968  ) (tickingBlocksResult, xerrors.MultiError) {
   969  	multiErr := xerrors.NewMultiError()
   970  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(
   971  		i.retentionPeriod, i.blockSize, startTime)
   972  
   973  	i.state.Lock()
   974  	activeBlock := i.activeBlock
   975  	tickingBlocks := make([]index.Block, 0, len(i.state.blocksByTime))
   976  	defer func() {
   977  		i.updateBlockStartsWithLock()
   978  		i.state.Unlock()
   979  	}()
   980  
   981  	for blockStart, block := range i.state.blocksByTime {
   982  		// Drop any blocks past the retention period.
   983  		if blockStart.Before(earliestBlockStartToRetain) {
   984  			multiErr = multiErr.Add(block.Close())
   985  			delete(i.state.blocksByTime, blockStart)
   986  			continue
   987  		}
   988  
   989  		// Tick any blocks we're going to retain, but don't tick inline here
   990  		// we'll do this out of the block.
   991  		tickingBlocks = append(tickingBlocks, block)
   992  
   993  		// Seal any blocks that are sealable while holding lock (seal is fast).
   994  		if !blockStart.After(i.lastSealableBlockStart(startTime)) && !block.IsSealed() {
   995  			multiErr = multiErr.Add(block.Seal())
   996  		}
   997  	}
   998  
   999  	return tickingBlocksResult{
  1000  		totalBlocks:   len(i.state.blocksByTime),
  1001  		activeBlock:   activeBlock,
  1002  		tickingBlocks: tickingBlocks,
  1003  	}, multiErr
  1004  }
  1005  
  1006  func (i *nsIndex) WarmFlush(
  1007  	flush persist.IndexFlush,
  1008  	shards []databaseShard,
  1009  ) error {
  1010  	if len(shards) == 0 {
  1011  		// No-op if no shards currently owned.
  1012  		return nil
  1013  	}
  1014  
  1015  	flushable, err := i.flushableBlocks(shards, series.WarmWrite)
  1016  	if err != nil {
  1017  		return err
  1018  	}
  1019  
  1020  	// Determine the current flush indexing concurrency.
  1021  	namespaceRuntimeOpts := i.namespaceRuntimeOptsMgr.Get()
  1022  	perCPUFraction := namespaceRuntimeOpts.FlushIndexingPerCPUConcurrencyOrDefault()
  1023  	cpus := math.Ceil(perCPUFraction * float64(goruntime.GOMAXPROCS(0)))
  1024  	concurrency := int(math.Max(1, cpus))
  1025  
  1026  	builderOpts := i.opts.IndexOptions().SegmentBuilderOptions().
  1027  		SetConcurrency(concurrency)
  1028  
  1029  	builder, err := builder.NewBuilderFromDocuments(builderOpts)
  1030  	if err != nil {
  1031  		return err
  1032  	}
  1033  	defer builder.Close()
  1034  
  1035  	// Emit concurrency, then reset gauge to zero to show time
  1036  	// active during flushing broken down per namespace.
  1037  	i.metrics.flushIndexingConcurrency.Update(float64(concurrency))
  1038  	defer i.metrics.flushIndexingConcurrency.Update(0)
  1039  
  1040  	var evicted int
  1041  	for _, block := range flushable {
  1042  		immutableSegments, err := i.flushBlock(flush, block, shards, builder)
  1043  		if err != nil {
  1044  			return err
  1045  		}
  1046  		// Make a result that covers the entire time ranges for the
  1047  		// block for each shard
  1048  		fulfilled := result.NewShardTimeRangesFromRange(block.StartTime(), block.EndTime(),
  1049  			dbShards(shards).IDs()...)
  1050  
  1051  		// Add the results to the block.
  1052  		persistedSegments := make([]result.Segment, 0, len(immutableSegments))
  1053  		for _, elem := range immutableSegments {
  1054  			persistedSegment := result.NewSegment(elem, true)
  1055  			persistedSegments = append(persistedSegments, persistedSegment)
  1056  		}
  1057  		blockResult := result.NewIndexBlock(persistedSegments, fulfilled)
  1058  		results := result.NewIndexBlockByVolumeType(block.StartTime())
  1059  		results.SetBlock(idxpersist.DefaultIndexVolumeType, blockResult)
  1060  		if err := block.AddResults(results); err != nil {
  1061  			return err
  1062  		}
  1063  
  1064  		evicted++
  1065  
  1066  		// It's now safe to remove the mutable segments as anything the block
  1067  		// held is covered by the owned shards we just read
  1068  		if err := block.EvictMutableSegments(); err != nil {
  1069  			// deliberately choosing to not mark this as an error as we have successfully
  1070  			// flushed any mutable data.
  1071  			i.logger.Warn("encountered error while evicting mutable segments for index block",
  1072  				zap.Error(err),
  1073  				zap.Time("blockStart", block.StartTime().ToTime()),
  1074  			)
  1075  		}
  1076  
  1077  		for _, t := range i.blockStartsFromIndexBlockStart(block.StartTime()) {
  1078  			for _, s := range shards {
  1079  				s.MarkWarmIndexFlushStateSuccessOrError(t, err)
  1080  			}
  1081  		}
  1082  	}
  1083  	i.metrics.blocksEvictedMutableSegments.Inc(int64(evicted))
  1084  	return nil
  1085  }
  1086  
  1087  func (i *nsIndex) ColdFlush(shards []databaseShard) (OnColdFlushDone, error) {
  1088  	if len(shards) == 0 {
  1089  		// No-op if no shards currently owned.
  1090  		return func() error { return nil }, nil
  1091  	}
  1092  
  1093  	flushable, err := i.flushableBlocks(shards, series.ColdWrite)
  1094  	if err != nil {
  1095  		return nil, err
  1096  	}
  1097  	// We only rotate cold mutable segments in phase I of cold flushing.
  1098  	for _, block := range flushable {
  1099  		if err := block.RotateColdMutableSegments(); err != nil {
  1100  			return nil, err
  1101  		}
  1102  	}
  1103  	// We can't immediately evict cold mutable segments so we return a callback to do so
  1104  	// when cold flush finishes.
  1105  	return func() error {
  1106  		multiErr := xerrors.NewMultiError()
  1107  		for _, block := range flushable {
  1108  			multiErr = multiErr.Add(block.EvictColdMutableSegments())
  1109  		}
  1110  		return multiErr.FinalError()
  1111  	}, nil
  1112  }
  1113  
  1114  // WarmFlushBlockStarts returns all index blockStarts which have been flushed to disk.
  1115  func (i *nsIndex) WarmFlushBlockStarts() []xtime.UnixNano {
  1116  	flushed := make([]xtime.UnixNano, 0)
  1117  	infoFiles := i.readInfoFilesAsMap()
  1118  
  1119  	for blockStart := range infoFiles {
  1120  		if i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) {
  1121  			flushed = append(flushed, blockStart)
  1122  		}
  1123  	}
  1124  	return flushed
  1125  }
  1126  
  1127  // BackgroundCompact background compacts eligible segments.
  1128  func (i *nsIndex) BackgroundCompact() {
  1129  	if i.activeBlock != nil {
  1130  		i.activeBlock.BackgroundCompact()
  1131  	}
  1132  	for _, b := range i.state.blocksByTime {
  1133  		b.BackgroundCompact()
  1134  	}
  1135  }
  1136  
  1137  func (i *nsIndex) readInfoFilesAsMap() map[xtime.UnixNano][]fs.ReadIndexInfoFileResult {
  1138  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  1139  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  1140  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  1141  		Namespace:        i.nsMetadata.ID(),
  1142  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  1143  	})
  1144  	result := make(map[xtime.UnixNano][]fs.ReadIndexInfoFileResult)
  1145  	for _, infoFile := range infoFiles {
  1146  		t := xtime.UnixNano(infoFile.Info.BlockStart)
  1147  		files := result[t]
  1148  		result[t] = append(files, infoFile)
  1149  	}
  1150  	return result
  1151  }
  1152  
  1153  func (i *nsIndex) flushableBlocks(
  1154  	shards []databaseShard,
  1155  	flushType series.WriteType,
  1156  ) ([]index.Block, error) {
  1157  	i.state.RLock()
  1158  	defer i.state.RUnlock()
  1159  	if !i.isOpenWithRLock() {
  1160  		return nil, errDbIndexUnableToFlushClosed
  1161  	}
  1162  	// NB(bodu): We read index info files once here to avoid re-reading all of them
  1163  	// for each block.
  1164  	infoFiles := i.readInfoFilesAsMap()
  1165  	flushable := make([]index.Block, 0, len(i.state.blocksByTime))
  1166  
  1167  	now := xtime.ToUnixNano(i.nowFn())
  1168  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, now)
  1169  	currentBlockStart := now.Truncate(i.blockSize)
  1170  	// Check for flushable blocks by iterating through all block starts w/in retention.
  1171  	for blockStart := earliestBlockStartToRetain; blockStart.Before(currentBlockStart); blockStart = blockStart.Add(i.blockSize) {
  1172  		blockResult, err := i.ensureBlockPresentWithRLock(blockStart)
  1173  		if err != nil {
  1174  			return nil, err
  1175  		}
  1176  
  1177  		canFlush, err := i.canFlushBlockWithRLock(infoFiles, blockStart,
  1178  			blockResult.block, shards, flushType)
  1179  		if err != nil {
  1180  			return nil, err
  1181  		}
  1182  		if !canFlush {
  1183  			continue
  1184  		}
  1185  
  1186  		flushable = append(flushable, blockResult.block)
  1187  	}
  1188  	return flushable, nil
  1189  }
  1190  
  1191  func (i *nsIndex) canFlushBlockWithRLock(
  1192  	infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult,
  1193  	blockStart xtime.UnixNano,
  1194  	block index.Block,
  1195  	shards []databaseShard,
  1196  	flushType series.WriteType,
  1197  ) (bool, error) {
  1198  	switch flushType {
  1199  	case series.WarmWrite:
  1200  		// NB(bodu): We should always attempt to warm flush sealed blocks to disk if
  1201  		// there doesn't already exist data on disk. We're checking this instead of
  1202  		// `block.NeedsMutableSegmentsEvicted()` since bootstrap writes for cold block starts
  1203  		// get marked as warm writes if there doesn't already exist data on disk and need to
  1204  		// properly go through the warm flush lifecycle.
  1205  		if !block.IsSealed() || i.hasIndexWarmFlushedToDisk(infoFiles, blockStart) {
  1206  			return false, nil
  1207  		}
  1208  	case series.ColdWrite:
  1209  		if !block.NeedsColdMutableSegmentsEvicted() {
  1210  			return false, nil
  1211  		}
  1212  	}
  1213  
  1214  	// Check all data files exist for the shards we own
  1215  	for _, shard := range shards {
  1216  		if !shard.IsBootstrapped() {
  1217  			i.logger.
  1218  				With(zap.Uint32("shard", shard.ID())).
  1219  				Debug("skipping index cold flush due to shard not bootstrapped yet")
  1220  			continue
  1221  		}
  1222  
  1223  		for _, t := range i.blockStartsFromIndexBlockStart(blockStart) {
  1224  			flushState, err := shard.FlushState(t)
  1225  			if err != nil {
  1226  				return false, err
  1227  			}
  1228  
  1229  			// Skip if the data flushing failed. Data flushing precedes index flushing.
  1230  			if flushState.WarmStatus.DataFlushed != fileOpSuccess {
  1231  				return false, nil
  1232  			}
  1233  		}
  1234  	}
  1235  
  1236  	return true, nil
  1237  }
  1238  
  1239  // blockStartsFromIndexBlockStart returns the possibly many blocksStarts that exist within
  1240  // a given index block (since index block size >= data block size)
  1241  func (i *nsIndex) blockStartsFromIndexBlockStart(blockStart xtime.UnixNano) []xtime.UnixNano {
  1242  	start := blockStart
  1243  	end := blockStart.Add(i.blockSize)
  1244  	dataBlockSize := i.nsMetadata.Options().RetentionOptions().BlockSize()
  1245  	blockStarts := make([]xtime.UnixNano, 0)
  1246  	for t := start; t.Before(end); t = t.Add(dataBlockSize) {
  1247  		blockStarts = append(blockStarts, t)
  1248  	}
  1249  	return blockStarts
  1250  }
  1251  
  1252  func (i *nsIndex) hasIndexWarmFlushedToDisk(
  1253  	infoFiles map[xtime.UnixNano][]fs.ReadIndexInfoFileResult,
  1254  	blockStart xtime.UnixNano,
  1255  ) bool {
  1256  	// NB(bodu): We consider the block to have been warm flushed if there are any
  1257  	// filesets on disk. This is consistent with the "has warm flushed" check in the db shard.
  1258  	// Shard block starts are marked as having warm flushed if an info file is successfully read from disk.
  1259  	f, ok := infoFiles[blockStart]
  1260  	if !ok {
  1261  		return false
  1262  	}
  1263  
  1264  	for _, fileInfo := range f {
  1265  		indexVolumeType := idxpersist.DefaultIndexVolumeType
  1266  		if fileInfo.Info.IndexVolumeType != nil {
  1267  			indexVolumeType = idxpersist.IndexVolumeType(fileInfo.Info.IndexVolumeType.Value)
  1268  		}
  1269  		match := fileInfo.ID.BlockStart == blockStart && indexVolumeType == idxpersist.DefaultIndexVolumeType
  1270  		if match {
  1271  			return true
  1272  		}
  1273  	}
  1274  	return false
  1275  }
  1276  
  1277  func (i *nsIndex) flushBlock(
  1278  	flush persist.IndexFlush,
  1279  	indexBlock index.Block,
  1280  	shards []databaseShard,
  1281  	builder segment.DocumentsBuilder,
  1282  ) ([]segment.Segment, error) {
  1283  	allShards := make(map[uint32]struct{})
  1284  	for _, shard := range shards {
  1285  		// Populate all shards
  1286  		allShards[shard.ID()] = struct{}{}
  1287  	}
  1288  
  1289  	volumeIndex, err := i.opts.IndexClaimsManager().ClaimNextIndexFileSetVolumeIndex(
  1290  		i.nsMetadata,
  1291  		indexBlock.StartTime(),
  1292  	)
  1293  	if err != nil {
  1294  		return nil, fmt.Errorf("failed to claim next index volume index: %w", err)
  1295  	}
  1296  
  1297  	preparedPersist, err := flush.PrepareIndex(persist.IndexPrepareOptions{
  1298  		NamespaceMetadata: i.nsMetadata,
  1299  		BlockStart:        indexBlock.StartTime(),
  1300  		FileSetType:       persist.FileSetFlushType,
  1301  		Shards:            allShards,
  1302  		// NB(bodu): By default, we always write to the "default" index volume type.
  1303  		IndexVolumeType: idxpersist.DefaultIndexVolumeType,
  1304  		VolumeIndex:     volumeIndex,
  1305  	})
  1306  	if err != nil {
  1307  		return nil, err
  1308  	}
  1309  
  1310  	var closed bool
  1311  	defer func() {
  1312  		if !closed {
  1313  			segments, _ := preparedPersist.Close()
  1314  			// NB(r): Safe to for over a nil array so disregard error here.
  1315  			for _, segment := range segments {
  1316  				segment.Close()
  1317  			}
  1318  		}
  1319  	}()
  1320  
  1321  	// Flush a single block segment.
  1322  	if err := i.flushBlockSegment(preparedPersist, indexBlock, shards, builder); err != nil {
  1323  		return nil, err
  1324  	}
  1325  
  1326  	closed = true
  1327  
  1328  	// Now return the immutable segments
  1329  	return preparedPersist.Close()
  1330  }
  1331  
  1332  func (i *nsIndex) flushBlockSegment(
  1333  	preparedPersist persist.PreparedIndexPersist,
  1334  	indexBlock index.Block,
  1335  	shards []databaseShard,
  1336  	builder segment.DocumentsBuilder,
  1337  ) error {
  1338  	// Reset the builder
  1339  	builder.Reset()
  1340  
  1341  	var (
  1342  		batch     = m3ninxindex.Batch{AllowPartialUpdates: true}
  1343  		batchSize = defaultFlushDocsBatchSize
  1344  	)
  1345  	ctx := i.opts.ContextPool().Get()
  1346  	defer ctx.Close()
  1347  
  1348  	for _, shard := range shards {
  1349  		var (
  1350  			first     = true
  1351  			pageToken PageToken
  1352  		)
  1353  		for first || pageToken != nil {
  1354  			first = false
  1355  
  1356  			var (
  1357  				opts = block.FetchBlocksMetadataOptions{
  1358  					// NB(bodu): There is a lag between when data gets flushed
  1359  					// to disk and when it gets removed from memory during the next
  1360  					// Tick. In this case, the same series can exist both on disk
  1361  					// and in memory at the same time resulting in dupe series IDs.
  1362  					// Only read data from disk when flushing index segments.
  1363  					OnlyDisk: true,
  1364  				}
  1365  				limit   = defaultFlushReadDataBlocksBatchSize
  1366  				results block.FetchBlocksMetadataResults
  1367  				err     error
  1368  			)
  1369  			ctx.Reset()
  1370  			results, pageToken, err = shard.FetchBlocksMetadataV2(ctx,
  1371  				indexBlock.StartTime(), indexBlock.EndTime(),
  1372  				limit, pageToken, opts)
  1373  			if err != nil {
  1374  				return err
  1375  			}
  1376  
  1377  			// Reset docs batch before use.
  1378  			batch.Docs = batch.Docs[:0]
  1379  			for _, result := range results.Results() {
  1380  				doc, exists, err := shard.DocRef(result.ID)
  1381  				if err != nil {
  1382  					return err
  1383  				}
  1384  				if !exists {
  1385  					doc, err = convert.FromSeriesIDAndTagIter(result.ID, result.Tags)
  1386  					if err != nil {
  1387  						return err
  1388  					}
  1389  					i.metrics.flushDocsNew.Inc(1)
  1390  				} else {
  1391  					i.metrics.flushDocsCached.Inc(1)
  1392  				}
  1393  
  1394  				batch.Docs = append(batch.Docs, doc)
  1395  				if len(batch.Docs) < batchSize {
  1396  					continue
  1397  				}
  1398  
  1399  				err = i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch))
  1400  				if err != nil {
  1401  					return err
  1402  				}
  1403  
  1404  				// Reset docs after insertions.
  1405  				batch.Docs = batch.Docs[:0]
  1406  			}
  1407  
  1408  			// Add last batch if remaining.
  1409  			if len(batch.Docs) > 0 {
  1410  				err := i.sanitizeAllowDuplicatesWriteError(builder.InsertBatch(batch))
  1411  				if err != nil {
  1412  					return err
  1413  				}
  1414  			}
  1415  
  1416  			results.Close()
  1417  
  1418  			// Use BlockingCloseReset so that we can reuse the context without
  1419  			// it going back to the pool.
  1420  			ctx.BlockingCloseReset()
  1421  		}
  1422  	}
  1423  
  1424  	// Finally flush this segment
  1425  	return preparedPersist.Persist(builder)
  1426  }
  1427  
  1428  func (i *nsIndex) sanitizeAllowDuplicatesWriteError(err error) error {
  1429  	if err == nil {
  1430  		return nil
  1431  	}
  1432  
  1433  	// NB: dropping duplicate id error messages from logs as they're expected when we see
  1434  	// repeated inserts. as long as a block has an ID, it's not an error so we don't need
  1435  	// to pollute the logs with these messages.
  1436  	if partialError, ok := err.(*m3ninxindex.BatchPartialError); ok {
  1437  		err = partialError.FilterDuplicateIDErrors()
  1438  	}
  1439  
  1440  	return err
  1441  }
  1442  
  1443  func (i *nsIndex) AssignShardSet(shardSet sharding.ShardSet) {
  1444  	// NB(r): Allocate the filter function once, it can be used outside
  1445  	// of locks as it depends on no internal state.
  1446  	set := bitset.NewBitSet(uint(shardSet.Max()))
  1447  	assigned := make(map[uint32]struct{})
  1448  	for _, shardID := range shardSet.AllIDs() {
  1449  		set.Set(uint(shardID))
  1450  		assigned[shardID] = struct{}{}
  1451  	}
  1452  
  1453  	i.state.Lock()
  1454  	i.state.shardsFilterID = func(id ident.ID) bool {
  1455  		// NB(r): Use a bitset for fast lookups.
  1456  		return set.Test(uint(shardSet.Lookup(id)))
  1457  	}
  1458  
  1459  	i.state.shardFilteredForID = func(id ident.ID) (uint32, bool) {
  1460  		shard := shardSet.Lookup(id)
  1461  		return shard, set.Test(uint(shard))
  1462  	}
  1463  
  1464  	i.state.shardsAssigned = assigned
  1465  	i.state.Unlock()
  1466  }
  1467  
  1468  func (i *nsIndex) shardsFilterID() func(id ident.ID) bool {
  1469  	i.state.RLock()
  1470  	v := i.state.shardsFilterID
  1471  	i.state.RUnlock()
  1472  	return v
  1473  }
  1474  
  1475  func (i *nsIndex) shardForID() func(id ident.ID) (uint32, bool) {
  1476  	i.state.RLock()
  1477  	v := i.state.shardFilteredForID
  1478  	i.state.RUnlock()
  1479  	return v
  1480  }
  1481  
  1482  func (i *nsIndex) Query(
  1483  	ctx context.Context,
  1484  	query index.Query,
  1485  	opts index.QueryOptions,
  1486  ) (index.QueryResult, error) {
  1487  	var logFields []opentracinglog.Field
  1488  	ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQuery)
  1489  	defer sp.Finish()
  1490  	if sampled {
  1491  		// Only allocate metadata such as query string if sampling trace.
  1492  		logFields = []opentracinglog.Field{
  1493  			opentracinglog.String("query", query.String()),
  1494  			opentracinglog.String("namespace", i.nsMetadata.ID().String()),
  1495  			opentracinglog.Int("seriesLimit", opts.SeriesLimit),
  1496  			opentracinglog.Int("docsLimit", opts.DocsLimit),
  1497  			xopentracing.Time("queryStart", opts.StartInclusive.ToTime()),
  1498  			xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()),
  1499  		}
  1500  		sp.LogFields(logFields...)
  1501  	}
  1502  
  1503  	// Get results and set the namespace ID and size limit.
  1504  	results := i.resultsPool.Get()
  1505  	results.Reset(i.nsMetadata.ID(), index.QueryResultsOptions{
  1506  		SizeLimit: opts.SeriesLimit,
  1507  		FilterID:  i.shardsFilterID(),
  1508  	})
  1509  	ctx.RegisterFinalizer(results)
  1510  	queryRes, err := i.query(ctx, query, results, opts, i.execBlockQueryFn,
  1511  		i.newBlockQueryIterFn, logFields)
  1512  	if err != nil {
  1513  		sp.LogFields(opentracinglog.Error(err))
  1514  		return index.QueryResult{}, err
  1515  	}
  1516  
  1517  	return index.QueryResult{
  1518  		Results:    results,
  1519  		Exhaustive: queryRes.exhaustive,
  1520  		Waited:     queryRes.waited,
  1521  	}, nil
  1522  }
  1523  
  1524  func (i *nsIndex) AggregateQuery(
  1525  	ctx context.Context,
  1526  	query index.Query,
  1527  	opts index.AggregationOptions,
  1528  ) (index.AggregateQueryResult, error) {
  1529  	id := i.nsMetadata.ID()
  1530  	logFields := []opentracinglog.Field{
  1531  		opentracinglog.String("query", query.String()),
  1532  		opentracinglog.String("namespace", id.String()),
  1533  		opentracinglog.Int("seriesLimit", opts.SeriesLimit),
  1534  		opentracinglog.Int("docsLimit", opts.DocsLimit),
  1535  		xopentracing.Time("queryStart", opts.StartInclusive.ToTime()),
  1536  		xopentracing.Time("queryEnd", opts.EndExclusive.ToTime()),
  1537  	}
  1538  
  1539  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxAggregateQuery)
  1540  	sp.LogFields(logFields...)
  1541  	defer sp.Finish()
  1542  
  1543  	metrics := index.NewAggregateUsageMetrics(id, i.opts.InstrumentOptions())
  1544  	// Get results and set the filters, namespace ID and size limit.
  1545  	results := i.aggregateResultsPool.Get()
  1546  	aopts := index.AggregateResultsOptions{
  1547  		SizeLimit:             opts.SeriesLimit,
  1548  		DocsLimit:             opts.DocsLimit,
  1549  		FieldFilter:           opts.FieldFilter,
  1550  		Type:                  opts.Type,
  1551  		AggregateUsageMetrics: metrics,
  1552  	}
  1553  	ctx.RegisterFinalizer(results)
  1554  	// use appropriate fn to query underlying blocks.
  1555  	// use block.Aggregate() for querying and set the query if required.
  1556  	fn := i.execBlockAggregateQueryFn
  1557  	isAllQuery := query.Equal(allQuery)
  1558  	if !isAllQuery {
  1559  		if field, isFieldQuery := idx.FieldQuery(query.Query); isFieldQuery {
  1560  			aopts.FieldFilter = aopts.FieldFilter.AddIfMissing(field)
  1561  		} else {
  1562  			// Need to actually restrict whether we should return a term or not
  1563  			// based on running the actual query to resolve a postings list and
  1564  			// then seeing if that intersects the aggregated term postings list
  1565  			// at all.
  1566  			aopts.RestrictByQuery = &query
  1567  		}
  1568  	}
  1569  	aopts.FieldFilter = aopts.FieldFilter.SortAndDedupe()
  1570  	results.Reset(id, aopts)
  1571  	queryRes, err := i.query(ctx, query, results, opts.QueryOptions, fn,
  1572  		i.newBlockAggregatorIterFn, logFields)
  1573  	if err != nil {
  1574  		return index.AggregateQueryResult{}, err
  1575  	}
  1576  	return index.AggregateQueryResult{
  1577  		Results:    results,
  1578  		Exhaustive: queryRes.exhaustive,
  1579  		Waited:     queryRes.waited,
  1580  	}, nil
  1581  }
  1582  
// queryResult is the internal outcome of running a query against the index.
//
// exhaustive and waited are surfaced to callers as the Exhaustive and Waited
// fields of index.QueryResult and index.AggregateQueryResult. exhaustive also
// selects which query outcome metric is incremented.
// NOTE(review): waited presumably counts how many times the query had to
// wait while executing; confirm against where it is populated.
type queryResult struct {
	exhaustive bool
	waited     int
}
  1587  
  1588  func (i *nsIndex) query(
  1589  	ctx context.Context,
  1590  	query index.Query,
  1591  	results index.BaseResults,
  1592  	opts index.QueryOptions,
  1593  	execBlockFn execBlockQueryFn,
  1594  	newBlockIterFn newBlockIterFn,
  1595  	logFields []opentracinglog.Field,
  1596  ) (queryResult, error) {
  1597  	ctx, sp, sampled := ctx.StartSampledTraceSpan(tracepoint.NSIdxQueryHelper)
  1598  	sp.LogFields(logFields...)
  1599  	defer sp.Finish()
  1600  	if sampled {
  1601  		// Only log fields if sampled.
  1602  		sp.LogFields(logFields...)
  1603  	}
  1604  
  1605  	queryRes, err := i.queryWithSpan(ctx, query, results, opts, execBlockFn,
  1606  		newBlockIterFn, sp, logFields)
  1607  	if err != nil {
  1608  		sp.LogFields(opentracinglog.Error(err))
  1609  
  1610  		if queryRes.exhaustive {
  1611  			i.metrics.queryExhaustiveInternalError.Inc(1)
  1612  		} else {
  1613  			i.metrics.queryNonExhaustiveInternalError.Inc(1)
  1614  		}
  1615  		return queryRes, err
  1616  	}
  1617  
  1618  	if queryRes.exhaustive {
  1619  		i.metrics.queryExhaustiveSuccess.Inc(1)
  1620  		return queryRes, nil
  1621  	}
  1622  
  1623  	// If require exhaustive but not, return error.
  1624  	if opts.RequireExhaustive {
  1625  		seriesCount := results.Size()
  1626  		docsCount := results.TotalDocsCount()
  1627  		if opts.SeriesLimitExceeded(seriesCount) {
  1628  			i.metrics.queryNonExhaustiveSeriesLimitError.Inc(1)
  1629  		} else if opts.DocsLimitExceeded(docsCount) {
  1630  			i.metrics.queryNonExhaustiveDocsLimitError.Inc(1)
  1631  		} else {
  1632  			i.metrics.queryNonExhaustiveLimitError.Inc(1)
  1633  		}
  1634  
  1635  		// NB(r): Make sure error is not retried and returns as bad request.
  1636  		return queryRes, xerrors.NewInvalidParamsError(limits.NewQueryLimitExceededError(fmt.Sprintf(
  1637  			"query exceeded limit: require_exhaustive=%v, series_limit=%d, series_matched=%d, docs_limit=%d, docs_matched=%d",
  1638  			opts.RequireExhaustive,
  1639  			opts.SeriesLimit,
  1640  			seriesCount,
  1641  			opts.DocsLimit,
  1642  			docsCount,
  1643  		)))
  1644  	}
  1645  
  1646  	// Otherwise non-exhaustive but not required to be.
  1647  	i.metrics.queryNonExhaustiveSuccess.Inc(1)
  1648  	return queryRes, nil
  1649  }
  1650  
// blockIter is a composite type to hold various state about a block while iterating over the results.
//
// iter is the iterator created for block by the newBlockIterFn passed to
// queryWithSpan, and iterCloser wraps that same iterator with
// x.NewSafeCloser. waitTime accumulates the durations returned while waiting
// for permits, and processingTime accumulates the time spent inside the
// execBlockQueryFn calls for this block.
type blockIter struct {
	iter           index.ResultIterator
	iterCloser     io.Closer
	block          index.Block
	waitTime       time.Duration
	processingTime time.Duration
}
  1659  
  1660  func (i *nsIndex) queryWithSpan(
  1661  	ctx context.Context,
  1662  	query index.Query,
  1663  	results index.BaseResults,
  1664  	opts index.QueryOptions,
  1665  	execBlockFn execBlockQueryFn,
  1666  	newBlockIterFn newBlockIterFn,
  1667  	span opentracing.Span,
  1668  	logFields []opentracinglog.Field,
  1669  ) (queryResult, error) {
  1670  	i.state.RLock()
  1671  	if !i.isOpenWithRLock() {
  1672  		i.state.RUnlock()
  1673  		return queryResult{}, errDbIndexUnableToQueryClosed
  1674  	}
  1675  
  1676  	// Track this as an inflight query that needs to finish
  1677  	// when the index is closed.
  1678  	i.queriesWg.Add(1)
  1679  	defer i.queriesWg.Done()
  1680  
  1681  	// Enact overrides for query options
  1682  	opts = i.overriddenOptsForQueryWithRLock(opts)
  1683  
  1684  	// Retrieve blocks to query, then we can release lock.
  1685  	// NB(r): Important not to block ticking, and other tasks by
  1686  	// holding the RLock during a query.
  1687  	qryRange := xtime.NewRanges(xtime.Range{
  1688  		Start: opts.StartInclusive,
  1689  		End:   opts.EndExclusive,
  1690  	})
  1691  	// NB(r): Safe to take ref to i.state.blocksDescOrderImmutable since it's
  1692  	// immutable and we only create an iterator over it.
  1693  	blocks := newBlocksIterStackAlloc(i.activeBlock, i.state.blocksDescOrderImmutable, qryRange)
  1694  
  1695  	// Can now release the lock and execute the query without holding the lock.
  1696  	i.state.RUnlock()
  1697  
  1698  	var (
  1699  		// State contains concurrent mutable state for async execution below.
  1700  		state = &asyncQueryExecState{}
  1701  		wg    sync.WaitGroup
  1702  	)
  1703  	perms, err := i.permitsManager.NewPermits(ctx)
  1704  	if err != nil {
  1705  		return queryResult{}, err
  1706  	}
  1707  	defer perms.Close()
  1708  
  1709  	var blockIters []*blockIter
  1710  	for b, ok := blocks.Next(); ok; b, ok = b.Next() {
  1711  		block := b.Current()
  1712  		iter, err := newBlockIterFn(ctx, block, query, results)
  1713  		if err != nil {
  1714  			return queryResult{}, err
  1715  		}
  1716  		blockIters = append(blockIters, &blockIter{
  1717  			iter:       iter,
  1718  			iterCloser: x.NewSafeCloser(iter),
  1719  			block:      block,
  1720  		})
  1721  	}
  1722  
  1723  	defer func() {
  1724  		for _, iter := range blockIters {
  1725  			// safe to call Close multiple times, so it's fine to eagerly close in the loop below and here.
  1726  			_ = iter.iterCloser.Close()
  1727  		}
  1728  	}()
  1729  
  1730  	// queryCanceled returns true if the query has been canceled and the current iteration should terminate.
  1731  	queryCanceled := func() bool {
  1732  		return opts.LimitsExceeded(results.Size(), results.TotalDocsCount()) || state.hasErr()
  1733  	}
  1734  	// waitForPermit waits for a permit. returns non-nil if the permit was acquired and the wait time.
  1735  	waitForPermit := func() (permits.Permit, time.Duration) {
  1736  		// make sure the query hasn't been canceled before waiting for a permit.
  1737  		if queryCanceled() {
  1738  			return nil, 0
  1739  		}
  1740  
  1741  		startWait := time.Now()
  1742  		acquireResult, err := perms.Acquire(ctx)
  1743  		waitTime := time.Since(startWait)
  1744  		var success bool
  1745  		defer func() {
  1746  			// Note: ALWAYS release if we do not successfully return back
  1747  			// the permit and we checked one out.
  1748  			if !success && acquireResult.Permit != nil {
  1749  				perms.Release(acquireResult.Permit)
  1750  			}
  1751  		}()
  1752  		if acquireResult.Waited {
  1753  			// Potentially break an error if require no wait set.
  1754  			if err == nil && opts.RequireNoWait {
  1755  				// Fail iteration if request requires no waiting occurs.
  1756  				err = permits.ErrOperationWaitedOnRequireNoWait
  1757  			}
  1758  			state.incWaited(1)
  1759  		}
  1760  		if err != nil {
  1761  			state.addErr(err)
  1762  			return nil, waitTime
  1763  		}
  1764  
  1765  		// make sure the query hasn't been canceled while waiting for a permit.
  1766  		if queryCanceled() {
  1767  			return nil, waitTime
  1768  		}
  1769  
  1770  		success = true
  1771  		return acquireResult.Permit, waitTime
  1772  	}
  1773  
  1774  	// We're looping through all the blocks that we need to query and kicking
  1775  	// off parallel queries which are bounded by the permits maximum
  1776  	// concurrency. It's possible at this point that we've completed querying one or more blocks and already exhausted
  1777  	// the maximum number of results that we're allowed to return. If thats the case, there is no value in kicking off
  1778  	// more parallel queries, so we break out of the loop.
  1779  	for _, blockIter := range blockIters {
  1780  		// Capture for async query execution below.
  1781  		blockIter := blockIter
  1782  
  1783  		// acquire a permit before kicking off the goroutine to process the iterator. this limits the number of
  1784  		// concurrent goroutines to # of permits + large queries that needed multiple iterations to finish.
  1785  		permit, waitTime := waitForPermit()
  1786  		blockIter.waitTime += waitTime
  1787  		if permit == nil {
  1788  			break
  1789  		}
  1790  
  1791  		wg.Add(1)
  1792  		// kick off a go routine to process the entire iterator.
  1793  		go func() {
  1794  			defer wg.Done()
  1795  			first := true
  1796  			for !blockIter.iter.Done() {
  1797  				// if this is not the first iteration of the iterator, need to acquire another permit.
  1798  				if !first {
  1799  					permit, waitTime = waitForPermit()
  1800  					blockIter.waitTime += waitTime
  1801  					if permit == nil {
  1802  						break
  1803  					}
  1804  				}
  1805  				blockLogFields := append(logFields, xopentracing.Duration("permitWaitTime", waitTime))
  1806  				first = false
  1807  				startProcessing := time.Now()
  1808  				execBlockFn(ctx, blockIter.block, permit, blockIter.iter, opts, state, results, blockLogFields)
  1809  				processingTime := time.Since(startProcessing)
  1810  				blockIter.processingTime += processingTime
  1811  				permit.Use(int64(processingTime))
  1812  				perms.Release(permit)
  1813  			}
  1814  			if first {
  1815  				// this should never happen since a new iter cannot be Done, but just to be safe.
  1816  				perms.Release(permit)
  1817  			}
  1818  
  1819  			// close the iterator since it's no longer needed. it's safe to call Close multiple times, here and in the
  1820  			// defer when the function returns.
  1821  			if err := blockIter.iterCloser.Close(); err != nil {
  1822  				state.addErr(err)
  1823  			}
  1824  		}()
  1825  	}
  1826  
  1827  	// wait for all workers to finish. if the caller cancels the call, the workers will be interrupted and eventually
  1828  	// finish.
  1829  	wg.Wait()
  1830  
  1831  	i.metrics.loadedDocsPerQuery.RecordValue(float64(results.TotalDocsCount()))
  1832  
  1833  	exhaustive := opts.Exhaustive(results.Size(), results.TotalDocsCount())
  1834  	// ok to read state without lock since all parallel queries are done.
  1835  	multiErr := state.multiErr
  1836  	err = multiErr.FinalError()
  1837  
  1838  	return queryResult{
  1839  		exhaustive: exhaustive,
  1840  		waited:     state.waited(),
  1841  	}, err
  1842  }
  1843  
  1844  func (i *nsIndex) newBlockQueryIterFn(
  1845  	ctx context.Context,
  1846  	block index.Block,
  1847  	query index.Query,
  1848  	_ index.BaseResults,
  1849  ) (index.ResultIterator, error) {
  1850  	return block.QueryIter(ctx, query)
  1851  }
  1852  
  1853  //nolint: dupl
  1854  func (i *nsIndex) execBlockQueryFn(
  1855  	ctx context.Context,
  1856  	block index.Block,
  1857  	permit permits.Permit,
  1858  	iter index.ResultIterator,
  1859  	opts index.QueryOptions,
  1860  	state *asyncQueryExecState,
  1861  	results index.BaseResults,
  1862  	logFields []opentracinglog.Field,
  1863  ) {
  1864  	logFields = append(logFields,
  1865  		xopentracing.Time("blockStart", block.StartTime().ToTime()),
  1866  		xopentracing.Time("blockEnd", block.EndTime().ToTime()),
  1867  	)
  1868  
  1869  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockQuery)
  1870  	sp.LogFields(logFields...)
  1871  	defer sp.Finish()
  1872  
  1873  	docResults, ok := results.(index.DocumentResults)
  1874  	if !ok { // should never happen
  1875  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", results))
  1876  		return
  1877  	}
  1878  	queryIter, ok := iter.(index.QueryIterator)
  1879  	if !ok { // should never happen
  1880  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter))
  1881  		return
  1882  	}
  1883  
  1884  	deadline := time.Now().Add(time.Duration(permit.AllowedQuota()))
  1885  	err := block.QueryWithIter(ctx, opts, queryIter, docResults, deadline, logFields)
  1886  	if err == index.ErrUnableToQueryBlockClosed {
  1887  		// NB(r): Because we query this block outside of the results lock, it's
  1888  		// possible this block may get closed if it slides out of retention, in
  1889  		// that case those results are no longer considered valid and outside of
  1890  		// retention regardless, so this is a non-issue.
  1891  		err = nil
  1892  	}
  1893  
  1894  	if err != nil {
  1895  		sp.LogFields(opentracinglog.Error(err))
  1896  		state.addErr(err)
  1897  	}
  1898  }
  1899  
  1900  func (i *nsIndex) newBlockAggregatorIterFn(
  1901  	ctx context.Context,
  1902  	block index.Block,
  1903  	_ index.Query,
  1904  	results index.BaseResults,
  1905  ) (index.ResultIterator, error) {
  1906  	aggResults, ok := results.(index.AggregateResults)
  1907  	if !ok { // should never happen
  1908  		return nil, fmt.Errorf("unknown results type [%T] received during aggregation", results)
  1909  	}
  1910  	return block.AggregateIter(ctx, aggResults.AggregateResultsOptions())
  1911  }
  1912  
  1913  func (i *nsIndex) execBlockAggregateQueryFn(
  1914  	ctx context.Context,
  1915  	block index.Block,
  1916  	permit permits.Permit,
  1917  	iter index.ResultIterator,
  1918  	opts index.QueryOptions,
  1919  	state *asyncQueryExecState,
  1920  	results index.BaseResults,
  1921  	logFields []opentracinglog.Field,
  1922  ) {
  1923  	logFields = append(logFields,
  1924  		xopentracing.Time("blockStart", block.StartTime().ToTime()),
  1925  		xopentracing.Time("blockEnd", block.EndTime().ToTime()),
  1926  	)
  1927  
  1928  	ctx, sp := ctx.StartTraceSpan(tracepoint.NSIdxBlockAggregateQuery)
  1929  	sp.LogFields(logFields...)
  1930  	defer sp.Finish()
  1931  
  1932  	aggResults, ok := results.(index.AggregateResults)
  1933  	if !ok { // should never happen
  1934  		state.addErr(fmt.Errorf("unknown results type [%T] received during aggregation", results))
  1935  		return
  1936  	}
  1937  	aggIter, ok := iter.(index.AggregateIterator)
  1938  	if !ok { // should never happen
  1939  		state.addErr(fmt.Errorf("unknown results type [%T] received during query", iter))
  1940  		return
  1941  	}
  1942  
  1943  	deadline := time.Now().Add(time.Duration(permit.AllowedQuota()))
  1944  	err := block.AggregateWithIter(ctx, aggIter, opts, aggResults, deadline, logFields)
  1945  	if err == index.ErrUnableToQueryBlockClosed {
  1946  		// NB(r): Because we query this block outside of the results lock, it's
  1947  		// possible this block may get closed if it slides out of retention, in
  1948  		// that case those results are no longer considered valid and outside of
  1949  		// retention regardless, so this is a non-issue.
  1950  		err = nil
  1951  	}
  1952  
  1953  	if err != nil {
  1954  		sp.LogFields(opentracinglog.Error(err))
  1955  		state.addErr(err)
  1956  	}
  1957  }
  1958  
  1959  func (i *nsIndex) overriddenOptsForQueryWithRLock(
  1960  	opts index.QueryOptions,
  1961  ) index.QueryOptions {
  1962  	// Override query response limits if needed.
  1963  	if i.state.runtimeOpts.maxQuerySeriesLimit > 0 && (opts.SeriesLimit == 0 ||
  1964  		int64(opts.SeriesLimit) > i.state.runtimeOpts.maxQuerySeriesLimit) {
  1965  		i.logger.Debug("overriding query response series limit",
  1966  			zap.Int("requested", opts.SeriesLimit),
  1967  			zap.Int64("maxAllowed", i.state.runtimeOpts.maxQuerySeriesLimit)) // FOLLOWUP(prateek): log query too once it's serializable.
  1968  		opts.SeriesLimit = int(i.state.runtimeOpts.maxQuerySeriesLimit)
  1969  	}
  1970  	if i.state.runtimeOpts.maxQueryDocsLimit > 0 && (opts.DocsLimit == 0 ||
  1971  		int64(opts.DocsLimit) > i.state.runtimeOpts.maxQueryDocsLimit) {
  1972  		i.logger.Debug("overriding query response docs limit",
  1973  			zap.Int("requested", opts.DocsLimit),
  1974  			zap.Int64("maxAllowed", i.state.runtimeOpts.maxQueryDocsLimit)) // FOLLOWUP(prateek): log query too once it's serializable.
  1975  		opts.DocsLimit = int(i.state.runtimeOpts.maxQueryDocsLimit)
  1976  	}
  1977  	return opts
  1978  }
  1979  
// blockPresentResult is returned by ensureBlockPresent and
// ensureBlockPresentWithRLock.
//
// block is the index block found or allocated for the requested block start,
// and latest reports whether that block is the index's current latest block.
type blockPresentResult struct {
	block  index.Block
	latest bool
}
  1984  
  1985  func (i *nsIndex) ensureBlockPresent(blockStart xtime.UnixNano) (blockPresentResult, error) {
  1986  	i.state.RLock()
  1987  	defer i.state.RUnlock()
  1988  	if !i.isOpenWithRLock() {
  1989  		return blockPresentResult{}, errDbIndexUnableToWriteClosed
  1990  	}
  1991  	return i.ensureBlockPresentWithRLock(blockStart)
  1992  }
  1993  
  1994  func (i *nsIndex) isLatestBlockWithRLock(blockStart xtime.UnixNano) bool {
  1995  	return i.state.latestBlock != nil && i.state.latestBlock.StartTime().Equal(blockStart)
  1996  }
  1997  
  1998  // ensureBlockPresentWithRLock guarantees an index.Block exists for the specified
  1999  // blockStart, allocating one if it does not. It returns the desired block, or
  2000  // error if it's unable to do so.
  2001  func (i *nsIndex) ensureBlockPresentWithRLock(blockStart xtime.UnixNano) (blockPresentResult, error) {
  2002  	// check if the current latest block matches the required block, this
  2003  	// is the usual path and can short circuit the rest of the logic in this
  2004  	// function in most cases.
  2005  	if i.isLatestBlockWithRLock(blockStart) {
  2006  		return blockPresentResult{
  2007  			block:  i.state.latestBlock,
  2008  			latest: true,
  2009  		}, nil
  2010  	}
  2011  
  2012  	// check if exists in the map (this can happen if the latestBlock has not
  2013  	// been rotated yet).
  2014  	if block, ok := i.state.blocksByTime[blockStart]; ok {
  2015  		return blockPresentResult{block: block}, nil
  2016  	}
  2017  
  2018  	// i.e. block start does not exist, so we have to alloc.
  2019  	// we release the RLock (the function is called with this lock), and acquire
  2020  	// the write lock to do the extra allocation.
  2021  	i.state.RUnlock()
  2022  	i.state.Lock()
  2023  
  2024  	// need to guarantee all exit paths from the function leave with the RLock
  2025  	// so we release the write lock and re-acquire a read lock.
  2026  	defer func() {
  2027  		i.state.Unlock()
  2028  		i.state.RLock()
  2029  	}()
  2030  
  2031  	// re-check if exists in the map (another routine did the alloc)
  2032  	if block, ok := i.state.blocksByTime[blockStart]; ok {
  2033  		return blockPresentResult{
  2034  			block:  block,
  2035  			latest: i.isLatestBlockWithRLock(blockStart),
  2036  		}, nil
  2037  	}
  2038  
  2039  	// ok now we know for sure we have to alloc
  2040  	block, err := i.newBlockFn(blockStart, i.nsMetadata,
  2041  		index.BlockOptions{}, i.namespaceRuntimeOptsMgr, i.opts.IndexOptions())
  2042  	if err != nil { // unable to allocate the block, should never happen.
  2043  		return blockPresentResult{}, i.unableToAllocBlockInvariantError(err)
  2044  	}
  2045  
  2046  	// NB(bodu): Use same time barrier as `Tick` to make sealing of cold index blocks consistent.
  2047  	// We need to seal cold blocks write away for cold writes.
  2048  	if !blockStart.After(i.lastSealableBlockStart(xtime.ToUnixNano(i.nowFn()))) {
  2049  		if err := block.Seal(); err != nil {
  2050  			return blockPresentResult{}, err
  2051  		}
  2052  	}
  2053  
  2054  	// add to tracked blocks map
  2055  	i.state.blocksByTime[blockStart] = block
  2056  
  2057  	// update ordered blockStarts slice, and latestBlock
  2058  	i.updateBlockStartsWithLock()
  2059  
  2060  	return blockPresentResult{
  2061  		block:  block,
  2062  		latest: i.isLatestBlockWithRLock(blockStart),
  2063  	}, nil
  2064  }
  2065  
  2066  func (i *nsIndex) lastSealableBlockStart(t xtime.UnixNano) xtime.UnixNano {
  2067  	return retention.FlushTimeEndForBlockSize(i.blockSize, t.Add(-i.bufferPast))
  2068  }
  2069  
  2070  func (i *nsIndex) updateBlockStartsWithLock() {
  2071  	// update ordered blockStarts slice
  2072  	var (
  2073  		latestBlockStart xtime.UnixNano
  2074  		latestBlock      index.Block
  2075  	)
  2076  
  2077  	blocks := make([]blockAndBlockStart, 0, len(i.state.blocksByTime)+1)
  2078  	for ts, block := range i.state.blocksByTime {
  2079  		if ts >= latestBlockStart {
  2080  			latestBlockStart = ts
  2081  			latestBlock = block
  2082  		}
  2083  		blocks = append(blocks, blockAndBlockStart{
  2084  			block:      block,
  2085  			blockStart: ts,
  2086  		})
  2087  	}
  2088  
  2089  	// order in desc order (i.e. reverse chronological)
  2090  	sort.Slice(blocks, func(i, j int) bool {
  2091  		return blocks[i].blockStart > blocks[j].blockStart
  2092  	})
  2093  
  2094  	// NB(r): Important not to modify this once set since we take reference
  2095  	// to this slice with an RLock, release with RUnlock and then loop over it
  2096  	// during query time so it must not be altered and stay immutable.
  2097  	// This is done to avoid allocating a copy of the slice at query time for
  2098  	// each query.
  2099  	i.state.blocksDescOrderImmutable = blocks
  2100  
  2101  	// rotate latestBlock
  2102  	i.state.latestBlock = latestBlock
  2103  }
  2104  
  2105  func (i *nsIndex) isOpenWithRLock() bool {
  2106  	return !i.state.closed
  2107  }
  2108  
  2109  func (i *nsIndex) CleanupExpiredFileSets(t xtime.UnixNano) error {
  2110  	// we only expire data on drive that we don't hold a reference to, and is
  2111  	// past the expiration period. the earliest data we have to retain is given
  2112  	// by the following computation:
  2113  	//  Min(FIRST_EXPIRED_BLOCK, EARLIEST_RETAINED_BLOCK)
  2114  	i.state.RLock()
  2115  	defer i.state.RUnlock()
  2116  	if i.state.closed {
  2117  		return errDbIndexUnableToCleanupClosed
  2118  	}
  2119  
  2120  	// earliest block to retain based on retention period
  2121  	earliestBlockStartToRetain := retention.FlushTimeStartForRetentionPeriod(i.retentionPeriod, i.blockSize, t)
  2122  
  2123  	// now we loop through the blocks we hold, to ensure we don't delete any data for them.
  2124  	for t := range i.state.blocksByTime {
  2125  		if t.Before(earliestBlockStartToRetain) {
  2126  			earliestBlockStartToRetain = t
  2127  		}
  2128  	}
  2129  
  2130  	// know the earliest block to retain, find all blocks earlier than it
  2131  	var (
  2132  		pathPrefix = i.opts.CommitLogOptions().FilesystemOptions().FilePathPrefix()
  2133  		nsID       = i.nsMetadata.ID()
  2134  	)
  2135  	filesets, err := i.indexFilesetsBeforeFn(pathPrefix, nsID, earliestBlockStartToRetain)
  2136  	if err != nil {
  2137  		return err
  2138  	}
  2139  
  2140  	// and delete them
  2141  	return i.deleteFilesFn(filesets)
  2142  }
  2143  
  2144  func (i *nsIndex) CleanupCorruptedFileSets() error {
  2145  	/*
  2146  	   Corrupted index filesets can be safely cleaned up if its not
  2147  	   the latest volume index per index volume type/block start combo.
  2148  
  2149  	   We are guaranteed not to be actively writing to an index fileset once
  2150  	   we're already writing to later volume indices.
  2151  	*/
  2152  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  2153  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  2154  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  2155  		Namespace:        i.nsMetadata.ID(),
  2156  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  2157  		IncludeCorrupted: true,
  2158  	})
  2159  
  2160  	if len(infoFiles) == 0 {
  2161  		return nil
  2162  	}
  2163  
  2164  	var (
  2165  		toDelete []string
  2166  		begin    = 0 // marks the beginning of a subslice that contains filesets with same block starts
  2167  	)
  2168  	// It's expected that info files are ordered by block start and volume index
  2169  	for j := range infoFiles {
  2170  		if infoFiles[begin].ID.BlockStart.Before(infoFiles[j].ID.BlockStart) {
  2171  			files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:j])
  2172  			if err != nil {
  2173  				return err
  2174  			}
  2175  			toDelete = append(toDelete, files...)
  2176  			begin = j
  2177  		} else if infoFiles[begin].ID.BlockStart.After(infoFiles[j].ID.BlockStart) {
  2178  			errorMessage := "filesets are expected to be ordered by block start"
  2179  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2180  				l.Error(errorMessage)
  2181  			})
  2182  			return instrument.InvariantErrorf(errorMessage)
  2183  		}
  2184  	}
  2185  
  2186  	// Process the volumes in the last block, which are not covered by the loop.
  2187  	files, err := i.getCorruptedVolumesForDeletion(infoFiles[begin:])
  2188  	if err != nil {
  2189  		return err
  2190  	}
  2191  	toDelete = append(toDelete, files...)
  2192  
  2193  	return i.deleteFilesFn(toDelete)
  2194  }
  2195  
  2196  func (i *nsIndex) getCorruptedVolumesForDeletion(filesets []fs.ReadIndexInfoFileResult) ([]string, error) {
  2197  	if len(filesets) <= 1 {
  2198  		return nil, nil
  2199  	}
  2200  
  2201  	// Check for invariants.
  2202  	for j := 1; j < len(filesets); j++ {
  2203  		if !filesets[j-1].ID.BlockStart.Equal(filesets[j].ID.BlockStart) {
  2204  			errorMessage := "all the filesets passed to this function should have the same block start"
  2205  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2206  				l.Error(errorMessage)
  2207  			})
  2208  			return nil, instrument.InvariantErrorf(errorMessage)
  2209  		} else if filesets[j-1].ID.VolumeIndex >= filesets[j].ID.VolumeIndex {
  2210  			errorMessage := "filesets should be ordered by volume index in increasing order"
  2211  			instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2212  				l.Error(errorMessage)
  2213  			})
  2214  			return nil, instrument.InvariantErrorf(errorMessage)
  2215  		}
  2216  	}
  2217  
  2218  	toDelete := make([]string, 0)
  2219  	hasMoreRecentVolumeOfType := make(map[idxpersist.IndexVolumeType]struct{})
  2220  	// Iterate filesets in reverse order to process higher volume indexes first.
  2221  	for j := len(filesets) - 1; j >= 0; j-- {
  2222  		f := filesets[j]
  2223  
  2224  		// NB: If the fileset info fields contains inconsistent information (e.g. block start inside
  2225  		// info file doesn't match the block start extracted from the filename), it means that info file
  2226  		// is missing or corrupted. Thus we cannot trust the information of this fileset
  2227  		// and we cannot be sure what's the actual volume type of it. However, a part of corrupted
  2228  		// fileset cleanup logic depends on knowing the volume type.
  2229  		//
  2230  		// Such fileset is deleted, except when it is the most recent volume in the block.
  2231  		//
  2232  		// The most recent volume is excluded because it is more likely to be actively written to.
  2233  		// If info file writes are not atomic, due to timing readers might observe the file
  2234  		// to be corrupted, even though at that moment the file is being written/re-written.
  2235  		if f.Corrupted && !f.ID.BlockStart.Equal(xtime.UnixNano(f.Info.BlockStart)) {
  2236  			if j != len(filesets)-1 {
  2237  				toDelete = append(toDelete, f.AbsoluteFilePaths...)
  2238  			}
  2239  			continue
  2240  		}
  2241  
  2242  		volType := idxpersist.DefaultIndexVolumeType
  2243  		if f.Info.IndexVolumeType != nil {
  2244  			volType = idxpersist.IndexVolumeType(f.Info.IndexVolumeType.Value)
  2245  		}
  2246  		// Delete corrupted filesets if there are more recent volumes with the same volume type.
  2247  		if _, ok := hasMoreRecentVolumeOfType[volType]; !ok {
  2248  			hasMoreRecentVolumeOfType[volType] = struct{}{}
  2249  		} else if f.Corrupted {
  2250  			toDelete = append(toDelete, f.AbsoluteFilePaths...)
  2251  		}
  2252  	}
  2253  	return toDelete, nil
  2254  }
  2255  
  2256  func (i *nsIndex) CleanupDuplicateFileSets(activeShards []uint32) error {
  2257  	fsOpts := i.opts.CommitLogOptions().FilesystemOptions()
  2258  	infoFiles := i.readIndexInfoFilesFn(fs.ReadIndexInfoFilesOptions{
  2259  		FilePathPrefix:   fsOpts.FilePathPrefix(),
  2260  		Namespace:        i.nsMetadata.ID(),
  2261  		ReaderBufferSize: fsOpts.InfoReaderBufferSize(),
  2262  	})
  2263  
  2264  	segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart := make(map[xtime.UnixNano]map[idxpersist.IndexVolumeType][]fs.Segments)
  2265  	for _, file := range infoFiles {
  2266  		seg := fs.NewSegments(file.Info, file.ID.VolumeIndex, file.AbsoluteFilePaths)
  2267  		blockStart := seg.BlockStart()
  2268  		segmentsOrderByVolumeIndexByVolumeType, ok := segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart]
  2269  		if !ok {
  2270  			segmentsOrderByVolumeIndexByVolumeType = make(map[idxpersist.IndexVolumeType][]fs.Segments)
  2271  			segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart[blockStart] = segmentsOrderByVolumeIndexByVolumeType
  2272  		}
  2273  
  2274  		volumeType := seg.VolumeType()
  2275  		if _, ok := segmentsOrderByVolumeIndexByVolumeType[volumeType]; !ok {
  2276  			segmentsOrderByVolumeIndexByVolumeType[volumeType] = make([]fs.Segments, 0)
  2277  		}
  2278  		segmentsOrderByVolumeIndexByVolumeType[volumeType] = append(segmentsOrderByVolumeIndexByVolumeType[volumeType], seg)
  2279  	}
  2280  
  2281  	// Ensure that segments are sorted by volume index.
  2282  	for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart {
  2283  		for _, segs := range segmentsOrderByVolumeIndexByVolumeType {
  2284  			sort.SliceStable(segs, func(i, j int) bool {
  2285  				return segs[i].VolumeIndex() < segs[j].VolumeIndex()
  2286  			})
  2287  		}
  2288  	}
  2289  
  2290  	multiErr := xerrors.NewMultiError()
  2291  	// Check for dupes and remove.
  2292  	filesToDelete := make([]string, 0)
  2293  	for _, segmentsOrderByVolumeIndexByVolumeType := range segmentsOrderByVolumeIndexByVolumeTypeAndBlockStart {
  2294  		for _, segmentsOrderByVolumeIndex := range segmentsOrderByVolumeIndexByVolumeType {
  2295  			segmentsToKeep := make([]fs.Segments, 0)
  2296  			for _, seg := range segmentsOrderByVolumeIndex {
  2297  				for len(segmentsToKeep) > 0 {
  2298  					idx := len(segmentsToKeep) - 1
  2299  					if previous := segmentsToKeep[idx]; seg.ShardTimeRanges().IsSuperset(
  2300  						previous.ShardTimeRanges().FilterShards(activeShards)) {
  2301  						filesToDelete = append(filesToDelete, previous.AbsoluteFilePaths()...)
  2302  						segmentsToKeep = segmentsToKeep[:idx]
  2303  					} else {
  2304  						break
  2305  					}
  2306  				}
  2307  				segmentsToKeep = append(segmentsToKeep, seg)
  2308  			}
  2309  		}
  2310  	}
  2311  	multiErr = multiErr.Add(i.deleteFilesFn(filesToDelete))
  2312  	return multiErr.FinalError()
  2313  }
  2314  
  2315  func (i *nsIndex) DebugMemorySegments(opts DebugMemorySegmentsOptions) error {
  2316  	i.state.RLock()
  2317  	defer i.state.RUnlock()
  2318  	if i.state.closed {
  2319  		return errDbIndexAlreadyClosed
  2320  	}
  2321  
  2322  	ctx := context.NewBackground()
  2323  	defer ctx.Close()
  2324  
  2325  	// Create a new set of file system options to output to new directory.
  2326  	fsOpts := i.opts.CommitLogOptions().
  2327  		FilesystemOptions().
  2328  		SetFilePathPrefix(opts.OutputDirectory)
  2329  
  2330  	for _, block := range i.state.blocksByTime {
  2331  		segmentsData, err := block.MemorySegmentsData(ctx)
  2332  		if err != nil {
  2333  			return err
  2334  		}
  2335  
  2336  		for numSegment, segmentData := range segmentsData {
  2337  			indexWriter, err := fs.NewIndexWriter(fsOpts)
  2338  			if err != nil {
  2339  				return err
  2340  			}
  2341  
  2342  			fileSetID := fs.FileSetFileIdentifier{
  2343  				FileSetContentType: persist.FileSetIndexContentType,
  2344  				Namespace:          i.nsMetadata.ID(),
  2345  				BlockStart:         block.StartTime(),
  2346  				VolumeIndex:        numSegment,
  2347  			}
  2348  			openOpts := fs.IndexWriterOpenOptions{
  2349  				Identifier:      fileSetID,
  2350  				BlockSize:       i.blockSize,
  2351  				FileSetType:     persist.FileSetFlushType,
  2352  				Shards:          i.state.shardsAssigned,
  2353  				IndexVolumeType: idxpersist.DefaultIndexVolumeType,
  2354  			}
  2355  			if err := indexWriter.Open(openOpts); err != nil {
  2356  				return err
  2357  			}
  2358  
  2359  			segWriter, err := idxpersist.NewFSTSegmentDataFileSetWriter(segmentData)
  2360  			if err != nil {
  2361  				return err
  2362  			}
  2363  
  2364  			if err := indexWriter.WriteSegmentFileSet(segWriter); err != nil {
  2365  				return err
  2366  			}
  2367  
  2368  			if err := indexWriter.Close(); err != nil {
  2369  				return err
  2370  			}
  2371  		}
  2372  	}
  2373  
  2374  	return nil
  2375  }
  2376  
// Close marks the index as closed, stops the insert queue, closes the runtime
// options listeners, waits for inflight queries to finish and then closes all
// tracked blocks plus the active block. It returns errDbIndexAlreadyClosed if
// the index was already closed, otherwise the combined error from stopping the
// insert queue and closing the blocks.
func (i *nsIndex) Close() error {
	i.state.Lock()
	// The helper only reads i.state.closed, so it is called here with the
	// write lock held.
	if !i.isOpenWithRLock() {
		i.state.Unlock()
		return errDbIndexAlreadyClosed
	}

	i.state.closed = true
	close(i.state.closeCh)

	var multiErr xerrors.MultiError
	multiErr = multiErr.Add(i.state.insertQueue.Stop())

	// Collect the blocks to close before the references to them are dropped
	// from the index state below.
	blocks := make([]index.Block, 0, len(i.state.blocksByTime)+1)
	for _, block := range i.state.blocksByTime {
		blocks = append(blocks, block)
	}
	blocks = append(blocks, i.activeBlock)

	i.activeBlock = nil
	i.state.latestBlock = nil
	i.state.blocksByTime = nil
	i.state.blocksDescOrderImmutable = nil

	if i.runtimeOptsListener != nil {
		i.runtimeOptsListener.Close()
		i.runtimeOptsListener = nil
	}

	if i.runtimeNsOptsListener != nil {
		i.runtimeNsOptsListener.Close()
		i.runtimeNsOptsListener = nil
	}

	// Can now unlock after collecting blocks to close and setting closed state.
	i.state.Unlock()

	// Wait for inflight queries to finish before closing blocks, do this
	// outside of lock in case an inflight query needs to acquire a read lock
	// to finish but can't acquire it because close was holding the lock waiting
	// for queries to drain first.
	i.queriesWg.Wait()

	for _, block := range blocks {
		multiErr = multiErr.Add(block.Close())
	}

	return multiErr.FinalError()
}
  2426  
  2427  func (i *nsIndex) unableToAllocBlockInvariantError(err error) error {
  2428  	ierr := fmt.Errorf("index unable to allocate block: %v", err)
  2429  	instrument.EmitAndLogInvariantViolation(i.opts.InstrumentOptions(), func(l *zap.Logger) {
  2430  		l.Error(ierr.Error())
  2431  	})
  2432  	return ierr
  2433  }
  2434  
// nsIndexMetrics groups the metrics owned by a namespace index. Every field
// is created by newNamespaceIndexMetrics; the metric names and tags quoted
// below are the ones registered there.
type nsIndexMetrics struct {
	// tick is the "index-tick" counter.
	tick tally.Counter

	// The asyncInsertAttempt* counters share the "index-attempt" name and are
	// distinguished by a "stage" tag of "process", "skip" and "write"
	// respectively.
	asyncInsertAttemptTotal tally.Counter
	asyncInsertAttemptSkip  tally.Counter
	asyncInsertAttemptWrite tally.Counter

	// Insert, forward-index, flush and latest-block metrics:
	//   - asyncInsertSuccess is "index-success"; asyncInsertErrors is
	//     "index-error" tagged error_type=async-insert.
	//   - insertAfterClose is "insert-after-close" and queryAfterClose is
	//     "query-after-error" (note: not "query-after-close"), tagged
	//     error_type=insert-closed and error_type=query-closed respectively.
	//   - forwardIndex* share the "forward-index" name with a "status" tag of
	//     "hit", "miss" and "count".
	//   - blockMetrics lives under the "blocks" sub-scope.
	//   - indexingConcurrency{Min,Max,Avg} share the "indexing-concurrency"
	//     name with a "stat" tag of "min", "max" and "avg".
	//   - flushDocs{New,Cached} share the "flush-docs" name with a "status"
	//     tag of "new" and "cached".
	//   - latestBlockNum{Segments,Docs}{Foreground,Background} are the
	//     "latest-block-num-segments" / "latest-block-num-docs" gauges with a
	//     "segment_type" tag of "foreground" or "background".
	asyncInsertSuccess               tally.Counter
	asyncInsertErrors                tally.Counter
	insertAfterClose                 tally.Counter
	queryAfterClose                  tally.Counter
	forwardIndexHits                 tally.Counter
	forwardIndexMisses               tally.Counter
	forwardIndexCounter              tally.Counter
	insertEndToEndLatency            tally.Timer
	blocksEvictedMutableSegments     tally.Counter
	blockMetrics                     nsIndexBlocksMetrics
	indexingConcurrencyMin           tally.Gauge
	indexingConcurrencyMax           tally.Gauge
	indexingConcurrencyAvg           tally.Gauge
	flushIndexingConcurrency         tally.Gauge
	flushDocsNew                     tally.Counter
	flushDocsCached                  tally.Counter
	latestBlockNumSegmentsForeground tally.Gauge
	latestBlockNumDocsForeground     tally.Gauge
	latestBlockNumSegmentsBackground tally.Gauge
	latestBlockNumDocsBackground     tally.Gauge

	// Query metrics: loadedDocsPerQuery is the "loaded-docs-per-query"
	// histogram; the query* counters all share the "query" name and are
	// distinguished by their "exhaustive" ("true"/"false") and "result" tags.
	loadedDocsPerQuery                 tally.Histogram
	queryExhaustiveSuccess             tally.Counter
	queryExhaustiveInternalError       tally.Counter
	queryNonExhaustiveSuccess          tally.Counter
	queryNonExhaustiveInternalError    tally.Counter
	queryNonExhaustiveLimitError       tally.Counter
	queryNonExhaustiveSeriesLimitError tally.Counter
	queryNonExhaustiveDocsLimitError   tally.Counter
}
  2472  
  2473  func newNamespaceIndexMetrics(
  2474  	opts index.Options,
  2475  	iopts instrument.Options,
  2476  ) nsIndexMetrics {
  2477  	const (
  2478  		indexAttemptName         = "index-attempt"
  2479  		forwardIndexName         = "forward-index"
  2480  		indexingConcurrency      = "indexing-concurrency"
  2481  		flushIndexingConcurrency = "flush-indexing-concurrency"
  2482  	)
  2483  	scope := iopts.MetricsScope()
  2484  	blocksScope := scope.SubScope("blocks")
  2485  	m := nsIndexMetrics{
  2486  		tick: scope.Counter("index-tick"),
  2487  		asyncInsertAttemptTotal: scope.Tagged(map[string]string{
  2488  			"stage": "process",
  2489  		}).Counter(indexAttemptName),
  2490  		asyncInsertAttemptSkip: scope.Tagged(map[string]string{
  2491  			"stage": "skip",
  2492  		}).Counter(indexAttemptName),
  2493  		asyncInsertAttemptWrite: scope.Tagged(map[string]string{
  2494  			"stage": "write",
  2495  		}).Counter(indexAttemptName),
  2496  		asyncInsertSuccess: scope.Counter("index-success"),
  2497  		asyncInsertErrors: scope.Tagged(map[string]string{
  2498  			"error_type": "async-insert",
  2499  		}).Counter("index-error"),
  2500  		insertAfterClose: scope.Tagged(map[string]string{
  2501  			"error_type": "insert-closed",
  2502  		}).Counter("insert-after-close"),
  2503  		queryAfterClose: scope.Tagged(map[string]string{
  2504  			"error_type": "query-closed",
  2505  		}).Counter("query-after-error"),
  2506  		forwardIndexHits: scope.Tagged(map[string]string{
  2507  			"status": "hit",
  2508  		}).Counter(forwardIndexName),
  2509  		forwardIndexMisses: scope.Tagged(map[string]string{
  2510  			"status": "miss",
  2511  		}).Counter(forwardIndexName),
  2512  		forwardIndexCounter: scope.Tagged(map[string]string{
  2513  			"status": "count",
  2514  		}).Counter(forwardIndexName),
  2515  		insertEndToEndLatency: instrument.NewTimer(scope,
  2516  			"insert-end-to-end-latency", iopts.TimerOptions()),
  2517  		blocksEvictedMutableSegments: scope.Counter("blocks-evicted-mutable-segments"),
  2518  		blockMetrics:                 newNamespaceIndexBlocksMetrics(opts, blocksScope),
  2519  		indexingConcurrencyMin: scope.Tagged(map[string]string{
  2520  			"stat": "min",
  2521  		}).Gauge(indexingConcurrency),
  2522  		indexingConcurrencyMax: scope.Tagged(map[string]string{
  2523  			"stat": "max",
  2524  		}).Gauge(indexingConcurrency),
  2525  		indexingConcurrencyAvg: scope.Tagged(map[string]string{
  2526  			"stat": "avg",
  2527  		}).Gauge(indexingConcurrency),
  2528  		flushIndexingConcurrency: scope.Gauge(flushIndexingConcurrency),
  2529  		flushDocsNew: scope.Tagged(map[string]string{
  2530  			"status": "new",
  2531  		}).Counter("flush-docs"),
  2532  		flushDocsCached: scope.Tagged(map[string]string{
  2533  			"status": "cached",
  2534  		}).Counter("flush-docs"),
  2535  		latestBlockNumSegmentsForeground: scope.Tagged(map[string]string{
  2536  			"segment_type": "foreground",
  2537  		}).Gauge("latest-block-num-segments"),
  2538  		latestBlockNumDocsForeground: scope.Tagged(map[string]string{
  2539  			"segment_type": "foreground",
  2540  		}).Gauge("latest-block-num-docs"),
  2541  		latestBlockNumSegmentsBackground: scope.Tagged(map[string]string{
  2542  			"segment_type": "background",
  2543  		}).Gauge("latest-block-num-segments"),
  2544  		latestBlockNumDocsBackground: scope.Tagged(map[string]string{
  2545  			"segment_type": "background",
  2546  		}).Gauge("latest-block-num-docs"),
  2547  		loadedDocsPerQuery: scope.Histogram(
  2548  			"loaded-docs-per-query",
  2549  			tally.MustMakeExponentialValueBuckets(10, 2, 16),
  2550  		),
  2551  		queryExhaustiveSuccess: scope.Tagged(map[string]string{
  2552  			"exhaustive": "true",
  2553  			"result":     "success",
  2554  		}).Counter("query"),
  2555  		queryExhaustiveInternalError: scope.Tagged(map[string]string{
  2556  			"exhaustive": "true",
  2557  			"result":     "error_internal",
  2558  		}).Counter("query"),
  2559  		queryNonExhaustiveSuccess: scope.Tagged(map[string]string{
  2560  			"exhaustive": "false",
  2561  			"result":     "success",
  2562  		}).Counter("query"),
  2563  		queryNonExhaustiveInternalError: scope.Tagged(map[string]string{
  2564  			"exhaustive": "false",
  2565  			"result":     "error_internal",
  2566  		}).Counter("query"),
  2567  		queryNonExhaustiveLimitError: scope.Tagged(map[string]string{
  2568  			"exhaustive": "false",
  2569  			"result":     "error_require_exhaustive",
  2570  		}).Counter("query"),
  2571  		queryNonExhaustiveSeriesLimitError: scope.Tagged(map[string]string{
  2572  			"exhaustive": "false",
  2573  			"result":     "error_series_require_exhaustive",
  2574  		}).Counter("query"),
  2575  		queryNonExhaustiveDocsLimitError: scope.Tagged(map[string]string{
  2576  			"exhaustive": "false",
  2577  			"result":     "error_docs_require_exhaustive",
  2578  		}).Counter("query"),
  2579  	}
  2580  
  2581  	// Initialize gauges that should default to zero before
  2582  	// returning results so that they are exported with an
  2583  	// explicit zero value at process startup.
  2584  	m.flushIndexingConcurrency.Update(0)
  2585  
  2586  	return m
  2587  }
  2588  
// nsIndexBlocksMetrics holds the per segment-type block metrics. As built by
// newNamespaceIndexBlocksMetrics, each field is created on a scope tagged
// with "segment-type" set to "foreground", "background" and "flushed"
// respectively.
type nsIndexBlocksMetrics struct {
	ForegroundSegments nsIndexBlocksSegmentsMetrics
	BackgroundSegments nsIndexBlocksSegmentsMetrics
	FlushedSegments    nsIndexBlocksSegmentsMetrics
}
  2594  
  2595  func newNamespaceIndexBlocksMetrics(
  2596  	opts index.Options,
  2597  	scope tally.Scope,
  2598  ) nsIndexBlocksMetrics {
  2599  	return nsIndexBlocksMetrics{
  2600  		ForegroundSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2601  			opts.ForegroundCompactionPlannerOptions(),
  2602  			scope.Tagged(map[string]string{
  2603  				"segment-type": "foreground",
  2604  			})),
  2605  		BackgroundSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2606  			opts.BackgroundCompactionPlannerOptions(),
  2607  			scope.Tagged(map[string]string{
  2608  				"segment-type": "background",
  2609  			})),
  2610  		FlushedSegments: newNamespaceIndexBlocksSegmentsMetrics(
  2611  			opts.BackgroundCompactionPlannerOptions(),
  2612  			scope.Tagged(map[string]string{
  2613  				"segment-type": "flushed",
  2614  			})),
  2615  	}
  2616  }
  2617  
// nsIndexBlocksSegmentsMetrics holds the metrics for one type of segment,
// broken down by compaction level.
type nsIndexBlocksSegmentsMetrics struct {
	// Levels has one entry per compaction planner level, in the same order as
	// the planner options' Levels (see newNamespaceIndexBlocksSegmentsMetrics).
	Levels []nsIndexBlocksSegmentsLevelMetrics
}
  2621  
// nsIndexBlocksSegmentsLevelMetrics holds the metrics for a single compaction
// level together with the size bounds that define that level.
type nsIndexBlocksSegmentsLevelMetrics struct {
	// MinSizeInclusive and MaxSizeExclusive are copied from the compaction
	// level; they are also used as the "level-min-size" and "level-max-size"
	// tag values of the metrics below.
	MinSizeInclusive int64
	MaxSizeExclusive int64
	// NumSegments is the "num-segments" gauge.
	NumSegments      tally.Gauge
	// NumTotalDocs is the "num-total-docs" gauge.
	NumTotalDocs     tally.Gauge
	// SegmentsAge is the "segments-age" timer.
	SegmentsAge      tally.Timer
}
  2629  
  2630  func newNamespaceIndexBlocksSegmentsMetrics(
  2631  	compactionOpts compaction.PlannerOptions,
  2632  	scope tally.Scope,
  2633  ) nsIndexBlocksSegmentsMetrics {
  2634  	segmentLevelsScope := scope.SubScope("segment-levels")
  2635  	levels := make([]nsIndexBlocksSegmentsLevelMetrics, 0, len(compactionOpts.Levels))
  2636  	for _, level := range compactionOpts.Levels {
  2637  		subScope := segmentLevelsScope.Tagged(map[string]string{
  2638  			"level-min-size": strconv.Itoa(int(level.MinSizeInclusive)),
  2639  			"level-max-size": strconv.Itoa(int(level.MaxSizeExclusive)),
  2640  		})
  2641  		levels = append(levels, nsIndexBlocksSegmentsLevelMetrics{
  2642  			MinSizeInclusive: level.MinSizeInclusive,
  2643  			MaxSizeExclusive: level.MaxSizeExclusive,
  2644  			NumSegments:      subScope.Gauge("num-segments"),
  2645  			NumTotalDocs:     subScope.Gauge("num-total-docs"),
  2646  			SegmentsAge:      subScope.Timer("segments-age"),
  2647  		})
  2648  	}
  2649  
  2650  	return nsIndexBlocksSegmentsMetrics{
  2651  		Levels: levels,
  2652  	}
  2653  }
  2654  
  2655  type dbShards []databaseShard
  2656  
  2657  func (shards dbShards) IDs() []uint32 {
  2658  	ids := make([]uint32, 0, len(shards))
  2659  	for _, s := range shards {
  2660  		ids = append(ids, s.ID())
  2661  	}
  2662  	return ids
  2663  }
  2664  
// blocksIterStackAlloc is a stack allocated block iterator, ensuring no
// allocations per query. It is used by value: Next returns an advanced copy
// of the iterator rather than mutating the receiver's position.
type blocksIterStackAlloc struct {
	// activeBlock is the block returned by Current while idx is -1.
	activeBlock index.Block
	// blocks are the candidate blocks visited after the active block.
	blocks      []blockAndBlockStart
	// queryRanges are the ranges still to be queried; Next removes the range
	// of each block it yields.
	queryRanges xtime.Ranges
	// idx is the current position: it starts at -2 (before the first call to
	// Next), -1 selects activeBlock, and values >= 0 index into blocks.
	idx         int
}
  2673  
  2674  func newBlocksIterStackAlloc(
  2675  	activeBlock index.Block,
  2676  	blocks []blockAndBlockStart,
  2677  	queryRanges xtime.Ranges,
  2678  ) blocksIterStackAlloc {
  2679  	return blocksIterStackAlloc{
  2680  		activeBlock: activeBlock,
  2681  		blocks:      blocks,
  2682  		queryRanges: queryRanges,
  2683  		idx:         -2,
  2684  	}
  2685  }
  2686  
// Next advances the iterator and returns the advanced copy together with a
// flag reporting whether that copy is positioned on a block (retrievable via
// Current). The receiver is a value, so the caller must continue with the
// returned iterator.
//
// The first call always yields the active block. Subsequent calls yield, in
// slice order, each entry of blocks whose [StartTime, EndTime) range overlaps
// the remaining query ranges, and stop once the query ranges are empty or the
// blocks are exhausted.
func (i blocksIterStackAlloc) Next() (blocksIterStackAlloc, bool) {
	// Work on a copy; only its idx diverges from the receiver.
	iter := i

	for {
		iter.idx++
		if iter.idx == -1 {
			// This will return the active block.
			return iter, true
		}

		// No more ranges to query, perform this second so that
		// the in memory block always returns results.
		if i.queryRanges.IsEmpty() {
			return iter, false
		}

		// All blocks have been visited.
		if iter.idx >= len(i.blocks) {
			return iter, false
		}

		block := i.blocks[iter.idx].block

		// Ensure the block has data requested by the query.
		blockRange := xtime.Range{
			Start: block.StartTime(),
			End:   block.EndTime(),
		}
		if !i.queryRanges.Overlaps(blockRange) {
			continue
		}

		// Remove this range from the query range.
		// NOTE(review): the removal is made through the receiver's
		// queryRanges; this presumably relies on xtime.Ranges having
		// reference semantics so the returned copy observes it — confirm.
		i.queryRanges.RemoveRange(blockRange)

		return iter, true
	}
}
  2724  
  2725  func (i blocksIterStackAlloc) Current() index.Block {
  2726  	if i.idx == -1 {
  2727  		return i.activeBlock
  2728  	}
  2729  	return i.blocks[i.idx].block
  2730  }