github.com/m3db/m3@v1.5.0/src/dbnode/storage/bootstrap/bootstrapper/fs/source.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package fs

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/m3db/m3/src/dbnode/namespace"
	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/persist/fs"
	"github.com/m3db/m3/src/dbnode/persist/fs/migration"
	"github.com/m3db/m3/src/dbnode/retention"
	"github.com/m3db/m3/src/dbnode/storage/block"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper/fs/migrator"
	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
	"github.com/m3db/m3/src/dbnode/storage/index"
	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
	"github.com/m3db/m3/src/dbnode/storage/index/convert"
	"github.com/m3db/m3/src/dbnode/storage/series"
	"github.com/m3db/m3/src/dbnode/ts"
	"github.com/m3db/m3/src/m3ninx/doc"
	"github.com/m3db/m3/src/m3ninx/index/segment/fst"
	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
	"github.com/m3db/m3/src/x/checked"
	"github.com/m3db/m3/src/x/clock"
	"github.com/m3db/m3/src/x/context"
	"github.com/m3db/m3/src/x/ident"
	"github.com/m3db/m3/src/x/instrument"
	"github.com/m3db/m3/src/x/pool"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/opentracing/opentracing-go"
	opentracinglog "github.com/opentracing/opentracing-go/log"
	"github.com/uber-go/tally"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

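// runType distinguishes the two passes this source performs: a data run that
// loads series blocks and an index run that builds index segments.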
type runType int

const (
	bootstrapDataRunType runType = iota
	bootstrapIndexRunType
)

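// newDataFileSetReaderFn mirrors fs.NewReader's signature so an alternate
// reader constructor (e.g. a mock in tests) can be injected.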
type newDataFileSetReaderFn func(
	bytesPool pool.CheckedBytesPool,
	opts fs.Options,
) (fs.DataFileSetReader, error)

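// fileSystemSource bootstraps both data and index blocks from filesets
// persisted on local disk.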
type fileSystemSource struct {
	opts              Options
	fsopts            fs.Options
	log               *zap.Logger
	nowFn             clock.NowFn
	idPool            ident.Pool
	newReaderFn       newDataFileSetReaderFn
	newReaderPoolOpts bootstrapper.NewReaderPoolOptions
	metrics           fileSystemSourceMetrics
	instrumentation   *instrumentation
}

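// fileSystemSourceMetrics holds counters for persisted index blocks read,
// written, and skipped as out of retention.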
type fileSystemSourceMetrics struct {
	persistedIndexBlocksRead           tally.Counter
	persistedIndexBlocksWrite          tally.Counter
	persistedIndexBlocksOutOfRetention tally.Counter
}

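// newFileSystemSource constructs the filesystem bootstrap source, validating
// the provided options and scoping all metrics under "fs-bootstrapper".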
func newFileSystemSource(opts Options) (bootstrap.Source, error) {
	if err := opts.Validate(); err != nil {
		return nil, err
	}

	var (
		scope = opts.InstrumentOptions().MetricsScope().SubScope("fs-bootstrapper")
		iopts = opts.InstrumentOptions().SetMetricsScope(scope)
	)
	opts = opts.SetInstrumentOptions(iopts)

	s := &fileSystemSource{
		opts:        opts,
		fsopts:      opts.FilesystemOptions(),
		log:         iopts.Logger().With(zap.String("bootstrapper", "filesystem")),
		nowFn:       opts.ResultOptions().ClockOptions().NowFn(),
		idPool:      opts.IdentifierPool(),
		newReaderFn: fs.NewReader,
		metrics: fileSystemSourceMetrics{
			persistedIndexBlocksRead:           scope.Counter("persist-index-blocks-read"),
			persistedIndexBlocksWrite:          scope.Counter("persist-index-blocks-write"),
			persistedIndexBlocksOutOfRetention: scope.Counter("persist-index-blocks-out-of-retention"),
		},
		instrumentation: newInstrumentation(opts, scope, iopts),
	}
	s.newReaderPoolOpts.Alloc = s.newReader

	return s, nil
}

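// AvailableData returns the data shard time ranges that can be fulfilled from
// info files on disk.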
func (s *fileSystemSource) AvailableData(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
	_ bootstrap.RunOptions,
) (result.ShardTimeRanges, error) {
	return s.availability(md, shardTimeRanges, cache)
}

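// AvailableIndex returns the index shard time ranges that can be fulfilled
// from info files on disk.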
func (s *fileSystemSource) AvailableIndex(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
	_ bootstrap.RunOptions,
) (result.ShardTimeRanges, error) {
	return s.availability(md, shardTimeRanges, cache)
}

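// Read runs any configured filesystem migrations, then bootstraps data for
// every namespace before bootstrapping the index, so each phase's duration is
// visible separately.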
func (s *fileSystemSource) Read(
	ctx context.Context,
	namespaces bootstrap.Namespaces,
	cache bootstrap.Cache,
) (bootstrap.NamespaceResults, error) {
	instrCtx := s.instrumentation.fsBootstrapperSourceReadStarted(ctx)
	defer instrCtx.finish()

	results := bootstrap.NamespaceResults{
		Results: bootstrap.NewNamespaceResultsMap(bootstrap.NamespaceResultsMapOptions{}),
	}

	// Perform any necessary migrations but don't block the bootstrap process on
	// failure. Will update info file in-memory structures in place if migrations
	// have written new files to disk. This saves us from having to re-read
	// migrated info files.
	infoFilesByNamespace := cache.ReadInfoFiles()
	s.runMigrations(ctx, infoFilesByNamespace)

	// NB(r): Perform all data bootstrapping first then index bootstrapping
	// to more clearly delineate which process is slower than the other.
	instrCtx.bootstrapDataStarted()
	for _, elem := range namespaces.Namespaces.Iter() {
		namespace := elem.Value()
		md := namespace.Metadata

		r, err := s.read(bootstrapDataRunType, md, namespace.DataAccumulator,
			namespace.DataRunOptions.ShardTimeRanges,
			namespace.DataRunOptions.RunOptions, instrCtx.span, cache)
		if err != nil {
			return bootstrap.NamespaceResults{}, err
		}

		results.Results.Set(md.ID(), bootstrap.NamespaceResult{
			Metadata:   md,
			Shards:     namespace.Shards,
			DataResult: r.data,
		})
	}
	instrCtx.bootstrapDataCompleted()

	instrCtx.bootstrapIndexStarted()
	for _, elem := range namespaces.Namespaces.Iter() {
		namespace := elem.Value()
		md := namespace.Metadata
		if !md.Options().IndexOptions().Enabled() {
			// Not bootstrapping for index.
			s.log.Info("bootstrapping for namespace disabled by options",
				zap.String("ns", md.ID().String()))
			continue
		}

		r, err := s.read(bootstrapIndexRunType, md, namespace.DataAccumulator,
			namespace.IndexRunOptions.ShardTimeRanges,
			namespace.IndexRunOptions.RunOptions, instrCtx.span, cache)
		if err != nil {
			return bootstrap.NamespaceResults{}, err
		}

		result, ok := results.Results.Get(md.ID())
		if !ok {
			err = fmt.Errorf("missing expected result for namespace: %s",
				md.ID().String())
			return bootstrap.NamespaceResults{}, err
		}

		result.IndexResult = r.index
		results.Results.Set(md.ID(), result)
	}
	instrCtx.bootstrapIndexCompleted()
	return results, nil
}

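// runMigrations performs filesystem migrations (currently only the 1.1
// migration), logging rather than failing bootstrap on error.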
func (s *fileSystemSource) runMigrations(ctx context.Context, infoFilesByNamespace bootstrap.InfoFilesByNamespace) {
	// Only one migration for now, so just short-circuit entirely if not enabled.
	if s.opts.MigrationOptions().TargetMigrationVersion() != migration.MigrationVersion_1_1 {
		return
	}

	migrator, err := migrator.NewMigrator(migrator.NewOptions().
		SetMigrationTaskFn(migration.MigrationTask).
		SetInfoFilesByNamespace(infoFilesByNamespace).
		SetMigrationOptions(s.opts.MigrationOptions()).
		SetFilesystemOptions(s.fsopts).
		SetInstrumentOptions(s.opts.InstrumentOptions()).
		SetStorageOptions(s.opts.StorageOptions()))
	if err != nil {
		s.log.Error("error creating migrator. continuing bootstrap", zap.Error(err))
		return
	}

	// NB(nate): Handling of errors should be re-evaluated as migrations are added. Current migrations
	// do not mutate state in such a way that data can be left in an invalid state in the case of failures. Additionally,
	// we want to ensure that the bootstrap process is always able to continue. If either of these conditions change,
	// error handling at this level AND the individual migration task level should be reconsidered.
	//
	// One final note, as more migrations are introduced and the complexity is increased, we may want to consider adding
	// 1) a recovery mechanism to ensure that repeatable panics don't create a crash loop and
	// 2) state tracking to abort migration attempts after a certain number of consecutive failures.
	// For now, simply setting the target migration to "None" in config is enough to mitigate both of these cases.
	if err = migrator.Run(ctx); err != nil {
		s.log.Error("error performing migrations. continuing bootstrap", zap.Error(err))
	}
}

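// availability computes, shard by shard, the subset of the requested time
// ranges covered by on-disk info files.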
func (s *fileSystemSource) availability(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
) (result.ShardTimeRanges, error) {
	result := result.NewShardTimeRangesFromSize(shardTimeRanges.Len())
	for shard, ranges := range shardTimeRanges.Iter() {
		availabilities, err := s.shardAvailability(md, shard, ranges, cache)
		if err != nil {
			return nil, err
		}
		result.Set(shard, availabilities)
	}
	return result, nil
}

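// shardAvailability resolves a single shard's available ranges using info
// files from the bootstrap cache.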
func (s *fileSystemSource) shardAvailability(
	md namespace.Metadata,
	shard uint32,
	targetRangesForShard xtime.Ranges,
	cache bootstrap.Cache,
) (xtime.Ranges, error) {
	if targetRangesForShard.IsEmpty() {
		return xtime.NewRanges(), nil
	}
	readInfoFileResults, err := cache.InfoFilesForShard(md, shard)
	if err != nil {
		return nil, err
	}
	return s.shardAvailabilityWithInfoFiles(md.ID(), shard, targetRangesForShard, readInfoFileResults), nil
}

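// shardAvailabilityWithInfoFiles returns the union of fileset block ranges
// that overlap the target ranges for a shard, skipping info files that fail
// to read.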
func (s *fileSystemSource) shardAvailabilityWithInfoFiles(
	namespace ident.ID,
	shard uint32,
	targetRangesForShard xtime.Ranges,
	readInfoFilesResults []fs.ReadInfoFileResult,
) xtime.Ranges {
	tr := xtime.NewRanges()
	for _, result := range readInfoFilesResults {
		if err := result.Err.Error(); err != nil {
			s.log.Error("unable to read info files in shardAvailability",
				zap.Uint32("shard", shard),
				zap.Stringer("namespace", namespace),
				zap.Error(err),
				zap.Any("targetRangesForShard", targetRangesForShard),
				zap.String("filepath", result.Err.Filepath()),
			)
			continue
		}
		info := result.Info
		t := xtime.UnixNano(info.BlockStart)
		w := time.Duration(info.BlockSize)
		currRange := xtime.Range{Start: t, End: t.Add(w)}
		if targetRangesForShard.Overlaps(currRange) {
			tr.AddRange(currRange)
		}
	}
	return tr
}

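// bootstrapFromReaders drains the readers channel, resetting the shared index
// builder between time windows and loading each window's data into the run
// result.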
func (s *fileSystemSource) bootstrapFromReaders(
	run runType,
	ns namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	runOpts bootstrap.RunOptions,
	runResult *runResult,
	readerPool *bootstrapper.ReaderPool,
	readersCh <-chan bootstrapper.TimeWindowReaders,
	builder *result.IndexBuilder,
	persistManager *bootstrapper.SharedPersistManager,
	compactor *bootstrapper.SharedCompactor,
) {
	resultOpts := s.opts.ResultOptions()

	for timeWindowReaders := range readersCh {
		// NB(bodu): Since we are re-using the same builder for all bootstrapped index blocks,
		// it is not thread safe and requires a reset after every processed index block.
		builder.Builder().Reset()

		s.loadShardReadersDataIntoShardResult(run, ns, accumulator,
			runOpts, runResult, resultOpts, timeWindowReaders, readerPool,
			builder, persistManager, compactor)
	}
}

// markRunResultErrorsAndUnfulfilled checks the list of times that had errors and makes
// sure that we don't return any blocks or bloom filters for them. In addition,
// it looks at any remaining (unfulfilled) ranges and makes sure they're marked
// as unfulfilled.
func (s *fileSystemSource) markRunResultErrorsAndUnfulfilled(
	runResult *runResult,
	requestedRanges result.ShardTimeRanges,
	remainingRanges result.ShardTimeRanges,
	timesWithErrors []time.Time,
) {
	// NB(xichen): this is the exceptional case where we encountered errors due to files
	// being corrupted, which should be fairly rare so we can live with the overhead. We
	// experimented with adding the series to a temporary map and only adding the temporary map
	// to the final result, but adding series to a large map with string keys is expensive, and
	// the current implementation saves the extra overhead of merging the temporary map into
	// the final result.
	if len(timesWithErrors) > 0 {
		timesWithErrorsString := make([]string, len(timesWithErrors))
		for i := range timesWithErrors {
			timesWithErrorsString[i] = timesWithErrors[i].String()
		}
		s.log.Info("encountered errors for range",
			zap.String("requestedRanges", requestedRanges.SummaryString()),
			zap.Strings("timesWithErrors", timesWithErrorsString))
	}

	if !remainingRanges.IsEmpty() {
		runResult.Lock()
		for _, unfulfilled := range []result.ShardTimeRanges{
			runResult.data.Unfulfilled(),
			runResult.index.Unfulfilled(),
		} {
			unfulfilled.AddRanges(remainingRanges)
		}
		runResult.Unlock()
	}
}

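// loadShardReadersDataIntoShardResult processes one time window's readers: it
// reads every series entry per shard, recording data blocks or index metadata
// depending on the run type, and for index runs builds (and, when flushing,
// persists) an index segment for the window.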
func (s *fileSystemSource) loadShardReadersDataIntoShardResult(
	run runType,
	ns namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	runOpts bootstrap.RunOptions,
	runResult *runResult,
	ropts result.Options,
	timeWindowReaders bootstrapper.TimeWindowReaders,
	readerPool *bootstrapper.ReaderPool,
	builder *result.IndexBuilder,
	persistManager *bootstrapper.SharedPersistManager,
	compactor *bootstrapper.SharedCompactor,
) {
	var (
		blockPool            = ropts.DatabaseBlockOptions().DatabaseBlockPool()
		seriesCachePolicy    = ropts.SeriesCachePolicy()
		timesWithErrors      []time.Time
		nsCtx                = namespace.NewContextFrom(ns)
		metadataPool         = s.opts.IndexOptions().MetadataArrayPool()
		batch                = metadataPool.Get()
		totalEntries         int
		totalFulfilledRanges = result.NewShardTimeRanges()
	)
	defer metadataPool.Put(batch)

	requestedRanges := timeWindowReaders.Ranges
	remainingRanges := requestedRanges.Copy()
	shardReaders := timeWindowReaders.Readers
	defer func() {
		// Return readers to pool.
		for _, shardReaders := range shardReaders {
			for _, r := range shardReaders.Readers {
				if err := r.Close(); err == nil {
					readerPool.Put(r)
				}
			}
		}
	}()

	for shard, shardReaders := range shardReaders {
		shard := uint32(shard)
		readers := shardReaders.Readers

		for _, r := range readers {
			var (
				timeRange = r.Range()
				start     = timeRange.Start
				blockSize = ns.Options().RetentionOptions().BlockSize()
				err       error
			)
			switch run {
			case bootstrapDataRunType:
				// Pass, since nothing to do.
			case bootstrapIndexRunType:
				runResult.addIndexBlockIfNotExists(start, ns)
			default:
				// Unreachable unless an internal method calls with a run type cast from an int.
				panic(fmt.Errorf("invalid run type: %d", run))
			}

			numEntries := r.Entries()
			for i := 0; err == nil && i < numEntries; i++ {
				switch run {
				case bootstrapDataRunType:
					err = s.readNextEntryAndRecordBlock(nsCtx, accumulator, shard, r,
						runResult, start, blockSize, blockPool, seriesCachePolicy)
				case bootstrapIndexRunType:
					// We can just read the entry and index if performing an index run.
					batch, err = s.readNextEntryAndMaybeIndex(r, batch, builder)
					if err != nil {
						s.log.Error("readNextEntryAndMaybeIndex failed", zap.Error(err),
							zap.Time("timeRangeStart", timeRange.Start.ToTime()))
					}
					totalEntries++
				default:
					// Unreachable unless an internal method calls with a run type cast from an int.
					panic(fmt.Errorf("invalid run type: %d", run))
				}
			}
			// NB(bodu): Only flush if we've experienced no errors up to this point.
			if err == nil && len(batch) > 0 {
				batch, err = builder.FlushBatch(batch)
				if err != nil {
					s.log.Error("builder FlushBatch failed", zap.Error(err),
						zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				}
			}

			if err == nil {
				// Validate the read results.
				var validateErr error
				switch run {
				case bootstrapDataRunType:
					if seriesCachePolicy == series.CacheAll {
						validateErr = r.Validate()
					} else {
						err = fmt.Errorf("invalid series cache policy: %s", seriesCachePolicy.String())
					}
				case bootstrapIndexRunType:
					validateErr = r.ValidateMetadata()
				default:
					// Unreachable unless an internal method calls with a run type cast from an int.
					panic(fmt.Errorf("invalid run type: %d", run))
				}
				if validateErr != nil {
					err = fmt.Errorf("data validation failed: %v", validateErr)
				}
			}

			if err == nil && run == bootstrapIndexRunType {
				// Mark index block as fulfilled.
				fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange))
				runResult.Lock()
				err = runResult.index.IndexResults().MarkFulfilled(start, fulfilled,
					// NB(bodu): By default, we always load bootstrapped data into the default index volume.
					idxpersist.DefaultIndexVolumeType, ns.Options().IndexOptions())
				runResult.Unlock()
				if err != nil {
					s.log.Error("indexResults MarkFulfilled failed", zap.Error(err),
						zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				}
			}

			if err == nil {
				fulfilled := result.NewShardTimeRanges().Set(shard, xtime.NewRanges(timeRange))
				totalFulfilledRanges.AddRanges(fulfilled)
				remainingRanges.Subtract(fulfilled)
			} else {
				s.log.Error("unknown error", zap.Error(err),
					zap.Time("timeRangeStart", timeRange.Start.ToTime()))
				timesWithErrors = append(timesWithErrors, timeRange.Start.ToTime())
			}
		}
	}

	var (
		noneRemaining      = remainingRanges.IsEmpty()
		shouldBuildSegment = run == bootstrapIndexRunType &&
			// NB(r): Do not try to build a segment if no entries to index.
			totalEntries > 0 &&
			len(timesWithErrors) == 0
	)
	if shouldBuildSegment {
		var (
			indexBlockSize            = ns.Options().IndexOptions().BlockSize()
			retentionPeriod           = ns.Options().RetentionOptions().RetentionPeriod()
			beginningOfIndexRetention = retention.FlushTimeStartForRetentionPeriod(
				retentionPeriod, indexBlockSize, xtime.ToUnixNano(s.nowFn()))
			initialIndexRange = xtime.Range{
				Start: beginningOfIndexRetention,
				End:   beginningOfIndexRetention.Add(indexBlockSize),
			}
			overlapsWithInitialIndexRange = false
			min, max                      = requestedRanges.MinMax()
			blockStart                    = min.Truncate(indexBlockSize)
			blockEnd                      = blockStart.Add(indexBlockSize)
			iopts                         = s.opts.ResultOptions().InstrumentOptions()
			indexBlock                    result.IndexBlock
			err                           error
		)
		for _, remainingRange := range remainingRanges.Iter() {
			if remainingRange.Overlaps(initialIndexRange) {
				overlapsWithInitialIndexRange = true
			}
		}

		remainingMin, remainingMax := remainingRanges.MinMax()
		fulfilledMin, fulfilledMax := totalFulfilledRanges.MinMax()

		// NB(bodu): Assume if we're bootstrapping data from disk that it is the
		// "default" index volume type.
		runResult.Lock()
		existingIndexBlock, ok := bootstrapper.GetDefaultIndexBlockForBlockStart(
			runResult.index.IndexResults(), blockStart)
		runResult.Unlock()
		if !ok {
			err := fmt.Errorf("could not find index block in results: time=%s, ts=%d",
				blockStart.String(), blockStart)
			instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
				l.Error("index bootstrap failed",
					zap.Error(err),
					zap.Stringer("namespace", ns.ID()),
					zap.Stringer("requestedRanges", requestedRanges))
			})
		}

		// Determine whether we should flush data for this range.
		persistCfg := runOpts.PersistConfig()
		shouldFlush := persistCfg.Enabled &&
			persistCfg.FileSetType == persist.FileSetFlushType

		// Determine whether all requested ranges were fulfilled or we are at the edge of retention.
		satisfiedFlushRanges := noneRemaining || overlapsWithInitialIndexRange

		buildIndexLogFields := []zapcore.Field{
			zap.Stringer("namespace", ns.ID()),
			zap.Bool("shouldBuildSegment", shouldBuildSegment),
			zap.Bool("noneRemaining", noneRemaining),
			zap.Bool("overlapsWithInitialIndexRange", overlapsWithInitialIndexRange),
			zap.Int("totalEntries", totalEntries),
			zap.String("requestedRangesMinMax", fmt.Sprintf("%v - %v", min, max)),
			zap.String("remainingRangesMinMax", fmt.Sprintf("%v - %v", remainingMin, remainingMax)),
			zap.String("remainingRanges", remainingRanges.SummaryString()),
			zap.String("totalFulfilledRangesMinMax", fmt.Sprintf("%v - %v", fulfilledMin, fulfilledMax)),
			zap.String("totalFulfilledRanges", totalFulfilledRanges.SummaryString()),
			zap.String("initialIndexRange", fmt.Sprintf("%v - %v", initialIndexRange.Start, initialIndexRange.End)),
			zap.Bool("shouldFlush", shouldFlush),
			zap.Bool("satisfiedFlushRanges", satisfiedFlushRanges),
		}

		if shouldFlush && satisfiedFlushRanges {
			s.log.Debug("building file set index segment", buildIndexLogFields...)
			indexBlock, err = bootstrapper.PersistBootstrapIndexSegment(
				ns,
				requestedRanges,
				builder.Builder(),
				persistManager,
				s.opts.IndexClaimsManager(),
				s.opts.ResultOptions(),
				existingIndexBlock.Fulfilled(),
				blockStart,
				blockEnd,
			)
			if errors.Is(err, fs.ErrIndexOutOfRetention) {
				// Bail early if the index segment is already out of retention.
				// This can happen when the edge of requested ranges at the time of
				// the data bootstrap is now out of retention.
				s.log.Debug("skipping out of retention index segment", buildIndexLogFields...)
				s.metrics.persistedIndexBlocksOutOfRetention.Inc(1)
				return
			} else if err != nil {
				instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
					l.Error("persist fs index bootstrap failed",
						zap.Error(err),
						zap.Stringer("namespace", ns.ID()),
						zap.Stringer("requestedRanges", requestedRanges))
				})
			}
			// Track success.
			s.metrics.persistedIndexBlocksWrite.Inc(1)
		} else {
			s.log.Info("building in-memory index segment", buildIndexLogFields...)
			indexBlock, err = bootstrapper.BuildBootstrapIndexSegment(
				ns,
				requestedRanges,
				builder.Builder(),
				compactor,
				s.opts.ResultOptions(),
				s.opts.FilesystemOptions().MmapReporter(),
				blockStart,
				blockEnd,
			)
			if errors.Is(err, fs.ErrIndexOutOfRetention) {
				// Bail early if the index segment is already out of retention.
				// This can happen when the edge of requested ranges at the time of
				// the data bootstrap is now out of retention.
				s.log.Debug("skipping out of retention index segment", buildIndexLogFields...)
				s.metrics.persistedIndexBlocksOutOfRetention.Inc(1)
				return
			} else if err != nil {
				iopts := s.opts.ResultOptions().InstrumentOptions()
				instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
					l.Error("build fs index bootstrap failed",
						zap.Error(err),
						zap.Stringer("namespace", ns.ID()),
						zap.Stringer("requestedRanges", requestedRanges))
				})
			}
		}

		// Merge segments and fulfilled time ranges.
		segments := indexBlock.Segments()
		segments = append(segments, existingIndexBlock.Segments()...)
		newFulfilled := existingIndexBlock.Fulfilled().Copy()
		newFulfilled.AddRanges(indexBlock.Fulfilled())

		// Replace index block for default index volume type.
		runResult.Lock()
		runResult.index.IndexResults()[blockStart].
			SetBlock(idxpersist.DefaultIndexVolumeType, result.NewIndexBlock(segments, newFulfilled))
		runResult.Unlock()
	}

	s.markRunResultErrorsAndUnfulfilled(runResult, requestedRanges,
		remainingRanges, timesWithErrors)
}

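// readNextEntryAndRecordBlock reads a single series entry from the reader,
// wraps its data into a database block, and loads the block into the
// checked-out series as a warm write.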
func (s *fileSystemSource) readNextEntryAndRecordBlock(
	nsCtx namespace.Context,
	accumulator bootstrap.NamespaceDataAccumulator,
	shardID uint32,
	r fs.DataFileSetReader,
	runResult *runResult,
	blockStart xtime.UnixNano,
	blockSize time.Duration,
	blockPool block.DatabaseBlockPool,
	seriesCachePolicy series.CachePolicy,
) error {
	var (
		seriesBlock = blockPool.Get()
		id          ident.ID
		tagsIter    ident.TagIterator
		data        checked.Bytes
		err         error
	)

	defer func() {
		// Can finalize the ID and tags always.
		if id != nil {
			id.Finalize()
		}
		if tagsIter != nil {
			tagsIter.Close()
		}
	}()

	switch seriesCachePolicy {
	case series.CacheAll:
		id, tagsIter, data, _, err = r.Read()
	default:
		err = fmt.Errorf("invalid series cache policy: %s", seriesCachePolicy.String())
	}
	if err != nil {
		return fmt.Errorf("error reading data file: %v", err)
	}

	ref, owned, err := accumulator.CheckoutSeriesWithLock(shardID, id, tagsIter)
	if err != nil {
		if !owned {
			// Ignore if we no longer own the shard for this series.
			return nil
		}
		return fmt.Errorf("unable to checkout series: %v", err)
	}

	seg := ts.NewSegment(data, nil, 0, ts.FinalizeHead)
	seriesBlock.Reset(blockStart, blockSize, seg, nsCtx)

	seriesRef, err := ref.Resolver.SeriesRef()
	if err != nil {
		return fmt.Errorf("unable to resolve seriesRef: %w", err)
	}
	if err := seriesRef.LoadBlock(seriesBlock, series.WarmWrite); err != nil {
		return fmt.Errorf("unable to load block: %v", err)
	}

	return nil
}

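// readNextEntryAndMaybeIndex reads the next entry's metadata into the batch,
// flushing the batch to the index builder once it reaches capacity.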
func (s *fileSystemSource) readNextEntryAndMaybeIndex(
	r fs.DataFileSetReader,
	batch []doc.Metadata,
	builder *result.IndexBuilder,
) ([]doc.Metadata, error) {
	// If performing an index run, then simply read the metadata and add to the segment.
	entry, err := r.StreamingReadMetadata()
	if err != nil {
		return batch, err
	}

	d, err := convert.FromSeriesIDAndEncodedTags(entry.ID, entry.EncodedTags)
	if err != nil {
		return batch, err
	}

	batch = append(batch, d)

	if len(batch) >= index.MetadataArrayPoolCapacity {
		return builder.FlushBatch(batch)
	}

	return batch, nil
}

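// read bootstraps a single namespace for the given run type. Index runs first
// consume persisted index blocks; both run types then stream filesets through
// a pool of readers, with up to IndexSegmentConcurrency goroutines building
// segments concurrently.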
func (s *fileSystemSource) read(
	run runType,
	md namespace.Metadata,
	accumulator bootstrap.NamespaceDataAccumulator,
	shardTimeRanges result.ShardTimeRanges,
	runOpts bootstrap.RunOptions,
	span opentracing.Span,
	cache bootstrap.Cache,
) (*runResult, error) {
	var (
		seriesCachePolicy = s.opts.ResultOptions().SeriesCachePolicy()
		res               *runResult
	)
	if shardTimeRanges.IsEmpty() {
		return newRunResult(), nil
	}

	setOrMergeResult := func(newResult *runResult) {
		if newResult == nil {
			return
		}
		if res == nil {
			res = newResult
		} else {
			res = res.mergedResult(newResult)
		}
	}

	if run == bootstrapDataRunType {
		if seriesCachePolicy != series.CacheAll {
			// Unless we're caching all series (or all series metadata) in memory, we
			// return just the availability of the files we have.
			return s.bootstrapDataRunResultFromAvailability(md, shardTimeRanges, cache)
		}
	}

	logSpan := func(event string) {
		span.LogFields(
			opentracinglog.String("event", event),
			opentracinglog.String("nsID", md.ID().String()),
			opentracinglog.String("shardTimeRanges", shardTimeRanges.SummaryString()),
		)
	}
	if run == bootstrapIndexRunType {
		logSpan("bootstrap_from_index_persisted_blocks_start")
		// NB(r): First read all the FSTs and add to runResult index results,
		// subtract the shard + time ranges from what we intend to bootstrap
		// for those we found.
		r, err := s.bootstrapFromIndexPersistedBlocks(md,
			shardTimeRanges)
		if err != nil {
			s.log.Warn("filesystem bootstrapper failed to read persisted index blocks")
		} else {
			// We may have less left to read.
			shardTimeRanges = shardTimeRanges.Copy()
			shardTimeRanges.Subtract(r.fulfilled)
			// Set or merge result.
			setOrMergeResult(r.result)
		}
		logSpan("bootstrap_from_index_persisted_blocks_done")
	}

	// Create a reader pool once per bootstrap as we don't really want to
	// allocate and keep around readers outside of the bootstrapping process,
	// hence it is created on demand each time.
	readerPool := bootstrapper.NewReaderPool(s.newReaderPoolOpts)
	indexSegmentConcurrency := s.opts.IndexSegmentConcurrency()
	readersCh := make(chan bootstrapper.TimeWindowReaders, indexSegmentConcurrency)
	var blockSize time.Duration
	switch run {
	case bootstrapDataRunType:
		blockSize = md.Options().RetentionOptions().BlockSize()
	case bootstrapIndexRunType:
		blockSize = md.Options().IndexOptions().BlockSize()
	default:
		panic(fmt.Errorf("unrecognized run type: %d", run))
	}
	runtimeOpts := s.opts.RuntimeOptionsManager().Get()
	go bootstrapper.EnqueueReaders(bootstrapper.EnqueueReadersOptions{
		NsMD:            md,
		RunOpts:         runOpts,
		RuntimeOpts:     runtimeOpts,
		FsOpts:          s.fsopts,
		ShardTimeRanges: shardTimeRanges,
		ReaderPool:      readerPool,
		ReadersCh:       readersCh,
		BlockSize:       blockSize,
		// NB(bodu): We only read metadata when bootstrapping the index,
		// so we do not need a sorted data fileset reader.
		ReadMetadataOnly: run == bootstrapIndexRunType,
		Logger:           s.log,
		Span:             span,
		NowFn:            s.nowFn,
		Cache:            cache,
	})

	bootstrapFromReadersRunResult := newRunResult()

	var buildWg sync.WaitGroup
	for i := 0; i < indexSegmentConcurrency; i++ {
		alloc := s.opts.ResultOptions().IndexDocumentsBuilderAllocator()
		segBuilder, err := alloc()
		if err != nil {
			return nil, err
		}

		builder := result.NewIndexBuilder(segBuilder)

		indexOpts := s.opts.IndexOptions()
		compactor, err := compaction.NewCompactor(indexOpts.MetadataArrayPool(),
			index.MetadataArrayPoolCapacity,
			indexOpts.SegmentBuilderOptions(),
			indexOpts.FSTSegmentOptions(),
			compaction.CompactorOptions{
				FSTWriterOptions: &fst.WriterOptions{
					// DisableRegistry is set to true to trade a larger FST size
					// for a faster FST compaction since we want to reduce the
					// end-to-end latency for the time to first index a metric.
					DisableRegistry: true,
				},
			})
		if err != nil {
			return nil, err
		}

		persistManager, err := fs.NewPersistManager(s.opts.FilesystemOptions())
		if err != nil {
			return nil, err
		}

		buildWg.Add(1)
		go func() {
			s.bootstrapFromReaders(run, md,
				accumulator, runOpts, bootstrapFromReadersRunResult,
				readerPool, readersCh, builder,
				&bootstrapper.SharedPersistManager{Mgr: persistManager},
				&bootstrapper.SharedCompactor{Compactor: compactor})
			buildWg.Done()
		}()
	}

	buildWg.Wait()

	// Merge any existing results if necessary.
	setOrMergeResult(bootstrapFromReadersRunResult)

	return res, nil
}

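// newReader is the allocator used by the reader pool; it constructs data
// fileset readers backed by the source's bytes pool.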
func (s *fileSystemSource) newReader() (fs.DataFileSetReader, error) {
	bytesPool := s.opts.ResultOptions().DatabaseBlockOptions().BytesPool()
	return s.newReaderFn(bytesPool, s.fsopts)
}

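// bootstrapDataRunResultFromAvailability builds a data run result purely from
// fileset availability, marking anything not covered on disk as unfulfilled;
// no series data is read.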
func (s *fileSystemSource) bootstrapDataRunResultFromAvailability(
	md namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
	cache bootstrap.Cache,
) (*runResult, error) {
	// No locking required, all local to this fn until returned.
	runResult := newRunResult()
	unfulfilled := runResult.data.Unfulfilled()
	for shard, ranges := range shardTimeRanges.Iter() {
		if ranges.IsEmpty() {
			continue
		}
		infoFiles, err := cache.InfoFilesForShard(md, shard)
		if err != nil {
			return nil, err
		}
		availability := s.shardAvailabilityWithInfoFiles(md.ID(), shard, ranges, infoFiles)
		remaining := ranges.Clone()
		remaining.RemoveRanges(availability)
		if !remaining.IsEmpty() {
			unfulfilled.AddRanges(result.NewShardTimeRanges().Set(
				shard,
				remaining,
			))
		}
	}
	runResult.data.SetUnfulfilled(unfulfilled)
	return runResult, nil
}

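// bootstrapFromIndexPersistedBlocksResult carries the shard time ranges
// fulfilled by persisted index blocks alongside the partial run result.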
type bootstrapFromIndexPersistedBlocksResult struct {
	fulfilled result.ShardTimeRanges
	result    *runResult
}

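// bootstrapFromIndexPersistedBlocks loads previously flushed index segments
// from disk and records which shard time ranges they fulfill.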
func (s *fileSystemSource) bootstrapFromIndexPersistedBlocks(
	ns namespace.Metadata,
	shardTimeRanges result.ShardTimeRanges,
) (bootstrapFromIndexPersistedBlocksResult, error) {
	res := bootstrapFromIndexPersistedBlocksResult{
		fulfilled: result.NewShardTimeRanges(),
	}

	indexBlockSize := ns.Options().IndexOptions().BlockSize()
	infoFiles := fs.ReadIndexInfoFiles(fs.ReadIndexInfoFilesOptions{
		FilePathPrefix:   s.fsopts.FilePathPrefix(),
		Namespace:        ns.ID(),
		ReaderBufferSize: s.fsopts.InfoReaderBufferSize(),
	})

	for _, infoFile := range infoFiles {
		if err := infoFile.Err.Error(); err != nil {
			s.log.Error("unable to read index info file",
				zap.Stringer("namespace", ns.ID()),
				zap.Error(err),
				zap.Stringer("shardTimeRanges", shardTimeRanges),
				zap.String("filepath", infoFile.Err.Filepath()),
			)
			continue
		}

		info := infoFile.Info
		indexBlockStart := xtime.UnixNano(info.BlockStart)
		indexBlockRange := xtime.Range{
			Start: indexBlockStart,
			End:   indexBlockStart.Add(indexBlockSize),
		}
		willFulfill := result.NewShardTimeRanges()
		for _, shard := range info.Shards {
			tr, ok := shardTimeRanges.Get(shard)
			if !ok {
				// No ranges match for this shard.
				continue
			}
			if _, ok := willFulfill.Get(shard); !ok {
				willFulfill.Set(shard, xtime.NewRanges())
			}

			iter := tr.Iter()
			for iter.Next() {
				curr := iter.Value()
				intersection, intersects := curr.Intersect(indexBlockRange)
				if !intersects {
					continue
				}
				willFulfill.GetOrAdd(shard).AddRange(intersection)
			}
		}

		if willFulfill.IsEmpty() {
			// No matching shard/time ranges with this block.
			continue
		}

		fsOpts := s.fsopts
		verify := s.opts.IndexSegmentsVerify()
		if verify {
			// Make sure this call to read index segments validates the
			// index segments. If validation fails, the segment will be
			// rebuilt since it is missing from the fulfilled range.
			fsOpts = fsOpts.SetIndexReaderAutovalidateIndexSegments(true)
		}

		readResult, err := fs.ReadIndexSegments(fs.ReadIndexSegmentsOptions{
			ReaderOptions: fs.IndexReaderOpenOptions{
				Identifier:  infoFile.ID,
				FileSetType: persist.FileSetFlushType,
			},
			FilesystemOptions: fsOpts,
		})
		if err != nil {
			s.log.Error("unable to read segments from index fileset",
				zap.Stringer("namespace", ns.ID()),
				zap.Error(err),
				zap.Time("blockStart", indexBlockStart.ToTime()),
				zap.Int("volumeIndex", infoFile.ID.VolumeIndex),
			)
			continue
		}

		// Track success.
		s.metrics.persistedIndexBlocksRead.Inc(1)

		// Record result.
		if res.result == nil {
			res.result = newRunResult()
		}
		segmentsFulfilled := willFulfill
		// NB(bodu): All segments read from disk are already persisted.
		persistedSegments := make([]result.Segment, 0, len(readResult.Segments))
		for _, segment := range readResult.Segments {
			persistedSegments = append(persistedSegments, result.NewSegment(segment, true))
		}
		volumeType := idxpersist.DefaultIndexVolumeType
		if info.IndexVolumeType != nil {
			volumeType = idxpersist.IndexVolumeType(info.IndexVolumeType.Value)
		}
		indexBlockByVolumeType := result.NewIndexBlockByVolumeType(indexBlockStart)
		indexBlockByVolumeType.SetBlock(volumeType, result.NewIndexBlock(persistedSegments, segmentsFulfilled))
		// NB(r): Don't need to call MarkFulfilled on the IndexResults here
		// as we've already passed the fulfilled ranges to the block that
		// we place in the IndexResults with the call to Add(...).
		res.result.index.Add(indexBlockByVolumeType, nil)
		res.fulfilled.AddRanges(segmentsFulfilled)
	}

	return res, nil
}

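// runResult accumulates data and index bootstrap results; the embedded mutex
// guards concurrent updates from the concurrent segment-builder goroutines.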
type runResult struct {
	sync.RWMutex
	data  result.DataBootstrapResult
	index result.IndexBootstrapResult
}

func newRunResult() *runResult {
	return &runResult{
		data:  result.NewDataBootstrapResult(),
		index: result.NewIndexBootstrapResult(),
	}
}

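// addIndexBlockIfNotExists ensures an index block exists in the results for
// the given block start.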
func (r *runResult) addIndexBlockIfNotExists(
	start xtime.UnixNano,
	ns namespace.Metadata,
) {
	// Only called once per shard, so it is OK to acquire the write lock immediately.
	r.Lock()
	defer r.Unlock()

	idxOpts := ns.Options().IndexOptions()
	r.index.IndexResults().AddBlockIfNotExists(start, idxOpts)
}

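// mergedResult combines two run results into a new one, holding both locks
// for the duration of the merge.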
func (r *runResult) mergedResult(other *runResult) *runResult {
	r.Lock()
	defer r.Unlock()

	other.Lock()
	defer other.Unlock()

	return &runResult{
		data:  result.MergedDataBootstrapResult(r.data, other.data),
		index: result.MergedIndexBootstrapResult(r.index, other.index),
	}
}