github.com/m3db/m3@v1.5.0/src/dbnode/storage/bootstrap/bootstrapper/peers/source.go

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package peers
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"io"
    27  	"sync"
    28  	"time"
    29  
    30  	"github.com/opentracing/opentracing-go"
    31  	"go.uber.org/zap"
    32  	"go.uber.org/zap/zapcore"
    33  
    34  	"github.com/m3db/m3/src/cluster/shard"
    35  	"github.com/m3db/m3/src/dbnode/client"
    36  	"github.com/m3db/m3/src/dbnode/namespace"
    37  	"github.com/m3db/m3/src/dbnode/persist"
    38  	"github.com/m3db/m3/src/dbnode/persist/fs"
    39  	"github.com/m3db/m3/src/dbnode/storage/block"
    40  	"github.com/m3db/m3/src/dbnode/storage/bootstrap"
    41  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/bootstrapper"
    42  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    43  	"github.com/m3db/m3/src/dbnode/storage/index"
    44  	"github.com/m3db/m3/src/dbnode/storage/index/compaction"
    45  	"github.com/m3db/m3/src/dbnode/storage/index/convert"
    46  	"github.com/m3db/m3/src/dbnode/storage/series"
    47  	"github.com/m3db/m3/src/dbnode/topology"
    48  	"github.com/m3db/m3/src/m3ninx/doc"
    49  	"github.com/m3db/m3/src/m3ninx/index/segment/fst"
    50  	idxpersist "github.com/m3db/m3/src/m3ninx/persist"
    51  	"github.com/m3db/m3/src/x/context"
    52  	"github.com/m3db/m3/src/x/ident"
    53  	"github.com/m3db/m3/src/x/instrument"
    54  	xresource "github.com/m3db/m3/src/x/resource"
    55  	xsync "github.com/m3db/m3/src/x/sync"
    56  	xtime "github.com/m3db/m3/src/x/time"
    57  )
    58  
    59  var errNamespaceNotFound = errors.New("namespace not found")
    60  
    61  const readSeriesBlocksWorkerChannelSize = 512
    62  
    63  type peersSource struct {
    64  	opts              Options
    65  	newPersistManager func() (persist.Manager, error)
    66  	log               *zap.Logger
    67  	instrumentation   *instrumentation
    68  }
    69  
    70  type persistenceFlush struct {
    71  	nsMetadata  namespace.Metadata
    72  	shard       uint32
    73  	shardResult result.ShardResult
    74  	timeRange   xtime.Range
    75  }
    76  
    77  func newPeersSource(opts Options) (bootstrap.Source, error) {
    78  	if err := opts.Validate(); err != nil {
    79  		return nil, err
    80  	}
    81  
    82  	instrumentation := newInstrumentation(opts)
    83  	return &peersSource{
    84  		opts: opts,
    85  		newPersistManager: func() (persist.Manager, error) {
    86  			return fs.NewPersistManager(opts.FilesystemOptions())
    87  		},
    88  		log:             instrumentation.log,
    89  		instrumentation: instrumentation,
    90  	}, nil
    91  }
    92  
    93  type shardPeerAvailability struct {
    94  	numPeers          int
    95  	numAvailablePeers int
    96  }
    97  
    98  func (s *peersSource) AvailableData(
    99  	nsMetadata namespace.Metadata,
   100  	shardTimeRanges result.ShardTimeRanges,
   101  	_ bootstrap.Cache,
   102  	runOpts bootstrap.RunOptions,
   103  ) (result.ShardTimeRanges, error) {
   104  	if err := s.validateRunOpts(runOpts); err != nil {
   105  		return nil, err
   106  	}
   107  	return s.peerAvailability(nsMetadata, shardTimeRanges, runOpts)
   108  }
   109  
   110  func (s *peersSource) AvailableIndex(
   111  	nsMetadata namespace.Metadata,
   112  	shardTimeRanges result.ShardTimeRanges,
   113  	_ bootstrap.Cache,
   114  	runOpts bootstrap.RunOptions,
   115  ) (result.ShardTimeRanges, error) {
   116  	if err := s.validateRunOpts(runOpts); err != nil {
   117  		return nil, err
   118  	}
   119  	return s.peerAvailability(nsMetadata, shardTimeRanges, runOpts)
   120  }
   121  
   122  func (s *peersSource) Read(
   123  	ctx context.Context,
   124  	namespaces bootstrap.Namespaces,
   125  	cache bootstrap.Cache,
   126  ) (bootstrap.NamespaceResults, error) {
   127  	instrCtx := s.instrumentation.peersBootstrapperSourceReadStarted(ctx)
   128  	defer instrCtx.finish()
   129  
   130  	timeRangesEmpty := true
   131  	for _, elem := range namespaces.Namespaces.Iter() {
   132  		namespace := elem.Value()
   133  		dataRangesNotEmpty := !namespace.DataRunOptions.ShardTimeRanges.IsEmpty()
   134  
   135  		indexEnabled := namespace.Metadata.Options().IndexOptions().Enabled()
   136  		indexRangesNotEmpty := indexEnabled && !namespace.IndexRunOptions.ShardTimeRanges.IsEmpty()
   137  		if dataRangesNotEmpty || indexRangesNotEmpty {
   138  			timeRangesEmpty = false
   139  			break
   140  		}
   141  	}
   142  	if timeRangesEmpty {
   143  		// Return empty result with no unfulfilled ranges.
   144  		return bootstrap.NewNamespaceResults(namespaces), nil
   145  	}
   146  
   147  	results := bootstrap.NamespaceResults{
   148  		Results: bootstrap.NewNamespaceResultsMap(bootstrap.NamespaceResultsMapOptions{}),
   149  	}
   150  
   151  	// NB(r): Perform all data bootstrapping first then index bootstrapping
   152  	// to more clearly delineate which process is slower than the other.
   153  	instrCtx.bootstrapDataStarted()
   154  	for _, elem := range namespaces.Namespaces.Iter() {
   155  		namespace := elem.Value()
   156  		md := namespace.Metadata
   157  
   158  		r, err := s.readData(md, namespace.DataAccumulator,
   159  			namespace.DataRunOptions.ShardTimeRanges,
   160  			namespace.DataRunOptions.RunOptions)
   161  		if err != nil {
   162  			return bootstrap.NamespaceResults{}, err
   163  		}
   164  
   165  		results.Results.Set(md.ID(), bootstrap.NamespaceResult{
   166  			Metadata:   md,
   167  			Shards:     namespace.Shards,
   168  			DataResult: r,
   169  		})
   170  	}
   171  	instrCtx.bootstrapDataCompleted()
   172  	// NB(bodu): We need to evict the info file cache before reading index data since we
   173  	// may have fetched blocks from peers, making the cached info file state stale.
   174  	cache.Evict()
   175  
   176  	instrCtx.bootstrapIndexStarted()
   177  	for _, elem := range namespaces.Namespaces.Iter() {
   178  		namespace := elem.Value()
   179  		md := namespace.Metadata
   180  		if !md.Options().IndexOptions().Enabled() {
   181  			s.log.Info("skipping bootstrap for namespace based on options",
   182  				zap.Stringer("namespace", md.ID()))
   183  
   184  			// Not bootstrapping for index.
   185  			continue
   186  		}
   187  
   188  		var (
   189  			opts = namespace.IndexRunOptions.RunOptions
   190  			r    result.IndexBootstrapResult
   191  			err  error
   192  		)
   193  		if s.shouldPersist(opts) {
   194  			// Only attempt to bootstrap index if we've persisted tsdb data.
   195  			r, err = s.readIndex(md,
   196  				namespace.IndexRunOptions.ShardTimeRanges,
   197  				instrCtx.span,
   198  				cache,
   199  				opts,
   200  			)
   201  			if err != nil {
   202  				return bootstrap.NamespaceResults{}, err
   203  			}
   204  		} else {
   205  			// Copy data unfulfilled ranges over to index results since
   206  			// we did not persist any tsdb data (e.g. snapshot data).
   207  			dataNsResult, ok := results.Results.Get(md.ID())
   208  			if !ok {
   209  				return bootstrap.NamespaceResults{}, errNamespaceNotFound
   210  			}
   211  			r = result.NewIndexBootstrapResult()
   212  			r.SetUnfulfilled(dataNsResult.DataResult.Unfulfilled().Copy())
   213  		}
   214  
   215  		result, ok := results.Results.Get(md.ID())
   216  		if !ok {
   217  			err = fmt.Errorf("missing expected result for namespace: %s",
   218  				md.ID().String())
   219  			return bootstrap.NamespaceResults{}, err
   220  		}
   221  
   222  		result.IndexResult = r
   223  
   224  		results.Results.Set(md.ID(), result)
   225  	}
   226  	instrCtx.bootstrapIndexCompleted()
   227  
   228  	return results, nil
   229  }
   230  
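// A caller of Read typically inspects the per-namespace results along the
// lines of the sketch below. This is illustrative only; the real consumer is
// the bootstrap process and the variable names here are hypothetical.
//
//	results, err := src.Read(ctx, namespaces, cache)
//	if err != nil {
//		return err
//	}
//	for _, elem := range results.Results.Iter() {
//		nsResult := elem.Value()
//		// Ranges the peers could not satisfy are surfaced as unfulfilled and
//		// handed to the next bootstrapper in the chain.
//		_ = nsResult.DataResult.Unfulfilled()
//		if nsResult.Metadata.Options().IndexOptions().Enabled() {
//			_ = nsResult.IndexResult.Unfulfilled()
//		}
//	}
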
   231  func (s *peersSource) readData(
   232  	nsMetadata namespace.Metadata,
   233  	accumulator bootstrap.NamespaceDataAccumulator,
   234  	shardTimeRanges result.ShardTimeRanges,
   235  	opts bootstrap.RunOptions,
   236  ) (result.DataBootstrapResult, error) {
   237  	if err := s.validateRunOpts(opts); err != nil {
   238  		return nil, err
   239  	}
   240  
   241  	if shardTimeRanges.IsEmpty() {
   242  		return result.NewDataBootstrapResult(), nil
   243  	}
   244  
   245  	shouldPersist := s.shouldPersist(opts)
   246  	result := result.NewDataBootstrapResult()
   247  	session, err := s.opts.AdminClient().DefaultAdminSession()
   248  	if err != nil {
   249  		s.log.Error("peers bootstrapper cannot get default admin session", zap.Error(err))
   250  		result.SetUnfulfilled(shardTimeRanges)
   251  		return nil, err
   252  	}
   253  
   254  	var (
   255  		resultLock              sync.Mutex
   256  		persistenceMaxQueueSize = s.opts.PersistenceMaxQueueSize()
   257  		persistenceQueue        = make(chan persistenceFlush, persistenceMaxQueueSize)
   258  		resultOpts              = s.opts.ResultOptions()
   259  		count                   = shardTimeRanges.Len()
   260  		concurrency             = s.opts.DefaultShardConcurrency()
   261  		blockSize               = nsMetadata.Options().RetentionOptions().BlockSize()
   262  		persistWg               = &sync.WaitGroup{}
   263  		persistClosers          []io.Closer
   264  	)
   265  	if shouldPersist {
   266  		concurrency = s.opts.ShardPersistenceConcurrency()
   267  	}
   268  
   269  	instrCtx := s.instrumentation.bootstrapShardsStarted(nsMetadata.ID(), count, concurrency, shouldPersist)
   270  	defer instrCtx.bootstrapShardsCompleted()
   271  	if shouldPersist {
   272  		// Spin up persist workers.
   273  		for i := 0; i < s.opts.ShardPersistenceFlushConcurrency(); i++ {
   274  			closer, err := s.startPersistenceQueueWorkerLoop(opts,
   275  				persistWg, persistenceQueue, result, &resultLock)
   276  			if err != nil {
   277  				return nil, err
   278  			}
   279  
   280  			persistClosers = append(persistClosers, closer)
   281  		}
   282  	}
   283  
   284  	var (
   285  		wg      sync.WaitGroup
   286  		workers = xsync.NewWorkerPool(concurrency)
   287  	)
   288  	workers.Init()
   289  	for shard, ranges := range shardTimeRanges.Iter() {
   290  		shard, ranges := shard, ranges
   291  		wg.Add(1)
   292  		workers.Go(func() {
   293  			defer wg.Done()
   294  			s.fetchBootstrapBlocksFromPeers(shard, ranges, nsMetadata, session,
   295  				accumulator, resultOpts, result, &resultLock, shouldPersist,
   296  				persistenceQueue, blockSize)
   297  		})
   298  	}
   299  
   300  	wg.Wait()
   301  	close(persistenceQueue)
   302  	if shouldPersist {
   303  		// Wait for the persistenceQueue workers to finish flushing everything.
   304  		persistWg.Wait()
   305  
   306  		// Close any persist closers to finalize files written.
   307  		for _, closer := range persistClosers {
   308  			if err := closer.Close(); err != nil {
   309  				return nil, err
   310  			}
   311  		}
   312  	}
   313  
   314  	return result, nil
   315  }
   316  
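// The fan-out in readData follows a bounded producer/consumer shape: shard
// fetch workers produce persistenceFlush entries while a fixed number of
// persistence workers drain them, so no more than PersistenceMaxQueueSize
// flushes are queued at any time. A condensed, self-contained sketch of the
// same shape; the concurrency values and the fetchShard/doFlush helpers are
// hypothetical, not part of this file.
//
//	queue := make(chan persistenceFlush, maxQueueSize)
//	var persistWg sync.WaitGroup
//	for i := 0; i < flushConcurrency; i++ {
//		persistWg.Add(1)
//		go func() {
//			defer persistWg.Done()
//			for flush := range queue {
//				doFlush(flush) // one flush at a time per worker
//			}
//		}()
//	}
//
//	workers := xsync.NewWorkerPool(shardConcurrency)
//	workers.Init()
//	var fetchWg sync.WaitGroup
//	for shard, ranges := range shardTimeRanges.Iter() {
//		shard, ranges := shard, ranges
//		fetchWg.Add(1)
//		workers.Go(func() {
//			defer fetchWg.Done()
//			fetchShard(shard, ranges, queue) // may block when the queue is full
//		})
//	}
//	fetchWg.Wait()
//	close(queue)     // no more flushes will be produced
//	persistWg.Wait() // wait for the persistence workers to drain the queue
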
   317  func (s *peersSource) startPersistenceQueueWorkerLoop(
   318  	opts bootstrap.RunOptions,
   319  	persistWg *sync.WaitGroup,
   320  	persistenceQueue chan persistenceFlush,
   321  	bootstrapResult result.DataBootstrapResult,
   322  	lock *sync.Mutex,
   323  ) (io.Closer, error) {
   324  	persistMgr, err := s.newPersistManager()
   325  	if err != nil {
   326  		return nil, err
   327  	}
   328  
   329  	persistFlush, err := persistMgr.StartFlushPersist()
   330  	if err != nil {
   331  		return nil, err
   332  	}
   333  
   334  	persistWg.Add(1)
   335  	go func() {
   336  		defer persistWg.Done()
   337  		s.runPersistenceQueueWorkerLoop(opts, persistenceQueue,
   338  			persistFlush, bootstrapResult, lock)
   339  	}()
   340  
   341  	return xresource.CloserFn(persistFlush.DoneFlush), nil
   342  }
   343  
   344  // runPersistenceQueueWorkerLoop is meant to be run in its own goroutine. It creates a worker that
   345  // loops through the persistenceQueue and performs a flush for each entry, ensuring that
   346  // no more than one flush per worker is ever happening at once. Once the persistenceQueue channel
   347  // is closed and the worker has completed flushing all the remaining entries, it returns so that
   348  // callers waiting on the persist WaitGroup unblock once everything has been flushed.
   349  func (s *peersSource) runPersistenceQueueWorkerLoop(
   350  	opts bootstrap.RunOptions,
   351  	persistenceQueue chan persistenceFlush,
   352  	persistFlush persist.FlushPreparer,
   353  	bootstrapResult result.DataBootstrapResult,
   354  	lock *sync.Mutex,
   355  ) {
   356  	// Track async cleanup tasks.
   357  	asyncTasks := &sync.WaitGroup{}
   358  
   359  	// Wait for cleanups to all occur before returning from worker.
   360  	defer asyncTasks.Wait()
   361  
   362  	// If performing a bootstrap with persistence enabled then flush one
   363  	// at a time as shard results are gathered.
   364  	for flush := range persistenceQueue {
   365  		err := s.flush(opts, persistFlush, flush.nsMetadata, flush.shard,
   366  			flush.shardResult, flush.timeRange, asyncTasks)
   367  		if err == nil {
   368  			continue
   369  		}
   370  
   371  		// Remove results and make unfulfilled if an error occurred.
   372  		s.log.Error("peers bootstrapper bootstrap with persistence flush encountered error",
   373  			zap.Error(err))
   374  
   375  		// Make unfulfilled.
   376  		lock.Lock()
   377  		unfulfilled := bootstrapResult.Unfulfilled().Copy()
   378  		unfulfilled.AddRanges(result.NewShardTimeRanges().Set(
   379  			flush.shard,
   380  			xtime.NewRanges(flush.timeRange),
   381  		))
   382  		bootstrapResult.SetUnfulfilled(unfulfilled)
   383  		lock.Unlock()
   384  	}
   385  }
   386  
   387  type seriesBlocks struct {
   388  	resolver bootstrap.SeriesRefResolver
   389  	blocks   block.DatabaseSeriesBlocks
   390  }
   391  
   392  // fetchBootstrapBlocksFromPeers loops through all the provided ranges for a given shard and
   393  // fetches all the bootstrap blocks from the appropriate peers.
   394  // 		Persistence disabled case: Immediately load the fetched blocks into the checked-out series.
   395  // 		Persistence enabled case: Don't load the results yet, but push a flush into the
   396  // 						  persistenceQueue. A persistenceQueue worker will eventually
   397  // 						  flush the results to disk, marking any failed ranges unfulfilled.
   398  func (s *peersSource) fetchBootstrapBlocksFromPeers(
   399  	shard uint32,
   400  	ranges xtime.Ranges,
   401  	nsMetadata namespace.Metadata,
   402  	session client.AdminSession,
   403  	accumulator bootstrap.NamespaceDataAccumulator,
   404  	bopts result.Options,
   405  	bootstrapResult result.DataBootstrapResult,
   406  	lock *sync.Mutex,
   407  	shouldPersist bool,
   408  	persistenceQueue chan persistenceFlush,
   409  	blockSize time.Duration,
   410  ) {
   411  	it := ranges.Iter()
   412  	tagsIter := ident.NewTagsIterator(ident.Tags{})
   413  	unfulfill := func(r xtime.Range) {
   414  		lock.Lock()
   415  		unfulfilled := bootstrapResult.Unfulfilled()
   416  		unfulfilled.AddRanges(result.NewShardTimeRanges().Set(shard, xtime.NewRanges(r)))
   417  		lock.Unlock()
   418  	}
   419  	for it.Next() {
   420  		currRange := it.Value()
   421  
   422  		for blockStart := currRange.Start; blockStart.Before(currRange.End); blockStart = blockStart.Add(blockSize) {
   423  			blockEnd := blockStart.Add(blockSize)
   424  			shardResult, err := session.FetchBootstrapBlocksFromPeers(
   425  				nsMetadata, shard, blockStart, blockEnd, bopts)
   426  			s.logFetchBootstrapBlocksFromPeersOutcome(shard, shardResult, err)
   427  
   428  			if err != nil {
   429  				// No result to add for this bootstrap.
   430  				unfulfill(currRange)
   431  				continue
   432  			}
   433  
   434  			if shouldPersist {
   435  				persistenceQueue <- persistenceFlush{
   436  					nsMetadata:  nsMetadata,
   437  					shard:       shard,
   438  					shardResult: shardResult,
   439  					timeRange:   xtime.Range{Start: blockStart, End: blockEnd},
   440  				}
   441  				continue
   442  			}
   443  
   444  			dataCh := make(chan seriesBlocks, readSeriesBlocksWorkerChannelSize)
   445  			go func() {
   446  				defer close(dataCh)
   447  				for _, elem := range shardResult.AllSeries().Iter() {
   448  					entry := elem.Value()
   449  					tagsIter.Reset(entry.Tags)
   450  					ref, owned, err := accumulator.CheckoutSeriesWithLock(shard, entry.ID, tagsIter)
   451  					if err != nil {
   452  						if !owned {
   453  						// Only if we own this shard do we consider this an
   454  						// error in bootstrapping.
   455  							continue
   456  						}
   457  						unfulfill(currRange)
   458  						s.log.Error("could not checkout series", zap.Error(err))
   459  						continue
   460  					}
   461  
   462  					dataCh <- seriesBlocks{
   463  						resolver: ref.Resolver,
   464  						blocks:   entry.Blocks,
   465  					}
   466  
   467  					// Safe to finalize these IDs and Tags, shard result no longer used.
   468  					entry.ID.Finalize()
   469  					entry.Tags.Finalize()
   470  				}
   471  			}()
   472  
   473  			for seriesBlocks := range dataCh {
   474  				seriesRef, err := seriesBlocks.resolver.SeriesRef()
   475  				if err != nil {
   476  					s.log.Error("could not resolve seriesRef", zap.Error(err))
   477  					unfulfill(currRange)
   478  					continue
   479  				}
   480  
   481  				for _, bl := range seriesBlocks.blocks.AllBlocks() {
   482  					if err := seriesRef.LoadBlock(bl, series.WarmWrite); err != nil {
   483  						unfulfill(currRange)
   484  						s.log.Error("could not load series block", zap.Error(err))
   485  					}
   486  				}
   487  			}
   488  		}
   489  	}
   490  }
   491  
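// When persistence is disabled, the loop above streams fetched series through
// a small pipeline: a producer goroutine checks series out of the accumulator
// and sends resolver/blocks pairs on a bounded channel, while the calling
// goroutine resolves each reference and loads its blocks as warm writes.
// Closing the channel from the producer is what terminates the consumer's
// range loop. A simplified sketch of that shape; the checkout step is elided
// and ref is hypothetical here.
//
//	dataCh := make(chan seriesBlocks, readSeriesBlocksWorkerChannelSize)
//	go func() {
//		defer close(dataCh) // signals the consumer that all series were sent
//		for _, elem := range shardResult.AllSeries().Iter() {
//			entry := elem.Value()
//			// Check the series out of the accumulator, then:
//			dataCh <- seriesBlocks{resolver: ref.Resolver, blocks: entry.Blocks}
//		}
//	}()
//	for sb := range dataCh {
//		seriesRef, err := sb.resolver.SeriesRef()
//		if err != nil {
//			continue // the enclosing range is marked unfulfilled
//		}
//		for _, bl := range sb.blocks.AllBlocks() {
//			_ = seriesRef.LoadBlock(bl, series.WarmWrite)
//		}
//	}
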
   492  func (s *peersSource) logFetchBootstrapBlocksFromPeersOutcome(
   493  	shard uint32,
   494  	shardResult result.ShardResult,
   495  	err error,
   496  ) {
   497  	if err != nil {
   498  		s.log.Error("error fetching bootstrap blocks",
   499  			zap.Uint32("shard", shard),
   500  			zap.Error(err),
   501  		)
   502  		return
   503  	}
   504  
   505  	shardBlockSeriesCounter := map[xtime.UnixNano]int64{}
   506  	for _, entry := range shardResult.AllSeries().Iter() { // nolint
   507  		series := entry.Value()
   508  		for blockStart := range series.Blocks.AllBlocks() {
   509  			shardBlockSeriesCounter[blockStart]++
   510  		}
   511  	}
   512  
   513  	for block, numSeries := range shardBlockSeriesCounter {
   514  		s.log.Info("peer bootstrapped shard",
   515  			zap.Uint32("shard", shard),
   516  			zap.Int64("numSeries", numSeries),
   517  			zap.Time("blockStart", block.ToTime()),
   518  		)
   519  	}
   520  }
   521  
   522  // flush is used to flush peer-bootstrapped shards to disk as they finish so
   523  // that we're not (necessarily) holding everything in memory at once.
   524  // flush starts by looping through every block in a timerange for
   525  // a given shard, and then subsequently looping through every series in that
   526  // shard/block and flushing it to disk. Depending on the series caching policy,
   527  // the series will either be held in memory, or removed from memory once
   528  // flushing has completed.
   529  // In addition, if the caching policy is not CacheAll, then
   530  // at the end we remove all the series objects from the shard result as well
   531  // (since all their corresponding blocks have been removed anyways) to prevent
   532  // a huge memory spike caused by adding lots of unused series to the Shard
   533  // object and then immediately evicting them in the next tick.
   534  func (s *peersSource) flush(
   535  	opts bootstrap.RunOptions,
   536  	flush persist.FlushPreparer,
   537  	nsMetadata namespace.Metadata,
   538  	shard uint32,
   539  	shardResult result.ShardResult,
   540  	tr xtime.Range,
   541  	asyncTasks *sync.WaitGroup,
   542  ) error {
   543  	persistConfig := opts.PersistConfig()
   544  	if persistConfig.FileSetType != persist.FileSetFlushType {
   545  		// Should never happen.
   546  		iOpts := s.opts.ResultOptions().InstrumentOptions()
   547  		instrument.EmitAndLogInvariantViolation(iOpts, func(l *zap.Logger) {
   548  			l.With(
   549  				zap.Stringer("namespace", nsMetadata.ID()),
   550  				zap.Any("filesetType", persistConfig.FileSetType),
   551  			).Error("error tried to persist data in peers bootstrapper with non-flush fileset type")
   552  		})
   553  		return instrument.InvariantErrorf(
   554  			"tried to flush with unexpected fileset type: %v", persistConfig.FileSetType)
   555  	}
   556  
   557  	seriesCachePolicy := s.opts.ResultOptions().SeriesCachePolicy()
   558  	if seriesCachePolicy == series.CacheAll {
   559  		// Should never happen.
   560  		iOpts := s.opts.ResultOptions().InstrumentOptions()
   561  		instrument.EmitAndLogInvariantViolation(iOpts, func(l *zap.Logger) {
   562  			l.With(
   563  				zap.Stringer("namespace", nsMetadata.ID()),
   564  				zap.Any("cachePolicy", seriesCachePolicy),
   565  			).Error("error tried to persist data in peers bootstrapper with invalid cache policy")
   566  		})
   567  		return instrument.InvariantErrorf(
   568  			"tried to persist data in peers bootstrapper with invalid cache policy: %v", seriesCachePolicy)
   569  	}
   570  
   571  	var (
   572  		ropts     = nsMetadata.Options().RetentionOptions()
   573  		blockSize = ropts.BlockSize()
   574  	)
   575  	for start := tr.Start; start.Before(tr.End); start = start.Add(blockSize) {
   576  		prepareOpts := persist.DataPrepareOptions{
   577  			NamespaceMetadata: nsMetadata,
   578  			FileSetType:       persistConfig.FileSetType,
   579  			Shard:             shard,
   580  			BlockStart:        start,
   581  			// When bootstrapping, the volume index will always be 0. However,
   582  			// if we want to be able to snapshot and flush while bootstrapping,
   583  			// this may not be the case, e.g. if a flush occurs before a
   584  			// bootstrap, then the bootstrap volume index will be >0. In order
   585  			// to support this, bootstrapping code will need to incorporate
   586  			// merging logic and flush version/volume index will need to be
   587  			// synchronized between processes.
   588  			VolumeIndex: 0,
   589  			// If we've peer bootstrapped this shard/block combination AND the fileset
   590  			// already exists on disk, then that means either:
   591  			// 1) The Filesystem bootstrapper was unable to bootstrap the fileset
   592  			//    files on disk, even though they have a checkpoint file. This
   593  			//    could either be the result of data corruption, or a
   594  			//    backwards-incompatible change to the file-format.
   595  			// 2) The Filesystem bootstrapper is not enabled, in which case it makes
   596  			//    complete sense to replace the fileset on disk with the one that
   597  			//    we just peer-bootstrapped because the operator has already made it
   598  			//    clear that they only want data to be returned if it came from peers
   599  			//    (they made this decision by turning off the Filesystem bootstrapper).
   600  			// 3) We have received a shard/block we previously owned. For example, when a
   601  			//    node was added to this replication group and was later removed.
   602  			//    Although we take writes while bootstrapping, we do not allow flushes
   603  			//    so it is safe to delete the on-disk data.
   604  			DeleteIfExists: true,
   605  		}
   606  		prepared, err := flush.PrepareData(prepareOpts)
   607  		if err != nil {
   608  			return err
   609  		}
   610  
   611  		var blockErr error
   612  		for _, entry := range shardResult.AllSeries().Iter() {
   613  			s := entry.Value()
   614  			bl, ok := s.Blocks.BlockAt(start)
   615  			if !ok {
   616  				continue
   617  			}
   618  
   619  			checksum, err := bl.Checksum()
   620  			if err != nil {
   621  				blockErr = err // Need to call prepared.Close, avoid return
   622  				break
   623  			}
   624  
   625  			// Discard and finalize the block.
   626  			segment := bl.Discard()
   627  
   628  			// Remove from map.
   629  			s.Blocks.RemoveBlockAt(start)
   630  
   631  			metadata := persist.NewMetadataFromIDAndTags(s.ID, s.Tags,
   632  				persist.MetadataOptions{})
   633  			err = prepared.Persist(metadata, segment, checksum)
   634  			if err != nil {
   635  				blockErr = err // Need to call prepared.Close, avoid return
   636  				break
   637  			}
   638  		}
   639  
   640  		// Always close before checking whether a block error occurred;
   641  		// avoid using a defer here as this needs to be done for each inner loop iteration.
   642  		err = prepared.Close()
   643  		if blockErr != nil {
   644  			// A block error is more interesting to bubble up than a close error
   645  			err = blockErr
   646  		}
   647  
   648  		if err != nil {
   649  			return err
   650  		}
   651  	}
   652  
   653  	// Perform cleanup asynchronously but allow the caller to wait on it.
   654  	// This allows progressing to the next flush faster.
   655  	asyncTasks.Add(1)
   656  	go func() {
   657  		defer asyncTasks.Done()
   658  
   659  		// Since we've persisted the data to disk, we don't want to keep all the series in the shard
   660  		// result. Otherwise if we leave them in, then they will all get loaded into the shard object,
   661  		// and then immediately evicted on the next tick which causes unnecessary memory pressure
   662  		// during peer bootstrapping.
   663  		numSeriesTriedToRemoveWithRemainingBlocks := 0
   664  		for _, entry := range shardResult.AllSeries().Iter() {
   665  			series := entry.Value()
   666  			numBlocksRemaining := len(series.Blocks.AllBlocks())
   667  			// Should never happen since we removed all the blocks in the previous loop and fetching
   668  			// bootstrap blocks should always be exclusive on the end side.
   669  			if numBlocksRemaining > 0 {
   670  				numSeriesTriedToRemoveWithRemainingBlocks++
   671  				continue
   672  			}
   673  
   674  			shardResult.RemoveSeries(series.ID)
   675  			series.Blocks.Close()
   676  			// Safe to finalize these IDs and Tags because the prepared object was the only other thing
   677  			// using them, and it has been closed.
   678  			series.ID.Finalize()
   679  			series.Tags.Finalize()
   680  		}
   681  		if numSeriesTriedToRemoveWithRemainingBlocks > 0 {
   682  			iOpts := s.opts.ResultOptions().InstrumentOptions()
   683  			instrument.EmitAndLogInvariantViolation(iOpts, func(l *zap.Logger) {
   684  				l.With(
   685  					zap.Int64("start", tr.Start.Seconds()),
   686  					zap.Int64("end", tr.End.Seconds()),
   687  					zap.Int("numTimes", numSeriesTriedToRemoveWithRemainingBlocks),
   688  				).Error("error tried to remove series that still has blocks")
   689  			})
   690  		}
   691  	}()
   692  
   693  	return nil
   694  }
   695  
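// The flush above follows the persist.FlushPreparer contract for each
// shard/block: prepare once, persist every series segment for that block, and
// always close the prepared writer even when a series-level error occurred
// (the block error then takes precedence over any close error). A condensed
// sketch of that lifecycle with error handling trimmed; the surrounding
// variables mirror the ones in flush and are otherwise hypothetical.
//
//	prepared, err := flush.PrepareData(persist.DataPrepareOptions{
//		NamespaceMetadata: nsMetadata,
//		FileSetType:       persist.FileSetFlushType,
//		Shard:             shard,
//		BlockStart:        start,
//		DeleteIfExists:    true,
//	})
//	if err != nil {
//		return err
//	}
//	for _, entry := range shardResult.AllSeries().Iter() {
//		s := entry.Value()
//		bl, ok := s.Blocks.BlockAt(start)
//		if !ok {
//			continue
//		}
//		checksum, _ := bl.Checksum()
//		segment := bl.Discard()
//		metadata := persist.NewMetadataFromIDAndTags(s.ID, s.Tags, persist.MetadataOptions{})
//		_ = prepared.Persist(metadata, segment, checksum)
//	}
//	return prepared.Close() // finalizes the fileset files on disk
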
   696  func (s *peersSource) readIndex(
   697  	ns namespace.Metadata,
   698  	shardTimeRanges result.ShardTimeRanges,
   699  	span opentracing.Span,
   700  	cache bootstrap.Cache,
   701  	opts bootstrap.RunOptions,
   702  ) (result.IndexBootstrapResult, error) {
   703  	if err := s.validateRunOpts(opts); err != nil {
   704  		return nil, err
   705  	}
   706  
   707  	// FOLLOWUP(r): Try to reuse any metadata fetched during the ReadData(...)
   708  	// call rather than going to the network again
   709  	r := result.NewIndexBootstrapResult()
   710  	if shardTimeRanges.IsEmpty() {
   711  		return r, nil
   712  	}
   713  
   714  	var (
   715  		count          = shardTimeRanges.Len()
   716  		indexBlockSize = ns.Options().IndexOptions().BlockSize()
   717  		runtimeOpts    = s.opts.RuntimeOptionsManager().Get()
   718  		fsOpts         = s.opts.FilesystemOptions()
   719  		idxOpts        = ns.Options().IndexOptions()
   720  		readerPool     = bootstrapper.NewReaderPool(bootstrapper.NewReaderPoolOptions{
   721  			Alloc: func() (fs.DataFileSetReader, error) {
   722  				bytesPool := s.opts.ResultOptions().DatabaseBlockOptions().BytesPool()
   723  				return fs.NewReader(bytesPool, fsOpts)
   724  			},
   725  		})
   726  		resultLock              = &sync.Mutex{}
   727  		indexSegmentConcurrency = s.opts.IndexSegmentConcurrency()
   728  		readersCh               = make(chan bootstrapper.TimeWindowReaders, indexSegmentConcurrency)
   729  	)
   730  	s.log.Info("peers bootstrapper bootstrapping index for ranges",
   731  		zap.Int("shards", count))
   732  
   733  	go bootstrapper.EnqueueReaders(bootstrapper.EnqueueReadersOptions{
   734  		NsMD:            ns,
   735  		RunOpts:         opts,
   736  		RuntimeOpts:     runtimeOpts,
   737  		FsOpts:          fsOpts,
   738  		ShardTimeRanges: shardTimeRanges,
   739  		ReaderPool:      readerPool,
   740  		ReadersCh:       readersCh,
   741  		BlockSize:       indexBlockSize,
   742  		// NB(bodu): We only read metadata when performing a peers bootstrap
   743  		// so we do not need to sort the data fileset reader.
   744  		ReadMetadataOnly: true,
   745  		Logger:           s.instrumentation.log,
   746  		Span:             span,
   747  		NowFn:            s.instrumentation.nowFn,
   748  		Cache:            cache,
   749  	})
   750  
   751  	var buildWg sync.WaitGroup
   752  	for i := 0; i < indexSegmentConcurrency; i++ {
   753  		alloc := s.opts.ResultOptions().IndexDocumentsBuilderAllocator()
   754  		segBuilder, err := alloc()
   755  		if err != nil {
   756  			return nil, err
   757  		}
   758  
   759  		builder := result.NewIndexBuilder(segBuilder)
   760  
   761  		indexOpts := s.opts.IndexOptions()
   762  		compactor, err := compaction.NewCompactor(indexOpts.MetadataArrayPool(),
   763  			index.MetadataArrayPoolCapacity,
   764  			indexOpts.SegmentBuilderOptions(),
   765  			indexOpts.FSTSegmentOptions(),
   766  			compaction.CompactorOptions{
   767  				FSTWriterOptions: &fst.WriterOptions{
   768  					// DisableRegistry is set to true to trade a larger FST size
   769  					// for a faster FST compaction since we want to reduce the end
   770  					// to end latency for time to first index a metric.
   771  					DisableRegistry: true,
   772  				},
   773  			})
   774  		if err != nil {
   775  			return nil, err
   776  		}
   777  
   778  		persistManager, err := s.newPersistManager()
   779  		if err != nil {
   780  			return nil, err
   781  		}
   782  
   783  		buildWg.Add(1)
   784  		go func() {
   785  			s.processReadersWorker(ns, r, readersCh, builder, readerPool, idxOpts,
   786  				&bootstrapper.SharedPersistManager{Mgr: persistManager},
   787  				&bootstrapper.SharedCompactor{Compactor: compactor},
   788  				resultLock)
   789  			buildWg.Done()
   790  		}()
   791  	}
   792  
   793  	buildWg.Wait()
   794  
   795  	return r, nil
   796  }
   797  
   798  func (s *peersSource) processReadersWorker(
   799  	ns namespace.Metadata,
   800  	r result.IndexBootstrapResult,
   801  	readersCh <-chan bootstrapper.TimeWindowReaders,
   802  	builder *result.IndexBuilder,
   803  	readerPool *bootstrapper.ReaderPool,
   804  	idxOpts namespace.IndexOptions,
   805  	persistManager *bootstrapper.SharedPersistManager,
   806  	compactor *bootstrapper.SharedCompactor,
   807  	resultLock *sync.Mutex,
   808  ) {
   809  	for timeWindowReaders := range readersCh {
   810  		// NB(bodu): We re-use the same builder for all bootstrapped index blocks. The builder
   811  		// is not thread safe and requires a reset after every processed index block.
   812  		builder.Builder().Reset()
   813  
   814  		// NB(bodu): This is fetching the data for all shards for a block of time.
   815  		remainingRanges, timesWithErrors := s.processReaders(
   816  			ns,
   817  			r,
   818  			builder,
   819  			timeWindowReaders,
   820  			readerPool,
   821  			idxOpts,
   822  			persistManager,
   823  			compactor,
   824  			resultLock,
   825  		)
   826  		s.markRunResultErrorsAndUnfulfilled(resultLock, r, timeWindowReaders.Ranges,
   827  			remainingRanges, timesWithErrors)
   828  	}
   829  }
   830  
   831  func (s *peersSource) processReaders(
   832  	ns namespace.Metadata,
   833  	r result.IndexBootstrapResult,
   834  	builder *result.IndexBuilder,
   835  	timeWindowReaders bootstrapper.TimeWindowReaders,
   836  	readerPool *bootstrapper.ReaderPool,
   837  	idxOpts namespace.IndexOptions,
   838  	persistManager *bootstrapper.SharedPersistManager,
   839  	compactor *bootstrapper.SharedCompactor,
   840  	resultLock *sync.Mutex,
   841  ) (result.ShardTimeRanges, []xtime.UnixNano) {
   842  	var (
   843  		metadataPool    = s.opts.IndexOptions().MetadataArrayPool()
   844  		batch           = metadataPool.Get()
   845  		timesWithErrors []xtime.UnixNano
   846  		totalEntries    int
   847  	)
   848  
   849  	defer func() {
   850  		metadataPool.Put(batch)
   851  		// Return readers to pool.
   852  		for _, shardReaders := range timeWindowReaders.Readers {
   853  			for _, r := range shardReaders.Readers {
   854  				if err := r.Close(); err == nil {
   855  					readerPool.Put(r)
   856  				}
   857  			}
   858  		}
   859  	}()
   860  
   861  	requestedRanges := timeWindowReaders.Ranges
   862  	remainingRanges := requestedRanges.Copy()
   863  	for shard, shardReaders := range timeWindowReaders.Readers {
   864  		shard := uint32(shard)
   865  		readers := shardReaders.Readers
   866  
   867  		for _, reader := range readers {
   868  			var (
   869  				timeRange = reader.Range()
   870  				start     = timeRange.Start
   871  				err       error
   872  			)
   873  
   874  			resultLock.Lock()
   875  			r.IndexResults().AddBlockIfNotExists(start, idxOpts)
   876  			resultLock.Unlock()
   877  			numEntries := reader.Entries()
   878  			for i := 0; err == nil && i < numEntries; i++ {
   879  				batch, err = s.readNextEntryAndMaybeIndex(reader, batch, builder)
   880  				totalEntries++
   881  			}
   882  
   883  			// NB(bodu): Only flush if we've experienced no errors up until this point.
   884  			if err == nil && len(batch) > 0 {
   885  				batch, err = builder.FlushBatch(batch)
   886  			}
   887  
   888  			// Validate the read results
   889  			if err == nil {
   890  				err = reader.ValidateMetadata()
   891  			}
   892  
   893  			if err == nil {
   894  				// Mark index block as fulfilled.
   895  				fulfilled := result.NewShardTimeRanges().Set(
   896  					shard,
   897  					xtime.NewRanges(timeRange),
   898  				)
   899  				resultLock.Lock()
   900  				err = r.IndexResults().MarkFulfilled(start, fulfilled,
   901  					// NB(bodu): By default, we always load bootstrapped data into the default index volume.
   902  					idxpersist.DefaultIndexVolumeType, idxOpts)
   903  				resultLock.Unlock()
   904  			}
   905  
   906  			if err == nil {
   907  				remainingRanges.Subtract(result.NewShardTimeRanges().Set(
   908  					shard,
   909  					xtime.NewRanges(timeRange),
   910  				))
   911  			} else {
   912  				s.log.Error("error processing readers", zap.Error(err),
   913  					zap.Time("timeRange.start", start.ToTime()))
   914  				timesWithErrors = append(timesWithErrors, timeRange.Start)
   915  			}
   916  		}
   917  	}
   918  	if totalEntries == 0 {
   919  		// NB(r): Do not try to build a segment if no entries to index.
   920  		return remainingRanges, timesWithErrors
   921  	}
   922  
   923  	// Only persist to disk if the requested ranges were completely fulfilled.
   924  	// Otherwise, this is the latest index segment and should only exist in memory.
   925  	var (
   926  		iopts          = s.opts.ResultOptions().InstrumentOptions()
   927  		shouldPersist  = remainingRanges.IsEmpty()
   928  		min, max       = requestedRanges.MinMax()
   929  		indexBlockSize = ns.Options().IndexOptions().BlockSize()
   930  		blockStart     = min.Truncate(indexBlockSize)
   931  		blockEnd       = blockStart.Add(indexBlockSize)
   932  		indexBlock     result.IndexBlock
   933  		err            error
   934  	)
   935  
   936  	// NB(bodu): Assume if we're bootstrapping data from disk that it is the "default" index volume type.
   937  	resultLock.Lock()
   938  	existingIndexBlock, ok := bootstrapper.GetDefaultIndexBlockForBlockStart(
   939  		r.IndexResults(), blockStart)
   940  	resultLock.Unlock()
   941  
   942  	if !ok {
   943  		err := fmt.Errorf("could not find index block in results: time=%s, ts=%d",
   944  			blockStart.String(), blockStart)
   945  		instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
   946  			l.Error("peers bootstrap failed",
   947  				zap.Error(err),
   948  				zap.Stringer("namespace", ns.ID()),
   949  				zap.Stringer("requestedRanges", requestedRanges))
   950  		})
   951  	}
   952  
   953  	buildIndexLogFields := []zapcore.Field{
   954  		zap.Bool("shouldPersist", shouldPersist),
   955  		zap.Int("totalEntries", totalEntries),
   956  		zap.String("requestedRanges", fmt.Sprintf("%v - %v", min, max)),
   957  		zap.String("timesWithErrors", fmt.Sprintf("%v", timesWithErrors)),
   958  		zap.String("remainingRanges", remainingRanges.SummaryString()),
   959  	}
   960  	if shouldPersist {
   961  		s.log.Debug("building file set index segment", buildIndexLogFields...)
   962  		indexBlock, err = bootstrapper.PersistBootstrapIndexSegment(
   963  			ns,
   964  			requestedRanges,
   965  			builder.Builder(),
   966  			persistManager,
   967  			s.opts.IndexClaimsManager(),
   968  			s.opts.ResultOptions(),
   969  			existingIndexBlock.Fulfilled(),
   970  			blockStart,
   971  			blockEnd,
   972  		)
   973  		if errors.Is(err, fs.ErrIndexOutOfRetention) {
   974  			// Bail early if the index segment is already out of retention.
   975  			// This can happen when the edge of requested ranges at time of data bootstrap
   976  			// is now out of retention.
   977  			s.instrumentation.outOfRetentionIndexSegmentSkipped(buildIndexLogFields)
   978  			return remainingRanges, timesWithErrors
   979  		} else if err != nil {
   980  			instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
   981  				l.Error("persist fs index bootstrap failed",
   982  					zap.Stringer("namespace", ns.ID()),
   983  					zap.Stringer("requestedRanges", requestedRanges),
   984  					zap.Error(err))
   985  			})
   986  		}
   987  	} else {
   988  		s.log.Info("building in-memory index segment", buildIndexLogFields...)
   989  		indexBlock, err = bootstrapper.BuildBootstrapIndexSegment(
   990  			ns,
   991  			requestedRanges,
   992  			builder.Builder(),
   993  			compactor,
   994  			s.opts.ResultOptions(),
   995  			s.opts.IndexOptions().MmapReporter(),
   996  			blockStart,
   997  			blockEnd,
   998  		)
   999  		if errors.Is(err, fs.ErrIndexOutOfRetention) {
  1000  			// Bail early if the index segment is already out of retention.
  1001  			// This can happen when the edge of requested ranges at time of data bootstrap
  1002  			// is now out of retention.
  1003  			s.instrumentation.outOfRetentionIndexSegmentSkipped(buildIndexLogFields)
  1004  			return remainingRanges, timesWithErrors
  1005  		} else if err != nil {
  1006  			instrument.EmitAndLogInvariantViolation(iopts, func(l *zap.Logger) {
  1007  				l.Error("build fs index bootstrap failed",
  1008  					zap.Stringer("namespace", ns.ID()),
  1009  					zap.Stringer("requestedRanges", requestedRanges),
  1010  					zap.Error(err))
  1011  			})
  1012  		}
  1013  	}
  1014  
  1015  	// Merge segments and fulfilled time ranges.
  1016  	segments := indexBlock.Segments()
  1017  	for _, seg := range existingIndexBlock.Segments() {
  1018  		segments = append(segments, seg)
  1019  	}
  1020  	newFulfilled := existingIndexBlock.Fulfilled().Copy()
  1021  	newFulfilled.AddRanges(indexBlock.Fulfilled())
  1022  
  1023  	// Replace index block for default index volume type.
  1024  	resultLock.Lock()
  1025  	r.IndexResults()[blockStart].
  1026  		SetBlock(idxpersist.DefaultIndexVolumeType, result.NewIndexBlock(segments, newFulfilled))
  1027  	resultLock.Unlock()
  1028  
  1029  	return remainingRanges, timesWithErrors
  1030  }
  1031  
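// The tail of processReaders chooses between a persisted and an in-memory
// index segment purely from remainingRanges. As a hypothetical worked example:
// with a 2h index block and requested ranges covering [00:00, 02:00) for
// shards 1-3, a read error on shard 3 leaves remainingRanges non-empty, so the
// segment is built in memory and shard 3's range stays unfulfilled; had all
// three shards been read cleanly, the segment would be persisted as a fileset
// instead. Condensed decision sketch:
//
//	shouldPersist := remainingRanges.IsEmpty()
//	if shouldPersist {
//		// bootstrapper.PersistBootstrapIndexSegment(...) writes an on-disk fileset segment.
//	} else {
//		// bootstrapper.BuildBootstrapIndexSegment(...) keeps the segment in memory only.
//	}
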
  1032  func (s *peersSource) readNextEntryAndMaybeIndex(
  1033  	r fs.DataFileSetReader,
  1034  	batch []doc.Metadata,
  1035  	builder *result.IndexBuilder,
  1036  ) ([]doc.Metadata, error) {
  1037  	// If performing index run, then simply read the metadata and add to segment.
  1038  	entry, err := r.StreamingReadMetadata()
  1039  	if err != nil {
  1040  		return batch, err
  1041  	}
  1042  
  1043  	d, err := convert.FromSeriesIDAndEncodedTags(entry.ID, entry.EncodedTags)
  1044  	if err != nil {
  1045  		return batch, err
  1046  	}
  1047  
  1048  	batch = append(batch, d)
  1049  
  1050  	if len(batch) >= index.MetadataArrayPoolCapacity {
  1051  		return builder.FlushBatch(batch)
  1052  	}
  1053  
  1054  	return batch, nil
  1055  }
  1056  
  1057  // markRunResultErrorsAndUnfulfilled checks the list of times that had errors and makes
  1058  // sure that we don't return any blocks or bloom filters for them. In addition,
  1059  // it looks at any remaining (unfulfilled) ranges and makes sure they're marked
  1060  // as unfulfilled.
  1061  func (s *peersSource) markRunResultErrorsAndUnfulfilled(
  1062  	resultLock *sync.Mutex,
  1063  	results result.IndexBootstrapResult,
  1064  	requestedRanges result.ShardTimeRanges,
  1065  	remainingRanges result.ShardTimeRanges,
  1066  	timesWithErrors []xtime.UnixNano,
  1067  ) {
  1068  	// NB(xichen): this is the exceptional case where we encountered errors due to files
  1069  	// being corrupted, which should be fairly rare so we can live with the overhead. We
  1070  	// experimented with adding the series to a temporary map and only adding the temporary map
  1071  	// to the final result but adding series to large map with string keys is expensive, and
  1072  	// the current implementation saves the extra overhead of merging temporary map with the
  1073  	// final result.
  1074  	if len(timesWithErrors) > 0 {
  1075  		timesWithErrorsString := make([]string, len(timesWithErrors))
  1076  		for i := range timesWithErrors {
  1077  			timesWithErrorsString[i] = timesWithErrors[i].String()
  1078  		}
  1079  		s.log.Info("encountered errors for range",
  1080  			zap.String("requestedRanges", remainingRanges.SummaryString()),
  1081  			zap.Strings("timesWithErrors", timesWithErrorsString))
  1082  	}
  1083  
  1084  	if !remainingRanges.IsEmpty() {
  1085  		resultLock.Lock()
  1086  		results.Unfulfilled().AddRanges(remainingRanges)
  1087  		resultLock.Unlock()
  1088  	}
  1089  }
  1090  
  1091  func (s *peersSource) peerAvailability(
  1092  	_ namespace.Metadata,
  1093  	shardTimeRanges result.ShardTimeRanges,
  1094  	runOpts bootstrap.RunOptions,
  1095  ) (result.ShardTimeRanges, error) {
  1096  	var (
  1097  		peerAvailabilityByShard = map[topology.ShardID]*shardPeerAvailability{}
  1098  		initialTopologyState    = runOpts.InitialTopologyState()
  1099  	)
  1100  
  1101  	for shardIDUint := range shardTimeRanges.Iter() {
  1102  		shardID := topology.ShardID(shardIDUint)
  1103  		shardPeers, ok := peerAvailabilityByShard[shardID]
  1104  		if !ok {
  1105  			shardPeers = &shardPeerAvailability{}
  1106  			peerAvailabilityByShard[shardID] = shardPeers
  1107  		}
  1108  		hostShardStates, ok := initialTopologyState.ShardStates[shardID]
  1109  		if !ok {
  1110  			// This shard was not part of the topology when the bootstrapping
  1111  			// process began.
  1112  			continue
  1113  		}
  1114  
  1115  		shardPeers.numPeers = len(hostShardStates)
  1116  		for _, hostShardState := range hostShardStates {
  1117  			if hostShardState.Host.ID() == initialTopologyState.Origin.ID() {
  1118  				// Don't take self into account
  1119  				continue
  1120  			}
  1121  
  1122  			shardState := hostShardState.ShardState
  1123  
  1124  			switch shardState {
  1125  			// Don't want to peer bootstrap from a node that has not yet completely
  1126  			// taken ownership of the shard.
  1127  			case shard.Initializing:
  1128  				// Success cases - We can bootstrap from this host, which is enough to
  1129  				// mark this shard as bootstrappable.
  1130  			case shard.Leaving:
  1131  				fallthrough
  1132  			case shard.Available:
  1133  				shardPeers.numAvailablePeers++
  1134  			case shard.Unknown:
  1135  				fallthrough
  1136  			default:
  1137  				return nil, fmt.Errorf("unknown shard state: %v", shardState)
  1138  			}
  1139  		}
  1140  	}
  1141  
  1142  	var (
  1143  		runtimeOpts               = s.opts.RuntimeOptionsManager().Get()
  1144  		bootstrapConsistencyLevel = runtimeOpts.ClientBootstrapConsistencyLevel()
  1145  		majorityReplicas          = initialTopologyState.MajorityReplicas
  1146  		availableShardTimeRanges  = result.NewShardTimeRanges()
  1147  	)
  1148  	for shardIDUint := range shardTimeRanges.Iter() {
  1149  		var (
  1150  			shardID    = topology.ShardID(shardIDUint)
  1151  			shardPeers = peerAvailabilityByShard[shardID]
  1152  
  1153  			total     = shardPeers.numPeers
  1154  			available = shardPeers.numAvailablePeers
  1155  		)
  1156  
  1157  		if available == 0 {
  1158  			// Can't peer bootstrap if there are no available peers.
  1159  			s.log.Debug("0 available peers, unable to peer bootstrap",
  1160  				zap.Int("total", total),
  1161  				zap.Uint32("shard", shardIDUint))
  1162  			continue
  1163  		}
  1164  
  1165  		if !topology.ReadConsistencyAchieved(
  1166  			bootstrapConsistencyLevel, majorityReplicas, total, available) {
  1167  			s.log.Debug("read consistency not achieved, unable to peer bootstrap",
  1168  				zap.Any("level", bootstrapConsistencyLevel),
  1169  				zap.Int("replicas", majorityReplicas),
  1170  				zap.Int("total", total),
  1171  				zap.Int("available", available))
  1172  			continue
  1173  		}
  1174  
  1175  		// Optimistically assume that the peers will be able to provide
  1176  		// all the data. This assumption is safe, as the shard/block ranges
  1177  		// will simply be marked unfulfilled if the peers are not able to
  1178  		// satisfy the requests.
  1179  		if tr, ok := shardTimeRanges.Get(shardIDUint); ok {
  1180  			availableShardTimeRanges.Set(shardIDUint, tr)
  1181  		}
  1182  	}
  1183  
  1184  	return availableShardTimeRanges, nil
  1185  }
  1186  
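// As a concrete example of the availability check above, using hypothetical
// values: with a replication factor of 3 the majority is 2, so a shard whose
// remote peers are one Available node and one Initializing node has only one
// available peer and does not achieve majority read consistency; its time
// ranges are omitted from the returned set and left for other bootstrappers.
//
//	ok := topology.ReadConsistencyAchieved(
//		topology.ReadConsistencyLevelMajority,
//		2, // majority replicas for a replication factor of 3
//		3, // hosts that own the shard
//		1, // peers in the Available or Leaving state
//	)
//	// ok == false
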
  1187  func (s *peersSource) validateRunOpts(runOpts bootstrap.RunOptions) error {
  1188  	persistConfig := runOpts.PersistConfig()
  1189  	if persistConfig.FileSetType != persist.FileSetFlushType &&
  1190  		persistConfig.FileSetType != persist.FileSetSnapshotType {
  1191  		// Should never happen
  1192  		return fmt.Errorf("unknown persist config fileset file type: %v", persistConfig.FileSetType)
  1193  	}
  1194  
  1195  	return nil
  1196  }
  1197  
  1198  func (s *peersSource) shouldPersist(runOpts bootstrap.RunOptions) bool {
  1199  	persistConfig := runOpts.PersistConfig()
  1200  
  1201  	return persistConfig.Enabled &&
  1202  		persistConfig.FileSetType == persist.FileSetFlushType &&
  1203  		// TODO(bodu): We should migrate to series.CacheLRU only.
  1204  		s.opts.ResultOptions().SeriesCachePolicy() != series.CacheAll
  1205  }
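
// For reference, shouldPersist evaluates to true only for a run shaped like
// the hypothetical sketch below: persistence enabled, a flush (not snapshot)
// fileset type, and a series cache policy other than CacheAll.
//
//	runOpts := bootstrap.NewRunOptions().SetPersistConfig(bootstrap.PersistConfig{
//		Enabled:     true,
//		FileSetType: persist.FileSetFlushType,
//	})
//	// With ResultOptions().SeriesCachePolicy() == series.CacheLRU this run
//	// flushes peer-fetched blocks to disk and bootstraps the index from the
//	// flushed filesets; otherwise index unfulfilled ranges are copied from
//	// the data result instead.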