github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/repair.go

github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/repair.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package storage
    22  
    23  import (
    24  	"bytes"
    25  	"errors"
    26  	"fmt"
    27  	"math"
    28  	"strconv"
    29  	"sync"
    30  	"sync/atomic"
    31  	"time"
    32  
    33  	"github.com/m3db/m3/src/dbnode/client"
    34  	"github.com/m3db/m3/src/dbnode/namespace"
    35  	"github.com/m3db/m3/src/dbnode/retention"
    36  	"github.com/m3db/m3/src/dbnode/storage/block"
    37  	"github.com/m3db/m3/src/dbnode/storage/bootstrap/result"
    38  	"github.com/m3db/m3/src/dbnode/storage/repair"
    39  	"github.com/m3db/m3/src/dbnode/topology"
    40  	"github.com/m3db/m3/src/dbnode/x/xio"
    41  	"github.com/m3db/m3/src/x/clock"
    42  	"github.com/m3db/m3/src/x/context"
    43  	xerrors "github.com/m3db/m3/src/x/errors"
    44  	"github.com/m3db/m3/src/x/ident"
    45  	"github.com/m3db/m3/src/x/instrument"
    46  	xtime "github.com/m3db/m3/src/x/time"
    47  
    48  	"github.com/jhump/protoreflect/dynamic"
    49  	"github.com/uber-go/tally"
    50  	"go.uber.org/zap"
    51  )
    52  
    53  var (
    54  	errNoRepairOptions  = errors.New("no repair options")
    55  	errRepairInProgress = errors.New("repair already in progress")
    56  )
    57  
// recordFn is the signature of the callback a shardRepairer invokes with the
// result of a metadata comparison for a single namespace/shard. It is stored
// in shardRepairer.record so the recording step can be swapped out.
type recordFn func(
	origin topology.Host,
	namespace ident.ID,
	shard databaseShard,
	diffRes repair.MetadataComparisonResult,
)
    64  
// shardRepairer compares the block metadata a shard holds locally against the
// metadata reported by its peers and, unless configured to only compare, loads
// the blocks that mismatch back into the shard.
//
// TODO(rartoul): See if we can find a way to guard against too much metadata.
type shardRepairer struct {
	// opts are the database-wide options (clock, instrumentation, memory tracker).
	opts Options
	// rpopts are the repair-specific options (repair type, consistency level, ...).
	rpopts repair.Options
	// clients are the admin clients taken from rpopts.AdminClients(); one
	// session per client is opened on every Repair call.
	clients []client.AdminClient
	// record is called with every metadata comparison result.
	record recordFn
	// nowFn is the clock taken from opts.ClockOptions().
	nowFn clock.NowFn
	// logger is the logger taken from the instrument options.
	logger *zap.Logger
	// scope is the "repair" metrics sub-scope.
	scope tally.Scope
	// metrics holds the pre-created per-repair-type run counters.
	metrics shardRepairerMetrics
}
    76  
// shardRepairerMetrics holds the counters incremented once per call to
// shardRepairer.Repair, one per supported repair type.
type shardRepairerMetrics struct {
	// runDefault counts runs of the default repair type.
	runDefault tally.Counter
	// runOnlyCompare counts runs of the only-compare repair type.
	runOnlyCompare tally.Counter
}
    81  
    82  func newShardRepairerMetrics(scope tally.Scope) shardRepairerMetrics {
    83  	return shardRepairerMetrics{
    84  		runDefault: scope.Tagged(map[string]string{
    85  			"repair_type": "default",
    86  		}).Counter("run"),
    87  		runOnlyCompare: scope.Tagged(map[string]string{
    88  			"repair_type": "only_compare",
    89  		}).Counter("run"),
    90  	}
    91  }
    92  
    93  func newShardRepairer(opts Options, rpopts repair.Options) databaseShardRepairer {
    94  	iopts := opts.InstrumentOptions()
    95  	scope := iopts.MetricsScope().SubScope("repair")
    96  
    97  	r := shardRepairer{
    98  		opts:    opts,
    99  		rpopts:  rpopts,
   100  		clients: rpopts.AdminClients(),
   101  		nowFn:   opts.ClockOptions().NowFn(),
   102  		logger:  iopts.Logger(),
   103  		scope:   scope,
   104  		metrics: newShardRepairerMetrics(scope),
   105  	}
   106  	r.record = r.recordDifferences
   107  
   108  	return r
   109  }
   110  
// Options returns the repair options this shard repairer was constructed with.
func (r shardRepairer) Options() repair.Options {
	return r.rpopts
}
   114  
   115  func (r shardRepairer) Repair(
   116  	ctx context.Context,
   117  	nsCtx namespace.Context,
   118  	nsMeta namespace.Metadata,
   119  	tr xtime.Range,
   120  	shard databaseShard,
   121  ) (repair.MetadataComparisonResult, error) {
   122  	repairType := r.rpopts.Type()
   123  	switch repairType {
   124  	case repair.DefaultRepair:
   125  		defer r.metrics.runDefault.Inc(1)
   126  	case repair.OnlyCompareRepair:
   127  		defer r.metrics.runOnlyCompare.Inc(1)
   128  	default:
   129  		// Unknown repair type.
   130  		err := fmt.Errorf("unknown repair type: %v", repairType)
   131  		return repair.MetadataComparisonResult{}, err
   132  	}
   133  
   134  	var sessions []sessionAndTopo
   135  	for _, c := range r.clients {
   136  		session, err := c.DefaultAdminSession()
   137  		if err != nil {
   138  			fmtErr := fmt.Errorf("error obtaining default admin session: %v", err)
   139  			return repair.MetadataComparisonResult{}, fmtErr
   140  		}
   141  
   142  		topo, err := session.TopologyMap()
   143  		if err != nil {
   144  			fmtErr := fmt.Errorf("error obtaining topology map: %v", err)
   145  			return repair.MetadataComparisonResult{}, fmtErr
   146  		}
   147  
   148  		sessions = append(sessions, sessionAndTopo{
   149  			session: session,
   150  			topo:    topo,
   151  		})
   152  	}
   153  
   154  	var (
   155  		start = tr.Start
   156  		end   = tr.End
   157  		// Guaranteed to have at least one session and all should have an identical
   158  		// origin (both assumptions guaranteed by options validation).
   159  		origin = sessions[0].session.Origin()
   160  	)
   161  
   162  	metadata := repair.NewReplicaMetadataComparer(origin, r.rpopts)
   163  	ctx.RegisterFinalizer(metadata)
   164  
   165  	// Add local metadata.
   166  	opts := block.FetchBlocksMetadataOptions{
   167  		IncludeSizes:     true,
   168  		IncludeChecksums: true,
   169  	}
   170  	var (
   171  		accumLocalMetadata = block.NewFetchBlocksMetadataResults()
   172  		pageToken          PageToken
   173  		err                error
   174  	)
   175  	// Safe to register since by the time this function completes we won't be using the metadata
   176  	// for anything anymore.
   177  	ctx.RegisterCloser(accumLocalMetadata)
   178  
   179  	for {
   180  		// It's possible for FetchBlocksMetadataV2 to not return all the metadata at once even if
   181  		// math.MaxInt64 is passed as the limit due to its implementation and the different phases
   182  		// of the page token. As a result, the only way to ensure that all the metadata has been
   183  		// fetched is to continue looping until a nil pageToken is returned.
   184  		var currLocalMetadata block.FetchBlocksMetadataResults
   185  		currLocalMetadata, pageToken, err = shard.FetchBlocksMetadataV2(ctx, start, end, math.MaxInt64, pageToken, opts)
   186  		if err != nil {
   187  			return repair.MetadataComparisonResult{}, err
   188  		}
   189  
   190  		// Merge.
   191  		if currLocalMetadata != nil {
   192  			for _, result := range currLocalMetadata.Results() {
   193  				accumLocalMetadata.Add(result)
   194  			}
   195  		}
   196  
   197  		if pageToken == nil {
   198  			break
   199  		}
   200  	}
   201  
   202  	if r.rpopts.DebugShadowComparisonsEnabled() {
   203  		for _, sesTopo := range sessions {
   204  			// Shadow comparison is mostly a debug feature that can be used to test new builds and diagnose
   205  			// issues with the repair feature. It should not be enabled for production use-cases.
   206  			err := r.shadowCompare(start, end, accumLocalMetadata, sesTopo.session, shard, nsCtx)
   207  			if err != nil {
   208  				r.logger.Error(
   209  					"Shadow compare failed",
   210  					zap.Error(err))
   211  			}
   212  		}
   213  	}
   214  
   215  	localIter := block.NewFilteredBlocksMetadataIter(accumLocalMetadata)
   216  	err = metadata.AddLocalMetadata(localIter)
   217  	if err != nil {
   218  		return repair.MetadataComparisonResult{}, err
   219  	}
   220  
   221  	var (
   222  		rsOpts = r.opts.RepairOptions().ResultOptions()
   223  		level  = r.rpopts.RepairConsistencyLevel()
   224  	)
   225  	for _, sesTopo := range sessions {
   226  		// Add peer metadata.
   227  		peerIter, err := sesTopo.session.FetchBlocksMetadataFromPeers(nsCtx.ID, shard.ID(), start, end,
   228  			level, rsOpts)
   229  		if err != nil {
   230  			return repair.MetadataComparisonResult{}, err
   231  		}
   232  		if err := metadata.AddPeerMetadata(peerIter); err != nil {
   233  			return repair.MetadataComparisonResult{}, err
   234  		}
   235  	}
   236  
   237  	var (
   238  		// TODO(rartoul): Pool these slices.
   239  		metadatasToFetchBlocksForPerSession = make([][]block.ReplicaMetadata, len(sessions))
   240  		metadataRes                         = metadata.Compare()
   241  		seriesWithChecksumMismatches        = metadataRes.ChecksumDifferences.Series()
   242  	)
   243  
   244  	// Shard repair can fail due to transient network errors due to the significant amount of data fetched from peers.
   245  	// So collect and emit metadata comparison metrics before fetching blocks from peer to repair.
   246  	r.record(origin, nsCtx.ID, shard, metadataRes)
   247  	if repairType == repair.OnlyCompareRepair {
   248  		// Early return if repair type doesn't require executing repairing the data step.
   249  		return metadataRes, nil
   250  	}
   251  
   252  	originID := origin.ID()
   253  	for _, e := range seriesWithChecksumMismatches.Iter() {
   254  		for blockStart, replicaMetadataBlocks := range e.Value().Metadata.Blocks() {
   255  			blStartRange := xtime.Range{Start: blockStart, End: blockStart}
   256  			if !tr.Contains(blStartRange) {
   257  				instrument.EmitAndLogInvariantViolation(r.opts.InstrumentOptions(), func(l *zap.Logger) {
   258  					l.With(
   259  						zap.Time("blockStart", blockStart.ToTime()),
   260  						zap.String("namespace", nsMeta.ID().String()),
   261  						zap.Uint32("shard", shard.ID()),
   262  					).Error("repair received replica metadata for unrequested blockStart")
   263  				})
   264  				continue
   265  			}
   266  
   267  			for _, replicaMetadata := range replicaMetadataBlocks.Metadata() {
   268  				metadataHostID := replicaMetadata.Host.ID()
   269  				if metadataHostID == originID {
   270  					// Don't request blocks for self metadata.
   271  					continue
   272  				}
   273  
   274  				if len(sessions) == 1 {
   275  					// Optimized path for single session case.
   276  					metadatasToFetchBlocksForPerSession[0] = append(metadatasToFetchBlocksForPerSession[0], replicaMetadata)
   277  					continue
   278  				}
   279  
   280  				// If there is more than one session then we need to match up all of the metadata to the
   281  				// session it belongs to so that we can fetch the corresponding blocks of data.
   282  				foundSessionForMetadata := false
   283  				for i, sesTopo := range sessions {
   284  					_, ok := sesTopo.topo.LookupHostShardSet(metadataHostID)
   285  					if !ok {
   286  						// The host this metadata came from is not part of the cluster this session is connected to.
   287  						continue
   288  					}
   289  					metadatasToFetchBlocksForPerSession[i] = append(metadatasToFetchBlocksForPerSession[i], replicaMetadata)
   290  					foundSessionForMetadata = true
   291  					break
   292  				}
   293  
   294  				if !foundSessionForMetadata {
   295  					// Could happen during topology changes (I.E node is kicked out of the cluster in-between
   296  					// fetching its metadata and this step).
   297  					r.logger.Debug(
   298  						"could not identify which session mismatched metadata belong to",
   299  						zap.String("hostID", metadataHostID),
   300  						zap.Time("blockStart", blockStart.ToTime()),
   301  					)
   302  				}
   303  			}
   304  		}
   305  	}
   306  
   307  	// TODO(rartoul): Copying the IDs for the purposes of the map key is wasteful. Considering using
   308  	// SetUnsafe or marking as NoFinalize() and making the map check IsNoFinalize().
   309  	results := result.NewShardResult(rsOpts)
   310  	for i, metadatasToFetchBlocksFor := range metadatasToFetchBlocksForPerSession {
   311  		if len(metadatasToFetchBlocksFor) == 0 {
   312  			continue
   313  		}
   314  
   315  		session := sessions[i].session
   316  		perSeriesReplicaIter, err := session.FetchBlocksFromPeers(nsMeta, shard.ID(), level, metadatasToFetchBlocksFor, rsOpts)
   317  		if err != nil {
   318  			return repair.MetadataComparisonResult{}, err
   319  		}
   320  
   321  		for perSeriesReplicaIter.Next() {
   322  			_, id, tags, block := perSeriesReplicaIter.Current()
   323  			if existing, ok := results.BlockAt(id, block.StartTime()); ok {
   324  				// Merge contents with existing block.
   325  				if err := existing.Merge(block); err != nil {
   326  					return repair.MetadataComparisonResult{}, err
   327  				}
   328  				continue
   329  			}
   330  
   331  			// Add block for first time to results.
   332  			results.AddBlock(id, tags, block)
   333  		}
   334  	}
   335  
   336  	if err := r.loadDataIntoShard(shard, results); err != nil {
   337  		return repair.MetadataComparisonResult{}, err
   338  	}
   339  
   340  	return metadataRes, nil
   341  }
   342  
   343  // TODO(rartoul): Currently throttling via the MemoryTracker can only occur at the level of an entire
   344  // block for a given namespace/shard/blockStart. For almost all practical use-cases this is fine, but
   345  // this could be improved and made more granular by breaking data that is being loaded into the shard
   346  // into smaller batches (less than one complete block). This would improve the granularity of throttling
   347  // for clusters where the number of shards is low.
   348  func (r shardRepairer) loadDataIntoShard(shard databaseShard, data result.ShardResult) error {
   349  	var (
   350  		waitingGauge  = r.scope.Gauge("waiting-for-limit")
   351  		waitedCounter = r.scope.Counter("waited-for-limit")
   352  		doneCh        = make(chan struct{})
   353  		waiting       bool
   354  		waitingLock   sync.Mutex
   355  	)
   356  	defer close(doneCh)
   357  
   358  	// Emit a gauge constantly that indicates whether or not the repair process is blocked waiting.
   359  	go func() {
   360  		for {
   361  			select {
   362  			case <-doneCh:
   363  				waitingGauge.Update(0)
   364  				return
   365  			default:
   366  				waitingLock.Lock()
   367  				currWaiting := waiting
   368  				waitingLock.Unlock()
   369  				if currWaiting {
   370  					waitingGauge.Update(1)
   371  				} else {
   372  					waitingGauge.Update(0)
   373  				}
   374  				time.Sleep(5 * time.Second)
   375  			}
   376  		}
   377  	}()
   378  
   379  	for {
   380  		err := shard.LoadBlocks(data.AllSeries())
   381  		if err == ErrDatabaseLoadLimitHit {
   382  			waitedCounter.Inc(1)
   383  			waitingLock.Lock()
   384  			waiting = true
   385  			waitingLock.Unlock()
   386  			// Wait for some of the outstanding data to be flushed before trying again.
   387  			r.logger.Info("repair throttled due to memory load limits, waiting for data to be flushed before continuing")
   388  			r.opts.MemoryTracker().WaitForDec()
   389  			continue
   390  		}
   391  		if err != nil {
   392  			return err
   393  		}
   394  		return nil
   395  	}
   396  }
   397  
   398  func (r shardRepairer) recordDifferences(
   399  	origin topology.Host,
   400  	namespace ident.ID,
   401  	shard databaseShard,
   402  	diffRes repair.MetadataComparisonResult,
   403  ) {
   404  	var (
   405  		shardScope = r.scope.Tagged(map[string]string{
   406  			"namespace": namespace.String(),
   407  			"shard":     strconv.Itoa(int(shard.ID())),
   408  		})
   409  		totalScope        = shardScope.Tagged(map[string]string{"resultType": "total"})
   410  		sizeDiffScope     = shardScope.Tagged(map[string]string{"resultType": "sizeDiff"})
   411  		checksumDiffScope = shardScope.Tagged(map[string]string{"resultType": "checksumDiff"})
   412  	)
   413  
   414  	// Record total number of series and total number of blocks.
   415  	totalScope.Counter("series").Inc(diffRes.NumSeries)
   416  	totalScope.Counter("blocks").Inc(diffRes.NumBlocks)
   417  
   418  	// Record size differences.
   419  	sizeDiffScope.Counter("series").Inc(diffRes.SizeDifferences.NumSeries())
   420  	sizeDiffScope.Counter("blocks").Inc(diffRes.SizeDifferences.NumBlocks())
   421  
   422  	absoluteBlockSizeDiff, blockSizeDiffAsPercentage := r.computeMaximumBlockSizeDifference(origin, diffRes)
   423  	sizeDiffScope.Gauge("max-block-size-diff").Update(float64(absoluteBlockSizeDiff))
   424  	sizeDiffScope.Gauge("max-block-size-diff-as-percentage").Update(blockSizeDiffAsPercentage)
   425  
   426  	// Record checksum differences.
   427  	checksumDiffScope.Counter("series").Inc(diffRes.ChecksumDifferences.NumSeries())
   428  	checksumDiffScope.Counter("blocks").Inc(diffRes.ChecksumDifferences.NumBlocks())
   429  }
   430  
   431  // computeMaximumBlockSizeDifferenceAsPercentage returns a metric which represents maximum divergence of a shard with
   432  // any of its peers. A positive divergence means that origin shard has more data than its peer and a negative
   433  // divergence means that origin shard has lesser data than its peer.  Since sizes for all the blocks in rentention
   434  // window are not readily available, exact divergence of a shard from its peer cannot be calculated. So this method
   435  // settles for returning maximum divergence of a block/shard with any of its peers. Divergence(as percentage) of shard
   436  // is upper bounded by divergence of block/shard so this metric can be used to monitor severity of divergence.
   437  func (r shardRepairer) computeMaximumBlockSizeDifference(
   438  	origin topology.Host,
   439  	diffRes repair.MetadataComparisonResult,
   440  ) (int64, float64) {
   441  	var (
   442  		maxBlockSizeDiffAsRatio float64
   443  		maxBlockSizeDiff        int64
   444  	)
   445  	// Iterate over all the series which differ in size between origin and a peer.
   446  	for _, entry := range diffRes.SizeDifferences.Series().Iter() {
   447  		series := entry.Value()
   448  		replicaBlocksMetadata := diffRes.SizeDifferences.GetOrAdd(series.ID)
   449  		// Iterate over all the time ranges which had a mismatched series between origin and a peer.
   450  		for _, replicasMetadata := range replicaBlocksMetadata.Blocks() {
   451  			var (
   452  				// Setting minimum origin block size to 1 so that percetages off of origin block size can be calculated
   453  				// without worrying about divide by zero errors. Exact percentages are not required so setting a
   454  				// non-zero size for an empty block is acceptable.
   455  				originBlockSize int64 = 1
   456  				// Represents maximum size difference of a block with one of its peers.
   457  				maxPeerBlockSizeDiff int64
   458  			)
   459  			// Record the block size on the origin.
   460  			for _, replicaMetadata := range replicasMetadata.Metadata() {
   461  				if replicaMetadata.Host.ID() == origin.ID() && replicaMetadata.Size > 0 {
   462  					originBlockSize = replicaMetadata.Size
   463  					break
   464  				}
   465  			}
   466  			// Fetch the maximum block size difference of origin with any of its peers.
   467  			for _, replicaMetadata := range replicasMetadata.Metadata() {
   468  				if replicaMetadata.Host.ID() != origin.ID() {
   469  					blockSizeDiff := originBlockSize - replicaMetadata.Size
   470  					if math.Abs(float64(blockSizeDiff)) > math.Abs(float64(maxPeerBlockSizeDiff)) {
   471  						maxPeerBlockSizeDiff = blockSizeDiff
   472  					}
   473  				}
   474  			}
   475  			// Record divergence as percentage for origin block which has diverged the most from its peers.
   476  			if math.Abs(float64(maxPeerBlockSizeDiff)) > math.Abs(float64(maxBlockSizeDiff)) {
   477  				maxBlockSizeDiff = maxPeerBlockSizeDiff
   478  				maxBlockSizeDiffAsRatio = float64(maxPeerBlockSizeDiff) / float64(originBlockSize)
   479  			}
   480  		}
   481  	}
   482  	return maxBlockSizeDiff, maxBlockSizeDiffAsRatio * 100
   483  }
   484  
// repairFn is the signature of the function dbRepairer.run calls to perform
// one repair pass; it is a field so it can be substituted.
type repairFn func() error

// sleepFn is the signature of the function dbRepairer.run calls to wait
// between repair passes; newDatabaseRepairer sets it to time.Sleep.
type sleepFn func(d time.Duration)
   488  
// repairStatus is the outcome of the most recent repair attempt for a
// namespace/blockStart pair.
type repairStatus int

const (
	// repairNotStarted is the zero value: no attempt has been recorded.
	repairNotStarted repairStatus = iota
	// repairSuccess marks an attempt that completed without error.
	repairSuccess
	// repairFailed marks an attempt that returned an error.
	repairFailed
)
   496  
// repairState records when a namespace/blockStart pair was last attempted and
// how that attempt ended.
type repairState struct {
	// LastAttempt is the time at which the most recent attempt was started.
	LastAttempt xtime.UnixNano
	// Status is the outcome of the most recent attempt.
	Status repairStatus
}
   501  
// namespaceRepairStateByTime maps a block start to its repair state within a
// single namespace.
type namespaceRepairStateByTime map[xtime.UnixNano]repairState

// repairStatesByNs maps a namespace ID (as a string) to the repair states of
// its block starts.
//
// NB(r): This uses a map[string]element instead of a generated map for
// native ident.ID keys, this was because the call frequency is very low
// it's not in the hot path so casting ident.ID to string isn't too expensive
// and this data structure may very well change soon with a refactor of the
// background repair in the works.
type repairStatesByNs map[string]namespaceRepairStateByTime
   510  
   511  func newRepairStates() repairStatesByNs {
   512  	return make(repairStatesByNs)
   513  }
   514  
   515  func (r repairStatesByNs) repairStates(
   516  	namespace ident.ID,
   517  	t xtime.UnixNano,
   518  ) (repairState, bool) {
   519  	var rs repairState
   520  
   521  	nsRepairState, ok := r[namespace.String()]
   522  	if !ok {
   523  		return rs, false
   524  	}
   525  
   526  	rs, ok = nsRepairState[t]
   527  	return rs, ok
   528  }
   529  
   530  func (r repairStatesByNs) setRepairState(
   531  	namespace ident.ID,
   532  	t xtime.UnixNano,
   533  	state repairState,
   534  ) {
   535  	nsRepairState, ok := r[namespace.String()]
   536  	if !ok {
   537  		nsRepairState = make(namespaceRepairStateByTime)
   538  		r[namespace.String()] = nsRepairState
   539  	}
   540  	nsRepairState[t] = state
   541  }
   542  
// dbRepairer periodically repairs the namespaces owned by a database, one
// block start per namespace per pass, tracking per-block repair state so it
// can prioritize which block to repair next.
//
// NB(prateek): dbRepairer.Repair(...) guarantees atomicity of execution, so all other
// state does not need to be thread safe. One exception - `dbRepairer.closed` is used
// for early termination if `dbRepairer.Stop()` is called during a repair, so we guard
// it with a mutex.
type dbRepairer struct {
	database      database
	opts          Options
	ropts         repair.Options
	shardRepairer databaseShardRepairer
	// repairStatesByNs holds the last attempt time and status per namespace/blockStart.
	repairStatesByNs repairStatesByNs

	// repairFn and sleepFn are the functions the run loop calls to repair and
	// to wait between passes; they are fields so they can be substituted.
	repairFn repairFn
	sleepFn  sleepFn
	nowFn    clock.NowFn
	logger   *zap.Logger
	// repairCheckInterval is how long the run loop sleeps before each pass.
	repairCheckInterval time.Duration
	scope               tally.Scope
	// status is the gauge updated by Report: 1 while a repair is running, else 0.
	status tally.Gauge

	// closedLock guards closed.
	closedLock sync.Mutex
	// running is 1 while Repair is executing; it is accessed atomically.
	running int32
	// closed is set by Stop and makes the run loop exit.
	closed bool
}
   566  
   567  func newDatabaseRepairer(database database, opts Options) (databaseRepairer, error) {
   568  	var (
   569  		nowFn = opts.ClockOptions().NowFn()
   570  		scope = opts.InstrumentOptions().MetricsScope().SubScope("repair")
   571  		ropts = opts.RepairOptions()
   572  	)
   573  	if ropts == nil {
   574  		return nil, errNoRepairOptions
   575  	}
   576  	if err := ropts.Validate(); err != nil {
   577  		return nil, err
   578  	}
   579  
   580  	shardRepairer := newShardRepairer(opts, ropts)
   581  
   582  	r := &dbRepairer{
   583  		database:            database,
   584  		opts:                opts,
   585  		ropts:               ropts,
   586  		shardRepairer:       shardRepairer,
   587  		repairStatesByNs:    newRepairStates(),
   588  		sleepFn:             time.Sleep,
   589  		nowFn:               nowFn,
   590  		logger:              opts.InstrumentOptions().Logger(),
   591  		repairCheckInterval: ropts.RepairCheckInterval(),
   592  		scope:               scope,
   593  		status:              scope.Gauge("repair"),
   594  	}
   595  	r.repairFn = r.Repair
   596  
   597  	return r, nil
   598  }
   599  
   600  func (r *dbRepairer) run() {
   601  	for {
   602  		r.closedLock.Lock()
   603  		closed := r.closed
   604  		r.closedLock.Unlock()
   605  
   606  		if closed {
   607  			break
   608  		}
   609  
   610  		r.sleepFn(r.repairCheckInterval)
   611  
   612  		if err := r.repairFn(); err != nil {
   613  			r.logger.Error("error repairing database", zap.Error(err))
   614  		}
   615  	}
   616  }
   617  
   618  func (r *dbRepairer) namespaceRepairTimeRange(ns databaseNamespace) xtime.Range {
   619  	var (
   620  		now    = xtime.ToUnixNano(r.nowFn())
   621  		rtopts = ns.Options().RetentionOptions()
   622  	)
   623  	return xtime.Range{
   624  		Start: retention.FlushTimeStart(rtopts, now),
   625  		End:   retention.FlushTimeEnd(rtopts, now)}
   626  }
   627  
// Start launches the background repair loop in its own goroutine and returns
// immediately. The loop runs until Stop is called.
func (r *dbRepairer) Start() {
	go r.run()
}
   631  
   632  func (r *dbRepairer) Stop() {
   633  	r.closedLock.Lock()
   634  	r.closed = true
   635  	r.closedLock.Unlock()
   636  }
   637  
// Repair will analyze the current repair state for each namespace/blockStart combination and pick one blockStart
// per namespace to repair. It will prioritize blocks that have never been repaired over those that have been
// repaired before, and it will prioritize more recent blocks over older ones. If all blocks have been repaired
// before then it will prioritize the least recently repaired block.
//
// The Repair function only attempts to repair one block at a time because this allows the background repair process
// to run its prioritization logic more frequently. For example, if we attempted to repair all blocks in one pass,
// even with appropriate backpressure, this could lead to situations where recent blocks are not repaired for a
// substantial amount of time whereas with the current approach the longest delay between running the prioritization
// logic is the amount of time it takes to repair one block for all shards.
//
// With the full sweep strategy the one-block-per-pass limit is lifted and every block start that is not marked
// as successfully repaired is attempted on each call.
//
// Repair returns nil without doing anything when the database is not bootstrapped, errRepairInProgress when
// another call is already running, and otherwise the combination of all per-block repair errors (nil if none).
//
// Long term we will want to move to a model that actually tracks state for individual shard/blockStart combinations,
// not just blockStarts.
func (r *dbRepairer) Repair() error {
	// Don't attempt a repair if the database is not bootstrapped yet
	if !r.database.IsBootstrapped() {
		return nil
	}

	// Only one repair may run at a time; the flag is cleared again on return.
	if !atomic.CompareAndSwapInt32(&r.running, 0, 1) {
		return errRepairInProgress
	}

	defer func() {
		atomic.StoreInt32(&r.running, 0)
	}()

	multiErr := xerrors.NewMultiError()
	namespaces, err := r.database.OwnedNamespaces()
	if err != nil {
		return err
	}

	var (
		strategy = r.ropts.Strategy()
		// When true, at most one block start is repaired per namespace per call.
		repairBlockStartShortCircuitRepair bool
	)
	switch strategy {
	case repair.DefaultStrategy:
		repairBlockStartShortCircuitRepair = true
	case repair.FullSweepStrategy:
		repairBlockStartShortCircuitRepair = false
	default:
		// Unrecognized strategy.
		return fmt.Errorf("unknown repair strategy: %v", strategy)
	}

	for _, n := range namespaces {
		repairRange := r.namespaceRepairTimeRange(n)
		blockSize := n.Options().RetentionOptions().BlockSize()

		// Iterating backwards will be exclusive on the start, but we want to be inclusive on the
		// start so subtract a blocksize.
		repairRange.Start = repairRange.Start.Add(-blockSize)

		var (
			numUnrepairedBlocks                           = 0
			hasRepairedABlockStart                        = false
			leastRecentlyRepairedBlockStart               xtime.UnixNano
			leastRecentlyRepairedBlockStartLastRepairTime xtime.UnixNano
			namespaceScope                                = r.scope.Tagged(map[string]string{
				"namespace": n.ID().String(),
			})
		)
		// The callback returns true on every path; presumably that tells
		// IterateBackward to keep going, so every block start in the range is
		// visited (verify against xtime.Range).
		repairRange.IterateBackward(blockSize, func(blockStart xtime.UnixNano) bool {
			// Update metrics around progress of repair.
			blockStartUnixSeconds := blockStart.ToTime().Unix()
			namespaceScope.Gauge("timestamp-current-block-repair").Update(float64(blockStartUnixSeconds))

			// Update state for later reporting of least recently repaired block.
			// Only block starts with recorded state are candidates.
			repairState, ok := r.repairStatesByNs.repairStates(n.ID(), blockStart)
			if ok && (leastRecentlyRepairedBlockStart.IsZero() ||
				repairState.LastAttempt.Before(leastRecentlyRepairedBlockStartLastRepairTime)) {
				leastRecentlyRepairedBlockStart = blockStart
				leastRecentlyRepairedBlockStartLastRepairTime = repairState.LastAttempt
			}

			// Already repaired successfully: nothing to do for this block start.
			if ok && repairState.Status == repairSuccess {
				return true
			}

			// Failed or unrepair block from this point onwards.
			numUnrepairedBlocks++
			if hasRepairedABlockStart && repairBlockStartShortCircuitRepair {
				// Only want to repair one namespace/blockStart per call to Repair()
				// so once we've repaired a single blockStart we don't perform any
				// more actual repairs although we do keep iterating so that we can
				// emit an accurate value for the "num-unrepaired-blocks" gauge.
				return true
			}

			// A failed attempt does not count towards the one-block limit, so the
			// next unrepaired block start is still attempted.
			if err := r.repairNamespaceBlockstart(n, blockStart); err != nil {
				multiErr = multiErr.Add(err)
			} else {
				hasRepairedABlockStart = true
			}

			return true
		})

		// Update metrics with statistics about repair status.
		namespaceScope.Gauge("num-unrepaired-blocks").Update(float64(numUnrepairedBlocks))

		// NB: when no block start in the range has recorded state the last repair
		// time is still its zero value, so this gauge reports the time elapsed
		// since that zero value.
		secondsSinceLastRepair := xtime.ToUnixNano(r.nowFn()).
			Sub(leastRecentlyRepairedBlockStartLastRepairTime).Seconds()
		namespaceScope.Gauge("max-seconds-since-last-block-repair").Update(secondsSinceLastRepair)

		if hasRepairedABlockStart {
			// Previous loop performed a repair which means we've hit our limit of repairing
			// one block per namespace per call to Repair() so we can skip the logic below.
			continue
		}

		// If we've made it this far then no block start was successfully repaired in this pass: either
		// there were no unrepaired blocks, or every attempted repair failed. Repair the least recently
		// attempted block instead, if there is one.
		if leastRecentlyRepairedBlockStart.IsZero() {
			continue
		}
		if err := r.repairNamespaceBlockstart(n, leastRecentlyRepairedBlockStart); err != nil {
			multiErr = multiErr.Add(err)
		}
	}

	return multiErr.FinalError()
}
   763  
   764  func (r *dbRepairer) Report() {
   765  	if atomic.LoadInt32(&r.running) == 1 {
   766  		r.status.Update(1)
   767  	} else {
   768  		r.status.Update(0)
   769  	}
   770  }
   771  
   772  func (r *dbRepairer) repairNamespaceBlockstart(n databaseNamespace, blockStart xtime.UnixNano) error {
   773  	var (
   774  		blockSize   = n.Options().RetentionOptions().BlockSize()
   775  		repairRange = xtime.Range{Start: blockStart, End: blockStart.Add(blockSize)}
   776  		repairTime  = xtime.ToUnixNano(r.nowFn())
   777  	)
   778  	if err := r.repairNamespaceWithTimeRange(n, repairRange); err != nil {
   779  		r.markRepairAttempt(n.ID(), blockStart, repairTime, repairFailed)
   780  		return err
   781  	}
   782  
   783  	r.markRepairAttempt(n.ID(), blockStart, repairTime, repairSuccess)
   784  	return nil
   785  }
   786  
   787  func (r *dbRepairer) repairNamespaceWithTimeRange(n databaseNamespace, tr xtime.Range) error {
   788  	if err := n.Repair(r.shardRepairer, tr, NamespaceRepairOptions{
   789  		Force: r.ropts.Force(),
   790  	}); err != nil {
   791  		return fmt.Errorf("namespace %s failed to repair time range %v: %v", n.ID().String(), tr, err)
   792  	}
   793  	return nil
   794  }
   795  
   796  func (r *dbRepairer) markRepairAttempt(
   797  	namespace ident.ID,
   798  	blockStart xtime.UnixNano,
   799  	repairTime xtime.UnixNano,
   800  	repairStatus repairStatus) {
   801  	repairState, _ := r.repairStatesByNs.repairStates(namespace, blockStart)
   802  	repairState.Status = repairStatus
   803  	repairState.LastAttempt = repairTime
   804  	r.repairStatesByNs.setRepairState(namespace, blockStart, repairState)
   805  }
   806  
   807  var noOpRepairer databaseRepairer = repairerNoOp{}
   808  
   809  type repairerNoOp struct{}
   810  
   811  func newNoopDatabaseRepairer() databaseRepairer { return noOpRepairer }
   812  
   813  func (r repairerNoOp) Start()        {}
   814  func (r repairerNoOp) Stop()         {}
   815  func (r repairerNoOp) Repair() error { return nil }
   816  func (r repairerNoOp) Report()       {}
   817  
   818  func (r shardRepairer) shadowCompare(
   819  	start xtime.UnixNano,
   820  	end xtime.UnixNano,
   821  	localMetadataBlocks block.FetchBlocksMetadataResults,
   822  	session client.AdminSession,
   823  	shard databaseShard,
   824  	nsCtx namespace.Context,
   825  ) error {
   826  	dice, err := newDice(r.rpopts.DebugShadowComparisonsPercentage())
   827  	if err != nil {
   828  		return fmt.Errorf("err creating shadow comparison dice: %v", err)
   829  	}
   830  
   831  	var localM, peerM *dynamic.Message
   832  	if nsCtx.Schema != nil {
   833  		// Only required if a schema (proto feature) is present. Reset between uses.
   834  		localM = dynamic.NewMessage(nsCtx.Schema.Get().MessageDescriptor)
   835  		peerM = dynamic.NewMessage(nsCtx.Schema.Get().MessageDescriptor)
   836  	}
   837  
   838  	readCtx := r.opts.ContextPool().Get()
   839  	compareResultFunc := func(result block.FetchBlocksMetadataResult) error {
   840  		seriesID := result.ID
   841  		peerSeriesIter, err := session.Fetch(nsCtx.ID, seriesID, start, end)
   842  		if err != nil {
   843  			return err
   844  		}
   845  		defer peerSeriesIter.Close()
   846  
   847  		readCtx.Reset()
   848  		defer readCtx.BlockingCloseReset()
   849  
   850  		iter, err := shard.ReadEncoded(readCtx, seriesID, start, end, nsCtx)
   851  		if err != nil {
   852  			return err
   853  		}
   854  		unfilteredLocalSeriesDataBlocks, err := iter.ToSlices(readCtx)
   855  		if err != nil {
   856  			return err
   857  		}
   858  		localSeriesDataBlocks, err := xio.FilterEmptyBlockReadersSliceOfSlicesInPlace(unfilteredLocalSeriesDataBlocks)
   859  		if err != nil {
   860  			return err
   861  		}
   862  
   863  		localSeriesSliceOfSlices := xio.NewReaderSliceOfSlicesFromBlockReadersIterator(localSeriesDataBlocks)
   864  		localSeriesIter := r.opts.MultiReaderIteratorPool().Get()
   865  		localSeriesIter.ResetSliceOfSlices(localSeriesSliceOfSlices, nsCtx.Schema)
   866  
   867  		var (
   868  			i             = 0
   869  			foundMismatch = false
   870  		)
   871  		for localSeriesIter.Next() {
   872  			if !peerSeriesIter.Next() {
   873  				r.logger.Error(
   874  					"series had next locally, but not from peers",
   875  					zap.String("namespace", nsCtx.ID.String()),
   876  					zap.Time("start", start.ToTime()),
   877  					zap.Time("end", end.ToTime()),
   878  					zap.String("series", seriesID.String()),
   879  					zap.Error(peerSeriesIter.Err()),
   880  				)
   881  				foundMismatch = true
   882  				break
   883  			}
   884  
   885  			localDP, localUnit, localAnnotation := localSeriesIter.Current()
   886  			peerDP, peerUnit, peerAnnotation := peerSeriesIter.Current()
   887  
   888  			if !localDP.Equal(peerDP) {
   889  				r.logger.Error(
   890  					"datapoints did not match",
   891  					zap.Int("index", i),
   892  					zap.Any("local", localDP),
   893  					zap.Any("peer", peerDP),
   894  				)
   895  				foundMismatch = true
   896  				break
   897  			}
   898  
   899  			if localUnit != peerUnit {
   900  				r.logger.Error(
   901  					"units did not match",
   902  					zap.Int("index", i),
   903  					zap.Int("local", int(localUnit)),
   904  					zap.Int("peer", int(peerUnit)),
   905  				)
   906  				foundMismatch = true
   907  				break
   908  			}
   909  
   910  			if nsCtx.Schema == nil {
   911  				// Remaining shadow logic is proto-specific.
   912  				continue
   913  			}
   914  
   915  			err = localM.Unmarshal(localAnnotation)
   916  			if err != nil {
   917  				r.logger.Error(
   918  					"Unable to unmarshal local annotation",
   919  					zap.Int("index", i),
   920  					zap.Error(err),
   921  				)
   922  				foundMismatch = true
   923  				break
   924  			}
   925  
   926  			err = peerM.Unmarshal(peerAnnotation)
   927  			if err != nil {
   928  				r.logger.Error(
   929  					"Unable to unmarshal peer annotation",
   930  					zap.Int("index", i),
   931  					zap.Error(err),
   932  				)
   933  				foundMismatch = true
   934  				break
   935  			}
   936  
   937  			if !dynamic.Equal(localM, peerM) {
   938  				r.logger.Error(
   939  					"Local message does not equal peer message",
   940  					zap.Int("index", i),
   941  					zap.String("local", localM.String()),
   942  					zap.String("peer", peerM.String()),
   943  				)
   944  				foundMismatch = true
   945  				break
   946  			}
   947  
   948  			if !bytes.Equal(localAnnotation, peerAnnotation) {
   949  				r.logger.Error(
   950  					"Local message equals peer message, but annotations do not match",
   951  					zap.Int("index", i),
   952  					zap.String("local", string(localAnnotation)),
   953  					zap.String("peer", string(peerAnnotation)),
   954  				)
   955  				foundMismatch = true
   956  				break
   957  			}
   958  
   959  			i++
   960  		}
   961  
   962  		if localSeriesIter.Err() != nil {
   963  			r.logger.Error(
   964  				"Local series iterator experienced an error",
   965  				zap.String("namespace", nsCtx.ID.String()),
   966  				zap.Time("start", start.ToTime()),
   967  				zap.Time("end", end.ToTime()),
   968  				zap.String("series", seriesID.String()),
   969  				zap.Int("numDPs", i),
   970  				zap.Error(localSeriesIter.Err()),
   971  			)
   972  		} else if foundMismatch {
   973  			r.logger.Error(
   974  				"Found mismatch between series",
   975  				zap.String("namespace", nsCtx.ID.String()),
   976  				zap.Time("start", start.ToTime()),
   977  				zap.Time("end", end.ToTime()),
   978  				zap.String("series", seriesID.String()),
   979  				zap.Int("numDPs", i),
   980  			)
   981  		} else {
   982  			r.logger.Debug(
   983  				"All values for series match",
   984  				zap.String("namespace", nsCtx.ID.String()),
   985  				zap.Time("start", start.ToTime()),
   986  				zap.Time("end", end.ToTime()),
   987  				zap.String("series", seriesID.String()),
   988  				zap.Int("numDPs", i),
   989  			)
   990  		}
   991  
   992  		return nil
   993  	}
   994  
   995  	for _, result := range localMetadataBlocks.Results() {
   996  		if !dice.Roll() {
   997  			continue
   998  		}
   999  
  1000  		if err := compareResultFunc(result); err != nil {
  1001  			return err
  1002  		}
  1003  	}
  1004  
  1005  	return nil
  1006  }
  1007  
  1008  type sessionAndTopo struct {
  1009  	session client.AdminSession
  1010  	topo    topology.Map
  1011  }