github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_consistency.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"crypto/sha512"
    17  	"encoding/binary"
    18  	"fmt"
    19  	"math"
    20  	"sort"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"github.com/cockroachdb/cockroach/pkg/keys"
    26  	"github.com/cockroachdb/cockroach/pkg/kv"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    30  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    31  	"github.com/cockroachdb/cockroach/pkg/rpc"
    32  	"github.com/cockroachdb/cockroach/pkg/storage"
    33  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    34  	"github.com/cockroachdb/cockroach/pkg/util/bufalloc"
    35  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    38  	"github.com/cockroachdb/cockroach/pkg/util/log"
    39  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    40  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    42  	"github.com/cockroachdb/errors"
    43  )
    44  
    45  // fatalOnStatsMismatch, if true, turns stats mismatches into fatal errors. A
    46  // stats mismatch is the event in which
    47  // - the consistency checker finds that all replicas are consistent
    48  //   (i.e. byte-by-byte identical)
    49  // - the (identical) stats tracked in them do not correspond to a recomputation
    50  //   via the data, i.e. the stats were incorrect
    51  // - ContainsEstimates==false, i.e. the stats claimed they were correct.
    52  //
    53  // Before issuing the fatal error, the cluster bootstrap version is verified.
    54  // We know that old versions of CockroachDB sometimes violated this invariant,
    55  // but we want to exclude these violations, focusing only on cases in which we
    56  // know old CRDB versions (<19.1 at time of writing) were not involved.
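        //
        // For example, launching the process with COCKROACH_ENFORCE_CONSISTENT_STATS=true
        // in its environment turns the check on (illustrative; the exact set of accepted
        // boolean spellings is whatever envutil accepts).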
    57  var fatalOnStatsMismatch = envutil.EnvOrDefaultBool("COCKROACH_ENFORCE_CONSISTENT_STATS", false)
    58  
    59  const (
    60  	// collectChecksumTimeout controls how long we'll wait to collect a checksum
    61  	// for a CheckConsistency request. We need to bound the time that we wait
    62  	// because the checksum might never be computed for a replica if that replica
    63  	// is caught up via a snapshot and never performs the ComputeChecksum
    64  	// operation.
    65  	collectChecksumTimeout = 15 * time.Second
    66  )
    67  
    68  // ReplicaChecksum contains progress on a replica checksum computation.
    69  type ReplicaChecksum struct {
    70  	CollectChecksumResponse
    71  	// started is true if the checksum computation has started.
    72  	started bool
    73  	// If gcTimestamp is nonzero, GC this checksum after gcTimestamp. gcTimestamp
    74  	// is zero if and only if the checksum computation is in progress.
    75  	gcTimestamp time.Time
    76  	// This channel is closed after the checksum is computed, and is used
    77  	// as a notification.
    78  	notify chan struct{}
    79  }
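
        // The notify channel implements a one-shot broadcast: computeChecksumDone
        // closes it once the checksum (or a failure) has been recorded, which wakes
        // up every getChecksum caller blocked on it. A minimal sketch of the pattern
        // (illustrative only, not tied to this type):
        //
        //   done := make(chan struct{})
        //   go func() {
        //       // ... compute something ...
        //       close(done) // broadcast completion to all waiters
        //   }()
        //   <-done // every receiver unblocks once done is closed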
    80  
    81  // CheckConsistency runs a consistency check on the range. It first applies a
    82  // ComputeChecksum through Raft and then issues CollectChecksum commands to the
    83  // other replicas. These are inspected and a CheckConsistencyResponse is assembled.
    84  //
    85  // When args.Mode is CHECK_VIA_QUEUE and an inconsistency is detected and no
    86  // diff was requested, the consistency check will be re-run to collect a diff,
    87  // which is then printed before calling `log.Fatal`. This behavior should be
    88  // lifted to the consistency checker queue in the future.
    89  func (r *Replica) CheckConsistency(
    90  	ctx context.Context, args roachpb.CheckConsistencyRequest,
    91  ) (roachpb.CheckConsistencyResponse, *roachpb.Error) {
    92  	startKey := r.Desc().StartKey.AsRawKey()
    93  
    94  	checkArgs := roachpb.ComputeChecksumRequest{
    95  		RequestHeader: roachpb.RequestHeader{Key: startKey},
    96  		Version:       batcheval.ReplicaChecksumVersion,
    97  		Snapshot:      args.WithDiff,
    98  		Mode:          args.Mode,
    99  		Checkpoint:    args.Checkpoint,
   100  		Terminate:     args.Terminate,
   101  	}
   102  
   103  	isQueue := args.Mode == roachpb.ChecksumMode_CHECK_VIA_QUEUE
   104  
   105  	results, err := r.RunConsistencyCheck(ctx, checkArgs)
   106  	if err != nil {
   107  		return roachpb.CheckConsistencyResponse{}, roachpb.NewError(err)
   108  	}
   109  
   110  	res := roachpb.CheckConsistencyResponse_Result{}
   111  	res.RangeID = r.RangeID
   112  
   113  	shaToIdxs := map[string][]int{}
   114  	var missing []ConsistencyCheckResult
   115  	for i, result := range results {
   116  		if result.Err != nil {
   117  			missing = append(missing, result)
   118  			continue
   119  		}
   120  		s := string(result.Response.Checksum)
   121  		shaToIdxs[s] = append(shaToIdxs[s], i)
   122  	}
   123  
   124  	// When replicas diverge, anecdotally often the minority (usually of size
   125  	// one) is in the wrong. If there's more than one smallest minority (for
   126  	// example, if three replicas all return different hashes) we pick any of
   127  	// them.
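        	// For example, if shaToIdxs were {"A": [0], "B": [1, 2]}, the minority SHA
        	// would be "A"; if it were {"A": [0], "B": [1], "C": [2]}, any of the three
        	// could be picked (hypothetical checksums, for illustration only).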
   128  	var minoritySHA string
   129  	if len(shaToIdxs) > 1 {
   130  		for sha, idxs := range shaToIdxs {
   131  			if minoritySHA == "" || len(shaToIdxs[minoritySHA]) > len(idxs) {
   132  				minoritySHA = sha
   133  			}
   134  		}
   135  	}
   136  
   137  	// There is an inconsistency if and only if there is a minority SHA.
   138  
   139  	if minoritySHA != "" {
   140  		var buf bytes.Buffer
   141  		for sha, idxs := range shaToIdxs {
   142  			minority := ""
   143  			if sha == minoritySHA {
   144  				minority = " [minority]"
   145  			}
   146  			for _, idx := range idxs {
   147  				_, _ = fmt.Fprintf(&buf, "%s: checksum %x%s\n"+
   148  					"- stats: %+v\n"+
   149  					"- stats.Sub(recomputation): %+v\n",
   150  					&results[idx].Replica,
   151  					sha,
   152  					minority,
   153  					&results[idx].Response.Persisted,
   154  					&results[idx].Response.Delta,
   155  				)
   156  			}
   157  			minoritySnap := results[shaToIdxs[minoritySHA][0]].Response.Snapshot
   158  			curSnap := results[shaToIdxs[sha][0]].Response.Snapshot
   159  			if sha != minoritySHA && minoritySnap != nil && curSnap != nil {
   160  				diff := diffRange(curSnap, minoritySnap)
   161  				if report := r.store.cfg.TestingKnobs.ConsistencyTestingKnobs.BadChecksumReportDiff; report != nil {
   162  					report(*r.store.Ident, diff)
   163  				}
   164  				_, _ = fmt.Fprintf(&buf, "====== diff(%x, [minority]) ======\n", sha)
   165  				_, _ = diff.WriteTo(&buf)
   166  			}
   167  		}
   168  
   169  		if isQueue {
   170  			log.Errorf(ctx, "%v", buf.String())
   171  		}
   172  		res.Detail += buf.String()
   173  	} else {
   174  		res.Detail += fmt.Sprintf("stats: %+v\n", results[0].Response.Persisted)
   175  	}
   176  	for _, result := range missing {
   177  		res.Detail += fmt.Sprintf("%s: error: %v\n", result.Replica, result.Err)
   178  	}
   179  
   180  	delta := enginepb.MVCCStats(results[0].Response.Delta)
   181  	var haveDelta bool
   182  	{
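        		// Aging the copy to timestamp zero resets LastUpdateNanos (and rebases the
        		// age-dependent fields), so a delta whose counters are all zero compares
        		// equal to the zero MVCCStats value below.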
   183  		d2 := delta
   184  		d2.AgeTo(0)
   185  		haveDelta = d2 != enginepb.MVCCStats{}
   186  	}
   187  
   188  	res.StartKey = []byte(startKey)
   189  	res.Status = roachpb.CheckConsistencyResponse_RANGE_CONSISTENT
   190  	if minoritySHA != "" {
   191  		res.Status = roachpb.CheckConsistencyResponse_RANGE_INCONSISTENT
   192  	} else if args.Mode != roachpb.ChecksumMode_CHECK_STATS && haveDelta {
   193  		if delta.ContainsEstimates > 0 {
   194  			// When ContainsEstimates is set, it's generally expected that we'll get a different
   195  			// result when we recompute from scratch.
   196  			res.Status = roachpb.CheckConsistencyResponse_RANGE_CONSISTENT_STATS_ESTIMATED
   197  		} else {
   198  			// When ContainsEstimates is unset, we expect the recomputation to agree with the stored stats.
   199  			// If that's not the case, that's a problem: it could be a bug in the stats computation
   200  			// or stats maintenance, but it could also hint at the replica having diverged from its peers.
   201  			res.Status = roachpb.CheckConsistencyResponse_RANGE_CONSISTENT_STATS_INCORRECT
   202  		}
   203  		res.Detail += fmt.Sprintf("stats - recomputation: %+v\n", enginepb.MVCCStats(results[0].Response.Delta))
   204  	} else if len(missing) > 0 {
   205  		// No inconsistency was detected, but we didn't manage to inspect all replicas.
   206  		res.Status = roachpb.CheckConsistencyResponse_RANGE_INDETERMINATE
   207  	}
   208  	var resp roachpb.CheckConsistencyResponse
   209  	resp.Result = append(resp.Result, res)
   210  
   211  	// Bail out at this point unless the queue is the caller. All of the logic
   212  	// below should really happen in the consistency queue to keep CheckConsistency
   213  	// itself self-contained.
   214  	if !isQueue {
   215  		return resp, nil
   216  	}
   217  
   218  	if minoritySHA == "" {
   219  		// The replicas were in sync. Check that the MVCCStats haven't diverged from
   220  		// what they should be. This code originated in the realization that there
   221  		// were many bugs in our stats computations. These are being fixed, but it
   222  		// is through this mechanism that existing ranges are updated. Hence, the
   223  		// logging below is relatively timid.
   224  
   225  		// If there's no delta, there's nothing else to do.
   226  		if !haveDelta {
   227  			return resp, nil
   228  		}
   229  
   230  		if delta.ContainsEstimates <= 0 && fatalOnStatsMismatch {
   231  			// We just found out that the recomputation doesn't match the persisted stats,
   232  			// so ContainsEstimates should have been strictly positive.
   233  
   234  			var v roachpb.Version
   235  			if err := r.store.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   236  				return txn.GetProto(ctx, keys.BootstrapVersionKey, &v)
   237  			}); err != nil {
   238  				log.Infof(ctx, "while retrieving cluster bootstrap version: %s", err)
   239  				// Intentionally continue with the assumption that it's the current version.
   240  				v = r.store.cfg.Settings.Version.ActiveVersion(ctx).Version
   241  			}
   242  			// For clusters that ever ran <19.1, we're not so sure that the stats are
   243  			// consistent. Verify this only for clusters that started out on 19.1 or
   244  			// higher.
   245  			if !v.Less(roachpb.Version{Major: 19, Minor: 1}) {
   246  				log.Fatalf(ctx, "found a delta of %+v", log.Safe(delta))
   247  			}
   248  		}
   249  
   250  	// We've found that there's something to correct; send a RecomputeStatsRequest. Note that this
   251  		// code runs only on the lease holder (at the time of initiating the computation), so this work
   252  		// isn't duplicated except in rare leaseholder change scenarios (and concurrent invocation of
   253  		// RecomputeStats is allowed because these requests block on one another). Also, we're
   254  		// essentially paced by the consistency checker so we won't call this too often.
   255  		log.Infof(ctx, "triggering stats recomputation to resolve delta of %+v", results[0].Response.Delta)
   256  
   257  		req := roachpb.RecomputeStatsRequest{
   258  			RequestHeader: roachpb.RequestHeader{Key: startKey},
   259  		}
   260  
   261  		var b kv.Batch
   262  		b.AddRawRequest(&req)
   263  
   264  		err := r.store.db.Run(ctx, &b)
   265  		return resp, roachpb.NewError(err)
   266  	}
   267  
   268  	if args.WithDiff {
   269  		// A diff was already printed. Return because all the code below will do
   270  		// is request another consistency check, with a diff and with
   271  		// instructions to terminate the minority nodes.
   272  		log.Errorf(ctx, "consistency check failed")
   273  		return resp, nil
   274  	}
   275  
   276  	// No diff was printed, so we want to re-run with diff.
   277  	// Note that this recursive call will be terminated in the `args.WithDiff`
   278  	// branch above.
   279  	args.WithDiff = true
   280  	args.Checkpoint = true
   281  	for _, idx := range shaToIdxs[minoritySHA] {
   282  		args.Terminate = append(args.Terminate, results[idx].Replica)
   283  	}
   284  	log.Errorf(ctx, "consistency check failed; fetching details and shutting down minority %v", args.Terminate)
   285  
   286  	// We've noticed in practice that if the snapshot diff is large, the log
   287  	// file it is written to is promptly rotated away, so up the limits while the
   288  	// diff printing occurs.
   289  	//
   290  	// See:
   291  	// https://github.com/cockroachdb/cockroach/issues/36861
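        	//
        	// The CompareAndSwap calls bump the limit only if it still holds the value we
        	// read, and the deferred call restores it only if our bump is still in place,
        	// so a concurrent adjustment by someone else is not clobbered.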
   292  	oldLogLimit := atomic.LoadInt64(&log.LogFilesCombinedMaxSize)
   293  	atomic.CompareAndSwapInt64(&log.LogFilesCombinedMaxSize, oldLogLimit, math.MaxInt64)
   294  	defer atomic.CompareAndSwapInt64(&log.LogFilesCombinedMaxSize, math.MaxInt64, oldLogLimit)
   295  
   296  	if _, pErr := r.CheckConsistency(ctx, args); pErr != nil {
   297  		log.Errorf(ctx, "replica inconsistency detected; could not obtain actual diff: %s", pErr)
   298  	}
   299  
   300  	return resp, nil
   301  }
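
        // The following is a hedged sketch of how a caller might drive CheckConsistency;
        // it is illustrative only (r and ctx are assumed to be in scope) and elides how
        // the consistency queue actually constructs its request.
        //
        //   resp, pErr := r.CheckConsistency(ctx, roachpb.CheckConsistencyRequest{
        //       RequestHeader: roachpb.RequestHeader{Key: r.Desc().StartKey.AsRawKey()},
        //       Mode:          roachpb.ChecksumMode_CHECK_VIA_QUEUE,
        //   })
        //   if pErr != nil {
        //       // handle the error
        //   }
        //   for _, result := range resp.Result {
        //       // result.Status reports the outcome for result.RangeID.
        //   }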
   302  
   303  // A ConsistencyCheckResult contains the outcome of a CollectChecksum call.
   304  type ConsistencyCheckResult struct {
   305  	Replica  roachpb.ReplicaDescriptor
   306  	Response CollectChecksumResponse
   307  	Err      error
   308  }
   309  
   310  func (r *Replica) collectChecksumFromReplica(
   311  	ctx context.Context, replica roachpb.ReplicaDescriptor, id uuid.UUID, checksum []byte,
   312  ) (CollectChecksumResponse, error) {
   313  	conn, err := r.store.cfg.NodeDialer.Dial(ctx, replica.NodeID, rpc.DefaultClass)
   314  	if err != nil {
   315  		return CollectChecksumResponse{},
   316  			errors.Wrapf(err, "could not dial node ID %d", replica.NodeID)
   317  	}
   318  	client := NewPerReplicaClient(conn)
   319  	req := &CollectChecksumRequest{
   320  		StoreRequestHeader: StoreRequestHeader{NodeID: replica.NodeID, StoreID: replica.StoreID},
   321  		RangeID:            r.RangeID,
   322  		ChecksumID:         id,
   323  		Checksum:           checksum,
   324  	}
   325  	resp, err := client.CollectChecksum(ctx, req)
   326  	if err != nil {
   327  		return CollectChecksumResponse{}, err
   328  	}
   329  	return *resp, nil
   330  }
   331  
   332  // RunConsistencyCheck carries out a round of CheckConsistency/CollectChecksum
   333  // for the members of this range, returning the results (which it does not act
   334  // upon). The first result belongs to the local replica; in particular, whenever
   335  // no error is returned there is at least this one result.
   336  func (r *Replica) RunConsistencyCheck(
   337  	ctx context.Context, req roachpb.ComputeChecksumRequest,
   338  ) ([]ConsistencyCheckResult, error) {
   339  	// Send a ComputeChecksum which will trigger computation of the checksum on
   340  	// all replicas.
   341  	res, pErr := kv.SendWrapped(ctx, r.store.db.NonTransactionalSender(), &req)
   342  	if pErr != nil {
   343  		return nil, pErr.GoError()
   344  	}
   345  	ccRes := res.(*roachpb.ComputeChecksumResponse)
   346  
   347  	var orderedReplicas []roachpb.ReplicaDescriptor
   348  	{
   349  		desc := r.Desc()
   350  		localReplica, err := r.GetReplicaDescriptor()
   351  		if err != nil {
   352  			return nil, errors.Wrap(err, "could not get replica descriptor")
   353  		}
   354  
   355  		// Move the local replica to the front (which makes it the "master"
   356  		// we're comparing against).
   357  		orderedReplicas = append(orderedReplicas, desc.Replicas().All()...)
   358  
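        		// The less function below only reports true when i holds the local replica.
        		// That is not a total order, but it is enough to move the single local
        		// element to the front of the slice.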
   359  		sort.Slice(orderedReplicas, func(i, j int) bool {
   360  			return orderedReplicas[i] == localReplica
   361  		})
   362  	}
   363  
   364  	resultCh := make(chan ConsistencyCheckResult, len(orderedReplicas))
   365  	var results []ConsistencyCheckResult
   366  	var wg sync.WaitGroup
   367  
   368  	for _, replica := range orderedReplicas {
   369  		wg.Add(1)
   370  		replica := replica // per-iteration copy for the goroutine
   371  		if err := r.store.Stopper().RunAsyncTask(ctx, "storage.Replica: checking consistency",
   372  			func(ctx context.Context) {
   373  				defer wg.Done()
   374  
   375  				var resp CollectChecksumResponse
   376  				err := contextutil.RunWithTimeout(ctx, "collect checksum", collectChecksumTimeout,
   377  					func(ctx context.Context) error {
   378  						var masterChecksum []byte
   379  						if len(results) > 0 {
   380  							masterChecksum = results[0].Response.Checksum
   381  						}
   382  						var err error
   383  						resp, err = r.collectChecksumFromReplica(ctx, replica, ccRes.ChecksumID, masterChecksum)
   384  						return err
   385  					})
   386  				resultCh <- ConsistencyCheckResult{
   387  					Replica:  replica,
   388  					Response: resp,
   389  					Err:      err,
   390  				}
   391  			}); err != nil {
   392  			wg.Done()
   393  			// If we can't start tasks, the node is likely draining. Just return the error verbatim.
   394  			return nil, err
   395  		}
   396  
   397  		// Collect the master result eagerly so that we can send a SHA in the
   398  		// remaining requests (this is used for logging inconsistencies on the
   399  		// remote nodes only).
   400  		if len(results) == 0 {
   401  			wg.Wait()
   402  			result := <-resultCh
   403  			if err := result.Err; err != nil {
   404  				// If we can't compute the local checksum, give up.
   405  				return nil, errors.Wrap(err, "computing own checksum")
   406  			}
   407  			results = append(results, result)
   408  		}
   409  	}
   410  
   411  	wg.Wait()
   412  	close(resultCh)
   413  
   414  	// Collect the remaining results.
   415  	for result := range resultCh {
   416  		results = append(results, result)
   417  	}
   418  
   419  	return results, nil
   420  }
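
        // One way a caller might consume the results (an illustrative sketch;
        // CheckConsistency above groups replicas by checksum instead):
        //
        //   results, err := r.RunConsistencyCheck(ctx, req)
        //   if err != nil {
        //       return err
        //   }
        //   for _, res := range results[1:] {
        //       if res.Err == nil && !bytes.Equal(res.Response.Checksum, results[0].Response.Checksum) {
        //           // res.Replica diverged from the local replica.
        //       }
        //   }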
   421  
   422  // getChecksum waits for the result of ComputeChecksum and returns it.
   423  // It returns an error if there is no checksum being computed for the id,
   424  // or if it has already been GCed.
   425  func (r *Replica) getChecksum(ctx context.Context, id uuid.UUID) (ReplicaChecksum, error) {
   426  	now := timeutil.Now()
   427  	r.mu.Lock()
   428  	r.gcOldChecksumEntriesLocked(now)
   429  	c, ok := r.mu.checksums[id]
   430  	if !ok {
   431  		// TODO(tbg): we need to unconditionally set a gcTimestamp or this
   432  		// request can simply get stuck forever, or be canceled and still leak an
   433  		// entry in r.mu.checksums.
   434  		if d, dOk := ctx.Deadline(); dOk {
   435  			c.gcTimestamp = d
   436  		}
   437  		c.notify = make(chan struct{})
   438  		r.mu.checksums[id] = c
   439  	}
   440  	r.mu.Unlock()
   441  	// Wait
   442  	select {
   443  	case <-r.store.Stopper().ShouldStop():
   444  		return ReplicaChecksum{},
   445  			errors.Errorf("store has stopped while waiting for compute checksum (ID = %s)", id)
   446  	case <-ctx.Done():
   447  		return ReplicaChecksum{},
   448  			errors.Wrapf(ctx.Err(), "while waiting for compute checksum (ID = %s)", id)
   449  	case <-c.notify:
   450  	}
   451  	if log.V(1) {
   452  		log.Infof(ctx, "waited for compute checksum for %s", timeutil.Since(now))
   453  	}
   454  	r.mu.RLock()
   455  	c, ok = r.mu.checksums[id]
   456  	r.mu.RUnlock()
   457  	// If the checksum wasn't found or the checksum could not be computed, error out.
   458  	// The latter case can occur when there's a version mismatch or, more generally,
   459  	// when the (async) checksum computation fails.
   460  	if !ok || c.Checksum == nil {
   461  		return ReplicaChecksum{}, errors.Errorf("no checksum found (ID = %s)", id)
   462  	}
   463  	return c, nil
   464  }
   465  
   466  // computeChecksumDone adds the computed checksum, sets a deadline for GCing the
   467  // checksum, and sends out a notification.
   468  func (r *Replica) computeChecksumDone(
   469  	ctx context.Context, id uuid.UUID, result *replicaHash, snapshot *roachpb.RaftSnapshotData,
   470  ) {
   471  	r.mu.Lock()
   472  	defer r.mu.Unlock()
   473  	if c, ok := r.mu.checksums[id]; ok {
   474  		if result != nil {
   475  			c.Checksum = result.SHA512[:]
   476  
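        			// The delta below is persisted minus recomputed, matching the
        			// "stats.Sub(recomputation)" line printed by CheckConsistency; a zero
        			// delta means the stored stats agree with a recomputation from the data.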
   477  			delta := result.PersistedMS
   478  			delta.Subtract(result.RecomputedMS)
   479  			c.Delta = enginepb.MVCCStatsDelta(delta)
   480  			c.Persisted = result.PersistedMS
   481  		}
   482  		c.gcTimestamp = timeutil.Now().Add(batcheval.ReplicaChecksumGCInterval)
   483  		c.Snapshot = snapshot
   484  		r.mu.checksums[id] = c
   485  		// Notify
   486  		close(c.notify)
   487  	} else {
   488  		// ComputeChecksum adds an entry into the map, and the entry can
   489  		// only be GCed once the gcTimestamp is set above. Something
   490  		// really bad happened.
   491  		log.Errorf(ctx, "no map entry for checksum (ID = %s)", id)
   492  	}
   493  }
   494  
   495  type replicaHash struct {
   496  	SHA512                    [sha512.Size]byte
   497  	PersistedMS, RecomputedMS enginepb.MVCCStats
   498  }
   499  
   500  // sha512 computes the SHA512 hash of all the replica data at the snapshot.
   501  // It will dump all the kv data into the snapshot if one is provided.
   502  func (r *Replica) sha512(
   503  	ctx context.Context,
   504  	desc roachpb.RangeDescriptor,
   505  	snap storage.Reader,
   506  	snapshot *roachpb.RaftSnapshotData,
   507  	mode roachpb.ChecksumMode,
   508  ) (*replicaHash, error) {
   509  	statsOnly := mode == roachpb.ChecksumMode_CHECK_STATS
   510  
   511  	// Iterate over all the data in the range.
   512  	iter := snap.NewIterator(storage.IterOptions{UpperBound: desc.EndKey.AsRawKey()})
   513  	defer iter.Close()
   514  
   515  	var alloc bufalloc.ByteAllocator
   516  	var intBuf [8]byte
   517  	var legacyTimestamp hlc.LegacyTimestamp
   518  	var timestampBuf []byte
   519  	hasher := sha512.New()
   520  
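        	// Each visited key/value pair feeds the hasher in a fixed layout: the key
        	// length and the value length as little-endian uint64s, then the raw key
        	// bytes, the marshaled legacy timestamp, and finally the raw value bytes.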
   521  	visitor := func(unsafeKey storage.MVCCKey, unsafeValue []byte) error {
   522  		if snapshot != nil {
   523  			// Add (a copy of) the kv pair into the debug message.
   524  			kv := roachpb.RaftSnapshotData_KeyValue{
   525  				Timestamp: unsafeKey.Timestamp,
   526  			}
   527  			alloc, kv.Key = alloc.Copy(unsafeKey.Key, 0)
   528  			alloc, kv.Value = alloc.Copy(unsafeValue, 0)
   529  			snapshot.KV = append(snapshot.KV, kv)
   530  		}
   531  
   532  		// Encode the length of the key and value.
   533  		binary.LittleEndian.PutUint64(intBuf[:], uint64(len(unsafeKey.Key)))
   534  		if _, err := hasher.Write(intBuf[:]); err != nil {
   535  			return err
   536  		}
   537  		binary.LittleEndian.PutUint64(intBuf[:], uint64(len(unsafeValue)))
   538  		if _, err := hasher.Write(intBuf[:]); err != nil {
   539  			return err
   540  		}
   541  		if _, err := hasher.Write(unsafeKey.Key); err != nil {
   542  			return err
   543  		}
   544  		legacyTimestamp = hlc.LegacyTimestamp(unsafeKey.Timestamp)
   545  		if size := legacyTimestamp.Size(); size > cap(timestampBuf) {
   546  			timestampBuf = make([]byte, size)
   547  		} else {
   548  			timestampBuf = timestampBuf[:size]
   549  		}
   550  		if _, err := protoutil.MarshalTo(&legacyTimestamp, timestampBuf); err != nil {
   551  			return err
   552  		}
   553  		if _, err := hasher.Write(timestampBuf); err != nil {
   554  			return err
   555  		}
   556  		_, err := hasher.Write(unsafeValue)
   557  		return err
   558  	}
   559  
   560  	var ms enginepb.MVCCStats
   561  	// In statsOnly mode, we hash only the RangeAppliedState. In regular mode, hash
   562  	// all of the replicated key space.
   563  	if !statsOnly {
   564  		for _, span := range rditer.MakeReplicatedKeyRanges(&desc) {
   565  			spanMS, err := storage.ComputeStatsGo(
   566  				iter, span.Start.Key, span.End.Key, 0 /* nowNanos */, visitor,
   567  			)
   568  			if err != nil {
   569  				return nil, err
   570  			}
   571  			ms.Add(spanMS)
   572  		}
   573  	}
   574  
   575  	var result replicaHash
   576  	result.RecomputedMS = ms
   577  
   578  	rangeAppliedState, err := stateloader.Make(desc.RangeID).LoadRangeAppliedState(ctx, snap)
   579  	if err != nil {
   580  		return nil, err
   581  	}
   582  	if rangeAppliedState == nil {
   583  		// This error is transient: the range applied state is used in v2.1 already,
   584  		// but clusters bootstrapped before v2.1 migrate into it on a per-range basis.
   585  		// Clusters bootstrapped at v2.1 or higher will never hit this path since
   586  		// there's always an applied state.
   587  		return nil, errors.New("no range applied state found")
   588  	}
   589  	result.PersistedMS = rangeAppliedState.RangeStats.ToStats()
   590  
   591  	if statsOnly {
   592  		b, err := protoutil.Marshal(rangeAppliedState)
   593  		if err != nil {
   594  			return nil, err
   595  		}
   596  		if snapshot != nil {
   597  			// Add the range applied state to the diff.
   598  			kv := roachpb.RaftSnapshotData_KeyValue{
   599  				Timestamp: hlc.Timestamp{},
   600  			}
   601  			kv.Key = keys.RangeAppliedStateKey(desc.RangeID)
   602  			var v roachpb.Value
   603  			if err := v.SetProto(rangeAppliedState); err != nil {
   604  				return nil, err
   605  			}
   606  			kv.Value = v.RawBytes
   607  			snapshot.KV = append(snapshot.KV, kv)
   608  		}
   609  		if _, err := hasher.Write(b); err != nil {
   610  			return nil, err
   611  		}
   612  	}
   613  
   614  	hasher.Sum(result.SHA512[:0])
   615  
   616  	// We're not required to do so, but it looks nicer if both stats are aged to
   617  	// the same timestamp.
   618  	result.RecomputedMS.AgeTo(result.PersistedMS.LastUpdateNanos)
   619  
   620  	return &result, nil
   621  }