github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"strings"
    17  	"sync/atomic"
    18  	"time"
    19  	"unsafe"
    20  
    21  	"github.com/cockroachdb/cockroach/pkg/base"
    22  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    23  	"github.com/cockroachdb/cockroach/pkg/keys"
    24  	"github.com/cockroachdb/cockroach/pkg/kv"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
    26  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    31  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    32  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed"
    33  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/split"
    34  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    35  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    36  	"github.com/cockroachdb/cockroach/pkg/rpc"
    37  	"github.com/cockroachdb/cockroach/pkg/settings"
    38  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    39  	"github.com/cockroachdb/cockroach/pkg/storage"
    40  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    41  	enginepb "github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    42  	"github.com/cockroachdb/cockroach/pkg/util"
    43  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    44  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    45  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    46  	"github.com/cockroachdb/cockroach/pkg/util/log"
    47  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    48  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    49  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    50  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    51  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    52  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    53  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    54  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    55  	"github.com/cockroachdb/errors"
    56  	"github.com/google/btree"
    57  	"github.com/kr/pretty"
    58  	"go.etcd.io/etcd/raft"
    59  )
    60  
    61  const (
    64  	// optimizePutThreshold is the minimum length of a contiguous run
    65  	// of batched puts or conditional puts, after which the constituent
    66  	// put operations will possibly be optimized by determining whether
    67  	// the key space being written is starting out empty.
    68  	optimizePutThreshold = 10
    69  
    70  	replicaChangeTxnName = "change-replica"
    71  	splitTxnName         = "split"
    72  	mergeTxnName         = "merge"
    73  
    74  	defaultReplicaRaftMuWarnThreshold = 500 * time.Millisecond
    75  )
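
// Editor's note: a hedged sketch, not part of the original file, of when the
// optimizePutThreshold constant above matters. A batch that writes a long,
// contiguous run of keys can be evaluated more cheaply because the destination
// span may be probed once for existing data instead of once per key. The
// client-side batch below is purely illustrative; the key names are made up.
//
//	b := &kv.Batch{}
//	for i := 0; i < 16; i++ { // 16 >= optimizePutThreshold
//		b.Put(fmt.Sprintf("/demo/key-%04d", i), "value")
//	}
//	// If these puts reach one replica as a contiguous run, evaluation may
//	// first check whether the written key space is empty and, if so, use
//	// cheaper blind writes.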
    76  
    77  var testingDisableQuiescence = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_QUIESCENCE", false)
    78  
    79  var disableSyncRaftLog = settings.RegisterBoolSetting(
    80  	"kv.raft_log.disable_synchronization_unsafe",
    81  	"set to true to disable synchronization on Raft log writes to persistent storage. "+
    82  		"Setting to true risks data loss or data corruption on server crashes. "+
    83  		"The setting is meant for internal testing only and SHOULD NOT be used in production.",
    84  	false,
    85  )
    86  
    87  // UseAtomicReplicationChanges determines whether to issue atomic replication changes.
    88  // This has no effect until the cluster version is 19.2 or higher.
    89  var UseAtomicReplicationChanges = settings.RegisterBoolSetting(
    90  	"kv.atomic_replication_changes.enabled",
    91  	"use atomic replication changes",
    92  	true,
    93  )
    94  
    95  // MaxCommandSizeFloor is the minimum allowed value for the MaxCommandSize
    96  // cluster setting.
    97  const MaxCommandSizeFloor = 4 << 20 // 4MB
    98  
    99  // MaxCommandSize wraps "kv.raft.command.max_size".
   100  var MaxCommandSize = settings.RegisterValidatedByteSizeSetting(
   101  	"kv.raft.command.max_size",
   102  	"maximum size of a raft command",
   103  	64<<20,
   104  	func(size int64) error {
   105  		if size < MaxCommandSizeFloor {
   106  			return fmt.Errorf("max_size must be greater than %s", humanizeutil.IBytes(MaxCommandSizeFloor))
   107  		}
   108  		return nil
   109  	},
   110  )
   111  
   112  // StrictGCEnforcement controls whether requests are rejected based on the GC
   113  // threshold and the current GC TTL (true) or just based on the GC threshold
   114  // (false).
   115  var StrictGCEnforcement = settings.RegisterBoolSetting(
   116  	"kv.gc_ttl.strict_enforcement.enabled",
   117  	"if true, fail to serve requests at timestamps below the TTL even if the data still exists",
   118  	true,
   119  )
   120  
   121  type proposalReevaluationReason int
   122  
   123  const (
   124  	proposalNoReevaluation proposalReevaluationReason = iota
   125  	// proposalIllegalLeaseIndex indicates the proposal failed to apply at
   126  	// a Lease index it was not legal for. The command should be re-evaluated.
   127  	proposalIllegalLeaseIndex
   128  )
   129  
   130  type atomicDescString struct {
   131  	strPtr unsafe.Pointer
   132  }
   133  
   134  // store atomically updates d.strPtr with the string representation of desc.
   135  func (d *atomicDescString) store(replicaID roachpb.ReplicaID, desc *roachpb.RangeDescriptor) {
   136  	var buf strings.Builder
   137  	fmt.Fprintf(&buf, "%d/", desc.RangeID)
   138  	if replicaID == 0 {
   139  		fmt.Fprintf(&buf, "?:")
   140  	} else {
   141  		fmt.Fprintf(&buf, "%d:", replicaID)
   142  	}
   143  
   144  	if !desc.IsInitialized() {
   145  		buf.WriteString("{-}")
   146  	} else {
   147  		const maxRangeChars = 30
   148  		rngStr := keys.PrettyPrintRange(roachpb.Key(desc.StartKey), roachpb.Key(desc.EndKey), maxRangeChars)
   149  		buf.WriteString(rngStr)
   150  	}
   151  
   152  	str := buf.String()
   153  	atomic.StorePointer(&d.strPtr, unsafe.Pointer(&str))
   154  }
   155  
   156  // String returns the string representation of the range; since we are not
   157  // using a lock, the copy might be inconsistent.
   158  func (d *atomicDescString) String() string {
   159  	return *(*string)(atomic.LoadPointer(&d.strPtr))
   160  }
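
// Editor's note: a minimal sketch, not part of the original file, of the
// publish/read pattern used by atomicDescString above. A writer builds a new
// immutable string and atomically publishes a pointer to it; readers load the
// pointer without taking Replica.mu. The identifiers below are hypothetical.
//
//	var cached unsafe.Pointer // effectively a *string
//
//	func publish(s string) {
//		atomic.StorePointer(&cached, unsafe.Pointer(&s))
//	}
//
//	func read() string {
//		return *(*string)(atomic.LoadPointer(&cached))
//	}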
   161  
   162  // atomicConnectionClass stores an rpc.ConnectionClass atomically.
   163  type atomicConnectionClass uint32
   164  
   165  // get reads the current value of the ConnectionClass.
   166  func (c *atomicConnectionClass) get() rpc.ConnectionClass {
   167  	return rpc.ConnectionClass(atomic.LoadUint32((*uint32)(c)))
   168  }
   169  
   170  // set updates the current value of the ConnectionClass.
   171  func (c *atomicConnectionClass) set(cc rpc.ConnectionClass) {
   172  	atomic.StoreUint32((*uint32)(c), uint32(cc))
   173  }
   174  
   175  // A Replica is a contiguous keyspace with writes managed via an
   176  // instance of the Raft consensus algorithm. Many ranges may exist
   177  // in a store and they are unlikely to be contiguous. Ranges are
   178  // independent units and are responsible for maintaining their own
   179  // integrity by replacing failed replicas, splitting and merging
   180  // as appropriate.
   181  type Replica struct {
   182  	log.AmbientContext
   183  
   184  	// TODO(tschottdorf): Duplicates r.mu.state.desc.RangeID; revisit that.
   185  	RangeID roachpb.RangeID // Only set by the constructor
   186  
   187  	store     *Store
   188  	abortSpan *abortspan.AbortSpan // Avoids anomalous reads after abort
   189  
   190  	// leaseholderStats tracks all incoming BatchRequests to the replica and which
   191  	// localities they come from in order to aid in lease rebalancing decisions.
   192  	leaseholderStats *replicaStats
   193  	// writeStats tracks the number of keys written by applied raft commands
   194  	// in order to aid in replica rebalancing decisions.
   195  	writeStats *replicaStats
   196  
   197  	// creatingReplica is set when a replica is created as uninitialized
   198  	// via a raft message.
   199  	creatingReplica *roachpb.ReplicaDescriptor
   200  
   201  	// Held in read mode during read-only commands. Held in exclusive mode to
   202  	// prevent read-only commands from executing. Acquired before the embedded
   203  	// RWMutex.
   204  	readOnlyCmdMu syncutil.RWMutex
   205  
   206  	// rangeStr is a string representation of a RangeDescriptor that can be
   207  	// atomically read and updated without needing to acquire the replica.mu lock.
   208  	// All updates to state.Desc should be duplicated here.
   209  	rangeStr atomicDescString
   210  
   211  	// connectionClass controls the ConnectionClass used to send raft messages.
   212  	connectionClass atomicConnectionClass
   213  
    214  	// raftMu protects Raft processing of the replica.
   215  	//
   216  	// Locking notes: Replica.raftMu < Replica.mu
   217  	raftMu struct {
   218  		syncutil.Mutex
   219  
   220  		// Note that there are two StateLoaders, in raftMu and mu,
   221  		// depending on which lock is being held.
   222  		stateLoader stateloader.StateLoader
   223  		// on-disk storage for sideloaded SSTables. nil when there's no ReplicaID.
   224  		sideloaded SideloadStorage
   225  		// stateMachine is used to apply committed raft entries.
   226  		stateMachine replicaStateMachine
   227  		// decoder is used to decode committed raft entries.
   228  		decoder replicaDecoder
   229  	}
   230  
   231  	// Contains the lease history when enabled.
   232  	leaseHistory *leaseHistory
   233  
   234  	// concMgr sequences incoming requests and provides isolation between
   235  	// requests that intend to perform conflicting operations. It is the
   236  	// centerpiece of transaction contention handling.
   237  	concMgr concurrency.Manager
   238  
   239  	mu struct {
   240  		// Protects all fields in the mu struct.
   241  		syncutil.RWMutex
   242  		// The destroyed status of a replica indicating if it's alive, corrupt,
   243  		// scheduled for destruction or has been GCed.
   244  		// destroyStatus should only be set while also holding the raftMu.
   245  		destroyStatus
   246  		// Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce
   247  		// whenever a Raft operation is performed.
   248  		quiescent bool
   249  		// mergeComplete is non-nil if a merge is in-progress, in which case any
   250  		// requests should be held until the completion of the merge is signaled by
   251  		// the closing of the channel.
   252  		mergeComplete chan struct{}
   253  		// The state of the Raft state machine.
   254  		state kvserverpb.ReplicaState
   255  		// Last index/term persisted to the raft log (not necessarily
   256  		// committed). Note that lastTerm may be 0 (and thus invalid) even when
   257  		// lastIndex is known, in which case the term will have to be retrieved
   258  		// from the Raft log entry. Use the invalidLastTerm constant for this
   259  		// case.
   260  		lastIndex, lastTerm uint64
    261  		// A map of pending snapshots (keyed by UUID) to their raft log index and deadline.
   262  		// Used to prohibit raft log truncations that would leave a gap between
   263  		// the snapshot and the new first index. The map entry has a zero
   264  		// deadline while the snapshot is being sent and turns nonzero when the
   265  		// snapshot has completed, preventing truncation for a grace period
   266  		// (since there is a race between the snapshot completing and its being
   267  		// reflected in the raft status used to make truncation decisions).
   268  		//
   269  		// NB: If we kept only one value, we could end up in situations in which
    270  		// we're either giving some snapshots no grace period, or keeping an
   271  		// already finished snapshot "pending" for extended periods of time
   272  		// (preventing log truncation).
   273  		snapshotLogTruncationConstraints map[uuid.UUID]snapTruncationInfo
   274  		// raftLogSize is the approximate size in bytes of the persisted raft
   275  		// log, including sideloaded entries' payloads. The value itself is not
   276  		// persisted and is computed lazily, paced by the raft log truncation
   277  		// queue which will recompute the log size when it finds it
   278  		// uninitialized. This recomputation mechanism isn't relevant for ranges
   279  		// which see regular write activity (for those the log size will deviate
   280  		// from zero quickly, and so it won't be recomputed but will undercount
   281  		// until the first truncation is carried out), but it prevents a large
   282  		// dormant Raft log from sitting around forever, which has caused problems
   283  		// in the past.
   284  		raftLogSize int64
   285  		// If raftLogSizeTrusted is false, don't trust the above raftLogSize until
   286  		// it has been recomputed.
   287  		raftLogSizeTrusted bool
   288  		// raftLogLastCheckSize is the value of raftLogSize the last time the Raft
   289  		// log was checked for truncation or at the time of the last Raft log
   290  		// truncation.
   291  		raftLogLastCheckSize int64
   292  		// pendingLeaseRequest is used to coalesce RequestLease requests.
   293  		pendingLeaseRequest pendingLeaseRequest
   294  		// minLeaseProposedTS is the minimum acceptable lease.ProposedTS; only
   295  		// leases proposed after this timestamp can be used for proposing commands.
   296  		// This is used to protect against several hazards:
   297  		// - leases held (or even proposed) before a restart cannot be used after a
   298  		// restart. This is because:
   299  		// 		a) the spanlatch manager is wiped during the restart; there might be
   300  		// 		writes in flight that do not have the latches they held reflected. So,
   301  		// 		we need to synchronize all new reads with those old in-flight writes.
   302  		// 		Forcing acquisition of a new lease essentially flushes all the
   303  		// 		previous raft commands.
   304  		// 		b) a lease transfer might have been in progress at the time of the
   305  		// 		restart. Using the existing lease after the restart would break the
   306  		// 		transfer proposer's promise to not use the existing lease.
   307  		// - a lease cannot be used after a transfer is initiated. Moreover, even
    308  		// lease extensions that were in flight at the time of the transfer cannot be
    309  		// used if they eventually apply.
   310  		minLeaseProposedTS hlc.Timestamp
   311  		// A pointer to the zone config for this replica.
   312  		zone *zonepb.ZoneConfig
   313  		// proposalBuf buffers Raft commands as they are passed to the Raft
   314  		// replication subsystem. The buffer is populated by requests after
   315  		// evaluation and is consumed by the Raft processing thread. Once
   316  		// consumed, commands are proposed through Raft and moved to the
   317  		// proposals map.
   318  		//
   319  		// Access to proposalBuf must occur *without* holding the mutex.
   320  		// Instead, the buffer internally holds a reference to mu and will use
   321  		// it appropriately.
   322  		proposalBuf propBuf
   323  		// proposals stores the Raft in-flight commands which originated at
   324  		// this Replica, i.e. all commands for which propose has been called,
   325  		// but which have not yet applied.
   326  		//
   327  		// The *ProposalData in the map are "owned" by it. Elements from the
   328  		// map must only be referenced while the Replica.mu is held, except
   329  		// if the element is removed from the map first. Modifying the proposal
   330  		// itself may require holding the raftMu as fields can be accessed
   331  		// underneath raft. See comments on ProposalData fields for synchronization
   332  		// requirements.
   333  		//
   334  		// Due to Raft reproposals, multiple in-flight Raft entries can have
   335  		// the same CmdIDKey, all corresponding to the same KV request. However,
   336  		// not all Raft entries with a given command ID will correspond directly
   337  		// to the *RaftCommand contained in its associated *ProposalData. This
   338  		// is because the *RaftCommand can be mutated during reproposals by
   339  		// Replica.tryReproposeWithNewLeaseIndex.
   340  		//
   341  		// TODO(ajwerner): move the proposal map and ProposalData entirely under
   342  		// the raftMu.
   343  		proposals         map[kvserverbase.CmdIDKey]*ProposalData
   344  		internalRaftGroup *raft.RawNode
   345  		// The ID of the replica within the Raft group. This value may never be 0.
   346  		replicaID roachpb.ReplicaID
   347  		// The minimum allowed ID for this replica. Initialized from
   348  		// RangeTombstone.NextReplicaID.
   349  		tombstoneMinReplicaID roachpb.ReplicaID
   350  
   351  		// The ID of the leader replica within the Raft group. Used to determine
   352  		// when the leadership changes.
   353  		leaderID roachpb.ReplicaID
   354  		// The most recently added replica for the range and when it was added.
   355  		// Used to determine whether a replica is new enough that we shouldn't
    356  		// penalize it for being slightly behind. These fields get cleared out once
   357  		// we know that the replica has caught up.
   358  		lastReplicaAdded     roachpb.ReplicaID
   359  		lastReplicaAddedTime time.Time
   360  		// initialMaxClosed is the initial maxClosed timestamp for the replica as known
   361  		// from its left-hand-side upon creation.
   362  		initialMaxClosed hlc.Timestamp
   363  
   364  		// The most recently updated time for each follower of this range. This is updated
   365  		// every time a Raft message is received from a peer.
   366  		// Note that superficially it seems that similar information is contained in the
   367  		// Progress of a RaftStatus, which has a RecentActive field. However, that field
   368  		// is always true unless CheckQuorum is active, which at the time of writing in
   369  		// CockroachDB is not the case.
   370  		//
   371  		// The lastUpdateTimes map is also updated when a leaseholder steps up
   372  		// (making the assumption that all followers are live at that point),
   373  		// and when the range unquiesces (marking all replicating followers as
   374  		// live).
   375  		//
   376  		// TODO(tschottdorf): keeping a map on each replica seems to be
   377  		// overdoing it. We should map the replicaID to a NodeID and then use
   378  		// node liveness (or any sensible measure of the peer being around).
   379  		// The danger in doing so is that a single stuck replica on an otherwise
   380  		// functioning node could fill up the quota pool. We are already taking
   381  		// this kind of risk though: a replica that gets stuck on an otherwise
   382  		// live node will not lose leaseholdership.
   383  		lastUpdateTimes lastUpdateTimesMap
   384  
   385  		// The last seen replica descriptors from incoming Raft messages. These are
   386  		// stored so that the replica still knows the replica descriptors for itself
   387  		// and for its message recipients in the circumstances when its RangeDescriptor
   388  		// is out of date.
   389  		//
   390  		// Normally, a replica knows about the other replica descriptors for a
   391  		// range via the RangeDescriptor stored in Replica.mu.state.Desc. But that
   392  		// descriptor is only updated during a Split or ChangeReplicas operation.
   393  		// There are periods during a Replica's lifetime when that information is
   394  		// out of date:
   395  		//
   396  		// 1. When a replica is being newly created as the result of an incoming
   397  		// Raft message for it. This is the common case for ChangeReplicas and an
   398  		// uncommon case for Splits. The leader will be sending the replica
   399  		// messages and the replica needs to be able to respond before it can
   400  		// receive an updated range descriptor (via a snapshot,
   401  		// changeReplicasTrigger, or splitTrigger).
   402  		//
   403  		// 2. If the node containing a replica is partitioned or down while the
   404  		// replicas for the range are updated. When the node comes back up, other
   405  		// replicas may begin communicating with it and it needs to be able to
   406  		// respond. Unlike 1 where there is no range descriptor, in this situation
   407  		// the replica has a range descriptor but it is out of date. Note that a
   408  		// replica being removed from a node and then quickly re-added before the
   409  		// replica has been GC'd will also use the last seen descriptors. In
   410  		// effect, this is another path for which the replica's local range
   411  		// descriptor is out of date.
   412  		//
   413  		// The last seen replica descriptors are updated on receipt of every raft
   414  		// message via Replica.setLastReplicaDescriptors (see
   415  		// Store.HandleRaftRequest). These last seen descriptors are used when
   416  		// the replica's RangeDescriptor contains missing or out of date descriptors
   417  		// for a replica (see Replica.sendRaftMessage).
   418  		//
   419  		// Removing a replica from Store.mu.replicas is not a problem because
   420  		// when a replica is completely removed, it won't be recreated until
   421  		// there is another event that will repopulate the replicas map in the
   422  		// range descriptor. When it is temporarily dropped and recreated, the
   423  		// newly recreated replica will have a complete range descriptor.
   424  		lastToReplica, lastFromReplica roachpb.ReplicaDescriptor
   425  
   426  		// Computed checksum at a snapshot UUID.
   427  		checksums map[uuid.UUID]ReplicaChecksum
   428  
   429  		// proposalQuota is the quota pool maintained by the lease holder where
   430  		// incoming writes acquire quota from a fixed quota pool before going
   431  		// through. If there is no quota available, the write is throttled
   432  		// until quota is made available to the pool.
   433  		// Acquired quota for a given command is only released when all the
   434  		// replicas have persisted the corresponding entry into their logs.
   435  		proposalQuota *quotapool.IntPool
   436  
    437  		// The base index is the index up to (and including) which quota was already
   438  		// released. That is, the first element in quotaReleaseQueue below is
   439  		// released as the base index moves up by one, etc.
   440  		proposalQuotaBaseIndex uint64
   441  
   442  		// Once the leader observes a proposal come 'out of Raft', we add the size
   443  		// of the associated command to a queue of quotas we have yet to release
   444  		// back to the quota pool. At that point ownership of the quota is
   445  		// transferred from r.mu.proposals to this queue.
   446  		// We'll release the respective quota once all replicas have persisted the
   447  		// corresponding entry into their logs (or once we give up waiting on some
   448  		// replica because it looks like it's dead).
   449  		quotaReleaseQueue []*quotapool.IntAlloc
   450  
   451  		// Counts calls to Replica.tick()
   452  		ticks int
   453  
   454  		// Counts Raft messages refused due to queue congestion.
   455  		droppedMessages int
   456  
   457  		// Note that there are two replicaStateLoaders, in raftMu and mu,
   458  		// depending on which lock is being held.
   459  		stateLoader stateloader.StateLoader
   460  
   461  		// draining specifies whether this replica is draining. Raft leadership
   462  		// transfers due to a lease change will be attempted even if the target does
   463  		// not have all the log entries.
   464  		draining bool
   465  
   466  		// cachedProtectedTS provides the state of the protected timestamp
   467  		// subsystem as used on the request serving path to determine the effective
   468  		// gc threshold given the current TTL when using strict GC enforcement.
   469  		//
   470  		// It would be too expensive to go read from the protected timestamp cache
   471  		// for every request. Instead, if clients want to ensure that their request
   472  		// will see the effect of a protected timestamp record, they need to verify
   473  		// the request. See the comment on the struct for more details.
   474  		cachedProtectedTS cachedProtectedTimestampState
   475  
   476  		// largestPreviousMaxRangeSizeBytes tracks a previous zone.RangeMaxBytes
   477  		// which exceeded the current zone.RangeMaxBytes to help defeat the range
   478  		// backpressure mechanism in cases where a user reduces the configured range
   479  		// size. It is set when the zone config changes to a smaller value and the
   480  		// current range size exceeds the new value. It is cleared after the range's
    481  		// size drops below its current zone.RangeMaxBytes or if the
    482  		// zone.RangeMaxBytes increases to surpass the current value.
   483  		largestPreviousMaxRangeSizeBytes int64
   484  
   485  		// failureToGossipSystemConfig is set to true when the leaseholder of the
   486  		// range containing the system config span fails to gossip due to an
   487  		// outstanding intent (see MaybeGossipSystemConfig). It is reset when the
   488  		// system config is successfully gossiped or when the Replica loses the
   489  		// lease. It is read when handling a MaybeGossipSystemConfigIfHaveFailure
   490  		// local result trigger. That trigger is set when an EndTransaction with an
   491  		// ABORTED status is evaluated on a range containing the system config span.
   492  		//
    493  		// While the gossiping of the system config span is best-effort, the SQL
   494  		// schema leasing mechanism degrades dramatically if changes are not
   495  		// gossiped. This degradation is due to the fact that schema changes, after
   496  		// writing intents, often need to ensure that there aren't outstanding
   497  		// leases on old versions and if there are, roll back and wait until there
   498  		// are not. The problem is that this waiting may take a long time if the
   499  		// current leaseholders are not notified. We deal with this by detecting the
   500  		// abort of a transaction which might have blocked the system config from
   501  		// being gossiped and attempting to gossip again.
   502  		failureToGossipSystemConfig bool
   503  	}
   504  
   505  	rangefeedMu struct {
   506  		syncutil.RWMutex
   507  		// proc is an instance of a rangefeed Processor that is capable of
   508  		// routing rangefeed events to a set of subscribers. Will be nil if no
   509  		// subscribers are registered.
   510  		//
   511  		// Requires Replica.rangefeedMu be held when mutating the pointer.
   512  		// Requires Replica.raftMu be held when providing logical ops and
   513  		//  informing the processor of closed timestamp updates. This properly
   514  		//  synchronizes updates that are linearized and driven by the Raft log.
   515  		proc *rangefeed.Processor
   516  		// opFilter is a best-effort filter that informs the raft processing
   517  		// goroutine of which logical operations the rangefeed processor is
   518  		// interested in based on the processor's current registrations.
   519  		//
   520  		// The filter is allowed to return false positives, but not false
   521  		// negatives. False negatives are avoided by updating (expanding) the
   522  		// filter while holding the Replica.raftMu when adding new registrations
   523  		// after flushing the rangefeed.Processor event channel. This ensures
   524  		// that no events that were filtered before the new registration was
   525  		// added will be observed by the new registration and all events after
   526  		// the new registration will respect the updated filter.
   527  		//
   528  		// Requires Replica.rangefeedMu be held when mutating the pointer.
   529  		opFilter *rangefeed.Filter
   530  	}
   531  
   532  	// Throttle how often we offer this Replica to the split and merge queues.
   533  	// We have triggers downstream of Raft that do so based on limited
   534  	// information and without explicit throttling some replicas will offer once
   535  	// per applied Raft command, which is silly and also clogs up the queues'
   536  	// semaphores.
   537  	splitQueueThrottle, mergeQueueThrottle util.EveryN
   538  
   539  	// loadBasedSplitter keeps information about load-based splitting.
   540  	loadBasedSplitter split.Decider
   541  
   542  	unreachablesMu struct {
   543  		syncutil.Mutex
   544  		remotes map[roachpb.ReplicaID]struct{}
   545  	}
   546  
   547  	// r.mu < r.protectedTimestampMu
   548  	protectedTimestampMu struct {
   549  		syncutil.Mutex
   550  
   551  		// minStateReadTimestamp is a lower bound on the timestamp of the cached
   552  		// protected timestamp state which may be used when updating
   553  		// pendingGCThreshold. This field acts to eliminate races between
   554  		// verification of protected timestamp records and the setting of a new
    555  		// GC threshold.
   556  		minStateReadTimestamp hlc.Timestamp
   557  
   558  		// pendingGCThreshold holds a timestamp which is being proposed as a new
   559  		// GC threshold for the range.
   560  		pendingGCThreshold hlc.Timestamp
   561  	}
   562  }
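
// Editor's note: a hedged summary, not part of the original file, of the lock
// ordering stated in the field comments above (readOnlyCmdMu is acquired
// before the embedded mu, raftMu before mu, and mu before
// protectedTimestampMu). When several of these mutexes are needed they are
// therefore acquired in that order, for example:
//
//	r.raftMu.Lock()
//	r.mu.Lock()
//	// ... inspect or mutate state that Raft processing also reads ...
//	r.mu.Unlock()
//	r.raftMu.Unlock()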
   563  
   564  var _ batcheval.EvalContext = &Replica{}
   565  
   566  // KeyRange is an interface type for the replicasByKey BTree, to compare
   567  // Replica and ReplicaPlaceholder.
   568  type KeyRange interface {
   569  	Desc() *roachpb.RangeDescriptor
   570  	rangeKeyItem
   571  	btree.Item
   572  	fmt.Stringer
   573  }
   574  
   575  var _ KeyRange = &Replica{}
   576  
   577  var _ kv.Sender = &Replica{}
   578  
   579  // String returns the string representation of the replica using an
   580  // inconsistent copy of the range descriptor. Therefore, String does not
   581  // require a lock and its output may not be atomic with other ongoing work in
   582  // the replica. This is done to prevent deadlocks in logging sites.
   583  func (r *Replica) String() string {
   584  	return fmt.Sprintf("[n%d,s%d,r%s]", r.store.Ident.NodeID, r.store.Ident.StoreID, &r.rangeStr)
   585  }
   586  
   587  // ReplicaID returns the ID for the Replica. It may be zero if the replica does
   588  // not know its ID. Once a Replica has a non-zero ReplicaID it will never change.
   589  func (r *Replica) ReplicaID() roachpb.ReplicaID {
   590  	r.mu.RLock()
   591  	defer r.mu.RUnlock()
   592  	return r.mu.replicaID
   593  }
   594  
    595  // cleanupFailedProposalLocked cleans up after a proposal that has failed. It
   596  // clears any references to the proposal and releases associated quota.
   597  // It requires that both Replica.mu and Replica.raftMu are exclusively held.
   598  func (r *Replica) cleanupFailedProposalLocked(p *ProposalData) {
   599  	r.raftMu.AssertHeld()
   600  	r.mu.AssertHeld()
   601  	delete(r.mu.proposals, p.idKey)
   602  	p.releaseQuota()
   603  }
   604  
   605  // GetMinBytes gets the replica's minimum byte threshold.
   606  func (r *Replica) GetMinBytes() int64 {
   607  	r.mu.RLock()
   608  	defer r.mu.RUnlock()
   609  	return *r.mu.zone.RangeMinBytes
   610  }
   611  
   612  // GetMaxBytes gets the replica's maximum byte threshold.
   613  func (r *Replica) GetMaxBytes() int64 {
   614  	r.mu.RLock()
   615  	defer r.mu.RUnlock()
   616  	return *r.mu.zone.RangeMaxBytes
   617  }
   618  
   619  // SetZoneConfig sets the replica's zone config.
   620  func (r *Replica) SetZoneConfig(zone *zonepb.ZoneConfig) {
   621  	r.mu.Lock()
   622  	defer r.mu.Unlock()
   623  
   624  	if r.isInitializedRLocked() &&
   625  		r.mu.zone != nil &&
   626  		zone != nil {
   627  		total := r.mu.state.Stats.Total()
   628  
   629  		// Set largestPreviousMaxRangeSizeBytes if the current range size is above
   630  		// the new limit and we don't already have a larger value. Reset it if
   631  		// the new limit is larger than the current largest we're aware of.
   632  		if total > *zone.RangeMaxBytes &&
   633  			*zone.RangeMaxBytes < *r.mu.zone.RangeMaxBytes &&
   634  			r.mu.largestPreviousMaxRangeSizeBytes < *r.mu.zone.RangeMaxBytes &&
   635  			// Check to make sure that we're replacing a real zone config. Otherwise
   636  			// the default value would prevent backpressure until the range was
   637  			// larger than the default value. When the store starts up it sets the
   638  			// zone for the replica to this default value; later on it overwrites it
   639  			// with a new instance even if the value is the same as the default.
   640  			r.mu.zone != r.store.cfg.DefaultZoneConfig &&
   641  			r.mu.zone != r.store.cfg.DefaultSystemZoneConfig {
   642  
   643  			r.mu.largestPreviousMaxRangeSizeBytes = *r.mu.zone.RangeMaxBytes
   644  		} else if r.mu.largestPreviousMaxRangeSizeBytes > 0 &&
   645  			r.mu.largestPreviousMaxRangeSizeBytes < *zone.RangeMaxBytes {
   646  
   647  			r.mu.largestPreviousMaxRangeSizeBytes = 0
   648  		}
   649  	}
   650  	r.mu.zone = zone
   651  }
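
// Editor's note: a worked example with made-up numbers, not part of the
// original file, for the bookkeeping in SetZoneConfig above. Suppose the range
// currently holds 600 MiB and RangeMaxBytes is lowered from 1 GiB to 256 MiB:
//
//	total  := int64(600 << 20) // current range size (r.mu.state.Stats.Total())
//	oldMax := int64(1 << 30)   // *r.mu.zone.RangeMaxBytes
//	newMax := int64(256 << 20) // *zone.RangeMaxBytes
//	// total > newMax and newMax < oldMax, so largestPreviousMaxRangeSizeBytes
//	// is set to the old 1 GiB value and the range is not immediately
//	// backpressured. The field is cleared once the range shrinks below the
//	// configured maximum or the maximum is raised past 1 GiB again.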
   652  
   653  // IsFirstRange returns true if this is the first range.
   654  func (r *Replica) IsFirstRange() bool {
   655  	return r.RangeID == 1
   656  }
   657  
   658  // IsDestroyed returns a non-nil error if the replica has been destroyed
   659  // and the reason if it has.
   660  func (r *Replica) IsDestroyed() (DestroyReason, error) {
   661  	r.mu.RLock()
   662  	defer r.mu.RUnlock()
   663  	return r.isDestroyedRLocked()
   664  }
   665  
   666  func (r *Replica) isDestroyedRLocked() (DestroyReason, error) {
   667  	return r.mu.destroyStatus.reason, r.mu.destroyStatus.err
   668  }
   669  
   670  // DescAndZone returns the authoritative range descriptor as well
   671  // as the zone config for the replica.
   672  func (r *Replica) DescAndZone() (*roachpb.RangeDescriptor, *zonepb.ZoneConfig) {
   673  	r.mu.RLock()
   674  	defer r.mu.RUnlock()
   675  	return r.mu.state.Desc, r.mu.zone
   676  }
   677  
   678  // Desc returns the authoritative range descriptor, acquiring a replica lock in
   679  // the process.
   680  func (r *Replica) Desc() *roachpb.RangeDescriptor {
   681  	r.mu.RLock()
   682  	defer r.mu.RUnlock()
   683  	return r.mu.state.Desc
   684  }
   685  
   686  func (r *Replica) descRLocked() *roachpb.RangeDescriptor {
   687  	r.mu.AssertRHeld()
   688  	return r.mu.state.Desc
   689  }
   690  
   691  // NodeID returns the ID of the node this replica belongs to.
   692  func (r *Replica) NodeID() roachpb.NodeID {
   693  	return r.store.nodeDesc.NodeID
   694  }
   695  
   696  // GetNodeLocality returns the locality of the node this replica belongs to.
   697  func (r *Replica) GetNodeLocality() roachpb.Locality {
   698  	return r.store.nodeDesc.Locality
   699  }
   700  
   701  // ClusterSettings returns the node's ClusterSettings.
   702  func (r *Replica) ClusterSettings() *cluster.Settings {
   703  	return r.store.cfg.Settings
   704  }
   705  
   706  // StoreID returns the Replica's StoreID.
   707  func (r *Replica) StoreID() roachpb.StoreID {
   708  	return r.store.StoreID()
   709  }
   710  
   711  // EvalKnobs returns the EvalContext's Knobs.
   712  func (r *Replica) EvalKnobs() kvserverbase.BatchEvalTestingKnobs {
   713  	return r.store.cfg.TestingKnobs.EvalKnobs
   714  }
   715  
   716  // Clock returns the hlc clock shared by this replica.
   717  func (r *Replica) Clock() *hlc.Clock {
   718  	return r.store.Clock()
   719  }
   720  
   721  // DB returns the Replica's client DB.
   722  func (r *Replica) DB() *kv.DB {
   723  	return r.store.DB()
   724  }
   725  
   726  // Engine returns the Replica's underlying Engine. In most cases the
   727  // evaluation Batch should be used instead.
   728  func (r *Replica) Engine() storage.Engine {
   729  	return r.store.Engine()
   730  }
   731  
   732  // AbortSpan returns the Replica's AbortSpan.
   733  func (r *Replica) AbortSpan() *abortspan.AbortSpan {
   734  	// Despite its name, the AbortSpan doesn't hold on-disk data in
   735  	// memory. It just provides methods that take a Batch, so SpanSet
   736  	// declarations are enforced there.
   737  	return r.abortSpan
   738  }
   739  
   740  // GetLimiters returns the Replica's limiters.
   741  func (r *Replica) GetLimiters() *batcheval.Limiters {
   742  	return &r.store.limiters
   743  }
   744  
   745  // GetConcurrencyManager returns the Replica's concurrency.Manager.
   746  func (r *Replica) GetConcurrencyManager() concurrency.Manager {
   747  	return r.concMgr
   748  }
   749  
   750  // GetTerm returns the term of the given index in the raft log.
   751  func (r *Replica) GetTerm(i uint64) (uint64, error) {
   752  	r.mu.RLock()
   753  	defer r.mu.RUnlock()
   754  	return r.raftTermRLocked(i)
   755  }
   756  
   757  // GetRangeID returns the Range ID.
   758  func (r *Replica) GetRangeID() roachpb.RangeID {
   759  	return r.RangeID
   760  }
   761  
   762  // GetGCThreshold returns the GC threshold.
   763  func (r *Replica) GetGCThreshold() hlc.Timestamp {
   764  	r.mu.RLock()
   765  	defer r.mu.RUnlock()
   766  	return *r.mu.state.GCThreshold
   767  }
   768  
   769  // getImpliedGCThresholdRLocked returns the gc threshold of the replica which
   770  // should be used to determine the validity of commands. The returned timestamp
   771  // may be newer than the replica's true GC threshold if strict enforcement
   772  // is enabled and the TTL has passed. If this is an admin command or this range
   773  // contains data outside of the user keyspace, we return the true GC threshold.
   774  func (r *Replica) getImpliedGCThresholdRLocked(
   775  	st *kvserverpb.LeaseStatus, isAdmin bool,
   776  ) hlc.Timestamp {
   777  	threshold := *r.mu.state.GCThreshold
   778  
   779  	// The GC threshold is the oldest value we can return here.
   780  	if isAdmin || !StrictGCEnforcement.Get(&r.store.ClusterSettings().SV) ||
   781  		r.isSystemRangeRLocked() {
   782  		return threshold
   783  	}
   784  
   785  	// In order to make this check inexpensive, we keep a copy of the reading of
   786  	// protected timestamp state in the replica. This state may be stale, may not
   787  	// exist, or may be unusable given the current lease status. In those cases we
   788  	// must return the GC threshold. On the one hand this seems like a big deal,
    789  	// must return the GC threshold. On the one hand this seems like a big deal:
    790  	// after a lease transfer, users will, for minutes, be able to read data that
    791  	// has technically expired. Fortunately this strict enforcement is merely a
   792  	// as they are after the GC threshold.
   793  	c := r.mu.cachedProtectedTS
   794  	if st.State != kvserverpb.LeaseState_VALID || c.readAt.Less(st.Lease.Start) {
   795  		return threshold
   796  	}
   797  
   798  	impliedThreshold := gc.CalculateThreshold(st.Timestamp, *r.mu.zone.GC)
   799  	threshold.Forward(impliedThreshold)
   800  
   801  	// If we have a protected timestamp record which precedes the implied
   802  	// threshold, use the threshold it implies instead.
   803  	if c.earliestRecord != nil && c.earliestRecord.Timestamp.Less(threshold) {
   804  		threshold = c.earliestRecord.Timestamp.Prev()
   805  	}
   806  	return threshold
   807  }
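
// Editor's note: a hedged numeric sketch, not part of the original file, of
// the computation above. Assume strict enforcement is on, the zone's GC TTL is
// 25h, the persisted GC threshold is hour 100, the lease is valid, and the
// lease status timestamp is hour 200:
//
//	threshold        = 100h              // *r.mu.state.GCThreshold
//	impliedThreshold = 200h - 25h = 175h // gc.CalculateThreshold(st.Timestamp, *r.mu.zone.GC)
//	threshold.Forward(impliedThreshold)  // threshold is now 175h
//	// If the cached protected timestamp state holds a record at hour 150
//	// (earlier than 175h), the returned threshold becomes 150h.Prev() so that
//	// reads at the protected timestamp itself remain servable.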
   808  
   809  // isSystemRange returns true if r's key range precedes keys.UserTableDataMin.
   810  func (r *Replica) isSystemRange() bool {
   811  	r.mu.RLock()
   812  	defer r.mu.RUnlock()
   813  	return r.isSystemRangeRLocked()
   814  }
   815  
   816  func (r *Replica) isSystemRangeRLocked() bool {
   817  	return r.mu.state.Desc.StartKey.Less(roachpb.RKey(keys.UserTableDataMin))
   818  }
   819  
   820  // maxReplicaIDOfAny returns the maximum ReplicaID of any replica, including
   821  // voters and learners.
   822  func maxReplicaIDOfAny(desc *roachpb.RangeDescriptor) roachpb.ReplicaID {
   823  	if desc == nil || !desc.IsInitialized() {
   824  		return 0
   825  	}
   826  	var maxID roachpb.ReplicaID
   827  	for _, repl := range desc.Replicas().All() {
   828  		if repl.ReplicaID > maxID {
   829  			maxID = repl.ReplicaID
   830  		}
   831  	}
   832  	return maxID
   833  }
   834  
   835  // LastReplicaAdded returns the ID of the most recently added replica and the
   836  // time at which it was added.
   837  func (r *Replica) LastReplicaAdded() (roachpb.ReplicaID, time.Time) {
   838  	r.mu.RLock()
   839  	defer r.mu.RUnlock()
   840  	return r.mu.lastReplicaAdded, r.mu.lastReplicaAddedTime
   841  }
   842  
   843  // GetReplicaDescriptor returns the replica for this range from the range
   844  // descriptor. Returns a *RangeNotFoundError if the replica is not found.
   845  // No other errors are returned.
   846  func (r *Replica) GetReplicaDescriptor() (roachpb.ReplicaDescriptor, error) {
   847  	r.mu.RLock()
   848  	defer r.mu.RUnlock()
   849  	return r.getReplicaDescriptorRLocked()
   850  }
   851  
    852  // getReplicaDescriptorRLocked is like GetReplicaDescriptor, but assumes that
   853  // r.mu is held for either reading or writing.
   854  func (r *Replica) getReplicaDescriptorRLocked() (roachpb.ReplicaDescriptor, error) {
   855  	repDesc, ok := r.mu.state.Desc.GetReplicaDescriptor(r.store.StoreID())
   856  	if ok {
   857  		return repDesc, nil
   858  	}
   859  	return roachpb.ReplicaDescriptor{}, roachpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID())
   860  }
   861  
   862  func (r *Replica) getMergeCompleteCh() chan struct{} {
   863  	r.mu.RLock()
   864  	defer r.mu.RUnlock()
   865  	return r.getMergeCompleteChRLocked()
   866  }
   867  
   868  func (r *Replica) getMergeCompleteChRLocked() chan struct{} {
   869  	return r.mu.mergeComplete
   870  }
   871  
    872  // setLastReplicaDescriptors sets the most recently seen replica
   873  // descriptors to those contained in the *RaftMessageRequest, acquiring r.mu
   874  // to do so.
   875  func (r *Replica) setLastReplicaDescriptors(req *RaftMessageRequest) {
   876  	r.mu.Lock()
   877  	r.mu.lastFromReplica = req.FromReplica
   878  	r.mu.lastToReplica = req.ToReplica
   879  	r.mu.Unlock()
   880  }
   881  
   882  // GetMVCCStats returns a copy of the MVCC stats object for this range.
   883  // This accessor is thread-safe, but provides no guarantees about its
   884  // synchronization with any concurrent writes.
   885  func (r *Replica) GetMVCCStats() enginepb.MVCCStats {
   886  	r.mu.RLock()
   887  	defer r.mu.RUnlock()
   888  	return *r.mu.state.Stats
   889  }
   890  
   891  // GetSplitQPS returns the Replica's queries/s request rate.
   892  //
    893  // NOTE: This should only be used for load-based splitting, and it only
    894  // works when the load-based splitting cluster setting is enabled.
   895  //
   896  // Use QueriesPerSecond() for current QPS stats for all other purposes.
   897  func (r *Replica) GetSplitQPS() float64 {
   898  	return r.loadBasedSplitter.LastQPS(timeutil.Now())
   899  }
   900  
   901  // ContainsKey returns whether this range contains the specified key.
   902  //
   903  // TODO(bdarnell): This is not the same as RangeDescriptor.ContainsKey.
   904  func (r *Replica) ContainsKey(key roachpb.Key) bool {
   905  	return kvserverbase.ContainsKey(r.Desc(), key)
   906  }
   907  
   908  // ContainsKeyRange returns whether this range contains the specified
   909  // key range from start to end.
   910  func (r *Replica) ContainsKeyRange(start, end roachpb.Key) bool {
   911  	return kvserverbase.ContainsKeyRange(r.Desc(), start, end)
   912  }
   913  
   914  // GetLastReplicaGCTimestamp reads the timestamp at which the replica was
   915  // last checked for removal by the replica gc queue.
   916  func (r *Replica) GetLastReplicaGCTimestamp(ctx context.Context) (hlc.Timestamp, error) {
   917  	key := keys.RangeLastReplicaGCTimestampKey(r.RangeID)
   918  	var timestamp hlc.Timestamp
   919  	_, err := storage.MVCCGetProto(ctx, r.store.Engine(), key, hlc.Timestamp{}, &timestamp,
   920  		storage.MVCCGetOptions{})
   921  	if err != nil {
   922  		return hlc.Timestamp{}, err
   923  	}
   924  	return timestamp, nil
   925  }
   926  
   927  func (r *Replica) setLastReplicaGCTimestamp(ctx context.Context, timestamp hlc.Timestamp) error {
   928  	key := keys.RangeLastReplicaGCTimestampKey(r.RangeID)
   929  	return storage.MVCCPutProto(ctx, r.store.Engine(), nil, key, hlc.Timestamp{}, nil, &timestamp)
   930  }
   931  
   932  // getQueueLastProcessed returns the last processed timestamp for the
   933  // specified queue, or the zero timestamp if not available.
   934  func (r *Replica) getQueueLastProcessed(ctx context.Context, queue string) (hlc.Timestamp, error) {
   935  	key := keys.QueueLastProcessedKey(r.Desc().StartKey, queue)
   936  	var timestamp hlc.Timestamp
   937  	if r.store != nil {
   938  		_, err := storage.MVCCGetProto(ctx, r.store.Engine(), key, hlc.Timestamp{}, &timestamp,
   939  			storage.MVCCGetOptions{})
   940  		if err != nil {
   941  			log.VErrEventf(ctx, 2, "last processed timestamp unavailable: %s", err)
   942  			return hlc.Timestamp{}, err
   943  		}
   944  	}
   945  	log.VEventf(ctx, 2, "last processed timestamp: %s", timestamp)
   946  	return timestamp, nil
   947  }
   948  
   949  // setQueueLastProcessed writes the last processed timestamp for the
   950  // specified queue.
   951  func (r *Replica) setQueueLastProcessed(
   952  	ctx context.Context, queue string, timestamp hlc.Timestamp,
   953  ) error {
   954  	key := keys.QueueLastProcessedKey(r.Desc().StartKey, queue)
   955  	return r.store.DB().PutInline(ctx, key, &timestamp)
   956  }
   957  
   958  // RaftStatus returns the current raft status of the replica. It returns nil
   959  // if the Raft group has not been initialized yet.
   960  func (r *Replica) RaftStatus() *raft.Status {
   961  	r.mu.RLock()
   962  	defer r.mu.RUnlock()
   963  	return r.raftStatusRLocked()
   964  }
   965  
   966  func (r *Replica) raftStatusRLocked() *raft.Status {
   967  	if rg := r.mu.internalRaftGroup; rg != nil {
   968  		s := rg.Status()
   969  		return &s
   970  	}
   971  	return nil
   972  }
   973  
   974  func (r *Replica) raftBasicStatusRLocked() raft.BasicStatus {
   975  	if rg := r.mu.internalRaftGroup; rg != nil {
   976  		return rg.BasicStatus()
   977  	}
   978  	return raft.BasicStatus{}
   979  }
   980  
   981  // State returns a copy of the internal state of the Replica, along with some
   982  // auxiliary information.
   983  func (r *Replica) State() kvserverpb.RangeInfo {
   984  	var ri kvserverpb.RangeInfo
   985  
   986  	// NB: this acquires an RLock(). Reentrant RLocks are deadlock prone, so do
   987  	// this first before RLocking below. Performance of this extra lock
   988  	// acquisition is not a concern.
   989  	ri.ActiveClosedTimestamp, _ = r.maxClosed(context.Background())
   990  
   991  	// NB: numRangefeedRegistrations doesn't require Replica.mu to be locked.
   992  	// However, it does require coordination between multiple goroutines, so
   993  	// it's best to keep it out of the Replica.mu critical section.
   994  	ri.RangefeedRegistrations = int64(r.numRangefeedRegistrations())
   995  
   996  	r.mu.RLock()
   997  	defer r.mu.RUnlock()
   998  	ri.ReplicaState = *(protoutil.Clone(&r.mu.state)).(*kvserverpb.ReplicaState)
   999  	ri.LastIndex = r.mu.lastIndex
  1000  	ri.NumPending = uint64(r.numPendingProposalsRLocked())
  1001  	ri.RaftLogSize = r.mu.raftLogSize
  1002  	ri.RaftLogSizeTrusted = r.mu.raftLogSizeTrusted
  1003  	ri.NumDropped = uint64(r.mu.droppedMessages)
  1004  	if r.mu.proposalQuota != nil {
  1005  		ri.ApproximateProposalQuota = int64(r.mu.proposalQuota.ApproximateQuota())
  1006  		ri.ProposalQuotaBaseIndex = int64(r.mu.proposalQuotaBaseIndex)
  1007  		ri.ProposalQuotaReleaseQueue = make([]int64, len(r.mu.quotaReleaseQueue))
  1008  		for i, a := range r.mu.quotaReleaseQueue {
  1009  			if a != nil {
  1010  				ri.ProposalQuotaReleaseQueue[i] = int64(a.Acquired())
  1011  			}
  1012  		}
  1013  	}
  1014  	ri.RangeMaxBytes = *r.mu.zone.RangeMaxBytes
  1015  	if desc := ri.ReplicaState.Desc; desc != nil {
  1016  		// Learner replicas don't serve follower reads, but they still receive
  1017  		// closed timestamp updates, so include them here.
  1018  		allReplicas := desc.Replicas().All()
  1019  		for i := range allReplicas {
  1020  			replDesc := &allReplicas[i]
  1021  			r.store.cfg.ClosedTimestamp.Storage.VisitDescending(replDesc.NodeID, func(e ctpb.Entry) (done bool) {
  1022  				mlai, found := e.MLAI[r.RangeID]
  1023  				if !found {
  1024  					return false // not done
  1025  				}
  1026  				if ri.NewestClosedTimestamp.ClosedTimestamp.Less(e.ClosedTimestamp) {
  1027  					ri.NewestClosedTimestamp.NodeID = replDesc.NodeID
  1028  					ri.NewestClosedTimestamp.ClosedTimestamp = e.ClosedTimestamp
  1029  					ri.NewestClosedTimestamp.MLAI = int64(mlai)
  1030  					ri.NewestClosedTimestamp.Epoch = int64(e.Epoch)
  1031  				}
  1032  				return true // done
  1033  			})
  1034  		}
  1035  	}
  1036  	return ri
  1037  }
  1038  
  1039  // assertStateLocked can be called from the Raft goroutine to check that the
  1040  // in-memory and on-disk states of the Replica are congruent.
  1041  // Requires that both r.raftMu and r.mu are held.
  1042  //
  1043  // TODO(tschottdorf): Consider future removal (for example, when #7224 is resolved).
  1044  func (r *Replica) assertStateLocked(ctx context.Context, reader storage.Reader) {
  1045  	diskState, err := r.mu.stateLoader.Load(ctx, reader, r.mu.state.Desc)
  1046  	if err != nil {
  1047  		log.Fatalf(ctx, "%v", err)
  1048  	}
  1049  	if !diskState.Equal(r.mu.state) {
  1050  		// The roundabout way of printing here is to expose this information in sentry.io.
  1051  		//
  1052  		// TODO(dt): expose properly once #15892 is addressed.
  1053  		log.Errorf(ctx, "on-disk and in-memory state diverged:\n%s",
  1054  			pretty.Diff(diskState, r.mu.state))
  1055  		r.mu.state.Desc, diskState.Desc = nil, nil
  1056  		log.Fatalf(ctx, "on-disk and in-memory state diverged: %s",
  1057  			log.Safe(pretty.Diff(diskState, r.mu.state)))
  1058  	}
  1059  }
  1060  
  1061  // checkExecutionCanProceed returns an error if a batch request cannot be
  1062  // executed by the Replica. An error indicates that the Replica is not live and
  1063  // able to serve traffic or that the request is not compatible with the state of
  1064  // the Range.
  1065  //
  1066  // The method accepts a concurrency Guard and a LeaseStatus parameter. These are
  1067  // used to indicate whether the caller has acquired latches and checked the
  1068  // Range lease. The method will only check for a pending merge if both of these
  1069  // conditions are true. If either !g.HoldingLatches() or st == nil then the
  1070  // method will not check for a pending merge. Callers might be ok with this if
  1071  // they know that they will end up checking for a pending merge at some later
  1072  // time.
  1073  func (r *Replica) checkExecutionCanProceed(
  1074  	ba *roachpb.BatchRequest, g *concurrency.Guard, st *kvserverpb.LeaseStatus,
  1075  ) error {
  1076  	rSpan, err := keys.Range(ba.Requests)
  1077  	if err != nil {
  1078  		return err
  1079  	}
  1080  	r.mu.RLock()
  1081  	defer r.mu.RUnlock()
  1082  	if _, err := r.isDestroyedRLocked(); err != nil {
  1083  		return err
  1084  	} else if err := r.checkSpanInRangeRLocked(rSpan); err != nil {
  1085  		return err
  1086  	} else if err := r.checkTSAboveGCThresholdRLocked(ba.Timestamp, st, ba.IsAdmin()); err != nil {
  1087  		return err
  1088  	} else if g.HoldingLatches() && st != nil {
  1089  		// Only check for a pending merge if latches are held and the Range
  1090  		// lease is held by this Replica. Without both of these conditions,
  1091  		// checkForPendingMergeRLocked could return false negatives.
  1092  		return r.checkForPendingMergeRLocked(ba)
  1093  	}
  1094  	return nil
  1095  }
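
// Editor's note: a hedged sketch, not part of the original file, of how a
// caller exercises the guard/lease-status contract of checkExecutionCanProceed
// above; the surrounding variables are hypothetical.
//
//	// Latches held and lease status known: the pending-merge check runs too.
//	if err := r.checkExecutionCanProceed(ba, g, &st); err != nil {
//		return err
//	}
//	// With st == nil (or without latches), the pending-merge check is skipped
//	// and is expected to be performed at some later point by the caller.
//	if err := r.checkExecutionCanProceed(ba, g, nil); err != nil {
//		return err
//	}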
  1096  
  1097  // checkExecutionCanProceedForRangeFeed returns an error if a rangefeed request
  1098  // cannot be executed by the Replica.
  1099  func (r *Replica) checkExecutionCanProceedForRangeFeed(
  1100  	rSpan roachpb.RSpan, ts hlc.Timestamp,
  1101  ) error {
  1102  	now := r.Clock().Now()
  1103  	r.mu.RLock()
  1104  	defer r.mu.RUnlock()
  1105  	status := r.leaseStatus(*r.mu.state.Lease, now, r.mu.minLeaseProposedTS)
  1106  	if _, err := r.isDestroyedRLocked(); err != nil {
  1107  		return err
  1108  	} else if err := r.checkSpanInRangeRLocked(rSpan); err != nil {
  1109  		return err
  1110  	} else if err := r.checkTSAboveGCThresholdRLocked(ts, &status, false /* isAdmin */); err != nil {
  1111  		return err
  1112  	} else if r.requiresExpiringLeaseRLocked() {
  1113  		// Ensure that the range does not require an expiration-based lease. If it
  1114  		// does, it will never get closed timestamp updates and the rangefeed will
  1115  		// never be able to advance its resolved timestamp.
  1116  		return errors.New("expiration-based leases are incompatible with rangefeeds")
  1117  	}
  1118  	return nil
  1119  }
  1120  
  1121  // checkSpanInRangeRLocked returns an error if a request (identified by its
   1122  // key span) cannot be run on the replica.
  1123  func (r *Replica) checkSpanInRangeRLocked(rspan roachpb.RSpan) error {
  1124  	desc := r.mu.state.Desc
  1125  	if desc.ContainsKeyRange(rspan.Key, rspan.EndKey) {
  1126  		return nil
  1127  	}
  1128  	return roachpb.NewRangeKeyMismatchError(
  1129  		rspan.Key.AsRawKey(), rspan.EndKey.AsRawKey(), desc,
  1130  	)
  1131  }
  1132  
  1133  // checkTSAboveGCThresholdRLocked returns an error if a request (identified
   1134  // by its MVCC timestamp) cannot be run on the replica.
  1135  func (r *Replica) checkTSAboveGCThresholdRLocked(
  1136  	ts hlc.Timestamp, st *kvserverpb.LeaseStatus, isAdmin bool,
  1137  ) error {
  1138  	threshold := r.getImpliedGCThresholdRLocked(st, isAdmin)
  1139  	if threshold.Less(ts) {
  1140  		return nil
  1141  	}
  1142  	return &roachpb.BatchTimestampBeforeGCError{
  1143  		Timestamp: ts,
  1144  		Threshold: threshold,
  1145  	}
  1146  }
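
// Editor's note: a small illustration, not part of the original file, of the
// strict comparison above: a request may only proceed at a timestamp strictly
// greater than the implied GC threshold.
//
//	threshold := hlc.Timestamp{WallTime: 100}
//	threshold.Less(hlc.Timestamp{WallTime: 101}) // true  -> request proceeds
//	threshold.Less(hlc.Timestamp{WallTime: 100}) // false -> BatchTimestampBeforeGCError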
  1147  
  1148  // checkForPendingMergeRLocked determines whether the replica is being merged
  1149  // into its left-hand neighbor. If so, an error is returned to prevent the
  1150  // request from proceeding until the merge completes.
  1151  func (r *Replica) checkForPendingMergeRLocked(ba *roachpb.BatchRequest) error {
  1152  	if r.getMergeCompleteChRLocked() == nil {
  1153  		return nil
  1154  	}
  1155  	if ba.IsSingleSubsumeRequest() {
  1156  		return nil
  1157  	}
  1158  	// The replica is being merged into its left-hand neighbor. This request
  1159  	// cannot proceed until the merge completes, signaled by the closing of the
  1160  	// channel.
  1161  	//
  1162  	// It is very important that this check occur after we have acquired latches
  1163  	// from the spanlatch manager. Only after we release these latches are we
  1164  	// guaranteed that we're not racing with a Subsume command. (Subsume
  1165  	// commands declare a conflict with all other commands.) It is also
  1166  	// important that this check occur after we have verified that this replica
  1167  	// is the leaseholder. Only the leaseholder will have its merge complete
  1168  	// channel set.
  1169  	//
  1170  	// Note that Subsume commands are exempt from waiting on the mergeComplete
  1171  	// channel. This is necessary to avoid deadlock. While normally a Subsume
  1172  	// request will trigger the installation of a mergeComplete channel after it
  1173  	// is executed, it may sometimes execute after the mergeComplete channel has
  1174  	// been installed. Consider the case where the RHS replica acquires a new
  1175  	// lease after the merge transaction deletes its local range descriptor but
  1176  	// before the Subsume command is sent. The lease acquisition request will
  1177  	// notice the intent on the local range descriptor and install a
  1178  	// mergeComplete channel. If the forthcoming Subsume blocked on that
  1179  	// channel, the merge transaction would deadlock.
  1180  	//
  1181  	// This exclusion admits a small race condition. If a Subsume request is
  1182  	// sent to the right-hand side of a merge, outside of a merge transaction,
  1183  	// after the merge has committed but before the RHS has noticed that the
  1184  	// merge has committed, the request may return stale data. Since the merge
  1185  	// has committed, the LHS may have processed writes to the keyspace
  1186  	// previously owned by the RHS that the RHS is unaware of. This window
  1187  	// closes quickly, as the RHS will soon notice the merge transaction has
  1188  	// committed and mark itself as destroyed, which prevents it from serving
  1189  	// all traffic, including Subsume requests.
  1190  	//
  1191  	// In our current, careful usage of Subsume, this race condition is
  1192  	// irrelevant. Subsume is only sent from within a merge transaction, and
  1193  	// merge transactions read the RHS descriptor at the beginning of the
  1194  	// transaction to verify that it has not already been merged away.
  1195  	//
  1196  	// We can't wait for the merge to complete here, though. The replica might
  1197  	// need to respond to a Subsume request in order for the merge to complete,
// and blocking here would force that Subsume request to wait on its latches
// forever, deadlocking the merge. Instead, we release the latches
  1200  	// we acquired above and return a MergeInProgressError. The store will catch
  1201  	// that error and resubmit the request after mergeCompleteCh closes. See
  1202  	// #27442 for the full context.
  1203  	return &roachpb.MergeInProgressError{}
  1204  }
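
// A minimal sketch of the caller-side handling described above (illustrative
// only; the real retry logic lives in the Replica/Store send path and is more
// involved):
//
//   _, pErr := repl.Send(ctx, ba)
//   if pErr != nil {
//       if _, ok := pErr.GetDetail().(*roachpb.MergeInProgressError); ok {
//           // Latches have been released; wait for the merge to complete
//           // (mergeCompleteCh closes), then resubmit the batch.
//       }
//   }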
  1205  
// isNewerThanSplit is a helper used in split(Pre|Post)Apply to
// determine whether the Replica on the right hand side of the split must
// have been removed from this store after the split. There is one
// false negative, outlined below, in which false is returned even though the
// hard state may belong to a newer replica; that case should still be safe.
  1211  //
  1212  // TODO(ajwerner): Ideally if this store had ever learned that the replica
  1213  // created by the split were removed it would not forget that fact.
  1214  // There exists one edge case where the store may learn that it should house
  1215  // a replica of the same range with a higher replica ID and then forget.
// If the first raft message this store ever receives for this range
  1217  // contains a replica ID higher than the replica ID in the split trigger
  1218  // then an in-memory replica at that higher replica ID will be created and
  1219  // no tombstone at a lower replica ID will be written. If the server then
  1220  // crashes it will forget that it had ever been the higher replica ID. The
  1221  // server may then proceed to process the split and initialize a replica at
  1222  // the replica ID implied by the split. This is potentially problematic as
  1223  // the replica may have voted as this higher replica ID and when it rediscovers
  1224  // the higher replica ID it will delete all of the state corresponding to the
  1225  // older replica ID including its hard state which may have been synthesized
  1226  // with votes as the newer replica ID. This case tends to be handled safely
  1227  // in practice because the replica should only be receiving messages as the
  1228  // newer replica ID after it has been added to the range. Prior to learner
  1229  // replicas we would only add a store to a range after we've successfully
// applied a preemptive snapshot. If the store were to split between the
// preemptive snapshot and the addition then the addition would fail due to
// the conditional put logic. If the store were to then enable learners, we're
// still okay because we won't promote a learner unless we succeed in
// sending a learner snapshot. If we fail to send the snapshot, the replica
// never becomes a voter, so its votes don't matter and are safe to discard.
  1236  //
// Despite the safety due to the change replicas protocol explained above,
  1238  // it'd be good to know for sure that a replica ID for a range on a store
  1239  // is always monotonically increasing, even across restarts.
  1240  //
  1241  // See TestProcessSplitAfterRightHandSideHasBeenRemoved.
  1242  func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool {
  1243  	r.mu.RLock()
  1244  	defer r.mu.RUnlock()
  1245  	return r.isNewerThanSplitRLocked(split)
  1246  }
  1247  
  1248  func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool {
  1249  	rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID())
  1250  	// If we have written a tombstone for this range then we know that the RHS
  1251  	// must have already been removed at the split replica ID.
  1252  	return r.mu.tombstoneMinReplicaID != 0 ||
  1253  		// If the first raft message we received for the RHS range was for a replica
  1254  		// ID which is above the replica ID of the split then we would not have
  1255  		// written a tombstone but we will have a replica ID that will exceed the
  1256  		// split replica ID.
  1257  		r.mu.replicaID > rightDesc.ReplicaID
  1258  }
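
// Illustrative example (assumed values): suppose split.RightDesc assigns
// replica ID 3 to this store. Then either of the following makes
// isNewerThanSplitRLocked return true:
//
//   r.mu.tombstoneMinReplicaID != 0 // a tombstone shows the RHS was already removed
//   r.mu.replicaID > 3              // e.g. created at ID 5 from a later raft message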
  1259  
  1260  // endCmds holds necessary information to end a batch after Raft
  1261  // command processing.
  1262  type endCmds struct {
  1263  	repl *Replica
  1264  	g    *concurrency.Guard
  1265  }
  1266  
// move moves the endCmds into the return value, clearing the receiver so that
// a subsequent call to done on it becomes a no-op.
  1269  func (ec *endCmds) move() endCmds {
  1270  	res := *ec
  1271  	*ec = endCmds{}
  1272  	return res
  1273  }
  1274  
  1275  // done releases the latches acquired by the command and updates
  1276  // the timestamp cache using the final timestamp of each command.
  1277  //
// No-op if the receiver has been zeroed out by a call to move. Idempotent;
// safe to call more than once.
  1280  func (ec *endCmds) done(
  1281  	ctx context.Context, ba *roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error,
  1282  ) {
  1283  	if ec.repl == nil {
  1284  		// The endCmds were cleared.
  1285  		return
  1286  	}
  1287  	defer ec.move() // clear
  1288  
  1289  	// Update the timestamp cache if the request is not being re-evaluated. Each
  1290  	// request is considered in turn; only those marked as affecting the cache are
  1291  	// processed.
  1292  	ec.repl.updateTimestampCache(ctx, ba, br, pErr)
  1293  
  1294  	// Release the latches acquired by the request and exit lock wait-queues.
  1295  	// Must be done AFTER the timestamp cache is updated. ec.g is only set when
  1296  	// the Raft proposal has assumed responsibility for the request.
  1297  	if ec.g != nil {
  1298  		ec.repl.concMgr.FinishReq(ec.g)
  1299  	}
  1300  }
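
// A minimal usage sketch (hypothetical locals such as g, proposal, and
// proposingToRaft; the real flow lives in the Replica send and proposal
// pipeline):
//
//   ec := endCmds{repl: r, g: g}
//   if proposingToRaft {
//       // Hand responsibility for the latches to the Raft proposal; the
//       // local ec is zeroed and any later done call on it is a no-op.
//       proposal.ec = ec.move()
//   } else {
//       // Evaluation finished locally; update the timestamp cache and
//       // release latches now.
//       ec.done(ctx, ba, br, pErr)
//   }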
  1301  
  1302  // maybeWatchForMerge checks whether a merge of this replica into its left
  1303  // neighbor is in its critical phase and, if so, arranges to block all requests
  1304  // until the merge completes.
  1305  func (r *Replica) maybeWatchForMerge(ctx context.Context) error {
  1306  	desc := r.Desc()
  1307  	descKey := keys.RangeDescriptorKey(desc.StartKey)
  1308  	_, intent, err := storage.MVCCGet(ctx, r.Engine(), descKey, r.Clock().Now(),
  1309  		storage.MVCCGetOptions{Inconsistent: true})
  1310  	if err != nil {
  1311  		return err
  1312  	} else if intent == nil {
  1313  		return nil
  1314  	}
  1315  	val, _, err := storage.MVCCGetAsTxn(
  1316  		ctx, r.Engine(), descKey, intent.Txn.WriteTimestamp, intent.Txn)
  1317  	if err != nil {
  1318  		return err
  1319  	} else if val != nil {
  1320  		return nil
  1321  	}
  1322  
  1323  	// At this point, we know we have a deletion intent on our range descriptor.
  1324  	// That means a merge is in progress. Block all commands until we can
  1325  	// retrieve an updated range descriptor from meta2, which will indicate
  1326  	// whether the merge succeeded or not.
  1327  
  1328  	mergeCompleteCh := make(chan struct{})
  1329  	r.mu.Lock()
  1330  	if r.mu.mergeComplete != nil {
  1331  		// Another request already noticed the merge, installed a mergeComplete
  1332  		// channel, and launched a goroutine to watch for the merge's completion.
  1333  		// Nothing more to do.
  1334  		r.mu.Unlock()
  1335  		return nil
  1336  	}
  1337  	r.mu.mergeComplete = mergeCompleteCh
  1338  	// The RHS of a merge is not permitted to quiesce while a mergeComplete
  1339  	// channel is installed. (If the RHS is quiescent when the merge commits, any
  1340  	// orphaned followers would fail to queue themselves for GC.) Unquiesce the
  1341  	// range in case it managed to quiesce between when the Subsume request
  1342  	// arrived and now, which is rare but entirely legal.
  1343  	r.unquiesceLocked()
  1344  	r.mu.Unlock()
  1345  
  1346  	taskCtx := r.AnnotateCtx(context.Background())
  1347  	err = r.store.stopper.RunAsyncTask(taskCtx, "wait-for-merge", func(ctx context.Context) {
  1348  		var pushTxnRes *roachpb.PushTxnResponse
  1349  		for retry := retry.Start(base.DefaultRetryOptions()); retry.Next(); {
  1350  			// Wait for the merge transaction to complete by attempting to push it. We
  1351  			// don't want to accidentally abort the merge transaction, so we use the
  1352  			// minimum transaction priority. Note that a push type of
  1353  			// roachpb.PUSH_TOUCH, though it might appear more semantically correct,
  1354  			// returns immediately and causes us to spin hot, whereas
  1355  			// roachpb.PUSH_ABORT efficiently blocks until the transaction completes.
  1356  			b := &kv.Batch{}
  1357  			b.Header.Timestamp = r.Clock().Now()
  1358  			b.AddRawRequest(&roachpb.PushTxnRequest{
  1359  				RequestHeader: roachpb.RequestHeader{Key: intent.Txn.Key},
  1360  				PusherTxn: roachpb.Transaction{
  1361  					TxnMeta: enginepb.TxnMeta{Priority: enginepb.MinTxnPriority},
  1362  				},
  1363  				PusheeTxn: intent.Txn,
  1364  				PushType:  roachpb.PUSH_ABORT,
  1365  			})
  1366  			if err := r.DB().Run(ctx, b); err != nil {
  1367  				select {
  1368  				case <-r.store.stopper.ShouldQuiesce():
  1369  					// The server is shutting down. The error while pushing the
  1370  					// transaction was probably caused by the shutdown, so ignore it.
  1371  					return
  1372  				default:
  1373  					log.Warningf(ctx, "error while watching for merge to complete: PushTxn: %+v", err)
  1374  					// We can't safely unblock traffic until we can prove that the merge
  1375  					// transaction is committed or aborted. Nothing to do but try again.
  1376  					continue
  1377  				}
  1378  			}
  1379  			pushTxnRes = b.RawResponse().Responses[0].GetInner().(*roachpb.PushTxnResponse)
  1380  			break
  1381  		}
  1382  
  1383  		var mergeCommitted bool
  1384  		switch pushTxnRes.PusheeTxn.Status {
  1385  		case roachpb.PENDING, roachpb.STAGING:
  1386  			log.Fatalf(ctx, "PushTxn returned while merge transaction %s was still %s",
  1387  				intent.Txn.ID.Short(), pushTxnRes.PusheeTxn.Status)
  1388  		case roachpb.COMMITTED:
  1389  			// If PushTxn claims that the transaction committed, then the transaction
  1390  			// definitely committed.
  1391  			mergeCommitted = true
  1392  		case roachpb.ABORTED:
  1393  			// If PushTxn claims that the transaction aborted, it's not a guarantee
  1394  			// that the transaction actually aborted. It could also mean that the
  1395  			// transaction completed, resolved its intents, and GC'd its transaction
  1396  			// record before our PushTxn arrived. To figure out what happened, we
  1397  			// need to look in meta2.
  1398  			var getRes *roachpb.GetResponse
  1399  			for retry := retry.Start(base.DefaultRetryOptions()); retry.Next(); {
  1400  				metaKey := keys.RangeMetaKey(desc.EndKey)
  1401  				res, pErr := kv.SendWrappedWith(ctx, r.DB().NonTransactionalSender(), roachpb.Header{
  1402  					// Use READ_UNCOMMITTED to avoid trying to resolve intents, since
  1403  					// resolving those intents might involve sending requests to this
  1404  					// range, and that could deadlock. See the comment on
  1405  					// TestStoreRangeMergeConcurrentSplit for details.
  1406  					ReadConsistency: roachpb.READ_UNCOMMITTED,
  1407  				}, &roachpb.GetRequest{
  1408  					RequestHeader: roachpb.RequestHeader{Key: metaKey.AsRawKey()},
  1409  				})
  1410  				if pErr != nil {
  1411  					select {
  1412  					case <-r.store.stopper.ShouldQuiesce():
  1413  						// The server is shutting down. The error while fetching the range
  1414  						// descriptor was probably caused by the shutdown, so ignore it.
  1415  						return
  1416  					default:
  1417  						log.Warningf(ctx, "error while watching for merge to complete: Get %s: %s", metaKey, pErr)
  1418  						// We can't safely unblock traffic until we can prove that the merge
  1419  						// transaction is committed or aborted. Nothing to do but try again.
  1420  						continue
  1421  					}
  1422  				}
  1423  				getRes = res.(*roachpb.GetResponse)
  1424  				break
  1425  			}
  1426  			if getRes.Value == nil {
  1427  				// A range descriptor with our end key is no longer present in meta2, so
  1428  				// the merge must have committed.
  1429  				mergeCommitted = true
  1430  			} else {
  1431  				// A range descriptor with our end key is still present in meta2. The
  1432  				// merge committed iff that range descriptor has a different range ID.
  1433  				var meta2Desc roachpb.RangeDescriptor
  1434  				if err := getRes.Value.GetProto(&meta2Desc); err != nil {
  1435  					log.Fatalf(ctx, "error while watching for merge to complete: "+
  1436  						"unmarshaling meta2 range descriptor: %s", err)
  1437  				}
  1438  				if meta2Desc.RangeID != r.RangeID {
  1439  					mergeCommitted = true
  1440  				}
  1441  			}
  1442  		}
  1443  		r.raftMu.Lock()
  1444  		r.mu.Lock()
  1445  		if mergeCommitted && r.mu.destroyStatus.IsAlive() {
  1446  			// The merge committed but the left-hand replica on this store hasn't
  1447  			// subsumed this replica yet. Mark this replica as destroyed so it
  1448  			// doesn't serve requests when we close the mergeCompleteCh below.
  1449  			r.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID()), destroyReasonMergePending)
  1450  		}
  1451  		// Unblock pending requests. If the merge committed, the requests will
  1452  		// notice that the replica has been destroyed and return an appropriate
  1453  		// error. If the merge aborted, the requests will be handled normally.
  1454  		r.mu.mergeComplete = nil
  1455  		close(mergeCompleteCh)
  1456  		r.mu.Unlock()
  1457  		r.raftMu.Unlock()
  1458  	})
  1459  	if errors.Is(err, stop.ErrUnavailable) {
  1460  		// We weren't able to launch a goroutine to watch for the merge's completion
  1461  		// because the server is shutting down. Normally failing to launch the
  1462  		// watcher goroutine would wedge pending requests on the replica's
  1463  		// mergeComplete channel forever, but since we're shutting down those
  1464  		// requests will get dropped and retried on another node. Suppress the error.
  1465  		err = nil
  1466  	}
  1467  	return err
  1468  }
  1469  
  1470  func (r *Replica) maybeTransferRaftLeadership(ctx context.Context) {
  1471  	r.mu.Lock()
  1472  	r.maybeTransferRaftLeadershipLocked(ctx)
  1473  	r.mu.Unlock()
  1474  }
  1475  
  1476  // maybeTransferRaftLeadershipLocked attempts to transfer the leadership away
  1477  // from this node to the leaseholder, if this node is the current raft leader
  1478  // but not the leaseholder. We don't attempt to transfer leadership if the
  1479  // leaseholder is behind on applying the log.
  1480  //
  1481  // We like it when leases and raft leadership are collocated because that
  1482  // facilitates quick command application (requests generally need to make it to
  1483  // both the lease holder and the raft leader before being applied by other
  1484  // replicas).
  1485  func (r *Replica) maybeTransferRaftLeadershipLocked(ctx context.Context) {
  1486  	if r.store.TestingKnobs().DisableLeaderFollowsLeaseholder {
  1487  		return
  1488  	}
  1489  	lease := *r.mu.state.Lease
  1490  	if lease.OwnedBy(r.StoreID()) || !r.isLeaseValidRLocked(lease, r.Clock().Now()) {
  1491  		return
  1492  	}
  1493  	raftStatus := r.raftStatusRLocked()
  1494  	if raftStatus == nil || raftStatus.RaftState != raft.StateLeader {
  1495  		return
  1496  	}
  1497  	lhReplicaID := uint64(lease.Replica.ReplicaID)
  1498  	lhProgress, ok := raftStatus.Progress[lhReplicaID]
  1499  	if (ok && lhProgress.Match >= raftStatus.Commit) || r.mu.draining {
  1500  		log.VEventf(ctx, 1, "transferring raft leadership to replica ID %v", lhReplicaID)
  1501  		r.store.metrics.RangeRaftLeaderTransfers.Inc(1)
  1502  		r.mu.internalRaftGroup.TransferLeader(lhReplicaID)
  1503  	}
  1504  }
  1505  
  1506  func (r *Replica) mergeInProgressRLocked() bool {
  1507  	return r.mu.mergeComplete != nil
  1508  }
  1509  
  1510  func (r *Replica) getReplicaDescriptorByIDRLocked(
  1511  	replicaID roachpb.ReplicaID, fallback roachpb.ReplicaDescriptor,
  1512  ) (roachpb.ReplicaDescriptor, error) {
  1513  	if repDesc, ok := r.mu.state.Desc.GetReplicaDescriptorByID(replicaID); ok {
  1514  		return repDesc, nil
  1515  	}
  1516  	if fallback.ReplicaID == replicaID {
  1517  		return fallback, nil
  1518  	}
  1519  	return roachpb.ReplicaDescriptor{},
  1520  		errors.Errorf("replica %d not present in %v, %v",
  1521  			replicaID, fallback, r.mu.state.Desc.Replicas())
  1522  }
  1523  
  1524  // checkIfTxnAborted checks the txn AbortSpan for the given
  1525  // transaction. In case the transaction has been aborted, return a
  1526  // transaction abort error.
  1527  func checkIfTxnAborted(
  1528  	ctx context.Context, rec batcheval.EvalContext, reader storage.Reader, txn roachpb.Transaction,
  1529  ) *roachpb.Error {
  1530  	var entry roachpb.AbortSpanEntry
  1531  	aborted, err := rec.AbortSpan().Get(ctx, reader, txn.ID, &entry)
  1532  	if err != nil {
  1533  		return roachpb.NewError(roachpb.NewReplicaCorruptionError(
  1534  			errors.Wrap(err, "could not read from AbortSpan")))
  1535  	}
  1536  	if aborted {
  1537  		// We hit the cache, so let the transaction restart.
  1538  		log.VEventf(ctx, 1, "found AbortSpan entry for %s with priority %d",
  1539  			txn.ID.Short(), entry.Priority)
  1540  		newTxn := txn.Clone()
  1541  		if entry.Priority > newTxn.Priority {
  1542  			newTxn.Priority = entry.Priority
  1543  		}
  1544  		newTxn.Status = roachpb.ABORTED
  1545  		return roachpb.NewErrorWithTxn(
  1546  			roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORT_SPAN), newTxn)
  1547  	}
  1548  	return nil
  1549  }
  1550  
  1551  func (r *Replica) startKey() roachpb.RKey {
  1552  	return r.Desc().StartKey
  1553  }
  1554  
  1555  // Less implements the btree.Item interface.
  1556  func (r *Replica) Less(i btree.Item) bool {
  1557  	return r.startKey().Less(i.(rangeKeyItem).startKey())
  1558  }
  1559  
  1560  // GetLeaseHistory returns the lease history stored on this replica.
  1561  func (r *Replica) GetLeaseHistory() []roachpb.Lease {
  1562  	if r.leaseHistory == nil {
  1563  		return nil
  1564  	}
  1565  
  1566  	return r.leaseHistory.get()
  1567  }
  1568  
// EnableLeaseHistory turns on the lease history for testing purposes. Returns
// a function, suitable for deferring, that restores the original setting.
  1571  func EnableLeaseHistory(maxEntries int) func() {
  1572  	originalValue := leaseHistoryMaxEntries
  1573  	leaseHistoryMaxEntries = maxEntries
  1574  	return func() {
  1575  		leaseHistoryMaxEntries = originalValue
  1576  	}
  1577  }
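
// Typical test usage (illustrative):
//
//   defer EnableLeaseHistory(100)() // record up to 100 leases, restore on exit
//   ...
//   history := repl.GetLeaseHistory()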
  1578  
// GetExternalStorage returns an ExternalStorage object based on the
// information in `dest`, which was parsed from a URI.
  1581  func (r *Replica) GetExternalStorage(
  1582  	ctx context.Context, dest roachpb.ExternalStorage,
  1583  ) (cloud.ExternalStorage, error) {
  1584  	return r.store.cfg.ExternalStorage(ctx, dest)
  1585  }
  1586  
  1587  // GetExternalStorageFromURI returns an ExternalStorage object, based on the given URI.
  1588  func (r *Replica) GetExternalStorageFromURI(
  1589  	ctx context.Context, uri string,
  1590  ) (cloud.ExternalStorage, error) {
  1591  	return r.store.cfg.ExternalStorageFromURI(ctx, uri)
  1592  }
  1593  
  1594  func (r *Replica) markSystemConfigGossipSuccess() {
  1595  	r.mu.Lock()
  1596  	defer r.mu.Unlock()
  1597  	r.mu.failureToGossipSystemConfig = false
  1598  }
  1599  
  1600  func (r *Replica) markSystemConfigGossipFailed() {
  1601  	r.mu.Lock()
  1602  	defer r.mu.Unlock()
  1603  	r.mu.failureToGossipSystemConfig = true
  1604  }
  1605  
  1606  func init() {
  1607  	tracing.RegisterTagRemapping("r", "range")
  1608  }