github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "strings" 17 "sync/atomic" 18 "time" 19 "unsafe" 20 21 "github.com/cockroachdb/cockroach/pkg/base" 22 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 23 "github.com/cockroachdb/cockroach/pkg/keys" 24 "github.com/cockroachdb/cockroach/pkg/kv" 25 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan" 26 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval" 27 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb" 28 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency" 29 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc" 30 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 31 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 32 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed" 33 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/split" 34 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 35 "github.com/cockroachdb/cockroach/pkg/roachpb" 36 "github.com/cockroachdb/cockroach/pkg/rpc" 37 "github.com/cockroachdb/cockroach/pkg/settings" 38 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 39 "github.com/cockroachdb/cockroach/pkg/storage" 40 "github.com/cockroachdb/cockroach/pkg/storage/cloud" 41 enginepb "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 42 "github.com/cockroachdb/cockroach/pkg/util" 43 "github.com/cockroachdb/cockroach/pkg/util/envutil" 44 "github.com/cockroachdb/cockroach/pkg/util/hlc" 45 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 46 "github.com/cockroachdb/cockroach/pkg/util/log" 47 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 48 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 49 "github.com/cockroachdb/cockroach/pkg/util/retry" 50 "github.com/cockroachdb/cockroach/pkg/util/stop" 51 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 52 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 53 "github.com/cockroachdb/cockroach/pkg/util/tracing" 54 "github.com/cockroachdb/cockroach/pkg/util/uuid" 55 "github.com/cockroachdb/errors" 56 "github.com/google/btree" 57 "github.com/kr/pretty" 58 "go.etcd.io/etcd/raft" 59 ) 60 61 const ( 62 // configGossipTTL is the time-to-live for configuration maps. 63 64 // optimizePutThreshold is the minimum length of a contiguous run 65 // of batched puts or conditional puts, after which the constituent 66 // put operations will possibly be optimized by determining whether 67 // the key space being written is starting out empty. 68 optimizePutThreshold = 10 69 70 replicaChangeTxnName = "change-replica" 71 splitTxnName = "split" 72 mergeTxnName = "merge" 73 74 defaultReplicaRaftMuWarnThreshold = 500 * time.Millisecond 75 ) 76 77 var testingDisableQuiescence = envutil.EnvOrDefaultBool("COCKROACH_DISABLE_QUIESCENCE", false) 78 79 var disableSyncRaftLog = settings.RegisterBoolSetting( 80 "kv.raft_log.disable_synchronization_unsafe", 81 "set to true to disable synchronization on Raft log writes to persistent storage. 
"+ 82 "Setting to true risks data loss or data corruption on server crashes. "+ 83 "The setting is meant for internal testing only and SHOULD NOT be used in production.", 84 false, 85 ) 86 87 // UseAtomicReplicationChanges determines whether to issue atomic replication changes. 88 // This has no effect until the cluster version is 19.2 or higher. 89 var UseAtomicReplicationChanges = settings.RegisterBoolSetting( 90 "kv.atomic_replication_changes.enabled", 91 "use atomic replication changes", 92 true, 93 ) 94 95 // MaxCommandSizeFloor is the minimum allowed value for the MaxCommandSize 96 // cluster setting. 97 const MaxCommandSizeFloor = 4 << 20 // 4MB 98 99 // MaxCommandSize wraps "kv.raft.command.max_size". 100 var MaxCommandSize = settings.RegisterValidatedByteSizeSetting( 101 "kv.raft.command.max_size", 102 "maximum size of a raft command", 103 64<<20, 104 func(size int64) error { 105 if size < MaxCommandSizeFloor { 106 return fmt.Errorf("max_size must be greater than %s", humanizeutil.IBytes(MaxCommandSizeFloor)) 107 } 108 return nil 109 }, 110 ) 111 112 // StrictGCEnforcement controls whether requests are rejected based on the GC 113 // threshold and the current GC TTL (true) or just based on the GC threshold 114 // (false). 115 var StrictGCEnforcement = settings.RegisterBoolSetting( 116 "kv.gc_ttl.strict_enforcement.enabled", 117 "if true, fail to serve requests at timestamps below the TTL even if the data still exists", 118 true, 119 ) 120 121 type proposalReevaluationReason int 122 123 const ( 124 proposalNoReevaluation proposalReevaluationReason = iota 125 // proposalIllegalLeaseIndex indicates the proposal failed to apply at 126 // a Lease index it was not legal for. The command should be re-evaluated. 127 proposalIllegalLeaseIndex 128 ) 129 130 type atomicDescString struct { 131 strPtr unsafe.Pointer 132 } 133 134 // store atomically updates d.strPtr with the string representation of desc. 135 func (d *atomicDescString) store(replicaID roachpb.ReplicaID, desc *roachpb.RangeDescriptor) { 136 var buf strings.Builder 137 fmt.Fprintf(&buf, "%d/", desc.RangeID) 138 if replicaID == 0 { 139 fmt.Fprintf(&buf, "?:") 140 } else { 141 fmt.Fprintf(&buf, "%d:", replicaID) 142 } 143 144 if !desc.IsInitialized() { 145 buf.WriteString("{-}") 146 } else { 147 const maxRangeChars = 30 148 rngStr := keys.PrettyPrintRange(roachpb.Key(desc.StartKey), roachpb.Key(desc.EndKey), maxRangeChars) 149 buf.WriteString(rngStr) 150 } 151 152 str := buf.String() 153 atomic.StorePointer(&d.strPtr, unsafe.Pointer(&str)) 154 } 155 156 // String returns the string representation of the range; since we are not 157 // using a lock, the copy might be inconsistent. 158 func (d *atomicDescString) String() string { 159 return *(*string)(atomic.LoadPointer(&d.strPtr)) 160 } 161 162 // atomicConnectionClass stores an rpc.ConnectionClass atomically. 163 type atomicConnectionClass uint32 164 165 // get reads the current value of the ConnectionClass. 166 func (c *atomicConnectionClass) get() rpc.ConnectionClass { 167 return rpc.ConnectionClass(atomic.LoadUint32((*uint32)(c))) 168 } 169 170 // set updates the current value of the ConnectionClass. 171 func (c *atomicConnectionClass) set(cc rpc.ConnectionClass) { 172 atomic.StoreUint32((*uint32)(c), uint32(cc)) 173 } 174 175 // A Replica is a contiguous keyspace with writes managed via an 176 // instance of the Raft consensus algorithm. Many ranges may exist 177 // in a store and they are unlikely to be contiguous. 
Ranges are 178 // independent units and are responsible for maintaining their own 179 // integrity by replacing failed replicas, splitting and merging 180 // as appropriate. 181 type Replica struct { 182 log.AmbientContext 183 184 // TODO(tschottdorf): Duplicates r.mu.state.desc.RangeID; revisit that. 185 RangeID roachpb.RangeID // Only set by the constructor 186 187 store *Store 188 abortSpan *abortspan.AbortSpan // Avoids anomalous reads after abort 189 190 // leaseholderStats tracks all incoming BatchRequests to the replica and which 191 // localities they come from in order to aid in lease rebalancing decisions. 192 leaseholderStats *replicaStats 193 // writeStats tracks the number of keys written by applied raft commands 194 // in order to aid in replica rebalancing decisions. 195 writeStats *replicaStats 196 197 // creatingReplica is set when a replica is created as uninitialized 198 // via a raft message. 199 creatingReplica *roachpb.ReplicaDescriptor 200 201 // Held in read mode during read-only commands. Held in exclusive mode to 202 // prevent read-only commands from executing. Acquired before the embedded 203 // RWMutex. 204 readOnlyCmdMu syncutil.RWMutex 205 206 // rangeStr is a string representation of a RangeDescriptor that can be 207 // atomically read and updated without needing to acquire the replica.mu lock. 208 // All updates to state.Desc should be duplicated here. 209 rangeStr atomicDescString 210 211 // connectionClass controls the ConnectionClass used to send raft messages. 212 connectionClass atomicConnectionClass 213 214 // raftMu protects Raft processing the replica. 215 // 216 // Locking notes: Replica.raftMu < Replica.mu 217 raftMu struct { 218 syncutil.Mutex 219 220 // Note that there are two StateLoaders, in raftMu and mu, 221 // depending on which lock is being held. 222 stateLoader stateloader.StateLoader 223 // on-disk storage for sideloaded SSTables. nil when there's no ReplicaID. 224 sideloaded SideloadStorage 225 // stateMachine is used to apply committed raft entries. 226 stateMachine replicaStateMachine 227 // decoder is used to decode committed raft entries. 228 decoder replicaDecoder 229 } 230 231 // Contains the lease history when enabled. 232 leaseHistory *leaseHistory 233 234 // concMgr sequences incoming requests and provides isolation between 235 // requests that intend to perform conflicting operations. It is the 236 // centerpiece of transaction contention handling. 237 concMgr concurrency.Manager 238 239 mu struct { 240 // Protects all fields in the mu struct. 241 syncutil.RWMutex 242 // The destroyed status of a replica indicating if it's alive, corrupt, 243 // scheduled for destruction or has been GCed. 244 // destroyStatus should only be set while also holding the raftMu. 245 destroyStatus 246 // Is the range quiescent? Quiescent ranges are not Tick()'d and unquiesce 247 // whenever a Raft operation is performed. 248 quiescent bool 249 // mergeComplete is non-nil if a merge is in-progress, in which case any 250 // requests should be held until the completion of the merge is signaled by 251 // the closing of the channel. 252 mergeComplete chan struct{} 253 // The state of the Raft state machine. 254 state kvserverpb.ReplicaState 255 // Last index/term persisted to the raft log (not necessarily 256 // committed). Note that lastTerm may be 0 (and thus invalid) even when 257 // lastIndex is known, in which case the term will have to be retrieved 258 // from the Raft log entry. Use the invalidLastTerm constant for this 259 // case. 
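		// A rough sketch of that fallback (illustrative only; the exported
		// accessor in this file, GetTerm, goes through raftTermRLocked):
		//
		//   term := r.mu.lastTerm
		//   if term == invalidLastTerm {
		//       term, _ = r.raftTermRLocked(r.mu.lastIndex)
		//   }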
260 lastIndex, lastTerm uint64 261 // A map of raft log index of pending snapshots to deadlines. 262 // Used to prohibit raft log truncations that would leave a gap between 263 // the snapshot and the new first index. The map entry has a zero 264 // deadline while the snapshot is being sent and turns nonzero when the 265 // snapshot has completed, preventing truncation for a grace period 266 // (since there is a race between the snapshot completing and its being 267 // reflected in the raft status used to make truncation decisions). 268 // 269 // NB: If we kept only one value, we could end up in situations in which 270 // we're either giving some snapshots no grace period, or keep an 271 // already finished snapshot "pending" for extended periods of time 272 // (preventing log truncation). 273 snapshotLogTruncationConstraints map[uuid.UUID]snapTruncationInfo 274 // raftLogSize is the approximate size in bytes of the persisted raft 275 // log, including sideloaded entries' payloads. The value itself is not 276 // persisted and is computed lazily, paced by the raft log truncation 277 // queue which will recompute the log size when it finds it 278 // uninitialized. This recomputation mechanism isn't relevant for ranges 279 // which see regular write activity (for those the log size will deviate 280 // from zero quickly, and so it won't be recomputed but will undercount 281 // until the first truncation is carried out), but it prevents a large 282 // dormant Raft log from sitting around forever, which has caused problems 283 // in the past. 284 raftLogSize int64 285 // If raftLogSizeTrusted is false, don't trust the above raftLogSize until 286 // it has been recomputed. 287 raftLogSizeTrusted bool 288 // raftLogLastCheckSize is the value of raftLogSize the last time the Raft 289 // log was checked for truncation or at the time of the last Raft log 290 // truncation. 291 raftLogLastCheckSize int64 292 // pendingLeaseRequest is used to coalesce RequestLease requests. 293 pendingLeaseRequest pendingLeaseRequest 294 // minLeaseProposedTS is the minimum acceptable lease.ProposedTS; only 295 // leases proposed after this timestamp can be used for proposing commands. 296 // This is used to protect against several hazards: 297 // - leases held (or even proposed) before a restart cannot be used after a 298 // restart. This is because: 299 // a) the spanlatch manager is wiped during the restart; there might be 300 // writes in flight that do not have the latches they held reflected. So, 301 // we need to synchronize all new reads with those old in-flight writes. 302 // Forcing acquisition of a new lease essentially flushes all the 303 // previous raft commands. 304 // b) a lease transfer might have been in progress at the time of the 305 // restart. Using the existing lease after the restart would break the 306 // transfer proposer's promise to not use the existing lease. 307 // - a lease cannot be used after a transfer is initiated. Moreover, even 308 // lease extension that were in flight at the time of the transfer cannot be 309 // used, if they eventually apply. 310 minLeaseProposedTS hlc.Timestamp 311 // A pointer to the zone config for this replica. 312 zone *zonepb.ZoneConfig 313 // proposalBuf buffers Raft commands as they are passed to the Raft 314 // replication subsystem. The buffer is populated by requests after 315 // evaluation and is consumed by the Raft processing thread. Once 316 // consumed, commands are proposed through Raft and moved to the 317 // proposals map. 
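		//
		// (Schematic summary of the flow described here: evaluate ->
		// proposalBuf -> proposed through Raft and moved to r.mu.proposals ->
		// applied and removed from proposals.)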
318 // 319 // Access to proposalBuf must occur *without* holding the mutex. 320 // Instead, the buffer internally holds a reference to mu and will use 321 // it appropriately. 322 proposalBuf propBuf 323 // proposals stores the Raft in-flight commands which originated at 324 // this Replica, i.e. all commands for which propose has been called, 325 // but which have not yet applied. 326 // 327 // The *ProposalData in the map are "owned" by it. Elements from the 328 // map must only be referenced while the Replica.mu is held, except 329 // if the element is removed from the map first. Modifying the proposal 330 // itself may require holding the raftMu as fields can be accessed 331 // underneath raft. See comments on ProposalData fields for synchronization 332 // requirements. 333 // 334 // Due to Raft reproposals, multiple in-flight Raft entries can have 335 // the same CmdIDKey, all corresponding to the same KV request. However, 336 // not all Raft entries with a given command ID will correspond directly 337 // to the *RaftCommand contained in its associated *ProposalData. This 338 // is because the *RaftCommand can be mutated during reproposals by 339 // Replica.tryReproposeWithNewLeaseIndex. 340 // 341 // TODO(ajwerner): move the proposal map and ProposalData entirely under 342 // the raftMu. 343 proposals map[kvserverbase.CmdIDKey]*ProposalData 344 internalRaftGroup *raft.RawNode 345 // The ID of the replica within the Raft group. This value may never be 0. 346 replicaID roachpb.ReplicaID 347 // The minimum allowed ID for this replica. Initialized from 348 // RangeTombstone.NextReplicaID. 349 tombstoneMinReplicaID roachpb.ReplicaID 350 351 // The ID of the leader replica within the Raft group. Used to determine 352 // when the leadership changes. 353 leaderID roachpb.ReplicaID 354 // The most recently added replica for the range and when it was added. 355 // Used to determine whether a replica is new enough that we shouldn't 356 // penalize it for being slightly behind. These field gets cleared out once 357 // we know that the replica has caught up. 358 lastReplicaAdded roachpb.ReplicaID 359 lastReplicaAddedTime time.Time 360 // initialMaxClosed is the initial maxClosed timestamp for the replica as known 361 // from its left-hand-side upon creation. 362 initialMaxClosed hlc.Timestamp 363 364 // The most recently updated time for each follower of this range. This is updated 365 // every time a Raft message is received from a peer. 366 // Note that superficially it seems that similar information is contained in the 367 // Progress of a RaftStatus, which has a RecentActive field. However, that field 368 // is always true unless CheckQuorum is active, which at the time of writing in 369 // CockroachDB is not the case. 370 // 371 // The lastUpdateTimes map is also updated when a leaseholder steps up 372 // (making the assumption that all followers are live at that point), 373 // and when the range unquiesces (marking all replicating followers as 374 // live). 375 // 376 // TODO(tschottdorf): keeping a map on each replica seems to be 377 // overdoing it. We should map the replicaID to a NodeID and then use 378 // node liveness (or any sensible measure of the peer being around). 379 // The danger in doing so is that a single stuck replica on an otherwise 380 // functioning node could fill up the quota pool. We are already taking 381 // this kind of risk though: a replica that gets stuck on an otherwise 382 // live node will not lose leaseholdership. 
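		//
		// (Context note: these timestamps also feed the proposal quota
		// machinery, which gives up waiting on a follower whose last update
		// is too old; see the quotaReleaseQueue comment below. The helper
		// making that call lives outside this file.)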
383 lastUpdateTimes lastUpdateTimesMap 384 385 // The last seen replica descriptors from incoming Raft messages. These are 386 // stored so that the replica still knows the replica descriptors for itself 387 // and for its message recipients in the circumstances when its RangeDescriptor 388 // is out of date. 389 // 390 // Normally, a replica knows about the other replica descriptors for a 391 // range via the RangeDescriptor stored in Replica.mu.state.Desc. But that 392 // descriptor is only updated during a Split or ChangeReplicas operation. 393 // There are periods during a Replica's lifetime when that information is 394 // out of date: 395 // 396 // 1. When a replica is being newly created as the result of an incoming 397 // Raft message for it. This is the common case for ChangeReplicas and an 398 // uncommon case for Splits. The leader will be sending the replica 399 // messages and the replica needs to be able to respond before it can 400 // receive an updated range descriptor (via a snapshot, 401 // changeReplicasTrigger, or splitTrigger). 402 // 403 // 2. If the node containing a replica is partitioned or down while the 404 // replicas for the range are updated. When the node comes back up, other 405 // replicas may begin communicating with it and it needs to be able to 406 // respond. Unlike 1 where there is no range descriptor, in this situation 407 // the replica has a range descriptor but it is out of date. Note that a 408 // replica being removed from a node and then quickly re-added before the 409 // replica has been GC'd will also use the last seen descriptors. In 410 // effect, this is another path for which the replica's local range 411 // descriptor is out of date. 412 // 413 // The last seen replica descriptors are updated on receipt of every raft 414 // message via Replica.setLastReplicaDescriptors (see 415 // Store.HandleRaftRequest). These last seen descriptors are used when 416 // the replica's RangeDescriptor contains missing or out of date descriptors 417 // for a replica (see Replica.sendRaftMessage). 418 // 419 // Removing a replica from Store.mu.replicas is not a problem because 420 // when a replica is completely removed, it won't be recreated until 421 // there is another event that will repopulate the replicas map in the 422 // range descriptor. When it is temporarily dropped and recreated, the 423 // newly recreated replica will have a complete range descriptor. 424 lastToReplica, lastFromReplica roachpb.ReplicaDescriptor 425 426 // Computed checksum at a snapshot UUID. 427 checksums map[uuid.UUID]ReplicaChecksum 428 429 // proposalQuota is the quota pool maintained by the lease holder where 430 // incoming writes acquire quota from a fixed quota pool before going 431 // through. If there is no quota available, the write is throttled 432 // until quota is made available to the pool. 433 // Acquired quota for a given command is only released when all the 434 // replicas have persisted the corresponding entry into their logs. 435 proposalQuota *quotapool.IntPool 436 437 // The base index is the index up to (including) which quota was already 438 // released. That is, the first element in quotaReleaseQueue below is 439 // released as the base index moves up by one, etc. 440 proposalQuotaBaseIndex uint64 441 442 // Once the leader observes a proposal come 'out of Raft', we add the size 443 // of the associated command to a queue of quotas we have yet to release 444 // back to the quota pool. 
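		// (For illustration: if proposalQuotaBaseIndex is 10 and
		// quotaReleaseQueue currently holds three allocations, those
		// allocations belong to the commands at raft log indexes 11, 12 and
		// 13, and the first of them is handed back to the pool once the base
		// index advances to 11. The enqueueing itself happens when the leader
		// observes the proposal come out of Raft, as described above.)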
At that point ownership of the quota is 445 // transferred from r.mu.proposals to this queue. 446 // We'll release the respective quota once all replicas have persisted the 447 // corresponding entry into their logs (or once we give up waiting on some 448 // replica because it looks like it's dead). 449 quotaReleaseQueue []*quotapool.IntAlloc 450 451 // Counts calls to Replica.tick() 452 ticks int 453 454 // Counts Raft messages refused due to queue congestion. 455 droppedMessages int 456 457 // Note that there are two replicaStateLoaders, in raftMu and mu, 458 // depending on which lock is being held. 459 stateLoader stateloader.StateLoader 460 461 // draining specifies whether this replica is draining. Raft leadership 462 // transfers due to a lease change will be attempted even if the target does 463 // not have all the log entries. 464 draining bool 465 466 // cachedProtectedTS provides the state of the protected timestamp 467 // subsystem as used on the request serving path to determine the effective 468 // gc threshold given the current TTL when using strict GC enforcement. 469 // 470 // It would be too expensive to go read from the protected timestamp cache 471 // for every request. Instead, if clients want to ensure that their request 472 // will see the effect of a protected timestamp record, they need to verify 473 // the request. See the comment on the struct for more details. 474 cachedProtectedTS cachedProtectedTimestampState 475 476 // largestPreviousMaxRangeSizeBytes tracks a previous zone.RangeMaxBytes 477 // which exceeded the current zone.RangeMaxBytes to help defeat the range 478 // backpressure mechanism in cases where a user reduces the configured range 479 // size. It is set when the zone config changes to a smaller value and the 480 // current range size exceeds the new value. It is cleared after the range's 481 // size drops below its current zone.MaxRangeBytes or if the 482 // zone.MaxRangeBytes increases to surpass the current value. 483 largestPreviousMaxRangeSizeBytes int64 484 485 // failureToGossipSystemConfig is set to true when the leaseholder of the 486 // range containing the system config span fails to gossip due to an 487 // outstanding intent (see MaybeGossipSystemConfig). It is reset when the 488 // system config is successfully gossiped or when the Replica loses the 489 // lease. It is read when handling a MaybeGossipSystemConfigIfHaveFailure 490 // local result trigger. That trigger is set when an EndTransaction with an 491 // ABORTED status is evaluated on a range containing the system config span. 492 // 493 // While the gossipping of the system config span is best-effort, the sql 494 // schema leasing mechanism degrades dramatically if changes are not 495 // gossiped. This degradation is due to the fact that schema changes, after 496 // writing intents, often need to ensure that there aren't outstanding 497 // leases on old versions and if there are, roll back and wait until there 498 // are not. The problem is that this waiting may take a long time if the 499 // current leaseholders are not notified. We deal with this by detecting the 500 // abort of a transaction which might have blocked the system config from 501 // being gossiped and attempting to gossip again. 502 failureToGossipSystemConfig bool 503 } 504 505 rangefeedMu struct { 506 syncutil.RWMutex 507 // proc is an instance of a rangefeed Processor that is capable of 508 // routing rangefeed events to a set of subscribers. Will be nil if no 509 // subscribers are registered. 
510 // 511 // Requires Replica.rangefeedMu be held when mutating the pointer. 512 // Requires Replica.raftMu be held when providing logical ops and 513 // informing the processor of closed timestamp updates. This properly 514 // synchronizes updates that are linearized and driven by the Raft log. 515 proc *rangefeed.Processor 516 // opFilter is a best-effort filter that informs the raft processing 517 // goroutine of which logical operations the rangefeed processor is 518 // interested in based on the processor's current registrations. 519 // 520 // The filter is allowed to return false positives, but not false 521 // negatives. False negatives are avoided by updating (expanding) the 522 // filter while holding the Replica.raftMu when adding new registrations 523 // after flushing the rangefeed.Processor event channel. This ensures 524 // that no events that were filtered before the new registration was 525 // added will be observed by the new registration and all events after 526 // the new registration will respect the updated filter. 527 // 528 // Requires Replica.rangefeedMu be held when mutating the pointer. 529 opFilter *rangefeed.Filter 530 } 531 532 // Throttle how often we offer this Replica to the split and merge queues. 533 // We have triggers downstream of Raft that do so based on limited 534 // information and without explicit throttling some replicas will offer once 535 // per applied Raft command, which is silly and also clogs up the queues' 536 // semaphores. 537 splitQueueThrottle, mergeQueueThrottle util.EveryN 538 539 // loadBasedSplitter keeps information about load-based splitting. 540 loadBasedSplitter split.Decider 541 542 unreachablesMu struct { 543 syncutil.Mutex 544 remotes map[roachpb.ReplicaID]struct{} 545 } 546 547 // r.mu < r.protectedTimestampMu 548 protectedTimestampMu struct { 549 syncutil.Mutex 550 551 // minStateReadTimestamp is a lower bound on the timestamp of the cached 552 // protected timestamp state which may be used when updating 553 // pendingGCThreshold. This field acts to eliminate races between 554 // verification of protected timestamp records and the setting of a new 555 // GC threshold 556 minStateReadTimestamp hlc.Timestamp 557 558 // pendingGCThreshold holds a timestamp which is being proposed as a new 559 // GC threshold for the range. 560 pendingGCThreshold hlc.Timestamp 561 } 562 } 563 564 var _ batcheval.EvalContext = &Replica{} 565 566 // KeyRange is an interface type for the replicasByKey BTree, to compare 567 // Replica and ReplicaPlaceholder. 568 type KeyRange interface { 569 Desc() *roachpb.RangeDescriptor 570 rangeKeyItem 571 btree.Item 572 fmt.Stringer 573 } 574 575 var _ KeyRange = &Replica{} 576 577 var _ kv.Sender = &Replica{} 578 579 // String returns the string representation of the replica using an 580 // inconsistent copy of the range descriptor. Therefore, String does not 581 // require a lock and its output may not be atomic with other ongoing work in 582 // the replica. This is done to prevent deadlocks in logging sites. 583 func (r *Replica) String() string { 584 return fmt.Sprintf("[n%d,s%d,r%s]", r.store.Ident.NodeID, r.store.Ident.StoreID, &r.rangeStr) 585 } 586 587 // ReplicaID returns the ID for the Replica. It may be zero if the replica does 588 // not know its ID. Once a Replica has a non-zero ReplicaID it will never change. 
589 func (r *Replica) ReplicaID() roachpb.ReplicaID { 590 r.mu.RLock() 591 defer r.mu.RUnlock() 592 return r.mu.replicaID 593 } 594 595 // cleanupFailedProposal cleans up after a proposal that has failed. It 596 // clears any references to the proposal and releases associated quota. 597 // It requires that both Replica.mu and Replica.raftMu are exclusively held. 598 func (r *Replica) cleanupFailedProposalLocked(p *ProposalData) { 599 r.raftMu.AssertHeld() 600 r.mu.AssertHeld() 601 delete(r.mu.proposals, p.idKey) 602 p.releaseQuota() 603 } 604 605 // GetMinBytes gets the replica's minimum byte threshold. 606 func (r *Replica) GetMinBytes() int64 { 607 r.mu.RLock() 608 defer r.mu.RUnlock() 609 return *r.mu.zone.RangeMinBytes 610 } 611 612 // GetMaxBytes gets the replica's maximum byte threshold. 613 func (r *Replica) GetMaxBytes() int64 { 614 r.mu.RLock() 615 defer r.mu.RUnlock() 616 return *r.mu.zone.RangeMaxBytes 617 } 618 619 // SetZoneConfig sets the replica's zone config. 620 func (r *Replica) SetZoneConfig(zone *zonepb.ZoneConfig) { 621 r.mu.Lock() 622 defer r.mu.Unlock() 623 624 if r.isInitializedRLocked() && 625 r.mu.zone != nil && 626 zone != nil { 627 total := r.mu.state.Stats.Total() 628 629 // Set largestPreviousMaxRangeSizeBytes if the current range size is above 630 // the new limit and we don't already have a larger value. Reset it if 631 // the new limit is larger than the current largest we're aware of. 632 if total > *zone.RangeMaxBytes && 633 *zone.RangeMaxBytes < *r.mu.zone.RangeMaxBytes && 634 r.mu.largestPreviousMaxRangeSizeBytes < *r.mu.zone.RangeMaxBytes && 635 // Check to make sure that we're replacing a real zone config. Otherwise 636 // the default value would prevent backpressure until the range was 637 // larger than the default value. When the store starts up it sets the 638 // zone for the replica to this default value; later on it overwrites it 639 // with a new instance even if the value is the same as the default. 640 r.mu.zone != r.store.cfg.DefaultZoneConfig && 641 r.mu.zone != r.store.cfg.DefaultSystemZoneConfig { 642 643 r.mu.largestPreviousMaxRangeSizeBytes = *r.mu.zone.RangeMaxBytes 644 } else if r.mu.largestPreviousMaxRangeSizeBytes > 0 && 645 r.mu.largestPreviousMaxRangeSizeBytes < *zone.RangeMaxBytes { 646 647 r.mu.largestPreviousMaxRangeSizeBytes = 0 648 } 649 } 650 r.mu.zone = zone 651 } 652 653 // IsFirstRange returns true if this is the first range. 654 func (r *Replica) IsFirstRange() bool { 655 return r.RangeID == 1 656 } 657 658 // IsDestroyed returns a non-nil error if the replica has been destroyed 659 // and the reason if it has. 660 func (r *Replica) IsDestroyed() (DestroyReason, error) { 661 r.mu.RLock() 662 defer r.mu.RUnlock() 663 return r.isDestroyedRLocked() 664 } 665 666 func (r *Replica) isDestroyedRLocked() (DestroyReason, error) { 667 return r.mu.destroyStatus.reason, r.mu.destroyStatus.err 668 } 669 670 // DescAndZone returns the authoritative range descriptor as well 671 // as the zone config for the replica. 672 func (r *Replica) DescAndZone() (*roachpb.RangeDescriptor, *zonepb.ZoneConfig) { 673 r.mu.RLock() 674 defer r.mu.RUnlock() 675 return r.mu.state.Desc, r.mu.zone 676 } 677 678 // Desc returns the authoritative range descriptor, acquiring a replica lock in 679 // the process. 
680 func (r *Replica) Desc() *roachpb.RangeDescriptor { 681 r.mu.RLock() 682 defer r.mu.RUnlock() 683 return r.mu.state.Desc 684 } 685 686 func (r *Replica) descRLocked() *roachpb.RangeDescriptor { 687 r.mu.AssertRHeld() 688 return r.mu.state.Desc 689 } 690 691 // NodeID returns the ID of the node this replica belongs to. 692 func (r *Replica) NodeID() roachpb.NodeID { 693 return r.store.nodeDesc.NodeID 694 } 695 696 // GetNodeLocality returns the locality of the node this replica belongs to. 697 func (r *Replica) GetNodeLocality() roachpb.Locality { 698 return r.store.nodeDesc.Locality 699 } 700 701 // ClusterSettings returns the node's ClusterSettings. 702 func (r *Replica) ClusterSettings() *cluster.Settings { 703 return r.store.cfg.Settings 704 } 705 706 // StoreID returns the Replica's StoreID. 707 func (r *Replica) StoreID() roachpb.StoreID { 708 return r.store.StoreID() 709 } 710 711 // EvalKnobs returns the EvalContext's Knobs. 712 func (r *Replica) EvalKnobs() kvserverbase.BatchEvalTestingKnobs { 713 return r.store.cfg.TestingKnobs.EvalKnobs 714 } 715 716 // Clock returns the hlc clock shared by this replica. 717 func (r *Replica) Clock() *hlc.Clock { 718 return r.store.Clock() 719 } 720 721 // DB returns the Replica's client DB. 722 func (r *Replica) DB() *kv.DB { 723 return r.store.DB() 724 } 725 726 // Engine returns the Replica's underlying Engine. In most cases the 727 // evaluation Batch should be used instead. 728 func (r *Replica) Engine() storage.Engine { 729 return r.store.Engine() 730 } 731 732 // AbortSpan returns the Replica's AbortSpan. 733 func (r *Replica) AbortSpan() *abortspan.AbortSpan { 734 // Despite its name, the AbortSpan doesn't hold on-disk data in 735 // memory. It just provides methods that take a Batch, so SpanSet 736 // declarations are enforced there. 737 return r.abortSpan 738 } 739 740 // GetLimiters returns the Replica's limiters. 741 func (r *Replica) GetLimiters() *batcheval.Limiters { 742 return &r.store.limiters 743 } 744 745 // GetConcurrencyManager returns the Replica's concurrency.Manager. 746 func (r *Replica) GetConcurrencyManager() concurrency.Manager { 747 return r.concMgr 748 } 749 750 // GetTerm returns the term of the given index in the raft log. 751 func (r *Replica) GetTerm(i uint64) (uint64, error) { 752 r.mu.RLock() 753 defer r.mu.RUnlock() 754 return r.raftTermRLocked(i) 755 } 756 757 // GetRangeID returns the Range ID. 758 func (r *Replica) GetRangeID() roachpb.RangeID { 759 return r.RangeID 760 } 761 762 // GetGCThreshold returns the GC threshold. 763 func (r *Replica) GetGCThreshold() hlc.Timestamp { 764 r.mu.RLock() 765 defer r.mu.RUnlock() 766 return *r.mu.state.GCThreshold 767 } 768 769 // getImpliedGCThresholdRLocked returns the gc threshold of the replica which 770 // should be used to determine the validity of commands. The returned timestamp 771 // may be newer than the replica's true GC threshold if strict enforcement 772 // is enabled and the TTL has passed. If this is an admin command or this range 773 // contains data outside of the user keyspace, we return the true GC threshold. 774 func (r *Replica) getImpliedGCThresholdRLocked( 775 st *kvserverpb.LeaseStatus, isAdmin bool, 776 ) hlc.Timestamp { 777 threshold := *r.mu.state.GCThreshold 778 779 // The GC threshold is the oldest value we can return here. 
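	// Illustration (hypothetical numbers): with strict enforcement enabled, a
	// valid lease and a 25h GC TTL, the implied threshold is the lease status
	// timestamp minus 25h, as computed by gc.CalculateThreshold below. A
	// cached protected timestamp record older than that pulls the threshold
	// back to just before the record's timestamp, so reads at or above the
	// protected time still succeed.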
780 if isAdmin || !StrictGCEnforcement.Get(&r.store.ClusterSettings().SV) || 781 r.isSystemRangeRLocked() { 782 return threshold 783 } 784 785 // In order to make this check inexpensive, we keep a copy of the reading of 786 // protected timestamp state in the replica. This state may be stale, may not 787 // exist, or may be unusable given the current lease status. In those cases we 788 // must return the GC threshold. On the one hand this seems like a big deal, 789 // after a lease transfer, for minutes, users will be able to read data that 790 // has technically expired. Fortunately this strict enforcement is merely a 791 // user experience win; it's always safe to allow reads to continue so long 792 // as they are after the GC threshold. 793 c := r.mu.cachedProtectedTS 794 if st.State != kvserverpb.LeaseState_VALID || c.readAt.Less(st.Lease.Start) { 795 return threshold 796 } 797 798 impliedThreshold := gc.CalculateThreshold(st.Timestamp, *r.mu.zone.GC) 799 threshold.Forward(impliedThreshold) 800 801 // If we have a protected timestamp record which precedes the implied 802 // threshold, use the threshold it implies instead. 803 if c.earliestRecord != nil && c.earliestRecord.Timestamp.Less(threshold) { 804 threshold = c.earliestRecord.Timestamp.Prev() 805 } 806 return threshold 807 } 808 809 // isSystemRange returns true if r's key range precedes keys.UserTableDataMin. 810 func (r *Replica) isSystemRange() bool { 811 r.mu.RLock() 812 defer r.mu.RUnlock() 813 return r.isSystemRangeRLocked() 814 } 815 816 func (r *Replica) isSystemRangeRLocked() bool { 817 return r.mu.state.Desc.StartKey.Less(roachpb.RKey(keys.UserTableDataMin)) 818 } 819 820 // maxReplicaIDOfAny returns the maximum ReplicaID of any replica, including 821 // voters and learners. 822 func maxReplicaIDOfAny(desc *roachpb.RangeDescriptor) roachpb.ReplicaID { 823 if desc == nil || !desc.IsInitialized() { 824 return 0 825 } 826 var maxID roachpb.ReplicaID 827 for _, repl := range desc.Replicas().All() { 828 if repl.ReplicaID > maxID { 829 maxID = repl.ReplicaID 830 } 831 } 832 return maxID 833 } 834 835 // LastReplicaAdded returns the ID of the most recently added replica and the 836 // time at which it was added. 837 func (r *Replica) LastReplicaAdded() (roachpb.ReplicaID, time.Time) { 838 r.mu.RLock() 839 defer r.mu.RUnlock() 840 return r.mu.lastReplicaAdded, r.mu.lastReplicaAddedTime 841 } 842 843 // GetReplicaDescriptor returns the replica for this range from the range 844 // descriptor. Returns a *RangeNotFoundError if the replica is not found. 845 // No other errors are returned. 846 func (r *Replica) GetReplicaDescriptor() (roachpb.ReplicaDescriptor, error) { 847 r.mu.RLock() 848 defer r.mu.RUnlock() 849 return r.getReplicaDescriptorRLocked() 850 } 851 852 // getReplicaDescriptorRLocked is like getReplicaDescriptor, but assumes that 853 // r.mu is held for either reading or writing. 
854 func (r *Replica) getReplicaDescriptorRLocked() (roachpb.ReplicaDescriptor, error) { 855 repDesc, ok := r.mu.state.Desc.GetReplicaDescriptor(r.store.StoreID()) 856 if ok { 857 return repDesc, nil 858 } 859 return roachpb.ReplicaDescriptor{}, roachpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID()) 860 } 861 862 func (r *Replica) getMergeCompleteCh() chan struct{} { 863 r.mu.RLock() 864 defer r.mu.RUnlock() 865 return r.getMergeCompleteChRLocked() 866 } 867 868 func (r *Replica) getMergeCompleteChRLocked() chan struct{} { 869 return r.mu.mergeComplete 870 } 871 872 // setLastReplicaDescriptors sets the the most recently seen replica 873 // descriptors to those contained in the *RaftMessageRequest, acquiring r.mu 874 // to do so. 875 func (r *Replica) setLastReplicaDescriptors(req *RaftMessageRequest) { 876 r.mu.Lock() 877 r.mu.lastFromReplica = req.FromReplica 878 r.mu.lastToReplica = req.ToReplica 879 r.mu.Unlock() 880 } 881 882 // GetMVCCStats returns a copy of the MVCC stats object for this range. 883 // This accessor is thread-safe, but provides no guarantees about its 884 // synchronization with any concurrent writes. 885 func (r *Replica) GetMVCCStats() enginepb.MVCCStats { 886 r.mu.RLock() 887 defer r.mu.RUnlock() 888 return *r.mu.state.Stats 889 } 890 891 // GetSplitQPS returns the Replica's queries/s request rate. 892 // 893 // NOTE: This should only be used for load based splitting, only 894 // works when the load based splitting cluster setting is enabled. 895 // 896 // Use QueriesPerSecond() for current QPS stats for all other purposes. 897 func (r *Replica) GetSplitQPS() float64 { 898 return r.loadBasedSplitter.LastQPS(timeutil.Now()) 899 } 900 901 // ContainsKey returns whether this range contains the specified key. 902 // 903 // TODO(bdarnell): This is not the same as RangeDescriptor.ContainsKey. 904 func (r *Replica) ContainsKey(key roachpb.Key) bool { 905 return kvserverbase.ContainsKey(r.Desc(), key) 906 } 907 908 // ContainsKeyRange returns whether this range contains the specified 909 // key range from start to end. 910 func (r *Replica) ContainsKeyRange(start, end roachpb.Key) bool { 911 return kvserverbase.ContainsKeyRange(r.Desc(), start, end) 912 } 913 914 // GetLastReplicaGCTimestamp reads the timestamp at which the replica was 915 // last checked for removal by the replica gc queue. 916 func (r *Replica) GetLastReplicaGCTimestamp(ctx context.Context) (hlc.Timestamp, error) { 917 key := keys.RangeLastReplicaGCTimestampKey(r.RangeID) 918 var timestamp hlc.Timestamp 919 _, err := storage.MVCCGetProto(ctx, r.store.Engine(), key, hlc.Timestamp{}, ×tamp, 920 storage.MVCCGetOptions{}) 921 if err != nil { 922 return hlc.Timestamp{}, err 923 } 924 return timestamp, nil 925 } 926 927 func (r *Replica) setLastReplicaGCTimestamp(ctx context.Context, timestamp hlc.Timestamp) error { 928 key := keys.RangeLastReplicaGCTimestampKey(r.RangeID) 929 return storage.MVCCPutProto(ctx, r.store.Engine(), nil, key, hlc.Timestamp{}, nil, ×tamp) 930 } 931 932 // getQueueLastProcessed returns the last processed timestamp for the 933 // specified queue, or the zero timestamp if not available. 
934 func (r *Replica) getQueueLastProcessed(ctx context.Context, queue string) (hlc.Timestamp, error) { 935 key := keys.QueueLastProcessedKey(r.Desc().StartKey, queue) 936 var timestamp hlc.Timestamp 937 if r.store != nil { 938 _, err := storage.MVCCGetProto(ctx, r.store.Engine(), key, hlc.Timestamp{}, ×tamp, 939 storage.MVCCGetOptions{}) 940 if err != nil { 941 log.VErrEventf(ctx, 2, "last processed timestamp unavailable: %s", err) 942 return hlc.Timestamp{}, err 943 } 944 } 945 log.VEventf(ctx, 2, "last processed timestamp: %s", timestamp) 946 return timestamp, nil 947 } 948 949 // setQueueLastProcessed writes the last processed timestamp for the 950 // specified queue. 951 func (r *Replica) setQueueLastProcessed( 952 ctx context.Context, queue string, timestamp hlc.Timestamp, 953 ) error { 954 key := keys.QueueLastProcessedKey(r.Desc().StartKey, queue) 955 return r.store.DB().PutInline(ctx, key, ×tamp) 956 } 957 958 // RaftStatus returns the current raft status of the replica. It returns nil 959 // if the Raft group has not been initialized yet. 960 func (r *Replica) RaftStatus() *raft.Status { 961 r.mu.RLock() 962 defer r.mu.RUnlock() 963 return r.raftStatusRLocked() 964 } 965 966 func (r *Replica) raftStatusRLocked() *raft.Status { 967 if rg := r.mu.internalRaftGroup; rg != nil { 968 s := rg.Status() 969 return &s 970 } 971 return nil 972 } 973 974 func (r *Replica) raftBasicStatusRLocked() raft.BasicStatus { 975 if rg := r.mu.internalRaftGroup; rg != nil { 976 return rg.BasicStatus() 977 } 978 return raft.BasicStatus{} 979 } 980 981 // State returns a copy of the internal state of the Replica, along with some 982 // auxiliary information. 983 func (r *Replica) State() kvserverpb.RangeInfo { 984 var ri kvserverpb.RangeInfo 985 986 // NB: this acquires an RLock(). Reentrant RLocks are deadlock prone, so do 987 // this first before RLocking below. Performance of this extra lock 988 // acquisition is not a concern. 989 ri.ActiveClosedTimestamp, _ = r.maxClosed(context.Background()) 990 991 // NB: numRangefeedRegistrations doesn't require Replica.mu to be locked. 992 // However, it does require coordination between multiple goroutines, so 993 // it's best to keep it out of the Replica.mu critical section. 994 ri.RangefeedRegistrations = int64(r.numRangefeedRegistrations()) 995 996 r.mu.RLock() 997 defer r.mu.RUnlock() 998 ri.ReplicaState = *(protoutil.Clone(&r.mu.state)).(*kvserverpb.ReplicaState) 999 ri.LastIndex = r.mu.lastIndex 1000 ri.NumPending = uint64(r.numPendingProposalsRLocked()) 1001 ri.RaftLogSize = r.mu.raftLogSize 1002 ri.RaftLogSizeTrusted = r.mu.raftLogSizeTrusted 1003 ri.NumDropped = uint64(r.mu.droppedMessages) 1004 if r.mu.proposalQuota != nil { 1005 ri.ApproximateProposalQuota = int64(r.mu.proposalQuota.ApproximateQuota()) 1006 ri.ProposalQuotaBaseIndex = int64(r.mu.proposalQuotaBaseIndex) 1007 ri.ProposalQuotaReleaseQueue = make([]int64, len(r.mu.quotaReleaseQueue)) 1008 for i, a := range r.mu.quotaReleaseQueue { 1009 if a != nil { 1010 ri.ProposalQuotaReleaseQueue[i] = int64(a.Acquired()) 1011 } 1012 } 1013 } 1014 ri.RangeMaxBytes = *r.mu.zone.RangeMaxBytes 1015 if desc := ri.ReplicaState.Desc; desc != nil { 1016 // Learner replicas don't serve follower reads, but they still receive 1017 // closed timestamp updates, so include them here. 
1018 allReplicas := desc.Replicas().All() 1019 for i := range allReplicas { 1020 replDesc := &allReplicas[i] 1021 r.store.cfg.ClosedTimestamp.Storage.VisitDescending(replDesc.NodeID, func(e ctpb.Entry) (done bool) { 1022 mlai, found := e.MLAI[r.RangeID] 1023 if !found { 1024 return false // not done 1025 } 1026 if ri.NewestClosedTimestamp.ClosedTimestamp.Less(e.ClosedTimestamp) { 1027 ri.NewestClosedTimestamp.NodeID = replDesc.NodeID 1028 ri.NewestClosedTimestamp.ClosedTimestamp = e.ClosedTimestamp 1029 ri.NewestClosedTimestamp.MLAI = int64(mlai) 1030 ri.NewestClosedTimestamp.Epoch = int64(e.Epoch) 1031 } 1032 return true // done 1033 }) 1034 } 1035 } 1036 return ri 1037 } 1038 1039 // assertStateLocked can be called from the Raft goroutine to check that the 1040 // in-memory and on-disk states of the Replica are congruent. 1041 // Requires that both r.raftMu and r.mu are held. 1042 // 1043 // TODO(tschottdorf): Consider future removal (for example, when #7224 is resolved). 1044 func (r *Replica) assertStateLocked(ctx context.Context, reader storage.Reader) { 1045 diskState, err := r.mu.stateLoader.Load(ctx, reader, r.mu.state.Desc) 1046 if err != nil { 1047 log.Fatalf(ctx, "%v", err) 1048 } 1049 if !diskState.Equal(r.mu.state) { 1050 // The roundabout way of printing here is to expose this information in sentry.io. 1051 // 1052 // TODO(dt): expose properly once #15892 is addressed. 1053 log.Errorf(ctx, "on-disk and in-memory state diverged:\n%s", 1054 pretty.Diff(diskState, r.mu.state)) 1055 r.mu.state.Desc, diskState.Desc = nil, nil 1056 log.Fatalf(ctx, "on-disk and in-memory state diverged: %s", 1057 log.Safe(pretty.Diff(diskState, r.mu.state))) 1058 } 1059 } 1060 1061 // checkExecutionCanProceed returns an error if a batch request cannot be 1062 // executed by the Replica. An error indicates that the Replica is not live and 1063 // able to serve traffic or that the request is not compatible with the state of 1064 // the Range. 1065 // 1066 // The method accepts a concurrency Guard and a LeaseStatus parameter. These are 1067 // used to indicate whether the caller has acquired latches and checked the 1068 // Range lease. The method will only check for a pending merge if both of these 1069 // conditions are true. If either !g.HoldingLatches() or st == nil then the 1070 // method will not check for a pending merge. Callers might be ok with this if 1071 // they know that they will end up checking for a pending merge at some later 1072 // time. 1073 func (r *Replica) checkExecutionCanProceed( 1074 ba *roachpb.BatchRequest, g *concurrency.Guard, st *kvserverpb.LeaseStatus, 1075 ) error { 1076 rSpan, err := keys.Range(ba.Requests) 1077 if err != nil { 1078 return err 1079 } 1080 r.mu.RLock() 1081 defer r.mu.RUnlock() 1082 if _, err := r.isDestroyedRLocked(); err != nil { 1083 return err 1084 } else if err := r.checkSpanInRangeRLocked(rSpan); err != nil { 1085 return err 1086 } else if err := r.checkTSAboveGCThresholdRLocked(ba.Timestamp, st, ba.IsAdmin()); err != nil { 1087 return err 1088 } else if g.HoldingLatches() && st != nil { 1089 // Only check for a pending merge if latches are held and the Range 1090 // lease is held by this Replica. Without both of these conditions, 1091 // checkForPendingMergeRLocked could return false negatives. 1092 return r.checkForPendingMergeRLocked(ba) 1093 } 1094 return nil 1095 } 1096 1097 // checkExecutionCanProceedForRangeFeed returns an error if a rangefeed request 1098 // cannot be executed by the Replica. 
1099 func (r *Replica) checkExecutionCanProceedForRangeFeed( 1100 rSpan roachpb.RSpan, ts hlc.Timestamp, 1101 ) error { 1102 now := r.Clock().Now() 1103 r.mu.RLock() 1104 defer r.mu.RUnlock() 1105 status := r.leaseStatus(*r.mu.state.Lease, now, r.mu.minLeaseProposedTS) 1106 if _, err := r.isDestroyedRLocked(); err != nil { 1107 return err 1108 } else if err := r.checkSpanInRangeRLocked(rSpan); err != nil { 1109 return err 1110 } else if err := r.checkTSAboveGCThresholdRLocked(ts, &status, false /* isAdmin */); err != nil { 1111 return err 1112 } else if r.requiresExpiringLeaseRLocked() { 1113 // Ensure that the range does not require an expiration-based lease. If it 1114 // does, it will never get closed timestamp updates and the rangefeed will 1115 // never be able to advance its resolved timestamp. 1116 return errors.New("expiration-based leases are incompatible with rangefeeds") 1117 } 1118 return nil 1119 } 1120 1121 // checkSpanInRangeRLocked returns an error if a request (identified by its 1122 // key span) can be run on the replica. 1123 func (r *Replica) checkSpanInRangeRLocked(rspan roachpb.RSpan) error { 1124 desc := r.mu.state.Desc 1125 if desc.ContainsKeyRange(rspan.Key, rspan.EndKey) { 1126 return nil 1127 } 1128 return roachpb.NewRangeKeyMismatchError( 1129 rspan.Key.AsRawKey(), rspan.EndKey.AsRawKey(), desc, 1130 ) 1131 } 1132 1133 // checkTSAboveGCThresholdRLocked returns an error if a request (identified 1134 // by its MVCC timestamp) can be run on the replica. 1135 func (r *Replica) checkTSAboveGCThresholdRLocked( 1136 ts hlc.Timestamp, st *kvserverpb.LeaseStatus, isAdmin bool, 1137 ) error { 1138 threshold := r.getImpliedGCThresholdRLocked(st, isAdmin) 1139 if threshold.Less(ts) { 1140 return nil 1141 } 1142 return &roachpb.BatchTimestampBeforeGCError{ 1143 Timestamp: ts, 1144 Threshold: threshold, 1145 } 1146 } 1147 1148 // checkForPendingMergeRLocked determines whether the replica is being merged 1149 // into its left-hand neighbor. If so, an error is returned to prevent the 1150 // request from proceeding until the merge completes. 1151 func (r *Replica) checkForPendingMergeRLocked(ba *roachpb.BatchRequest) error { 1152 if r.getMergeCompleteChRLocked() == nil { 1153 return nil 1154 } 1155 if ba.IsSingleSubsumeRequest() { 1156 return nil 1157 } 1158 // The replica is being merged into its left-hand neighbor. This request 1159 // cannot proceed until the merge completes, signaled by the closing of the 1160 // channel. 1161 // 1162 // It is very important that this check occur after we have acquired latches 1163 // from the spanlatch manager. Only after we release these latches are we 1164 // guaranteed that we're not racing with a Subsume command. (Subsume 1165 // commands declare a conflict with all other commands.) It is also 1166 // important that this check occur after we have verified that this replica 1167 // is the leaseholder. Only the leaseholder will have its merge complete 1168 // channel set. 1169 // 1170 // Note that Subsume commands are exempt from waiting on the mergeComplete 1171 // channel. This is necessary to avoid deadlock. While normally a Subsume 1172 // request will trigger the installation of a mergeComplete channel after it 1173 // is executed, it may sometimes execute after the mergeComplete channel has 1174 // been installed. Consider the case where the RHS replica acquires a new 1175 // lease after the merge transaction deletes its local range descriptor but 1176 // before the Subsume command is sent. 
The lease acquisition request will 1177 // notice the intent on the local range descriptor and install a 1178 // mergeComplete channel. If the forthcoming Subsume blocked on that 1179 // channel, the merge transaction would deadlock. 1180 // 1181 // This exclusion admits a small race condition. If a Subsume request is 1182 // sent to the right-hand side of a merge, outside of a merge transaction, 1183 // after the merge has committed but before the RHS has noticed that the 1184 // merge has committed, the request may return stale data. Since the merge 1185 // has committed, the LHS may have processed writes to the keyspace 1186 // previously owned by the RHS that the RHS is unaware of. This window 1187 // closes quickly, as the RHS will soon notice the merge transaction has 1188 // committed and mark itself as destroyed, which prevents it from serving 1189 // all traffic, including Subsume requests. 1190 // 1191 // In our current, careful usage of Subsume, this race condition is 1192 // irrelevant. Subsume is only sent from within a merge transaction, and 1193 // merge transactions read the RHS descriptor at the beginning of the 1194 // transaction to verify that it has not already been merged away. 1195 // 1196 // We can't wait for the merge to complete here, though. The replica might 1197 // need to respond to a Subsume request in order for the merge to complete, 1198 // and blocking here would force that Subsume request to sit in hold its 1199 // latches forever, deadlocking the merge. Instead, we release the latches 1200 // we acquired above and return a MergeInProgressError. The store will catch 1201 // that error and resubmit the request after mergeCompleteCh closes. See 1202 // #27442 for the full context. 1203 return &roachpb.MergeInProgressError{} 1204 } 1205 1206 // isNewerThanSplit is a helper used in split(Pre|Post)Apply to 1207 // determine whether the Replica on the right hand side of the split must 1208 // have been removed from this store after the split. There is one 1209 // false negative where false will be returned but the hard state may 1210 // be due to a newer replica which is outlined below. It should be safe. 1211 // 1212 // TODO(ajwerner): Ideally if this store had ever learned that the replica 1213 // created by the split were removed it would not forget that fact. 1214 // There exists one edge case where the store may learn that it should house 1215 // a replica of the same range with a higher replica ID and then forget. 1216 // If the first raft message this store ever receives for the this range 1217 // contains a replica ID higher than the replica ID in the split trigger 1218 // then an in-memory replica at that higher replica ID will be created and 1219 // no tombstone at a lower replica ID will be written. If the server then 1220 // crashes it will forget that it had ever been the higher replica ID. The 1221 // server may then proceed to process the split and initialize a replica at 1222 // the replica ID implied by the split. This is potentially problematic as 1223 // the replica may have voted as this higher replica ID and when it rediscovers 1224 // the higher replica ID it will delete all of the state corresponding to the 1225 // older replica ID including its hard state which may have been synthesized 1226 // with votes as the newer replica ID. This case tends to be handled safely 1227 // in practice because the replica should only be receiving messages as the 1228 // newer replica ID after it has been added to the range. 
Prior to learner 1229 // replicas we would only add a store to a range after we've successfully 1230 // applied a pre-emptive snapshot. If the store were to split between the 1231 // preemptive snapshot and the addition then the addition would fail due to 1232 // the conditional put logic. If the store were to then enable learners then 1233 // we're still okay because we won't promote a learner unless we succeed in 1234 // sending a learner snapshot. If we fail to send the replica never becomes 1235 // a voter then its votes don't matter and are safe to discard. 1236 // 1237 // Despite the safety due to the change replicas protocol explained above 1238 // it'd be good to know for sure that a replica ID for a range on a store 1239 // is always monotonically increasing, even across restarts. 1240 // 1241 // See TestProcessSplitAfterRightHandSideHasBeenRemoved. 1242 func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool { 1243 r.mu.RLock() 1244 defer r.mu.RUnlock() 1245 return r.isNewerThanSplitRLocked(split) 1246 } 1247 1248 func (r *Replica) isNewerThanSplitRLocked(split *roachpb.SplitTrigger) bool { 1249 rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID()) 1250 // If we have written a tombstone for this range then we know that the RHS 1251 // must have already been removed at the split replica ID. 1252 return r.mu.tombstoneMinReplicaID != 0 || 1253 // If the first raft message we received for the RHS range was for a replica 1254 // ID which is above the replica ID of the split then we would not have 1255 // written a tombstone but we will have a replica ID that will exceed the 1256 // split replica ID. 1257 r.mu.replicaID > rightDesc.ReplicaID 1258 } 1259 1260 // endCmds holds necessary information to end a batch after Raft 1261 // command processing. 1262 type endCmds struct { 1263 repl *Replica 1264 g *concurrency.Guard 1265 } 1266 1267 // move moves the endCmds into the return value, clearing and making 1268 // a call to done on the receiver a no-op. 1269 func (ec *endCmds) move() endCmds { 1270 res := *ec 1271 *ec = endCmds{} 1272 return res 1273 } 1274 1275 // done releases the latches acquired by the command and updates 1276 // the timestamp cache using the final timestamp of each command. 1277 // 1278 // No-op if the receiver has been zeroed out by a call to move. 1279 // Idempotent and is safe to call more than once. 1280 func (ec *endCmds) done( 1281 ctx context.Context, ba *roachpb.BatchRequest, br *roachpb.BatchResponse, pErr *roachpb.Error, 1282 ) { 1283 if ec.repl == nil { 1284 // The endCmds were cleared. 1285 return 1286 } 1287 defer ec.move() // clear 1288 1289 // Update the timestamp cache if the request is not being re-evaluated. Each 1290 // request is considered in turn; only those marked as affecting the cache are 1291 // processed. 1292 ec.repl.updateTimestampCache(ctx, ba, br, pErr) 1293 1294 // Release the latches acquired by the request and exit lock wait-queues. 1295 // Must be done AFTER the timestamp cache is updated. ec.g is only set when 1296 // the Raft proposal has assumed responsibility for the request. 1297 if ec.g != nil { 1298 ec.repl.concMgr.FinishReq(ec.g) 1299 } 1300 } 1301 1302 // maybeWatchForMerge checks whether a merge of this replica into its left 1303 // neighbor is in its critical phase and, if so, arranges to block all requests 1304 // until the merge completes. 
func (r *Replica) maybeWatchForMerge(ctx context.Context) error {
	desc := r.Desc()
	descKey := keys.RangeDescriptorKey(desc.StartKey)
	_, intent, err := storage.MVCCGet(ctx, r.Engine(), descKey, r.Clock().Now(),
		storage.MVCCGetOptions{Inconsistent: true})
	if err != nil {
		return err
	} else if intent == nil {
		return nil
	}
	val, _, err := storage.MVCCGetAsTxn(
		ctx, r.Engine(), descKey, intent.Txn.WriteTimestamp, intent.Txn)
	if err != nil {
		return err
	} else if val != nil {
		return nil
	}

	// At this point, we know we have a deletion intent on our range descriptor.
	// That means a merge is in progress. Block all commands until we can
	// retrieve an updated range descriptor from meta2, which will indicate
	// whether the merge succeeded or not.

	mergeCompleteCh := make(chan struct{})
	r.mu.Lock()
	if r.mu.mergeComplete != nil {
		// Another request already noticed the merge, installed a mergeComplete
		// channel, and launched a goroutine to watch for the merge's completion.
		// Nothing more to do.
		r.mu.Unlock()
		return nil
	}
	r.mu.mergeComplete = mergeCompleteCh
	// The RHS of a merge is not permitted to quiesce while a mergeComplete
	// channel is installed. (If the RHS is quiescent when the merge commits, any
	// orphaned followers would fail to queue themselves for GC.) Unquiesce the
	// range in case it managed to quiesce between when the Subsume request
	// arrived and now, which is rare but entirely legal.
	r.unquiesceLocked()
	r.mu.Unlock()

	taskCtx := r.AnnotateCtx(context.Background())
	err = r.store.stopper.RunAsyncTask(taskCtx, "wait-for-merge", func(ctx context.Context) {
		var pushTxnRes *roachpb.PushTxnResponse
		for retry := retry.Start(base.DefaultRetryOptions()); retry.Next(); {
			// Wait for the merge transaction to complete by attempting to push it. We
			// don't want to accidentally abort the merge transaction, so we use the
			// minimum transaction priority. Note that a push type of
			// roachpb.PUSH_TOUCH, though it might appear more semantically correct,
			// returns immediately and causes us to spin hot, whereas
			// roachpb.PUSH_ABORT efficiently blocks until the transaction completes.
			b := &kv.Batch{}
			b.Header.Timestamp = r.Clock().Now()
			b.AddRawRequest(&roachpb.PushTxnRequest{
				RequestHeader: roachpb.RequestHeader{Key: intent.Txn.Key},
				PusherTxn: roachpb.Transaction{
					TxnMeta: enginepb.TxnMeta{Priority: enginepb.MinTxnPriority},
				},
				PusheeTxn: intent.Txn,
				PushType:  roachpb.PUSH_ABORT,
			})
			if err := r.DB().Run(ctx, b); err != nil {
				select {
				case <-r.store.stopper.ShouldQuiesce():
					// The server is shutting down. The error while pushing the
					// transaction was probably caused by the shutdown, so ignore it.
					return
				default:
					log.Warningf(ctx, "error while watching for merge to complete: PushTxn: %+v", err)
					// We can't safely unblock traffic until we can prove that the merge
					// transaction is committed or aborted. Nothing to do but try again.
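					// (Aside, not part of the original comments: the enclosing
					// retry loop uses base.DefaultRetryOptions, so failed pushes
					// are retried with backoff; the loop is only left via the
					// break below on success or the return above on shutdown.)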
					continue
				}
			}
			pushTxnRes = b.RawResponse().Responses[0].GetInner().(*roachpb.PushTxnResponse)
			break
		}

		var mergeCommitted bool
		switch pushTxnRes.PusheeTxn.Status {
		case roachpb.PENDING, roachpb.STAGING:
			log.Fatalf(ctx, "PushTxn returned while merge transaction %s was still %s",
				intent.Txn.ID.Short(), pushTxnRes.PusheeTxn.Status)
		case roachpb.COMMITTED:
			// If PushTxn claims that the transaction committed, then the transaction
			// definitely committed.
			mergeCommitted = true
		case roachpb.ABORTED:
			// If PushTxn claims that the transaction aborted, it's not a guarantee
			// that the transaction actually aborted. It could also mean that the
			// transaction completed, resolved its intents, and GC'd its transaction
			// record before our PushTxn arrived. To figure out what happened, we
			// need to look in meta2.
			var getRes *roachpb.GetResponse
			for retry := retry.Start(base.DefaultRetryOptions()); retry.Next(); {
				metaKey := keys.RangeMetaKey(desc.EndKey)
				res, pErr := kv.SendWrappedWith(ctx, r.DB().NonTransactionalSender(), roachpb.Header{
					// Use READ_UNCOMMITTED to avoid trying to resolve intents, since
					// resolving those intents might involve sending requests to this
					// range, and that could deadlock. See the comment on
					// TestStoreRangeMergeConcurrentSplit for details.
					ReadConsistency: roachpb.READ_UNCOMMITTED,
				}, &roachpb.GetRequest{
					RequestHeader: roachpb.RequestHeader{Key: metaKey.AsRawKey()},
				})
				if pErr != nil {
					select {
					case <-r.store.stopper.ShouldQuiesce():
						// The server is shutting down. The error while fetching the range
						// descriptor was probably caused by the shutdown, so ignore it.
						return
					default:
						log.Warningf(ctx, "error while watching for merge to complete: Get %s: %s", metaKey, pErr)
						// We can't safely unblock traffic until we can prove that the merge
						// transaction is committed or aborted. Nothing to do but try again.
						continue
					}
				}
				getRes = res.(*roachpb.GetResponse)
				break
			}
			if getRes.Value == nil {
				// A range descriptor with our end key is no longer present in meta2, so
				// the merge must have committed.
				mergeCommitted = true
			} else {
				// A range descriptor with our end key is still present in meta2. The
				// merge committed iff that range descriptor has a different range ID.
				var meta2Desc roachpb.RangeDescriptor
				if err := getRes.Value.GetProto(&meta2Desc); err != nil {
					log.Fatalf(ctx, "error while watching for merge to complete: "+
						"unmarshaling meta2 range descriptor: %s", err)
				}
				if meta2Desc.RangeID != r.RangeID {
					mergeCommitted = true
				}
			}
		}
		r.raftMu.Lock()
		r.mu.Lock()
		if mergeCommitted && r.mu.destroyStatus.IsAlive() {
			// The merge committed but the left-hand replica on this store hasn't
			// subsumed this replica yet. Mark this replica as destroyed so it
			// doesn't serve requests when we close the mergeCompleteCh below.
			r.mu.destroyStatus.Set(roachpb.NewRangeNotFoundError(r.RangeID, r.store.StoreID()), destroyReasonMergePending)
		}
		// Unblock pending requests. If the merge committed, the requests will
		// notice that the replica has been destroyed and return an appropriate
		// error. If the merge aborted, the requests will be handled normally.
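		// (Aside, not part of the original comments: raftMu is acquired above
		// before mu, matching the Replica's documented lock ordering, and both
		// are held across setting the destroy status and clearing the channel,
		// so a request that acquires r.mu afterwards sees both the destroy
		// status and the cleared mergeComplete channel.)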
		r.mu.mergeComplete = nil
		close(mergeCompleteCh)
		r.mu.Unlock()
		r.raftMu.Unlock()
	})
	if errors.Is(err, stop.ErrUnavailable) {
		// We weren't able to launch a goroutine to watch for the merge's completion
		// because the server is shutting down. Normally failing to launch the
		// watcher goroutine would wedge pending requests on the replica's
		// mergeComplete channel forever, but since we're shutting down those
		// requests will get dropped and retried on another node. Suppress the error.
		err = nil
	}
	return err
}

func (r *Replica) maybeTransferRaftLeadership(ctx context.Context) {
	r.mu.Lock()
	r.maybeTransferRaftLeadershipLocked(ctx)
	r.mu.Unlock()
}

// maybeTransferRaftLeadershipLocked attempts to transfer the leadership away
// from this node to the leaseholder, if this node is the current raft leader
// but not the leaseholder. We don't attempt to transfer leadership if the
// leaseholder is behind on applying the log.
//
// We like it when leases and raft leadership are collocated because that
// facilitates quick command application (requests generally need to make it to
// both the lease holder and the raft leader before being applied by other
// replicas).
func (r *Replica) maybeTransferRaftLeadershipLocked(ctx context.Context) {
	if r.store.TestingKnobs().DisableLeaderFollowsLeaseholder {
		return
	}
	lease := *r.mu.state.Lease
	if lease.OwnedBy(r.StoreID()) || !r.isLeaseValidRLocked(lease, r.Clock().Now()) {
		return
	}
	raftStatus := r.raftStatusRLocked()
	if raftStatus == nil || raftStatus.RaftState != raft.StateLeader {
		return
	}
	lhReplicaID := uint64(lease.Replica.ReplicaID)
	lhProgress, ok := raftStatus.Progress[lhReplicaID]
	if (ok && lhProgress.Match >= raftStatus.Commit) || r.mu.draining {
		log.VEventf(ctx, 1, "transferring raft leadership to replica ID %v", lhReplicaID)
		r.store.metrics.RangeRaftLeaderTransfers.Inc(1)
		r.mu.internalRaftGroup.TransferLeader(lhReplicaID)
	}
}

func (r *Replica) mergeInProgressRLocked() bool {
	return r.mu.mergeComplete != nil
}

func (r *Replica) getReplicaDescriptorByIDRLocked(
	replicaID roachpb.ReplicaID, fallback roachpb.ReplicaDescriptor,
) (roachpb.ReplicaDescriptor, error) {
	if repDesc, ok := r.mu.state.Desc.GetReplicaDescriptorByID(replicaID); ok {
		return repDesc, nil
	}
	if fallback.ReplicaID == replicaID {
		return fallback, nil
	}
	return roachpb.ReplicaDescriptor{},
		errors.Errorf("replica %d not present in %v, %v",
			replicaID, fallback, r.mu.state.Desc.Replicas())
}

// checkIfTxnAborted checks the txn AbortSpan for the given
// transaction. In case the transaction has been aborted, return a
// transaction abort error.
func checkIfTxnAborted(
	ctx context.Context, rec batcheval.EvalContext, reader storage.Reader, txn roachpb.Transaction,
) *roachpb.Error {
	var entry roachpb.AbortSpanEntry
	aborted, err := rec.AbortSpan().Get(ctx, reader, txn.ID, &entry)
	if err != nil {
		return roachpb.NewError(roachpb.NewReplicaCorruptionError(
			errors.Wrap(err, "could not read from AbortSpan")))
	}
	if aborted {
		// We hit the cache, so let the transaction restart.
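		// (Aside, not part of the original comments: the clone below carries
		// the ABORTED status and a priority bumped to at least the priority
		// recorded in the AbortSpan entry back to the client inside the
		// TransactionAbortedError.)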
		log.VEventf(ctx, 1, "found AbortSpan entry for %s with priority %d",
			txn.ID.Short(), entry.Priority)
		newTxn := txn.Clone()
		if entry.Priority > newTxn.Priority {
			newTxn.Priority = entry.Priority
		}
		newTxn.Status = roachpb.ABORTED
		return roachpb.NewErrorWithTxn(
			roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORT_SPAN), newTxn)
	}
	return nil
}

func (r *Replica) startKey() roachpb.RKey {
	return r.Desc().StartKey
}

// Less implements the btree.Item interface.
func (r *Replica) Less(i btree.Item) bool {
	return r.startKey().Less(i.(rangeKeyItem).startKey())
}

// GetLeaseHistory returns the lease history stored on this replica.
func (r *Replica) GetLeaseHistory() []roachpb.Lease {
	if r.leaseHistory == nil {
		return nil
	}

	return r.leaseHistory.get()
}

// EnableLeaseHistory turns on the lease history for testing purposes. It
// returns a function that restores the original state and can be deferred.
func EnableLeaseHistory(maxEntries int) func() {
	originalValue := leaseHistoryMaxEntries
	leaseHistoryMaxEntries = maxEntries
	return func() {
		leaseHistoryMaxEntries = originalValue
	}
}

// GetExternalStorage returns an ExternalStorage object, based on
// information parsed from a URI, stored in `dest`.
func (r *Replica) GetExternalStorage(
	ctx context.Context, dest roachpb.ExternalStorage,
) (cloud.ExternalStorage, error) {
	return r.store.cfg.ExternalStorage(ctx, dest)
}

// GetExternalStorageFromURI returns an ExternalStorage object, based on the given URI.
func (r *Replica) GetExternalStorageFromURI(
	ctx context.Context, uri string,
) (cloud.ExternalStorage, error) {
	return r.store.cfg.ExternalStorageFromURI(ctx, uri)
}

func (r *Replica) markSystemConfigGossipSuccess() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.failureToGossipSystemConfig = false
}

func (r *Replica) markSystemConfigGossipFailed() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.failureToGossipSystemConfig = true
}

func init() {
	tracing.RegisterTagRemapping("r", "range")
}
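
// Illustrative usage sketch (not part of the original source): a test that
// wants to inspect lease history might combine EnableLeaseHistory and
// GetLeaseHistory roughly as follows, where repl and t are hypothetical test
// fixtures:
//
//	defer EnableLeaseHistory(100)()
//	// ... drive some lease transfers through the test cluster ...
//	for _, lease := range repl.GetLeaseHistory() {
//		t.Logf("lease: %s", lease)
//	}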