github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_raft.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "math/rand" 17 "sort" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/keys" 21 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency" 23 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 24 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 25 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/storage" 28 "github.com/cockroachdb/cockroach/pkg/util" 29 "github.com/cockroachdb/cockroach/pkg/util/encoding" 30 "github.com/cockroachdb/cockroach/pkg/util/hlc" 31 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 32 "github.com/cockroachdb/cockroach/pkg/util/log" 33 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 34 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 35 "github.com/cockroachdb/cockroach/pkg/util/tracing" 36 "github.com/cockroachdb/cockroach/pkg/util/uuid" 37 "github.com/cockroachdb/errors" 38 "go.etcd.io/etcd/raft" 39 "go.etcd.io/etcd/raft/raftpb" 40 "go.etcd.io/etcd/raft/tracker" 41 ) 42 43 func makeIDKey() kvserverbase.CmdIDKey { 44 idKeyBuf := make([]byte, 0, raftCommandIDLen) 45 idKeyBuf = encoding.EncodeUint64Ascending(idKeyBuf, uint64(rand.Int63())) 46 return kvserverbase.CmdIDKey(idKeyBuf) 47 } 48 49 // evalAndPropose prepares the necessary pending command struct and initializes 50 // a client command ID if one hasn't been. A verified lease is supplied as a 51 // parameter if the command requires a lease; nil otherwise. It then evaluates 52 // the command and proposes it to Raft on success. 53 // 54 // The method accepts a concurrency guard, which it assumes responsibility for 55 // if it succeeds in proposing a command into Raft. If the method does not 56 // return an error, the guard is guaranteed to be eventually freed and the 57 // caller should relinquish all ownership of it. If it does return an error, the 58 // caller retains full ownership over the guard. 59 // 60 // Return values: 61 // - a channel which receives a response or error upon application 62 // - a closure used to attempt to abandon the command. When called, it unbinds 63 // the command's context from its Raft proposal. The client is then free to 64 // terminate execution, although it is given no guarantee that the proposal 65 // won't still go on to commit and apply at some later time. 66 // - the MaxLeaseIndex of the resulting proposal, if any. 67 // - any error obtained during the creation or proposal of the command, in 68 // which case the other returned values are zero. 
69 func (r *Replica) evalAndPropose(
70 ctx context.Context, ba *roachpb.BatchRequest, g *concurrency.Guard, lease *roachpb.Lease,
71 ) (chan proposalResult, func(), int64, *roachpb.Error) {
72 idKey := makeIDKey()
73 proposal, pErr := r.requestToProposal(ctx, idKey, ba, g.LatchSpans())
74 log.Event(proposal.ctx, "evaluated request")
75
76 // If the request hit a server-side concurrency retry error, immediately
77 // propagate the error. Don't assume ownership of the concurrency guard.
78 if isConcurrencyRetryError(pErr) {
79 return nil, nil, 0, pErr
80 }
81
82 // Attach the endCmds to the proposal and assume responsibility for
83 // releasing the concurrency guard if the proposal makes it to Raft.
84 proposal.ec = endCmds{repl: r, g: g}
85
86 // Pull out proposal channel to return. proposal.doneCh may be set to
87 // nil if it is signaled in this function.
88 proposalCh := proposal.doneCh
89
90 // There are two cases where request evaluation does not lead to a Raft
91 // proposal:
92 // 1. proposal.command == nil indicates that the evaluation was a no-op
93 // and that no Raft command needs to be proposed.
94 // 2. pErr != nil corresponds to a failed proposal - the command resulted
95 // in an error.
96 if proposal.command == nil {
97 intents := proposal.Local.DetachEncounteredIntents()
98 endTxns := proposal.Local.DetachEndTxns(pErr != nil /* alwaysOnly */)
99 r.handleReadWriteLocalEvalResult(ctx, *proposal.Local)
100
101 pr := proposalResult{
102 Reply: proposal.Local.Reply,
103 Err: pErr,
104 EncounteredIntents: intents,
105 EndTxns: endTxns,
106 }
107 proposal.finishApplication(ctx, pr)
108 return proposalCh, func() {}, 0, nil
109 }
110
111 // If the request requested that Raft consensus be performed asynchronously,
112 // return a proposal result immediately on the proposal's done channel.
113 // The channel's capacity will be large enough to accommodate this.
114 if ba.AsyncConsensus {
115 if ets := proposal.Local.DetachEndTxns(false /* alwaysOnly */); len(ets) != 0 {
116 // Disallow async consensus for commands with EndTxnIntents because
117 // any !Always EndTxnIntent can't be cleaned up until after the
118 // command succeeds.
119 return nil, nil, 0, roachpb.NewErrorf("cannot perform consensus asynchronously for "+
120 "proposal with EndTxnIntents=%v; %v", ets, ba)
121 }
122
123 // Fork the proposal's context span so that the proposal's context
124 // can outlive the original proposer's context.
125 proposal.ctx, proposal.sp = tracing.ForkCtxSpan(ctx, "async consensus")
126
127 // Signal the proposal's response channel immediately.
128 reply := *proposal.Local.Reply
129 reply.Responses = append([]roachpb.ResponseUnion(nil), reply.Responses...)
130 pr := proposalResult{
131 Reply: &reply,
132 EncounteredIntents: proposal.Local.DetachEncounteredIntents(),
133 }
134 proposal.signalProposalResult(pr)
135
136 // Continue with proposal...
137 }
138
139 // Attach information about the proposer to the command.
140 proposal.command.ProposerLeaseSequence = lease.Sequence
141
142 // Once a command is written to the raft log, it must be loaded into memory
143 // and replayed on all replicas. If a command is too big, stop it here. If
144 // the command is not too big, acquire an appropriate amount of quota from
145 // the replica's proposal quota pool.
146 //
147 // TODO(tschottdorf): blocking a proposal here will leave it dangling in the
148 // closed timestamp tracker for an extended period of time, which will in turn
149 // prevent the node-wide closed timestamp from making progress.
This is quite 150 // unfortunate; we should hoist the quota pool before the reference with the 151 // closed timestamp tracker is acquired. This is better anyway; right now many 152 // commands can evaluate but then be blocked on quota, which has worse memory 153 // behavior. 154 quotaSize := uint64(proposal.command.Size()) 155 if maxSize := uint64(MaxCommandSize.Get(&r.store.cfg.Settings.SV)); quotaSize > maxSize { 156 return nil, nil, 0, roachpb.NewError(errors.Errorf( 157 "command is too large: %d bytes (max: %d)", quotaSize, maxSize, 158 )) 159 } 160 var err error 161 proposal.quotaAlloc, err = r.maybeAcquireProposalQuota(ctx, quotaSize) 162 if err != nil { 163 return nil, nil, 0, roachpb.NewError(err) 164 } 165 // Make sure we clean up the proposal if we fail to insert it into the 166 // proposal buffer successfully. This ensures that we always release any 167 // quota that we acquire. 168 defer func() { 169 if pErr != nil { 170 proposal.releaseQuota() 171 } 172 }() 173 174 if filter := r.store.TestingKnobs().TestingProposalFilter; filter != nil { 175 filterArgs := kvserverbase.ProposalFilterArgs{ 176 Ctx: ctx, 177 Cmd: *proposal.command, 178 CmdID: idKey, 179 Req: *ba, 180 } 181 if pErr := filter(filterArgs); pErr != nil { 182 return nil, nil, 0, pErr 183 } 184 } 185 186 maxLeaseIndex, pErr := r.propose(ctx, proposal) 187 if pErr != nil { 188 return nil, nil, 0, pErr 189 } 190 // Abandoning a proposal unbinds its context so that the proposal's client 191 // is free to terminate execution. However, it does nothing to try to 192 // prevent the command from succeeding. In particular, endCmds will still be 193 // invoked when the command is applied. There are a handful of cases where 194 // the command may not be applied (or even processed): the process crashes 195 // or the local replica is removed from the range. 196 abandon := func() { 197 // The proposal may or may not be in the Replica's proposals map. 198 // Instead of trying to look it up, simply modify the captured object 199 // directly. The raftMu must be locked to modify the context of a 200 // proposal because as soon as we propose a command to Raft, ownership 201 // passes to the "below Raft" machinery. 202 r.raftMu.Lock() 203 defer r.raftMu.Unlock() 204 r.mu.Lock() 205 defer r.mu.Unlock() 206 // TODO(radu): Should this context be created via tracer.ForkCtxSpan? 207 // We'd need to make sure the span is finished eventually. 208 proposal.ctx = r.AnnotateCtx(context.TODO()) 209 } 210 return proposalCh, abandon, maxLeaseIndex, nil 211 } 212 213 // propose encodes a command, starts tracking it, and proposes it to raft. The 214 // method is also responsible for assigning the command its maximum lease index. 215 // 216 // The method hands ownership of the command over to the Raft machinery. After 217 // the method returns, all access to the command must be performed while holding 218 // Replica.mu and Replica.raftMu. If a non-nil error is returned the 219 // MaxLeaseIndex is not updated. 220 func (r *Replica) propose(ctx context.Context, p *ProposalData) (index int64, pErr *roachpb.Error) { 221 222 // If an error occurs reset the command's MaxLeaseIndex to its initial value. 223 // Failure to propose will propagate to the client. An invariant of this 224 // package is that proposals which are finished carry a raft command with a 225 // MaxLeaseIndex equal to the proposal command's max lease index. 
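// To make the restore-on-error pattern below concrete: a defer's arguments are
// evaluated at the point of the defer statement, so `prev` snapshots the
// MaxLeaseIndex the command entered propose with, not the value it has when the
// function returns. A self-contained sketch of the same Go idiom, with
// hypothetical names (not code from this package, and needing the errors
// import if extracted):
//
//	func sketchRestoreOnError() (err error) {
//		x := uint64(7)
//		defer func(prev uint64) { // prev is evaluated right here, so prev == 7
//			if err != nil {
//				x = prev // restore the captured value only on error
//			}
//		}(x)
//		x = 0 // mutated, just as propose zeroes MaxLeaseIndex
//		return errors.New("proposal failed") // deferred func sees err != nil and restores x to 7
//	}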
226 defer func(prev uint64) {
227 if pErr != nil {
228 p.command.MaxLeaseIndex = prev
229 }
230 }(p.command.MaxLeaseIndex)
231
232 // Make sure the maximum lease index is unset. This field will be set in
233 // propBuf.Insert and its encoded bytes will be appended to the encoding
234 // buffer as a RaftCommandFooter.
235 p.command.MaxLeaseIndex = 0
236
237 // Determine the encoding style for the Raft command.
238 prefix := true
239 version := raftVersionStandard
240 if crt := p.command.ReplicatedEvalResult.ChangeReplicas; crt != nil {
241 // EndTxnRequest with a ChangeReplicasTrigger is special because Raft
242 // needs to understand it; it cannot simply be an opaque command. To
243 // permit this, the command is proposed by the proposal buffer using
244 // ProposeConfChange. For that reason, we also don't need a Raft command
245 // prefix because the command ID is stored in a field in
246 // raft.ConfChange.
247 log.Infof(p.ctx, "proposing %s", crt)
248 prefix = false
249
250 // Ensure that we aren't trying to remove ourselves from the range without
251 // having previously given up our lease, since the range won't be able
252 // to make progress while the lease is owned by a removed replica (and
253 // leases can stay in such a state for a very long time when using epoch-
254 // based range leases). This shouldn't happen often, but has been seen
255 // before (#12591).
256 //
257 // Note that due to atomic replication changes, when a removal is initiated,
258 // the replica remains in the descriptor, but as VOTER_{OUTGOING,DEMOTING}.
259 // We want to block it from getting into that state in the first place,
260 // since there's no stopping the actual removal/demotion once it's there.
261 // The Removed() field contains these replicas when this first
262 // transition is initiated, so its use here is copacetic.
263 replID := r.ReplicaID()
264 for _, rDesc := range crt.Removed() {
265 if rDesc.ReplicaID == replID {
266 msg := fmt.Sprintf("received invalid ChangeReplicasTrigger %s to remove self (leaseholder)", crt)
267 log.Errorf(p.ctx, "%v", msg)
268 return 0, roachpb.NewErrorf("%s: %s", r, msg)
269 }
270 }
271
272 } else if p.command.ReplicatedEvalResult.AddSSTable != nil {
273 log.VEvent(p.ctx, 4, "sideloadable proposal detected")
274 version = raftVersionSideloaded
275 r.store.metrics.AddSSTableProposals.Inc(1)
276
277 if p.command.ReplicatedEvalResult.AddSSTable.Data == nil {
278 return 0, roachpb.NewErrorf("cannot sideload empty SSTable")
279 }
280 } else if log.V(4) {
281 log.Infof(p.ctx, "proposing command %x: %s", p.idKey, p.Request.Summary())
282 }
283
284 // Create encoding buffer.
285 preLen := 0
286 if prefix {
287 preLen = raftCommandPrefixLen
288 }
289 cmdLen := p.command.Size()
290 cap := preLen + cmdLen + kvserverpb.MaxRaftCommandFooterSize()
291 data := make([]byte, preLen, cap)
292 // Encode prefix with command ID, if necessary.
293 if prefix {
294 encodeRaftCommandPrefix(data, version, p.idKey)
295 }
296 // Encode body of command.
297 data = data[:preLen+cmdLen]
298 if _, err := protoutil.MarshalTo(p.command, data[preLen:]); err != nil {
299 return 0, roachpb.NewError(err)
300 }
301
302 // Too verbose even for verbose logging, so manually enable if you want to
303 // debug proposal sizes.
304 if false {
305 log.Infof(p.ctx, `%s: proposal: %d
306 RaftCommand.ReplicatedEvalResult: %d
307 RaftCommand.ReplicatedEvalResult.Delta: %d
308 RaftCommand.WriteBatch: %d
309 `, p.Request.Summary(), cmdLen,
310 p.command.ReplicatedEvalResult.Size(),
311 p.command.ReplicatedEvalResult.Delta.Size(),
312 p.command.WriteBatch.Size(),
313 )
314 }
315
316 // Log an event if this is a large proposal. These are more likely to cause
317 // blips or worse, and it's good to be able to pick them from traces.
318 //
319 // TODO(tschottdorf): can we mark them so lightstep can group them?
320 const largeProposalEventThresholdBytes = 2 << 19 // 1mb
321 if cmdLen > largeProposalEventThresholdBytes {
322 log.Eventf(p.ctx, "proposal is large: %s", humanizeutil.IBytes(int64(cmdLen)))
323 }
324
325 // Insert into the proposal buffer, which passes the command to Raft to be
326 // proposed. The proposal buffer assigns the command a maximum lease index
327 // when it sequences it.
328 //
329 // NB: we must not hold r.mu while using the proposal buffer, see comment
330 // on the field.
331 maxLeaseIndex, err := r.mu.proposalBuf.Insert(p, data)
332 if err != nil {
333 return 0, roachpb.NewError(err)
334 }
335 return int64(maxLeaseIndex), nil
336 }
337
338 func (r *Replica) numPendingProposalsRLocked() int {
339 return len(r.mu.proposals) + r.mu.proposalBuf.Len()
340 }
341
342 // hasPendingProposalsRLocked is part of the quiescer interface.
343 // It returns true if this node has any outstanding proposals. A client might be
344 // waiting for the outcome of these proposals, so we definitely don't want to
345 // quiesce while such proposals are in-flight.
346 //
347 // Note that this method says nothing about other nodes' outstanding proposals:
348 // if this node is the current leaseholder, previous leaseholders might have
349 // proposals on which they're waiting. If this node is not the current
350 // leaseholder, then obviously whoever is the current leaseholder might have
351 // pending proposals. This method is called in two places: on the current
352 // leaseholder when deciding whether the leaseholder should attempt to quiesce
353 // the range, and then on every follower to confirm that the range can indeed be
354 // quiesced.
355 func (r *Replica) hasPendingProposalsRLocked() bool {
356 return r.numPendingProposalsRLocked() > 0
357 }
358
359 // hasPendingProposalQuotaRLocked is part of the quiescer interface. It returns
360 // true if there are any commands that haven't completed replicating that are
361 // tracked by this node's quota pool (i.e. commands that haven't been acked by
362 // all live replicas).
363 // We can't quiesce while there's outstanding quota because the respective quota
364 // would not be released while quiesced, and it might prevent the range from
365 // unquiescing (leading to deadlock). See #46699.
366 func (r *Replica) hasPendingProposalQuotaRLocked() bool {
367 if r.mu.proposalQuota == nil {
368 return true
369 }
370 return !r.mu.proposalQuota.Full()
371 }
372
373 var errRemoved = errors.New("replica removed")
374
375 // stepRaftGroup calls Step on the replica's RawNode with the provided request's
376 // message. Before doing so, it ensures that the replica is unquiesced and ready
377 // to handle the request.
378 func (r *Replica) stepRaftGroup(req *RaftMessageRequest) error {
379 // We're processing an incoming raft message (from a batch that may
380 // include MsgVotes), so don't campaign if we wake up our raft
381 // group.
382 return r.withRaftGroup(false, func(raftGroup *raft.RawNode) (bool, error) {
383 // We're processing a message from another replica which means that the
384 // other replica is not quiesced, so we don't need to wake the leader.
385 // Note that we avoid campaigning when receiving raft messages, because
386 // we expect the originator to campaign instead.
387 r.unquiesceWithOptionsLocked(false /* campaignOnWake */)
388 r.mu.lastUpdateTimes.update(req.FromReplica.ReplicaID, timeutil.Now())
389 err := raftGroup.Step(req.Message)
390 if errors.Is(err, raft.ErrProposalDropped) {
391 // A proposal was forwarded to this replica but we couldn't propose it.
392 // Swallow the error since we don't have an effective way of signaling
393 // this to the sender.
394 // TODO(bdarnell): Handle ErrProposalDropped better.
395 // https://github.com/cockroachdb/cockroach/issues/21849
396 err = nil
397 }
398 return false /* unquiesceAndWakeLeader */, err
399 })
400 }
401
402 type handleRaftReadyStats struct {
403 applyCommittedEntriesStats
404 }
405
406 // noSnap can be passed to handleRaftReady when no snapshot should be processed.
407 var noSnap IncomingSnapshot
408
409 // handleRaftReady processes a raft.Ready containing entries and messages that
410 // are ready to read, be saved to stable storage, committed, or sent to other
411 // peers. It takes a non-empty IncomingSnapshot to indicate that it is
412 // about to process a snapshot.
413 //
414 // The returned string is non-empty whenever an error is returned to give a
415 // non-sensitive cue as to what happened.
416 func (r *Replica) handleRaftReady(
417 ctx context.Context, inSnap IncomingSnapshot,
418 ) (handleRaftReadyStats, string, error) {
419 defer func(start time.Time) {
420 elapsed := timeutil.Since(start)
421 r.store.metrics.RaftHandleReadyLatency.RecordValue(elapsed.Nanoseconds())
422 }(timeutil.Now())
423 r.raftMu.Lock()
424 defer r.raftMu.Unlock()
425 return r.handleRaftReadyRaftMuLocked(ctx, inSnap)
426 }
427
428 // handleRaftReadyRaftMuLocked is the same as handleRaftReady but requires that the
429 // replica's raftMu be held.
430 //
431 // The returned string is non-empty whenever an error is returned to give a
432 // non-sensitive cue as to what happened.
433 func (r *Replica) handleRaftReadyRaftMuLocked(
434 ctx context.Context, inSnap IncomingSnapshot,
435 ) (handleRaftReadyStats, string, error) {
436 var stats handleRaftReadyStats
437
438 var hasReady bool
439 var rd raft.Ready
440 r.mu.Lock()
441 lastIndex := r.mu.lastIndex // used for append below
442 lastTerm := r.mu.lastTerm
443 raftLogSize := r.mu.raftLogSize
444 leaderID := r.mu.leaderID
445 lastLeaderID := leaderID
446 err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) {
447 numFlushed, err := r.mu.proposalBuf.FlushLockedWithRaftGroup(raftGroup)
448 if err != nil {
449 return false, err
450 }
451 if hasReady = raftGroup.HasReady(); hasReady {
452 rd = raftGroup.Ready()
453 }
454 // We unquiesce if we have a Ready (= there's work to do). We also have
455 // to unquiesce if we just flushed some proposals but there isn't a
456 // Ready, which can happen if the proposals got dropped (raft does this
457 // if it doesn't know who the leader is). And, for extra defense in depth,
458 // we also unquiesce if there are outstanding proposals.
459 //
460 // NB: if we had the invariant that the group can only be in quiesced
461 // state if it knows the leader (state.Lead) AND we knew that raft would
462 // never give us an empty ready here (i.e.
the only reason to drop a 463 // proposal is not knowing the leader) then numFlushed would not be 464 // necessary. The latter is likely true but we don't want to rely on 465 // it. The former is maybe true, but there's no easy way to enforce it. 466 unquiesceAndWakeLeader := hasReady || numFlushed > 0 || len(r.mu.proposals) > 0 467 return unquiesceAndWakeLeader, nil 468 }) 469 r.mu.Unlock() 470 if errors.Is(err, errRemoved) { 471 // If we've been removed then just return. 472 return stats, "", nil 473 } else if err != nil { 474 const expl = "while checking raft group for Ready" 475 return stats, expl, errors.Wrap(err, expl) 476 } 477 if !hasReady { 478 // We must update the proposal quota even if we don't have a ready. 479 // Consider the case when our quota is of size 1 and two out of three 480 // replicas have committed one log entry while the third is lagging 481 // behind. When the third replica finally does catch up and sends 482 // along a MsgAppResp, since the entry is already committed on the 483 // leader replica, no Ready is emitted. But given that the third 484 // replica has caught up, we can release 485 // some quota back to the pool. 486 r.updateProposalQuotaRaftMuLocked(ctx, lastLeaderID) 487 return stats, "", nil 488 } 489 490 logRaftReady(ctx, rd) 491 492 refreshReason := noReason 493 if rd.SoftState != nil && leaderID != roachpb.ReplicaID(rd.SoftState.Lead) { 494 // Refresh pending commands if the Raft leader has changed. This is usually 495 // the first indication we have of a new leader on a restarted node. 496 // 497 // TODO(peter): Re-proposing commands when SoftState.Lead changes can lead 498 // to wasteful multiple-reproposals when we later see an empty Raft command 499 // indicating a newly elected leader or a conf change. Replay protection 500 // prevents any corruption, so the waste is only a performance issue. 501 if log.V(3) { 502 log.Infof(ctx, "raft leader changed: %d -> %d", leaderID, rd.SoftState.Lead) 503 } 504 if !r.store.TestingKnobs().DisableRefreshReasonNewLeader { 505 refreshReason = reasonNewLeader 506 } 507 leaderID = roachpb.ReplicaID(rd.SoftState.Lead) 508 } 509 510 if !raft.IsEmptySnap(rd.Snapshot) { 511 snapUUID, err := uuid.FromBytes(rd.Snapshot.Data) 512 if err != nil { 513 const expl = "invalid snapshot id" 514 return stats, expl, errors.Wrap(err, expl) 515 } 516 if inSnap.SnapUUID == (uuid.UUID{}) { 517 log.Fatalf(ctx, "programming error: a snapshot application was attempted outside of the streaming snapshot codepath") 518 } 519 if snapUUID != inSnap.SnapUUID { 520 log.Fatalf(ctx, "incoming snapshot id doesn't match raft snapshot id: %s != %s", snapUUID, inSnap.SnapUUID) 521 } 522 523 // Applying this snapshot may require us to subsume one or more of our right 524 // neighbors. This occurs if this replica is informed about the merges via a 525 // Raft snapshot instead of a MsgApp containing the merge commits, e.g., 526 // because it went offline before the merge commits applied and did not come 527 // back online until after the merge commits were truncated away. 
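// As a hypothetical illustration of the subsumption case above: suppose ranges
// [a,b) and [b,c) merged while this store's replica of [a,b) was partitioned
// away. The Raft snapshot it eventually receives describes the merged range
// [a,c), so the store's stale replica of [b,c) must be subsumed before the
// snapshot can be applied; that is what the merge lock acquired below
// coordinates.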
528 subsumedRepls, releaseMergeLock := r.maybeAcquireSnapshotMergeLock(ctx, inSnap) 529 defer releaseMergeLock() 530 531 if err := r.applySnapshot(ctx, inSnap, rd.Snapshot, rd.HardState, subsumedRepls); err != nil { 532 const expl = "while applying snapshot" 533 return stats, expl, errors.Wrap(err, expl) 534 } 535 536 // r.mu.lastIndex, r.mu.lastTerm and r.mu.raftLogSize were updated in 537 // applySnapshot, but we also want to make sure we reflect these changes in 538 // the local variables we're tracking here. 539 r.mu.RLock() 540 lastIndex = r.mu.lastIndex 541 lastTerm = r.mu.lastTerm 542 raftLogSize = r.mu.raftLogSize 543 r.mu.RUnlock() 544 545 // We refresh pending commands after applying a snapshot because this 546 // replica may have been temporarily partitioned from the Raft group and 547 // missed leadership changes that occurred. Suppose node A is the leader, 548 // and then node C gets partitioned away from the others. Leadership passes 549 // back and forth between A and B during the partition, but when the 550 // partition is healed node A is leader again. 551 if !r.store.TestingKnobs().DisableRefreshReasonSnapshotApplied && 552 refreshReason == noReason { 553 refreshReason = reasonSnapshotApplied 554 } 555 } 556 557 // If the ready struct includes entries that have been committed, these 558 // entries will be applied to the Replica's replicated state machine down 559 // below, after appending new entries to the raft log and sending messages 560 // to peers. However, the process of appending new entries to the raft log 561 // and then applying committed entries to the state machine can take some 562 // time - and these entries are already durably committed. If they have 563 // clients waiting on them, we'd like to acknowledge their success as soon 564 // as possible. To facilitate this, we take a quick pass over the committed 565 // entries and acknowledge as many as we can trivially prove will not be 566 // rejected beneath raft. 567 // 568 // Note that we only acknowledge up to the current last index in the Raft 569 // log. The CommittedEntries slice may contain entries that are also in the 570 // Entries slice (to be appended in this ready pass), and we don't want to 571 // acknowledge them until they are durably in our local Raft log. This is 572 // most common in single node replication groups, but it is possible when a 573 // follower in a multi-node replication group is catching up after falling 574 // behind. In the first case, the entries are not yet committed so 575 // acknowledging them would be a lie. In the second case, the entries are 576 // committed so we could acknowledge them at this point, but doing so seems 577 // risky. To avoid complications in either case, we pass lastIndex for the 578 // maxIndex argument to AckCommittedEntriesBeforeApplication. 
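// To make the lastIndex cap above concrete, consider a hypothetical state in
// which lastIndex == 10 from a previous, already-synced append, while this
// Ready's CommittedEntries cover indexes 9, 10 and 11 and index 11 is also
// present in rd.Entries (it still needs to be appended in this pass). Entries
// 9 and 10 can be acknowledged to waiting clients before application because
// they are already durable in the local log; entry 11 is not acknowledged
// early and has to wait until this Ready's append has been synced and the
// entry has been applied.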
579 sm := r.getStateMachine() 580 dec := r.getDecoder() 581 appTask := apply.MakeTask(sm, dec) 582 appTask.SetMaxBatchSize(r.store.TestingKnobs().MaxApplicationBatchSize) 583 defer appTask.Close() 584 if err := appTask.Decode(ctx, rd.CommittedEntries); err != nil { 585 return stats, getNonDeterministicFailureExplanation(err), err 586 } 587 if err := appTask.AckCommittedEntriesBeforeApplication(ctx, lastIndex); err != nil { 588 return stats, getNonDeterministicFailureExplanation(err), err 589 } 590 591 // Separate the MsgApp messages from all other Raft message types so that we 592 // can take advantage of the optimization discussed in the Raft thesis under 593 // the section: `10.2.1 Writing to the leader’s disk in parallel`. The 594 // optimization suggests that instead of a leader writing new log entries to 595 // disk before replicating them to its followers, the leader can instead 596 // write the entries to disk in parallel with replicating to its followers 597 // and them writing to their disks. 598 // 599 // Here, we invoke this optimization by: 600 // 1. sending all MsgApps. 601 // 2. syncing all entries and Raft state to disk. 602 // 3. sending all other messages. 603 // 604 // Since this is all handled in handleRaftReadyRaftMuLocked, we're assured 605 // that even though we may sync new entries to disk after sending them in 606 // MsgApps to followers, we'll always have them synced to disk before we 607 // process followers' MsgAppResps for the corresponding entries because 608 // Ready processing is sequential (and because a restart of the leader would 609 // prevent the MsgAppResp from being handled by it). This is important 610 // because it makes sure that the leader always has all of the entries in 611 // the log for its term, which is required in etcd/raft for technical 612 // reasons[1]. 613 // 614 // MsgApps are also used to inform followers of committed entries through 615 // the Commit index that they contain. Due to the optimization described 616 // above, a Commit index may be sent out to a follower before it is 617 // persisted on the leader. This is safe because the Commit index can be 618 // treated as volatile state, as is supported by raft.MustSync[2]. 619 // Additionally, the Commit index can never refer to entries from the 620 // current Ready (due to the MsgAppResp argument above) except in 621 // single-node groups, in which as a result we have to be careful to not 622 // persist a Commit index without the entries its commit index might refer 623 // to (see the HardState update below for details). 624 // 625 // [1]: the Raft thesis states that this can be made safe: 626 // 627 // > The leader may even commit an entry before it has been written to its 628 // > own disk, if a majority of followers have written it to their disks; 629 // > this is still safe. 630 // 631 // [2]: Raft thesis section: `3.8 Persisted state and server restarts`: 632 // 633 // > Other state variables are safe to lose on a restart, as they can all be 634 // > recreated. The most interesting example is the commit index, which can 635 // > safely be reinitialized to zero on a restart. 636 // 637 // Note that this will change when joint quorums are implemented, at which 638 // point we have to introduce coupling between the Commit index and 639 // persisted config changes, and also require some commit indexes to be 640 // durably synced. 
641 // See:
642 // https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411
643
644 msgApps, otherMsgs := splitMsgApps(rd.Messages)
645 r.traceMessageSends(msgApps, "sending msgApp")
646 r.sendRaftMessages(ctx, msgApps)
647
648 // Use a more efficient write-only batch because we don't need to do any
649 // reads from the batch. Any reads are performed via the "distinct" batch
650 // which passes the reads through to the underlying DB.
651 batch := r.store.Engine().NewWriteOnlyBatch()
652 defer batch.Close()
653
654 // We know that all of the writes from here forward will be to distinct keys.
655 writer := batch.Distinct()
656 prevLastIndex := lastIndex
657 if len(rd.Entries) > 0 {
658 // All of the entries are appended to distinct keys, returning a new
659 // last index.
660 thinEntries, sideLoadedEntriesSize, err := r.maybeSideloadEntriesRaftMuLocked(ctx, rd.Entries)
661 if err != nil {
662 const expl = "during sideloading"
663 return stats, expl, errors.Wrap(err, expl)
664 }
665 raftLogSize += sideLoadedEntriesSize
666 if lastIndex, lastTerm, raftLogSize, err = r.append(
667 ctx, writer, lastIndex, lastTerm, raftLogSize, thinEntries,
668 ); err != nil {
669 const expl = "during append"
670 return stats, expl, errors.Wrap(err, expl)
671 }
672 }
673 if !raft.IsEmptyHardState(rd.HardState) {
674 if !r.IsInitialized() && rd.HardState.Commit != 0 {
675 log.Fatalf(ctx, "setting non-zero HardState.Commit on uninitialized replica %s. HS=%+v", r, rd.HardState)
676 }
677 // NB: Note that without additional safeguards, it's incorrect to write
678 // the HardState before appending rd.Entries. When catching up, a follower
679 // will receive Entries that are immediately Committed in the same
680 // Ready. If we persist the HardState but happen to lose the Entries,
681 // assertions can be tripped.
682 //
683 // We have both in the same batch, so there's no problem. If that ever
684 // changes, we must write and sync the Entries before the HardState.
685 if err := r.raftMu.stateLoader.SetHardState(ctx, writer, rd.HardState); err != nil {
686 const expl = "during setHardState"
687 return stats, expl, errors.Wrap(err, expl)
688 }
689 }
690 writer.Close()
691 // Synchronously commit the batch with the Raft log entries and Raft hard
692 // state as we're promising not to lose this data.
693 //
694 // Note that the data is visible to other goroutines before it is synced to
695 // disk. This is fine. The important constraints are that these syncs happen
696 // before Raft messages are sent and before the call to RawNode.Advance. Our
697 // regular locking is sufficient for this and if other goroutines can see the
698 // data early, that's fine. In particular, snapshots are not a problem (I
699 // think they're the only thing that might access log entries or HardState
700 // from other goroutines). Snapshots do not include either the HardState or
701 // uncommitted log entries, and even if they did include log entries that
702 // were not persisted to disk, it wouldn't be a problem because raft does not
703 // infer that entries are persisted on the node that sends a snapshot.
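// To spell out the failure mode the HardState comment above is guarding
// against, consider a hypothetical follower whose log ends at index 4 and
// which receives a single Ready carrying Entries 5..10 together with
// HardState.Commit == 10. If the HardState were persisted on its own and the
// process crashed before the entries became durable, the replica would restart
// claiming Commit == 10 while its log still ends at index 4, which is exactly
// the kind of inconsistency that trips assertions. Writing both into the
// single batch committed below avoids that window.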
704 commitStart := timeutil.Now() 705 if err := batch.Commit(rd.MustSync && !disableSyncRaftLog.Get(&r.store.cfg.Settings.SV)); err != nil { 706 const expl = "while committing batch" 707 return stats, expl, errors.Wrap(err, expl) 708 } 709 if rd.MustSync { 710 elapsed := timeutil.Since(commitStart) 711 r.store.metrics.RaftLogCommitLatency.RecordValue(elapsed.Nanoseconds()) 712 } 713 714 if len(rd.Entries) > 0 { 715 // We may have just overwritten parts of the log which contain 716 // sideloaded SSTables from a previous term (and perhaps discarded some 717 // entries that we didn't overwrite). Remove any such leftover on-disk 718 // payloads (we can do that now because we've committed the deletion 719 // just above). 720 firstPurge := rd.Entries[0].Index // first new entry written 721 purgeTerm := rd.Entries[0].Term - 1 722 lastPurge := prevLastIndex // old end of the log, include in deletion 723 purgedSize, err := maybePurgeSideloaded(ctx, r.raftMu.sideloaded, firstPurge, lastPurge, purgeTerm) 724 if err != nil { 725 const expl = "while purging sideloaded storage" 726 return stats, expl, err 727 } 728 raftLogSize -= purgedSize 729 if raftLogSize < 0 { 730 // Might have gone negative if node was recently restarted. 731 raftLogSize = 0 732 } 733 } 734 735 // Update protected state - last index, last term, raft log size, and raft 736 // leader ID. 737 r.mu.Lock() 738 r.mu.lastIndex = lastIndex 739 r.mu.lastTerm = lastTerm 740 r.mu.raftLogSize = raftLogSize 741 var becameLeader bool 742 if r.mu.leaderID != leaderID { 743 r.mu.leaderID = leaderID 744 // Clear the remote proposal set. Would have been nil already if not 745 // previously the leader. 746 becameLeader = r.mu.leaderID == r.mu.replicaID 747 } 748 r.mu.Unlock() 749 750 // When becoming the leader, proactively add the replica to the replicate 751 // queue. We might have been handed leadership by a remote node which wanted 752 // to remove itself from the range. 753 if becameLeader && r.store.replicateQueue != nil { 754 r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now()) 755 } 756 757 // Update raft log entry cache. We clear any older, uncommitted log entries 758 // and cache the latest ones. 759 r.store.raftEntryCache.Add(r.RangeID, rd.Entries, true /* truncate */) 760 r.sendRaftMessages(ctx, otherMsgs) 761 r.traceEntries(rd.CommittedEntries, "committed, before applying any entries") 762 763 applicationStart := timeutil.Now() 764 if len(rd.CommittedEntries) > 0 { 765 err := appTask.ApplyCommittedEntries(ctx) 766 stats.applyCommittedEntriesStats = sm.moveStats() 767 if errors.Is(err, apply.ErrRemoved) { 768 // We know that our replica has been removed. All future calls to 769 // r.withRaftGroup() will return errRemoved so no future Ready objects 770 // will be processed by this Replica. 771 return stats, "", err 772 } else if err != nil { 773 return stats, getNonDeterministicFailureExplanation(err), err 774 } 775 776 // etcd raft occasionally adds a nil entry (our own commands are never 777 // empty). This happens in two situations: When a new leader is elected, and 778 // when a config change is dropped due to the "one at a time" rule. In both 779 // cases we may need to resubmit our pending proposals (In the former case 780 // we resubmit everything because we proposed them to a former leader that 781 // is no longer able to commit them. In the latter case we only need to 782 // resubmit pending config changes, but it's hard to distinguish so we 783 // resubmit everything anyway). 
We delay resubmission until after we have 784 // processed the entire batch of entries. 785 if stats.numEmptyEntries > 0 { 786 // Overwrite unconditionally since this is the most aggressive 787 // reproposal mode. 788 if !r.store.TestingKnobs().DisableRefreshReasonNewLeaderOrConfigChange { 789 refreshReason = reasonNewLeaderOrConfigChange 790 } 791 } 792 } 793 applicationElapsed := timeutil.Since(applicationStart).Nanoseconds() 794 r.store.metrics.RaftApplyCommittedLatency.RecordValue(applicationElapsed) 795 if r.store.TestingKnobs().EnableUnconditionalRefreshesInRaftReady { 796 refreshReason = reasonNewLeaderOrConfigChange 797 } 798 if refreshReason != noReason { 799 r.mu.Lock() 800 r.refreshProposalsLocked(ctx, 0 /* refreshAtDelta */, refreshReason) 801 r.mu.Unlock() 802 } 803 804 // NB: if we just processed a command which removed this replica from the 805 // raft group we will early return before this point. This, combined with 806 // the fact that we'll refuse to process messages intended for a higher 807 // replica ID ensures that our replica ID could not have changed. 808 const expl = "during advance" 809 810 r.mu.Lock() 811 err = r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { 812 raftGroup.Advance(rd) 813 if stats.numConfChangeEntries > 0 { 814 // If the raft leader got removed, campaign the first remaining voter. 815 // 816 // NB: this must be called after Advance() above since campaigning is 817 // a no-op in the presence of unapplied conf changes. 818 maybeCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) 819 } 820 821 // If the Raft group still has more to process then we immediately 822 // re-enqueue it for another round of processing. This is possible if 823 // the group's committed entries were paginated due to size limitations 824 // and we didn't apply all of them in this pass. 825 if raftGroup.HasReady() { 826 r.store.enqueueRaftUpdateCheck(r.RangeID) 827 } 828 return true, nil 829 }) 830 r.mu.Unlock() 831 if err != nil { 832 return stats, expl, errors.Wrap(err, expl) 833 } 834 835 // NB: All early returns other than the one due to not having a ready 836 // which also makes the below call are due to fatal errors. 837 // We must also update the proposal quota when have a ready; consider the 838 // case where there are two replicas and we have a quota of size 1. We 839 // acquire the quota when the write gets proposed on the leader and expect it 840 // to be released when the follower commits it locally. In order to do so we 841 // need to have the entry 'come out of raft' and in the case of a two node 842 // raft group, this only happens if hasReady == true. If we don't release 843 // quota back at the end of handleRaftReadyRaftMuLocked, the next write will 844 // get blocked. 845 r.updateProposalQuotaRaftMuLocked(ctx, lastLeaderID) 846 return stats, "", nil 847 } 848 849 // splitMsgApps splits the Raft message slice into two slices, one containing 850 // MsgApps and one containing all other message types. Each slice retains the 851 // relative ordering between messages in the original slice. 852 func splitMsgApps(msgs []raftpb.Message) (msgApps, otherMsgs []raftpb.Message) { 853 splitIdx := 0 854 for i, msg := range msgs { 855 if msg.Type == raftpb.MsgApp { 856 msgs[i], msgs[splitIdx] = msgs[splitIdx], msgs[i] 857 splitIdx++ 858 } 859 } 860 return msgs[:splitIdx], msgs[splitIdx:] 861 } 862 863 // maybeFatalOnRaftReadyErr will fatal if err is neither nil nor 864 // apply.ErrRemoved. 
865 func maybeFatalOnRaftReadyErr(ctx context.Context, expl string, err error) (removed bool) { 866 switch { 867 case err == nil: 868 return false 869 case errors.Is(err, apply.ErrRemoved): 870 return true 871 default: 872 log.FatalfDepth(ctx, 1, "%s: %+v", log.Safe(expl), err) 873 panic("unreachable") 874 } 875 } 876 877 // tick the Raft group, returning true if the raft group exists and is 878 // unquiesced; false otherwise. 879 func (r *Replica) tick(livenessMap IsLiveMap) (bool, error) { 880 ctx := r.AnnotateCtx(context.TODO()) 881 882 r.unreachablesMu.Lock() 883 remotes := r.unreachablesMu.remotes 884 r.unreachablesMu.remotes = nil 885 r.unreachablesMu.Unlock() 886 887 r.raftMu.Lock() 888 defer r.raftMu.Unlock() 889 r.mu.Lock() 890 defer r.mu.Unlock() 891 892 // If the raft group is uninitialized, do not initialize on tick. 893 if r.mu.internalRaftGroup == nil { 894 return false, nil 895 } 896 897 for remoteReplica := range remotes { 898 r.mu.internalRaftGroup.ReportUnreachable(uint64(remoteReplica)) 899 } 900 901 if r.mu.quiescent { 902 return false, nil 903 } 904 if r.maybeQuiesceLocked(ctx, livenessMap) { 905 return false, nil 906 } 907 908 r.maybeTransferRaftLeadershipLocked(ctx) 909 910 // For followers, we update lastUpdateTimes when we step a message from them 911 // into the local Raft group. The leader won't hit that path, so we update 912 // it whenever it ticks. In effect, this makes sure it always sees itself as 913 // alive. 914 if r.mu.replicaID == r.mu.leaderID { 915 r.mu.lastUpdateTimes.update(r.mu.replicaID, timeutil.Now()) 916 } 917 918 r.mu.ticks++ 919 r.mu.internalRaftGroup.Tick() 920 921 refreshAtDelta := r.store.cfg.RaftElectionTimeoutTicks 922 if knob := r.store.TestingKnobs().RefreshReasonTicksPeriod; knob > 0 { 923 refreshAtDelta = knob 924 } 925 if !r.store.TestingKnobs().DisableRefreshReasonTicks && r.mu.ticks%refreshAtDelta == 0 { 926 // RaftElectionTimeoutTicks is a reasonable approximation of how long we 927 // should wait before deciding that our previous proposal didn't go 928 // through. Note that the combination of the above condition and passing 929 // RaftElectionTimeoutTicks to refreshProposalsLocked means that commands 930 // will be refreshed when they have been pending for 1 to 2 election 931 // cycles. 932 r.refreshProposalsLocked(ctx, refreshAtDelta, reasonTicks) 933 } 934 return true, nil 935 } 936 937 func (r *Replica) hasRaftReadyRLocked() bool { 938 return r.mu.internalRaftGroup.HasReady() 939 } 940 941 //go:generate stringer -type refreshRaftReason 942 type refreshRaftReason int 943 944 const ( 945 noReason refreshRaftReason = iota 946 reasonNewLeader 947 reasonNewLeaderOrConfigChange 948 // A snapshot was just applied and so it may have contained commands that we 949 // proposed whose proposal we still consider to be inflight. These commands 950 // will never receive a response through the regular channel. 951 reasonSnapshotApplied 952 reasonReplicaIDChanged 953 reasonTicks 954 ) 955 956 // refreshProposalsLocked goes through the pending proposals, notifying 957 // proposers whose proposals need to be retried, and resubmitting proposals 958 // which were likely dropped (but may still apply at a legal Lease index) - 959 // ensuring that the proposer will eventually get a reply on the channel it's 960 // waiting on. 961 // mu must be held. 
962 // 963 // refreshAtDelta only applies for reasonTicks and specifies how old (in ticks) 964 // a command must be for it to be inspected; the usual value is the number of 965 // ticks of an election timeout (affect only proposals that have had ample time 966 // to apply but didn't). 967 func (r *Replica) refreshProposalsLocked( 968 ctx context.Context, refreshAtDelta int, reason refreshRaftReason, 969 ) { 970 if refreshAtDelta != 0 && reason != reasonTicks { 971 log.Fatalf(ctx, "refreshAtDelta specified for reason %s != reasonTicks", reason) 972 } 973 974 var reproposals pendingCmdSlice 975 for _, p := range r.mu.proposals { 976 if p.command.MaxLeaseIndex == 0 { 977 // Commands without a MaxLeaseIndex cannot be reproposed, as they might 978 // apply twice. We also don't want to ask the proposer to retry these 979 // special commands. 980 r.cleanupFailedProposalLocked(p) 981 log.VEventf(p.ctx, 2, "refresh (reason: %s) returning AmbiguousResultError for command "+ 982 "without MaxLeaseIndex: %v", reason, p.command) 983 p.finishApplication(ctx, proposalResult{Err: roachpb.NewError( 984 roachpb.NewAmbiguousResultError( 985 fmt.Sprintf("unknown status for command without MaxLeaseIndex "+ 986 "at refreshProposalsLocked time (refresh reason: %s)", reason)))}) 987 continue 988 } 989 switch reason { 990 case reasonSnapshotApplied: 991 // If we applied a snapshot, check the MaxLeaseIndexes of all 992 // pending commands to see if any are now prevented from 993 // applying, and if so make them return an ambiguous error. We 994 // can't tell at this point (which should be rare) whether they 995 // were included in the snapshot we received or not. 996 if p.command.MaxLeaseIndex <= r.mu.state.LeaseAppliedIndex { 997 r.cleanupFailedProposalLocked(p) 998 log.Eventf(p.ctx, "retry proposal %x: %s", p.idKey, reason) 999 p.finishApplication(ctx, proposalResult{Err: roachpb.NewError( 1000 roachpb.NewAmbiguousResultError( 1001 fmt.Sprintf("unable to determine whether command was applied via snapshot")))}) 1002 } 1003 continue 1004 1005 case reasonTicks: 1006 if p.proposedAtTicks <= r.mu.ticks-refreshAtDelta { 1007 // The command was proposed a while ago and may have been dropped. Try it again. 1008 reproposals = append(reproposals, p) 1009 } 1010 1011 default: 1012 // We have reason to believe that all pending proposals were 1013 // dropped on the floor (e.g. because of a leader election), so 1014 // repropose everything. 1015 reproposals = append(reproposals, p) 1016 } 1017 } 1018 1019 if log.V(1) && len(reproposals) > 0 { 1020 log.Infof(ctx, 1021 "pending commands: reproposing %d (at %d.%d) %s", 1022 len(reproposals), r.mu.state.RaftAppliedIndex, 1023 r.mu.state.LeaseAppliedIndex, reason) 1024 } 1025 1026 // Reproposals are those commands which we weren't able to send back to the 1027 // client (since we're not sure that another copy of them could apply at 1028 // the "correct" index). For reproposals, it's generally pretty unlikely 1029 // that they can make it in the right place. Reproposing in order is 1030 // definitely required, however. 
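// As a hypothetical illustration of why ordering matters here: suppose three
// pending proposals carry MaxLeaseIndex 5, 6 and 7 while the range's
// LeaseAppliedIndex is still 4. Reinserted in ascending order, each can apply
// in turn and the LeaseAppliedIndex advances through 5, 6 and 7. If the
// MaxLeaseIndex=7 proposal were reproposed and applied first, the
// LeaseAppliedIndex would jump to 7 and the commands at 5 and 6 would then be
// rejected beneath raft as too old, forcing yet another round of reproposals.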
1031 sort.Sort(reproposals) 1032 for _, p := range reproposals { 1033 log.Eventf(p.ctx, "re-submitting command %x to Raft: %s", p.idKey, reason) 1034 if err := r.mu.proposalBuf.ReinsertLocked(p); err != nil { 1035 r.cleanupFailedProposalLocked(p) 1036 p.finishApplication(ctx, proposalResult{ 1037 Err: roachpb.NewError(roachpb.NewAmbiguousResultError(err.Error())), 1038 }) 1039 } 1040 } 1041 } 1042 1043 // maybeCoalesceHeartbeat returns true if the heartbeat was coalesced and added 1044 // to the appropriate queue. 1045 func (r *Replica) maybeCoalesceHeartbeat( 1046 ctx context.Context, 1047 msg raftpb.Message, 1048 toReplica, fromReplica roachpb.ReplicaDescriptor, 1049 quiesce bool, 1050 ) bool { 1051 var hbMap map[roachpb.StoreIdent][]RaftHeartbeat 1052 switch msg.Type { 1053 case raftpb.MsgHeartbeat: 1054 r.store.coalescedMu.Lock() 1055 hbMap = r.store.coalescedMu.heartbeats 1056 case raftpb.MsgHeartbeatResp: 1057 r.store.coalescedMu.Lock() 1058 hbMap = r.store.coalescedMu.heartbeatResponses 1059 default: 1060 return false 1061 } 1062 beat := RaftHeartbeat{ 1063 RangeID: r.RangeID, 1064 ToReplicaID: toReplica.ReplicaID, 1065 FromReplicaID: fromReplica.ReplicaID, 1066 Term: msg.Term, 1067 Commit: msg.Commit, 1068 Quiesce: quiesce, 1069 ToIsLearner: toReplica.GetType() == roachpb.LEARNER, 1070 } 1071 if log.V(4) { 1072 log.Infof(ctx, "coalescing beat: %+v", beat) 1073 } 1074 toStore := roachpb.StoreIdent{ 1075 StoreID: toReplica.StoreID, 1076 NodeID: toReplica.NodeID, 1077 } 1078 hbMap[toStore] = append(hbMap[toStore], beat) 1079 r.store.coalescedMu.Unlock() 1080 return true 1081 } 1082 1083 func (r *Replica) sendRaftMessages(ctx context.Context, messages []raftpb.Message) { 1084 var lastAppResp raftpb.Message 1085 for _, message := range messages { 1086 drop := false 1087 switch message.Type { 1088 case raftpb.MsgApp: 1089 if util.RaceEnabled { 1090 // Iterate over the entries to assert that all sideloaded commands 1091 // are already inlined. replicaRaftStorage.Entries already performs 1092 // the sideload inlining for stable entries and raft.unstable always 1093 // contain fat entries. Since these are the only two sources that 1094 // raft.sendAppend gathers entries from to populate MsgApps, we 1095 // should never see thin entries here. 1096 for j := range message.Entries { 1097 assertSideloadedRaftCommandInlined(ctx, &message.Entries[j]) 1098 } 1099 } 1100 1101 case raftpb.MsgAppResp: 1102 // A successful (non-reject) MsgAppResp contains one piece of 1103 // information: the highest log index. Raft currently queues up 1104 // one MsgAppResp per incoming MsgApp, and we may process 1105 // multiple messages in one handleRaftReady call (because 1106 // multiple messages may arrive while we're blocked syncing to 1107 // disk). If we get redundant MsgAppResps, drop all but the 1108 // last (we've seen that too many MsgAppResps can overflow 1109 // message queues on the receiving side). 1110 // 1111 // Note that this reorders the chosen MsgAppResp relative to 1112 // other messages (including any MsgAppResps with the Reject flag), 1113 // but raft is fine with this reordering. 1114 // 1115 // TODO(bdarnell): Consider pushing this optimization into etcd/raft. 1116 // Similar optimizations may be possible for other message types, 1117 // although MsgAppResp is the only one that has been seen as a 1118 // problem in practice. 
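// As a concrete (hypothetical) example: if one sendRaftMessages call sees
// non-rejecting MsgAppResps acknowledging indexes 8, 9 and 10, in that order,
// plus one MsgAppResp with the Reject flag set, the three acknowledgments
// collapse into just the Index=10 response (sent last, below), while the
// rejection is passed through unchanged since the leader needs it to adjust
// that follower's progress.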
1119 if !message.Reject && message.Index > lastAppResp.Index { 1120 lastAppResp = message 1121 drop = true 1122 } 1123 } 1124 1125 if !drop { 1126 r.sendRaftMessage(ctx, message) 1127 } 1128 } 1129 if lastAppResp.Index > 0 { 1130 r.sendRaftMessage(ctx, lastAppResp) 1131 } 1132 } 1133 1134 // sendRaftMessage sends a Raft message. 1135 func (r *Replica) sendRaftMessage(ctx context.Context, msg raftpb.Message) { 1136 r.mu.RLock() 1137 fromReplica, fromErr := r.getReplicaDescriptorByIDRLocked(roachpb.ReplicaID(msg.From), r.mu.lastToReplica) 1138 toReplica, toErr := r.getReplicaDescriptorByIDRLocked(roachpb.ReplicaID(msg.To), r.mu.lastFromReplica) 1139 var startKey roachpb.RKey 1140 if msg.Type == raftpb.MsgApp && r.mu.internalRaftGroup != nil { 1141 // When the follower is potentially an uninitialized replica waiting for 1142 // a split trigger, send the replica's StartKey along. See the method 1143 // below for more context: 1144 _ = maybeDropMsgApp 1145 // NB: this code is allocation free. 1146 r.mu.internalRaftGroup.WithProgress(func(id uint64, _ raft.ProgressType, pr tracker.Progress) { 1147 if id == msg.To && pr.State == tracker.StateProbe { 1148 // It is moderately expensive to attach a full key to the message, but note that 1149 // a probing follower will only be appended to once per heartbeat interval (i.e. 1150 // on the order of seconds). See: 1151 // 1152 // https://github.com/etcd-io/etcd/blob/7f450bf6967638673dd88fd4e730b01d1303d5ff/raft/progress.go#L41 1153 startKey = r.descRLocked().StartKey 1154 } 1155 }) 1156 } 1157 r.mu.RUnlock() 1158 1159 if fromErr != nil { 1160 log.Warningf(ctx, "failed to look up sender replica %d in r%d while sending %s: %s", 1161 msg.From, r.RangeID, msg.Type, fromErr) 1162 return 1163 } 1164 if toErr != nil { 1165 log.Warningf(ctx, "failed to look up recipient replica %d in r%d while sending %s: %s", 1166 msg.To, r.RangeID, msg.Type, toErr) 1167 return 1168 } 1169 1170 // Raft-initiated snapshots are handled by the Raft snapshot queue. 1171 if msg.Type == raftpb.MsgSnap { 1172 r.store.raftSnapshotQueue.AddAsync(ctx, r, raftSnapshotPriority) 1173 return 1174 } 1175 1176 if r.maybeCoalesceHeartbeat(ctx, msg, toReplica, fromReplica, false) { 1177 return 1178 } 1179 1180 req := newRaftMessageRequest() 1181 *req = RaftMessageRequest{ 1182 RangeID: r.RangeID, 1183 ToReplica: toReplica, 1184 FromReplica: fromReplica, 1185 Message: msg, 1186 RangeStartKey: startKey, // usually nil 1187 } 1188 if !r.sendRaftMessageRequest(ctx, req) { 1189 if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { 1190 r.mu.droppedMessages++ 1191 raftGroup.ReportUnreachable(msg.To) 1192 return true, nil 1193 }); err != nil && !errors.Is(err, errRemoved) { 1194 log.Fatalf(ctx, "%v", err) 1195 } 1196 } 1197 } 1198 1199 // addUnreachableRemoteReplica adds the given remote ReplicaID to be reported 1200 // as unreachable on the next tick. 1201 func (r *Replica) addUnreachableRemoteReplica(remoteReplica roachpb.ReplicaID) { 1202 r.unreachablesMu.Lock() 1203 if r.unreachablesMu.remotes == nil { 1204 r.unreachablesMu.remotes = make(map[roachpb.ReplicaID]struct{}) 1205 } 1206 r.unreachablesMu.remotes[remoteReplica] = struct{}{} 1207 r.unreachablesMu.Unlock() 1208 } 1209 1210 // sendRaftMessageRequest sends a raft message, returning false if the message 1211 // was dropped. It is the caller's responsibility to call ReportUnreachable on 1212 // the Raft group. 
1213 func (r *Replica) sendRaftMessageRequest(ctx context.Context, req *RaftMessageRequest) bool { 1214 if log.V(4) { 1215 log.Infof(ctx, "sending raft request %+v", req) 1216 } 1217 ok := r.store.cfg.Transport.SendAsync(req, r.connectionClass.get()) 1218 // TODO(peter): Looping over all of the outgoing Raft message queues to 1219 // update this stat on every send is a bit expensive. 1220 r.store.metrics.RaftEnqueuedPending.Update(r.store.cfg.Transport.queuedMessageCount()) 1221 return ok 1222 } 1223 1224 func (r *Replica) reportSnapshotStatus(ctx context.Context, to roachpb.ReplicaID, snapErr error) { 1225 r.raftMu.Lock() 1226 defer r.raftMu.Unlock() 1227 1228 snapStatus := raft.SnapshotFinish 1229 if snapErr != nil { 1230 snapStatus = raft.SnapshotFailure 1231 } 1232 1233 if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { 1234 raftGroup.ReportSnapshot(uint64(to), snapStatus) 1235 return true, nil 1236 }); err != nil && !errors.Is(err, errRemoved) { 1237 log.Fatalf(ctx, "%v", err) 1238 } 1239 } 1240 1241 type snapTruncationInfo struct { 1242 index uint64 1243 recipientStore roachpb.StoreID 1244 deadline time.Time 1245 } 1246 1247 func (r *Replica) addSnapshotLogTruncationConstraint( 1248 ctx context.Context, snapUUID uuid.UUID, index uint64, recipientStore roachpb.StoreID, 1249 ) { 1250 r.mu.Lock() 1251 defer r.mu.Unlock() 1252 r.addSnapshotLogTruncationConstraintLocked(ctx, snapUUID, index, recipientStore) 1253 } 1254 1255 func (r *Replica) addSnapshotLogTruncationConstraintLocked( 1256 ctx context.Context, snapUUID uuid.UUID, index uint64, recipientStore roachpb.StoreID, 1257 ) { 1258 if r.mu.snapshotLogTruncationConstraints == nil { 1259 r.mu.snapshotLogTruncationConstraints = make(map[uuid.UUID]snapTruncationInfo) 1260 } 1261 item, ok := r.mu.snapshotLogTruncationConstraints[snapUUID] 1262 if ok { 1263 // Uh-oh, there's either a programming error (resulting in the same snapshot 1264 // fed into this method twice) or a UUID collision. We discard the update 1265 // (which is benign) but log it loudly. If the index is the same, it's 1266 // likely the former, otherwise the latter. 1267 log.Warningf(ctx, "UUID collision at %s for %+v (index %d)", snapUUID, item, index) 1268 return 1269 } 1270 1271 r.mu.snapshotLogTruncationConstraints[snapUUID] = snapTruncationInfo{ 1272 index: index, 1273 recipientStore: recipientStore, 1274 } 1275 } 1276 1277 // completeSnapshotLogTruncationConstraint marks the given snapshot as finished, 1278 // releasing the lock on raft log truncation after a grace period. 1279 func (r *Replica) completeSnapshotLogTruncationConstraint( 1280 ctx context.Context, snapUUID uuid.UUID, now time.Time, 1281 ) { 1282 r.mu.Lock() 1283 defer r.mu.Unlock() 1284 1285 item, ok := r.mu.snapshotLogTruncationConstraints[snapUUID] 1286 if !ok { 1287 // UUID collision while adding the snapshot in originally. Nothing 1288 // else to do. 1289 return 1290 } 1291 1292 deadline := now.Add(raftLogQueuePendingSnapshotGracePeriod) 1293 item.deadline = deadline 1294 r.mu.snapshotLogTruncationConstraints[snapUUID] = item 1295 } 1296 1297 // getAndGCSnapshotLogTruncationConstraints returns the minimum index of any 1298 // currently outstanding snapshot being sent from this replica to the specified 1299 // recipient or 0 if there isn't one. Passing 0 for recipientStore means any 1300 // recipient. 
1301 func (r *Replica) getAndGCSnapshotLogTruncationConstraints( 1302 now time.Time, recipientStore roachpb.StoreID, 1303 ) (minSnapIndex uint64) { 1304 r.mu.Lock() 1305 defer r.mu.Unlock() 1306 return r.getAndGCSnapshotLogTruncationConstraintsLocked(now, recipientStore) 1307 } 1308 1309 func (r *Replica) getAndGCSnapshotLogTruncationConstraintsLocked( 1310 now time.Time, recipientStore roachpb.StoreID, 1311 ) (minSnapIndex uint64) { 1312 for snapUUID, item := range r.mu.snapshotLogTruncationConstraints { 1313 if item.deadline != (time.Time{}) && item.deadline.Before(now) { 1314 // The snapshot has finished and its grace period has passed. 1315 // Ignore it when making truncation decisions. 1316 delete(r.mu.snapshotLogTruncationConstraints, snapUUID) 1317 continue 1318 } 1319 if recipientStore != 0 && item.recipientStore != recipientStore { 1320 continue 1321 } 1322 if minSnapIndex == 0 || minSnapIndex > item.index { 1323 minSnapIndex = item.index 1324 } 1325 } 1326 if len(r.mu.snapshotLogTruncationConstraints) == 0 { 1327 // Save a little bit of memory. 1328 r.mu.snapshotLogTruncationConstraints = nil 1329 } 1330 return minSnapIndex 1331 } 1332 1333 func isRaftLeader(raftStatus *raft.Status) bool { 1334 return raftStatus != nil && raftStatus.SoftState.RaftState == raft.StateLeader 1335 } 1336 1337 // HasRaftLeader returns true if the raft group has a raft leader currently. 1338 func HasRaftLeader(raftStatus *raft.Status) bool { 1339 return raftStatus != nil && raftStatus.SoftState.Lead != 0 1340 } 1341 1342 // pendingCmdSlice sorts by increasing MaxLeaseIndex. 1343 type pendingCmdSlice []*ProposalData 1344 1345 func (s pendingCmdSlice) Len() int { return len(s) } 1346 func (s pendingCmdSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 1347 func (s pendingCmdSlice) Less(i, j int) bool { 1348 return s[i].command.MaxLeaseIndex < s[j].command.MaxLeaseIndex 1349 } 1350 1351 // withRaftGroupLocked calls the supplied function with the (lazily 1352 // initialized) Raft group. The supplied function should return true for the 1353 // unquiesceAndWakeLeader argument if the replica should be unquiesced (and the 1354 // leader awoken). See handleRaftReady for an instance of where this value 1355 // varies. 1356 // 1357 // Requires that Replica.mu is held. 1358 // 1359 // If this Replica is in the process of being removed this method will return 1360 // errRemoved. 1361 func (r *Replica) withRaftGroupLocked( 1362 mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), 1363 ) error { 1364 if r.mu.destroyStatus.Removed() { 1365 // Callers know to detect errRemoved as non-fatal. 1366 return errRemoved 1367 } 1368 1369 if r.mu.internalRaftGroup == nil { 1370 ctx := r.AnnotateCtx(context.TODO()) 1371 raftGroup, err := raft.NewRawNode(newRaftConfig( 1372 raft.Storage((*replicaRaftStorage)(r)), 1373 uint64(r.mu.replicaID), 1374 r.mu.state.RaftAppliedIndex, 1375 r.store.cfg, 1376 &raftLogger{ctx: ctx}, 1377 )) 1378 if err != nil { 1379 return err 1380 } 1381 r.mu.internalRaftGroup = raftGroup 1382 1383 if mayCampaignOnWake { 1384 r.maybeCampaignOnWakeLocked(ctx) 1385 } 1386 } 1387 1388 // This wrapper function is a hack to add range IDs to stack traces 1389 // using the same pattern as Replica.sendWithRangeID. 
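// The stack-trace trick referenced above boils down to calling through a
// closure that takes the interesting value as an explicit parameter, so that
// the value shows up as a function argument in goroutine dumps. A minimal,
// self-contained sketch of the pattern, with hypothetical names (not code from
// this package):
//
//	func runWithRangeID(rangeID int64, work func() error) error {
//		// rangeID is not used in the body; it exists only so the value is
//		// visible in this frame of a goroutine dump.
//		return work()
//	}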
1390 	unquiesce, err := func(rangeID roachpb.RangeID, raftGroup *raft.RawNode) (bool, error) {
1391 		return f(raftGroup)
1392 	}(r.RangeID, r.mu.internalRaftGroup)
1393 	if r.mu.internalRaftGroup.BasicStatus().Lead == 0 {
1394 		// If we don't know the leader, unquiesce unconditionally. As a
1395 		// follower, we can't wake up the leader if we don't know who that is,
1396 		// so we should find out now before someone needs us to unquiesce.
1397 		//
1398 		// This situation should occur rarely or never (ever since we got
1399 		// stricter about validating incoming Quiesce requests) but it's good
1400 		// defense-in-depth.
1401 		//
1402 		// Note that unquiesceAndWakeLeaderLocked won't manage to wake up the
1403 		// leader since it's unknown to this replica, and at the time of writing
1404 		// the heuristics for campaigning are defensive (won't campaign if there
1405 		// is a live leaseholder). But if we are trying to unquiesce because
1406 		// this follower was asked to propose something, then this means that a
1407 		// request is going to have to wait until the leader next contacts us,
1408 		// or, in the worst case, an election timeout. This is not ideal - if a
1409 		// node holds a live lease, we should direct the client to it
1410 		// immediately.
1411 		unquiesce = true
1412 	}
1413 	if unquiesce {
1414 		r.unquiesceAndWakeLeaderLocked()
1415 	}
1416 	return err
1417 }
1418 
1419 // withRaftGroup calls the supplied function with the (lazily initialized)
1420 // Raft group. It acquires and releases the Replica lock, so r.mu must not be
1421 // held (or acquired by the supplied function).
1422 //
1423 // If mayCampaignOnWake is true, the replica may initiate a raft
1424 // election if it was previously in a dormant state. Most callers
1425 // should set this to true, because the prevote feature minimizes the
1426 // disruption from unnecessary elections. The exception is that we
1427 // should not initiate an election while handling incoming raft
1428 // messages (which may include MsgVotes from an election in progress,
1429 // and this election would be disrupted if we started our own).
1430 //
1431 // If this Replica is in the process of being removed this method will return
1432 // errRemoved.
1433 func (r *Replica) withRaftGroup(
1434 	mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error),
1435 ) error {
1436 	r.mu.Lock()
1437 	defer r.mu.Unlock()
1438 	return r.withRaftGroupLocked(mayCampaignOnWake, f)
1439 }
1440 
1441 func shouldCampaignOnWake(
1442 	leaseStatus kvserverpb.LeaseStatus,
1443 	lease roachpb.Lease,
1444 	storeID roachpb.StoreID,
1445 	raftStatus raft.Status,
1446 ) bool {
1447 	// When waking up a range, campaign unless we know that another
1448 	// node holds a valid lease (this is most important after a split,
1449 	// when all replicas create their raft groups at about the same
1450 	// time, with a lease pre-assigned to one of them). Note that
1451 	// thanks to PreVote, unnecessary campaigns are not disruptive so
1452 	// we should err on the side of campaigning here.
1453 	anotherOwnsLease := leaseStatus.State == kvserverpb.LeaseState_VALID && !lease.OwnedBy(storeID)
1454 
1455 	// If we're already campaigning or know who the leader is, don't
1456 	// start a new term.
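	// Condensed, the decision implemented below boils down to a small
	// predicate. A self-contained sketch with hypothetical names (not the
	// types used here):
	//
	//	type wakeState struct {
	//		leaseValidElsewhere bool // another store holds a valid lease
	//		isFollower          bool // raft reports us as a follower
	//		leaderKnown         bool // raft knows who the leader is
	//	}
	//
	//	func sketchShouldCampaign(s wakeState) bool {
	//		return !s.leaseValidElsewhere && s.isFollower && !s.leaderKnown
	//	}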
1457 	noLeader := raftStatus.RaftState == raft.StateFollower && raftStatus.Lead == 0
1458 	return !anotherOwnsLease && noLeader
1459 }
1460 
1461 // maybeCampaignOnWakeLocked is called when the range wakes from a
1462 // dormant state (either the initial "raftGroup == nil" state or after
1463 // being quiescent) and campaigns for raft leadership if appropriate.
1464 func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) {
1465 	// Raft panics if a node that is not currently a member of the
1466 	// group tries to campaign. That happens primarily when we apply
1467 	// preemptive snapshots.
1468 	if _, currentMember := r.mu.state.Desc.GetReplicaDescriptorByID(r.mu.replicaID); !currentMember {
1469 		return
1470 	}
1471 
1472 	leaseStatus := r.leaseStatus(*r.mu.state.Lease, r.store.Clock().Now(), r.mu.minLeaseProposedTS)
1473 	raftStatus := r.mu.internalRaftGroup.Status()
1474 	if shouldCampaignOnWake(leaseStatus, *r.mu.state.Lease, r.store.StoreID(), raftStatus) {
1475 		log.VEventf(ctx, 3, "campaigning")
1476 		if err := r.mu.internalRaftGroup.Campaign(); err != nil {
1477 			log.VEventf(ctx, 1, "failed to campaign: %s", err)
1478 		}
1479 	}
1480 }
1481 
1482 // a lastUpdateTimesMap is maintained on the Raft leader to keep track of the
1483 // last communication received from followers, which in turn informs the quota
1484 // pool and log truncations.
1485 type lastUpdateTimesMap map[roachpb.ReplicaID]time.Time
1486 
1487 func (m lastUpdateTimesMap) update(replicaID roachpb.ReplicaID, now time.Time) {
1488 	if m == nil {
1489 		return
1490 	}
1491 	m[replicaID] = now
1492 }
1493 
1494 // updateOnUnquiesce is called when the leader unquiesces. In that case, we
1495 // don't want live followers to appear as dead before their next message reaches
1496 // us; to achieve that, we optimistically mark all followers that are in
1497 // ProgressStateReplicate (or rather, were in that state when the group
1498 // quiesced) as live as of `now`. We don't want to mark other followers as
1499 // live as they may be down and could artificially seem alive forever assuming
1500 // a suitable pattern of quiesce and unquiesce operations (and this in turn
1501 // can interfere with Raft log truncations).
1502 func (m lastUpdateTimesMap) updateOnUnquiesce(
1503 	descs []roachpb.ReplicaDescriptor, prs map[uint64]tracker.Progress, now time.Time,
1504 ) {
1505 	for _, desc := range descs {
1506 		if prs[uint64(desc.ReplicaID)].State == tracker.StateReplicate {
1507 			m.update(desc.ReplicaID, now)
1508 		}
1509 	}
1510 }
1511 
1512 // updateOnBecomeLeader is similar to updateOnUnquiesce, but is called when the
1513 // replica becomes the Raft leader. It updates all followers irrespective of
1514 // their Raft state, for the Raft state is not yet populated by the time this
1515 // callback is invoked. Raft leadership is usually stable, so there is no danger
1516 // of artificially keeping followers that are down alive, though if it started
1517 // flip-flopping at a <10s cadence there would be a risk of that happening.
1518 func (m lastUpdateTimesMap) updateOnBecomeLeader(descs []roachpb.ReplicaDescriptor, now time.Time) {
1519 	for _, desc := range descs {
1520 		m.update(desc.ReplicaID, now)
1521 	}
1522 }
1523 
1524 // isFollowerActiveSince returns whether the specified follower has
1525 // communicated with the leader recently (within the given threshold).
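// A condensed usage sketch of lastUpdateTimesMap as a whole (hypothetical
// values; assumes ctx, now, and a []roachpb.ReplicaDescriptor named descs are
// in scope; the real callers live in the raft processing and log truncation
// paths):
//
//	m := lastUpdateTimesMap{}
//	m.updateOnBecomeLeader(descs, now) // mark everyone live when gaining leadership
//	m.update(2, now)                   // replica 2 just sent us a message
//	active := m.isFollowerActiveSince(ctx, 2, now, 10*time.Second)
//	_ = active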
1526 func (m lastUpdateTimesMap) isFollowerActiveSince(
1527 	ctx context.Context, replicaID roachpb.ReplicaID, now time.Time, threshold time.Duration,
1528 ) bool {
1529 	lastUpdateTime, ok := m[replicaID]
1530 	if !ok {
1531 		// If the follower has no entry in lastUpdateTimes, it has not been
1532 		// updated since r became the leader (at which point all then-existing
1533 		// replicas were updated).
1534 		return false
1535 	}
1536 	return now.Sub(lastUpdateTime) <= threshold
1537 }
1538 
1539 // maybeAcquireSnapshotMergeLock checks whether the incoming snapshot subsumes
1540 // any replicas and, if so, locks them for subsumption. See acquireMergeLock
1541 // for details about the lock itself.
1542 func (r *Replica) maybeAcquireSnapshotMergeLock(
1543 	ctx context.Context, inSnap IncomingSnapshot,
1544 ) (subsumedRepls []*Replica, releaseMergeLock func()) {
1545 	// Any replicas that overlap with the bounds of the incoming snapshot are ours
1546 	// to subsume; further, the end of the last overlapping replica will exactly
1547 	// align with the end of the snapshot. How are we guaranteed this? Each merge
1548 	// could not have committed unless this store had an up-to-date replica of the
1549 	// RHS at the time of the merge. Nothing could have removed that RHS replica,
1550 	// as the replica GC queue cannot GC a replica unless it can prove its
1551 	// left-hand neighbor has no pending merges to apply. And that RHS replica
1552 	// could not have been further split or merged, as it never processes another
1553 	// command after the merge commits.
1554 	endKey := r.Desc().EndKey
1555 	if endKey == nil {
1556 		// The existing replica is uninitialized, in which case we've already
1557 		// installed a placeholder for the snapshot's keyspace. No merge lock needed.
1558 		return nil, func() {}
1559 	}
1560 	for endKey.Less(inSnap.State.Desc.EndKey) {
1561 		sRepl := r.store.LookupReplica(endKey)
1562 		if sRepl == nil || !endKey.Equal(sRepl.Desc().StartKey) {
1563 			log.Fatalf(ctx, "snapshot widens existing replica, but no replica exists for subsumed key %s", endKey)
1564 		}
1565 		sRepl.raftMu.Lock()
1566 		subsumedRepls = append(subsumedRepls, sRepl)
1567 		endKey = sRepl.Desc().EndKey
1568 	}
1569 	// TODO(benesch): we may be unnecessarily forcing another Raft snapshot here
1570 	// by subsuming too much. Consider the case where [a, b) and [c, e) first
1571 	// merged into [a, e), then split into [a, d) and [d, e), and we're applying a
1572 	// snapshot that spans this merge and split. The bounds of this snapshot will
1573 	// be [a, d), so we'll subsume [c, e). But we're still a member of [d, e)!
1574 	// We'll currently be forced to get a Raft snapshot to catch up. Ideally, we'd
1575 	// subsume only half of [c, e) and synthesize a new RHS [d, e), effectively
1576 	// applying both the split and merge during snapshot application. This isn't a
1577 	// huge deal, though: we're probably behind enough that the RHS would need to
1578 	// get caught up with a Raft snapshot anyway, even if we synthesized it
1579 	// properly.
1580 	return subsumedRepls, func() {
1581 		for _, sr := range subsumedRepls {
1582 			sr.raftMu.Unlock()
1583 		}
1584 	}
1585 }
1586 
1587 // maybeAcquireSplitMergeLock examines the given raftCmd (which need
1588 // not be applied yet) and acquires the split or merge lock if
1589 // necessary (in addition to other preparation). It returns a function
1590 // which will release any lock acquired (or nil).
1591 //
1592 // After this method returns successfully, the RHS of the split or merge
1593 // is guaranteed to be present in the Store and retrievable via GetReplica().
1594 func (r *Replica) maybeAcquireSplitMergeLock(
1595 	ctx context.Context, raftCmd kvserverpb.RaftCommand,
1596 ) (func(), error) {
1597 	if split := raftCmd.ReplicatedEvalResult.Split; split != nil {
1598 		return r.acquireSplitLock(ctx, &split.SplitTrigger)
1599 	} else if merge := raftCmd.ReplicatedEvalResult.Merge; merge != nil {
1600 		return r.acquireMergeLock(ctx, &merge.MergeTrigger)
1601 	}
1602 	return nil, nil
1603 }
1604 
1605 func (r *Replica) acquireSplitLock(
1606 	ctx context.Context, split *roachpb.SplitTrigger,
1607 ) (func(), error) {
1608 	rightReplDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID())
1609 	rightRepl, _, err := r.store.getOrCreateReplica(ctx, split.RightDesc.RangeID,
1610 		rightReplDesc.ReplicaID, nil, /* creatingReplica */
1611 		rightReplDesc.GetType() == roachpb.LEARNER)
1612 	// If getOrCreateReplica returns RaftGroupDeletedError we know that the RHS
1613 	// has already been removed. This case is handled properly in splitPostApply.
1614 	if errors.HasType(err, (*roachpb.RaftGroupDeletedError)(nil)) {
1615 		return func() {}, nil
1616 	}
1617 	if err != nil {
1618 		return nil, err
1619 	}
1620 	if rightRepl.IsInitialized() {
1621 		return nil, errors.Errorf("RHS of split %s / %s already initialized before split application",
1622 			&split.LeftDesc, &split.RightDesc)
1623 	}
1624 	return rightRepl.raftMu.Unlock, nil
1625 }
1626 
1627 func (r *Replica) acquireMergeLock(
1628 	ctx context.Context, merge *roachpb.MergeTrigger,
1629 ) (func(), error) {
1630 	// The merge lock is the right-hand replica's raftMu. The right-hand replica
1631 	// is required to exist on this store at the replica ID implied by the merge.
1632 	// Otherwise, an incoming snapshot could create the right-hand replica before
1633 	// the merge trigger has a chance to widen the left-hand replica's end key.
1634 	// The merge trigger would then fatal the node upon realizing the right-hand
1635 	// replica already exists. With a right-hand replica in place, any snapshots
1636 	// for the right-hand range will block on raftMu, waiting for the merge to
1637 	// complete, after which the replica will realize it has been destroyed and
1638 	// reject the snapshot.
1639 	//
1640 	// These guarantees would not hold if we were catching up from a preemptive
1641 	// snapshot and were not part of the range. That scenario, however, never
1642 	// arises: prior to 19.2 we would ensure that a preemptive snapshot had been
1643 	// applied before adding a store to the range, which would fail if the range
1644 	// had merged another range; and in 19.2 we detect when the raft messages
1645 	// we're processing are for a learner while our current state is due to a
1646 	// preemptive snapshot, and we remove the preemptive snapshot.
1647 	rightReplDesc, _ := merge.RightDesc.GetReplicaDescriptor(r.StoreID())
1648 	rightRepl, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID,
1649 		rightReplDesc.ReplicaID, nil, /* creatingReplica */
1650 		rightReplDesc.GetType() == roachpb.LEARNER)
1651 	if err != nil {
1652 		return nil, err
1653 	}
1654 	rightDesc := rightRepl.Desc()
1655 	if !rightDesc.StartKey.Equal(merge.RightDesc.StartKey) || !rightDesc.EndKey.Equal(merge.RightDesc.EndKey) {
1656 		return nil, errors.Errorf("RHS of merge %s <- %s not present on store; found %s in place of the RHS",
1657 			&merge.LeftDesc, &merge.RightDesc, rightDesc)
1658 	}
1659 	return rightRepl.raftMu.Unlock, nil
1660 }
1661 
1662 // handleTruncatedStateBelowRaft is called when a Raft command updates the truncated
1663 // state. This isn't 100% trivial for two reasons:
1664 // - in 19.1 we're making the TruncatedState key unreplicated, so there's a migration
1665 // - we're making use of the above by not sending the Raft log in snapshots (the truncated
1666 //   state effectively determines the first index of the log, which requires it to be unreplicated).
1667 //   Updates to the HardState are sent out by a leaseholder truncating the log based on its local
1668 //   knowledge. For example, the leader might have a log 10..100 and truncate to 50, and will send
1669 //   out a TruncatedState with Index 50 to that effect. However, some replicas may not even have log
1670 //   entries that old, and must make sure to ignore this update to the truncated state, as it would
1671 //   otherwise clobber their "newer" truncated state.
1672 //
1673 // The returned boolean tells the caller whether to apply the truncated state's
1674 // side effects, which means replacing the in-memory TruncatedState and applying
1675 // the associated RaftLogDelta. It is usually expected to be true, but may not
1676 // be for the first truncation on a replica that recently received a
1677 // snapshot.
1678 func handleTruncatedStateBelowRaft(
1679 	ctx context.Context,
1680 	oldTruncatedState, newTruncatedState *roachpb.RaftTruncatedState,
1681 	loader stateloader.StateLoader,
1682 	readWriter storage.ReadWriter,
1683 ) (_apply bool, _ error) {
1684 	// If this is a log truncation, load the resulting unreplicated or legacy
1685 	// replicated truncated state (in that order). If the migration is happening
1686 	// in this command, the result will be an empty message. In steady state
1687 	// after the migration, it's the unreplicated truncated state not taking
1688 	// into account the current truncation (since the key is unreplicated).
1689 	// Either way, we'll update it below.
1690 	//
1691 	// See VersionUnreplicatedRaftTruncatedState for details.
1692 	truncStatePostApply, truncStateIsLegacy, err := loader.LoadRaftTruncatedState(ctx, readWriter)
1693 	if err != nil {
1694 		return false, errors.Wrap(err, "loading truncated state")
1695 	}
1696 
1697 	// Truncate the Raft log from the entry after the previous
1698 	// truncation index to the new truncation index. This is performed
1699 	// atomically with the raft command application so that the
1700 	// TruncatedState index is always consistent with the state of the
1701 	// Raft log itself. We can use the distinct writer because we know
1702 	// all writes will be to distinct keys.
1703 	//
1704 	// Intentionally don't use range deletion tombstones (ClearRange())
1705 	// due to performance concerns connected to having many range
1706 	// deletion tombstones. There is a chance that ClearRange will
1707 	// perform well here because the tombstones could be "collapsed",
1708 	// but it is hardly worth the risk at this point.
1709 	prefixBuf := &loader.RangeIDPrefixBuf
1710 	for idx := oldTruncatedState.Index + 1; idx <= newTruncatedState.Index; idx++ {
1711 		// NB: RangeIDPrefixBufs have sufficient capacity (32 bytes) to
1712 		// avoid allocating when constructing Raft log keys (16 bytes).
1713 		unsafeKey := prefixBuf.RaftLogKey(idx)
1714 		if err := readWriter.Clear(storage.MakeMVCCMetadataKey(unsafeKey)); err != nil {
1715 			return false, errors.Wrapf(err, "unable to clear truncated Raft entries for %+v", newTruncatedState)
1716 		}
1717 	}
1718 
1719 	if !truncStateIsLegacy {
1720 		if truncStatePostApply.Index < newTruncatedState.Index {
1721 			// There are two cases here (though handled just the same). In the
1722 			// first case, the Raft command has just deleted the legacy
1723 			// replicated truncated state key as part of the migration (so
1724 			// truncStateIsLegacy is now false for the first time and
1725 			// truncStatePostApply is zero) and we need to atomically write the
1726 			// new, unreplicated key. Or we've already migrated earlier, in
1727 			// which case truncStatePostApply equals the current value of the
1728 			// new key (which wasn't touched by the batch), and we need to
1729 			// overwrite it if this truncation "moves it forward".
1730 
1731 			if err := storage.MVCCPutProto(
1732 				ctx, readWriter, nil /* ms */, prefixBuf.RaftTruncatedStateKey(),
1733 				hlc.Timestamp{}, nil /* txn */, newTruncatedState,
1734 			); err != nil {
1735 				return false, errors.Wrap(err, "unable to migrate RaftTruncatedState")
1736 			}
1737 			// We have migrated and this new truncated state moves us forward.
1738 			// Tell the caller that we applied it and that they should do the same.
1739 			return true, nil
1740 		}
1741 		// We have migrated, but this truncated state moves the existing one
1742 		// backwards, so instruct the caller not to update the in-memory state.
1743 		return false, nil
1744 	}
1745 	// Haven't migrated yet, don't ever discard the update.
1746 	return true, nil
1747 }
1748 
1749 // ComputeRaftLogSize computes the size (in bytes) of the Raft log from the
1750 // storage engine. This will iterate over the Raft log and sideloaded files, so
1751 // depending on the size of these it can be mildly to extremely expensive and
1752 // thus should not be called frequently.
1753 //
1754 // The sideloaded storage may be nil, in which case it is treated as empty.
1755 func ComputeRaftLogSize(
1756 	ctx context.Context, rangeID roachpb.RangeID, reader storage.Reader, sideloaded SideloadStorage,
1757 ) (int64, error) {
1758 	prefix := keys.RaftLogPrefix(rangeID)
1759 	prefixEnd := prefix.PrefixEnd()
1760 	iter := reader.NewIterator(storage.IterOptions{
1761 		LowerBound: prefix,
1762 		UpperBound: prefixEnd,
1763 	})
1764 	defer iter.Close()
1765 	ms, err := iter.ComputeStats(prefix, prefixEnd, 0 /* nowNanos */)
1766 	if err != nil {
1767 		return 0, err
1768 	}
1769 	var totalSideloaded int64
1770 	if sideloaded != nil {
1771 		var err error
1772 		// Truncating all indexes strictly smaller than zero is a no-op but
1773 		// gives us the number of bytes in the storage back.
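		// (The call below ignores its first return value and keeps the
		// second, which, per the comment above, is the total number of bytes
		// currently retained in the sideloaded storage.)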
1774 		_, totalSideloaded, err = sideloaded.TruncateTo(ctx, 0)
1775 		if err != nil {
1776 			return 0, err
1777 		}
1778 	}
1779 	return ms.SysBytes + totalSideloaded, nil
1780 }
1781 
1782 func maybeCampaignAfterConfChange(
1783 	ctx context.Context,
1784 	storeID roachpb.StoreID,
1785 	desc *roachpb.RangeDescriptor,
1786 	raftGroup *raft.RawNode,
1787 ) {
1788 	// If a config change was carried out, it's possible that the Raft
1789 	// leader was removed. Verify that, and if so, campaign if we are
1790 	// the first remaining voter replica. Without this, the range will
1791 	// be leaderless (and thus unavailable) for a few seconds.
1792 	//
1793 	// We can't (or rather shouldn't) campaign on all remaining voters
1794 	// because that can lead to a stalemate. For example, three voters
1795 	// may all make it through PreVote and then reject each other.
1796 	st := raftGroup.BasicStatus()
1797 	if st.Lead == 0 {
1798 		// Leader unknown. This isn't what we expect in steady state, so we
1799 		// don't do anything.
1800 		return
1801 	}
1802 	if !desc.IsInitialized() {
1803 		// We don't have an initialized descriptor, so we can't figure out who
1804 		// is supposed to campaign. It's possible that it's us and we're waiting
1805 		// for the initial snapshot, but it's hard to tell. Don't do anything.
1806 		return
1807 	}
1808 	// If the leader is no longer in the descriptor but we are the first voter,
1809 	// campaign.
1810 	_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead))
1811 	if !leaderStillThere && storeID == desc.Replicas().Voters()[0].StoreID {
1812 		log.VEventf(ctx, 3, "leader got removed by conf change; campaigning")
1813 		_ = raftGroup.Campaign()
1814 	}
1815 }
1816 
1817 func getNonDeterministicFailureExplanation(err error) string {
1818 	if nd := (*nonDeterministicFailure)(nil); errors.As(err, &nd) {
1819 		return nd.safeExpl
1820 	}
1821 	return "???"
1822 }
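// The errors.As dance above is a standard way to pull a concrete error type
// out of a wrapped error chain. A minimal, self-contained sketch with an
// illustrative error type (not part of this package):
//
//	type explainedErr struct{ expl string }
//
//	func (e *explainedErr) Error() string { return e.expl }
//
//	func explanationOf(err error) string {
//		var e *explainedErr
//		if errors.As(err, &e) {
//			return e.expl
//		}
//		return "???"
//	}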