github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_raft.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	crdberrors "github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft/raftpb"
)

type raftRequestInfo struct {
	req        *RaftMessageRequest
	respStream RaftMessageResponseStream
}

type raftRequestQueue struct {
	syncutil.Mutex
	infos []raftRequestInfo
	// TODO(nvanbenschoten): consider recycling []raftRequestInfo slices. This
	// could be done without any new mutex locking by storing two slices here
	// and swapping them under lock in processRequestQueue.
}

// HandleSnapshot reads an incoming streaming snapshot and applies it if
// possible.
func (s *Store) HandleSnapshot(
	header *SnapshotRequest_Header, stream SnapshotResponseStream,
) error {
	ctx := s.AnnotateCtx(stream.Context())
	const name = "storage.Store: handle snapshot"
	return s.stopper.RunTaskWithErr(ctx, name, func(ctx context.Context) error {
		s.metrics.raftRcvdMessages[raftpb.MsgSnap].Inc(1)

		if s.IsDraining() {
			return stream.Send(&SnapshotResponse{
				Status:  SnapshotResponse_DECLINED,
				Message: storeDrainingMsg,
			})
		}

		return s.receiveSnapshot(ctx, header, stream)
	})
}

// learnerType exists to avoid allocating on every coalesced beat to a learner.
var learnerType = roachpb.LEARNER

func (s *Store) uncoalesceBeats(
	ctx context.Context,
	beats []RaftHeartbeat,
	fromReplica, toReplica roachpb.ReplicaDescriptor,
	msgT raftpb.MessageType,
	respStream RaftMessageResponseStream,
) {
	if len(beats) == 0 {
		return
	}
	if log.V(4) {
		log.Infof(ctx, "uncoalescing %d beats of type %v: %+v", len(beats), msgT, beats)
	}
	beatReqs := make([]RaftMessageRequest, len(beats))
	for i, beat := range beats {
		msg := raftpb.Message{
			Type:   msgT,
			From:   uint64(beat.FromReplicaID),
			To:     uint64(beat.ToReplicaID),
			Term:   beat.Term,
			Commit: beat.Commit,
		}
		beatReqs[i] = RaftMessageRequest{
			RangeID: beat.RangeID,
			FromReplica: roachpb.ReplicaDescriptor{
				NodeID:    fromReplica.NodeID,
				StoreID:   fromReplica.StoreID,
				ReplicaID: beat.FromReplicaID,
			},
			ToReplica: roachpb.ReplicaDescriptor{
				NodeID:    toReplica.NodeID,
				StoreID:   toReplica.StoreID,
				ReplicaID: beat.ToReplicaID,
			},
			Message: msg,
			Quiesce: beat.Quiesce,
		}
		if beat.ToIsLearner {
			beatReqs[i].ToReplica.Type = &learnerType
		}
		if log.V(4) {
			log.Infof(ctx, "uncoalesced beat: %+v", beatReqs[i])
		}

		if err := s.HandleRaftUncoalescedRequest(ctx, &beatReqs[i], respStream); err != nil {
			log.Errorf(ctx, "could not handle uncoalesced heartbeat %s", err)
		}
	}
}

// HandleRaftRequest dispatches a raft message to the appropriate Replica. It
// requires that s.mu is not held.
func (s *Store) HandleRaftRequest(
	ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
) *roachpb.Error {
	// NB: unlike the other two RaftMessageHandler methods implemented by Store,
	// this one doesn't need to directly run through a Stopper task because it
	// delegates all work through a raftScheduler, whose workers' lifetimes are
	// already tied to the Store's Stopper.
	if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 {
		if req.RangeID != 0 {
			log.Fatalf(ctx, "coalesced heartbeats must have rangeID == 0")
		}
		s.uncoalesceBeats(ctx, req.Heartbeats, req.FromReplica, req.ToReplica, raftpb.MsgHeartbeat, respStream)
		s.uncoalesceBeats(ctx, req.HeartbeatResps, req.FromReplica, req.ToReplica, raftpb.MsgHeartbeatResp, respStream)
		return nil
	}
	return s.HandleRaftUncoalescedRequest(ctx, req, respStream)
}

// HandleRaftUncoalescedRequest dispatches a raft message to the appropriate
// Replica. It requires that s.mu is not held.
func (s *Store) HandleRaftUncoalescedRequest(
	ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
) *roachpb.Error {

	if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 {
		log.Fatalf(ctx, "HandleRaftUncoalescedRequest cannot be given coalesced heartbeats or heartbeat responses, received %s", req)
	}
	// HandleRaftRequest is called on locally uncoalesced heartbeats (which are
	// not sent over the network if the environment variable is set) so do not
	// count them.
	s.metrics.raftRcvdMessages[req.Message.Type].Inc(1)

	value, ok := s.replicaQueues.Load(int64(req.RangeID))
	if !ok {
		value, _ = s.replicaQueues.LoadOrStore(int64(req.RangeID), unsafe.Pointer(&raftRequestQueue{}))
	}
	q := (*raftRequestQueue)(value)
	q.Lock()
	if len(q.infos) >= replicaRequestQueueSize {
		q.Unlock()
		// TODO(peter): Return an error indicating the request was dropped. Note
		// that dropping the request is safe. Raft will retry.
		s.metrics.RaftRcvdMsgDropped.Inc(1)
		return nil
	}
	q.infos = append(q.infos, raftRequestInfo{
		req:        req,
		respStream: respStream,
	})
	first := len(q.infos) == 1
	q.Unlock()

	// processRequestQueue will process all infos in the slice each time it
	// runs, so we only need to schedule a Raft request event if we added the
	// first info in the slice. Everyone else can rely on the request that added
	// the first info already having scheduled a Raft request event.
	if first {
		s.scheduler.EnqueueRaftRequest(req.RangeID)
	}
	return nil
}

// withReplicaForRequest calls the supplied function with the (lazily
// initialized) Replica specified in the request. The replica passed to
// the function will have its Replica.raftMu locked.
func (s *Store) withReplicaForRequest(
	ctx context.Context, req *RaftMessageRequest, f func(context.Context, *Replica) *roachpb.Error,
) *roachpb.Error {
	// Lazily create the replica.
	r, _, err := s.getOrCreateReplica(
		ctx,
		req.RangeID,
		req.ToReplica.ReplicaID,
		&req.FromReplica,
		req.ToReplica.GetType() == roachpb.LEARNER,
	)
	if err != nil {
		return roachpb.NewError(err)
	}
	defer r.raftMu.Unlock()
	ctx = r.AnnotateCtx(ctx)
	r.setLastReplicaDescriptors(req)
	return f(ctx, r)
}

// processRaftRequestWithReplica processes the (non-snapshot) Raft request on
// the specified replica. Notably, it does not handle updates to the Raft Ready
// state; callers will probably want to handle this themselves at some point.
func (s *Store) processRaftRequestWithReplica(
	ctx context.Context, r *Replica, req *RaftMessageRequest,
) *roachpb.Error {
	if verboseRaftLoggingEnabled() {
		log.Infof(ctx, "incoming raft message:\n%s", raftDescribeMessage(req.Message, raftEntryFormatter))
	}

	if req.Message.Type == raftpb.MsgSnap {
		log.Fatalf(ctx, "unexpected snapshot: %+v", req)
	}

	if req.Quiesce {
		if req.Message.Type != raftpb.MsgHeartbeat {
			log.Fatalf(ctx, "unexpected quiesce: %+v", req)
		}
		// If another replica tells us to quiesce, we verify that according to
		// it, we are fully caught up, and that we believe it to be the leader.
		// If we didn't do this, this replica could only unquiesce by means of
		// an election, which means that the request prompting the unquiesce
		// would end up with latency on the order of an election timeout.
		//
		// There are additional checks in quiesceLocked() that prevent us from
		// quiescing if there's outstanding work.
		r.mu.Lock()
		status := r.raftBasicStatusRLocked()
		ok := status.Term == req.Message.Term &&
			status.Commit == req.Message.Commit &&
			status.Lead == req.Message.From &&
			r.quiesceLocked()
		r.mu.Unlock()
		if ok {
			return nil
		}
		if log.V(4) {
			log.Infof(ctx, "not quiescing: local raft status is %+v, incoming quiesce message is %+v", status, req.Message)
		}
	}

	if req.ToReplica.ReplicaID == 0 {
		log.VEventf(ctx, 1, "refusing incoming Raft message %s from %+v to %+v",
			req.Message.Type, req.FromReplica, req.ToReplica)
		return roachpb.NewErrorf(
			"cannot recreate replica that is not a member of its range (StoreID %s not found in r%d)",
			r.store.StoreID(), req.RangeID,
		)
	}

	drop := maybeDropMsgApp(ctx, (*replicaMsgAppDropper)(r), &req.Message, req.RangeStartKey)
	if !drop {
		if err := r.stepRaftGroup(req); err != nil {
			return roachpb.NewError(err)
		}
	}
	return nil
}

// processRaftSnapshotRequest processes the incoming non-preemptive snapshot
// Raft request on the request's specified replica. The function makes sure to
// handle any updated Raft Ready state. It also adds and later removes the
// (potentially) necessary placeholder to protect against concurrent access to
// the keyspace encompassed by the snapshot but not yet guarded by the replica.
func (s *Store) processRaftSnapshotRequest(
	ctx context.Context, snapHeader *SnapshotRequest_Header, inSnap IncomingSnapshot,
) *roachpb.Error {
	if snapHeader.IsPreemptive() {
		return roachpb.NewError(crdberrors.AssertionFailedf(`expected a raft or learner snapshot`))
	}

	return s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest, func(
		ctx context.Context, r *Replica,
	) (pErr *roachpb.Error) {
		if snapHeader.RaftMessageRequest.Message.Type != raftpb.MsgSnap {
			log.Fatalf(ctx, "expected snapshot: %+v", snapHeader.RaftMessageRequest)
		}

		// Check to see if a snapshot can be applied. Snapshots can always be applied
		// to initialized replicas. Note that if we add a placeholder we need to
		// already be holding Replica.raftMu in order to prevent concurrent
		// raft-ready processing of uninitialized replicas.
		var addedPlaceholder bool
		var removePlaceholder bool
		if err := func() error {
			s.mu.Lock()
			defer s.mu.Unlock()
			placeholder, err := s.canApplySnapshotLocked(ctx, snapHeader)
			if err != nil {
				// If the storage cannot accept the snapshot, return an
				// error before passing it to RawNode.Step, since our
				// error handling options past that point are limited.
				log.Infof(ctx, "cannot apply snapshot: %s", err)
				return err
			}

			if placeholder != nil {
				// NB: The placeholder added here is either removed below after a
				// preemptive snapshot is applied or after the next call to
				// Replica.handleRaftReady. Note that we can only get here if the
				// replica doesn't exist or is uninitialized.
				if err := s.addPlaceholderLocked(placeholder); err != nil {
					log.Fatalf(ctx, "could not add vetted placeholder %s: %+v", placeholder, err)
				}
				addedPlaceholder = true
			}
			return nil
		}(); err != nil {
			return roachpb.NewError(err)
		}

		if addedPlaceholder {
			// If we added a placeholder remove it before we return unless some other
			// part of the code takes ownership of the removal (indicated by setting
			// removePlaceholder to false).
			removePlaceholder = true
			defer func() {
				if removePlaceholder {
					if s.removePlaceholder(ctx, snapHeader.RaftMessageRequest.RangeID) {
						atomic.AddInt32(&s.counts.removedPlaceholders, 1)
					}
				}
			}()
		}
		// NB: we cannot get errRemoved here because we're promised by
		// withReplicaForRequest that this replica is not currently being removed
		// and we've been holding the raftMu the entire time.
		if err := r.stepRaftGroup(&snapHeader.RaftMessageRequest); err != nil {
			return roachpb.NewError(err)
		}
		_, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap)
		maybeFatalOnRaftReadyErr(ctx, expl, err)
		removePlaceholder = false
		return nil
	})
}

// HandleRaftResponse implements the RaftMessageHandler interface. Per the
// interface specification, an error is returned if and only if the underlying
// Raft connection should be closed.
// It requires that s.mu is not held.
func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageResponse) error {
	ctx = s.AnnotateCtx(ctx)
	const name = "storage.Store: handle raft response"
	return s.stopper.RunTaskWithErr(ctx, name, func(ctx context.Context) error {
		repl, replErr := s.GetReplica(resp.RangeID)
		if replErr == nil {
			// Best-effort context annotation of replica.
			ctx = repl.AnnotateCtx(ctx)
		}
		switch val := resp.Union.GetValue().(type) {
		case *roachpb.Error:
			switch tErr := val.GetDetail().(type) {
			case *roachpb.ReplicaTooOldError:
				if replErr != nil {
					// RangeNotFoundErrors are expected here; nothing else is.
					if !errors.HasType(replErr, (*roachpb.RangeNotFoundError)(nil)) {
						log.Errorf(ctx, "%v", replErr)
					}
					return nil
				}

				// Grab the raftMu in addition to the replica mu because
				// cancelFailedProposalsLocked below requires it.
				repl.raftMu.Lock()
				defer repl.raftMu.Unlock()
				repl.mu.Lock()

				// If the replica ID in the error does not match then we know
				// that the replica has been removed and re-added quickly. In
				// that case, we don't want to add it to the replicaGCQueue.
				// If the replica is not alive then we also should ignore this error.
				if tErr.ReplicaID != repl.mu.replicaID ||
					!repl.mu.destroyStatus.IsAlive() ||
					// Ignore if we want to test the replicaGC queue.
					s.TestingKnobs().DisableEagerReplicaRemoval {
					repl.mu.Unlock()
					return nil
				}

				// The replica will be garbage collected soon (we are sure
				// since our replicaID is definitely too old), but in the meantime we
				// already want to bounce all traffic from it. Note that the replica
				// could be re-added with a higher replicaID, but we want to clear the
				// replica's data before that happens.
				if log.V(1) {
					log.Infof(ctx, "setting local replica to destroyed due to ReplicaTooOld error")
				}

				repl.mu.Unlock()
				nextReplicaID := tErr.ReplicaID + 1
				return s.removeReplicaRaftMuLocked(ctx, repl, nextReplicaID, RemoveOptions{
					DestroyData: true,
				})
			case *roachpb.RaftGroupDeletedError:
				if replErr != nil {
					// RangeNotFoundErrors are expected here; nothing else is.
					if !errors.HasType(replErr, (*roachpb.RangeNotFoundError)(nil)) {
						log.Errorf(ctx, "%v", replErr)
					}
					return nil
				}

				// If the replica is talking to a replica that's been deleted, it must be
				// out of date. While this may just mean it's slightly behind, it can
				// also mean that it is so far behind it no longer knows where any of the
				// other replicas are (#23994). Add it to the replica GC queue to do a
				// proper check.
				s.replicaGCQueue.AddAsync(ctx, repl, replicaGCPriorityDefault)
			case *roachpb.StoreNotFoundError:
				log.Warningf(ctx, "raft error: node %d claims to not contain store %d for replica %s: %s",
					resp.FromReplica.NodeID, resp.FromReplica.StoreID, resp.FromReplica, val)
				return val.GetDetail() // close Raft connection
			default:
				log.Warningf(ctx, "got error from r%d, replica %s: %s",
					resp.RangeID, resp.FromReplica, val)
			}
		default:
			log.Warningf(ctx, "got unknown raft response type %T from replica %s: %s", val, resp.FromReplica, val)
		}
		return nil
	})
}

// enqueueRaftUpdateCheck asynchronously registers the given range ID to be
// checked for raft updates when the processRaft goroutine is idle.
func (s *Store) enqueueRaftUpdateCheck(rangeID roachpb.RangeID) {
	s.scheduler.EnqueueRaftReady(rangeID)
}

// processRequestQueue drains the pending Raft request queue for the given
// range and processes each request against its Replica. The returned bool
// tells the scheduler whether Raft ready processing should follow (see the
// NB at the end).
func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID) bool {
	value, ok := s.replicaQueues.Load(int64(rangeID))
	if !ok {
		return false
	}
	q := (*raftRequestQueue)(value)
	q.Lock()
	infos := q.infos
	q.infos = nil
	q.Unlock()
	if len(infos) == 0 {
		return false
	}

	var hadError bool
	for i := range infos {
		info := &infos[i]
		if pErr := s.withReplicaForRequest(
			ctx, info.req, func(ctx context.Context, r *Replica) *roachpb.Error {
				return s.processRaftRequestWithReplica(ctx, r, info.req)
			},
		); pErr != nil {
			hadError = true
			if err := info.respStream.Send(newRaftMessageResponse(info.req, pErr)); err != nil {
				// Seems excessive to log this on every occurrence as the other side
				// might have closed.
				log.VEventf(ctx, 1, "error sending error: %s", err)
			}
		}
	}

	if hadError {
		// If we're unable to process a request, consider dropping the request queue
		// to free up space in the map.
		// This is relevant if requests failed because the target replica could not
		// be created (for example due to the Raft tombstone). The particular code
		// here takes into account that we don't want to drop the queue if there
		// are other messages waiting on it, or if the target replica exists. Raft
		// tolerates the occasional dropped message, but our unit tests are less
		// forgiving.
		//
		// See https://github.com/cockroachdb/cockroach/issues/30951#issuecomment-428010411.
		if _, exists := s.mu.replicas.Load(int64(rangeID)); !exists {
			q.Lock()
			if len(q.infos) == 0 {
				s.replicaQueues.Delete(int64(rangeID))
			}
			q.Unlock()
		}
	}

	// NB: Even if we had errors and the corresponding replica no longer
	// exists, returning true here won't cause a new, uninitialized replica
	// to be created in processReady().
	return true // ready
}

// processReady performs a single pass of Raft ready processing for the given
// range, if its replica exists on this store.
func (s *Store) processReady(ctx context.Context, rangeID roachpb.RangeID) {
	value, ok := s.mu.replicas.Load(int64(rangeID))
	if !ok {
		return
	}

	r := (*Replica)(value)
	ctx = r.AnnotateCtx(ctx)
	start := timeutil.Now()
	stats, expl, err := r.handleRaftReady(ctx, noSnap)
	removed := maybeFatalOnRaftReadyErr(ctx, expl, err)
	elapsed := timeutil.Since(start)
	s.metrics.RaftWorkingDurationNanos.Inc(elapsed.Nanoseconds())
	// Warn if Raft processing took too long. We use the same duration as we
	// use for warning about excessive raft mutex lock hold times. Long
	// processing time means we'll have starved local replicas of ticks and
	// remote replicas will likely start campaigning.
	if elapsed >= defaultReplicaRaftMuWarnThreshold {
		log.Warningf(ctx, "handle raft ready: %.1fs [applied=%d, batches=%d, state_assertions=%d]",
			elapsed.Seconds(), stats.entriesProcessed, stats.batchesProcessed, stats.stateAssertions)
	}
	if !removed && !r.IsInitialized() {
		// Only an uninitialized replica can have a placeholder since, by
		// definition, an initialized replica will be present in the
		// replicasByKey map. While the replica will usually consume the
		// placeholder itself, that isn't guaranteed and so this invocation
		// here is crucial (i.e. don't remove it).
		//
		// We need to hold raftMu here to prevent removing a placeholder that is
		// actively being used by Store.processRaftRequest.
		r.raftMu.Lock()
		if s.removePlaceholder(ctx, r.RangeID) {
			atomic.AddInt32(&s.counts.droppedPlaceholders, 1)
		}
		r.raftMu.Unlock()
	}
}

// processTick ticks the Raft group for the given range, if its replica exists
// on this store. The returned bool indicates whether Raft ready processing
// should follow.
func (s *Store) processTick(ctx context.Context, rangeID roachpb.RangeID) bool {
	value, ok := s.mu.replicas.Load(int64(rangeID))
	if !ok {
		return false
	}
	livenessMap, _ := s.livenessMap.Load().(IsLiveMap)

	start := timeutil.Now()
	r := (*Replica)(value)
	exists, err := r.tick(livenessMap)
	if err != nil {
		log.Errorf(ctx, "%v", err)
	}
	s.metrics.RaftTickingDurationNanos.Inc(timeutil.Since(start).Nanoseconds())
	return exists // ready
}

// nodeIsLiveCallback is invoked when a node transitions from non-live
// to live. Iterate through all replicas and find any which belong to
// ranges containing the implicated node. Unquiesce if currently
// quiesced. Note that this mechanism can race with concurrent
// invocations of processTick, which may have a copy of the previous
// livenessMap where the now-live node is down. Those instances should
// be rare, however, and we expect the newly live node to eventually
// unquiesce the range.
func (s *Store) nodeIsLiveCallback(nodeID roachpb.NodeID) {
	s.updateLivenessMap()

	s.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
		r := (*Replica)(v)
		for _, rep := range r.Desc().Replicas().All() {
			if rep.NodeID == nodeID {
				r.unquiesce()
			}
		}
		return true
	})
}

// processRaft starts the Raft scheduler and the background workers that drive
// Raft ticking and coalesced heartbeats.
func (s *Store) processRaft(ctx context.Context) {
	if s.cfg.TestingKnobs.DisableProcessRaft {
		return
	}

	s.scheduler.Start(ctx, s.stopper)
	// Wait for the scheduler worker goroutines to finish.
	s.stopper.RunWorker(ctx, s.scheduler.Wait)

	s.stopper.RunWorker(ctx, s.raftTickLoop)
	s.stopper.RunWorker(ctx, s.coalescedHeartbeatsLoop)
	s.stopper.AddCloser(stop.CloserFn(func() {
		s.cfg.Transport.Stop(s.StoreID())
	}))
}

// raftTickLoop periodically enqueues all unquiesced replicas on the Raft
// scheduler for tick processing.
func (s *Store) raftTickLoop(ctx context.Context) {
	ticker := time.NewTicker(s.cfg.RaftTickInterval)
	defer ticker.Stop()

	var rangeIDs []roachpb.RangeID

	for {
		select {
		case <-ticker.C:
			rangeIDs = rangeIDs[:0]
			// Update the liveness map.
			if s.cfg.NodeLiveness != nil {
				s.updateLivenessMap()
			}

			s.unquiescedReplicas.Lock()
			// Why do we bother to ever queue a Replica on the Raft scheduler for
			// tick processing? Couldn't we just call Replica.tick() here? Yes, but
			// then a single bad/slow Replica can disrupt tick processing for every
			// Replica on the store which cascades into Raft elections and more
			// disruption.
			for rangeID := range s.unquiescedReplicas.m {
				rangeIDs = append(rangeIDs, rangeID)
			}
			s.unquiescedReplicas.Unlock()

			s.scheduler.EnqueueRaftTick(rangeIDs...)
			s.metrics.RaftTicks.Inc(1)

		case <-s.stopper.ShouldStop():
			return
		}
	}
}

// updateLivenessMap refreshes the cached node liveness map, consulting Raft
// transport connection health before declaring a node non-live.
func (s *Store) updateLivenessMap() {
	nextMap := s.cfg.NodeLiveness.GetIsLiveMap()
	for nodeID, entry := range nextMap {
		if entry.IsLive {
			// Make sure we ask all live nodes for closed timestamp updates.
			s.cfg.ClosedTimestamp.Clients.EnsureClient(nodeID)
			continue
		}
		// Liveness claims that this node is down, but ConnHealth gets the last say
		// because we'd rather quiesce a range too little than one too often. Note
		// that this policy is different from the one governing the releasing of
		// proposal quota; see comments over there.
		//
		// NB: This has false negatives. If a node doesn't have a conn open to it
		// when ConnHealth is called, then ConnHealth will return
		// rpc.ErrNotHeartbeated regardless of whether the node is up or not. That
		// said, for the nodes that matter, we're likely talking to them via the
		// Raft transport, so ConnHealth should usually indicate a real problem if
		// it gives us an error back. The check can also have false positives if the
		// node goes down after populating the map, but that matters even less.
		entry.IsLive = (s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil)
		nextMap[nodeID] = entry
	}
	s.livenessMap.Store(nextMap)
}

// Since coalescing heartbeats adds latency to heartbeat messages, it is
// beneficial to have it run on a faster cycle than once per tick, so that
// the delay does not impact latency-sensitive features such as quiescence.
func (s *Store) coalescedHeartbeatsLoop(ctx context.Context) {
	ticker := time.NewTicker(s.cfg.CoalescedHeartbeatsInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.sendQueuedHeartbeats(ctx)
		case <-s.stopper.ShouldStop():
			return
		}
	}
}

// sendQueuedHeartbeatsToNode requires that the s.coalescedMu lock is held. It
// returns the number of heartbeats that were sent.
func (s *Store) sendQueuedHeartbeatsToNode(
	ctx context.Context, beats, resps []RaftHeartbeat, to roachpb.StoreIdent,
) int {
	var msgType raftpb.MessageType

	if len(beats) == 0 && len(resps) == 0 {
		return 0
	} else if len(resps) == 0 {
		msgType = raftpb.MsgHeartbeat
	} else if len(beats) == 0 {
		msgType = raftpb.MsgHeartbeatResp
	} else {
		log.Fatal(ctx, "cannot coalesce both heartbeats and responses")
	}

	chReq := newRaftMessageRequest()
	*chReq = RaftMessageRequest{
		RangeID: 0,
		ToReplica: roachpb.ReplicaDescriptor{
			NodeID:    to.NodeID,
			StoreID:   to.StoreID,
			ReplicaID: 0,
		},
		FromReplica: roachpb.ReplicaDescriptor{
			NodeID:  s.Ident.NodeID,
			StoreID: s.Ident.StoreID,
		},
		Message: raftpb.Message{
			Type: msgType,
		},
		Heartbeats:     beats,
		HeartbeatResps: resps,
	}

	if log.V(4) {
		log.Infof(ctx, "sending raft request (coalesced) %+v", chReq)
	}

	if !s.cfg.Transport.SendAsync(chReq, rpc.SystemClass) {
		for _, beat := range beats {
			if value, ok := s.mu.replicas.Load(int64(beat.RangeID)); ok {
				(*Replica)(value).addUnreachableRemoteReplica(beat.ToReplicaID)
			}
		}
		for _, resp := range resps {
			if value, ok := s.mu.replicas.Load(int64(resp.RangeID)); ok {
				(*Replica)(value).addUnreachableRemoteReplica(resp.ToReplicaID)
			}
		}
		return 0
	}
	return len(beats) + len(resps)
}

// sendQueuedHeartbeats flushes all queued coalesced heartbeats and heartbeat
// responses and updates the pending-heartbeat metric.
func (s *Store) sendQueuedHeartbeats(ctx context.Context) {
	s.coalescedMu.Lock()
	heartbeats := s.coalescedMu.heartbeats
	heartbeatResponses := s.coalescedMu.heartbeatResponses
	s.coalescedMu.heartbeats = map[roachpb.StoreIdent][]RaftHeartbeat{}
	s.coalescedMu.heartbeatResponses = map[roachpb.StoreIdent][]RaftHeartbeat{}
	s.coalescedMu.Unlock()

	var beatsSent int

	for to, beats := range heartbeats {
		beatsSent += s.sendQueuedHeartbeatsToNode(ctx, beats, nil, to)
	}
	for to, resps := range heartbeatResponses {
		beatsSent += s.sendQueuedHeartbeatsToNode(ctx, nil, resps, to)
	}
	s.metrics.RaftCoalescedHeartbeatsPending.Update(int64(beatsSent))
}

// updateCapacityGauges updates the capacity, available, and used metric gauges
// from the store descriptor.
func (s *Store) updateCapacityGauges() error {
	desc, err := s.Descriptor(false /* useCached */)
	if err != nil {
		return err
	}
	s.metrics.Capacity.Update(desc.Capacity.Capacity)
	s.metrics.Available.Update(desc.Capacity.Available)
	s.metrics.Used.Update(desc.Capacity.Used)

	return nil
}