github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/node_liveness.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
)

var (
	// ErrNoLivenessRecord is returned when asking for liveness information
	// about a node for which nothing is known.
	ErrNoLivenessRecord = errors.New("node not in the liveness table")

	errChangeDecommissioningFailed = errors.New("failed to change the decommissioning status")

	// ErrEpochIncremented is returned when a heartbeat request fails because
	// the underlying liveness record has had its epoch incremented.
	ErrEpochIncremented = errors.New("heartbeat failed on epoch increment")

	// ErrEpochAlreadyIncremented is returned by IncrementEpoch when
	// someone else has already incremented the epoch to the desired
	// value.
	ErrEpochAlreadyIncremented = errors.New("epoch already incremented")

	errLiveClockNotLive = errors.New("not live")
)

type errRetryLiveness struct {
	error
}

func (e *errRetryLiveness) Cause() error {
	return e.error
}

func (e *errRetryLiveness) Error() string {
	return fmt.Sprintf("%T: %s", *e, e.error)
}

// Node liveness metrics counter names.
var (
	metaLiveNodes = metric.Metadata{
		Name:        "liveness.livenodes",
		Help:        "Number of live nodes in the cluster (will be 0 if this node is not itself live)",
		Measurement: "Nodes",
		Unit:        metric.Unit_COUNT,
	}
	metaHeartbeatSuccesses = metric.Metadata{
		Name:        "liveness.heartbeatsuccesses",
		Help:        "Number of successful node liveness heartbeats from this node",
		Measurement: "Messages",
		Unit:        metric.Unit_COUNT,
	}
	metaHeartbeatFailures = metric.Metadata{
		Name:        "liveness.heartbeatfailures",
		Help:        "Number of failed node liveness heartbeats from this node",
		Measurement: "Messages",
		Unit:        metric.Unit_COUNT,
	}
	metaEpochIncrements = metric.Metadata{
		Name:        "liveness.epochincrements",
		Help:        "Number of times this node has incremented its liveness epoch",
		Measurement: "Epochs",
		Unit:        metric.Unit_COUNT,
	}
	metaHeartbeatLatency = metric.Metadata{
		Name:        "liveness.heartbeatlatency",
		Help:        "Node liveness heartbeat latency",
		Measurement: "Latency",
		Unit:        metric.Unit_NANOSECONDS,
	}
)

// LivenessMetrics holds metrics for use with node liveness activity.
type LivenessMetrics struct {
	LiveNodes          *metric.Gauge
	HeartbeatSuccesses *metric.Counter
	HeartbeatFailures  *metric.Counter
	EpochIncrements    *metric.Counter
	HeartbeatLatency   *metric.Histogram
}

// IsLiveCallback is invoked when a node's IsLive state changes to true.
// Callbacks can be registered via NodeLiveness.RegisterCallback().
type IsLiveCallback func(nodeID roachpb.NodeID)

// HeartbeatCallback is invoked whenever this node updates its own liveness status,
// indicating that it is alive.
type HeartbeatCallback func(context.Context)
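// Illustrative sketch (not part of the original file): registering an
// IsLiveCallback that logs whenever a node transitions back to being live.
// The function name and the logging are hypothetical; it only assumes a
// *NodeLiveness obtained elsewhere (e.g. from NewNodeLiveness below).
func exampleRegisterIsLiveCallback(ctx context.Context, nl *NodeLiveness) {
	nl.RegisterCallback(func(nodeID roachpb.NodeID) {
		// Invoked only on a false->true IsLive transition; see maybeUpdate.
		log.Infof(ctx, "n%d is now considered live", nodeID)
	})
}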
// NodeLiveness is a centralized failure detector that coordinates
// with the epoch-based range system to provide for leases of
// indefinite length (replacing frequent per-range lease renewals with
// heartbeats to the liveness system).
//
// It is also used as a general-purpose failure detector, but it is
// not ideal for this purpose. It is inefficient due to the use of
// replicated durable writes, and is not very sensitive (it primarily
// tests connectivity from the node to the liveness range; a node with
// a failing disk could still be considered live by this system).
//
// The persistent state of node liveness is stored in the KV layer,
// near the beginning of the keyspace. These are normal MVCC keys,
// written by CPut operations in 1PC transactions (the use of
// transactions and MVCC is regretted because it means that the
// liveness span depends on MVCC GC and can get overwhelmed if GC is
// not working. Transactions were used only to piggyback on the
// transaction commit trigger). The leaseholder of the liveness range
// gossips its contents whenever they change (only the changed
// portion); other nodes rarely read from this range directly.
//
// The use of conditional puts is crucial to maintain the guarantees
// needed by epoch-based leases. Both the Heartbeat and IncrementEpoch
// methods on this type require an expected value to be passed in; see
// comments on those methods for more.
//
// TODO(bdarnell): Also document interaction with draining and decommissioning.
type NodeLiveness struct {
	ambientCtx        log.AmbientContext
	clock             *hlc.Clock
	db                *kv.DB
	gossip            *gossip.Gossip
	livenessThreshold time.Duration
	heartbeatInterval time.Duration
	selfSem           chan struct{}
	st                *cluster.Settings
	otherSem          chan struct{}
	// heartbeatPaused contains an atomically-swapped number representing a bool
	// (1 or 0). heartbeatToken is a channel containing a token which is taken
	// when heartbeating or when pausing the heartbeat. Used for testing.
	heartbeatPaused uint32
	heartbeatToken  chan struct{}
	metrics         LivenessMetrics

	mu struct {
		syncutil.RWMutex
		callbacks         []IsLiveCallback
		nodes             map[roachpb.NodeID]kvserverpb.Liveness
		heartbeatCallback HeartbeatCallback
		// Before heartbeating, we write to each of these engines to avoid
		// maintaining liveness when a local disk is stalled.
		engines []storage.Engine
	}
}

// NewNodeLiveness returns a new instance of NodeLiveness configured
// with the specified gossip instance.
func NewNodeLiveness(
	ambient log.AmbientContext,
	clock *hlc.Clock,
	db *kv.DB,
	g *gossip.Gossip,
	livenessThreshold time.Duration,
	renewalDuration time.Duration,
	st *cluster.Settings,
	histogramWindow time.Duration,
) *NodeLiveness {
	nl := &NodeLiveness{
		ambientCtx:        ambient,
		clock:             clock,
		db:                db,
		gossip:            g,
		livenessThreshold: livenessThreshold,
		heartbeatInterval: livenessThreshold - renewalDuration,
		selfSem:           make(chan struct{}, 1),
		st:                st,
		otherSem:          make(chan struct{}, 1),
		heartbeatToken:    make(chan struct{}, 1),
	}
	nl.metrics = LivenessMetrics{
		LiveNodes:          metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes),
		HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
		HeartbeatFailures:  metric.NewCounter(metaHeartbeatFailures),
		EpochIncrements:    metric.NewCounter(metaEpochIncrements),
		HeartbeatLatency:   metric.NewLatency(metaHeartbeatLatency, histogramWindow),
	}
	nl.mu.nodes = map[roachpb.NodeID]kvserverpb.Liveness{}
	nl.heartbeatToken <- struct{}{}

	livenessRegex := gossip.MakePrefixPattern(gossip.KeyNodeLivenessPrefix)
	nl.gossip.RegisterCallback(livenessRegex, nl.livenessGossipUpdate)

	return nl
}
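// Illustrative sketch (not part of the original file): how the constructor's
// timing arguments relate to each other. The concrete durations below are
// hypothetical, not CockroachDB's defaults: heartbeats fire every
// livenessThreshold - renewalDuration, and each successful heartbeat pushes
// the record's expiration out to roughly now + livenessThreshold.
func exampleHeartbeatTiming() {
	const livenessThreshold = 9 * time.Second       // hypothetical
	const renewalDuration = 4500 * time.Millisecond // hypothetical
	heartbeatInterval := livenessThreshold - renewalDuration
	fmt.Printf("heartbeat every %s; each heartbeat extends liveness to now + %s\n",
		heartbeatInterval, livenessThreshold)
}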
var errNodeDrainingSet = errors.New("node is already draining")

func (nl *NodeLiveness) sem(nodeID roachpb.NodeID) chan struct{} {
	if nodeID == nl.gossip.NodeID.Get() {
		return nl.selfSem
	}
	return nl.otherSem
}

// SetDraining attempts to update this node's liveness record to put itself
// into the draining state.
//
// The reporter callback, if non-nil, is called on a best effort basis
// to report work that needed to be done and which may or may not have
// been done by the time this call returns. See the explanation in
// pkg/server/drain.go for details.
func (nl *NodeLiveness) SetDraining(ctx context.Context, drain bool, reporter func(int, string)) {
	ctx = nl.ambientCtx.AnnotateCtx(ctx)
	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
		liveness, err := nl.Self()
		if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
			log.Errorf(ctx, "unexpected error getting liveness: %+v", err)
		}
		err = nl.setDrainingInternal(ctx, liveness, drain, reporter)
		if err != nil {
			if log.V(1) {
				log.Infof(ctx, "attempting to set liveness draining status to %v: %v", drain, err)
			}
			continue
		}
		return
	}
}

// SetDecommissioning runs a best-effort attempt of marking the liveness
// record as decommissioning. It returns whether the function committed a
// transaction that updated the liveness record.
func (nl *NodeLiveness) SetDecommissioning(
	ctx context.Context, nodeID roachpb.NodeID, decommission bool,
) (changeCommitted bool, err error) {
	ctx = nl.ambientCtx.AnnotateCtx(ctx)

	attempt := func() (bool, error) {
		// Allow only one decommissioning attempt in flight per node at a time.
		// This is required for correct results since we may otherwise race with
		// concurrent `IncrementEpoch` calls and get stuck in a situation in
		// which the cached liveness has decommissioning=false while it's
		// really true, and that means that SetDecommissioning becomes a no-op
		// (which is correct) but that our cached liveness never updates to
		// reflect that.
		//
		// See https://github.com/cockroachdb/cockroach/issues/17995.
		sem := nl.sem(nodeID)
		select {
		case sem <- struct{}{}:
		case <-ctx.Done():
			return false, ctx.Err()
		}
		defer func() {
			<-sem
		}()

		// We need the current liveness in each iteration.
		//
		// We ignore any liveness record in Gossip because we may have to fall back
		// to the KV store anyway. The scenario in which this is needed is:
		// - kill node 2 and stop node 1
		// - wait for node 2's liveness record's Gossip entry to expire on all surviving nodes
		// - restart node 1; it'll never see node 2 in `GetLiveness` unless the whole
		//   node liveness span gets regossiped (unlikely if it wasn't the lease holder
		//   for that span)
		// - can't decommission node 2 from node 1 without KV fallback.
		//
		// See #20863.
		//
		// NB: this also de-flakes TestNodeLivenessDecommissionAbsent; running
		// decommissioning commands in a tight loop on different nodes sometimes
		// results in unintentional no-ops (due to the Gossip lag); this could be
		// observed by users in principle, too.
		//
		// TODO(bdarnell): This is the one place where a node other than
		// the leaseholder reads from this range. Should this read from
		// gossip instead? (I have vague concerns about concurrent reads
		// and timestamp cache pushes causing problems here)
		var oldLiveness kvserverpb.Liveness
		if err := nl.db.GetProto(ctx, keys.NodeLivenessKey(nodeID), &oldLiveness); err != nil {
			return false, errors.Wrap(err, "unable to get liveness")
		}
		if (oldLiveness == kvserverpb.Liveness{}) {
			return false, ErrNoLivenessRecord
		}

		// We may have discovered a Liveness not yet received via Gossip. Offer it
		// to make sure that when we actually try to update the liveness, the
		// previous view is correct. This, too, is required to de-flake
		// TestNodeLivenessDecommissionAbsent.
		nl.maybeUpdate(oldLiveness)

		return nl.setDecommissioningInternal(ctx, nodeID, oldLiveness, decommission)
	}

	for {
		changeCommitted, err := attempt()
		if errors.Is(err, errChangeDecommissioningFailed) {
			continue // expected when epoch incremented
		}
		return changeCommitted, err
	}
}
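// Illustrative sketch (not part of the original file): how an admin-driven
// decommission might call SetDecommissioning. The wrapper and its logging are
// hypothetical; SetDecommissioning itself already retries on
// errChangeDecommissioningFailed, so callers only see terminal errors.
func exampleSetDecommissioning(ctx context.Context, nl *NodeLiveness, target roachpb.NodeID) error {
	committed, err := nl.SetDecommissioning(ctx, target, true /* decommission */)
	if err != nil {
		return err
	}
	// committed is false if some other caller had already set the flag.
	log.Infof(ctx, "decommissioning n%d: liveness change committed=%t", target, committed)
	return nil
}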
func (nl *NodeLiveness) setDrainingInternal(
	ctx context.Context, liveness kvserverpb.Liveness, drain bool, reporter func(int, string),
) error {
	nodeID := nl.gossip.NodeID.Get()
	sem := nl.sem(nodeID)
	// Allow only one attempt to set the draining field at a time.
	select {
	case sem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() {
		<-sem
	}()

	update := livenessUpdate{
		Liveness: kvserverpb.Liveness{
			NodeID: nodeID,
			Epoch:  1,
		},
	}
	if liveness != (kvserverpb.Liveness{}) {
		update.Liveness = liveness
	}
	if reporter != nil && drain && !update.Draining {
		// Report progress to the Drain RPC.
		reporter(1, "liveness record")
	}
	update.Draining = drain
	update.ignoreCache = true

	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
		nl.maybeUpdate(actual)
		if actual.Draining == update.Draining {
			return errNodeDrainingSet
		}
		return errors.New("failed to update liveness record")
	}); err != nil {
		if log.V(1) {
			log.Infof(ctx, "updating liveness record: %v", err)
		}
		if errors.Is(err, errNodeDrainingSet) {
			return nil
		}
		return err
	}
	nl.maybeUpdate(update.Liveness)
	return nil
}
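// Illustrative sketch (not part of the original file): marking this node as
// draining the way a drain RPC might, with a reporter that records the work
// performed. The reporter body is hypothetical; see pkg/server/drain.go for
// the real contract.
func exampleSetDraining(ctx context.Context, nl *NodeLiveness) {
	nl.SetDraining(ctx, true /* drain */, func(remaining int, what string) {
		log.Infof(ctx, "drain progress: %d %s", remaining, what)
	})
}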
type livenessUpdate struct {
	kvserverpb.Liveness
	// When ignoreCache is set, we won't assume that our in-memory cached version
	// of the liveness record is accurate and will use a CPut on the liveness
	// table with whatever the client supplied. This is used for operations that
	// don't want to deal with the inconsistencies of using the cache.
	ignoreCache bool
}

func (nl *NodeLiveness) setDecommissioningInternal(
	ctx context.Context, nodeID roachpb.NodeID, liveness kvserverpb.Liveness, decommission bool,
) (changeCommitted bool, err error) {
	update := livenessUpdate{
		Liveness: kvserverpb.Liveness{
			NodeID: nodeID,
			Epoch:  1,
		},
	}
	if liveness != (kvserverpb.Liveness{}) {
		update.Liveness = liveness
	}
	update.Decommissioning = decommission
	update.ignoreCache = true

	var conditionFailed bool
	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
		conditionFailed = true
		if actual.Decommissioning == update.Decommissioning {
			return nil
		}
		return errChangeDecommissioningFailed
	}); err != nil {
		return false, err
	}
	committed := !conditionFailed && liveness.Decommissioning != decommission
	return committed, nil
}

// GetLivenessThreshold returns the maximum duration between heartbeats
// before a node is considered not-live.
func (nl *NodeLiveness) GetLivenessThreshold() time.Duration {
	return nl.livenessThreshold
}

// IsLive returns whether the specified node is considered live, based solely
// on whether its liveness record has expired; the record's other status
// fields (draining, decommissioning) are ignored. It is an error if the
// specified node is not in the local liveness table.
func (nl *NodeLiveness) IsLive(nodeID roachpb.NodeID) (bool, error) {
	liveness, err := nl.GetLiveness(nodeID)
	if err != nil {
		return false, err
	}
	// NB: We use clock.Now().GoTime() instead of clock.PhysicalTime() in order to
	// consider clock signals from other nodes.
	return liveness.IsLive(nl.clock.Now().GoTime()), nil
}

// StartHeartbeat starts a periodic heartbeat to refresh this node's last
// heartbeat in the node liveness table. The optionally provided
// HeartbeatCallback will be invoked whenever this node updates its own
// liveness. The slice of engines will be written to before each heartbeat to
// avoid maintaining liveness in the presence of disk stalls.
func (nl *NodeLiveness) StartHeartbeat(
	ctx context.Context, stopper *stop.Stopper, engines []storage.Engine, alive HeartbeatCallback,
) {
	log.VEventf(ctx, 1, "starting liveness heartbeat")
	retryOpts := base.DefaultRetryOptions()
	retryOpts.Closer = stopper.ShouldQuiesce()

	if len(engines) == 0 {
		// Avoid silently forgetting to pass the engines. It happened before.
		log.Fatalf(ctx, "must supply at least one engine")
	}

	nl.mu.Lock()
	nl.mu.heartbeatCallback = alive
	nl.mu.engines = engines
	nl.mu.Unlock()

	stopper.RunWorker(ctx, func(context.Context) {
		ambient := nl.ambientCtx
		ambient.AddLogTag("liveness-hb", nil)
		ctx, cancel := stopper.WithCancelOnStop(context.Background())
		defer cancel()
		ctx, sp := ambient.AnnotateCtxWithSpan(ctx, "liveness heartbeat loop")
		defer sp.Finish()

		incrementEpoch := true
		ticker := time.NewTicker(nl.heartbeatInterval)
		defer ticker.Stop()
		for {
			select {
			case <-nl.heartbeatToken:
			case <-stopper.ShouldStop():
				return
			}
			// Give the context a timeout approximately as long as the time we
			// have left before our liveness entry expires.
			if err := contextutil.RunWithTimeout(ctx, "node liveness heartbeat", nl.livenessThreshold-nl.heartbeatInterval,
				func(ctx context.Context) error {
					// Retry heartbeat in the event the conditional put fails.
					for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
						liveness, err := nl.Self()
						if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
							log.Errorf(ctx, "unexpected error getting liveness: %+v", err)
						}
						if err := nl.heartbeatInternal(ctx, liveness, incrementEpoch); err != nil {
							if errors.Is(err, ErrEpochIncremented) {
								log.Infof(ctx, "%s; retrying", err)
								continue
							}
							return err
						}
						incrementEpoch = false // don't increment epoch after first heartbeat
						break
					}
					return nil
				}); err != nil {
				log.Warningf(ctx, "failed node liveness heartbeat: %+v", err)
			}

			nl.heartbeatToken <- struct{}{}
			select {
			case <-ticker.C:
			case <-stopper.ShouldStop():
				return
			}
		}
	})
}
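// Illustrative sketch (not part of the original file): wiring StartHeartbeat
// at node startup. The stopper, engines, and callback here stand in for what
// pkg/server provides; only the StartHeartbeat signature is taken from this
// file.
func exampleStartHeartbeat(
	ctx context.Context, nl *NodeLiveness, stopper *stop.Stopper, engines []storage.Engine,
) {
	nl.StartHeartbeat(ctx, stopper, engines, func(ctx context.Context) {
		// Invoked after every successful liveness update; see heartbeatCallback.
		log.VEventf(ctx, 2, "liveness record refreshed")
	})
}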
// PauseHeartbeat stops or restarts the periodic heartbeat depending on the
// pause parameter. When pause is true, waits until it acquires the heartbeatToken
// (unless heartbeat was already paused); this ensures that no heartbeats happen
// after this is called. This function is only safe for use in tests.
func (nl *NodeLiveness) PauseHeartbeat(pause bool) {
	if pause {
		if swapped := atomic.CompareAndSwapUint32(&nl.heartbeatPaused, 0, 1); swapped {
			<-nl.heartbeatToken
		}
	} else {
		if swapped := atomic.CompareAndSwapUint32(&nl.heartbeatPaused, 1, 0); swapped {
			nl.heartbeatToken <- struct{}{}
		}
	}
}

// DisableAllHeartbeatsForTest disables all node liveness heartbeats, including
// those triggered from outside the normal StartHeartbeat loop. Returns a
// closure to call to re-enable heartbeats. Only safe for use in tests.
func (nl *NodeLiveness) DisableAllHeartbeatsForTest() func() {
	nl.PauseHeartbeat(true)
	nl.selfSem <- struct{}{}
	nl.otherSem <- struct{}{}
	return func() {
		<-nl.selfSem
		<-nl.otherSem
	}
}
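// Illustrative sketch (not part of the original file): how a test might freeze
// liveness updates around an assertion and then restore them.
func exampleDisableHeartbeatsForTest(nl *NodeLiveness) {
	enableAgain := nl.DisableAllHeartbeatsForTest()
	defer enableAgain()
	// While disabled, neither the StartHeartbeat loop nor synchronous callers
	// (Heartbeat, IncrementEpoch) can update liveness, because the heartbeat
	// token and both semaphores are held.
}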
var errNodeAlreadyLive = errors.New("node already live")

// Heartbeat is called to update a node's expiration timestamp. This
// method does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map.
//
// The liveness argument is the expected previous value of this node's
// liveness.
//
// If this method returns nil, the node's liveness has been extended,
// relative to the previous value. It may or may not still be alive
// when this method returns.
//
// On failure, this method returns ErrEpochIncremented, although this
// may not necessarily mean that the epoch was actually incremented.
// TODO(bdarnell): Fix error semantics here.
//
// This method is rarely called directly; heartbeats are normally sent
// by the StartHeartbeat loop.
// TODO(bdarnell): Should we just remove this synchronous heartbeat completely?
func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness kvserverpb.Liveness) error {
	return nl.heartbeatInternal(ctx, liveness, false /* increment epoch */)
}

func (nl *NodeLiveness) heartbeatInternal(
	ctx context.Context, liveness kvserverpb.Liveness, incrementEpoch bool,
) error {
	ctx, sp := tracing.EnsureChildSpan(ctx, nl.ambientCtx.Tracer, "liveness heartbeat")
	defer sp.Finish()
	defer func(start time.Time) {
		dur := timeutil.Now().Sub(start)
		nl.metrics.HeartbeatLatency.RecordValue(dur.Nanoseconds())
		if dur > time.Second {
			log.Warningf(ctx, "slow heartbeat took %0.1fs", dur.Seconds())
		}
	}(timeutil.Now())

	// Allow only one heartbeat at a time.
	nodeID := nl.gossip.NodeID.Get()
	sem := nl.sem(nodeID)
	select {
	case sem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() {
		<-sem
	}()

	update := livenessUpdate{
		Liveness: kvserverpb.Liveness{
			NodeID: nodeID,
			Epoch:  1,
		},
	}
	if liveness != (kvserverpb.Liveness{}) {
		update.Liveness = liveness
		if incrementEpoch {
			update.Epoch++
			// Clear draining field.
			update.Draining = false
		}
	}
	// We need to add the maximum clock offset to the expiration because it's
	// used when determining liveness for a node.
	{
		update.Expiration = hlc.LegacyTimestamp(
			nl.clock.Now().Add((nl.livenessThreshold).Nanoseconds(), 0))
		// This guards against the system clock moving backwards. As long
		// as the cockroach process is running, checks inside hlc.Clock
		// will ensure that the clock never moves backwards, but these
		// checks don't work across process restarts.
		if update.Expiration.Less(liveness.Expiration) {
			return errors.Errorf("proposed liveness update expires earlier than previous record")
		}
	}
	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
		// Update liveness to actual value on mismatch.
		nl.maybeUpdate(actual)
		// If the actual liveness is different than expected, but is
		// considered live, treat the heartbeat as a success. This can
		// happen when the periodic heartbeater races with a concurrent
		// lease acquisition.
		//
		// TODO(bdarnell): If things are very slow, the new liveness may
		// have already expired and we'd incorrectly return
		// ErrEpochIncremented. Is this check even necessary? The common
		// path through this method doesn't check whether the liveness
		// expired while in flight, so maybe we don't have to care about
		// that and only need to distinguish between same and different
		// epochs in our return value.
		if actual.IsLive(nl.clock.Now().GoTime()) && !incrementEpoch {
			return errNodeAlreadyLive
		}
		// Otherwise, return error.
		return ErrEpochIncremented
	}); err != nil {
		if errors.Is(err, errNodeAlreadyLive) {
			nl.metrics.HeartbeatSuccesses.Inc(1)
			return nil
		}
		nl.metrics.HeartbeatFailures.Inc(1)
		return err
	}

	log.VEventf(ctx, 1, "heartbeat %+v", update.Expiration)
	nl.maybeUpdate(update.Liveness)
	nl.metrics.HeartbeatSuccesses.Inc(1)
	return nil
}

// Self returns the liveness record for this node. ErrNoLivenessRecord
// is returned in the event that the node has neither heartbeated its
// liveness record successfully nor received a gossip message containing
// a former liveness update on restart.
func (nl *NodeLiveness) Self() (kvserverpb.Liveness, error) {
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	return nl.getLivenessLocked(nl.gossip.NodeID.Get())
}

// IsLiveMapEntry encapsulates data about current liveness for a
// node.
type IsLiveMapEntry struct {
	IsLive bool
	Epoch  int64
}

// IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry

// GetIsLiveMap returns a map of nodeID to boolean liveness status of
// each node. This excludes nodes that were removed completely (dead +
// decommissioning).
func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap {
	lMap := IsLiveMap{}
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	now := nl.clock.Now().GoTime()
	for nID, l := range nl.mu.nodes {
		isLive := l.IsLive(now)
		if !isLive && l.Decommissioning {
			// This is a node that was completely removed. Skip over it.
			continue
		}
		lMap[nID] = IsLiveMapEntry{
			IsLive: isLive,
			Epoch:  l.Epoch,
		}
	}
	return lMap
}

// GetLivenesses returns a slice containing the liveness status of
// every node on the cluster known to gossip. Callers should consider
// calling (statusServer).NodesWithLiveness() instead where possible.
func (nl *NodeLiveness) GetLivenesses() []kvserverpb.Liveness {
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	livenesses := make([]kvserverpb.Liveness, 0, len(nl.mu.nodes))
	for _, l := range nl.mu.nodes {
		livenesses = append(livenesses, l)
	}
	return livenesses
}
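// Illustrative sketch (not part of the original file): consuming GetIsLiveMap
// to log each known node's liveness and epoch. The function name is
// hypothetical.
func exampleLogLiveNodes(ctx context.Context, nl *NodeLiveness) {
	for nodeID, entry := range nl.GetIsLiveMap() {
		log.Infof(ctx, "n%d: live=%t, epoch=%d", nodeID, entry.IsLive, entry.Epoch)
	}
}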
// GetLiveness returns the liveness record for the specified nodeID.
// ErrNoLivenessRecord is returned in the event that nothing is yet
// known about nodeID via liveness gossip.
func (nl *NodeLiveness) GetLiveness(nodeID roachpb.NodeID) (kvserverpb.Liveness, error) {
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	return nl.getLivenessLocked(nodeID)
}

func (nl *NodeLiveness) getLivenessLocked(nodeID roachpb.NodeID) (kvserverpb.Liveness, error) {
	if l, ok := nl.mu.nodes[nodeID]; ok {
		return l, nil
	}
	return kvserverpb.Liveness{}, ErrNoLivenessRecord
}

// IncrementEpoch is called to attempt to revoke another node's
// current epoch, causing an expiration of all its leases. This method
// does a conditional put on the node liveness record, and if
// successful, stores the updated liveness record in the nodes map. If
// this method is called on a node ID which is considered live
// according to the most recent information gathered through gossip,
// an error is returned.
//
// The liveness argument is used as the expected value on the
// conditional put. If this method returns nil, there was a match and
// the epoch has been incremented. This means that the expiration time
// in the supplied liveness accurately reflects the time at which the
// epoch ended.
//
// If this method returns ErrEpochAlreadyIncremented, the epoch has
// already been incremented past the one in the liveness argument, but
// the conditional put did not find a match. This means that another
// node performed a successful IncrementEpoch, but we can't tell at
// what time the epoch actually ended. (Usually when multiple
// IncrementEpoch calls race, they're using the same expected value.
// But when there is a severe backlog, it's possible for one increment
// to get stuck in a queue long enough for the dead node to make
// another successful heartbeat, and a second increment to come in
// after that)
func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness kvserverpb.Liveness) error {
	// Allow only one increment at a time.
	sem := nl.sem(liveness.NodeID)
	select {
	case sem <- struct{}{}:
	case <-ctx.Done():
		return ctx.Err()
	}
	defer func() {
		<-sem
	}()

	if liveness.IsLive(nl.clock.Now().GoTime()) {
		return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
	}
	update := livenessUpdate{Liveness: liveness}
	update.Epoch++
	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
		defer nl.maybeUpdate(actual)
		if actual.Epoch > liveness.Epoch {
			return ErrEpochAlreadyIncremented
		} else if actual.Epoch < liveness.Epoch {
			return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
		}
		return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
	}); err != nil {
		return err
	}

	log.Infof(ctx, "incremented n%d liveness epoch to %d", update.NodeID, update.Epoch)
	nl.maybeUpdate(update.Liveness)
	nl.metrics.EpochIncrements.Inc(1)
	return nil
}

// Metrics returns a struct which contains metrics related to node
// liveness activity.
func (nl *NodeLiveness) Metrics() LivenessMetrics {
	return nl.metrics
}
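// Illustrative sketch (not part of the original file): revoking the epoch of a
// node that is believed dead, treating ErrEpochAlreadyIncremented as success
// since some other node already did the work. The wrapper is hypothetical.
func exampleRevokeEpoch(ctx context.Context, nl *NodeLiveness, liveness kvserverpb.Liveness) error {
	if err := nl.IncrementEpoch(ctx, liveness); err != nil {
		if errors.Is(err, ErrEpochAlreadyIncremented) {
			return nil
		}
		return err
	}
	return nil
}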
// RegisterCallback registers a callback to be invoked any time a
// node's IsLive() state changes to true.
func (nl *NodeLiveness) RegisterCallback(cb IsLiveCallback) {
	nl.mu.Lock()
	defer nl.mu.Unlock()
	nl.mu.callbacks = append(nl.mu.callbacks, cb)
}

// updateLiveness does a conditional put on the node liveness record for the
// node specified by the update. In the event that the conditional put fails, and
// the handleCondFailed callback is not nil, it's invoked with the actual node
// liveness record and nil is returned for an error. If handleCondFailed is nil,
// any conditional put failure is returned as an error to the caller. The
// conditional put is done as a 1PC transaction with a ModifiedSpanTrigger which
// indicates the node liveness record that the range leader should gossip on
// commit.
//
// updateLiveness retries, rather than returning, certain errors that are
// expected to occur sporadically, such as TransactionStatusError (due to the
// 1PC requirement of the liveness txn) and AmbiguousResultError.
func (nl *NodeLiveness) updateLiveness(
	ctx context.Context,
	update livenessUpdate,
	oldLiveness kvserverpb.Liveness,
	handleCondFailed func(actual kvserverpb.Liveness) error,
) error {
	for {
		// Before each attempt, ensure that the context has not expired.
		if err := ctx.Err(); err != nil {
			return err
		}

		nl.mu.RLock()
		engines := nl.mu.engines
		nl.mu.RUnlock()
		for _, eng := range engines {
			// We synchronously write to all disks before updating liveness because we
			// don't want any excessively slow disks to prevent leases from being
			// shifted to other nodes. A slow/stalled disk would block here and cause
			// the node to lose its leases.
			if err := storage.WriteSyncNoop(ctx, eng); err != nil {
				return errors.Wrapf(err, "couldn't update node liveness because disk write failed")
			}
		}
		if err := nl.updateLivenessAttempt(ctx, update, oldLiveness, handleCondFailed); err != nil {
			// Intentionally don't errors.Cause() the error, or we'd hop past errRetryLiveness.
			if errors.HasType(err, (*errRetryLiveness)(nil)) {
				log.Infof(ctx, "retrying liveness update after %s", err)
				continue
			}
			return err
		}
		return nil
	}
}
func (nl *NodeLiveness) updateLivenessAttempt(
	ctx context.Context,
	update livenessUpdate,
	oldLiveness kvserverpb.Liveness,
	handleCondFailed func(actual kvserverpb.Liveness) error,
) error {
	// First check the existing liveness map to avoid known conditional
	// put failures.
	if !update.ignoreCache {
		l, err := nl.GetLiveness(update.NodeID)
		if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
			return err
		}
		if err == nil && l != oldLiveness {
			return handleCondFailed(l)
		}
	}

	if err := nl.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		b := txn.NewBatch()
		key := keys.NodeLivenessKey(update.NodeID)
		val := update.Liveness
		if oldLiveness == (kvserverpb.Liveness{}) {
			b.CPut(key, &val, nil)
		} else {
			expVal := oldLiveness
			// TODO(andrei): Plumb along oldLiveness as the raw bytes we read from the
			// database, not as a proto, so that the proto's encoding can change. See
			// #38308. If we do that, we can remove Liveness from belowRaftProtos.
			b.CPutDeprecated(key, &val, &expVal)
		}
		// Use a trigger on EndTxn to indicate that node liveness should be
		// re-gossiped. Further, require that this transaction complete as a one
		// phase commit to eliminate the possibility of leaving write intents.
		b.AddRawRequest(&roachpb.EndTxnRequest{
			Commit:     true,
			Require1PC: true,
			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
				ModifiedSpanTrigger: &roachpb.ModifiedSpanTrigger{
					NodeLivenessSpan: &roachpb.Span{
						Key:    key,
						EndKey: key.Next(),
					},
				},
			},
		})
		return txn.Run(ctx, b)
	}); err != nil {
		if tErr := (*roachpb.ConditionFailedError)(nil); errors.As(err, &tErr) {
			if handleCondFailed != nil {
				if tErr.ActualValue == nil {
					return handleCondFailed(kvserverpb.Liveness{})
				}
				var actualLiveness kvserverpb.Liveness
				if err := tErr.ActualValue.GetProto(&actualLiveness); err != nil {
					return errors.Wrapf(err, "couldn't update node liveness from CPut actual value")
				}
				return handleCondFailed(actualLiveness)
			}
		} else if errors.HasType(err, (*roachpb.TransactionStatusError)(nil)) ||
			errors.HasType(err, (*roachpb.AmbiguousResultError)(nil)) {
			return &errRetryLiveness{err}
		}
		return err
	}

	nl.mu.RLock()
	cb := nl.mu.heartbeatCallback
	nl.mu.RUnlock()
	if cb != nil {
		cb(ctx)
	}
	return nil
}

// maybeUpdate replaces the liveness (if it appears newer) and invokes the
// registered callbacks if the node became live in the process.
func (nl *NodeLiveness) maybeUpdate(new kvserverpb.Liveness) {
	nl.mu.Lock()
	// Note that this works fine even if `old` is empty.
	old := nl.mu.nodes[new.NodeID]
	should := shouldReplaceLiveness(old, new)
	var callbacks []IsLiveCallback
	if should {
		nl.mu.nodes[new.NodeID] = new
		callbacks = append(callbacks, nl.mu.callbacks...)
	}
	nl.mu.Unlock()

	if !should {
		return
	}

	now := nl.clock.Now().GoTime()
	if !old.IsLive(now) && new.IsLive(now) {
		for _, fn := range callbacks {
			fn(new.NodeID)
		}
	}
}

func shouldReplaceLiveness(old, new kvserverpb.Liveness) bool {
	if (old == kvserverpb.Liveness{}) {
		return true
	}

	// Compare Epoch first; if there is no change there, compare Expiration.
	if old.Epoch != new.Epoch {
		return old.Epoch < new.Epoch
	}
	if old.Expiration != new.Expiration {
		return old.Expiration.Less(new.Expiration)
	}

	// If Epoch and Expiration are unchanged, assume that the update is newer
	// when its draining or decommissioning field changed.
	//
	// This has false positives (in which case we're clobbering the liveness). A
	// better way to handle liveness updates in general is to add a sequence
	// number.
	//
	// See #18219.
	return old.Draining != new.Draining || old.Decommissioning != new.Decommissioning
}

// livenessGossipUpdate is the gossip callback used to keep the
// in-memory liveness info up to date.
func (nl *NodeLiveness) livenessGossipUpdate(key string, content roachpb.Value) {
	var liveness kvserverpb.Liveness
	if err := content.GetProto(&liveness); err != nil {
		log.Errorf(context.TODO(), "%v", err)
		return
	}

	nl.maybeUpdate(liveness)
}
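// Illustrative sketch (not part of the original file): the ordering rules that
// shouldReplaceLiveness applies, shown on hand-built records. The timestamps
// are arbitrary; only their relative ordering matters.
func exampleShouldReplaceLiveness() {
	old := kvserverpb.Liveness{NodeID: 1, Epoch: 2, Expiration: hlc.LegacyTimestamp{WallTime: 100}}

	later := old
	later.Expiration.WallTime = 200
	fmt.Println(shouldReplaceLiveness(old, later)) // true: same epoch, later expiration

	bumped := old
	bumped.Epoch = 3
	fmt.Println(shouldReplaceLiveness(old, bumped)) // true: higher epoch wins

	draining := old
	draining.Draining = true
	fmt.Println(shouldReplaceLiveness(old, draining)) // true: flag change with equal epoch/expiration
}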
// numLiveNodes is used to populate a metric that tracks the number of live
// nodes in the cluster. Returns 0 if this node is not itself live, to avoid
// reporting potentially inaccurate data.
// We export this metric from every live node rather than a single particular
// live node because liveness information is gossiped and thus may be stale.
// That staleness could result in no nodes reporting the metric or multiple
// nodes reporting the metric, so it's simplest to just have all live nodes
// report it.
func (nl *NodeLiveness) numLiveNodes() int64 {
	ctx := nl.ambientCtx.AnnotateCtx(context.Background())

	selfID := nl.gossip.NodeID.Get()
	if selfID == 0 {
		return 0
	}

	nl.mu.RLock()
	defer nl.mu.RUnlock()

	self, err := nl.getLivenessLocked(selfID)
	if errors.Is(err, ErrNoLivenessRecord) {
		return 0
	}
	if err != nil {
		log.Warningf(ctx, "looking up own liveness: %+v", err)
		return 0
	}
	now := nl.clock.Now().GoTime()
	// If this node isn't live, we don't want to report its view of node liveness
	// because it's more likely to be inaccurate than the view of a live node.
	if !self.IsLive(now) {
		return 0
	}
	var liveNodes int64
	for _, l := range nl.mu.nodes {
		if l.IsLive(now) {
			liveNodes++
		}
	}
	return liveNodes
}

// AsLiveClock returns a closedts.LiveClockFn that takes a current timestamp off
// the clock and returns it, along with the node's current epoch, only if node
// liveness indicates that the node is live at that timestamp.
func (nl *NodeLiveness) AsLiveClock() closedts.LiveClockFn {
	return func(nodeID roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
		now := nl.clock.Now()
		liveness, err := nl.GetLiveness(nodeID)
		if err != nil {
			return hlc.Timestamp{}, 0, err
		}
		if !liveness.IsLive(now.GoTime()) {
			return hlc.Timestamp{}, 0, errLiveClockNotLive
		}
		return now, ctpb.Epoch(liveness.Epoch), nil
	}
}

// GetNodeCount returns a count of the number of nodes in the cluster,
// including dead nodes, but excluding decommissioning or decommissioned nodes.
func (nl *NodeLiveness) GetNodeCount() int {
	nl.mu.RLock()
	defer nl.mu.RUnlock()
	var count int
	for _, l := range nl.mu.nodes {
		if !l.Decommissioning {
			count++
		}
	}
	return count
}
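// Illustrative sketch (not part of the original file): using AsLiveClock to
// obtain a clock reading that is only returned while the given node's liveness
// is current. The caller and node ID are hypothetical.
func exampleAsLiveClock(nl *NodeLiveness, nodeID roachpb.NodeID) {
	liveClock := nl.AsLiveClock()
	if now, epoch, err := liveClock(nodeID); err == nil {
		fmt.Printf("n%d live at %s in epoch %d\n", nodeID, now, epoch)
	}
}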