github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/node_liveness.go

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sync/atomic"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/gossip"
    21  	"github.com/cockroachdb/cockroach/pkg/keys"
    22  	"github.com/cockroachdb/cockroach/pkg/kv"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
    24  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    28  	"github.com/cockroachdb/cockroach/pkg/storage"
    29  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    30  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    31  	"github.com/cockroachdb/cockroach/pkg/util/log"
    32  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    33  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    34  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    35  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    38  	"github.com/cockroachdb/errors"
    39  )
    40  
    41  var (
    42  	// ErrNoLivenessRecord is returned when asking for liveness information
    43  	// about a node for which nothing is known.
    44  	ErrNoLivenessRecord = errors.New("node not in the liveness table")
    45  
    46  	errChangeDecommissioningFailed = errors.New("failed to change the decommissioning status")
    47  
    48  	// ErrEpochIncremented is returned when a heartbeat request fails because
    49  	// the underlying liveness record has had its epoch incremented.
    50  	ErrEpochIncremented = errors.New("heartbeat failed on epoch increment")
    51  
    52  	// ErrEpochAlreadyIncremented is returned by IncrementEpoch when
    53  	// someone else has already incremented the epoch to the desired
    54  	// value.
    55  	ErrEpochAlreadyIncremented = errors.New("epoch already incremented")
    56  
    57  	errLiveClockNotLive = errors.New("not live")
    58  )
    59  
    60  type errRetryLiveness struct {
    61  	error
    62  }
    63  
    64  func (e *errRetryLiveness) Cause() error {
    65  	return e.error
    66  }
    67  
    68  func (e *errRetryLiveness) Error() string {
    69  	return fmt.Sprintf("%T: %s", *e, e.error)
    70  }
    71  
    72  // Node liveness metric metadata.
    73  var (
    74  	metaLiveNodes = metric.Metadata{
    75  		Name:        "liveness.livenodes",
    76  		Help:        "Number of live nodes in the cluster (will be 0 if this node is not itself live)",
    77  		Measurement: "Nodes",
    78  		Unit:        metric.Unit_COUNT,
    79  	}
    80  	metaHeartbeatSuccesses = metric.Metadata{
    81  		Name:        "liveness.heartbeatsuccesses",
    82  		Help:        "Number of successful node liveness heartbeats from this node",
    83  		Measurement: "Messages",
    84  		Unit:        metric.Unit_COUNT,
    85  	}
    86  	metaHeartbeatFailures = metric.Metadata{
    87  		Name:        "liveness.heartbeatfailures",
    88  		Help:        "Number of failed node liveness heartbeats from this node",
    89  		Measurement: "Messages",
    90  		Unit:        metric.Unit_COUNT,
    91  	}
    92  	metaEpochIncrements = metric.Metadata{
    93  		Name:        "liveness.epochincrements",
    94  		Help:        "Number of times this node has incremented its liveness epoch",
    95  		Measurement: "Epochs",
    96  		Unit:        metric.Unit_COUNT,
    97  	}
    98  	metaHeartbeatLatency = metric.Metadata{
    99  		Name:        "liveness.heartbeatlatency",
   100  		Help:        "Node liveness heartbeat latency",
   101  		Measurement: "Latency",
   102  		Unit:        metric.Unit_NANOSECONDS,
   103  	}
   104  )
   105  
   106  // LivenessMetrics holds metrics for use with node liveness activity.
   107  type LivenessMetrics struct {
   108  	LiveNodes          *metric.Gauge
   109  	HeartbeatSuccesses *metric.Counter
   110  	HeartbeatFailures  *metric.Counter
   111  	EpochIncrements    *metric.Counter
   112  	HeartbeatLatency   *metric.Histogram
   113  }
   114  
   115  // IsLiveCallback is invoked when a node's IsLive state changes to true.
   116  // Callbacks can be registered via NodeLiveness.RegisterCallback().
   117  type IsLiveCallback func(nodeID roachpb.NodeID)
   118  
   119  // HeartbeatCallback is invoked whenever this node updates its own liveness status,
   120  // indicating that it is alive.
   121  type HeartbeatCallback func(context.Context)
   122  
   123  // NodeLiveness is a centralized failure detector that coordinates
   124  // with the epoch-based range system to provide for leases of
   125  // indefinite length (replacing frequent per-range lease renewals with
   126  // heartbeats to the liveness system).
   127  //
   128  // It is also used as a general-purpose failure detector, but it is
   129  // not ideal for this purpose. It is inefficient due to the use of
   130  // replicated durable writes, and is not very sensitive (it primarily
   131  // tests connectivity from the node to the liveness range; a node with
   132  // a failing disk could still be considered live by this system).
   133  //
   134  // The persistent state of node liveness is stored in the KV layer,
   135  // near the beginning of the keyspace. These are normal MVCC keys,
   136  // written by CPut operations in 1PC transactions (the use of
   137  // transactions and MVCC is regretted because it means that the
   138  // liveness span depends on MVCC GC and can get overwhelmed if GC is
   139  // not working. Transactions were used only to piggyback on the
   140  // transaction commit trigger). The leaseholder of the liveness range
   141  // gossips its contents whenever they change (only the changed
   142  // portion); other nodes rarely read from this range directly.
   143  //
   144  // The use of conditional puts is crucial to maintain the guarantees
   145  // needed by epoch-based leases. Both the Heartbeat and IncrementEpoch
   146  // on this type require an expected value to be passed in; see
   147  // comments on those methods for more.
   148  //
   149  // TODO(bdarnell): Also document interaction with draining and decommissioning.
   150  type NodeLiveness struct {
   151  	ambientCtx        log.AmbientContext
   152  	clock             *hlc.Clock
   153  	db                *kv.DB
   154  	gossip            *gossip.Gossip
   155  	livenessThreshold time.Duration
   156  	heartbeatInterval time.Duration
   157  	selfSem           chan struct{}
   158  	st                *cluster.Settings
   159  	otherSem          chan struct{}
   160  	// heartbeatPaused contains an atomically-swapped number representing a bool
   161  	// (1 or 0). heartbeatToken is a channel containing a token which is taken
   162  	// when heartbeating or when pausing the heartbeat. Used for testing.
   163  	heartbeatPaused uint32
   164  	heartbeatToken  chan struct{}
   165  	metrics         LivenessMetrics
   166  
   167  	mu struct {
   168  		syncutil.RWMutex
   169  		callbacks         []IsLiveCallback
   170  		nodes             map[roachpb.NodeID]kvserverpb.Liveness
   171  		heartbeatCallback HeartbeatCallback
   172  		// Before heartbeating, we write to each of these engines to avoid
    173  	// maintaining liveness when a local disk is stalled.
   174  		engines []storage.Engine
   175  	}
   176  }
   177  
   178  // NewNodeLiveness returns a new instance of NodeLiveness configured
   179  // with the specified gossip instance.
   180  func NewNodeLiveness(
   181  	ambient log.AmbientContext,
   182  	clock *hlc.Clock,
   183  	db *kv.DB,
   184  	g *gossip.Gossip,
   185  	livenessThreshold time.Duration,
   186  	renewalDuration time.Duration,
   187  	st *cluster.Settings,
   188  	histogramWindow time.Duration,
   189  ) *NodeLiveness {
   190  	nl := &NodeLiveness{
   191  		ambientCtx:        ambient,
   192  		clock:             clock,
   193  		db:                db,
   194  		gossip:            g,
   195  		livenessThreshold: livenessThreshold,
   196  		heartbeatInterval: livenessThreshold - renewalDuration,
   197  		selfSem:           make(chan struct{}, 1),
   198  		st:                st,
   199  		otherSem:          make(chan struct{}, 1),
   200  		heartbeatToken:    make(chan struct{}, 1),
   201  	}
   202  	nl.metrics = LivenessMetrics{
   203  		LiveNodes:          metric.NewFunctionalGauge(metaLiveNodes, nl.numLiveNodes),
   204  		HeartbeatSuccesses: metric.NewCounter(metaHeartbeatSuccesses),
   205  		HeartbeatFailures:  metric.NewCounter(metaHeartbeatFailures),
   206  		EpochIncrements:    metric.NewCounter(metaEpochIncrements),
   207  		HeartbeatLatency:   metric.NewLatency(metaHeartbeatLatency, histogramWindow),
   208  	}
   209  	nl.mu.nodes = map[roachpb.NodeID]kvserverpb.Liveness{}
   210  	nl.heartbeatToken <- struct{}{}
   211  
   212  	livenessRegex := gossip.MakePrefixPattern(gossip.KeyNodeLivenessPrefix)
   213  	nl.gossip.RegisterCallback(livenessRegex, nl.livenessGossipUpdate)
   214  
   215  	return nl
   216  }
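
// exampleStartLiveness is an illustrative sketch of wiring NodeLiveness into
// node startup: construct it with handles the server already owns, then start
// the periodic heartbeat loop. The durations below are arbitrary placeholder
// values rather than the cluster's real defaults, and the nil HeartbeatCallback
// means no extra work runs on each successful heartbeat.
func exampleStartLiveness(
	ctx context.Context,
	ambient log.AmbientContext,
	clock *hlc.Clock,
	db *kv.DB,
	g *gossip.Gossip,
	st *cluster.Settings,
	stopper *stop.Stopper,
	engines []storage.Engine, // must be non-empty; StartHeartbeat fatals otherwise
) *NodeLiveness {
	nl := NewNodeLiveness(
		ambient, clock, db, g,
		9*time.Second, // livenessThreshold: a record expires this long after its last heartbeat
		4500*time.Millisecond, // renewalDuration: renew when this much time remains before expiry
		st,
		10*time.Second, // histogramWindow for the heartbeat latency histogram
	)
	nl.StartHeartbeat(ctx, stopper, engines, nil /* alive */)
	return nl
}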
   217  
   218  var errNodeDrainingSet = errors.New("node is already draining")
   219  
   220  func (nl *NodeLiveness) sem(nodeID roachpb.NodeID) chan struct{} {
   221  	if nodeID == nl.gossip.NodeID.Get() {
   222  		return nl.selfSem
   223  	}
   224  	return nl.otherSem
   225  }
   226  
   227  // SetDraining attempts to update this node's liveness record to put itself
   228  // into the draining state.
   229  //
   230  // The reporter callback, if non-nil, is called on a best effort basis
   231  // to report work that needed to be done and which may or may not have
   232  // been done by the time this call returns. See the explanation in
   233  // pkg/server/drain.go for details.
   234  func (nl *NodeLiveness) SetDraining(ctx context.Context, drain bool, reporter func(int, string)) {
   235  	ctx = nl.ambientCtx.AnnotateCtx(ctx)
   236  	for r := retry.StartWithCtx(ctx, base.DefaultRetryOptions()); r.Next(); {
   237  		liveness, err := nl.Self()
   238  		if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
   239  			log.Errorf(ctx, "unexpected error getting liveness: %+v", err)
   240  		}
   241  		err = nl.setDrainingInternal(ctx, liveness, drain, reporter)
   242  		if err != nil {
   243  			if log.V(1) {
   244  				log.Infof(ctx, "attempting to set liveness draining status to %v: %v", drain, err)
   245  			}
   246  			continue
   247  		}
   248  		return
   249  	}
   250  }
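
// exampleDrainSelf is an illustrative sketch of flagging this node as draining
// ahead of a graceful shutdown; the reporter callback simply logs whatever work
// SetDraining tells it about.
func exampleDrainSelf(ctx context.Context, nl *NodeLiveness) {
	nl.SetDraining(ctx, true /* drain */, func(remaining int, what string) {
		log.Infof(ctx, "draining: %d %s", remaining, what)
	})
}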
   251  
    252  // SetDecommissioning runs a best-effort attempt at marking the liveness
    253  // record as decommissioning. It returns whether the function committed a
   254  // transaction that updated the liveness record.
   255  func (nl *NodeLiveness) SetDecommissioning(
   256  	ctx context.Context, nodeID roachpb.NodeID, decommission bool,
   257  ) (changeCommitted bool, err error) {
   258  	ctx = nl.ambientCtx.AnnotateCtx(ctx)
   259  
   260  	attempt := func() (bool, error) {
   261  		// Allow only one decommissioning attempt in flight per node at a time.
   262  		// This is required for correct results since we may otherwise race with
   263  		// concurrent `IncrementEpoch` calls and get stuck in a situation in
    264  	// which the cached liveness has decommissioning=false while it's
   265  		// really true, and that means that SetDecommissioning becomes a no-op
   266  		// (which is correct) but that our cached liveness never updates to
   267  		// reflect that.
   268  		//
   269  		// See https://github.com/cockroachdb/cockroach/issues/17995.
   270  		sem := nl.sem(nodeID)
   271  		select {
   272  		case sem <- struct{}{}:
   273  		case <-ctx.Done():
   274  			return false, ctx.Err()
   275  		}
   276  		defer func() {
   277  			<-sem
   278  		}()
   279  
   280  		// We need the current liveness in each iteration.
   281  		//
   282  		// We ignore any liveness record in Gossip because we may have to fall back
   283  		// to the KV store anyway. The scenario in which this is needed is:
   284  		// - kill node 2 and stop node 1
   285  		// - wait for node 2's liveness record's Gossip entry to expire on all surviving nodes
   286  		// - restart node 1; it'll never see node 2 in `GetLiveness` unless the whole
   287  		//   node liveness span gets regossiped (unlikely if it wasn't the lease holder
   288  		//   for that span)
   289  		// - can't decommission node 2 from node 1 without KV fallback.
   290  		//
   291  		// See #20863.
   292  		//
   293  		// NB: this also de-flakes TestNodeLivenessDecommissionAbsent; running
   294  		// decommissioning commands in a tight loop on different nodes sometimes
   295  		// results in unintentional no-ops (due to the Gossip lag); this could be
   296  		// observed by users in principle, too.
   297  		//
    298  	// TODO(bdarnell): This is the one place where a node other than
   299  		// the leaseholder reads from this range. Should this read from
   300  		// gossip instead? (I have vague concerns about concurrent reads
   301  		// and timestamp cache pushes causing problems here)
   302  		var oldLiveness kvserverpb.Liveness
   303  		if err := nl.db.GetProto(ctx, keys.NodeLivenessKey(nodeID), &oldLiveness); err != nil {
   304  			return false, errors.Wrap(err, "unable to get liveness")
   305  		}
   306  		if (oldLiveness == kvserverpb.Liveness{}) {
   307  			return false, ErrNoLivenessRecord
   308  		}
   309  
   310  		// We may have discovered a Liveness not yet received via Gossip. Offer it
   311  		// to make sure that when we actually try to update the liveness, the
   312  		// previous view is correct. This, too, is required to de-flake
   313  		// TestNodeLivenessDecommissionAbsent.
   314  		nl.maybeUpdate(oldLiveness)
   315  
   316  		return nl.setDecommissioningInternal(ctx, nodeID, oldLiveness, decommission)
   317  	}
   318  
   319  	for {
   320  		changeCommitted, err := attempt()
   321  		if errors.Is(err, errChangeDecommissioningFailed) {
   322  			continue // expected when epoch incremented
   323  		}
   324  		return changeCommitted, err
   325  	}
   326  }
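
// exampleDecommission is an illustrative sketch of marking another node as
// decommissioning via SetDecommissioning; the target node ID is an arbitrary
// placeholder.
func exampleDecommission(ctx context.Context, nl *NodeLiveness) error {
	const target = roachpb.NodeID(2) // hypothetical node being removed from the cluster
	committed, err := nl.SetDecommissioning(ctx, target, true /* decommission */)
	if err != nil {
		return err
	}
	if !committed {
		// The liveness record already had decommissioning=true, so no
		// transaction needed to be committed.
		log.Infof(ctx, "n%d was already marked as decommissioning", target)
	}
	return nil
}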
   327  
   328  func (nl *NodeLiveness) setDrainingInternal(
   329  	ctx context.Context, liveness kvserverpb.Liveness, drain bool, reporter func(int, string),
   330  ) error {
   331  	nodeID := nl.gossip.NodeID.Get()
   332  	sem := nl.sem(nodeID)
   333  	// Allow only one attempt to set the draining field at a time.
   334  	select {
   335  	case sem <- struct{}{}:
   336  	case <-ctx.Done():
   337  		return ctx.Err()
   338  	}
   339  	defer func() {
   340  		<-sem
   341  	}()
   342  
   343  	update := livenessUpdate{
   344  		Liveness: kvserverpb.Liveness{
   345  			NodeID: nodeID,
   346  			Epoch:  1,
   347  		},
   348  	}
   349  	if liveness != (kvserverpb.Liveness{}) {
   350  		update.Liveness = liveness
   351  	}
   352  	if reporter != nil && drain && !update.Draining {
   353  		// Report progress to the Drain RPC.
   354  		reporter(1, "liveness record")
   355  	}
   356  	update.Draining = drain
   357  	update.ignoreCache = true
   358  
   359  	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
   360  		nl.maybeUpdate(actual)
   361  		if actual.Draining == update.Draining {
   362  			return errNodeDrainingSet
   363  		}
   364  		return errors.New("failed to update liveness record")
   365  	}); err != nil {
   366  		if log.V(1) {
   367  			log.Infof(ctx, "updating liveness record: %v", err)
   368  		}
   369  		if errors.Is(err, errNodeDrainingSet) {
   370  			return nil
   371  		}
   372  		return err
   373  	}
   374  	nl.maybeUpdate(update.Liveness)
   375  	return nil
   376  }
   377  
   378  type livenessUpdate struct {
   379  	kvserverpb.Liveness
   380  	// When ignoreCache is set, we won't assume that our in-memory cached version
   381  	// of the liveness record is accurate and will use a CPut on the liveness
   382  	// table with whatever the client supplied. This is used for operations that
   383  	// don't want to deal with the inconsistencies of using the cache.
   384  	ignoreCache bool
   385  }
   386  
   387  func (nl *NodeLiveness) setDecommissioningInternal(
   388  	ctx context.Context, nodeID roachpb.NodeID, liveness kvserverpb.Liveness, decommission bool,
   389  ) (changeCommitted bool, err error) {
   390  	update := livenessUpdate{
   391  		Liveness: kvserverpb.Liveness{
   392  			NodeID: nodeID,
   393  			Epoch:  1,
   394  		},
   395  	}
   396  	if liveness != (kvserverpb.Liveness{}) {
   397  		update.Liveness = liveness
   398  	}
   399  	update.Decommissioning = decommission
   400  	update.ignoreCache = true
   401  
   402  	var conditionFailed bool
   403  	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
   404  		conditionFailed = true
   405  		if actual.Decommissioning == update.Decommissioning {
   406  			return nil
   407  		}
   408  		return errChangeDecommissioningFailed
   409  	}); err != nil {
   410  		return false, err
   411  	}
   412  	committed := !conditionFailed && liveness.Decommissioning != decommission
   413  	return committed, nil
   414  }
   415  
   416  // GetLivenessThreshold returns the maximum duration between heartbeats
   417  // before a node is considered not-live.
   418  func (nl *NodeLiveness) GetLivenessThreshold() time.Duration {
   419  	return nl.livenessThreshold
   420  }
   421  
    422  // IsLive returns whether the specified node is considered live, based solely on
    423  // whether its liveness record has expired; its draining or decommissioning status
    424  // is not considered. It is an error if the node is not in the local liveness table.
   425  func (nl *NodeLiveness) IsLive(nodeID roachpb.NodeID) (bool, error) {
   426  	liveness, err := nl.GetLiveness(nodeID)
   427  	if err != nil {
   428  		return false, err
   429  	}
   430  	// NB: We use clock.Now().GoTime() instead of clock.PhysicalTime() in order to
   431  	// consider clock signals from other nodes.
   432  	return liveness.IsLive(nl.clock.Now().GoTime()), nil
   433  }
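
// exampleCheckNodeLiveness is an illustrative sketch of consulting IsLive for
// another node, distinguishing "not live" from "nothing known yet"; the node
// ID is supplied by the caller.
func exampleCheckNodeLiveness(ctx context.Context, nl *NodeLiveness, nodeID roachpb.NodeID) {
	live, err := nl.IsLive(nodeID)
	if errors.Is(err, ErrNoLivenessRecord) {
		// No liveness gossip has been received for this node yet.
		log.Infof(ctx, "no liveness record for n%d yet", nodeID)
		return
	}
	if err != nil {
		log.Warningf(ctx, "checking liveness of n%d: %+v", nodeID, err)
		return
	}
	log.Infof(ctx, "n%d live: %t", nodeID, live)
}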
   434  
   435  // StartHeartbeat starts a periodic heartbeat to refresh this node's last
   436  // heartbeat in the node liveness table. The optionally provided
   437  // HeartbeatCallback will be invoked whenever this node updates its own
   438  // liveness. The slice of engines will be written to before each heartbeat to
   439  // avoid maintaining liveness in the presence of disk stalls.
   440  func (nl *NodeLiveness) StartHeartbeat(
   441  	ctx context.Context, stopper *stop.Stopper, engines []storage.Engine, alive HeartbeatCallback,
   442  ) {
   443  	log.VEventf(ctx, 1, "starting liveness heartbeat")
   444  	retryOpts := base.DefaultRetryOptions()
   445  	retryOpts.Closer = stopper.ShouldQuiesce()
   446  
   447  	if len(engines) == 0 {
   448  		// Avoid silently forgetting to pass the engines. It happened before.
   449  		log.Fatalf(ctx, "must supply at least one engine")
   450  	}
   451  
   452  	nl.mu.Lock()
   453  	nl.mu.heartbeatCallback = alive
   454  	nl.mu.engines = engines
   455  	nl.mu.Unlock()
   456  
   457  	stopper.RunWorker(ctx, func(context.Context) {
   458  		ambient := nl.ambientCtx
   459  		ambient.AddLogTag("liveness-hb", nil)
   460  		ctx, cancel := stopper.WithCancelOnStop(context.Background())
   461  		defer cancel()
   462  		ctx, sp := ambient.AnnotateCtxWithSpan(ctx, "liveness heartbeat loop")
   463  		defer sp.Finish()
   464  
   465  		incrementEpoch := true
   466  		ticker := time.NewTicker(nl.heartbeatInterval)
   467  		defer ticker.Stop()
   468  		for {
   469  			select {
   470  			case <-nl.heartbeatToken:
   471  			case <-stopper.ShouldStop():
   472  				return
   473  			}
   474  			// Give the context a timeout approximately as long as the time we
   475  			// have left before our liveness entry expires.
   476  			if err := contextutil.RunWithTimeout(ctx, "node liveness heartbeat", nl.livenessThreshold-nl.heartbeatInterval,
   477  				func(ctx context.Context) error {
   478  					// Retry heartbeat in the event the conditional put fails.
   479  					for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
   480  						liveness, err := nl.Self()
   481  						if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
   482  							log.Errorf(ctx, "unexpected error getting liveness: %+v", err)
   483  						}
   484  						if err := nl.heartbeatInternal(ctx, liveness, incrementEpoch); err != nil {
   485  							if errors.Is(err, ErrEpochIncremented) {
   486  								log.Infof(ctx, "%s; retrying", err)
   487  								continue
   488  							}
   489  							return err
   490  						}
   491  						incrementEpoch = false // don't increment epoch after first heartbeat
   492  						break
   493  					}
   494  					return nil
   495  				}); err != nil {
   496  				log.Warningf(ctx, "failed node liveness heartbeat: %+v", err)
   497  			}
   498  
   499  			nl.heartbeatToken <- struct{}{}
   500  			select {
   501  			case <-ticker.C:
   502  			case <-stopper.ShouldStop():
   503  				return
   504  			}
   505  		}
   506  	})
   507  }
   508  
   509  // PauseHeartbeat stops or restarts the periodic heartbeat depending on the
   510  // pause parameter. When pause is true, waits until it acquires the heartbeatToken
   511  // (unless heartbeat was already paused); this ensures that no heartbeats happen
   512  // after this is called. This function is only safe for use in tests.
   513  func (nl *NodeLiveness) PauseHeartbeat(pause bool) {
   514  	if pause {
   515  		if swapped := atomic.CompareAndSwapUint32(&nl.heartbeatPaused, 0, 1); swapped {
   516  			<-nl.heartbeatToken
   517  		}
   518  	} else {
   519  		if swapped := atomic.CompareAndSwapUint32(&nl.heartbeatPaused, 1, 0); swapped {
   520  			nl.heartbeatToken <- struct{}{}
   521  		}
   522  	}
   523  }
   524  
   525  // DisableAllHeartbeatsForTest disables all node liveness heartbeats, including
   526  // those triggered from outside the normal StartHeartbeat loop. Returns a
   527  // closure to call to re-enable heartbeats. Only safe for use in tests.
   528  func (nl *NodeLiveness) DisableAllHeartbeatsForTest() func() {
   529  	nl.PauseHeartbeat(true)
   530  	nl.selfSem <- struct{}{}
   531  	nl.otherSem <- struct{}{}
   532  	return func() {
   533  		<-nl.selfSem
   534  		<-nl.otherSem
   535  	}
   536  }
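
// exampleTestWithoutHeartbeats is an illustrative sketch of the intended
// test-only usage: pause all heartbeats, run a test body that must not observe
// a concurrent liveness update, then re-enable heartbeats via the returned
// closure.
func exampleTestWithoutHeartbeats(nl *NodeLiveness, testBody func()) {
	enableAgain := nl.DisableAllHeartbeatsForTest()
	defer enableAgain()
	testBody()
}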
   537  
   538  var errNodeAlreadyLive = errors.New("node already live")
   539  
   540  // Heartbeat is called to update a node's expiration timestamp. This
   541  // method does a conditional put on the node liveness record, and if
   542  // successful, stores the updated liveness record in the nodes map.
   543  //
   544  // The liveness argument is the expected previous value of this node's
   545  // liveness.
   546  //
   547  // If this method returns nil, the node's liveness has been extended,
   548  // relative to the previous value. It may or may not still be alive
   549  // when this method returns.
   550  //
   551  // On failure, this method returns ErrEpochIncremented, although this
   552  // may not necessarily mean that the epoch was actually incremented.
   553  // TODO(bdarnell): Fix error semantics here.
   554  //
   555  // This method is rarely called directly; heartbeats are normally sent
   556  // by the StartHeartbeat loop.
   557  // TODO(bdarnell): Should we just remove this synchronous heartbeat completely?
   558  func (nl *NodeLiveness) Heartbeat(ctx context.Context, liveness kvserverpb.Liveness) error {
    559  	return nl.heartbeatInternal(ctx, liveness, false /* incrementEpoch */)
   560  }
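
// exampleManualHeartbeat is an illustrative sketch of the synchronous heartbeat
// path: read our own record and extend it once, outside the StartHeartbeat
// loop. Callers may see ErrEpochIncremented and choose to retry.
func exampleManualHeartbeat(ctx context.Context, nl *NodeLiveness) error {
	liveness, err := nl.Self()
	if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
		return err
	}
	// An empty liveness is acceptable here; the heartbeat then creates the
	// record with epoch 1.
	return nl.Heartbeat(ctx, liveness)
}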
   561  
   562  func (nl *NodeLiveness) heartbeatInternal(
   563  	ctx context.Context, liveness kvserverpb.Liveness, incrementEpoch bool,
   564  ) error {
   565  	ctx, sp := tracing.EnsureChildSpan(ctx, nl.ambientCtx.Tracer, "liveness heartbeat")
   566  	defer sp.Finish()
   567  	defer func(start time.Time) {
   568  		dur := timeutil.Now().Sub(start)
   569  		nl.metrics.HeartbeatLatency.RecordValue(dur.Nanoseconds())
   570  		if dur > time.Second {
   571  			log.Warningf(ctx, "slow heartbeat took %0.1fs", dur.Seconds())
   572  		}
   573  	}(timeutil.Now())
   574  
   575  	// Allow only one heartbeat at a time.
   576  	nodeID := nl.gossip.NodeID.Get()
   577  	sem := nl.sem(nodeID)
   578  	select {
   579  	case sem <- struct{}{}:
   580  	case <-ctx.Done():
   581  		return ctx.Err()
   582  	}
   583  	defer func() {
   584  		<-sem
   585  	}()
   586  
   587  	update := livenessUpdate{
   588  		Liveness: kvserverpb.Liveness{
   589  			NodeID: nodeID,
   590  			Epoch:  1,
   591  		},
   592  	}
   593  	if liveness != (kvserverpb.Liveness{}) {
   594  		update.Liveness = liveness
   595  		if incrementEpoch {
   596  			update.Epoch++
   597  			// Clear draining field.
   598  			update.Draining = false
   599  		}
   600  	}
   601  	// We need to add the maximum clock offset to the expiration because it's
   602  	// used when determining liveness for a node.
   603  	{
   604  		update.Expiration = hlc.LegacyTimestamp(
   605  			nl.clock.Now().Add((nl.livenessThreshold).Nanoseconds(), 0))
   606  		// This guards against the system clock moving backwards. As long
   607  		// as the cockroach process is running, checks inside hlc.Clock
   608  		// will ensure that the clock never moves backwards, but these
   609  		// checks don't work across process restarts.
   610  		if update.Expiration.Less(liveness.Expiration) {
   611  			return errors.Errorf("proposed liveness update expires earlier than previous record")
   612  		}
   613  	}
   614  	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
   615  		// Update liveness to actual value on mismatch.
   616  		nl.maybeUpdate(actual)
   617  		// If the actual liveness is different than expected, but is
   618  		// considered live, treat the heartbeat as a success. This can
   619  		// happen when the periodic heartbeater races with a concurrent
   620  		// lease acquisition.
   621  		//
   622  		// TODO(bdarnell): If things are very slow, the new liveness may
   623  		// have already expired and we'd incorrectly return
   624  		// ErrEpochIncremented. Is this check even necessary? The common
   625  		// path through this method doesn't check whether the liveness
   626  		// expired while in flight, so maybe we don't have to care about
   627  		// that and only need to distinguish between same and different
   628  		// epochs in our return value.
   629  		if actual.IsLive(nl.clock.Now().GoTime()) && !incrementEpoch {
   630  			return errNodeAlreadyLive
   631  		}
   632  		// Otherwise, return error.
   633  		return ErrEpochIncremented
   634  	}); err != nil {
   635  		if errors.Is(err, errNodeAlreadyLive) {
   636  			nl.metrics.HeartbeatSuccesses.Inc(1)
   637  			return nil
   638  		}
   639  		nl.metrics.HeartbeatFailures.Inc(1)
   640  		return err
   641  	}
   642  
   643  	log.VEventf(ctx, 1, "heartbeat %+v", update.Expiration)
   644  	nl.maybeUpdate(update.Liveness)
   645  	nl.metrics.HeartbeatSuccesses.Inc(1)
   646  	return nil
   647  }
   648  
   649  // Self returns the liveness record for this node. ErrNoLivenessRecord
    650  // is returned in the event that the node has neither heartbeated its
    651  // liveness record successfully nor received a gossip message containing
   652  // a former liveness update on restart.
   653  func (nl *NodeLiveness) Self() (kvserverpb.Liveness, error) {
   654  	nl.mu.RLock()
   655  	defer nl.mu.RUnlock()
   656  	return nl.getLivenessLocked(nl.gossip.NodeID.Get())
   657  }
   658  
   659  // IsLiveMapEntry encapsulates data about current liveness for a
   660  // node.
   661  type IsLiveMapEntry struct {
   662  	IsLive bool
   663  	Epoch  int64
   664  }
   665  
   666  // IsLiveMap is a type alias for a map from NodeID to IsLiveMapEntry.
   667  type IsLiveMap map[roachpb.NodeID]IsLiveMapEntry
   668  
   669  // GetIsLiveMap returns a map of nodeID to boolean liveness status of
   670  // each node. This excludes nodes that were removed completely (dead +
   671  // decommissioning).
   672  func (nl *NodeLiveness) GetIsLiveMap() IsLiveMap {
   673  	lMap := IsLiveMap{}
   674  	nl.mu.RLock()
   675  	defer nl.mu.RUnlock()
   676  	now := nl.clock.Now().GoTime()
   677  	for nID, l := range nl.mu.nodes {
   678  		isLive := l.IsLive(now)
   679  		if !isLive && l.Decommissioning {
   680  			// This is a node that was completely removed. Skip over it.
   681  			continue
   682  		}
   683  		lMap[nID] = IsLiveMapEntry{
   684  			IsLive: isLive,
   685  			Epoch:  l.Epoch,
   686  		}
   687  	}
   688  	return lMap
   689  }
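
// exampleCountLiveByEpoch is an illustrative sketch of consuming GetIsLiveMap:
// tally the nodes currently considered live, grouped by their liveness epoch.
func exampleCountLiveByEpoch(nl *NodeLiveness) map[int64]int {
	liveByEpoch := make(map[int64]int)
	for _, entry := range nl.GetIsLiveMap() {
		if entry.IsLive {
			liveByEpoch[entry.Epoch]++
		}
	}
	return liveByEpoch
}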
   690  
   691  // GetLivenesses returns a slice containing the liveness status of
    692  // every node in the cluster known to gossip. Callers should consider
   693  // calling (statusServer).NodesWithLiveness() instead where possible.
   694  func (nl *NodeLiveness) GetLivenesses() []kvserverpb.Liveness {
   695  	nl.mu.RLock()
   696  	defer nl.mu.RUnlock()
   697  	livenesses := make([]kvserverpb.Liveness, 0, len(nl.mu.nodes))
   698  	for _, l := range nl.mu.nodes {
   699  		livenesses = append(livenesses, l)
   700  	}
   701  	return livenesses
   702  }
   703  
   704  // GetLiveness returns the liveness record for the specified nodeID.
   705  // ErrNoLivenessRecord is returned in the event that nothing is yet
   706  // known about nodeID via liveness gossip.
   707  func (nl *NodeLiveness) GetLiveness(nodeID roachpb.NodeID) (kvserverpb.Liveness, error) {
   708  	nl.mu.RLock()
   709  	defer nl.mu.RUnlock()
   710  	return nl.getLivenessLocked(nodeID)
   711  }
   712  
   713  func (nl *NodeLiveness) getLivenessLocked(nodeID roachpb.NodeID) (kvserverpb.Liveness, error) {
   714  	if l, ok := nl.mu.nodes[nodeID]; ok {
   715  		return l, nil
   716  	}
   717  	return kvserverpb.Liveness{}, ErrNoLivenessRecord
   718  }
   719  
   720  // IncrementEpoch is called to attempt to revoke another node's
   721  // current epoch, causing an expiration of all its leases. This method
   722  // does a conditional put on the node liveness record, and if
   723  // successful, stores the updated liveness record in the nodes map. If
   724  // this method is called on a node ID which is considered live
   725  // according to the most recent information gathered through gossip,
   726  // an error is returned.
   727  //
   728  // The liveness argument is used as the expected value on the
   729  // conditional put. If this method returns nil, there was a match and
   730  // the epoch has been incremented. This means that the expiration time
   731  // in the supplied liveness accurately reflects the time at which the
   732  // epoch ended.
   733  //
   734  // If this method returns ErrEpochAlreadyIncremented, the epoch has
   735  // already been incremented past the one in the liveness argument, but
   736  // the conditional put did not find a match. This means that another
   737  // node performed a successful IncrementEpoch, but we can't tell at
   738  // what time the epoch actually ended. (Usually when multiple
   739  // IncrementEpoch calls race, they're using the same expected value.
   740  // But when there is a severe backlog, it's possible for one increment
   741  // to get stuck in a queue long enough for the dead node to make
   742  // another successful heartbeat, and a second increment to come in
   743  // after that)
   744  func (nl *NodeLiveness) IncrementEpoch(ctx context.Context, liveness kvserverpb.Liveness) error {
   745  	// Allow only one increment at a time.
   746  	sem := nl.sem(liveness.NodeID)
   747  	select {
   748  	case sem <- struct{}{}:
   749  	case <-ctx.Done():
   750  		return ctx.Err()
   751  	}
   752  	defer func() {
   753  		<-sem
   754  	}()
   755  
   756  	if liveness.IsLive(nl.clock.Now().GoTime()) {
   757  		return errors.Errorf("cannot increment epoch on live node: %+v", liveness)
   758  	}
   759  	update := livenessUpdate{Liveness: liveness}
   760  	update.Epoch++
   761  	if err := nl.updateLiveness(ctx, update, liveness, func(actual kvserverpb.Liveness) error {
   762  		defer nl.maybeUpdate(actual)
   763  		if actual.Epoch > liveness.Epoch {
   764  			return ErrEpochAlreadyIncremented
   765  		} else if actual.Epoch < liveness.Epoch {
   766  			return errors.Errorf("unexpected liveness epoch %d; expected >= %d", actual.Epoch, liveness.Epoch)
   767  		}
   768  		return errors.Errorf("mismatch incrementing epoch for %+v; actual is %+v", liveness, actual)
   769  	}); err != nil {
   770  		return err
   771  	}
   772  
   773  	log.Infof(ctx, "incremented n%d liveness epoch to %d", update.NodeID, update.Epoch)
   774  	nl.maybeUpdate(update.Liveness)
   775  	nl.metrics.EpochIncrements.Inc(1)
   776  	return nil
   777  }
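
// exampleRevokeEpoch is an illustrative sketch of revoking a non-live node's
// epoch-based leases: fetch the last known liveness record and attempt the
// conditional epoch increment. IncrementEpoch refuses to run against a node
// that is still considered live, and ErrEpochAlreadyIncremented means another
// node got there first, which callers typically treat as success.
func exampleRevokeEpoch(ctx context.Context, nl *NodeLiveness, nodeID roachpb.NodeID) error {
	liveness, err := nl.GetLiveness(nodeID)
	if err != nil {
		return err
	}
	if err := nl.IncrementEpoch(ctx, liveness); err != nil &&
		!errors.Is(err, ErrEpochAlreadyIncremented) {
		return err
	}
	return nil
}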
   778  
   779  // Metrics returns a struct which contains metrics related to node
   780  // liveness activity.
   781  func (nl *NodeLiveness) Metrics() LivenessMetrics {
   782  	return nl.metrics
   783  }
   784  
   785  // RegisterCallback registers a callback to be invoked any time a
   786  // node's IsLive() state changes to true.
   787  func (nl *NodeLiveness) RegisterCallback(cb IsLiveCallback) {
   788  	nl.mu.Lock()
   789  	defer nl.mu.Unlock()
   790  	nl.mu.callbacks = append(nl.mu.callbacks, cb)
   791  }
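
// exampleWatchForRecovery is an illustrative sketch of RegisterCallback usage:
// log whenever a node transitions back into the live state. The callback runs
// synchronously on the goroutine that observed the transition, so it should
// stay cheap.
func exampleWatchForRecovery(ctx context.Context, nl *NodeLiveness) {
	nl.RegisterCallback(func(nodeID roachpb.NodeID) {
		log.Infof(ctx, "n%d became live", nodeID)
	})
}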
   792  
   793  // updateLiveness does a conditional put on the node liveness record for the
   794  // node specified by nodeID. In the event that the conditional put fails, and
   795  // the handleCondFailed callback is not nil, it's invoked with the actual node
   796  // liveness record and nil is returned for an error. If handleCondFailed is nil,
   797  // any conditional put failure is returned as an error to the caller. The
   798  // conditional put is done as a 1PC transaction with a ModifiedSpanTrigger which
   799  // indicates the node liveness record that the range leader should gossip on
   800  // commit.
   801  //
    802  // updateLiveness retries certain errors that are expected to occur
    803  // sporadically, such as TransactionStatusError (due to the 1PC requirement of
    804  // the liveness txn) and AmbiguousResultError.
   805  func (nl *NodeLiveness) updateLiveness(
   806  	ctx context.Context,
   807  	update livenessUpdate,
   808  	oldLiveness kvserverpb.Liveness,
   809  	handleCondFailed func(actual kvserverpb.Liveness) error,
   810  ) error {
   811  	for {
   812  		// Before each attempt, ensure that the context has not expired.
   813  		if err := ctx.Err(); err != nil {
   814  			return err
   815  		}
   816  
   817  		nl.mu.RLock()
   818  		engines := nl.mu.engines
   819  		nl.mu.RUnlock()
   820  		for _, eng := range engines {
   821  			// We synchronously write to all disks before updating liveness because we
   822  			// don't want any excessively slow disks to prevent leases from being
   823  			// shifted to other nodes. A slow/stalled disk would block here and cause
   824  			// the node to lose its leases.
   825  			if err := storage.WriteSyncNoop(ctx, eng); err != nil {
   826  				return errors.Wrapf(err, "couldn't update node liveness because disk write failed")
   827  			}
   828  		}
   829  		if err := nl.updateLivenessAttempt(ctx, update, oldLiveness, handleCondFailed); err != nil {
   830  			// Intentionally don't errors.Cause() the error, or we'd hop past errRetryLiveness.
   831  			if errors.HasType(err, (*errRetryLiveness)(nil)) {
   832  				log.Infof(ctx, "retrying liveness update after %s", err)
   833  				continue
   834  			}
   835  			return err
   836  		}
   837  		return nil
   838  	}
   839  }
   840  
   841  func (nl *NodeLiveness) updateLivenessAttempt(
   842  	ctx context.Context,
   843  	update livenessUpdate,
   844  	oldLiveness kvserverpb.Liveness,
   845  	handleCondFailed func(actual kvserverpb.Liveness) error,
   846  ) error {
   847  	// First check the existing liveness map to avoid known conditional
   848  	// put failures.
   849  	if !update.ignoreCache {
   850  		l, err := nl.GetLiveness(update.NodeID)
   851  		if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
   852  			return err
   853  		}
   854  		if err == nil && l != oldLiveness {
   855  			return handleCondFailed(l)
   856  		}
   857  	}
   858  
   859  	if err := nl.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   860  		b := txn.NewBatch()
   861  		key := keys.NodeLivenessKey(update.NodeID)
   862  		val := update.Liveness
   863  		if oldLiveness == (kvserverpb.Liveness{}) {
   864  			b.CPut(key, &val, nil)
   865  		} else {
   866  			expVal := oldLiveness
   867  			// TODO(andrei): Plumb along oldLiveness as the raw bytes we read from the
   868  			// database, not as a proto, so that the proto's encoding can change. See
   869  			// #38308. If we do that, we can remove Liveness from belowRaftProtos.
   870  			b.CPutDeprecated(key, &val, &expVal)
   871  		}
   872  		// Use a trigger on EndTxn to indicate that node liveness should be
   873  		// re-gossiped. Further, require that this transaction complete as a one
   874  		// phase commit to eliminate the possibility of leaving write intents.
   875  		b.AddRawRequest(&roachpb.EndTxnRequest{
   876  			Commit:     true,
   877  			Require1PC: true,
   878  			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
   879  				ModifiedSpanTrigger: &roachpb.ModifiedSpanTrigger{
   880  					NodeLivenessSpan: &roachpb.Span{
   881  						Key:    key,
   882  						EndKey: key.Next(),
   883  					},
   884  				},
   885  			},
   886  		})
   887  		return txn.Run(ctx, b)
   888  	}); err != nil {
   889  		if tErr := (*roachpb.ConditionFailedError)(nil); errors.As(err, &tErr) {
   890  			if handleCondFailed != nil {
   891  				if tErr.ActualValue == nil {
   892  					return handleCondFailed(kvserverpb.Liveness{})
   893  				}
   894  				var actualLiveness kvserverpb.Liveness
   895  				if err := tErr.ActualValue.GetProto(&actualLiveness); err != nil {
   896  					return errors.Wrapf(err, "couldn't update node liveness from CPut actual value")
   897  				}
   898  				return handleCondFailed(actualLiveness)
   899  			}
   900  		} else if errors.HasType(err, (*roachpb.TransactionStatusError)(nil)) ||
   901  			errors.HasType(err, (*roachpb.AmbiguousResultError)(nil)) {
   902  			return &errRetryLiveness{err}
   903  		}
   904  		return err
   905  	}
   906  
   907  	nl.mu.RLock()
   908  	cb := nl.mu.heartbeatCallback
   909  	nl.mu.RUnlock()
   910  	if cb != nil {
   911  		cb(ctx)
   912  	}
   913  	return nil
   914  }
   915  
   916  // maybeUpdate replaces the liveness (if it appears newer) and invokes the
   917  // registered callbacks if the node became live in the process.
   918  func (nl *NodeLiveness) maybeUpdate(new kvserverpb.Liveness) {
   919  	nl.mu.Lock()
   920  	// Note that this works fine even if `old` is empty.
   921  	old := nl.mu.nodes[new.NodeID]
   922  	should := shouldReplaceLiveness(old, new)
   923  	var callbacks []IsLiveCallback
   924  	if should {
   925  		nl.mu.nodes[new.NodeID] = new
   926  		callbacks = append(callbacks, nl.mu.callbacks...)
   927  	}
   928  	nl.mu.Unlock()
   929  
   930  	if !should {
   931  		return
   932  	}
   933  
   934  	now := nl.clock.Now().GoTime()
   935  	if !old.IsLive(now) && new.IsLive(now) {
   936  		for _, fn := range callbacks {
   937  			fn(new.NodeID)
   938  		}
   939  	}
   940  }
   941  
   942  func shouldReplaceLiveness(old, new kvserverpb.Liveness) bool {
   943  	if (old == kvserverpb.Liveness{}) {
   944  		return true
   945  	}
   946  
    947  	// Compare Epoch first and, if there is no change there, Expiration.
   948  	if old.Epoch != new.Epoch {
   949  		return old.Epoch < new.Epoch
   950  	}
   951  	if old.Expiration != new.Expiration {
   952  		return old.Expiration.Less(new.Expiration)
   953  	}
   954  
   955  	// If Epoch and Expiration are unchanged, assume that the update is newer
   956  	// when its draining or decommissioning field changed.
   957  	//
   958  	// This has false positives (in which case we're clobbering the liveness). A
   959  	// better way to handle liveness updates in general is to add a sequence
   960  	// number.
   961  	//
   962  	// See #18219.
   963  	return old.Draining != new.Draining || old.Decommissioning != new.Decommissioning
   964  }
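
// exampleNewerLiveness is an illustrative sketch of the precedence encoded in
// shouldReplaceLiveness: a higher epoch always wins, equal epochs fall back to
// the later expiration, and the draining/decommissioning flags only break
// exact ties.
func exampleNewerLiveness(a, b kvserverpb.Liveness) kvserverpb.Liveness {
	if shouldReplaceLiveness(a, b) {
		return b
	}
	return a
}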
   965  
   966  // livenessGossipUpdate is the gossip callback used to keep the
   967  // in-memory liveness info up to date.
   968  func (nl *NodeLiveness) livenessGossipUpdate(key string, content roachpb.Value) {
   969  	var liveness kvserverpb.Liveness
   970  	if err := content.GetProto(&liveness); err != nil {
   971  		log.Errorf(context.TODO(), "%v", err)
   972  		return
   973  	}
   974  
   975  	nl.maybeUpdate(liveness)
   976  }
   977  
   978  // numLiveNodes is used to populate a metric that tracks the number of live
   979  // nodes in the cluster. Returns 0 if this node is not itself live, to avoid
   980  // reporting potentially inaccurate data.
   981  // We export this metric from every live node rather than a single particular
   982  // live node because liveness information is gossiped and thus may be stale.
   983  // That staleness could result in no nodes reporting the metric or multiple
   984  // nodes reporting the metric, so it's simplest to just have all live nodes
   985  // report it.
   986  func (nl *NodeLiveness) numLiveNodes() int64 {
   987  	ctx := nl.ambientCtx.AnnotateCtx(context.Background())
   988  
   989  	selfID := nl.gossip.NodeID.Get()
   990  	if selfID == 0 {
   991  		return 0
   992  	}
   993  
   994  	nl.mu.RLock()
   995  	defer nl.mu.RUnlock()
   996  
   997  	self, err := nl.getLivenessLocked(selfID)
   998  	if errors.Is(err, ErrNoLivenessRecord) {
   999  		return 0
  1000  	}
  1001  	if err != nil {
  1002  		log.Warningf(ctx, "looking up own liveness: %+v", err)
  1003  		return 0
  1004  	}
  1005  	now := nl.clock.Now().GoTime()
  1006  	// If this node isn't live, we don't want to report its view of node liveness
  1007  	// because it's more likely to be inaccurate than the view of a live node.
  1008  	if !self.IsLive(now) {
  1009  		return 0
  1010  	}
  1011  	var liveNodes int64
  1012  	for _, l := range nl.mu.nodes {
  1013  		if l.IsLive(now) {
  1014  			liveNodes++
  1015  		}
  1016  	}
  1017  	return liveNodes
  1018  }
  1019  
  1020  // AsLiveClock returns a closedts.LiveClockFn that takes a current timestamp off
   1021  // the clock and returns it, together with the node's current epoch, only if
   1022  // node liveness indicates that the node is live at that timestamp.
  1023  func (nl *NodeLiveness) AsLiveClock() closedts.LiveClockFn {
  1024  	return func(nodeID roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
  1025  		now := nl.clock.Now()
  1026  		liveness, err := nl.GetLiveness(nodeID)
  1027  		if err != nil {
  1028  			return hlc.Timestamp{}, 0, err
  1029  		}
  1030  		if !liveness.IsLive(now.GoTime()) {
  1031  			return hlc.Timestamp{}, 0, errLiveClockNotLive
  1032  		}
  1033  		return now, ctpb.Epoch(liveness.Epoch), nil
  1034  	}
  1035  }
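
// exampleLiveNow is an illustrative sketch of using AsLiveClock: obtain a
// timestamp that node liveness confirms falls within this node's current
// epoch, or an error (including errLiveClockNotLive) if it does not.
func exampleLiveNow(nl *NodeLiveness, nodeID roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
	liveClock := nl.AsLiveClock()
	return liveClock(nodeID)
}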
  1036  
  1037  // GetNodeCount returns a count of the number of nodes in the cluster,
  1038  // including dead nodes, but excluding decommissioning or decommissioned nodes.
  1039  func (nl *NodeLiveness) GetNodeCount() int {
  1040  	nl.mu.RLock()
  1041  	defer nl.mu.RUnlock()
  1042  	var count int
  1043  	for _, l := range nl.mu.nodes {
  1044  		if !l.Decommissioning {
  1045  			count++
  1046  		}
  1047  	}
  1048  	return count
  1049  }
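
// exampleClusterSize is an illustrative sketch contrasting the two counts
// available here: GetNodeCount includes dead (but not decommissioning) nodes,
// while the live tally below only counts nodes whose liveness record has not
// expired.
func exampleClusterSize(nl *NodeLiveness) (total, live int) {
	total = nl.GetNodeCount()
	for _, entry := range nl.GetIsLiveMap() {
		if entry.IsLive {
			live++
		}
	}
	return total, live
}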