github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/rpc/clock_offset.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rpc
    12  
    13  import (
    14  	"context"
    15  	"math"
    16  	"time"
    17  
    18  	"github.com/VividCortex/ewma"
    19  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    20  	"github.com/cockroachdb/cockroach/pkg/util/log"
    21  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    22  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    23  	"github.com/cockroachdb/errors"
    24  	"github.com/montanaflynn/stats"
    25  )
    26  
    27  // RemoteClockMetrics is the collection of metrics for the clock monitor.
    28  type RemoteClockMetrics struct {
    29  	ClockOffsetMeanNanos   *metric.Gauge
    30  	ClockOffsetStdDevNanos *metric.Gauge
    31  	LatencyHistogramNanos  *metric.Histogram
    32  }
    33  
    34  // avgLatencyMeasurementAge determines how to exponentially weight the
    35  // moving average of latency measurements. This means that the weight
    36  // will center around the 20th oldest measurement, such that for measurements
    37  // that are made every 3 seconds, the average measurement will be about one
    38  // minute old.
    39  const avgLatencyMeasurementAge = 20.0
    40  
    41  var (
    42  	metaClockOffsetMeanNanos = metric.Metadata{
    43  		Name:        "clock-offset.meannanos",
    44  		Help:        "Mean clock offset with other nodes",
    45  		Measurement: "Clock Offset",
    46  		Unit:        metric.Unit_NANOSECONDS,
    47  	}
    48  	metaClockOffsetStdDevNanos = metric.Metadata{
    49  		Name:        "clock-offset.stddevnanos",
    50  		Help:        "Stddev clock offset with other nodes",
    51  		Measurement: "Clock Offset",
    52  		Unit:        metric.Unit_NANOSECONDS,
    53  	}
    54  	metaLatencyHistogramNanos = metric.Metadata{
    55  		Name:        "round-trip-latency",
    56  		Help:        "Distribution of round-trip latencies with other nodes",
    57  		Measurement: "Roundtrip Latency",
    58  		Unit:        metric.Unit_NANOSECONDS,
    59  	}
    60  )
    61  
    62  // RemoteClockMonitor keeps track of the most recent measurements of remote
    63  // offsets and round-trip latency from this node to connected nodes.
    64  type RemoteClockMonitor struct {
    65  	clock     *hlc.Clock
    66  	offsetTTL time.Duration
    67  
    68  	mu struct {
    69  		syncutil.RWMutex
    70  		offsets        map[string]RemoteOffset
    71  		latenciesNanos map[string]ewma.MovingAverage
    72  	}
    73  
    74  	metrics RemoteClockMetrics
    75  }
    76  
    77  // newRemoteClockMonitor returns a monitor with the given server clock.
    78  func newRemoteClockMonitor(
    79  	clock *hlc.Clock, offsetTTL time.Duration, histogramWindowInterval time.Duration,
    80  ) *RemoteClockMonitor {
    81  	r := RemoteClockMonitor{
    82  		clock:     clock,
    83  		offsetTTL: offsetTTL,
    84  	}
    85  	r.mu.offsets = make(map[string]RemoteOffset)
    86  	r.mu.latenciesNanos = make(map[string]ewma.MovingAverage)
    87  	if histogramWindowInterval == 0 {
    88  		histogramWindowInterval = time.Duration(math.MaxInt64)
    89  	}
    90  	r.metrics = RemoteClockMetrics{
    91  		ClockOffsetMeanNanos:   metric.NewGauge(metaClockOffsetMeanNanos),
    92  		ClockOffsetStdDevNanos: metric.NewGauge(metaClockOffsetStdDevNanos),
    93  		LatencyHistogramNanos:  metric.NewLatency(metaLatencyHistogramNanos, histogramWindowInterval),
    94  	}
    95  	return &r
    96  }
    97  
    98  // Metrics returns the metrics struct. Useful to examine individual metrics,
    99  // or to add to the registry.
   100  func (r *RemoteClockMonitor) Metrics() *RemoteClockMetrics {
   101  	return &r.metrics
   102  }
   103  
   104  // Latency returns the exponentially weighted moving average latency to the
   105  // given node address. Returns true if the measurement is valid, or false if
   106  // we don't have enough samples to compute a reliable average.
   107  func (r *RemoteClockMonitor) Latency(addr string) (time.Duration, bool) {
   108  	r.mu.RLock()
   109  	defer r.mu.RUnlock()
   110  	if avg, ok := r.mu.latenciesNanos[addr]; ok && avg.Value() != 0.0 {
   111  		return time.Duration(int64(avg.Value())), true
   112  	}
   113  	return 0, false
   114  }
   115  
   116  // AllLatencies returns a map of all currently valid latency measurements.
   117  func (r *RemoteClockMonitor) AllLatencies() map[string]time.Duration {
   118  	r.mu.RLock()
   119  	defer r.mu.RUnlock()
   120  	result := make(map[string]time.Duration)
   121  	for addr, avg := range r.mu.latenciesNanos {
   122  		if avg.Value() != 0.0 {
   123  			result[addr] = time.Duration(int64(avg.Value()))
   124  		}
   125  	}
   126  	return result
   127  }
   128  
   129  // UpdateOffset is a thread-safe way to update the remote clock and latency
   130  // measurements.
   131  //
   132  // It only updates the offset for addr if one of the following cases holds:
   133  // 1. There is no prior offset for that address.
   134  // 2. The old offset for addr was measured long enough ago to be considered
   135  // stale.
   136  // 3. The new offset's error is smaller than the old offset's error.
   137  //
   138  // Pass a roundTripLatency of 0 or less to avoid recording the latency.
   139  func (r *RemoteClockMonitor) UpdateOffset(
   140  	ctx context.Context, addr string, offset RemoteOffset, roundTripLatency time.Duration,
   141  ) {
   142  	emptyOffset := offset == RemoteOffset{}
   143  
   144  	r.mu.Lock()
   145  	defer r.mu.Unlock()
   146  
   147  	if oldOffset, ok := r.mu.offsets[addr]; !ok {
   148  		// We don't have a measurement - if the incoming measurement is not empty,
   149  		// set it.
   150  		if !emptyOffset {
   151  			r.mu.offsets[addr] = offset
   152  		}
   153  	} else if oldOffset.isStale(r.offsetTTL, r.clock.PhysicalTime()) {
   154  		// We have a measurement but it's old - if the incoming measurement is not empty,
   155  		// set it, otherwise delete the old measurement.
   156  		if !emptyOffset {
   157  			r.mu.offsets[addr] = offset
   158  		} else {
   159  			delete(r.mu.offsets, addr)
   160  		}
   161  	} else if offset.Uncertainty < oldOffset.Uncertainty {
   162  		// We have a measurement but its uncertainty is greater than that of the
   163  		// incoming measurement - if the incoming measurement is not empty, set it.
   164  		if !emptyOffset {
   165  			r.mu.offsets[addr] = offset
   166  		}
   167  	}
   168  
   169  	if roundTripLatency > 0 {
   170  		latencyAvg, ok := r.mu.latenciesNanos[addr]
   171  		if !ok {
   172  			latencyAvg = ewma.NewMovingAverage(avgLatencyMeasurementAge)
   173  			r.mu.latenciesNanos[addr] = latencyAvg
   174  		}
   175  		latencyAvg.Add(float64(roundTripLatency.Nanoseconds()))
   176  		r.metrics.LatencyHistogramNanos.RecordValue(roundTripLatency.Nanoseconds())
   177  	}
   178  
   179  	if log.V(2) {
   180  		log.Infof(ctx, "update offset: %s %v", addr, r.mu.offsets[addr])
   181  	}
   182  }
   183  
   184  // VerifyClockOffset calculates the number of nodes to which the known offset
   185  // is healthy (as defined by RemoteOffset.isHealthy). It returns nil iff more
   186  // than half the known offsets are healthy, and an error otherwise. A non-nil
   187  // return indicates that this node's clock is unreliable, and that the node
   188  // should terminate.
   189  func (r *RemoteClockMonitor) VerifyClockOffset(ctx context.Context) error {
   190  	// By the contract of the hlc, if the value is 0, then safety checking of
   191  	// the max offset is disabled. However we may still want to propagate the
   192  	// information to a status node.
   193  	//
   194  	// TODO(tschottdorf): disallow maxOffset == 0 but probably lots of tests to
   195  	// fix.
   196  	if maxOffset := r.clock.MaxOffset(); maxOffset != 0 {
   197  		now := r.clock.PhysicalTime()
   198  
   199  		healthyOffsetCount := 0
   200  
   201  		r.mu.Lock()
   202  		// Each measurement is recorded as its minimum and maximum value.
   203  		offsets := make(stats.Float64Data, 0, 2*len(r.mu.offsets))
   204  		for addr, offset := range r.mu.offsets {
   205  			if offset.isStale(r.offsetTTL, now) {
   206  				delete(r.mu.offsets, addr)
   207  				continue
   208  			}
   209  			offsets = append(offsets, float64(offset.Offset+offset.Uncertainty))
   210  			offsets = append(offsets, float64(offset.Offset-offset.Uncertainty))
   211  			if offset.isHealthy(ctx, maxOffset) {
   212  				healthyOffsetCount++
   213  			}
   214  		}
   215  		numClocks := len(r.mu.offsets)
   216  		r.mu.Unlock()
   217  
   218  		mean, err := offsets.Mean()
   219  		if err != nil && !errors.Is(err, stats.EmptyInput) {
   220  			return err
   221  		}
   222  		stdDev, err := offsets.StandardDeviation()
   223  		if err != nil && !errors.Is(err, stats.EmptyInput) {
   224  			return err
   225  		}
   226  		r.metrics.ClockOffsetMeanNanos.Update(int64(mean))
   227  		r.metrics.ClockOffsetStdDevNanos.Update(int64(stdDev))
   228  
   229  		if numClocks > 0 && healthyOffsetCount <= numClocks/2 {
   230  			return errors.Errorf(
   231  				"clock synchronization error: this node is more than %s away from at least half of the known nodes (%d of %d are within the offset)",
   232  				maxOffset, healthyOffsetCount, numClocks)
   233  		}
   234  		if log.V(1) {
   235  			log.Infof(ctx, "%d of %d nodes are within the maximum clock offset of %s", healthyOffsetCount, numClocks, maxOffset)
   236  		}
   237  	}
   238  
   239  	return nil
   240  }
   241  
   242  func (r RemoteOffset) isHealthy(ctx context.Context, maxOffset time.Duration) bool {
   243  	// Tolerate up to 80% of the maximum offset.
   244  	toleratedOffset := maxOffset * 4 / 5
   245  
   246  	// Offset may be negative, but Uncertainty is always positive.
   247  	absOffset := r.Offset
   248  	if absOffset < 0 {
   249  		absOffset = -absOffset
   250  	}
   251  	switch {
   252  	case time.Duration(absOffset-r.Uncertainty)*time.Nanosecond > toleratedOffset:
   253  		// The minimum possible true offset exceeds the maximum offset; definitely
   254  		// unhealthy.
   255  		return false
   256  
   257  	case time.Duration(absOffset+r.Uncertainty)*time.Nanosecond < toleratedOffset:
   258  		// The maximum possible true offset does not exceed the maximum offset;
   259  		// definitely healthy.
   260  		return true
   261  
   262  	default:
   263  		// The maximum offset is in the uncertainty window of the measured offset;
   264  		// health is ambiguous. For now, we err on the side of not spuriously
   265  		// killing nodes.
   266  		if log.V(1) {
   267  			log.Infof(ctx, "uncertain remote offset %s for maximum tolerated offset %s, treating as healthy", r, toleratedOffset)
   268  		}
   269  		return true
   270  	}
   271  }
   272  
   273  func (r RemoteOffset) isStale(ttl time.Duration, now time.Time) bool {
   274  	return r.measuredAt().Add(ttl).Before(now)
   275  }