github.com/MetalBlockchain/metalgo@v1.11.9/utils/timer/adaptive_timeout_manager.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package timer

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/utils/heap"
	"github.com/MetalBlockchain/metalgo/utils/math"
	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
)

var (
	errNonPositiveHalflife        = errors.New("timeout halflife must be positive")
	errInitialTimeoutAboveMaximum = errors.New("initial timeout cannot be greater than maximum timeout")
	errInitialTimeoutBelowMinimum = errors.New("initial timeout cannot be less than minimum timeout")
	errTooSmallTimeoutCoefficient = errors.New("timeout coefficient must be >= 1")

	_ AdaptiveTimeoutManager = (*adaptiveTimeoutManager)(nil)
)

type adaptiveTimeout struct {
	id             ids.RequestID // Unique ID of this timeout
	handler        func()        // Function to execute if timed out
	duration       time.Duration // How long this timeout was set for
	deadline       time.Time     // When this timeout should be fired
	measureLatency bool          // Whether this request should impact latency
}

// AdaptiveTimeoutConfig contains the parameters provided to the
// adaptive timeout manager.
type AdaptiveTimeoutConfig struct {
	InitialTimeout time.Duration `json:"initialTimeout"`
	MinimumTimeout time.Duration `json:"minimumTimeout"`
	MaximumTimeout time.Duration `json:"maximumTimeout"`
	// Timeout is [timeoutCoefficient] * average response time
	// [timeoutCoefficient] must be >= 1
	TimeoutCoefficient float64 `json:"timeoutCoefficient"`
	// Larger halflife --> less volatile timeout
	// [timeoutHalflife] must be positive
	TimeoutHalflife time.Duration `json:"timeoutHalflife"`
}

type AdaptiveTimeoutManager interface {
	// Start the timeout manager.
	// Must be called before any other method.
	// Must only be called once.
	Dispatch()
	// Stop the timeout manager.
	// Must only be called once.
	Stop()
	// Returns the current network timeout duration.
	TimeoutDuration() time.Duration
	// Registers a timeout for the item with the given [id].
	// If the timeout occurs before the item is Removed, [timeoutHandler] is called.
	Put(id ids.RequestID, measureLatency bool, timeoutHandler func())
	// Remove the timeout associated with [id].
	// Its timeout handler will not be called.
	Remove(id ids.RequestID)
	// ObserveLatency manually registers a response latency.
	// We use this to pretend that a query to a benched validator
	// timed out when, in fact, we never even sent them a request.
	ObserveLatency(latency time.Duration)
}

type adaptiveTimeoutManager struct {
	lock sync.Mutex
	// Tells the time. Can be faked for testing.
	clock                            mockable.Clock
	networkTimeoutMetric, avgLatency prometheus.Gauge
	numTimeouts                      prometheus.Counter
	numPendingTimeouts               prometheus.Gauge
	// Averages the response time from all peers
	averager math.Averager
	// Timeout is [timeoutCoefficient] * average response time
	// [timeoutCoefficient] must be >= 1
	timeoutCoefficient float64
	minimumTimeout     time.Duration
	maximumTimeout     time.Duration
	currentTimeout     time.Duration // Amount of time before a timeout
	timeoutHeap        heap.Map[ids.RequestID, *adaptiveTimeout]
	timer              *Timer // Timer that will fire to clear the timeouts
}
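// Illustrative usage sketch (not taken from this repository): the configuration
// values below are hypothetical, prometheus.NewRegistry comes from the imported
// prometheus client, and requestID stands in for a caller-supplied ids.RequestID.
//
//	cfg := &AdaptiveTimeoutConfig{
//		InitialTimeout:     2 * time.Second,
//		MinimumTimeout:     500 * time.Millisecond,
//		MaximumTimeout:     10 * time.Second,
//		TimeoutCoefficient: 2,
//		TimeoutHalflife:    5 * time.Minute,
//	}
//	tm, err := NewAdaptiveTimeoutManager(cfg, prometheus.NewRegistry())
//	if err != nil {
//		return err
//	}
//	go tm.Dispatch() // deliver timeouts until Stop is called
//	defer tm.Stop()
//
//	tm.Put(requestID, true, func() { /* the request timed out */ })
//	// ... if a response arrives before the deadline:
//	tm.Remove(requestID) // handler is not called; latency updates the average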
func NewAdaptiveTimeoutManager(
	config *AdaptiveTimeoutConfig,
	reg prometheus.Registerer,
) (AdaptiveTimeoutManager, error) {
	switch {
	case config.InitialTimeout > config.MaximumTimeout:
		return nil, fmt.Errorf("%w: (%s) > (%s)", errInitialTimeoutAboveMaximum, config.InitialTimeout, config.MaximumTimeout)
	case config.InitialTimeout < config.MinimumTimeout:
		return nil, fmt.Errorf("%w: (%s) < (%s)", errInitialTimeoutBelowMinimum, config.InitialTimeout, config.MinimumTimeout)
	case config.TimeoutCoefficient < 1:
		return nil, fmt.Errorf("%w: %f", errTooSmallTimeoutCoefficient, config.TimeoutCoefficient)
	case config.TimeoutHalflife <= 0:
		return nil, errNonPositiveHalflife
	}

	tm := &adaptiveTimeoutManager{
		networkTimeoutMetric: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "current_timeout",
			Help: "Duration of current network timeout in nanoseconds",
		}),
		avgLatency: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "average_latency",
			Help: "Average network latency in nanoseconds",
		}),
		numTimeouts: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "timeouts",
			Help: "Number of timed out requests",
		}),
		numPendingTimeouts: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "pending_timeouts",
			Help: "Number of pending timeouts",
		}),
		minimumTimeout:     config.MinimumTimeout,
		maximumTimeout:     config.MaximumTimeout,
		currentTimeout:     config.InitialTimeout,
		timeoutCoefficient: config.TimeoutCoefficient,
		timeoutHeap: heap.NewMap[ids.RequestID, *adaptiveTimeout](func(a, b *adaptiveTimeout) bool {
			return a.deadline.Before(b.deadline)
		}),
	}
	tm.timer = NewTimer(tm.timeout)
	tm.averager = math.NewAverager(float64(config.InitialTimeout), config.TimeoutHalflife, tm.clock.Time())

	err := errors.Join(
		reg.Register(tm.networkTimeoutMetric),
		reg.Register(tm.avgLatency),
		reg.Register(tm.numTimeouts),
		reg.Register(tm.numPendingTimeouts),
	)
	return tm, err
}

func (tm *adaptiveTimeoutManager) TimeoutDuration() time.Duration {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	return tm.currentTimeout
}

func (tm *adaptiveTimeoutManager) Dispatch() {
	tm.timer.Dispatch()
}

func (tm *adaptiveTimeoutManager) Stop() {
	tm.timer.Stop()
}

func (tm *adaptiveTimeoutManager) Put(id ids.RequestID, measureLatency bool, timeoutHandler func()) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.put(id, measureLatency, timeoutHandler)
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) put(id ids.RequestID, measureLatency bool, handler func()) {
	now := tm.clock.Time()
	tm.remove(id, now)

	timeout := &adaptiveTimeout{
		id:             id,
		handler:        handler,
		duration:       tm.currentTimeout,
		deadline:       now.Add(tm.currentTimeout),
		measureLatency: measureLatency,
	}
	tm.timeoutHeap.Push(id, timeout)
	tm.numPendingTimeouts.Set(float64(tm.timeoutHeap.Len()))

	tm.setNextTimeoutTime()
}
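// Illustrative timeline (hypothetical numbers, not from this file) of how a
// Put/Remove pair feeds the latency averager: Put at t=0 with a 2s
// currentTimeout stores duration=2s and deadline=t+2s; if Remove runs at
// t=350ms, remove recovers the registration time as deadline-duration (t=0),
// observes a 350ms latency, and observeLatencyAndUpdateTimeout folds that
// into the moving average.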
func (tm *adaptiveTimeoutManager) Remove(id ids.RequestID) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.remove(id, tm.clock.Time())
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) remove(id ids.RequestID, now time.Time) {
	// Observe the response time to update average network response time.
	timeout, exists := tm.timeoutHeap.Remove(id)
	if !exists {
		return
	}

	if timeout.measureLatency {
		timeoutRegisteredAt := timeout.deadline.Add(-1 * timeout.duration)
		latency := now.Sub(timeoutRegisteredAt)
		tm.observeLatencyAndUpdateTimeout(latency, now)
	}
	tm.numPendingTimeouts.Set(float64(tm.timeoutHeap.Len()))
}

// Assumes [tm.lock] is not held.
func (tm *adaptiveTimeoutManager) timeout() {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	now := tm.clock.Time()
	for {
		// getNextTimeoutHandler returns nil once there is nothing left to remove
		timeoutHandler := tm.getNextTimeoutHandler(now)
		if timeoutHandler == nil {
			break
		}
		tm.numTimeouts.Inc()

		// Don't execute a callback with a lock held
		tm.lock.Unlock()
		timeoutHandler()
		tm.lock.Lock()
	}
	tm.setNextTimeoutTime()
}

func (tm *adaptiveTimeoutManager) ObserveLatency(latency time.Duration) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.observeLatencyAndUpdateTimeout(latency, tm.clock.Time())
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) observeLatencyAndUpdateTimeout(latency time.Duration, now time.Time) {
	tm.averager.Observe(float64(latency), now)
	avgLatency := tm.averager.Read()
	tm.currentTimeout = time.Duration(tm.timeoutCoefficient * avgLatency)
	if tm.currentTimeout > tm.maximumTimeout {
		tm.currentTimeout = tm.maximumTimeout
	} else if tm.currentTimeout < tm.minimumTimeout {
		tm.currentTimeout = tm.minimumTimeout
	}
	// Update the metrics
	tm.networkTimeoutMetric.Set(float64(tm.currentTimeout))
	tm.avgLatency.Set(avgLatency)
}

// Returns the handler function associated with the next timeout.
// If there are no timeouts, or if the next timeout is after [now],
// returns nil.
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) getNextTimeoutHandler(now time.Time) func() {
	_, nextTimeout, ok := tm.timeoutHeap.Peek()
	if !ok {
		return nil
	}
	if nextTimeout.deadline.After(now) {
		return nil
	}
	tm.remove(nextTimeout.id, now)
	return nextTimeout.handler
}

// Calculate the time of the next timeout and set
// the timer to fire at that time.
func (tm *adaptiveTimeoutManager) setNextTimeoutTime() {
	_, nextTimeout, ok := tm.timeoutHeap.Peek()
	if !ok {
		// There are no pending timeouts
		tm.timer.Cancel()
		return
	}

	now := tm.clock.Time()
	timeToNextTimeout := nextTimeout.deadline.Sub(now)
	tm.timer.SetTimeoutIn(timeToNextTimeout)
}
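// Worked example of the adaptation in observeLatencyAndUpdateTimeout
// (illustrative numbers only): with timeoutCoefficient=2, minimumTimeout=500ms
// and maximumTimeout=10s, an exponentially weighted average latency of 300ms
// yields a 600ms timeout; an average of 8s would yield 16s and be clamped to
// the 10s maximum. Per the ObserveLatency comment above, a caller can also
// feed the averager directly, e.g. tm.ObserveLatency(tm.TimeoutDuration()),
// to treat a request to a benched validator as if it had timed out.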