github.com/MetalBlockchain/metalgo@v1.11.9/utils/timer/adaptive_timeout_manager.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package timer

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/utils/heap"
	"github.com/MetalBlockchain/metalgo/utils/math"
	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
)

var (
	errNonPositiveHalflife        = errors.New("timeout halflife must be positive")
	errInitialTimeoutAboveMaximum = errors.New("initial timeout cannot be greater than maximum timeout")
	errInitialTimeoutBelowMinimum = errors.New("initial timeout cannot be less than minimum timeout")
	errTooSmallTimeoutCoefficient = errors.New("timeout coefficient must be >= 1")

	_ AdaptiveTimeoutManager = (*adaptiveTimeoutManager)(nil)
)

type adaptiveTimeout struct {
	id             ids.RequestID // Unique ID of this timeout
	handler        func()        // Function to execute if timed out
	duration       time.Duration // How long this timeout was set for
	deadline       time.Time     // When this timeout should be fired
	measureLatency bool          // Whether this request should impact latency
}

// AdaptiveTimeoutConfig contains the parameters provided to the
// adaptive timeout manager.
type AdaptiveTimeoutConfig struct {
	InitialTimeout time.Duration `json:"initialTimeout"`
	MinimumTimeout time.Duration `json:"minimumTimeout"`
	MaximumTimeout time.Duration `json:"maximumTimeout"`
	// Timeout is [timeoutCoefficient] * average response time
	// [timeoutCoefficient] must be >= 1
	TimeoutCoefficient float64 `json:"timeoutCoefficient"`
	// Larger halflife --> less volatile timeout
	// [timeoutHalflife] must be positive
	TimeoutHalflife time.Duration `json:"timeoutHalflife"`
}

type AdaptiveTimeoutManager interface {
	// Start the timeout manager.
	// Must be called before any other method.
	// Must only be called once.
	Dispatch()
	// Stop the timeout manager.
	// Must only be called once.
	Stop()
	// Returns the current network timeout duration.
	TimeoutDuration() time.Duration
	// Registers a timeout for the item with the given [id].
	// If the timeout occurs before the item is Removed, [timeoutHandler] is called.
	Put(id ids.RequestID, measureLatency bool, timeoutHandler func())
	// Remove the timeout associated with [id].
	// Its timeout handler will not be called.
	Remove(id ids.RequestID)
	// ObserveLatency manually registers a response latency.
	// We use this to pretend that a query to a benched validator
	// timed out when, in fact, we never even sent them a request.
	ObserveLatency(latency time.Duration)
}

type adaptiveTimeoutManager struct {
	lock sync.Mutex
	// Tells the time. Can be faked for testing.
	clock                            mockable.Clock
	networkTimeoutMetric, avgLatency prometheus.Gauge
	numTimeouts                      prometheus.Counter
	numPendingTimeouts               prometheus.Gauge
	// Averages the response time from all peers
	averager math.Averager
	// Timeout is [timeoutCoefficient] * average response time
	// [timeoutCoefficient] must be >= 1
	timeoutCoefficient float64
	minimumTimeout     time.Duration
	maximumTimeout     time.Duration
	currentTimeout     time.Duration // Amount of time before a timeout
	timeoutHeap        heap.Map[ids.RequestID, *adaptiveTimeout]
	timer              *Timer // Timer that will fire to clear the timeouts
}
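// Illustrative usage sketch (not taken from this repository): the configuration
// values below are hypothetical, prometheus.NewRegistry comes from the imported
// prometheus client, and requestID stands in for a caller-supplied ids.RequestID.
//
//	cfg := &AdaptiveTimeoutConfig{
//		InitialTimeout:     2 * time.Second,
//		MinimumTimeout:     500 * time.Millisecond,
//		MaximumTimeout:     10 * time.Second,
//		TimeoutCoefficient: 2,
//		TimeoutHalflife:    5 * time.Minute,
//	}
//	tm, err := NewAdaptiveTimeoutManager(cfg, prometheus.NewRegistry())
//	if err != nil {
//		return err
//	}
//	go tm.Dispatch() // deliver timeouts until Stop is called
//	defer tm.Stop()
//
//	tm.Put(requestID, true, func() { /* the request timed out */ })
//	// ... if a response arrives before the deadline:
//	tm.Remove(requestID) // handler is not called; latency updates the average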
func NewAdaptiveTimeoutManager(
	config *AdaptiveTimeoutConfig,
	reg prometheus.Registerer,
) (AdaptiveTimeoutManager, error) {
	switch {
	case config.InitialTimeout > config.MaximumTimeout:
		return nil, fmt.Errorf("%w: (%s) > (%s)", errInitialTimeoutAboveMaximum, config.InitialTimeout, config.MaximumTimeout)
	case config.InitialTimeout < config.MinimumTimeout:
		return nil, fmt.Errorf("%w: (%s) < (%s)", errInitialTimeoutBelowMinimum, config.InitialTimeout, config.MinimumTimeout)
	case config.TimeoutCoefficient < 1:
		return nil, fmt.Errorf("%w: %f", errTooSmallTimeoutCoefficient, config.TimeoutCoefficient)
	case config.TimeoutHalflife <= 0:
		return nil, errNonPositiveHalflife
	}

	tm := &adaptiveTimeoutManager{
		networkTimeoutMetric: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "current_timeout",
			Help: "Duration of current network timeout in nanoseconds",
		}),
		avgLatency: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "average_latency",
			Help: "Average network latency in nanoseconds",
		}),
		numTimeouts: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "timeouts",
			Help: "Number of timed out requests",
		}),
		numPendingTimeouts: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "pending_timeouts",
			Help: "Number of pending timeouts",
		}),
		minimumTimeout:     config.MinimumTimeout,
		maximumTimeout:     config.MaximumTimeout,
		currentTimeout:     config.InitialTimeout,
		timeoutCoefficient: config.TimeoutCoefficient,
		timeoutHeap: heap.NewMap[ids.RequestID, *adaptiveTimeout](func(a, b *adaptiveTimeout) bool {
			return a.deadline.Before(b.deadline)
		}),
	}
	tm.timer = NewTimer(tm.timeout)
	tm.averager = math.NewAverager(float64(config.InitialTimeout), config.TimeoutHalflife, tm.clock.Time())

	err := errors.Join(
		reg.Register(tm.networkTimeoutMetric),
		reg.Register(tm.avgLatency),
		reg.Register(tm.numTimeouts),
		reg.Register(tm.numPendingTimeouts),
	)
	return tm, err
}

func (tm *adaptiveTimeoutManager) TimeoutDuration() time.Duration {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	return tm.currentTimeout
}

func (tm *adaptiveTimeoutManager) Dispatch() {
	tm.timer.Dispatch()
}

func (tm *adaptiveTimeoutManager) Stop() {
	tm.timer.Stop()
}

func (tm *adaptiveTimeoutManager) Put(id ids.RequestID, measureLatency bool, timeoutHandler func()) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.put(id, measureLatency, timeoutHandler)
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) put(id ids.RequestID, measureLatency bool, handler func()) {
	now := tm.clock.Time()
	tm.remove(id, now)

	timeout := &adaptiveTimeout{
		id:             id,
		handler:        handler,
		duration:       tm.currentTimeout,
		deadline:       now.Add(tm.currentTimeout),
		measureLatency: measureLatency,
	}
	tm.timeoutHeap.Push(id, timeout)
	tm.numPendingTimeouts.Set(float64(tm.timeoutHeap.Len()))

	tm.setNextTimeoutTime()
}
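// Illustrative timeline (hypothetical numbers, not from this file) of how a
// Put/Remove pair feeds the latency averager: Put at t=0 with a 2s
// currentTimeout stores duration=2s and deadline=t+2s; if Remove runs at
// t=350ms, remove recovers the registration time as deadline-duration (t=0),
// observes a 350ms latency, and observeLatencyAndUpdateTimeout folds that
// into the moving average.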
func (tm *adaptiveTimeoutManager) Remove(id ids.RequestID) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.remove(id, tm.clock.Time())
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) remove(id ids.RequestID, now time.Time) {
	// Observe the response time to update average network response time.
	timeout, exists := tm.timeoutHeap.Remove(id)
	if !exists {
		return
	}

	if timeout.measureLatency {
		timeoutRegisteredAt := timeout.deadline.Add(-1 * timeout.duration)
		latency := now.Sub(timeoutRegisteredAt)
		tm.observeLatencyAndUpdateTimeout(latency, now)
	}
	tm.numPendingTimeouts.Set(float64(tm.timeoutHeap.Len()))
}

// Assumes [tm.lock] is not held.
func (tm *adaptiveTimeoutManager) timeout() {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	now := tm.clock.Time()
	for {
		// getNextTimeoutHandler returns nil once there is nothing left to remove
		timeoutHandler := tm.getNextTimeoutHandler(now)
		if timeoutHandler == nil {
			break
		}
		tm.numTimeouts.Inc()

		// Don't execute a callback with a lock held
		tm.lock.Unlock()
		timeoutHandler()
		tm.lock.Lock()
	}
	tm.setNextTimeoutTime()
}

func (tm *adaptiveTimeoutManager) ObserveLatency(latency time.Duration) {
	tm.lock.Lock()
	defer tm.lock.Unlock()

	tm.observeLatencyAndUpdateTimeout(latency, tm.clock.Time())
}

// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) observeLatencyAndUpdateTimeout(latency time.Duration, now time.Time) {
	tm.averager.Observe(float64(latency), now)
	avgLatency := tm.averager.Read()
	tm.currentTimeout = time.Duration(tm.timeoutCoefficient * avgLatency)
	if tm.currentTimeout > tm.maximumTimeout {
		tm.currentTimeout = tm.maximumTimeout
	} else if tm.currentTimeout < tm.minimumTimeout {
		tm.currentTimeout = tm.minimumTimeout
	}
	// Update the metrics
	tm.networkTimeoutMetric.Set(float64(tm.currentTimeout))
	tm.avgLatency.Set(avgLatency)
}

// Returns the handler function associated with the next timeout.
// If there are no timeouts, or if the next timeout is after [now],
// returns nil.
// Assumes [tm.lock] is held
func (tm *adaptiveTimeoutManager) getNextTimeoutHandler(now time.Time) func() {
	_, nextTimeout, ok := tm.timeoutHeap.Peek()
	if !ok {
		return nil
	}
	if nextTimeout.deadline.After(now) {
		return nil
	}
	tm.remove(nextTimeout.id, now)
	return nextTimeout.handler
}

// Calculate the time of the next timeout and set
// the timer to fire at that time.
func (tm *adaptiveTimeoutManager) setNextTimeoutTime() {
	_, nextTimeout, ok := tm.timeoutHeap.Peek()
	if !ok {
		// There are no pending timeouts
		tm.timer.Cancel()
		return
	}

	now := tm.clock.Time()
	timeToNextTimeout := nextTimeout.deadline.Sub(now)
	tm.timer.SetTimeoutIn(timeToNextTimeout)
}
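// Worked example of the adaptation in observeLatencyAndUpdateTimeout
// (illustrative numbers only): with timeoutCoefficient=2, minimumTimeout=500ms
// and maximumTimeout=10s, an exponentially weighted average latency of 300ms
// yields a 600ms timeout; an average of 8s would yield 16s and be clamped to
// the 10s maximum. Per the ObserveLatency comment above, a caller can also
// feed the averager directly, e.g. tm.ObserveLatency(tm.TimeoutDuration()),
// to treat a request to a benched validator as if it had timed out.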