github.com/ava-labs/avalanchego@v1.11.11/network/throttling/inbound_resource_throttler.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package throttling

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/ava-labs/avalanchego/ids"
	"github.com/ava-labs/avalanchego/snow/networking/tracker"
	"github.com/ava-labs/avalanchego/utils/timer/mockable"

	timerpkg "github.com/ava-labs/avalanchego/utils/timer"
)

const epsilon = time.Millisecond

var (
	_ SystemThrottler = (*systemThrottler)(nil)
	_ SystemThrottler = noSystemThrottler{}
)

// SystemThrottler rate-limits based on the system resource usage caused by each
// peer. We will not read messages from peers whose messages cause excessive
// usage until the usage caused by the peer drops to an acceptable level.
type SystemThrottler interface {
	// Blocks until we can read a message from the given peer.
	// If [ctx] is canceled, returns immediately.
	Acquire(ctx context.Context, nodeID ids.NodeID)
}

// A system throttler that always immediately returns on [Acquire].
type noSystemThrottler struct{}

func (noSystemThrottler) Acquire(context.Context, ids.NodeID) {}

type SystemThrottlerConfig struct {
	Clock mockable.Clock `json:"-"`
	// The maximum amount of time we'll wait before re-checking whether a call
	// to [Acquire] can return.
	MaxRecheckDelay time.Duration `json:"maxRecheckDelay"`
}

type systemThrottler struct {
	SystemThrottlerConfig
	metrics *systemThrottlerMetrics
	// Tells us the target utilization of each node.
	targeter tracker.Targeter
	// Tells us the utilization of each node.
	tracker tracker.Tracker
	// Invariant: [timerPool] only returns timers that have been stopped and drained.
	timerPool sync.Pool
}

type systemThrottlerMetrics struct {
	totalWaits      prometheus.Counter
	totalNoWaits    prometheus.Counter
	awaitingAcquire prometheus.Gauge
}

func newSystemThrottlerMetrics(namespace string, reg prometheus.Registerer) (*systemThrottlerMetrics, error) {
	m := &systemThrottlerMetrics{
		totalWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_waits",
			Help:      "Number of times we've waited to read a message from a node because their usage was too high",
		}),
		totalNoWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_no_waits",
			Help:      "Number of times we didn't wait to read a message because the node's usage wasn't too high",
		}),
		awaitingAcquire: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "throttler_awaiting_acquire",
			Help:      "Number of nodes we're waiting to read a message from because their usage is too high",
		}),
	}
	err := errors.Join(
		reg.Register(m.totalWaits),
		reg.Register(m.totalNoWaits),
		reg.Register(m.awaitingAcquire),
	)
	return m, err
}

func NewSystemThrottler(
	namespace string,
	reg prometheus.Registerer,
	config SystemThrottlerConfig,
	tracker tracker.Tracker,
	targeter tracker.Targeter,
) (SystemThrottler, error) {
	metrics, err := newSystemThrottlerMetrics(namespace, reg)
	if err != nil {
		return nil, fmt.Errorf("couldn't initialize system throttler metrics: %w", err)
	}
	return &systemThrottler{
		metrics:               metrics,
		SystemThrottlerConfig: config,
		targeter:              targeter,
		tracker:               tracker,
		timerPool: sync.Pool{
			New: func() interface{} {
				// Satisfy invariant that timer is stopped and drained.
				return timerpkg.StoppedTimer()
			},
		},
	}, nil
}

func (t *systemThrottler) Acquire(ctx context.Context, nodeID ids.NodeID) {
	// [timer] fires when we should re-check whether this node's
	// usage has fallen to an acceptable level.
	// Lazily initialize the timer only if we actually need to wait.
	var timer *time.Timer
	defer func() {
		if timer != nil { // We waited at least once for usage to fall.
			t.metrics.totalWaits.Inc()
			// Note that [t.metrics.awaitingAcquire.Inc()] was called exactly
			// once if and only if [timer] is non-nil.
			t.metrics.awaitingAcquire.Dec()
		} else {
			t.metrics.totalNoWaits.Inc()
		}
	}()

	for {
		now := t.Clock.Time()
		// Get target usage for this node.
		target := t.targeter.TargetUsage(nodeID)
		// Get actual usage for this node.
		usage := t.tracker.Usage(nodeID, now)
		if usage <= target {
			return
		}
		// See how long it will take for actual usage to drop to the target,
		// assuming this node uses no more resources.
		waitDuration := t.tracker.TimeUntilUsage(nodeID, now, target)
		if waitDuration < epsilon {
			// If the amount of time until we reach the target is very small,
			// just return to avoid a situation where we excessively re-check.
			return
		}
		if waitDuration > t.MaxRecheckDelay {
			// Re-check at least every [t.MaxRecheckDelay] in case it will be a
			// very long time until usage reaches the target level.
			//
			// Note that not only can a node's usage decrease over time, but
			// also its target usage may increase.
			// In this case, the node's usage can drop to the target level
			// sooner than [waitDuration] because the target has increased.
			// The minimum re-check frequency accounts for that case by
			// optimistically re-checking whether the node's usage is now at an
			// acceptable level.
			waitDuration = t.MaxRecheckDelay
		}

		if timer == nil {
			// Note this is called at most once.
			t.metrics.awaitingAcquire.Inc()

			timer = t.timerPool.Get().(*time.Timer)
			defer t.timerPool.Put(timer)
		}

		timer.Reset(waitDuration)
		select {
		case <-ctx.Done():
			// Satisfy [t.timerPool] invariant.
			if !timer.Stop() {
				<-timer.C
			}
			return
		case <-timer.C:
		}
	}
}
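
// The function below is a minimal usage sketch, not part of the upstream file:
// it shows how a caller might construct a SystemThrottler and block on
// [Acquire] before reading a peer's next message. The cpuTracker and
// cpuTargeter parameters are assumed to be concrete tracker.Tracker and
// tracker.Targeter implementations provided elsewhere in the node; the "cpu"
// namespace, the 5s re-check delay, and the 30s context timeout are
// illustrative values, not taken from this file.
func exampleReadLoopAcquire(
	reg prometheus.Registerer,
	cpuTracker tracker.Tracker,
	cpuTargeter tracker.Targeter,
	nodeID ids.NodeID,
) error {
	throttler, err := NewSystemThrottler(
		"cpu", // metrics namespace (illustrative)
		reg,
		SystemThrottlerConfig{MaxRecheckDelay: 5 * time.Second},
		cpuTracker,
		cpuTargeter,
	)
	if err != nil {
		return err
	}

	// Block until the peer's usage drops to (or below) its target, or until
	// the context is canceled, then proceed to read the peer's next message.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	throttler.Acquire(ctx, nodeID)
	return nil
}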