github.com/ava-labs/avalanchego@v1.11.11/network/throttling/inbound_resource_throttler.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package throttling

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/ava-labs/avalanchego/ids"
	"github.com/ava-labs/avalanchego/snow/networking/tracker"
	"github.com/ava-labs/avalanchego/utils/timer/mockable"

	timerpkg "github.com/ava-labs/avalanchego/utils/timer"
)

const epsilon = time.Millisecond

var (
	_ SystemThrottler = (*systemThrottler)(nil)
	_ SystemThrottler = noSystemThrottler{}
)

// SystemThrottler rate-limits based on the system metrics usage caused by each
// peer. We will not read messages from peers whose messages cause excessive
// usage until the usage caused by the peer drops to an acceptable level.
type SystemThrottler interface {
	// Blocks until we can read a message from the given peer.
	// If [ctx] is canceled, returns immediately.
	Acquire(ctx context.Context, nodeID ids.NodeID)
}

// A system throttler that always immediately returns on [Acquire].
type noSystemThrottler struct{}

func (noSystemThrottler) Acquire(context.Context, ids.NodeID) {}

type SystemThrottlerConfig struct {
	Clock mockable.Clock `json:"-"`
	// The maximum amount of time we'll wait before re-checking whether a call
	// to [Acquire] can return.
	MaxRecheckDelay time.Duration `json:"maxRecheckDelay"`
}

type systemThrottler struct {
	SystemThrottlerConfig
	metrics *systemThrottlerMetrics
	// Tells us the target utilization of each node.
	targeter tracker.Targeter
	// Tells us the utilization of each node.
	tracker tracker.Tracker
	// Invariant: [timerPool] only returns timers that have been stopped and drained.
	timerPool sync.Pool
}

type systemThrottlerMetrics struct {
	totalWaits      prometheus.Counter
	totalNoWaits    prometheus.Counter
	awaitingAcquire prometheus.Gauge
}

func newSystemThrottlerMetrics(namespace string, reg prometheus.Registerer) (*systemThrottlerMetrics, error) {
	m := &systemThrottlerMetrics{
		totalWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_waits",
			Help:      "Number of times we've waited to read a message from a node because their usage was too high",
		}),
		totalNoWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_no_waits",
			Help:      "Number of times we didn't wait to read a message from a node because their usage wasn't too high",
		}),
		awaitingAcquire: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "throttler_awaiting_acquire",
			Help:      "Number of nodes we're waiting to read a message from because their usage is too high",
		}),
	}
	err := errors.Join(
		reg.Register(m.totalWaits),
		reg.Register(m.totalNoWaits),
		reg.Register(m.awaitingAcquire),
	)
	return m, err
}

func NewSystemThrottler(
	namespace string,
	reg prometheus.Registerer,
	config SystemThrottlerConfig,
	tracker tracker.Tracker,
	targeter tracker.Targeter,
) (SystemThrottler, error) {
	metrics, err := newSystemThrottlerMetrics(namespace, reg)
	if err != nil {
		return nil, fmt.Errorf("couldn't initialize system throttler metrics: %w", err)
	}
	return &systemThrottler{
		metrics:               metrics,
		SystemThrottlerConfig: config,
		targeter:              targeter,
		tracker:               tracker,
		timerPool: sync.Pool{
			New: func() interface{} {
				// Satisfy invariant that timer is stopped and drained.
				return timerpkg.StoppedTimer()
			},
		},
	}, nil
}
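
// A minimal construction sketch, not part of the original file: it assumes a
// tracker.Tracker and a tracker.Targeter (here cpuTracker and cpuTargeter)
// have already been built elsewhere, and shows NewSystemThrottler being wired
// to a fresh Prometheus registry with a 5-second re-check ceiling. The "cpu"
// namespace is an arbitrary placeholder.
//
//	reg := prometheus.NewRegistry()
//	throttler, err := NewSystemThrottler(
//		"cpu", // hypothetical metrics namespace
//		reg,
//		SystemThrottlerConfig{MaxRecheckDelay: 5 * time.Second},
//		cpuTracker,  // assumed tracker.Tracker
//		cpuTargeter, // assumed tracker.Targeter
//	)
//	if err != nil {
//		return nil, err
//	}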

func (t *systemThrottler) Acquire(ctx context.Context, nodeID ids.NodeID) {
	// [timer] fires when we should re-check whether this node's
	// usage has fallen to an acceptable level.
	// Lazily initialize timer only if we actually need to wait.
	var timer *time.Timer
	defer func() {
		if timer != nil { // We waited at least once for usage to fall.
			t.metrics.totalWaits.Inc()
			// Note that [t.metrics.awaitingAcquire.Inc()] was called once if
			// and only if [timer] is non-nil.
			t.metrics.awaitingAcquire.Dec()
		} else {
			t.metrics.totalNoWaits.Inc()
		}
	}()

	for {
		now := t.Clock.Time()
		// Get target usage for this node.
		target := t.targeter.TargetUsage(nodeID)
		// Get actual usage for this node.
		usage := t.tracker.Usage(nodeID, now)
		if usage <= target {
			return
		}
		// See how long it will take for actual usage to drop to the target,
		// assuming this node uses no more resources.
		waitDuration := t.tracker.TimeUntilUsage(nodeID, now, target)
		if waitDuration < epsilon {
			// If the amount of time until we reach the target is very small,
			// just return to avoid a situation where we excessively re-check.
			return
		}
		if waitDuration > t.MaxRecheckDelay {
			// Re-check at least every [t.MaxRecheckDelay] in case it will be a
			// very long time until usage reaches the target level.
			//
			// Note that not only can a node's usage decrease over time, but
			// also its target usage may increase.
			// In this case, the node's usage can drop to the target level
			// sooner than [waitDuration] because the target has increased.
			// The minimum re-check frequency accounts for that case by
			// optimistically re-checking whether the node's usage is now at an
			// acceptable level.
			waitDuration = t.MaxRecheckDelay
		}

		if timer == nil {
			// Note this is called at most once.
			t.metrics.awaitingAcquire.Inc()

			timer = t.timerPool.Get().(*time.Timer)
			defer t.timerPool.Put(timer)
		}

		timer.Reset(waitDuration)
		select {
		case <-ctx.Done():
			// Satisfy [t.timerPool] invariant.
			if !timer.Stop() {
				<-timer.C
			}
			return
		case <-timer.C:
		}
	}
}
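
// A minimal usage sketch, not part of the original file: a hypothetical peer
// read loop that calls Acquire before reading each message, so reads from a
// node whose usage exceeds its target block until the usage (or the target)
// recovers. readMessage and handle are assumed helpers, not APIs from this
// package. With MaxRecheckDelay set to 5 seconds as above, a blocked Acquire
// re-checks the node's usage at least every 5 seconds even when the tracker
// predicts a much longer wait.
//
//	func readLoop(ctx context.Context, throttler SystemThrottler, nodeID ids.NodeID) {
//		for {
//			// Blocks until this peer's usage is back under its target,
//			// or returns immediately once ctx is canceled.
//			throttler.Acquire(ctx, nodeID)
//			if ctx.Err() != nil {
//				return
//			}
//			msg, err := readMessage(nodeID) // assumed helper
//			if err != nil {
//				return
//			}
//			handle(msg) // assumed helper
//		}
//	}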