github.com/MetalBlockchain/metalgo@v1.11.9/network/throttling/inbound_resource_throttler.go

// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package throttling

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/MetalBlockchain/metalgo/ids"
	"github.com/MetalBlockchain/metalgo/snow/networking/tracker"
	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
)

const epsilon = time.Millisecond

var (
	_ SystemThrottler = (*systemThrottler)(nil)
	_ SystemThrottler = noSystemThrottler{}
)

// SystemThrottler rate-limits based on the system metrics usage caused by each
// peer. We will not read messages from peers whose messages cause excessive
// usage until the usage caused by the peer drops to an acceptable level.
type SystemThrottler interface {
	// Blocks until we can read a message from the given peer.
	// If [ctx] is canceled, returns immediately.
	Acquire(ctx context.Context, nodeID ids.NodeID)
}
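
// A minimal usage sketch (hypothetical caller code, not part of this package):
// a peer's message reader blocks on [Acquire] before reading the next message,
// so a peer whose usage is above its target is simply not read from until its
// usage recovers or the context is canceled.
//
//	throttler, err := NewSystemThrottler("metal", registerer, config, cpuTracker, cpuTargeter)
//	if err != nil {
//		return err
//	}
//	for {
//		throttler.Acquire(ctx, nodeID)      // blocks while nodeID's usage exceeds its target
//		msg, err := readNextMessage(nodeID) // hypothetical read
//		// ...
//	}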

// A system throttler that always immediately returns on [Acquire].
type noSystemThrottler struct{}

func (noSystemThrottler) Acquire(context.Context, ids.NodeID) {}

type SystemThrottlerConfig struct {
	Clock mockable.Clock `json:"-"`
	// The maximum amount of time we'll wait before re-checking whether a call
	// to [Acquire] can return.
	MaxRecheckDelay time.Duration `json:"maxRecheckDelay"`
}
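
// For illustration only, a config that re-checks a throttled peer at least
// once per second (the value here is an example, not a recommended default):
//
//	config := SystemThrottlerConfig{
//		MaxRecheckDelay: time.Second,
//	}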

type systemThrottler struct {
	SystemThrottlerConfig
	metrics *systemThrottlerMetrics
	// Tells us the target utilization of each node.
	targeter tracker.Targeter
	// Tells us the utilization of each node.
	tracker tracker.Tracker
	// Invariant: [timerPool] only returns timers that have been stopped and drained.
	timerPool sync.Pool
}

type systemThrottlerMetrics struct {
	totalWaits      prometheus.Counter
	totalNoWaits    prometheus.Counter
	awaitingAcquire prometheus.Gauge
}

func newSystemThrottlerMetrics(namespace string, reg prometheus.Registerer) (*systemThrottlerMetrics, error) {
	m := &systemThrottlerMetrics{
		totalWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_waits",
			Help:      "Number of times we've waited to read a message from a node because their usage was too high",
		}),
		totalNoWaits: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "throttler_total_no_waits",
			Help:      "Number of times we didn't wait to read a message from a node because its usage was at an acceptable level",
		}),
		awaitingAcquire: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "throttler_awaiting_acquire",
			Help:      "Number of nodes we're waiting to read a message from because their usage is too high",
		}),
	}
	err := errors.Join(
		reg.Register(m.totalWaits),
		reg.Register(m.totalNoWaits),
		reg.Register(m.awaitingAcquire),
	)
	return m, err
}

func NewSystemThrottler(
	namespace string,
	reg prometheus.Registerer,
	config SystemThrottlerConfig,
	tracker tracker.Tracker,
	targeter tracker.Targeter,
) (SystemThrottler, error) {
	metrics, err := newSystemThrottlerMetrics(namespace, reg)
	if err != nil {
		return nil, fmt.Errorf("couldn't initialize system throttler metrics: %w", err)
	}
	return &systemThrottler{
		metrics:               metrics,
		SystemThrottlerConfig: config,
		targeter:              targeter,
		tracker:               tracker,
		timerPool: sync.Pool{
			New: func() interface{} {
				// Satisfy invariant that timer is stopped and drained.
				timer := time.NewTimer(0)
				if !timer.Stop() {
					<-timer.C
				}
				return timer
			},
		},
	}, nil
}

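// Acquire blocks until [nodeID]'s measured usage is at, or within [epsilon] of
// reaching, its target usage, or until [ctx] is canceled. Each iteration of
// the loop below compares usage against the target and, if usage is still too
// high, waits for the smaller of the tracker's estimated time until usage
// reaches the target and [MaxRecheckDelay] before re-checking.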
func (t *systemThrottler) Acquire(ctx context.Context, nodeID ids.NodeID) {
	// [timer] fires when we should re-check whether this node's
	// usage has fallen to an acceptable level.
	// Lazily initialize timer only if we actually need to wait.
	var timer *time.Timer
	defer func() {
		if timer != nil { // We waited at least once for usage to fall.
			t.metrics.totalWaits.Inc()
			// Note that [t.metrics.awaitingAcquire.Inc()] was called exactly
			// once if and only if [timer] is non-nil, i.e. if and only if we
			// waited, so the gauge stays balanced.
			t.metrics.awaitingAcquire.Dec()
		} else {
			t.metrics.totalNoWaits.Inc()
		}
	}()

	for {
		now := t.Clock.Time()
		// Get target usage for this node.
		target := t.targeter.TargetUsage(nodeID)
		// Get actual usage for this node.
		usage := t.tracker.Usage(nodeID, now)
		if usage <= target {
			return
		}
		// See how long it will take for actual usage to drop to the target,
		// assuming this node uses no more resources.
		waitDuration := t.tracker.TimeUntilUsage(nodeID, now, target)
		if waitDuration < epsilon {
			// If the amount of time until we reach the target is very small,
			// just return to avoid a situation where we excessively re-check.
			return
		}
		if waitDuration > t.MaxRecheckDelay {
			// Re-check at least every [t.MaxRecheckDelay] in case it will be a
			// very long time until usage reaches the target level.
			//
			// Note that not only can a node's usage decrease over time, but
			// also its target usage may increase.
			// In this case, the node's usage can drop to the target level
			// sooner than [waitDuration] because the target has increased.
			// The minimum re-check frequency accounts for that case by
			// optimistically re-checking whether the node's usage is now at an
			// acceptable level.
			waitDuration = t.MaxRecheckDelay
		}

		// Reset [timer].
		if timer == nil {
			// Note this is called at most once.
			t.metrics.awaitingAcquire.Inc()

			timer = t.timerPool.Get().(*time.Timer)
			defer func() {
				// Satisfy [t.timerPool] invariant.
				if !timer.Stop() {
					// The default ensures we don't wait forever in the case
					// that the channel was already drained.
					select {
					case <-timer.C:
					default:
					}
				}
				t.timerPool.Put(timer)
			}()
		}
		timer.Reset(waitDuration)
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
		}
	}
}