github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/timeout/manager.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package timeout
     5  
     6  import (
     7  	"fmt"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/prometheus/client_golang/prometheus"
    12  
    13  	"github.com/MetalBlockchain/metalgo/ids"
    14  	"github.com/MetalBlockchain/metalgo/message"
    15  	"github.com/MetalBlockchain/metalgo/snow"
    16  	"github.com/MetalBlockchain/metalgo/snow/networking/benchlist"
    17  	"github.com/MetalBlockchain/metalgo/utils/timer"
    18  )
    19  
// Compile-time assertion that *manager satisfies the Manager interface.
var _ Manager = (*manager)(nil)
    21  
// Manager manages timeouts for requests sent to peers.
type Manager interface {
	// Dispatch starts the manager. Must be called before any other method.
	// Should be called in a goroutine.
	Dispatch()
	// TimeoutDuration returns the current timeout duration.
	TimeoutDuration() time.Duration
	// IsBenched returns true if messages to [nodeID] regarding [chainID]
	// should not be sent over the network and should immediately fail.
	IsBenched(nodeID ids.NodeID, chainID ids.ID) bool
	// RegisterChain registers the existence of the given chain.
	// Must be called before any method calls that use the
	// ID of the chain.
	RegisterChain(ctx *snow.ConsensusContext) error
	// RegisterRequest notes that we expect a response to the request
	// identified by [requestID] from [nodeID] for chain [chainID]. If we
	// don't receive a response in time, [timeoutHandler] is executed.
	// NOTE(review): [measureLatency] presumably controls whether this
	// request contributes to the adaptive timeout calculation — confirm
	// against timer.AdaptiveTimeoutManager.
	RegisterRequest(
		nodeID ids.NodeID,
		chainID ids.ID,
		measureLatency bool,
		requestID ids.RequestID,
		timeoutHandler func(),
	)
	// RegisterRequestToUnreachableValidator registers that we would have
	// sent a request to a validator but they are unreachable because they
	// are benched or because of network conditions (e.g. we're not
	// connected), so we didn't send the query. For the sake of calculating
	// the average latency and network timeout, we act as though we sent the
	// validator a request and it timed out.
	RegisterRequestToUnreachableValidator()
	// RegisterResponse registers that [nodeID] sent us a response of type
	// [op] for the given chain. The response corresponds to the given
	// [requestID] we sent them. [latency] is the time between us
	// sending them the request and receiving their response.
	RegisterResponse(
		nodeID ids.NodeID,
		chainID ids.ID,
		requestID ids.RequestID,
		op message.Op,
		latency time.Duration,
	)
	// RemoveRequest marks that we no longer expect a response to this
	// request we sent. Does not modify the timeout.
	RemoveRequest(requestID ids.RequestID)

	// Stop stops the manager.
	Stop()
}
    70  
    71  func NewManager(
    72  	timeoutConfig *timer.AdaptiveTimeoutConfig,
    73  	benchlistMgr benchlist.Manager,
    74  	requestReg prometheus.Registerer,
    75  	responseReg prometheus.Registerer,
    76  ) (Manager, error) {
    77  	tm, err := timer.NewAdaptiveTimeoutManager(
    78  		timeoutConfig,
    79  		requestReg,
    80  	)
    81  	if err != nil {
    82  		return nil, fmt.Errorf("couldn't create timeout manager: %w", err)
    83  	}
    84  
    85  	m, err := newTimeoutMetrics(responseReg)
    86  	if err != nil {
    87  		return nil, fmt.Errorf("couldn't create timeout metrics: %w", err)
    88  	}
    89  
    90  	return &manager{
    91  		tm:           tm,
    92  		benchlistMgr: benchlistMgr,
    93  		metrics:      m,
    94  	}, nil
    95  }
    96  
// manager is the Manager implementation in this package. Its methods
// delegate to the fields below:
//   - tm: the adaptive timeout manager that outstanding requests are put
//     into and removed from, and that supplies the current timeout duration.
//   - benchlistMgr: answers IsBenched queries and is told about chain
//     registrations, request failures, and responses.
//   - metrics: receives chain registrations and response latency
//     observations.
//   - stopOnce: guards the call to tm.Stop made through Stop so it runs at
//     most once.
type manager struct {
	tm           timer.AdaptiveTimeoutManager
	benchlistMgr benchlist.Manager
	metrics      *timeoutMetrics
	stopOnce     sync.Once
}
   103  
// Dispatch starts the manager by calling Dispatch on the underlying adaptive
// timeout manager. The Manager interface asks callers to run it in a
// goroutine.
func (m *manager) Dispatch() {
	m.tm.Dispatch()
}
   107  
// TimeoutDuration returns the current timeout duration as reported by the
// underlying adaptive timeout manager.
func (m *manager) TimeoutDuration() time.Duration {
	return m.tm.TimeoutDuration()
}
   111  
// IsBenched returns true if messages to [nodeID] regarding [chainID]
// should not be sent over the network and should immediately fail.
// The decision is delegated entirely to the benchlist manager.
func (m *manager) IsBenched(nodeID ids.NodeID, chainID ids.ID) bool {
	return m.benchlistMgr.IsBenched(nodeID, chainID)
}
   117  
   118  func (m *manager) RegisterChain(ctx *snow.ConsensusContext) error {
   119  	if err := m.metrics.RegisterChain(ctx); err != nil {
   120  		return fmt.Errorf("couldn't register timeout metrics for chain %s: %w", ctx.ChainID, err)
   121  	}
   122  	if err := m.benchlistMgr.RegisterChain(ctx); err != nil {
   123  		return fmt.Errorf("couldn't register chain %s with benchlist manager: %w", ctx.ChainID, err)
   124  	}
   125  	return nil
   126  }
   127  
   128  // RegisterRequest notes that we expect a response of type [op] from
   129  // [nodeID] regarding chain [chainID]. If we don't receive a response in
   130  // time, [timeoutHandler]  is executed.
   131  func (m *manager) RegisterRequest(
   132  	nodeID ids.NodeID,
   133  	chainID ids.ID,
   134  	measureLatency bool,
   135  	requestID ids.RequestID,
   136  	timeoutHandler func(),
   137  ) {
   138  	newTimeoutHandler := func() {
   139  		if requestID.Op != byte(message.AppResponseOp) {
   140  			// If the request timed out and wasn't an AppRequest, tell the
   141  			// benchlist manager.
   142  			m.benchlistMgr.RegisterFailure(chainID, nodeID)
   143  		}
   144  		timeoutHandler()
   145  	}
   146  	m.tm.Put(requestID, measureLatency, newTimeoutHandler)
   147  }
   148  
// RegisterResponse registers that we received a response from [nodeID]
// regarding the given request ID and chain. It performs three steps in
// order: records [latency] for ([chainID], [op]) in the timeout metrics,
// informs the benchlist manager that [nodeID] responded on [chainID], and
// removes [requestID] from the adaptive timeout manager.
func (m *manager) RegisterResponse(
	nodeID ids.NodeID,
	chainID ids.ID,
	requestID ids.RequestID,
	op message.Op,
	latency time.Duration,
) {
	m.metrics.Observe(chainID, op, latency)
	m.benchlistMgr.RegisterResponse(chainID, nodeID)
	m.tm.Remove(requestID)
}
   162  
// RemoveRequest removes [requestID] from the adaptive timeout manager,
// marking that we no longer expect a response to it. Unlike RegisterResponse,
// it does not touch the metrics or the benchlist manager.
func (m *manager) RemoveRequest(requestID ids.RequestID) {
	m.tm.Remove(requestID)
}
   166  
   167  func (m *manager) RegisterRequestToUnreachableValidator() {
   168  	m.tm.ObserveLatency(m.TimeoutDuration())
   169  }
   170  
// Stop stops the manager by stopping the underlying adaptive timeout manager.
// The call is routed through stopOnce, so the underlying Stop is invoked at
// most once no matter how many times this method is called.
func (m *manager) Stop() {
	m.stopOnce.Do(m.tm.Stop)
}