github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/timeout/manager.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package timeout 5 6 import ( 7 "fmt" 8 "sync" 9 "time" 10 11 "github.com/prometheus/client_golang/prometheus" 12 13 "github.com/MetalBlockchain/metalgo/ids" 14 "github.com/MetalBlockchain/metalgo/message" 15 "github.com/MetalBlockchain/metalgo/snow" 16 "github.com/MetalBlockchain/metalgo/snow/networking/benchlist" 17 "github.com/MetalBlockchain/metalgo/utils/timer" 18 ) 19 20 var _ Manager = (*manager)(nil) 21 22 // Manages timeouts for requests sent to peers. 23 type Manager interface { 24 // Start the manager. Must be called before any other method. 25 // Should be called in a goroutine. 26 Dispatch() 27 // TimeoutDuration returns the current timeout duration. 28 TimeoutDuration() time.Duration 29 // IsBenched returns true if messages to [nodeID] regarding [chainID] 30 // should not be sent over the network and should immediately fail. 31 IsBenched(nodeID ids.NodeID, chainID ids.ID) bool 32 // Register the existence of the given chain. 33 // Must be called before any method calls that use the 34 // ID of the chain. 35 RegisterChain(ctx *snow.ConsensusContext) error 36 // RegisterRequest notes that we expect a response of type [op] from 37 // [nodeID] for chain [chainID]. If we don't receive a response in 38 // time, [timeoutHandler] is executed. 39 RegisterRequest( 40 nodeID ids.NodeID, 41 chainID ids.ID, 42 measureLatency bool, 43 requestID ids.RequestID, 44 timeoutHandler func(), 45 ) 46 // Registers that we would have sent a request to a validator but they 47 // are unreachable because they are benched or because of network conditions 48 // (e.g. we're not connected), so we didn't send the query. For the sake 49 // of calculating the average latency and network timeout, we act as 50 // though we sent the validator a request and it timed out. 51 RegisterRequestToUnreachableValidator() 52 // Registers that [nodeID] sent us a response of type [op] 53 // for the given chain. The response corresponds to the given 54 // requestID we sent them. [latency] is the time between us 55 // sending them the request and receiving their response. 56 RegisterResponse( 57 nodeID ids.NodeID, 58 chainID ids.ID, 59 requestID ids.RequestID, 60 op message.Op, 61 latency time.Duration, 62 ) 63 // Mark that we no longer expect a response to this request we sent. 64 // Does not modify the timeout. 65 RemoveRequest(requestID ids.RequestID) 66 67 // Stops the manager. 68 Stop() 69 } 70 71 func NewManager( 72 timeoutConfig *timer.AdaptiveTimeoutConfig, 73 benchlistMgr benchlist.Manager, 74 requestReg prometheus.Registerer, 75 responseReg prometheus.Registerer, 76 ) (Manager, error) { 77 tm, err := timer.NewAdaptiveTimeoutManager( 78 timeoutConfig, 79 requestReg, 80 ) 81 if err != nil { 82 return nil, fmt.Errorf("couldn't create timeout manager: %w", err) 83 } 84 85 m, err := newTimeoutMetrics(responseReg) 86 if err != nil { 87 return nil, fmt.Errorf("couldn't create timeout metrics: %w", err) 88 } 89 90 return &manager{ 91 tm: tm, 92 benchlistMgr: benchlistMgr, 93 metrics: m, 94 }, nil 95 } 96 97 type manager struct { 98 tm timer.AdaptiveTimeoutManager 99 benchlistMgr benchlist.Manager 100 metrics *timeoutMetrics 101 stopOnce sync.Once 102 } 103 104 func (m *manager) Dispatch() { 105 m.tm.Dispatch() 106 } 107 108 func (m *manager) TimeoutDuration() time.Duration { 109 return m.tm.TimeoutDuration() 110 } 111 112 // IsBenched returns true if messages to [nodeID] regarding [chainID] 113 // should not be sent over the network and should immediately fail. 114 func (m *manager) IsBenched(nodeID ids.NodeID, chainID ids.ID) bool { 115 return m.benchlistMgr.IsBenched(nodeID, chainID) 116 } 117 118 func (m *manager) RegisterChain(ctx *snow.ConsensusContext) error { 119 if err := m.metrics.RegisterChain(ctx); err != nil { 120 return fmt.Errorf("couldn't register timeout metrics for chain %s: %w", ctx.ChainID, err) 121 } 122 if err := m.benchlistMgr.RegisterChain(ctx); err != nil { 123 return fmt.Errorf("couldn't register chain %s with benchlist manager: %w", ctx.ChainID, err) 124 } 125 return nil 126 } 127 128 // RegisterRequest notes that we expect a response of type [op] from 129 // [nodeID] regarding chain [chainID]. If we don't receive a response in 130 // time, [timeoutHandler] is executed. 131 func (m *manager) RegisterRequest( 132 nodeID ids.NodeID, 133 chainID ids.ID, 134 measureLatency bool, 135 requestID ids.RequestID, 136 timeoutHandler func(), 137 ) { 138 newTimeoutHandler := func() { 139 if requestID.Op != byte(message.AppResponseOp) { 140 // If the request timed out and wasn't an AppRequest, tell the 141 // benchlist manager. 142 m.benchlistMgr.RegisterFailure(chainID, nodeID) 143 } 144 timeoutHandler() 145 } 146 m.tm.Put(requestID, measureLatency, newTimeoutHandler) 147 } 148 149 // RegisterResponse registers that we received a response from [nodeID] 150 // regarding the given request ID and chain. 151 func (m *manager) RegisterResponse( 152 nodeID ids.NodeID, 153 chainID ids.ID, 154 requestID ids.RequestID, 155 op message.Op, 156 latency time.Duration, 157 ) { 158 m.metrics.Observe(chainID, op, latency) 159 m.benchlistMgr.RegisterResponse(chainID, nodeID) 160 m.tm.Remove(requestID) 161 } 162 163 func (m *manager) RemoveRequest(requestID ids.RequestID) { 164 m.tm.Remove(requestID) 165 } 166 167 func (m *manager) RegisterRequestToUnreachableValidator() { 168 m.tm.ObserveLatency(m.TimeoutDuration()) 169 } 170 171 func (m *manager) Stop() { 172 m.stopOnce.Do(m.tm.Stop) 173 }