github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/p2p/unicast/manager.go (about) 1 package unicast 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "time" 8 9 "github.com/go-playground/validator/v10" 10 "github.com/hashicorp/go-multierror" 11 libp2pnet "github.com/libp2p/go-libp2p/core/network" 12 "github.com/libp2p/go-libp2p/core/peer" 13 "github.com/libp2p/go-libp2p/core/protocol" 14 "github.com/libp2p/go-libp2p/p2p/net/swarm" 15 "github.com/rs/zerolog" 16 "github.com/sethvargo/go-retry" 17 18 "github.com/onflow/flow-go/model/flow" 19 "github.com/onflow/flow-go/module" 20 "github.com/onflow/flow-go/network/p2p" 21 p2plogging "github.com/onflow/flow-go/network/p2p/logging" 22 "github.com/onflow/flow-go/network/p2p/unicast/protocols" 23 "github.com/onflow/flow-go/network/p2p/unicast/stream" 24 "github.com/onflow/flow-go/utils/logging" 25 ) 26 27 const ( 28 // MaxRetryJitter is the maximum number of milliseconds to wait between attempts for a 1-1 direct connection 29 MaxRetryJitter = 5 30 ) 31 32 var ( 33 _ p2p.UnicastManager = (*Manager)(nil) 34 ) 35 36 type DialConfigCacheFactory func(configFactory func() Config) ConfigCache 37 38 // Manager manages libp2p stream negotiation and creation, which is utilized for unicast dispatches. 39 type Manager struct { 40 logger zerolog.Logger 41 streamFactory p2p.StreamFactory 42 protocols []protocols.Protocol 43 defaultHandler libp2pnet.StreamHandler 44 sporkId flow.Identifier 45 metrics module.UnicastManagerMetrics 46 47 // createStreamBackoffDelay is the delay between each stream creation retry attempt. 48 // The manager uses an exponential backoff strategy to retry stream creation, and this parameter 49 // is the initial delay between each retry attempt. The delay is doubled after each retry attempt. 50 createStreamBackoffDelay time.Duration 51 52 // dialConfigCache is a cache to store the dial config for each peer. 53 // TODO: encapsulation can be further improved by wrapping the dialConfigCache together with the dial config adjustment logic into a single struct. 54 dialConfigCache ConfigCache 55 56 // streamZeroBackoffResetThreshold is the threshold that determines when to reset the stream creation backoff budget to the default value. 57 // 58 // For example the default value of 100 means that if the stream creation backoff budget is decreased to 0, then it will be reset to default value 59 // when the number of consecutive successful streams reaches 100. 60 // 61 // This is to prevent the backoff budget from being reset too frequently, as the backoff budget is used to gauge the reliability of the stream creation. 62 // When the stream creation backoff budget is reset to the default value, it means that the stream creation is reliable enough to be trusted again. 63 // This parameter mandates when the stream creation is reliable enough to be trusted again; i.e., when the number of consecutive successful streams reaches this threshold. 64 // Note that the counter is reset to 0 when the stream creation fails, so the value of for example 100 means that the stream creation is reliable enough that the recent 65 // 100 stream creations are all successful. 66 streamZeroBackoffResetThreshold uint64 67 68 // maxStreamCreationAttemptTimes is the maximum number of attempts to be made to create a stream to a remote node over a direct unicast (1:1) connection before we give up. 69 maxStreamCreationAttemptTimes uint64 70 } 71 72 // NewUnicastManager creates a new unicast manager. 73 // Args: 74 // - cfg: configuration for the unicast manager. 75 // 76 // Returns: 77 // - a new unicast manager. 78 // - an error if the configuration is invalid, any error is irrecoverable. 79 func NewUnicastManager(cfg *ManagerConfig) (*Manager, error) { 80 if err := validator.New().Struct(cfg); err != nil { 81 return nil, fmt.Errorf("invalid unicast manager config: %w", err) 82 } 83 84 m := &Manager{ 85 logger: cfg.Logger.With().Str("module", "unicast-manager").Logger(), 86 dialConfigCache: cfg.UnicastConfigCacheFactory(func() Config { 87 return Config{ 88 StreamCreationRetryAttemptBudget: cfg.Parameters.MaxStreamCreationRetryAttemptTimes, 89 } 90 }), 91 streamFactory: cfg.StreamFactory, 92 sporkId: cfg.SporkId, 93 metrics: cfg.Metrics, 94 createStreamBackoffDelay: cfg.Parameters.CreateStreamBackoffDelay, 95 streamZeroBackoffResetThreshold: cfg.Parameters.StreamZeroRetryResetThreshold, 96 maxStreamCreationAttemptTimes: cfg.Parameters.MaxStreamCreationRetryAttemptTimes, 97 } 98 99 m.logger.Info(). 100 Hex("spork_id", logging.ID(cfg.SporkId)). 101 Dur("create_stream_backoff_delay", cfg.Parameters.CreateStreamBackoffDelay). 102 Uint64("stream_zero_backoff_reset_threshold", cfg.Parameters.StreamZeroRetryResetThreshold). 103 Msg("unicast manager created") 104 105 return m, nil 106 } 107 108 // SetDefaultHandler sets the default stream handler for this unicast manager. The default handler is utilized 109 // as the core handler for other unicast protocols, e.g., compressions. 110 func (m *Manager) SetDefaultHandler(defaultHandler libp2pnet.StreamHandler) { 111 defaultProtocolID := protocols.FlowProtocolID(m.sporkId) 112 if len(m.protocols) > 0 { 113 panic("default handler must be set only once before any unicast registration") 114 } 115 116 m.defaultHandler = defaultHandler 117 118 m.protocols = []protocols.Protocol{ 119 stream.NewPlainStream(defaultHandler, defaultProtocolID), 120 } 121 122 m.streamFactory.SetStreamHandler(defaultProtocolID, defaultHandler) 123 m.logger.Info().Str("protocol_id", string(defaultProtocolID)).Msg("default unicast handler registered") 124 } 125 126 // Register registers given protocol name as preferred unicast. Each invocation of register prioritizes the current protocol 127 // over previously registered ones. 128 func (m *Manager) Register(protocol protocols.ProtocolName) error { 129 factory, err := protocols.ToProtocolFactory(protocol) 130 if err != nil { 131 return fmt.Errorf("could not translate protocol name into factory: %w", err) 132 } 133 134 u := factory(m.logger, m.sporkId, m.defaultHandler) 135 136 m.protocols = append(m.protocols, u) 137 m.streamFactory.SetStreamHandler(u.ProtocolId(), u.Handler) 138 m.logger.Info().Str("protocol_id", string(u.ProtocolId())).Msg("unicast handler registered") 139 140 return nil 141 } 142 143 // CreateStream tries establishing a libp2p stream to the remote peer id. It tries creating streams in the descending order of preference until 144 // it either creates a successful stream or runs out of options. 145 // Args: 146 // - ctx: context for the stream creation. 147 // - peerID: peer ID of the remote peer. 148 // 149 // Returns: 150 // - a new libp2p stream. 151 // - error if the stream creation fails; the error is benign and can be retried. 152 func (m *Manager) CreateStream(ctx context.Context, peerID peer.ID) (libp2pnet.Stream, error) { 153 var errs error 154 dialCfg, err := m.getDialConfig(peerID) 155 if err != nil { 156 // TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot 157 // guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream. 158 // We have to revisit this once we studied the error handling paths in the unicast manager. 159 m.logger.Fatal(). 160 Err(err). 161 Bool(logging.KeyNetworkingSecurity, true). 162 Str("peer_id", p2plogging.PeerId(peerID)). 163 Msg("failed to retrieve dial config for peer id") 164 } 165 166 m.logger.Debug(). 167 Str("peer_id", p2plogging.PeerId(peerID)). 168 Str("dial_config", fmt.Sprintf("%+v", dialCfg)). 169 Msg("dial config for the peer retrieved") 170 171 for i := len(m.protocols) - 1; i >= 0; i-- { 172 s, err := m.createStream(ctx, peerID, m.protocols[i], dialCfg) 173 if err != nil { 174 errs = multierror.Append(errs, err) 175 continue 176 } 177 178 // return first successful stream 179 return s, nil 180 } 181 182 updatedCfg, err := m.adjustUnsuccessfulStreamAttempt(peerID) 183 if err != nil { 184 // TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot 185 // guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream. 186 // We have to revisit this once we studied the error handling paths in the unicast manager. 187 m.logger.Fatal(). 188 Err(err). 189 Bool(logging.KeyNetworkingSecurity, true). 190 Str("peer_id", p2plogging.PeerId(peerID)). 191 Msg("failed to adjust dial config for peer id") 192 } 193 194 m.logger.Warn(). 195 Err(errs). 196 Bool(logging.KeySuspicious, true). 197 Str("peer_id", p2plogging.PeerId(peerID)). 198 Str("dial_config", fmt.Sprintf("%+v", updatedCfg)). 199 Msg("failed to create stream to peer id, dial config adjusted") 200 201 return nil, fmt.Errorf("could not create stream on any available unicast protocol: %w", errs) 202 } 203 204 // createStream attempts to establish a new stream with a peer using the specified protocol. It employs 205 // exponential backoff with a maximum number of attempts defined by dialCfg.StreamCreationRetryAttemptBudget. 206 // If the stream cannot be established after the maximum attempts, it returns a compiled multierror of all 207 // encountered errors. Errors related to in-progress dials trigger a retry until a connection is established 208 // or the attempt budget is exhausted. 209 // 210 // The function increments the Config's ConsecutiveSuccessfulStream count upon success. In the case of 211 // adjustment errors in Config, a fatal error is logged indicating an issue that requires attention. 212 // Metrics are collected to monitor the duration and number of attempts for stream creation. 213 // 214 // Arguments: 215 // - ctx: Context to control the lifecycle of the stream creation. 216 // - peerID: The ID of the peer with which the stream is to be established. 217 // - protocol: The specific protocol used for the stream. 218 // - dialCfg: Configuration parameters for dialing and stream creation, including retry logic. 219 // 220 // Returns: 221 // - libp2pnet.Stream: The successfully created stream, or nil if the stream creation fails. 222 // - error: An aggregated multierror of all encountered errors during stream creation, or nil if successful; any returned error is benign and can be retried. 223 func (m *Manager) createStream(ctx context.Context, peerID peer.ID, protocol protocols.Protocol, dialCfg *Config) (libp2pnet.Stream, error) { 224 var err error 225 var s libp2pnet.Stream 226 227 s, err = m.createStreamWithRetry(ctx, peerID, protocol.ProtocolId(), dialCfg) 228 if err != nil { 229 return nil, fmt.Errorf("failed to create a stream to peer: %w", err) 230 } 231 232 s, err = protocol.UpgradeRawStream(s) 233 if err != nil { 234 return nil, fmt.Errorf("failed to upgrade raw stream: %w", err) 235 } 236 237 updatedConfig, err := m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) { 238 config.ConsecutiveSuccessfulStream++ // increase consecutive successful stream count. 239 return config, nil 240 }) 241 if err != nil { 242 // This is not a connection retryable error, this is a fatal error. 243 // TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot 244 // guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream. 245 // We have to revisit this once we studied the error handling paths in the unicast manager. 246 m.logger.Fatal(). 247 Err(err). 248 Bool(logging.KeyNetworkingSecurity, true). 249 Str("peer_id", p2plogging.PeerId(peerID)). 250 Msg("failed to adjust dial config for peer id") 251 } 252 m.logger.Debug(). 253 Str("peer_id", p2plogging.PeerId(peerID)). 254 Str("updated_dial_config", fmt.Sprintf("%+v", updatedConfig)). 255 Msg("stream created successfully") 256 return s, nil 257 } 258 259 // createStreamWithRetry attempts to create a new stream to the specified peer using the given protocolID. 260 // This function is streamlined for use-cases where retries are managed externally or 261 // not required at all. 262 // 263 // Expected errors: 264 // - If the context expires before stream creation, it returns a context-related error with the number of attempts. 265 // - If the protocol ID is not supported, no retries are attempted and the error is returned immediately. 266 // 267 // Metrics are collected to monitor the duration and attempts of the stream creation process. 268 // 269 // Arguments: 270 // - ctx: Context to control the lifecycle of the stream creation. 271 // - peerID: The ID of the peer with which the stream is to be established. 272 // - protocolID: The identifier for the protocol used for the stream. 273 // - dialCfg: Configuration parameters for dialing, including the retry attempt budget. 274 // 275 // Returns: 276 // - libp2pnet.Stream: The successfully created stream, or nil if an error occurs. 277 // - error: An error encountered during the stream creation, or nil if the stream is successfully established. 278 func (m *Manager) createStreamWithRetry(ctx context.Context, peerID peer.ID, protocolID protocol.ID, dialCfg *Config) (libp2pnet.Stream, error) { 279 // aggregated retryable errors that occur during retries, errs will be returned 280 // if retry context times out or maxAttempts have been made before a successful retry occurs 281 var errs error 282 var s libp2pnet.Stream 283 attempts := 0 284 f := func(context.Context) error { 285 attempts++ 286 select { 287 case <-ctx.Done(): 288 return fmt.Errorf("context done before stream could be created (retry attempt: %d, errors: %w)", attempts, errs) 289 default: 290 } 291 292 var err error 293 // creates stream using stream factory 294 s, err = m.streamFactory.NewStream(ctx, peerID, protocolID) 295 if err != nil { 296 // if the stream creation failed due to invalid protocol id or no address, skip the re-attempt 297 if stream.IsErrProtocolNotSupported(err) || 298 errors.Is(err, swarm.ErrNoAddresses) || 299 stream.IsErrSecurityProtocolNegotiationFailed(err) || 300 stream.IsErrGaterDisallowedConnection(err) { 301 return err 302 } 303 return retry.RetryableError(multierror.Append(errs, err)) 304 } 305 return nil 306 } 307 308 start := time.Now() 309 err := retry.Do(ctx, retryBackoff(dialCfg.StreamCreationRetryAttemptBudget, m.createStreamBackoffDelay), f) 310 duration := time.Since(start) 311 if err != nil { 312 m.metrics.OnEstablishStreamFailure(duration, attempts) 313 return nil, retryFailedError(uint64(attempts), dialCfg.StreamCreationRetryAttemptBudget, fmt.Errorf("failed to create a stream to peer: %w", err)) 314 } 315 m.metrics.OnStreamEstablished(duration, attempts) 316 return s, nil 317 } 318 319 // retryBackoff creates and returns a retry exponential backoff with the given maximum number of retries. 320 // Note that the retryBackoff by default makes one attempt. Hence, that total number of attempts are 1 + maxRetries. 321 // Args: 322 // - maxRetries: maximum number of retries (in addition to the first backoff). 323 // - retryInterval: initial retry interval for exponential backoff. 324 // Returns: 325 // - a retry backoff object that makes maximum of maxRetries + 1 attempts. 326 func retryBackoff(maxRetries uint64, retryInterval time.Duration) retry.Backoff { 327 // create backoff 328 backoff := retry.NewConstant(retryInterval) 329 // add a MaxRetryJitter*time.Millisecond jitter to our backoff to ensure that this node and the target node don't attempt to reconnect at the same time 330 backoff = retry.WithJitter(MaxRetryJitter*time.Millisecond, backoff) 331 332 // https://github.com/sethvargo/go-retry#maxretries retries counter starts at zero and library will make last attempt 333 // when retries == maxRetries. Hence, the total number of invocations is maxRetires + 1 334 backoff = retry.WithMaxRetries(maxRetries, backoff) 335 return backoff 336 } 337 338 // retryFailedError wraps the given error in a ErrMaxRetries if maxAttempts were made. 339 func retryFailedError(dialAttempts, maxAttempts uint64, err error) error { 340 if dialAttempts == maxAttempts { 341 return NewMaxRetriesErr(dialAttempts, err) 342 } 343 return err 344 } 345 346 // getDialConfig gets the dial config for the given peer id. 347 // It also adjusts the dial config if necessary based on the current dial config, i.e., it resets the dial backoff budget to the default value if the last successful dial was long enough ago, 348 // and it resets the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold. 349 // Args: 350 // - peerID: peer id of the remote peer. 351 // 352 // Returns: 353 // - dial config for the given peer id. 354 // - error if the dial config cannot be retrieved or adjusted; any error is irrecoverable and indicates a fatal error. 355 func (m *Manager) getDialConfig(peerID peer.ID) (*Config, error) { 356 dialCfg, err := m.dialConfigCache.GetWithInit(peerID) 357 if err != nil { 358 return nil, fmt.Errorf("failed to get or init dial config for peer id: %w", err) 359 } 360 361 if dialCfg.StreamCreationRetryAttemptBudget == uint64(0) && dialCfg.ConsecutiveSuccessfulStream >= m.streamZeroBackoffResetThreshold { 362 // reset the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold, 363 // as the stream creation is reliable enough to be trusted again. 364 dialCfg, err = m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) { 365 config.StreamCreationRetryAttemptBudget = m.maxStreamCreationAttemptTimes 366 m.metrics.OnStreamCreationRetryBudgetUpdated(config.StreamCreationRetryAttemptBudget) 367 m.metrics.OnStreamCreationRetryBudgetResetToDefault() 368 return config, nil 369 }) 370 if err != nil { 371 return nil, fmt.Errorf("failed to adjust dial config for peer id (resetting stream creation attempt budget): %w", err) 372 } 373 } 374 return dialCfg, nil 375 } 376 377 // adjustUnsuccessfulStreamAttempt adjusts the dial config for the given peer id if the stream creation fails. 378 // It resets the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold, 379 // and it resets the dial backoff budget to the default value if there is no connection to the peer. 380 // Args: 381 // - peerID: peer id of the remote peer. 382 // 383 // Returns: 384 // - dial config for the given peer id. 385 // - connected indicates whether there is a connection to the peer. 386 // - error if the dial config cannot be adjusted; any error is irrecoverable and indicates a fatal error. 387 func (m *Manager) adjustUnsuccessfulStreamAttempt(peerID peer.ID) (*Config, error) { 388 updatedCfg, err := m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) { 389 // consecutive successful stream count is reset to 0 if we fail to create a stream or connection to the peer. 390 config.ConsecutiveSuccessfulStream = 0 391 392 // there is a connection to the peer it means that the stream creation failed, hence we decrease the stream backoff budget 393 // to try to create a stream with a more strict dial config next time. 394 if config.StreamCreationRetryAttemptBudget > 0 { 395 config.StreamCreationRetryAttemptBudget-- 396 m.metrics.OnStreamCreationRetryBudgetUpdated(config.StreamCreationRetryAttemptBudget) 397 } 398 399 return config, nil 400 }) 401 402 if err != nil { 403 return nil, fmt.Errorf("failed to adjust dial config for peer id: %w", err) 404 } 405 406 return updatedCfg, nil 407 }