github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/p2p/unicast/manager.go (about)

     1  package unicast
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/go-playground/validator/v10"
    10  	"github.com/hashicorp/go-multierror"
    11  	libp2pnet "github.com/libp2p/go-libp2p/core/network"
    12  	"github.com/libp2p/go-libp2p/core/peer"
    13  	"github.com/libp2p/go-libp2p/core/protocol"
    14  	"github.com/libp2p/go-libp2p/p2p/net/swarm"
    15  	"github.com/rs/zerolog"
    16  	"github.com/sethvargo/go-retry"
    17  
    18  	"github.com/onflow/flow-go/model/flow"
    19  	"github.com/onflow/flow-go/module"
    20  	"github.com/onflow/flow-go/network/p2p"
    21  	p2plogging "github.com/onflow/flow-go/network/p2p/logging"
    22  	"github.com/onflow/flow-go/network/p2p/unicast/protocols"
    23  	"github.com/onflow/flow-go/network/p2p/unicast/stream"
    24  	"github.com/onflow/flow-go/utils/logging"
    25  )
    26  
const (
	// MaxRetryJitter is the upper bound, in milliseconds, of the random jitter that retryBackoff adds to the
	// delay between consecutive stream creation attempts for a 1-1 direct connection.
	MaxRetryJitter = 5
)
    31  
var (
	// compile-time assertion that *Manager satisfies the p2p.UnicastManager interface.
	_ p2p.UnicastManager = (*Manager)(nil)
)
    35  
// DialConfigCacheFactory is a constructor for a ConfigCache. It receives configFactory, a function that
// produces a Config value.
// NOTE(review): configFactory presumably supplies the initial config for peers that are not yet cached —
// confirm against the ConfigCache implementation.
type DialConfigCacheFactory func(configFactory func() Config) ConfigCache
    37  
// Manager manages libp2p stream negotiation and creation, which is utilized for unicast dispatches.
type Manager struct {
	// logger is the component logger, tagged with module "unicast-manager" by NewUnicastManager.
	logger         zerolog.Logger
	// streamFactory creates outbound streams and registers the stream handlers of the unicast protocols.
	streamFactory  p2p.StreamFactory
	// protocols holds the registered unicast protocols in registration order; CreateStream walks this
	// slice from the last element to the first, so later registrations are preferred.
	protocols      []protocols.Protocol
	// defaultHandler is the stream handler set through SetDefaultHandler; it is handed to the protocol
	// factories in Register.
	defaultHandler libp2pnet.StreamHandler
	// sporkId is used to derive the default protocol id and is passed to the protocol factories.
	sporkId        flow.Identifier
	// metrics collects stream establishment and retry budget metrics.
	metrics        module.UnicastManagerMetrics

	// createStreamBackoffDelay is the base delay between stream creation retry attempts.
	// It is passed as the retry interval to retryBackoff, which builds the backoff policy used
	// when retrying stream creation.
	createStreamBackoffDelay time.Duration

	// dialConfigCache is a cache to store the dial config for each peer.
	// TODO: encapsulation can be further improved by wrapping the dialConfigCache together with the dial config adjustment logic into a single struct.
	dialConfigCache ConfigCache

	// streamZeroBackoffResetThreshold is the threshold that determines when to reset the stream creation backoff budget to the default value.
	//
	// For example the default value of 100 means that if the stream creation backoff budget is decreased to 0, then it will be reset to default value
	// when the number of consecutive successful streams reaches 100.
	//
	// This is to prevent the backoff budget from being reset too frequently, as the backoff budget is used to gauge the reliability of the stream creation.
	// When the stream creation backoff budget is reset to the default value, it means that the stream creation is reliable enough to be trusted again.
	// This parameter mandates when the stream creation is reliable enough to be trusted again; i.e., when the number of consecutive successful streams reaches this threshold.
	// Note that the counter is reset to 0 when the stream creation fails, so a value of, for example, 100 means that the stream creation is considered reliable once the most recent
	// 100 stream creations were all successful.
	streamZeroBackoffResetThreshold uint64

	// maxStreamCreationAttemptTimes is the default stream creation retry budget for a remote node over a direct unicast (1:1) connection.
	// getDialConfig restores a peer's StreamCreationRetryAttemptBudget to this value once the budget has dropped to zero and the
	// consecutive successful stream count has reached streamZeroBackoffResetThreshold.
	maxStreamCreationAttemptTimes uint64
}
    71  
    72  // NewUnicastManager creates a new unicast manager.
    73  // Args:
    74  //   - cfg: configuration for the unicast manager.
    75  //
    76  // Returns:
    77  //   - a new unicast manager.
    78  //   - an error if the configuration is invalid, any error is irrecoverable.
    79  func NewUnicastManager(cfg *ManagerConfig) (*Manager, error) {
    80  	if err := validator.New().Struct(cfg); err != nil {
    81  		return nil, fmt.Errorf("invalid unicast manager config: %w", err)
    82  	}
    83  
    84  	m := &Manager{
    85  		logger: cfg.Logger.With().Str("module", "unicast-manager").Logger(),
    86  		dialConfigCache: cfg.UnicastConfigCacheFactory(func() Config {
    87  			return Config{
    88  				StreamCreationRetryAttemptBudget: cfg.Parameters.MaxStreamCreationRetryAttemptTimes,
    89  			}
    90  		}),
    91  		streamFactory:                   cfg.StreamFactory,
    92  		sporkId:                         cfg.SporkId,
    93  		metrics:                         cfg.Metrics,
    94  		createStreamBackoffDelay:        cfg.Parameters.CreateStreamBackoffDelay,
    95  		streamZeroBackoffResetThreshold: cfg.Parameters.StreamZeroRetryResetThreshold,
    96  		maxStreamCreationAttemptTimes:   cfg.Parameters.MaxStreamCreationRetryAttemptTimes,
    97  	}
    98  
    99  	m.logger.Info().
   100  		Hex("spork_id", logging.ID(cfg.SporkId)).
   101  		Dur("create_stream_backoff_delay", cfg.Parameters.CreateStreamBackoffDelay).
   102  		Uint64("stream_zero_backoff_reset_threshold", cfg.Parameters.StreamZeroRetryResetThreshold).
   103  		Msg("unicast manager created")
   104  
   105  	return m, nil
   106  }
   107  
   108  // SetDefaultHandler sets the default stream handler for this unicast manager. The default handler is utilized
   109  // as the core handler for other unicast protocols, e.g., compressions.
   110  func (m *Manager) SetDefaultHandler(defaultHandler libp2pnet.StreamHandler) {
   111  	defaultProtocolID := protocols.FlowProtocolID(m.sporkId)
   112  	if len(m.protocols) > 0 {
   113  		panic("default handler must be set only once before any unicast registration")
   114  	}
   115  
   116  	m.defaultHandler = defaultHandler
   117  
   118  	m.protocols = []protocols.Protocol{
   119  		stream.NewPlainStream(defaultHandler, defaultProtocolID),
   120  	}
   121  
   122  	m.streamFactory.SetStreamHandler(defaultProtocolID, defaultHandler)
   123  	m.logger.Info().Str("protocol_id", string(defaultProtocolID)).Msg("default unicast handler registered")
   124  }
   125  
   126  // Register registers given protocol name as preferred unicast. Each invocation of register prioritizes the current protocol
   127  // over previously registered ones.
   128  func (m *Manager) Register(protocol protocols.ProtocolName) error {
   129  	factory, err := protocols.ToProtocolFactory(protocol)
   130  	if err != nil {
   131  		return fmt.Errorf("could not translate protocol name into factory: %w", err)
   132  	}
   133  
   134  	u := factory(m.logger, m.sporkId, m.defaultHandler)
   135  
   136  	m.protocols = append(m.protocols, u)
   137  	m.streamFactory.SetStreamHandler(u.ProtocolId(), u.Handler)
   138  	m.logger.Info().Str("protocol_id", string(u.ProtocolId())).Msg("unicast handler registered")
   139  
   140  	return nil
   141  }
   142  
   143  // CreateStream tries establishing a libp2p stream to the remote peer id. It tries creating streams in the descending order of preference until
   144  // it either creates a successful stream or runs out of options.
   145  // Args:
   146  //   - ctx: context for the stream creation.
   147  //   - peerID: peer ID of the remote peer.
   148  //
   149  // Returns:
   150  //   - a new libp2p stream.
   151  //   - error if the stream creation fails; the error is benign and can be retried.
   152  func (m *Manager) CreateStream(ctx context.Context, peerID peer.ID) (libp2pnet.Stream, error) {
   153  	var errs error
   154  	dialCfg, err := m.getDialConfig(peerID)
   155  	if err != nil {
   156  		// TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot
   157  		//       guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream.
   158  		//       We have to revisit this once we studied the error handling paths in the unicast manager.
   159  		m.logger.Fatal().
   160  			Err(err).
   161  			Bool(logging.KeyNetworkingSecurity, true).
   162  			Str("peer_id", p2plogging.PeerId(peerID)).
   163  			Msg("failed to retrieve dial config for peer id")
   164  	}
   165  
   166  	m.logger.Debug().
   167  		Str("peer_id", p2plogging.PeerId(peerID)).
   168  		Str("dial_config", fmt.Sprintf("%+v", dialCfg)).
   169  		Msg("dial config for the peer retrieved")
   170  
   171  	for i := len(m.protocols) - 1; i >= 0; i-- {
   172  		s, err := m.createStream(ctx, peerID, m.protocols[i], dialCfg)
   173  		if err != nil {
   174  			errs = multierror.Append(errs, err)
   175  			continue
   176  		}
   177  
   178  		// return first successful stream
   179  		return s, nil
   180  	}
   181  
   182  	updatedCfg, err := m.adjustUnsuccessfulStreamAttempt(peerID)
   183  	if err != nil {
   184  		// TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot
   185  		//       guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream.
   186  		//       We have to revisit this once we studied the error handling paths in the unicast manager.
   187  		m.logger.Fatal().
   188  			Err(err).
   189  			Bool(logging.KeyNetworkingSecurity, true).
   190  			Str("peer_id", p2plogging.PeerId(peerID)).
   191  			Msg("failed to adjust dial config for peer id")
   192  	}
   193  
   194  	m.logger.Warn().
   195  		Err(errs).
   196  		Bool(logging.KeySuspicious, true).
   197  		Str("peer_id", p2plogging.PeerId(peerID)).
   198  		Str("dial_config", fmt.Sprintf("%+v", updatedCfg)).
   199  		Msg("failed to create stream to peer id, dial config adjusted")
   200  
   201  	return nil, fmt.Errorf("could not create stream on any available unicast protocol: %w", errs)
   202  }
   203  
   204  // createStream attempts to establish a new stream with a peer using the specified protocol. It employs
   205  // exponential backoff with a maximum number of attempts defined by dialCfg.StreamCreationRetryAttemptBudget.
   206  // If the stream cannot be established after the maximum attempts, it returns a compiled multierror of all
   207  // encountered errors. Errors related to in-progress dials trigger a retry until a connection is established
   208  // or the attempt budget is exhausted.
   209  //
   210  // The function increments the Config's ConsecutiveSuccessfulStream count upon success. In the case of
   211  // adjustment errors in Config, a fatal error is logged indicating an issue that requires attention.
   212  // Metrics are collected to monitor the duration and number of attempts for stream creation.
   213  //
   214  // Arguments:
   215  // - ctx: Context to control the lifecycle of the stream creation.
   216  // - peerID: The ID of the peer with which the stream is to be established.
   217  // - protocol: The specific protocol used for the stream.
   218  // - dialCfg: Configuration parameters for dialing and stream creation, including retry logic.
   219  //
   220  // Returns:
   221  // - libp2pnet.Stream: The successfully created stream, or nil if the stream creation fails.
   222  // - error: An aggregated multierror of all encountered errors during stream creation, or nil if successful; any returned error is benign and can be retried.
   223  func (m *Manager) createStream(ctx context.Context, peerID peer.ID, protocol protocols.Protocol, dialCfg *Config) (libp2pnet.Stream, error) {
   224  	var err error
   225  	var s libp2pnet.Stream
   226  
   227  	s, err = m.createStreamWithRetry(ctx, peerID, protocol.ProtocolId(), dialCfg)
   228  	if err != nil {
   229  		return nil, fmt.Errorf("failed to create a stream to peer: %w", err)
   230  	}
   231  
   232  	s, err = protocol.UpgradeRawStream(s)
   233  	if err != nil {
   234  		return nil, fmt.Errorf("failed to upgrade raw stream: %w", err)
   235  	}
   236  
   237  	updatedConfig, err := m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) {
   238  		config.ConsecutiveSuccessfulStream++ // increase consecutive successful stream count.
   239  		return config, nil
   240  	})
   241  	if err != nil {
   242  		// This is not a connection retryable error, this is a fatal error.
   243  		// TODO: technically, we better to return an error here, but the error must be irrecoverable, and we cannot
   244  		//       guarantee a clear distinction between recoverable and irrecoverable errors at the moment with CreateStream.
   245  		//       We have to revisit this once we studied the error handling paths in the unicast manager.
   246  		m.logger.Fatal().
   247  			Err(err).
   248  			Bool(logging.KeyNetworkingSecurity, true).
   249  			Str("peer_id", p2plogging.PeerId(peerID)).
   250  			Msg("failed to adjust dial config for peer id")
   251  	}
   252  	m.logger.Debug().
   253  		Str("peer_id", p2plogging.PeerId(peerID)).
   254  		Str("updated_dial_config", fmt.Sprintf("%+v", updatedConfig)).
   255  		Msg("stream created successfully")
   256  	return s, nil
   257  }
   258  
   259  // createStreamWithRetry attempts to create a new stream to the specified peer using the given protocolID.
   260  // This function is streamlined for use-cases where retries are managed externally or
   261  // not required at all.
   262  //
   263  // Expected errors:
   264  //   - If the context expires before stream creation, it returns a context-related error with the number of attempts.
   265  //   - If the protocol ID is not supported, no retries are attempted and the error is returned immediately.
   266  //
   267  // Metrics are collected to monitor the duration and attempts of the stream creation process.
   268  //
   269  // Arguments:
   270  // - ctx: Context to control the lifecycle of the stream creation.
   271  // - peerID: The ID of the peer with which the stream is to be established.
   272  // - protocolID: The identifier for the protocol used for the stream.
   273  // - dialCfg: Configuration parameters for dialing, including the retry attempt budget.
   274  //
   275  // Returns:
   276  // - libp2pnet.Stream: The successfully created stream, or nil if an error occurs.
   277  // - error: An error encountered during the stream creation, or nil if the stream is successfully established.
   278  func (m *Manager) createStreamWithRetry(ctx context.Context, peerID peer.ID, protocolID protocol.ID, dialCfg *Config) (libp2pnet.Stream, error) {
   279  	// aggregated retryable errors that occur during retries, errs will be returned
   280  	// if retry context times out or maxAttempts have been made before a successful retry occurs
   281  	var errs error
   282  	var s libp2pnet.Stream
   283  	attempts := 0
   284  	f := func(context.Context) error {
   285  		attempts++
   286  		select {
   287  		case <-ctx.Done():
   288  			return fmt.Errorf("context done before stream could be created (retry attempt: %d, errors: %w)", attempts, errs)
   289  		default:
   290  		}
   291  
   292  		var err error
   293  		// creates stream using stream factory
   294  		s, err = m.streamFactory.NewStream(ctx, peerID, protocolID)
   295  		if err != nil {
   296  			// if the stream creation failed due to invalid protocol id or no address, skip the re-attempt
   297  			if stream.IsErrProtocolNotSupported(err) ||
   298  				errors.Is(err, swarm.ErrNoAddresses) ||
   299  				stream.IsErrSecurityProtocolNegotiationFailed(err) ||
   300  				stream.IsErrGaterDisallowedConnection(err) {
   301  				return err
   302  			}
   303  			return retry.RetryableError(multierror.Append(errs, err))
   304  		}
   305  		return nil
   306  	}
   307  
   308  	start := time.Now()
   309  	err := retry.Do(ctx, retryBackoff(dialCfg.StreamCreationRetryAttemptBudget, m.createStreamBackoffDelay), f)
   310  	duration := time.Since(start)
   311  	if err != nil {
   312  		m.metrics.OnEstablishStreamFailure(duration, attempts)
   313  		return nil, retryFailedError(uint64(attempts), dialCfg.StreamCreationRetryAttemptBudget, fmt.Errorf("failed to create a stream to peer: %w", err))
   314  	}
   315  	m.metrics.OnStreamEstablished(duration, attempts)
   316  	return s, nil
   317  }
   318  
   319  // retryBackoff creates and returns a retry exponential backoff with the given maximum number of retries.
   320  // Note that the retryBackoff by default makes one attempt. Hence, that total number of attempts are 1 + maxRetries.
   321  // Args:
   322  // - maxRetries: maximum number of retries (in addition to the first backoff).
   323  // - retryInterval: initial retry interval for exponential backoff.
   324  // Returns:
   325  // - a retry backoff object that makes maximum of maxRetries + 1 attempts.
   326  func retryBackoff(maxRetries uint64, retryInterval time.Duration) retry.Backoff {
   327  	// create backoff
   328  	backoff := retry.NewConstant(retryInterval)
   329  	// add a MaxRetryJitter*time.Millisecond jitter to our backoff to ensure that this node and the target node don't attempt to reconnect at the same time
   330  	backoff = retry.WithJitter(MaxRetryJitter*time.Millisecond, backoff)
   331  
   332  	// https://github.com/sethvargo/go-retry#maxretries retries counter starts at zero and library will make last attempt
   333  	// when retries == maxRetries. Hence, the total number of invocations is maxRetires + 1
   334  	backoff = retry.WithMaxRetries(maxRetries, backoff)
   335  	return backoff
   336  }
   337  
   338  // retryFailedError wraps the given error in a ErrMaxRetries if maxAttempts were made.
   339  func retryFailedError(dialAttempts, maxAttempts uint64, err error) error {
   340  	if dialAttempts == maxAttempts {
   341  		return NewMaxRetriesErr(dialAttempts, err)
   342  	}
   343  	return err
   344  }
   345  
   346  // getDialConfig gets the dial config for the given peer id.
   347  // It also adjusts the dial config if necessary based on the current dial config, i.e., it resets the dial backoff budget to the default value if the last successful dial was long enough ago,
   348  // and it resets the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold.
   349  // Args:
   350  //   - peerID: peer id of the remote peer.
   351  //
   352  // Returns:
   353  //   - dial config for the given peer id.
   354  //   - error if the dial config cannot be retrieved or adjusted; any error is irrecoverable and indicates a fatal error.
   355  func (m *Manager) getDialConfig(peerID peer.ID) (*Config, error) {
   356  	dialCfg, err := m.dialConfigCache.GetWithInit(peerID)
   357  	if err != nil {
   358  		return nil, fmt.Errorf("failed to get or init dial config for peer id: %w", err)
   359  	}
   360  
   361  	if dialCfg.StreamCreationRetryAttemptBudget == uint64(0) && dialCfg.ConsecutiveSuccessfulStream >= m.streamZeroBackoffResetThreshold {
   362  		// reset the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold,
   363  		// as the stream creation is reliable enough to be trusted again.
   364  		dialCfg, err = m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) {
   365  			config.StreamCreationRetryAttemptBudget = m.maxStreamCreationAttemptTimes
   366  			m.metrics.OnStreamCreationRetryBudgetUpdated(config.StreamCreationRetryAttemptBudget)
   367  			m.metrics.OnStreamCreationRetryBudgetResetToDefault()
   368  			return config, nil
   369  		})
   370  		if err != nil {
   371  			return nil, fmt.Errorf("failed to adjust dial config for peer id (resetting stream creation attempt budget): %w", err)
   372  		}
   373  	}
   374  	return dialCfg, nil
   375  }
   376  
   377  // adjustUnsuccessfulStreamAttempt adjusts the dial config for the given peer id if the stream creation fails.
   378  // It resets the stream creation backoff budget to the default value if the number of consecutive successful streams reaches the threshold,
   379  // and it resets the dial backoff budget to the default value if there is no connection to the peer.
   380  // Args:
   381  //   - peerID: peer id of the remote peer.
   382  //
   383  // Returns:
   384  // - dial config for the given peer id.
   385  // - connected indicates whether there is a connection to the peer.
   386  // - error if the dial config cannot be adjusted; any error is irrecoverable and indicates a fatal error.
   387  func (m *Manager) adjustUnsuccessfulStreamAttempt(peerID peer.ID) (*Config, error) {
   388  	updatedCfg, err := m.dialConfigCache.AdjustWithInit(peerID, func(config Config) (Config, error) {
   389  		// consecutive successful stream count is reset to 0 if we fail to create a stream or connection to the peer.
   390  		config.ConsecutiveSuccessfulStream = 0
   391  
   392  		// there is a connection to the peer it means that the stream creation failed, hence we decrease the stream backoff budget
   393  		// to try to create a stream with a more strict dial config next time.
   394  		if config.StreamCreationRetryAttemptBudget > 0 {
   395  			config.StreamCreationRetryAttemptBudget--
   396  			m.metrics.OnStreamCreationRetryBudgetUpdated(config.StreamCreationRetryAttemptBudget)
   397  		}
   398  
   399  		return config, nil
   400  	})
   401  
   402  	if err != nil {
   403  		return nil, fmt.Errorf("failed to adjust dial config for peer id: %w", err)
   404  	}
   405  
   406  	return updatedCfg, nil
   407  }