github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/consensus/dkg/messaging_engine.go (about)

     1  package dkg
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  	"github.com/sethvargo/go-retry"
    11  
    12  	"github.com/onflow/flow-go/engine"
    13  	"github.com/onflow/flow-go/engine/common/fifoqueue"
    14  	"github.com/onflow/flow-go/model/flow"
    15  	msg "github.com/onflow/flow-go/model/messages"
    16  	"github.com/onflow/flow-go/module"
    17  	"github.com/onflow/flow-go/module/component"
    18  	"github.com/onflow/flow-go/module/dkg"
    19  	"github.com/onflow/flow-go/module/irrecoverable"
    20  	"github.com/onflow/flow-go/module/metrics"
    21  	"github.com/onflow/flow-go/network"
    22  	"github.com/onflow/flow-go/network/channels"
    23  	"github.com/onflow/flow-go/utils/logging"
    24  )
    25  
    26  // MessagingEngineConfig configures outbound message submission.
    27  type MessagingEngineConfig struct {
    28  	// RetryMax is the maximum number of times the engine will attempt to send
    29  	// an outbound message before permanently giving up.
    30  	RetryMax uint64
    31  	// RetryBaseWait is the duration to wait between the two first send attempts.
    32  	RetryBaseWait time.Duration
    33  	// RetryJitterPercent is the percent jitter to add to each inter-retry wait.
    34  	RetryJitterPercent uint64
    35  }
    36  
    37  // DefaultMessagingEngineConfig returns the config defaults. With 9 attempts and
    38  // exponential backoff, this will retry for about 8m before giving up.
    39  func DefaultMessagingEngineConfig() MessagingEngineConfig {
    40  	return MessagingEngineConfig{
    41  		RetryMax:           9,
    42  		RetryBaseWait:      time.Second,
    43  		RetryJitterPercent: 25,
    44  	}
    45  }
    46  
    47  // MessagingEngine is an engine which sends and receives all DKG private messages.
    48  // The same engine instance is used for the lifetime of a node and will be used
    49  // for different DKG instances. The ReactorEngine is responsible for the lifecycle
    50  // of components which are scoped one DKG instance, for example the DKGController.
    51  // The dkg.BrokerTunnel handles routing messages to/from the current DKG instance.
    52  type MessagingEngine struct {
    53  	log     zerolog.Logger
    54  	me      module.Local          // local object to identify the node
    55  	conduit network.Conduit       // network conduit for sending and receiving private messages
    56  	tunnel  *dkg.BrokerTunnel     // tunnel for relaying private messages to and from controllers
    57  	config  MessagingEngineConfig // config for outbound message transmission
    58  
    59  	messageHandler *engine.MessageHandler // encapsulates enqueueing messages from network
    60  	notifier       engine.Notifier        // notifies inbound messages available for forwarding
    61  	inbound        *fifoqueue.FifoQueue   // messages from the network, to be processed by DKG Controller
    62  
    63  	component.Component
    64  	cm *component.ComponentManager
    65  }
    66  
    67  var _ network.MessageProcessor = (*MessagingEngine)(nil)
    68  var _ component.Component = (*MessagingEngine)(nil)
    69  
    70  // NewMessagingEngine returns a new MessagingEngine.
    71  func NewMessagingEngine(
    72  	log zerolog.Logger,
    73  	net network.EngineRegistry,
    74  	me module.Local,
    75  	tunnel *dkg.BrokerTunnel,
    76  	collector module.MempoolMetrics,
    77  	config MessagingEngineConfig,
    78  ) (*MessagingEngine, error) {
    79  	log = log.With().Str("engine", "dkg_messaging").Logger()
    80  
    81  	inbound, err := fifoqueue.NewFifoQueue(
    82  		1000,
    83  		fifoqueue.WithLengthMetricObserver(metrics.ResourceDKGMessage, collector.MempoolEntries))
    84  	if err != nil {
    85  		return nil, fmt.Errorf("could not create inbound fifoqueue: %w", err)
    86  	}
    87  
    88  	notifier := engine.NewNotifier()
    89  	messageHandler := engine.NewMessageHandler(log, notifier, engine.Pattern{
    90  		Match: engine.MatchType[*msg.DKGMessage],
    91  		Store: &engine.FifoMessageStore{FifoQueue: inbound},
    92  	})
    93  
    94  	eng := MessagingEngine{
    95  		log:            log,
    96  		me:             me,
    97  		tunnel:         tunnel,
    98  		messageHandler: messageHandler,
    99  		notifier:       notifier,
   100  		inbound:        inbound,
   101  		config:         config,
   102  	}
   103  
   104  	conduit, err := net.Register(channels.DKGCommittee, &eng)
   105  	if err != nil {
   106  		return nil, fmt.Errorf("could not register dkg network engine: %w", err)
   107  	}
   108  	eng.conduit = conduit
   109  
   110  	eng.cm = component.NewComponentManagerBuilder().
   111  		AddWorker(eng.forwardInboundMessagesWorker).
   112  		AddWorker(eng.forwardOutboundMessagesWorker).
   113  		Build()
   114  	eng.Component = eng.cm
   115  
   116  	return &eng, nil
   117  }
   118  
   119  // Process processes messages from the networking layer.
   120  // No errors are expected during normal operation.
   121  func (e *MessagingEngine) Process(channel channels.Channel, originID flow.Identifier, message any) error {
   122  	err := e.messageHandler.Process(originID, message)
   123  	if err != nil {
   124  		if errors.Is(err, engine.IncompatibleInputTypeError) {
   125  			e.log.Warn().Bool(logging.KeySuspicious, true).Msgf("%v delivered unsupported message %T through %v", originID, message, channel)
   126  			return nil
   127  		}
   128  		return fmt.Errorf("unexpected failure to process inbound dkg message: %w", err)
   129  	}
   130  	return nil
   131  }
   132  
   133  // forwardInboundMessagesWorker reads queued inbound messages and forwards them
   134  // through the broker tunnel to the DKG Controller for processing.
   135  // This is a worker routine which runs for the lifetime of the engine.
   136  func (e *MessagingEngine) forwardInboundMessagesWorker(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   137  	ready()
   138  
   139  	done := ctx.Done()
   140  	wake := e.notifier.Channel()
   141  	for {
   142  		select {
   143  		case <-done:
   144  			return
   145  		case <-wake:
   146  			e.forwardInboundMessagesWhileAvailable(ctx)
   147  		}
   148  	}
   149  }
   150  
   151  // popNextInboundMessage pops one message from the queue and returns it as the
   152  // appropriate type expected by the DKG controller.
   153  func (e *MessagingEngine) popNextInboundMessage() (msg.PrivDKGMessageIn, bool) {
   154  	nextMessage, ok := e.inbound.Pop()
   155  	if !ok {
   156  		return msg.PrivDKGMessageIn{}, false
   157  	}
   158  	asEngineWrapper := nextMessage.(*engine.Message)
   159  	asDKGMsg := asEngineWrapper.Payload.(*msg.DKGMessage)
   160  	originID := asEngineWrapper.OriginID
   161  
   162  	message := msg.PrivDKGMessageIn{
   163  		DKGMessage: *asDKGMsg,
   164  		OriginID:   originID,
   165  	}
   166  	return message, true
   167  }
   168  
   169  // forwardInboundMessagesWhileAvailable retrieves all inbound messages from the queue and
   170  // sends to the DKG Controller over the broker tunnel. Exists when the queue is empty.
   171  func (e *MessagingEngine) forwardInboundMessagesWhileAvailable(ctx context.Context) {
   172  	for {
   173  		started := time.Now()
   174  		message, ok := e.popNextInboundMessage()
   175  		if !ok {
   176  			return
   177  		}
   178  
   179  		select {
   180  		case <-ctx.Done():
   181  			return
   182  		case e.tunnel.MsgChIn <- message:
   183  			e.log.Debug().Dur("waited", time.Since(started)).Msg("forwarded DKG message to Broker")
   184  			continue
   185  		}
   186  	}
   187  }
   188  
   189  // forwardOutboundMessagesWorker reads outbound DKG messages created by our DKG Controller
   190  // and sends them to the appropriate other DKG participant. Each outbound message is sent
   191  // async in an ad-hoc goroutine, which internally manages retry backoff for the message.
   192  // This is a worker routine which runs for the lifetime of the engine.
   193  func (e *MessagingEngine) forwardOutboundMessagesWorker(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   194  	ready()
   195  
   196  	done := ctx.Done()
   197  	for {
   198  		select {
   199  		case <-done:
   200  			return
   201  		case message := <-e.tunnel.MsgChOut:
   202  			go e.forwardOutboundMessage(ctx, message)
   203  		}
   204  	}
   205  }
   206  
   207  // forwardOutboundMessage transmits message to the target DKG participant.
   208  // Upon any error from the Unicast, we will retry with an exponential backoff.
   209  // After a limited number of attempts, we will log an error and exit.
   210  // The DKG protocol tolerates a number of failed private messages - these will
   211  // be resolved by broadcasting complaints in later phases.
   212  // Must be invoked as a goroutine.
   213  func (e *MessagingEngine) forwardOutboundMessage(ctx context.Context, message msg.PrivDKGMessageOut) {
   214  	backoff := retry.NewExponential(e.config.RetryBaseWait)
   215  	backoff = retry.WithMaxRetries(e.config.RetryMax, backoff)
   216  	backoff = retry.WithJitterPercent(e.config.RetryJitterPercent, backoff)
   217  
   218  	started := time.Now()
   219  	log := e.log.With().Str("target", message.DestID.String()).Logger()
   220  
   221  	attempts := 0
   222  	err := retry.Do(ctx, backoff, func(ctx context.Context) error {
   223  		attempts++
   224  		err := e.conduit.Unicast(&message.DKGMessage, message.DestID)
   225  		// TODO Unicast does not document expected errors, therefore we treat all errors as benign networking failures here
   226  		if err != nil {
   227  			log.Warn().
   228  				Err(err).
   229  				Int("attempt", attempts).
   230  				Dur("send_time", time.Since(started)).
   231  				Msgf("error sending dkg message on attempt %d - will retry...", attempts)
   232  		}
   233  
   234  		return retry.RetryableError(err)
   235  	})
   236  
   237  	// TODO Unicast does not document expected errors, therefore we treat all errors as benign networking failures here
   238  	if err != nil {
   239  		log.Error().
   240  			Err(err).
   241  			Int("total_attempts", attempts).
   242  			Dur("total_send_time", time.Since(started)).
   243  			Msgf("failed to send private dkg message after %d attempts - will not retry", attempts)
   244  	}
   245  }