github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/dkg/broker.go (about)

     1  package dkg
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/onflow/crypto"
    10  	"github.com/rs/zerolog"
    11  	"github.com/sethvargo/go-retry"
    12  
    13  	"github.com/onflow/flow-go/engine"
    14  	"github.com/onflow/flow-go/model/fingerprint"
    15  	"github.com/onflow/flow-go/model/flow"
    16  	"github.com/onflow/flow-go/model/messages"
    17  	"github.com/onflow/flow-go/module"
    18  	"github.com/onflow/flow-go/module/retrymiddleware"
    19  )
    20  
// BrokerOpt is a functional option which modifies the DKG Broker config.
// NewBroker applies each given option, in order, to the config returned by
// DefaultBrokerConfig.
type BrokerOpt func(*BrokerConfig)
    23  
// BrokerConfig is configuration for the DKG Broker component. It controls the
// retry behaviour of all requests the broker makes through its DKG contract
// clients.
type BrokerConfig struct {
	// PublishMaxRetries is the maximum number of times the broker will attempt
	// to broadcast a message or publish a result.
	PublishMaxRetries uint64
	// ReadMaxRetries is the max number of times the broker will attempt to
	// read messages before giving up.
	ReadMaxRetries uint64
	// RetryMaxConsecutiveFailures is the number of consecutive failures allowed
	// before we switch to a different Access client for subsequent attempts.
	RetryMaxConsecutiveFailures int
	// RetryInitialWait is the initial duration to wait between retries for all
	// retryable requests - increases exponentially for subsequent retries.
	RetryInitialWait time.Duration
	// RetryJitterPct is the percentage jitter to introduce to each retry interval
	// for all retryable requests.
	RetryJitterPct uint64
}
    42  
    43  // DefaultBrokerConfig returns the default config for the DKG Broker component.
    44  func DefaultBrokerConfig() BrokerConfig {
    45  	return BrokerConfig{
    46  		PublishMaxRetries:           10,
    47  		ReadMaxRetries:              3,
    48  		RetryMaxConsecutiveFailures: 2,
    49  		RetryInitialWait:            time.Second,
    50  		RetryJitterPct:              25,
    51  	}
    52  }
    53  
// Broker is an implementation of the DKGBroker interface which is intended to
// be used in conjunction with the DKG MessagingEngine for private messages, and
// with the DKG smart-contract for broadcast messages.
//
// Private messages travel through the tunnel; broadcast messages and the final
// result go through one of the dkgContractClients, with retries and fallback
// between clients.
type Broker struct {
	config                    BrokerConfig                      // retry configuration for contract client requests
	log                       zerolog.Logger                    // logger tagged with the component name and dkg instance ID
	unit                      *engine.Unit                      // launches broadcast routines and provides the context used by retries
	dkgInstanceID             string                            // unique identifier of the current dkg run (prevent replay attacks)
	committee                 flow.IdentitySkeletonList         // identities of DKG members
	me                        module.Local                      // used for signing broadcast messages
	myIndex                   int                               // index of this instance in the committee
	dkgContractClients        []module.DKGContractClient        // array of clients to communicate with the DKG smart contract in priority order for fallbacks during retries
	lastSuccessfulClientIndex int                               // index of the contract client that was last successful during retries
	tunnel                    *BrokerTunnel                     // channels through which the broker communicates with the network engine
	privateMsgCh              chan messages.PrivDKGMessageIn    // channel to forward incoming private messages to consumers
	broadcastMsgCh            chan messages.BroadcastDKGMessage // channel to forward incoming broadcast messages to consumers
	messageOffset             uint                              // offset for next broadcast messages to fetch
	shutdownCh                chan struct{}                     // channel to stop the broker from listening; closed by Shutdown

	broadcasts uint // broadcasts counts the number of attempted broadcasts

	clientLock    sync.Mutex // lock around updates to current client
	broadcastLock sync.Mutex // lock around outbound broadcasts
	pollLock      sync.Mutex // lock around polls to read inbound broadcasts
}
    79  
// Compile-time check that *Broker satisfies the module.DKGBroker interface.
var _ module.DKGBroker = (*Broker)(nil)
    81  
    82  // NewBroker instantiates a new epoch-specific broker capable of communicating
    83  // with other nodes via a network engine and dkg smart-contract.
    84  func NewBroker(
    85  	log zerolog.Logger,
    86  	dkgInstanceID string,
    87  	committee flow.IdentitySkeletonList,
    88  	me module.Local,
    89  	myIndex int,
    90  	dkgContractClients []module.DKGContractClient,
    91  	tunnel *BrokerTunnel,
    92  	opts ...BrokerOpt,
    93  ) *Broker {
    94  
    95  	config := DefaultBrokerConfig()
    96  	for _, apply := range opts {
    97  		apply(&config)
    98  	}
    99  
   100  	b := &Broker{
   101  		config:             config,
   102  		log:                log.With().Str("component", "dkg_broker").Str("dkg_instance_id", dkgInstanceID).Logger(),
   103  		unit:               engine.NewUnit(),
   104  		dkgInstanceID:      dkgInstanceID,
   105  		committee:          committee,
   106  		me:                 me,
   107  		myIndex:            myIndex,
   108  		dkgContractClients: dkgContractClients,
   109  		tunnel:             tunnel,
   110  		privateMsgCh:       make(chan messages.PrivDKGMessageIn),
   111  		broadcastMsgCh:     make(chan messages.BroadcastDKGMessage),
   112  		shutdownCh:         make(chan struct{}),
   113  	}
   114  
   115  	go b.listen()
   116  
   117  	return b
   118  }
   119  
   120  /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   121  Implement DKGBroker
   122  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
   123  
// GetIndex returns the index of this node in the committee list, as supplied
// to NewBroker.
func (b *Broker) GetIndex() int {
	return b.myIndex
}
   128  
   129  // PrivateSend sends a DKGMessage to a destination over a private channel. It
   130  // appends the current DKG instance ID to the message.
   131  func (b *Broker) PrivateSend(dest int, data []byte) {
   132  	if dest >= len(b.committee) || dest < 0 {
   133  		b.log.Error().Msgf("destination id out of range: %d", dest)
   134  		return
   135  	}
   136  	dkgMessageOut := messages.PrivDKGMessageOut{
   137  		DKGMessage: messages.NewDKGMessage(data, b.dkgInstanceID),
   138  		DestID:     b.committee[dest].NodeID,
   139  	}
   140  	b.tunnel.SendOut(dkgMessageOut)
   141  }
   142  
   143  // Broadcast signs and broadcasts a message to all participants.
   144  func (b *Broker) Broadcast(data []byte) {
   145  	b.unit.Launch(func() {
   146  
   147  		// NOTE: We're counting the number of times the underlying DKG requested
   148  		// a broadcast so that we can detect an unhappy path (any time there is
   149  		// more than 1 broadcast message per DKG) Thus incrementing broadcasts
   150  		// before we perform the broadcasts is okay.
   151  		b.broadcastLock.Lock()
   152  		if b.broadcasts > 0 {
   153  			// The warn-level log is used by the integration tests to check if this
   154  			// func is called more than once within one epoch (unhappy path).
   155  			b.log.Warn().Msgf("preparing to send DKG broadcast number %d with header %d", b.broadcasts+1, data[0])
   156  		} else {
   157  			b.log.Info().Msgf("preparing to send DKG message broadcast with header %d", data[0])
   158  		}
   159  		b.broadcasts++
   160  		log := b.log.With().Uint("broadcast_number", b.broadcasts).Logger()
   161  		b.broadcastLock.Unlock()
   162  
   163  		bcastMsg, err := b.prepareBroadcastMessage(data)
   164  		if err != nil {
   165  			log.Fatal().Err(err).Msg("failed to create broadcast message")
   166  		}
   167  
   168  		backoff := retry.NewExponential(b.config.RetryInitialWait)
   169  		backoff = retry.WithMaxRetries(b.config.PublishMaxRetries, backoff)
   170  		backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)
   171  
   172  		clientIndex, dkgContractClient := b.getInitialContractClient()
   173  		onMaxConsecutiveRetries := func(totalAttempts int) {
   174  			clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
   175  			log.Warn().Msgf("broadcast: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
   176  		}
   177  		backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)
   178  
   179  		b.broadcastLock.Lock()
   180  		attempts := 1
   181  		err = retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
   182  			err := dkgContractClient.Broadcast(bcastMsg)
   183  			if err != nil {
   184  				log.Error().Err(err).Msgf("error broadcasting, retrying (attempt %d)", attempts)
   185  				attempts++
   186  				return retry.RetryableError(err)
   187  			}
   188  
   189  			// update our last successful client index for future calls
   190  			b.updateLastSuccessfulClient(clientIndex)
   191  			return nil
   192  		})
   193  		b.broadcastLock.Unlock()
   194  
   195  		// Various network conditions can result in errors while broadcasting DKG messages.
   196  		// Because the overall DKG is resilient to individual message failures,
   197  		// it is acceptable to log the error and move on.
   198  		if err != nil {
   199  			log.Error().Err(err).Msgf("failed to broadcast message after %d attempts", attempts)
   200  			return
   201  		}
   202  		log.Info().Msgf("dkg broadcast successfully on attempt %d", attempts)
   203  	})
   204  }
   205  
   206  // SubmitResult publishes the result of the DKG protocol to the smart contract.
   207  func (b *Broker) SubmitResult(groupKey crypto.PublicKey, pubKeys []crypto.PublicKey) error {
   208  
   209  	// If the DKG failed locally, we will get a nil key vector here. We need to convert
   210  	// the nil slice to a slice of nil keys before submission.
   211  	//
   212  	// In general, if pubKeys does not have one key per participant, we cannot submit
   213  	// a valid result - therefore we submit a nil vector (indicating that we have
   214  	// completed the process, but we know that we don't have a valid result).
   215  	if len(pubKeys) != len(b.committee) {
   216  		b.log.Warn().Msgf("submitting dkg result with incomplete key vector (len=%d, expected=%d)", len(pubKeys), len(b.committee))
   217  		// create a key vector with one nil entry for each committee member
   218  		pubKeys = make([]crypto.PublicKey, len(b.committee))
   219  	}
   220  
   221  	backoff := retry.NewExponential(b.config.RetryInitialWait)
   222  	backoff = retry.WithMaxRetries(b.config.PublishMaxRetries, backoff)
   223  	backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)
   224  
   225  	clientIndex, dkgContractClient := b.getInitialContractClient()
   226  	onMaxConsecutiveRetries := func(totalAttempts int) {
   227  		clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
   228  		b.log.Warn().Msgf("submit result: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
   229  	}
   230  	backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)
   231  
   232  	attempts := 1
   233  	err := retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
   234  		err := dkgContractClient.SubmitResult(groupKey, pubKeys)
   235  		if err != nil {
   236  			b.log.Error().Err(err).Msgf("error submitting DKG result, retrying (attempt %d)", attempts)
   237  			attempts++
   238  			return retry.RetryableError(err)
   239  		}
   240  
   241  		// update our last successful client index for future calls
   242  		b.updateLastSuccessfulClient(clientIndex)
   243  		return nil
   244  	})
   245  	if err != nil {
   246  		return fmt.Errorf("failed to submit dkg result after %d attempts: %w", attempts, err)
   247  	}
   248  
   249  	b.log.Info().Msgf("dkg result submitted successfully on attempt %d", attempts)
   250  	return nil
   251  }
   252  
   253  // Disqualify flags that a node is misbehaving and got disqualified
   254  func (b *Broker) Disqualify(node int, log string) {
   255  	var nodeID flow.Identifier
   256  	if node < len(b.committee) {
   257  		nodeID = b.committee[node].NodeID
   258  	}
   259  
   260  	// The warn-level log is used by the integration tests to check if this method is called.
   261  	b.log.Warn().Msgf("participant %d (this node) is disqualifying participant (index=%d, node_id=%s) because: %s",
   262  		b.myIndex, node, nodeID, log)
   263  }
   264  
   265  // FlagMisbehavior warns that a node is misbehaving.
   266  func (b *Broker) FlagMisbehavior(node int, log string) {
   267  	var nodeID flow.Identifier
   268  	if node < len(b.committee) {
   269  		nodeID = b.committee[node].NodeID
   270  	}
   271  
   272  	// The warn-level log is used by the integration tests to check if this method is called.
   273  	b.log.Warn().Msgf("participant %d (this node) is flagging participant (index=%d, node_id=%s) because: %s",
   274  		b.myIndex, node, nodeID, log)
   275  }
   276  
// GetPrivateMsgCh returns the channel through which consumers can receive
// incoming private DKG messages. The channel is unbuffered and is returned
// receive-only.
func (b *Broker) GetPrivateMsgCh() <-chan messages.PrivDKGMessageIn {
	return b.privateMsgCh
}
   282  
// GetBroadcastMsgCh returns the channel through which consumers can receive
// incoming broadcast DKG messages. The channel is unbuffered and is returned
// receive-only; Poll sends on it.
func (b *Broker) GetBroadcastMsgCh() <-chan messages.BroadcastDKGMessage {
	return b.broadcastMsgCh
}
   288  
   289  // Poll calls the DKG smart contract to get missing DKG messages for the current
   290  // epoch, and forwards them to the msgCh. It should be called with the ID of a
   291  // block whose seal is finalized. The function doesn't return until the received
   292  // messages are processed by the consumer because b.msgCh is not buffered.
   293  func (b *Broker) Poll(referenceBlock flow.Identifier) error {
   294  	// We only issue one poll at a time to avoid delivering duplicate broadcast messages.
   295  	// The messageOffset determines which messages are retrieved by a Poll,
   296  	// and is not updated until the end of this function.
   297  	b.pollLock.Lock()
   298  	defer b.pollLock.Unlock()
   299  
   300  	backoff := retry.NewExponential(b.config.RetryInitialWait)
   301  	backoff = retry.WithMaxRetries(b.config.ReadMaxRetries, backoff)
   302  	backoff = retry.WithJitterPercent(b.config.RetryJitterPct, backoff)
   303  
   304  	clientIndex, dkgContractClient := b.getInitialContractClient()
   305  	onMaxConsecutiveRetries := func(totalAttempts int) {
   306  		clientIndex, dkgContractClient = b.updateContractClient(clientIndex)
   307  		b.log.Warn().Msgf("poll: retrying on attempt (%d) with fallback access node at index (%d)", totalAttempts, clientIndex)
   308  	}
   309  	backoff = retrymiddleware.AfterConsecutiveFailures(b.config.RetryMaxConsecutiveFailures, backoff, onMaxConsecutiveRetries)
   310  
   311  	var msgs []messages.BroadcastDKGMessage
   312  	var err error
   313  	attempt := 1
   314  	err = retry.Do(b.unit.Ctx(), backoff, func(ctx context.Context) error {
   315  		msgs, err = dkgContractClient.ReadBroadcast(b.messageOffset, referenceBlock)
   316  		if err != nil {
   317  			err = fmt.Errorf("could not read broadcast messages (attempt: %d, offset: %d, ref: %v): %w", attempt, b.messageOffset, referenceBlock, err)
   318  			attempt++
   319  			return retry.RetryableError(err)
   320  		}
   321  
   322  		// update our last successful client index for future calls
   323  		b.updateLastSuccessfulClient(clientIndex)
   324  		return nil
   325  	})
   326  	// Various network conditions can result in errors while reading DKG messages
   327  	// We will read any missed messages during the next poll because messageOffset is not increased
   328  	if err != nil {
   329  		b.log.Error().Err(err).Msgf("failed to read messages after %d attempts", attempt)
   330  		return nil
   331  	}
   332  
   333  	for _, msg := range msgs {
   334  		// set the CommitteeMemberIndex field for the message
   335  		memberIndex, ok := b.committee.GetIndex(msg.NodeID)
   336  		if !ok {
   337  			b.log.Error().Msgf("broadcast message from node with id (%v) does not match the ID of any committee member", msg.NodeID)
   338  			continue
   339  		}
   340  		msg.CommitteeMemberIndex = uint64(memberIndex)
   341  
   342  		ok, err := b.verifyBroadcastMessage(msg)
   343  		if err != nil {
   344  			b.log.Error().Err(err).Msg("unable to verify broadcast message")
   345  			continue
   346  		}
   347  		if !ok {
   348  			b.log.Error().Err(err).Msg("invalid signature on broadcast dkg message")
   349  			continue
   350  		}
   351  		b.log.Debug().Msgf("forwarding broadcast message to controller")
   352  		b.broadcastMsgCh <- msg
   353  	}
   354  
   355  	// update message offset to use for future polls, this avoids forwarding the
   356  	// same message more than once
   357  	b.messageOffset += uint(len(msgs))
   358  	return nil
   359  }
   360  
// Shutdown stops the goroutine that listens to incoming private messages, by
// closing shutdownCh. It must be called at most once: closing an
// already-closed channel panics.
func (b *Broker) Shutdown() {
	close(b.shutdownCh)
}
   365  
   366  /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
   367  
   368  // updateContractClient will return the last successful client index by default for all initial operations or else
   369  // it will return the appropriate client index with respect to last successful and number of client.
   370  func (b *Broker) updateContractClient(clientIndex int) (int, module.DKGContractClient) {
   371  	b.clientLock.Lock()
   372  	defer b.clientLock.Unlock()
   373  	if clientIndex == b.lastSuccessfulClientIndex {
   374  		if clientIndex == len(b.dkgContractClients)-1 {
   375  			clientIndex = 0
   376  		} else {
   377  			clientIndex++
   378  		}
   379  	} else {
   380  		clientIndex = b.lastSuccessfulClientIndex
   381  	}
   382  
   383  	return clientIndex, b.dkgContractClients[clientIndex]
   384  }
   385  
   386  // getInitialContractClient will return the last successful contract client or the initial
   387  func (b *Broker) getInitialContractClient() (int, module.DKGContractClient) {
   388  	b.clientLock.Lock()
   389  	defer b.clientLock.Unlock()
   390  	return b.lastSuccessfulClientIndex, b.dkgContractClients[b.lastSuccessfulClientIndex]
   391  }
   392  
// updateLastSuccessfulClient sets lastSuccessfulClientIndex to clientIndex in a
// concurrency-safe way (the write is made while holding clientLock). It is
// called after a contract client request succeeds, so that later requests start
// with the same client.
func (b *Broker) updateLastSuccessfulClient(clientIndex int) {
	b.clientLock.Lock()
	defer b.clientLock.Unlock()

	b.lastSuccessfulClientIndex = clientIndex
}
   400  
   401  // listen is a blocking call that processes incoming messages from the network
   402  // engine.
   403  func (b *Broker) listen() {
   404  	for {
   405  		select {
   406  		case msg := <-b.tunnel.MsgChIn:
   407  			b.onPrivateMessage(msg.OriginID, msg.DKGMessage)
   408  		case <-b.shutdownCh:
   409  			return
   410  		}
   411  	}
   412  }
   413  
   414  // onPrivateMessage verifies the integrity of an incoming message, sets the CommitteeMemberIndex and forwards
   415  // it to consumers via the msgCh.
   416  func (b *Broker) onPrivateMessage(originID flow.Identifier, msg messages.DKGMessage) {
   417  	memberIndex, ok := b.committee.GetIndex(originID)
   418  	if !ok {
   419  		b.log.Error().Msgf("bad message: OriginID (%v) does not match the NodeID of any committee member", originID)
   420  		return
   421  	}
   422  
   423  	err := b.hasValidDKGInstanceID(msg)
   424  	if err != nil {
   425  		b.log.Err(err).Msg("bad message")
   426  		return
   427  	}
   428  
   429  	b.privateMsgCh <- messages.PrivDKGMessageIn{DKGMessage: msg, OriginID: originID, CommitteeMemberIndex: uint64(memberIndex)}
   430  }
   431  
   432  // hasValidDKGInstanceID returns an error if msg.DKGInstanceID does not match Broker.DKGInstanceID
   433  func (b *Broker) hasValidDKGInstanceID(msg messages.DKGMessage) error {
   434  	// check that the message corresponds to the current epoch
   435  	if b.dkgInstanceID != msg.DKGInstanceID {
   436  		return fmt.Errorf("wrong DKG instance. Got %s, want %s", msg.DKGInstanceID, b.dkgInstanceID)
   437  	}
   438  
   439  	return nil
   440  }
   441  
   442  // prepareBroadcastMessage creates BroadcastDKGMessage with a signature from the
   443  // node's staking key.
   444  func (b *Broker) prepareBroadcastMessage(data []byte) (messages.BroadcastDKGMessage, error) {
   445  	dkgMessage := messages.NewDKGMessage(
   446  		data,
   447  		b.dkgInstanceID,
   448  	)
   449  	sigData := fingerprint.Fingerprint(dkgMessage)
   450  	signature, err := b.me.Sign(sigData[:], NewDKGMessageHasher())
   451  	if err != nil {
   452  		return messages.BroadcastDKGMessage{}, err
   453  	}
   454  	bcastMsg := messages.BroadcastDKGMessage{
   455  		DKGMessage: dkgMessage,
   456  		Signature:  signature,
   457  	}
   458  	return bcastMsg, nil
   459  }
   460  
   461  // verifyBroadcastMessage checks the DKG instance of a broadcast
   462  // message, as well as the signature against the staking key of the sender.
   463  // Returns:
   464  //   - true, nil if the message contents are valid and have a valid signature
   465  //   - false, nil if the message contents are valid but have an invalid signature
   466  //   - false, err if the message contents are invalid, or could not be checked,
   467  //     or the signature could not be checked
   468  //
   469  // TODO differentiate errors
   470  func (b *Broker) verifyBroadcastMessage(bcastMsg messages.BroadcastDKGMessage) (bool, error) {
   471  	err := b.hasValidDKGInstanceID(bcastMsg.DKGMessage)
   472  	if err != nil {
   473  		return false, fmt.Errorf("invalid dkg instance: %w", err)
   474  	}
   475  	origin := b.committee[bcastMsg.CommitteeMemberIndex]
   476  	signData := fingerprint.Fingerprint(bcastMsg.DKGMessage)
   477  	return origin.StakingPubKey.Verify(
   478  		bcastMsg.Signature,
   479  		signData[:],
   480  		NewDKGMessageHasher(),
   481  	)
   482  }