github.com/koko1123/flow-go-1@v0.29.6/module/dkg/controller.go (about)

     1  package dkg
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"math/rand"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/rs/zerolog"
    11  
    12  	"github.com/onflow/flow-go/crypto"
    13  	"github.com/koko1123/flow-go-1/model/flow"
    14  	"github.com/koko1123/flow-go-1/module"
    15  )
    16  
const (

	// DefaultBaseStartDelay is the default base delay to use when introducing
	// random delay to the DKG start process. See preStartDelay for details.
	//
	// The actual delay is sampled uniformly from [0, b*n^2), where b is this
	// base delay and n is the DKG committee size. For a 150-node DKG the
	// default gives a maximum start delay of 500µs*150^2 = 11.25s.
	DefaultBaseStartDelay = 500 * time.Microsecond

	// DefaultBaseHandleFirstBroadcastDelay is the default base to use when
	// introducing random delay to processing the first DKG broadcast message.
	// See preHandleFirstBroadcastDelay for details.
	//
	// For a 150-node DKG, we observe a cost of ~2.5s per message to process
	// broadcast messages during phase 1, for a total of ~6m of total CPU time.
	// We would like to target spreading this cost over a 30 minute period.
	// With the default value for DefaultHandleSubsequentBroadcastDelay, this
	// results in processing all phase 1 messages in 6m+6m=12m, so for a maximum
	// total processing time of 30m, we sample the initial delay from [0,18m].
	// We use 50ms as the default because 50ms*150^2 = 18.75m
	DefaultBaseHandleFirstBroadcastDelay = 50 * time.Millisecond

	// DefaultHandleSubsequentBroadcastDelay is the default constant delay to
	// use before processing every phase 1 DKG broadcast after the first.
	DefaultHandleSubsequentBroadcastDelay = 2500 * time.Millisecond
)
    41  
// ControllerConfig defines configuration for the DKG Controller. These define
// how the DKG controller introduces delays to expensive DKG computations.
//
// We introduce delays for two reasons:
//  1. Avoid running long-running expensive DKG computations consecutively.
//  2. Avoid synchronizing expensive DKG computations across the DKG committee.
//
// Delays introduced prior to DKG start and prior to processing the FIRST
// broadcast message are sampled uniformly from [0,m), where m=b*n^2
//
//	b = base delay (from config)
//	n = size of DKG committee
//
// Delays introduced prior to processing subsequent broadcast messages are
// constant. Broadcast delays are only applied while the controller is in
// phase 1.
type ControllerConfig struct {
	// BaseStartDelay is the base delay b for the random delay applied before
	// starting the DKG; the maximum delay is b*n^2.
	BaseStartDelay time.Duration
	// BaseHandleFirstBroadcastDelay is the base delay b for the random delay
	// applied before handling the first broadcast message; the maximum delay
	// is b*n^2.
	BaseHandleFirstBroadcastDelay time.Duration
	// HandleSubsequentBroadcastDelay determines the constant delay before handling
	// all broadcast messages following the first.
	HandleSubsequentBroadcastDelay time.Duration
}
    66  
// Controller implements the DKGController interface. It controls the execution
// of a Joint Feldman DKG instance. A new Controller must be instantiated for
// every epoch.
type Controller struct {
	// The embedded state Manager is used to manage the controller's underlying
	// state.
	Manager

	// log is the controller's logger; NewController tags it with the
	// component name and the DKG instance ID.
	log zerolog.Logger

	// DKGState is the object that actually executes the protocol steps.
	dkg crypto.DKGState

	// dkgLock protects access to dkg
	dkgLock sync.Mutex

	// seed is required by DKGState; it is passed to dkg.Start.
	seed []byte

	// broker enables the controller to communicate with other nodes
	broker module.DKGBroker

	// Channels used internally to trigger state transitions. Each one is
	// closed (never sent on) by the corresponding transition method:
	// EndPhase1 closes h1Ch, EndPhase2 closes h2Ch, End closes endCh and
	// Shutdown closes shutdownCh.
	h1Ch       chan struct{}
	h2Ch       chan struct{}
	endCh      chan struct{}
	shutdownCh chan struct{}

	// private fields that hold the DKG artifacts when the protocol runs to
	// completion
	privateShare   crypto.PrivateKey
	publicKeys     []crypto.PublicKey
	groupPublicKey crypto.PublicKey

	// artifactsLock protects access to artifacts
	artifactsLock sync.Mutex

	// config holds the delay settings applied before starting the DKG and
	// before processing phase 1 broadcast messages.
	config ControllerConfig
	// once ensures that the randomized "first broadcast" delay is applied to
	// exactly one phase 1 broadcast message.
	once *sync.Once
}
   107  
   108  // NewController instantiates a new Joint Feldman DKG controller.
   109  func NewController(
   110  	log zerolog.Logger,
   111  	dkgInstanceID string,
   112  	dkg crypto.DKGState,
   113  	seed []byte,
   114  	broker module.DKGBroker,
   115  	config ControllerConfig,
   116  ) *Controller {
   117  
   118  	logger := log.With().
   119  		Str("component", "dkg_controller").
   120  		Str("dkg_instance_id", dkgInstanceID).
   121  		Logger()
   122  
   123  	return &Controller{
   124  		log:        logger,
   125  		dkg:        dkg,
   126  		seed:       seed,
   127  		broker:     broker,
   128  		h1Ch:       make(chan struct{}),
   129  		h2Ch:       make(chan struct{}),
   130  		endCh:      make(chan struct{}),
   131  		shutdownCh: make(chan struct{}),
   132  		once:       new(sync.Once),
   133  		config:     config,
   134  	}
   135  }
   136  
   137  /*******************************************************************************
   138  Implement DKGController
   139  *******************************************************************************/
   140  
   141  // Run starts the DKG controller and executes the DKG state-machine. It blocks
   142  // until the controller is shutdown or until an error is encountered in one of
   143  // the protocol phases.
   144  func (c *Controller) Run() error {
   145  
   146  	// Start DKG and transition to phase 1
   147  	err := c.start()
   148  	if err != nil {
   149  		return err
   150  	}
   151  
   152  	// Start a background routine to listen for incoming private and broadcast
   153  	// messages from other nodes
   154  	go c.doBackgroundWork()
   155  
   156  	// Execute DKG State Machine
   157  	for {
   158  		state := c.GetState()
   159  		c.log.Debug().Msgf("DKG: %s", c.state)
   160  
   161  		switch state {
   162  		case Phase1:
   163  			err := c.phase1()
   164  			if err != nil {
   165  				return err
   166  			}
   167  		case Phase2:
   168  			err := c.phase2()
   169  			if err != nil {
   170  				return err
   171  			}
   172  		case Phase3:
   173  			err := c.phase3()
   174  			if err != nil {
   175  				return err
   176  			}
   177  		case End:
   178  			c.Shutdown()
   179  		case Shutdown:
   180  			return nil
   181  		}
   182  	}
   183  }
   184  
   185  // EndPhase1 notifies the controller to end phase 1, and start phase 2
   186  func (c *Controller) EndPhase1() error {
   187  	state := c.GetState()
   188  	if state != Phase1 {
   189  		return NewInvalidStateTransitionError(state, Phase2)
   190  	}
   191  
   192  	c.SetState(Phase2)
   193  	close(c.h1Ch)
   194  
   195  	return nil
   196  }
   197  
   198  // EndPhase2 notifies the controller to end phase 2, and start phase 3
   199  func (c *Controller) EndPhase2() error {
   200  	state := c.GetState()
   201  	if state != Phase2 {
   202  		return NewInvalidStateTransitionError(state, Phase3)
   203  	}
   204  
   205  	c.SetState(Phase3)
   206  	close(c.h2Ch)
   207  
   208  	return nil
   209  }
   210  
   211  // End terminates the DKG state machine and records the artifacts.
   212  func (c *Controller) End() error {
   213  	state := c.GetState()
   214  	if state != Phase3 {
   215  		return NewInvalidStateTransitionError(state, End)
   216  	}
   217  
   218  	c.log.Debug().Msg("DKG engine end")
   219  
   220  	// end and retrieve products of the DKG protocol
   221  	c.dkgLock.Lock()
   222  
   223  	privateShare, groupPublicKey, publicKeys, err := c.dkg.End()
   224  	c.dkgLock.Unlock()
   225  	if err != nil {
   226  		return err
   227  	}
   228  
   229  	c.artifactsLock.Lock()
   230  	c.privateShare = privateShare
   231  	c.groupPublicKey = groupPublicKey
   232  	c.publicKeys = publicKeys
   233  	c.artifactsLock.Unlock()
   234  
   235  	c.SetState(End)
   236  	close(c.endCh)
   237  
   238  	return nil
   239  }
   240  
// Shutdown stops the controller regardless of the current state.
//
// It shuts down the broker, sets the state to Shutdown and closes shutdownCh,
// which releases the phase routines and the background worker. Run calls
// Shutdown itself once the End state is reached.
//
// NOTE: shutdownCh is closed unconditionally, and closing an already-closed
// channel panics, so Shutdown must be called at most once per controller.
func (c *Controller) Shutdown() {
	c.broker.Shutdown()
	c.SetState(Shutdown)
	close(c.shutdownCh)
}
   247  
   248  // Poll instructs the broker to read new broadcast messages, which will be
   249  // relayed through the message channel. The function does not return until the
   250  // received messages are processed.
   251  func (c *Controller) Poll(blockReference flow.Identifier) error {
   252  	return c.broker.Poll(blockReference)
   253  }
   254  
   255  // GetArtifacts returns our node's private key share, the group public key,
   256  // and the list of all nodes' public keys (including ours), as computed by
   257  // the DKG.
   258  func (c *Controller) GetArtifacts() (crypto.PrivateKey, crypto.PublicKey, []crypto.PublicKey) {
   259  	c.artifactsLock.Lock()
   260  	defer c.artifactsLock.Unlock()
   261  	return c.privateShare, c.groupPublicKey, c.publicKeys
   262  }
   263  
   264  // GetIndex returns the index of this node in the DKG committee list.
   265  func (c *Controller) GetIndex() int {
   266  	return c.broker.GetIndex()
   267  }
   268  
   269  // SubmitResult instructs the broker to submit DKG results. It is up to the
   270  // caller to ensure that this method is called after a succesfull run of the
   271  // protocol.
   272  func (c *Controller) SubmitResult() error {
   273  	_, pubKey, groupKeys := c.GetArtifacts()
   274  	return c.broker.SubmitResult(pubKey, groupKeys)
   275  }
   276  
   277  /*******************************************************************************
   278  WORKERS
   279  *******************************************************************************/
   280  
   281  func (c *Controller) doBackgroundWork() {
   282  	privateMsgCh := c.broker.GetPrivateMsgCh()
   283  	broadcastMsgCh := c.broker.GetBroadcastMsgCh()
   284  	for {
   285  		select {
   286  		case msg := <-privateMsgCh:
   287  			c.dkgLock.Lock()
   288  			err := c.dkg.HandlePrivateMsg(int(msg.CommitteeMemberIndex), msg.Data)
   289  			c.dkgLock.Unlock()
   290  			if err != nil {
   291  				c.log.Err(err).Msg("error processing DKG private message")
   292  			}
   293  
   294  		case msg := <-broadcastMsgCh:
   295  
   296  			// before processing a broadcast message during phase 1, sleep for a
   297  			// random delay to avoid synchronizing this expensive operation across
   298  			// all consensus nodes
   299  			state := c.GetState()
   300  			if state == Phase1 {
   301  
   302  				// introduce a large, uniformly sampled delay prior to processing
   303  				// the first message
   304  				isFirstMessage := false
   305  				c.once.Do(func() {
   306  					isFirstMessage = true
   307  					delay := c.preHandleFirstBroadcastDelay()
   308  					c.log.Info().Msgf("sleeping for %s before processing first phase 1 broadcast message", delay)
   309  					time.Sleep(delay)
   310  				})
   311  
   312  				if !isFirstMessage {
   313  					// introduce a constant delay for all subsequent messages
   314  					c.log.Debug().Msgf("sleeping for %s before processing subsequent phase 1 broadcast message", c.config.HandleSubsequentBroadcastDelay)
   315  					time.Sleep(c.config.HandleSubsequentBroadcastDelay)
   316  				}
   317  			}
   318  
   319  			c.dkgLock.Lock()
   320  			err := c.dkg.HandleBroadcastMsg(int(msg.CommitteeMemberIndex), msg.Data)
   321  			c.dkgLock.Unlock()
   322  			if err != nil {
   323  				c.log.Err(err).Msg("error processing DKG broadcast message")
   324  			}
   325  
   326  		case <-c.shutdownCh:
   327  			return
   328  		}
   329  	}
   330  }
   331  
   332  func (c *Controller) start() error {
   333  	state := c.GetState()
   334  	if state != Init {
   335  		return fmt.Errorf("cannot execute start routine in state %s", state)
   336  	}
   337  
   338  	// before starting the DKG, sleep for a random delay to avoid synchronizing
   339  	// this expensive operation across all consensus nodes
   340  	delay := c.preStartDelay()
   341  	c.log.Debug().Msgf("sleeping for %s before starting DKG", delay)
   342  	time.Sleep(delay)
   343  
   344  	c.dkgLock.Lock()
   345  	err := c.dkg.Start(c.seed)
   346  	c.dkgLock.Unlock()
   347  	if err != nil {
   348  		return fmt.Errorf("Error starting DKG: %w", err)
   349  	}
   350  
   351  	c.log.Debug().Msg("DKG engine started")
   352  	c.SetState(Phase1)
   353  	return nil
   354  }
   355  
   356  func (c *Controller) phase1() error {
   357  	state := c.GetState()
   358  	if state != Phase1 {
   359  		return fmt.Errorf("Cannot execute phase1 routine in state %s", state)
   360  	}
   361  
   362  	c.log.Debug().Msg("Waiting for end of phase 1")
   363  	for {
   364  		select {
   365  		case <-c.h1Ch:
   366  			return nil
   367  		case <-c.shutdownCh:
   368  			return nil
   369  		}
   370  	}
   371  }
   372  
   373  func (c *Controller) phase2() error {
   374  	state := c.GetState()
   375  	if state != Phase2 {
   376  		return fmt.Errorf("Cannot execute phase2 routine in state %s", state)
   377  	}
   378  
   379  	c.dkgLock.Lock()
   380  	err := c.dkg.NextTimeout()
   381  	c.dkgLock.Unlock()
   382  	if err != nil {
   383  		return fmt.Errorf("Error calling NextTimeout: %w", err)
   384  	}
   385  
   386  	c.log.Debug().Msg("Waiting for end of phase 2")
   387  	for {
   388  		select {
   389  		case <-c.h2Ch:
   390  			return nil
   391  		case <-c.shutdownCh:
   392  			return nil
   393  		}
   394  	}
   395  }
   396  
   397  func (c *Controller) phase3() error {
   398  	state := c.GetState()
   399  	if state != Phase3 {
   400  		return fmt.Errorf("Cannot execute phase3 routine in state %s", state)
   401  	}
   402  
   403  	c.dkgLock.Lock()
   404  	err := c.dkg.NextTimeout()
   405  	c.dkgLock.Unlock()
   406  	if err != nil {
   407  		return fmt.Errorf("Error calling NextTimeout: %w", err)
   408  	}
   409  
   410  	c.log.Debug().Msg("Waiting for end of phase 3")
   411  	for {
   412  		select {
   413  		case <-c.endCh:
   414  			return nil
   415  		case <-c.shutdownCh:
   416  			return nil
   417  		}
   418  	}
   419  }
   420  
   421  // preStartDelay returns a duration to delay prior to starting the DKG process.
   422  // This prevents synchronization of the DKG starting (an expensive operation)
   423  // across the network, which can impact finalization.
   424  func (c *Controller) preStartDelay() time.Duration {
   425  	delay := computePreprocessingDelay(c.config.BaseStartDelay, c.dkg.Size())
   426  	return delay
   427  }
   428  
   429  // preHandleFirstBroadcastDelay returns a duration to delay prior to handling
   430  // the first broadcast message. This delay is used only during phase 1 of the DKG.
   431  // This prevents synchronization of processing verification vectors (an
   432  // expensive operation) across the network, which can impact finalization.
   433  func (c *Controller) preHandleFirstBroadcastDelay() time.Duration {
   434  	delay := computePreprocessingDelay(c.config.BaseHandleFirstBroadcastDelay, c.dkg.Size())
   435  	return delay
   436  }
   437  
   438  // computePreprocessingDelay computes a random delay to introduce before an
   439  // expensive operation.
   440  //
   441  // The maximum delay is m=b*n^2 where:
   442  // * b is a configurable base delay
   443  // * n is the size of the DKG committee
   444  func computePreprocessingDelay(baseDelay time.Duration, dkgSize int) time.Duration {
   445  
   446  	maxDelay := computePreprocessingDelayMax(baseDelay, dkgSize)
   447  	if maxDelay <= 0 {
   448  		return 0
   449  	}
   450  	// select delay from [0,m)
   451  	delay := time.Duration(rand.Int63n(maxDelay.Nanoseconds()))
   452  	return delay
   453  }
   454  
   455  // computePreprocessingDelayMax computes the maximum dely for computePreprocessingDelay.
   456  func computePreprocessingDelayMax(baseDelay time.Duration, dkgSize int) time.Duration {
   457  	// sanity checks
   458  	if baseDelay < 0 {
   459  		baseDelay = 0
   460  	}
   461  	if dkgSize < 0 {
   462  		dkgSize = 0
   463  	}
   464  
   465  	// m=b*n^2
   466  	maxDelay := time.Duration(math.Pow(float64(dkgSize), 2)) * baseDelay
   467  	if maxDelay <= 0 {
   468  		return 0
   469  	}
   470  	return maxDelay
   471  }