github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/consensus/dkg/reactor_engine.go (about)

     1  package dkg
     2  
     3  import (
     4  	"crypto/rand"
     5  	"errors"
     6  	"fmt"
     7  
     8  	"github.com/onflow/crypto"
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/onflow/flow-go/engine"
    12  	"github.com/onflow/flow-go/model/flow"
    13  	"github.com/onflow/flow-go/model/flow/filter"
    14  	"github.com/onflow/flow-go/module"
    15  	dkgmodule "github.com/onflow/flow-go/module/dkg"
    16  	"github.com/onflow/flow-go/state/protocol"
    17  	"github.com/onflow/flow-go/state/protocol/events"
    18  	"github.com/onflow/flow-go/storage"
    19  )
    20  
// DefaultPollStep specifies the default number of views that separate two calls
// to the DKG smart-contract to read broadcast messages. NewReactorEngine uses
// this value to initialise the poll step of every engine it creates.
const DefaultPollStep = 10
    24  
// dkgInfo consolidates information about the current DKG protocol instance.
// It is assembled by getDKGInfo and consumed by startDKGForEpoch:
//   - identities holds the initial identities of the next epoch; the consensus
//     committee taking part in the DKG is filtered out of this list,
//   - phase1FinalView, phase2FinalView and phase3FinalView are the views at
//     which the end of the respective DKG phase is triggered,
//   - seed is the randomness handed to the DKG controller factory.
type dkgInfo struct {
	identities      flow.IdentitySkeletonList
	phase1FinalView uint64
	phase2FinalView uint64
	phase3FinalView uint64
	// seed must be generated for each DKG instance, using a randomness source that is independent from all other nodes.
	seed []byte
}
    34  
// ReactorEngine is an engine that reacts to chain events to start new DKG runs,
// and manage subsequent phase transitions. Any unexpected error is treated as
// fatal (it is reported through the logger at Fatal level), as it would
// undermine the security of the protocol.
//
// Field overview:
//   - events.Noop is embedded; presumably it supplies no-op handlers for the
//     protocol events this engine does not override (confirm against the
//     events package),
//   - unit launches and serialises the engine's background work,
//   - State is the protocol state used to read epoch phase, counters and DKG data,
//   - dkgState persists the DKG started flag, end state and beacon private key,
//   - controller is the controller of the DKG run started most recently by
//     startDKGForEpoch; it is nil until a DKG has been started,
//   - controllerFactory creates one controller per DKG run,
//   - viewEvents is used to register callbacks for specific views,
//   - pollStep is the number of views between two polls of the DKG smart-contract.
//
// TODO replace engine.Unit with component.Component
type ReactorEngine struct {
	events.Noop
	unit              *engine.Unit
	log               zerolog.Logger
	me                module.Local
	State             protocol.State
	dkgState          storage.DKGState
	controller        module.DKGController
	controllerFactory module.DKGControllerFactory
	viewEvents        events.Views
	pollStep          uint64
}
    51  
    52  // NewReactorEngine return a new ReactorEngine.
    53  func NewReactorEngine(
    54  	log zerolog.Logger,
    55  	me module.Local,
    56  	state protocol.State,
    57  	dkgState storage.DKGState,
    58  	controllerFactory module.DKGControllerFactory,
    59  	viewEvents events.Views,
    60  ) *ReactorEngine {
    61  
    62  	logger := log.With().
    63  		Str("engine", "dkg_reactor").
    64  		Logger()
    65  
    66  	return &ReactorEngine{
    67  		unit:              engine.NewUnit(),
    68  		log:               logger,
    69  		me:                me,
    70  		State:             state,
    71  		dkgState:          dkgState,
    72  		controllerFactory: controllerFactory,
    73  		viewEvents:        viewEvents,
    74  		pollStep:          DefaultPollStep,
    75  	}
    76  }
    77  
    78  // Ready implements the module ReadyDoneAware interface. It returns a channel
    79  // that will close when the engine has successfully started.
    80  func (e *ReactorEngine) Ready() <-chan struct{} {
    81  	return e.unit.Ready(func() {
    82  		// If we are starting up in the EpochSetup phase, try to start the DKG.
    83  		// If the DKG for this epoch has been started previously, we will exit
    84  		// and fail this epoch's DKG.
    85  		snap := e.State.Final()
    86  
    87  		phase, err := snap.Phase()
    88  		if err != nil {
    89  			// unexpected storage-level error
    90  			// TODO use irrecoverable context
    91  			e.log.Fatal().Err(err).Msg("failed to check epoch phase when starting DKG reactor engine")
    92  			return
    93  		}
    94  		currentCounter, err := snap.Epochs().Current().Counter()
    95  		if err != nil {
    96  			// unexpected storage-level error
    97  			// TODO use irrecoverable context
    98  			e.log.Fatal().Err(err).Msg("failed to retrieve current epoch counter when starting DKG reactor engine")
    99  			return
   100  		}
   101  		first, err := snap.Head()
   102  		if err != nil {
   103  			// unexpected storage-level error
   104  			// TODO use irrecoverable context
   105  			e.log.Fatal().Err(err).Msg("failed to retrieve finalized header when starting DKG reactor engine")
   106  			return
   107  		}
   108  
   109  		// If we start up in EpochSetup phase, attempt to start the DKG in case it wasn't started previously
   110  		if phase == flow.EpochPhaseSetup {
   111  			e.startDKGForEpoch(currentCounter, first)
   112  		} else if phase == flow.EpochPhaseCommitted {
   113  			// If we start up in EpochCommitted phase, ensure the DKG end state is set correctly.
   114  			e.handleEpochCommittedPhaseStarted(currentCounter, first)
   115  		}
   116  	})
   117  }
   118  
   119  // Done implements the module ReadyDoneAware interface. It returns a channel
   120  // that will close when the engine has successfully stopped.
   121  func (e *ReactorEngine) Done() <-chan struct{} {
   122  	return e.unit.Done()
   123  }
   124  
   125  // EpochSetupPhaseStarted handles the EpochSetupPhaseStarted protocol event by
   126  // starting the DKG process.
   127  // NOTE: ReactorEngine will not recover from mid-DKG crashes, therefore we do not need to handle dropped protocol events here.
   128  func (e *ReactorEngine) EpochSetupPhaseStarted(currentEpochCounter uint64, first *flow.Header) {
   129  	e.startDKGForEpoch(currentEpochCounter, first)
   130  }
   131  
   132  // EpochCommittedPhaseStarted handles the EpochCommittedPhaseStarted protocol
   133  // event by checking the consistency of our locally computed key share.
   134  // NOTE: ReactorEngine will not recover from mid-DKG crashes, therefore we do not need to handle dropped protocol events here.
   135  func (e *ReactorEngine) EpochCommittedPhaseStarted(currentEpochCounter uint64, first *flow.Header) {
   136  	e.handleEpochCommittedPhaseStarted(currentEpochCounter, first)
   137  }
   138  
   139  // startDKGForEpoch attempts to start the DKG instance for the given epoch,
   140  // only if we have never started the DKG during setup phase for the given epoch.
   141  // This allows consensus nodes which boot from a state snapshot within the
   142  // EpochSetup phase to run the DKG.
   143  //
   144  // It starts a new controller for the epoch and registers the triggers to regularly
   145  // query the DKG smart-contract and transition between phases at the specified views.
   146  func (e *ReactorEngine) startDKGForEpoch(currentEpochCounter uint64, first *flow.Header) {
   147  
   148  	firstID := first.ID()
   149  	nextEpochCounter := currentEpochCounter + 1
   150  	log := e.log.With().
   151  		Uint64("cur_epoch", currentEpochCounter). // the epoch we are in the middle of
   152  		Uint64("next_epoch", nextEpochCounter).   // the epoch we are running the DKG for
   153  		Uint64("first_block_view", first.View).   // view of first block in EpochSetup phase
   154  		Hex("first_block_id", firstID[:]).        // id of first block in EpochSetup phase
   155  		Logger()
   156  
   157  	// if we have started the dkg for this epoch already, exit
   158  	started, err := e.dkgState.GetDKGStarted(nextEpochCounter)
   159  	if err != nil {
   160  		// unexpected storage-level error
   161  		// TODO use irrecoverable context
   162  		log.Fatal().Err(err).Msg("could not check whether DKG is started")
   163  	}
   164  	if started {
   165  		log.Warn().Msg("DKG started before, skipping starting the DKG for this epoch")
   166  		return
   167  	}
   168  
   169  	// flag that we are starting the dkg for this epoch
   170  	err = e.dkgState.SetDKGStarted(nextEpochCounter)
   171  	if err != nil {
   172  		// unexpected storage-level error
   173  		// TODO use irrecoverable context
   174  		log.Fatal().Err(err).Msg("could not set dkg started")
   175  	}
   176  
   177  	curDKGInfo, err := e.getDKGInfo(firstID)
   178  	if err != nil {
   179  		// unexpected storage-level error
   180  		// TODO use irrecoverable context
   181  		log.Fatal().Err(err).Msg("could not retrieve epoch info")
   182  	}
   183  
   184  	committee := curDKGInfo.identities.Filter(filter.IsConsensusCommitteeMember)
   185  
   186  	log.Info().
   187  		Uint64("phase1", curDKGInfo.phase1FinalView).
   188  		Uint64("phase2", curDKGInfo.phase2FinalView).
   189  		Uint64("phase3", curDKGInfo.phase3FinalView).
   190  		Interface("members", committee.NodeIDs()).
   191  		Msg("epoch info")
   192  
   193  	if _, ok := committee.GetIndex(e.me.NodeID()); !ok {
   194  		// node not found in DKG committee bypass starting the DKG
   195  		log.Warn().Str("node_id", e.me.NodeID().String()).Msg("failed to find our node ID in the DKG committee skip starting DKG engine, this node will not participate in consensus after the next epoch starts")
   196  		return
   197  	}
   198  	controller, err := e.controllerFactory.Create(
   199  		dkgmodule.CanonicalInstanceID(first.ChainID, nextEpochCounter),
   200  		committee,
   201  		curDKGInfo.seed,
   202  	)
   203  	if err != nil {
   204  		// no expected errors in controller factory
   205  		// TODO use irrecoverable context
   206  		log.Fatal().Err(err).Msg("could not create DKG controller")
   207  	}
   208  	e.controller = controller
   209  
   210  	e.unit.Launch(func() {
   211  		log.Info().Msg("DKG Run")
   212  		err := e.controller.Run()
   213  		if err != nil {
   214  			// TODO handle crypto sentinels and do not crash here
   215  			log.Fatal().Err(err).Msg("DKG Run error")
   216  		}
   217  	})
   218  
   219  	// NOTE:
   220  	// We register two callbacks for views that mark a state transition: one for
   221  	// polling broadcast messages, and one for triggering the phase transition.
   222  	// It is essential that all polled broadcast messages are processed before
   223  	// starting the phase transition. Here we register the polling callback
   224  	// before the phase transition, which guarantees that it will be called
   225  	// before because callbacks for the same views are executed on a FIFO basis.
   226  	// Moreover, the poll callback does not return until all received messages
   227  	// are processed by the underlying DKG controller (as guaranteed by the
   228  	// specifications and implementations of the DKGBroker and DKGController
   229  	// interfaces).
   230  
   231  	for view := curDKGInfo.phase1FinalView; view > first.View; view -= e.pollStep {
   232  		e.registerPoll(view)
   233  	}
   234  	e.registerPhaseTransition(curDKGInfo.phase1FinalView, dkgmodule.Phase1, e.controller.EndPhase1)
   235  
   236  	for view := curDKGInfo.phase2FinalView; view > curDKGInfo.phase1FinalView; view -= e.pollStep {
   237  		e.registerPoll(view)
   238  	}
   239  	e.registerPhaseTransition(curDKGInfo.phase2FinalView, dkgmodule.Phase2, e.controller.EndPhase2)
   240  
   241  	for view := curDKGInfo.phase3FinalView; view > curDKGInfo.phase2FinalView; view -= e.pollStep {
   242  		e.registerPoll(view)
   243  	}
   244  	e.registerPhaseTransition(curDKGInfo.phase3FinalView, dkgmodule.Phase3, e.end(nextEpochCounter))
   245  }
   246  
   247  // handleEpochCommittedPhaseStarted is invoked upon the transition to the EpochCommitted
   248  // phase, when the canonical beacon key vector is incorporated into the protocol state.
   249  //
   250  // This function checks that the local DKG completed and that our locally computed
   251  // key share is consistent with the canonical key vector. When this function returns,
   252  // an end state for the just-completed DKG is guaranteed to be stored (if not, the
   253  // program will crash). Since this function is invoked synchronously before the end
   254  // of the current epoch, this guarantees that when we reach the end of the current epoch
   255  // we will either have a usable beacon key (successful DKG) or a DKG failure end state
   256  // stored, so we can safely fall back to using our staking key.
   257  //
   258  // CAUTION: This function is not safe for concurrent use. This is not enforced within
   259  // the ReactorEngine - instead we rely on the protocol event emission being single-threaded
   260  func (e *ReactorEngine) handleEpochCommittedPhaseStarted(currentEpochCounter uint64, firstBlock *flow.Header) {
   261  
   262  	// the DKG we have just completed produces keys that we will use in the next epoch
   263  	nextEpochCounter := currentEpochCounter + 1
   264  
   265  	log := e.log.With().
   266  		Uint64("cur_epoch", currentEpochCounter). // the epoch we are in the middle of
   267  		Uint64("next_epoch", nextEpochCounter).   // the epoch the just-finished DKG was preparing for
   268  		Logger()
   269  
   270  	// Check whether we have already set the end state for this DKG.
   271  	// This can happen if the DKG failed locally, if we failed to generate
   272  	// a local private beacon key, or if we crashed while performing this
   273  	// check previously.
   274  	endState, err := e.dkgState.GetDKGEndState(nextEpochCounter)
   275  	if err == nil {
   276  		log.Warn().Msgf("checking beacon key consistency: exiting because dkg end state was already set: %s", endState.String())
   277  		return
   278  	}
   279  
   280  	// Since epoch phase transitions are emitted when the first block of the new
   281  	// phase is finalized, the block's snapshot is guaranteed to already be
   282  	// accessible in the protocol state at this point (even though the Badger
   283  	// transaction finalizing the block has not been committed yet).
   284  	nextDKG, err := e.State.AtBlockID(firstBlock.ID()).Epochs().Next().DKG()
   285  	if err != nil {
   286  		// CAUTION: this should never happen, indicates a storage failure or corruption
   287  		// TODO use irrecoverable context
   288  		log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve next DKG info")
   289  		return
   290  	}
   291  
   292  	myBeaconPrivKey, err := e.dkgState.RetrieveMyBeaconPrivateKey(nextEpochCounter)
   293  	if errors.Is(err, storage.ErrNotFound) {
   294  		log.Warn().Msg("checking beacon key consistency: no key found")
   295  		err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateNoKey)
   296  		if err != nil {
   297  			// TODO use irrecoverable context
   298  			log.Fatal().Err(err).Msg("failed to set dkg end state")
   299  		}
   300  		return
   301  	} else if err != nil {
   302  		// TODO use irrecoverable context
   303  		log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve beacon private key for next epoch")
   304  		return
   305  	}
   306  
   307  	nextDKGPubKey, err := nextDKG.KeyShare(e.me.NodeID())
   308  	if err != nil {
   309  		// TODO use irrecoverable context
   310  		log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve my beacon public key for next epoch")
   311  		return
   312  	}
   313  	localPubKey := myBeaconPrivKey.PublicKey()
   314  
   315  	// we computed a local beacon key but it is inconsistent with our canonical
   316  	// public key - therefore it is unsafe for use
   317  	if !nextDKGPubKey.Equals(localPubKey) {
   318  		log.Warn().
   319  			Str("computed_beacon_pub_key", localPubKey.String()).
   320  			Str("canonical_beacon_pub_key", nextDKGPubKey.String()).
   321  			Msg("checking beacon key consistency: locally computed beacon public key does not match beacon public key for next epoch")
   322  		err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateInconsistentKey)
   323  		if err != nil {
   324  			// TODO use irrecoverable context
   325  			log.Fatal().Err(err).Msg("failed to set dkg end state")
   326  		}
   327  		return
   328  	}
   329  
   330  	err = e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateSuccess)
   331  	if err != nil {
   332  		// TODO use irrecoverable context
   333  		e.log.Fatal().Err(err).Msg("failed to set dkg end state")
   334  	}
   335  	log.Info().Msgf("successfully ended DKG, my beacon pub key for epoch %d is %s", nextEpochCounter, localPubKey)
   336  }
   337  
   338  // TODO document error returns
   339  func (e *ReactorEngine) getDKGInfo(firstBlockID flow.Identifier) (*dkgInfo, error) {
   340  	currEpoch := e.State.AtBlockID(firstBlockID).Epochs().Current()
   341  	nextEpoch := e.State.AtBlockID(firstBlockID).Epochs().Next()
   342  
   343  	identities, err := nextEpoch.InitialIdentities()
   344  	if err != nil {
   345  		return nil, fmt.Errorf("could not retrieve epoch identities: %w", err)
   346  	}
   347  	phase1Final, phase2Final, phase3Final, err := protocol.DKGPhaseViews(currEpoch)
   348  	if err != nil {
   349  		return nil, fmt.Errorf("could not retrieve epoch dkg final views: %w", err)
   350  	}
   351  	seed := make([]byte, crypto.KeyGenSeedMinLen)
   352  	_, err = rand.Read(seed)
   353  	if err != nil {
   354  		return nil, fmt.Errorf("could not generate random seed: %w", err)
   355  	}
   356  
   357  	info := &dkgInfo{
   358  		identities:      identities,
   359  		phase1FinalView: phase1Final,
   360  		phase2FinalView: phase2Final,
   361  		phase3FinalView: phase3Final,
   362  		seed:            seed,
   363  	}
   364  	return info, nil
   365  }
   366  
   367  // registerPoll instructs the engine to query the DKG smart-contract for new
   368  // broadcast messages at the specified view.
   369  func (e *ReactorEngine) registerPoll(view uint64) {
   370  	e.viewEvents.OnView(view, func(header *flow.Header) {
   371  		e.unit.Launch(func() {
   372  			e.unit.Lock()
   373  			defer e.unit.Unlock()
   374  
   375  			blockID := header.ID()
   376  			log := e.log.With().
   377  				Uint64("view", view).
   378  				Uint64("height", header.Height).
   379  				Hex("block_id", blockID[:]).
   380  				Logger()
   381  
   382  			log.Info().Msg("polling DKG smart-contract...")
   383  			err := e.controller.Poll(header.ID())
   384  			if err != nil {
   385  				log.Err(err).Msg("failed to poll DKG smart-contract")
   386  			}
   387  		})
   388  	})
   389  }
   390  
   391  // registerPhaseTransition instructs the engine to change phases at the
   392  // specified view.
   393  func (e *ReactorEngine) registerPhaseTransition(view uint64, fromState dkgmodule.State, phaseTransition func() error) {
   394  	e.viewEvents.OnView(view, func(header *flow.Header) {
   395  		e.unit.Launch(func() {
   396  			e.unit.Lock()
   397  			defer e.unit.Unlock()
   398  
   399  			blockID := header.ID()
   400  			log := e.log.With().
   401  				Uint64("view", view).
   402  				Hex("block_id", blockID[:]).
   403  				Logger()
   404  
   405  			log.Info().Msgf("ending %s...", fromState)
   406  			err := phaseTransition()
   407  			if err != nil {
   408  				// TODO use irrecoverable context
   409  				log.Fatal().Err(err).Msgf("node failed to end %s", fromState)
   410  			}
   411  			log.Info().Msgf("ended %s successfully", fromState)
   412  		})
   413  	})
   414  }
   415  
   416  // end returns a callback that is used to end the DKG protocol, save the
   417  // resulting private key to storage, and publish the other results to the DKG
   418  // smart-contract.
   419  func (e *ReactorEngine) end(nextEpochCounter uint64) func() error {
   420  	return func() error {
   421  
   422  		err := e.controller.End()
   423  		if crypto.IsDKGFailureError(err) {
   424  			e.log.Warn().Err(err).Msgf("node %s with index %d failed DKG locally", e.me.NodeID(), e.controller.GetIndex())
   425  			err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateDKGFailure)
   426  			if err != nil {
   427  				return fmt.Errorf("failed to set dkg end state following dkg end error: %w", err)
   428  			}
   429  		} else if err != nil {
   430  			return fmt.Errorf("unknown error ending the dkg: %w", err)
   431  		}
   432  
   433  		privateShare, _, _ := e.controller.GetArtifacts()
   434  		if privateShare != nil {
   435  			// we only store our key if one was computed
   436  			err = e.dkgState.InsertMyBeaconPrivateKey(nextEpochCounter, privateShare)
   437  			if err != nil {
   438  				return fmt.Errorf("could not save beacon private key in db: %w", err)
   439  			}
   440  		}
   441  
   442  		err = e.controller.SubmitResult()
   443  		if err != nil {
   444  			return fmt.Errorf("couldn't publish DKG results: %w", err)
   445  		}
   446  
   447  		return nil
   448  	}
   449  }