github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/consensus/dkg/reactor_engine.go (about) 1 package dkg 2 3 import ( 4 "crypto/rand" 5 "errors" 6 "fmt" 7 8 "github.com/onflow/crypto" 9 "github.com/rs/zerolog" 10 11 "github.com/onflow/flow-go/engine" 12 "github.com/onflow/flow-go/model/flow" 13 "github.com/onflow/flow-go/model/flow/filter" 14 "github.com/onflow/flow-go/module" 15 dkgmodule "github.com/onflow/flow-go/module/dkg" 16 "github.com/onflow/flow-go/state/protocol" 17 "github.com/onflow/flow-go/state/protocol/events" 18 "github.com/onflow/flow-go/storage" 19 ) 20 21 // DefaultPollStep specifies the default number of views that separate two calls 22 // to the DKG smart-contract to read broadcast messages. 23 const DefaultPollStep = 10 24 25 // dkgInfo consolidates information about the current DKG protocol instance. 26 type dkgInfo struct { 27 identities flow.IdentitySkeletonList 28 phase1FinalView uint64 29 phase2FinalView uint64 30 phase3FinalView uint64 31 // seed must be generated for each DKG instance, using a randomness source that is independent from all other nodes. 32 seed []byte 33 } 34 35 // ReactorEngine is an engine that reacts to chain events to start new DKG runs, 36 // and manage subsequent phase transitions. Any unexpected error triggers a 37 // panic as it would undermine the security of the protocol. 38 // TODO replace engine.Unit with component.Component 39 type ReactorEngine struct { 40 events.Noop 41 unit *engine.Unit 42 log zerolog.Logger 43 me module.Local 44 State protocol.State 45 dkgState storage.DKGState 46 controller module.DKGController 47 controllerFactory module.DKGControllerFactory 48 viewEvents events.Views 49 pollStep uint64 50 } 51 52 // NewReactorEngine return a new ReactorEngine. 53 func NewReactorEngine( 54 log zerolog.Logger, 55 me module.Local, 56 state protocol.State, 57 dkgState storage.DKGState, 58 controllerFactory module.DKGControllerFactory, 59 viewEvents events.Views, 60 ) *ReactorEngine { 61 62 logger := log.With(). 63 Str("engine", "dkg_reactor"). 64 Logger() 65 66 return &ReactorEngine{ 67 unit: engine.NewUnit(), 68 log: logger, 69 me: me, 70 State: state, 71 dkgState: dkgState, 72 controllerFactory: controllerFactory, 73 viewEvents: viewEvents, 74 pollStep: DefaultPollStep, 75 } 76 } 77 78 // Ready implements the module ReadyDoneAware interface. It returns a channel 79 // that will close when the engine has successfully started. 80 func (e *ReactorEngine) Ready() <-chan struct{} { 81 return e.unit.Ready(func() { 82 // If we are starting up in the EpochSetup phase, try to start the DKG. 83 // If the DKG for this epoch has been started previously, we will exit 84 // and fail this epoch's DKG. 85 snap := e.State.Final() 86 87 phase, err := snap.Phase() 88 if err != nil { 89 // unexpected storage-level error 90 // TODO use irrecoverable context 91 e.log.Fatal().Err(err).Msg("failed to check epoch phase when starting DKG reactor engine") 92 return 93 } 94 currentCounter, err := snap.Epochs().Current().Counter() 95 if err != nil { 96 // unexpected storage-level error 97 // TODO use irrecoverable context 98 e.log.Fatal().Err(err).Msg("failed to retrieve current epoch counter when starting DKG reactor engine") 99 return 100 } 101 first, err := snap.Head() 102 if err != nil { 103 // unexpected storage-level error 104 // TODO use irrecoverable context 105 e.log.Fatal().Err(err).Msg("failed to retrieve finalized header when starting DKG reactor engine") 106 return 107 } 108 109 // If we start up in EpochSetup phase, attempt to start the DKG in case it wasn't started previously 110 if phase == flow.EpochPhaseSetup { 111 e.startDKGForEpoch(currentCounter, first) 112 } else if phase == flow.EpochPhaseCommitted { 113 // If we start up in EpochCommitted phase, ensure the DKG end state is set correctly. 114 e.handleEpochCommittedPhaseStarted(currentCounter, first) 115 } 116 }) 117 } 118 119 // Done implements the module ReadyDoneAware interface. It returns a channel 120 // that will close when the engine has successfully stopped. 121 func (e *ReactorEngine) Done() <-chan struct{} { 122 return e.unit.Done() 123 } 124 125 // EpochSetupPhaseStarted handles the EpochSetupPhaseStarted protocol event by 126 // starting the DKG process. 127 // NOTE: ReactorEngine will not recover from mid-DKG crashes, therefore we do not need to handle dropped protocol events here. 128 func (e *ReactorEngine) EpochSetupPhaseStarted(currentEpochCounter uint64, first *flow.Header) { 129 e.startDKGForEpoch(currentEpochCounter, first) 130 } 131 132 // EpochCommittedPhaseStarted handles the EpochCommittedPhaseStarted protocol 133 // event by checking the consistency of our locally computed key share. 134 // NOTE: ReactorEngine will not recover from mid-DKG crashes, therefore we do not need to handle dropped protocol events here. 135 func (e *ReactorEngine) EpochCommittedPhaseStarted(currentEpochCounter uint64, first *flow.Header) { 136 e.handleEpochCommittedPhaseStarted(currentEpochCounter, first) 137 } 138 139 // startDKGForEpoch attempts to start the DKG instance for the given epoch, 140 // only if we have never started the DKG during setup phase for the given epoch. 141 // This allows consensus nodes which boot from a state snapshot within the 142 // EpochSetup phase to run the DKG. 143 // 144 // It starts a new controller for the epoch and registers the triggers to regularly 145 // query the DKG smart-contract and transition between phases at the specified views. 146 func (e *ReactorEngine) startDKGForEpoch(currentEpochCounter uint64, first *flow.Header) { 147 148 firstID := first.ID() 149 nextEpochCounter := currentEpochCounter + 1 150 log := e.log.With(). 151 Uint64("cur_epoch", currentEpochCounter). // the epoch we are in the middle of 152 Uint64("next_epoch", nextEpochCounter). // the epoch we are running the DKG for 153 Uint64("first_block_view", first.View). // view of first block in EpochSetup phase 154 Hex("first_block_id", firstID[:]). // id of first block in EpochSetup phase 155 Logger() 156 157 // if we have started the dkg for this epoch already, exit 158 started, err := e.dkgState.GetDKGStarted(nextEpochCounter) 159 if err != nil { 160 // unexpected storage-level error 161 // TODO use irrecoverable context 162 log.Fatal().Err(err).Msg("could not check whether DKG is started") 163 } 164 if started { 165 log.Warn().Msg("DKG started before, skipping starting the DKG for this epoch") 166 return 167 } 168 169 // flag that we are starting the dkg for this epoch 170 err = e.dkgState.SetDKGStarted(nextEpochCounter) 171 if err != nil { 172 // unexpected storage-level error 173 // TODO use irrecoverable context 174 log.Fatal().Err(err).Msg("could not set dkg started") 175 } 176 177 curDKGInfo, err := e.getDKGInfo(firstID) 178 if err != nil { 179 // unexpected storage-level error 180 // TODO use irrecoverable context 181 log.Fatal().Err(err).Msg("could not retrieve epoch info") 182 } 183 184 committee := curDKGInfo.identities.Filter(filter.IsConsensusCommitteeMember) 185 186 log.Info(). 187 Uint64("phase1", curDKGInfo.phase1FinalView). 188 Uint64("phase2", curDKGInfo.phase2FinalView). 189 Uint64("phase3", curDKGInfo.phase3FinalView). 190 Interface("members", committee.NodeIDs()). 191 Msg("epoch info") 192 193 if _, ok := committee.GetIndex(e.me.NodeID()); !ok { 194 // node not found in DKG committee bypass starting the DKG 195 log.Warn().Str("node_id", e.me.NodeID().String()).Msg("failed to find our node ID in the DKG committee skip starting DKG engine, this node will not participate in consensus after the next epoch starts") 196 return 197 } 198 controller, err := e.controllerFactory.Create( 199 dkgmodule.CanonicalInstanceID(first.ChainID, nextEpochCounter), 200 committee, 201 curDKGInfo.seed, 202 ) 203 if err != nil { 204 // no expected errors in controller factory 205 // TODO use irrecoverable context 206 log.Fatal().Err(err).Msg("could not create DKG controller") 207 } 208 e.controller = controller 209 210 e.unit.Launch(func() { 211 log.Info().Msg("DKG Run") 212 err := e.controller.Run() 213 if err != nil { 214 // TODO handle crypto sentinels and do not crash here 215 log.Fatal().Err(err).Msg("DKG Run error") 216 } 217 }) 218 219 // NOTE: 220 // We register two callbacks for views that mark a state transition: one for 221 // polling broadcast messages, and one for triggering the phase transition. 222 // It is essential that all polled broadcast messages are processed before 223 // starting the phase transition. Here we register the polling callback 224 // before the phase transition, which guarantees that it will be called 225 // before because callbacks for the same views are executed on a FIFO basis. 226 // Moreover, the poll callback does not return until all received messages 227 // are processed by the underlying DKG controller (as guaranteed by the 228 // specifications and implementations of the DKGBroker and DKGController 229 // interfaces). 230 231 for view := curDKGInfo.phase1FinalView; view > first.View; view -= e.pollStep { 232 e.registerPoll(view) 233 } 234 e.registerPhaseTransition(curDKGInfo.phase1FinalView, dkgmodule.Phase1, e.controller.EndPhase1) 235 236 for view := curDKGInfo.phase2FinalView; view > curDKGInfo.phase1FinalView; view -= e.pollStep { 237 e.registerPoll(view) 238 } 239 e.registerPhaseTransition(curDKGInfo.phase2FinalView, dkgmodule.Phase2, e.controller.EndPhase2) 240 241 for view := curDKGInfo.phase3FinalView; view > curDKGInfo.phase2FinalView; view -= e.pollStep { 242 e.registerPoll(view) 243 } 244 e.registerPhaseTransition(curDKGInfo.phase3FinalView, dkgmodule.Phase3, e.end(nextEpochCounter)) 245 } 246 247 // handleEpochCommittedPhaseStarted is invoked upon the transition to the EpochCommitted 248 // phase, when the canonical beacon key vector is incorporated into the protocol state. 249 // 250 // This function checks that the local DKG completed and that our locally computed 251 // key share is consistent with the canonical key vector. When this function returns, 252 // an end state for the just-completed DKG is guaranteed to be stored (if not, the 253 // program will crash). Since this function is invoked synchronously before the end 254 // of the current epoch, this guarantees that when we reach the end of the current epoch 255 // we will either have a usable beacon key (successful DKG) or a DKG failure end state 256 // stored, so we can safely fall back to using our staking key. 257 // 258 // CAUTION: This function is not safe for concurrent use. This is not enforced within 259 // the ReactorEngine - instead we rely on the protocol event emission being single-threaded 260 func (e *ReactorEngine) handleEpochCommittedPhaseStarted(currentEpochCounter uint64, firstBlock *flow.Header) { 261 262 // the DKG we have just completed produces keys that we will use in the next epoch 263 nextEpochCounter := currentEpochCounter + 1 264 265 log := e.log.With(). 266 Uint64("cur_epoch", currentEpochCounter). // the epoch we are in the middle of 267 Uint64("next_epoch", nextEpochCounter). // the epoch the just-finished DKG was preparing for 268 Logger() 269 270 // Check whether we have already set the end state for this DKG. 271 // This can happen if the DKG failed locally, if we failed to generate 272 // a local private beacon key, or if we crashed while performing this 273 // check previously. 274 endState, err := e.dkgState.GetDKGEndState(nextEpochCounter) 275 if err == nil { 276 log.Warn().Msgf("checking beacon key consistency: exiting because dkg end state was already set: %s", endState.String()) 277 return 278 } 279 280 // Since epoch phase transitions are emitted when the first block of the new 281 // phase is finalized, the block's snapshot is guaranteed to already be 282 // accessible in the protocol state at this point (even though the Badger 283 // transaction finalizing the block has not been committed yet). 284 nextDKG, err := e.State.AtBlockID(firstBlock.ID()).Epochs().Next().DKG() 285 if err != nil { 286 // CAUTION: this should never happen, indicates a storage failure or corruption 287 // TODO use irrecoverable context 288 log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve next DKG info") 289 return 290 } 291 292 myBeaconPrivKey, err := e.dkgState.RetrieveMyBeaconPrivateKey(nextEpochCounter) 293 if errors.Is(err, storage.ErrNotFound) { 294 log.Warn().Msg("checking beacon key consistency: no key found") 295 err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateNoKey) 296 if err != nil { 297 // TODO use irrecoverable context 298 log.Fatal().Err(err).Msg("failed to set dkg end state") 299 } 300 return 301 } else if err != nil { 302 // TODO use irrecoverable context 303 log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve beacon private key for next epoch") 304 return 305 } 306 307 nextDKGPubKey, err := nextDKG.KeyShare(e.me.NodeID()) 308 if err != nil { 309 // TODO use irrecoverable context 310 log.Fatal().Err(err).Msg("checking beacon key consistency: could not retrieve my beacon public key for next epoch") 311 return 312 } 313 localPubKey := myBeaconPrivKey.PublicKey() 314 315 // we computed a local beacon key but it is inconsistent with our canonical 316 // public key - therefore it is unsafe for use 317 if !nextDKGPubKey.Equals(localPubKey) { 318 log.Warn(). 319 Str("computed_beacon_pub_key", localPubKey.String()). 320 Str("canonical_beacon_pub_key", nextDKGPubKey.String()). 321 Msg("checking beacon key consistency: locally computed beacon public key does not match beacon public key for next epoch") 322 err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateInconsistentKey) 323 if err != nil { 324 // TODO use irrecoverable context 325 log.Fatal().Err(err).Msg("failed to set dkg end state") 326 } 327 return 328 } 329 330 err = e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateSuccess) 331 if err != nil { 332 // TODO use irrecoverable context 333 e.log.Fatal().Err(err).Msg("failed to set dkg end state") 334 } 335 log.Info().Msgf("successfully ended DKG, my beacon pub key for epoch %d is %s", nextEpochCounter, localPubKey) 336 } 337 338 // TODO document error returns 339 func (e *ReactorEngine) getDKGInfo(firstBlockID flow.Identifier) (*dkgInfo, error) { 340 currEpoch := e.State.AtBlockID(firstBlockID).Epochs().Current() 341 nextEpoch := e.State.AtBlockID(firstBlockID).Epochs().Next() 342 343 identities, err := nextEpoch.InitialIdentities() 344 if err != nil { 345 return nil, fmt.Errorf("could not retrieve epoch identities: %w", err) 346 } 347 phase1Final, phase2Final, phase3Final, err := protocol.DKGPhaseViews(currEpoch) 348 if err != nil { 349 return nil, fmt.Errorf("could not retrieve epoch dkg final views: %w", err) 350 } 351 seed := make([]byte, crypto.KeyGenSeedMinLen) 352 _, err = rand.Read(seed) 353 if err != nil { 354 return nil, fmt.Errorf("could not generate random seed: %w", err) 355 } 356 357 info := &dkgInfo{ 358 identities: identities, 359 phase1FinalView: phase1Final, 360 phase2FinalView: phase2Final, 361 phase3FinalView: phase3Final, 362 seed: seed, 363 } 364 return info, nil 365 } 366 367 // registerPoll instructs the engine to query the DKG smart-contract for new 368 // broadcast messages at the specified view. 369 func (e *ReactorEngine) registerPoll(view uint64) { 370 e.viewEvents.OnView(view, func(header *flow.Header) { 371 e.unit.Launch(func() { 372 e.unit.Lock() 373 defer e.unit.Unlock() 374 375 blockID := header.ID() 376 log := e.log.With(). 377 Uint64("view", view). 378 Uint64("height", header.Height). 379 Hex("block_id", blockID[:]). 380 Logger() 381 382 log.Info().Msg("polling DKG smart-contract...") 383 err := e.controller.Poll(header.ID()) 384 if err != nil { 385 log.Err(err).Msg("failed to poll DKG smart-contract") 386 } 387 }) 388 }) 389 } 390 391 // registerPhaseTransition instructs the engine to change phases at the 392 // specified view. 393 func (e *ReactorEngine) registerPhaseTransition(view uint64, fromState dkgmodule.State, phaseTransition func() error) { 394 e.viewEvents.OnView(view, func(header *flow.Header) { 395 e.unit.Launch(func() { 396 e.unit.Lock() 397 defer e.unit.Unlock() 398 399 blockID := header.ID() 400 log := e.log.With(). 401 Uint64("view", view). 402 Hex("block_id", blockID[:]). 403 Logger() 404 405 log.Info().Msgf("ending %s...", fromState) 406 err := phaseTransition() 407 if err != nil { 408 // TODO use irrecoverable context 409 log.Fatal().Err(err).Msgf("node failed to end %s", fromState) 410 } 411 log.Info().Msgf("ended %s successfully", fromState) 412 }) 413 }) 414 } 415 416 // end returns a callback that is used to end the DKG protocol, save the 417 // resulting private key to storage, and publish the other results to the DKG 418 // smart-contract. 419 func (e *ReactorEngine) end(nextEpochCounter uint64) func() error { 420 return func() error { 421 422 err := e.controller.End() 423 if crypto.IsDKGFailureError(err) { 424 e.log.Warn().Err(err).Msgf("node %s with index %d failed DKG locally", e.me.NodeID(), e.controller.GetIndex()) 425 err := e.dkgState.SetDKGEndState(nextEpochCounter, flow.DKGEndStateDKGFailure) 426 if err != nil { 427 return fmt.Errorf("failed to set dkg end state following dkg end error: %w", err) 428 } 429 } else if err != nil { 430 return fmt.Errorf("unknown error ending the dkg: %w", err) 431 } 432 433 privateShare, _, _ := e.controller.GetArtifacts() 434 if privateShare != nil { 435 // we only store our key if one was computed 436 err = e.dkgState.InsertMyBeaconPrivateKey(nextEpochCounter, privateShare) 437 if err != nil { 438 return fmt.Errorf("could not save beacon private key in db: %w", err) 439 } 440 } 441 442 err = e.controller.SubmitResult() 443 if err != nil { 444 return fmt.Errorf("couldn't publish DKG results: %w", err) 445 } 446 447 return nil 448 } 449 }