github.com/onflow/flow-go@v0.33.17/engine/collection/epochmgr/engine.go

package epochmgr

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/epochs"
	"github.com/onflow/flow-go/module/irrecoverable"
	epochpool "github.com/onflow/flow-go/module/mempool/epochs"
	"github.com/onflow/flow-go/module/util"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/state/protocol/events"
)

// DefaultStartupTimeout is the default time we wait when starting epoch components before giving up.
const DefaultStartupTimeout = time.Minute

// ErrNotAuthorizedForEpoch is returned when we attempt to create epoch components
// for an epoch in which we are not an authorized network participant. This is the
// case for epochs during which this node is joining or leaving the network.
var ErrNotAuthorizedForEpoch = fmt.Errorf("we are not an authorized participant for the epoch")
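
// Callers check for this sentinel with errors.Is, as this engine itself does when
// creating components. A minimal sketch of the pattern used throughout this file:
//
//	components, err := e.createEpochComponents(epoch)
//	if errors.Is(err, ErrNotAuthorizedForEpoch) {
//		// this node is not a cluster participant for the epoch - skip cluster consensus
//	}
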
// Engine is the epoch manager, which coordinates the lifecycle of other modules
// and processes that are epoch-dependent. The manager is responsible for
// spinning up engines when a new epoch is about to start and spinning down
// engines for an epoch that has ended.
//
// The `epochmgr.Engine` implements the `protocol.Consumer` interface. In particular, it
// ingests the following notifications from the protocol state:
//   - EpochSetupPhaseStarted
//   - EpochTransition
//
// As part of the engine starting, it executes pending actions that should have been
// triggered by protocol events but were missed during a crash/restart. See the
// respective consumer methods for further details.
type Engine struct {
	events.Noop // satisfy protocol events consumer interface

	log            zerolog.Logger
	me             module.Local
	state          protocol.State
	pools          *epochpool.TransactionPools // epoch-scoped transaction pools
	factory        EpochComponentsFactory      // consolidates creating components for an epoch
	voter          module.ClusterRootQCVoter   // manages process of voting for next epoch's QC
	heightEvents   events.Heights              // allows subscribing to particular heights
	startupTimeout time.Duration               // how long we wait for epoch components to start up

	mu     sync.RWMutex                       // protects epochs map
	epochs map[uint64]*RunningEpochComponents // epoch-scoped components per epoch

	// internal event notifications
	epochTransitionEvents        chan *flow.Header        // sends first block of new epoch
	epochSetupPhaseStartedEvents chan *flow.Header        // sends first block of EpochSetup phase
	epochStopEvents              chan uint64              // sends counter of epoch to stop
	clusterIDUpdateDistributor   collection.ClusterEvents // sends cluster ID updates to consumers
	cm                           *component.ComponentManager
	component.Component
}

var _ component.Component = (*Engine)(nil)
var _ protocol.Consumer = (*Engine)(nil)

func New(
	log zerolog.Logger,
	me module.Local,
	state protocol.State,
	pools *epochpool.TransactionPools,
	voter module.ClusterRootQCVoter,
	factory EpochComponentsFactory,
	heightEvents events.Heights,
	clusterIDUpdateDistributor collection.ClusterEvents,
) (*Engine, error) {
	e := &Engine{
		log:                          log.With().Str("engine", "epochmgr").Logger(),
		me:                           me,
		state:                        state,
		pools:                        pools,
		voter:                        voter,
		factory:                      factory,
		heightEvents:                 heightEvents,
		epochs:                       make(map[uint64]*RunningEpochComponents),
		startupTimeout:               DefaultStartupTimeout,
		epochTransitionEvents:        make(chan *flow.Header, 1),
		epochSetupPhaseStartedEvents: make(chan *flow.Header, 1),
		epochStopEvents:              make(chan uint64, 1),
		clusterIDUpdateDistributor:   clusterIDUpdateDistributor,
	}

	e.cm = component.NewComponentManagerBuilder().
		AddWorker(e.handleEpochEvents).
		Build()
	e.Component = e.cm

	return e, nil
}

// Start starts the engine.
func (e *Engine) Start(ctx irrecoverable.SignalerContext) {
	// (1) start engine-scoped workers
	e.cm.Start(ctx)

	// (2) retrieve the protocol state as of the latest finalized block; we use this
	// state to catch up on events whose execution was missed during a crash-restart
	finalSnapshot := e.state.Final()

	// (3) check if we should attempt to vote after startup
	err := e.checkShouldVoteOnStartup(finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not vote on startup: %w", err))
	}

	// (4) start epoch-scoped components:
	// (a) set up epoch-scoped components managed by this engine for the current epoch
	err = e.checkShouldStartCurrentEpochComponentsOnStartup(ctx, finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not check or start current epoch components: %w", err))
	}

	// (b) set up epoch-scoped components for the previous epoch
	err = e.checkShouldStartPreviousEpochComponentsOnStartup(ctx, finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not check or start previous epoch components: %w", err))
	}
}
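
// For illustration only: a caller would construct and start the engine roughly as
// follows. Dependency construction is elided and the names (ctx, distributor, ...)
// are hypothetical; the sketch assumes irrecoverable.WithSignaler from this
// module's irrecoverable package.
//
//	eng, err := epochmgr.New(log, me, state, pools, voter, factory, heightEvents, distributor)
//	if err != nil {
//		return err
//	}
//	signalerCtx, errCh := irrecoverable.WithSignaler(ctx)
//	eng.Start(signalerCtx)
//	<-eng.Ready()
//	// errCh surfaces irrecoverable errors thrown via SignalerContext.Throw
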
// checkShouldStartCurrentEpochComponentsOnStartup checks whether we should instantiate
// consensus components for the current epoch upon startup, and if so, starts them.
// We always start current epoch consensus components, unless this node is not an
// authorized participant in the current epoch.
// No errors are expected during normal operation.
func (e *Engine) checkShouldStartCurrentEpochComponentsOnStartup(ctx irrecoverable.SignalerContext, finalSnapshot protocol.Snapshot) error {
	currentEpoch := finalSnapshot.Epochs().Current()
	currentEpochCounter, err := currentEpoch.Counter()
	if err != nil {
		return fmt.Errorf("could not get epoch counter: %w", err)
	}

	components, err := e.createEpochComponents(currentEpoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// don't set up consensus components if we aren't authorized in current epoch
			e.log.Info().Msg("node is not authorized for current epoch - skipping initializing cluster consensus")
			return nil
		}
		return fmt.Errorf("could not create epoch components: %w", err)
	}
	err = e.startEpochComponents(ctx, currentEpochCounter, components)
	if err != nil {
		// all failures to start epoch components are critical
		return fmt.Errorf("could not start epoch components: %w", err)
	}
	return nil
}

// checkShouldStartPreviousEpochComponentsOnStartup checks whether we should re-instantiate
// consensus components for the previous epoch upon startup, and if so, starts them.
// One cluster is responsible for a portion of the transactions whose reference blocks
// fall within one epoch. Since transactions may use reference blocks up to
// flow.DefaultTransactionExpiry many heights old, clusters don't shut down until this
// many blocks have been finalized past the final block of the cluster's epoch.
// No errors are expected during normal operation.
func (e *Engine) checkShouldStartPreviousEpochComponentsOnStartup(engineCtx irrecoverable.SignalerContext, finalSnapshot protocol.Snapshot) error {
	finalHeader, err := finalSnapshot.Head()
	if err != nil {
		return fmt.Errorf("[unexpected] could not get finalized header: %w", err)
	}
	finalizedHeight := finalHeader.Height

	prevEpoch := finalSnapshot.Epochs().Previous()
	prevEpochCounter, err := prevEpoch.Counter()
	if err != nil {
		if errors.Is(err, protocol.ErrNoPreviousEpoch) {
			return nil
		}
		return fmt.Errorf("[unexpected] could not get previous epoch counter: %w", err)
	}
	prevEpochFinalHeight, err := prevEpoch.FinalHeight()
	if err != nil {
		// no expected errors because we are querying a finalized snapshot
		return fmt.Errorf("[unexpected] could not get previous epoch final height: %w", err)
	}
	prevEpochClusterConsensusStopHeight := prevEpochFinalHeight + flow.DefaultTransactionExpiry + 1

	log := e.log.With().
		Uint64("finalized_height", finalizedHeight).
		Uint64("prev_epoch_counter", prevEpochCounter).
		Uint64("prev_epoch_final_height", prevEpochFinalHeight).
		Uint64("prev_epoch_cluster_stop_height", prevEpochClusterConsensusStopHeight).
		Logger()

	if finalizedHeight >= prevEpochClusterConsensusStopHeight {
		log.Debug().Msg("not re-starting previous epoch cluster consensus on startup - past stop height")
		return nil
	}

	components, err := e.createEpochComponents(prevEpoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// don't set up consensus components if we aren't authorized in previous epoch
			log.Info().Msg("node is not authorized for previous epoch - skipping re-initializing last epoch cluster consensus")
			return nil
		}
		return fmt.Errorf("[unexpected] could not create previous epoch components: %w", err)
	}
	err = e.startEpochComponents(engineCtx, prevEpochCounter, components)
	if err != nil {
		// all failures to start epoch components are critical
		return fmt.Errorf("[unexpected] could not start epoch components: %w", err)
	}
	e.prepareToStopEpochComponents(prevEpochCounter, prevEpochFinalHeight)

	log.Info().Msgf("re-started last epoch cluster consensus - will stop at height %d", prevEpochClusterConsensusStopHeight)
	return nil
}
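
// A worked example of the restart decision above, assuming flow.DefaultTransactionExpiry
// is 600 (consistent with the 600-block figure in the TODO below): if the previous
// epoch's final block has height 10,000, the stop height is 10,000 + 600 + 1 = 10,601.
// At a finalized height of 10,200 the previous epoch's cluster consensus is restarted;
// at 10,601 or above it is not.
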
// checkShouldVoteOnStartup checks whether we should vote, and if so, sends a signal
// to the worker thread responsible for voting.
// No errors are expected during normal operation.
func (e *Engine) checkShouldVoteOnStartup(finalSnapshot protocol.Snapshot) error {
	// check the current phase on startup, in case we are in setup phase
	// and haven't yet voted for the next root QC
	phase, err := finalSnapshot.Phase()
	if err != nil {
		return fmt.Errorf("could not get epoch phase for finalized snapshot: %w", err)
	}
	if phase == flow.EpochPhaseSetup {
		header, err := finalSnapshot.Head()
		if err != nil {
			return fmt.Errorf("could not get header for finalized snapshot: %w", err)
		}
		e.epochSetupPhaseStartedEvents <- header
	}
	return nil
}

// Ready returns a ready channel that is closed once the engine has fully started.
// This is true when the engine-scoped worker threads have started, and all presently
// running epoch components (max 2) have started.
func (e *Engine) Ready() <-chan struct{} {
	e.mu.RLock()
	components := make([]module.ReadyDoneAware, 0, len(e.epochs)+1)
	components = append(components, e.cm)
	for _, epoch := range e.epochs {
		components = append(components, epoch)
	}
	e.mu.RUnlock()

	return util.AllReady(components...)
}

// Done returns a done channel that is closed once the engine has fully stopped.
// This is true when the engine-scoped worker threads have stopped, and all presently
// running epoch components (max 2) have stopped.
func (e *Engine) Done() <-chan struct{} {
	e.mu.RLock()
	components := make([]module.ReadyDoneAware, 0, len(e.epochs)+1)
	components = append(components, e.cm)
	for _, epoch := range e.epochs {
		components = append(components, epoch)
	}
	e.mu.RUnlock()

	return util.AllDone(components...)
}

// createEpochComponents instantiates and returns epoch-scoped components for
// the given epoch, using the configured factory.
// Error returns:
//   - ErrNotAuthorizedForEpoch if this node is not authorized in the epoch.
func (e *Engine) createEpochComponents(epoch protocol.Epoch) (*EpochComponents, error) {
	counter, err := epoch.Counter()
	if err != nil {
		return nil, fmt.Errorf("could not get epoch counter: %w", err)
	}
	state, prop, sync, hot, voteAggregator, timeoutAggregator, messageHub, err := e.factory.Create(epoch)
	if err != nil {
		return nil, fmt.Errorf("could not setup requirements for epoch (%d): %w", counter, err)
	}

	components := NewEpochComponents(state, prop, sync, hot, voteAggregator, timeoutAggregator, messageHub)
	return components, nil
}

// EpochTransition handles the epoch transition protocol event.
// NOTE: epochmgr.Engine will not restart trailing cluster consensus instances from the
// previous epoch, therefore no need to handle dropped protocol events here (see issue below).
// TODO gracefully handle restarts in first 600 blocks of epoch https://github.com/dapperlabs/flow-go/issues/5659
func (e *Engine) EpochTransition(_ uint64, first *flow.Header) {
	e.epochTransitionEvents <- first
}

// EpochSetupPhaseStarted handles the epoch setup phase started protocol event.
// NOTE: Ready will check if we start up in the EpochSetup phase at initialization and trigger QC voting.
// This handles dropped protocol events and restarts interrupting QC voting.
func (e *Engine) EpochSetupPhaseStarted(_ uint64, first *flow.Header) {
	e.epochSetupPhaseStartedEvents <- first
}
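
// For illustration only: the two consumer methods above are invoked by the protocol
// state's event distributor. A minimal sketch of the expected wiring, assuming the
// Distributor type from state/protocol/events (names are hypothetical):
//
//	protocolEvents := events.NewDistributor()
//	protocolEvents.AddConsumer(engine)
//
// Both methods only enqueue the header on a buffered channel, so they return quickly;
// the actual work happens on the handleEpochEvents worker below.
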
// handleEpochEvents handles events relating to the epoch lifecycle:
//   - EpochTransition protocol event - we start epoch components for the starting epoch,
//     and schedule shutdown for the ending epoch
//   - EpochSetupPhaseStarted protocol event - we submit our node's vote for our cluster's
//     root block in the next epoch
//   - epochStopEvents - signalled when a previously scheduled shutdown height is reached;
//     we shut down components associated with the epoch
func (e *Engine) handleEpochEvents(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
	ready()

	for {
		select {
		case <-ctx.Done():
			return
		case firstBlock := <-e.epochTransitionEvents:
			err := e.onEpochTransition(ctx, firstBlock)
			if err != nil {
				ctx.Throw(err)
			}
		case firstBlock := <-e.epochSetupPhaseStartedEvents:
			nextEpoch := e.state.AtBlockID(firstBlock.ID()).Epochs().Next()
			e.onEpochSetupPhaseStarted(ctx, nextEpoch)
		case epochCounter := <-e.epochStopEvents:
			err := e.stopEpochComponents(epochCounter)
			if err != nil {
				ctx.Throw(err)
			}
		}
	}
}

// handleEpochErrors checks for irrecoverable errors thrown from any components from
// some epoch, and handles them. Currently, handling them means simply throwing them
// to the engine-level signaller context, which should cause the node to crash.
// In the future, we could restart the failed epoch's components instead.
// Must be run as a goroutine.
func (e *Engine) handleEpochErrors(ctx irrecoverable.SignalerContext, errCh <-chan error) {
	select {
	case <-ctx.Done():
		return
	case err := <-errCh:
		if err != nil {
			ctx.Throw(err)
		}
	}
}

// onEpochTransition is called when we transition to a new epoch. It arranges
// to shut down the last epoch's components and starts up the new epoch's.
//
// No errors are expected during normal operation.
func (e *Engine) onEpochTransition(ctx irrecoverable.SignalerContext, first *flow.Header) error {
	epoch := e.state.AtBlockID(first.ID()).Epochs().Current()
	counter, err := epoch.Counter()
	if err != nil {
		return fmt.Errorf("could not get epoch counter: %w", err)
	}

	// the greatest block height in the previous epoch is one less than the first
	// block in the current epoch
	lastEpochMaxHeight := first.Height - 1

	log := e.log.With().
		Uint64("last_epoch_max_height", lastEpochMaxHeight).
		Uint64("cur_epoch_counter", counter).
		Logger()

	// exit early and log if the epoch already exists
	_, exists := e.getEpochComponents(counter)
	if exists {
		log.Warn().Msg("epoch transition: components for new epoch already setup, exiting...")
		return nil
	}

	// register a callback to stop the just-ended epoch at the appropriate block height
	e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight)

	log.Info().Msg("epoch transition: creating components for new epoch...")

	// create components for new epoch
	components, err := e.createEpochComponents(epoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// if we are not authorized in this epoch, skip starting up cluster consensus
			log.Info().Msg("epoch transition: we are not authorized for new epoch, exiting...")
			return nil
		}
		return fmt.Errorf("could not create epoch components: %w", err)
	}

	// start up components
	err = e.startEpochComponents(ctx, counter, components)
	if err != nil {
		return fmt.Errorf("unexpected failure starting epoch components: %w", err)
	}

	log.Info().Msg("epoch transition: new epoch components started successfully")

	return nil
}

// prepareToStopEpochComponents registers a callback to stop the epoch with the
// given counter once it is no longer possible to receive transactions from that
// epoch. This occurs when we finalize sufficiently many blocks in the new epoch
// that a transaction referencing any block from the previous epoch would be
// considered immediately expired.
//
// Transactions referencing blocks from the previous epoch are only valid for
// inclusion in collections built by clusters from that epoch. Consequently, it
// remains possible for the previous epoch's cluster to produce valid collections
// until all such transactions have expired. In fact, since these transactions
// can NOT be included by clusters in the new epoch, we MUST continue producing
// these collections within the previous epoch's clusters.
func (e *Engine) prepareToStopEpochComponents(epochCounter, epochMaxHeight uint64) {
	stopAtHeight := epochMaxHeight + flow.DefaultTransactionExpiry + 1
	e.log.Info().
		Uint64("stopping_epoch_max_height", epochMaxHeight).
		Uint64("stopping_epoch_counter", epochCounter).
		Uint64("stop_at_height", stopAtHeight).
		Str("step", "epoch_transition").
		Msgf("preparing to stop epoch components at height %d", stopAtHeight)

	e.heightEvents.OnHeight(stopAtHeight, func() {
		e.epochStopEvents <- epochCounter
	})
}
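
// To make the arithmetic above concrete (assuming flow.DefaultTransactionExpiry is
// 600, the figure cited in the TODO above): if the new epoch's first block has
// height h, the ending epoch's components stop once height
//
//	(h - 1) + 600 + 1 = h + 600
//
// is finalized, i.e. 600 blocks into the new epoch.
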
// onEpochSetupPhaseStarted is called either when we transition into the epoch
// setup phase, or when the node is restarted during the epoch setup phase. It
// kicks off setup tasks for the phase, in particular submitting a vote for the
// next epoch's root cluster QC.
func (e *Engine) onEpochSetupPhaseStarted(ctx irrecoverable.SignalerContext, nextEpoch protocol.Epoch) {
	err := e.voter.Vote(ctx, nextEpoch)
	if err != nil {
		if epochs.IsClusterQCNoVoteError(err) {
			e.log.Warn().Err(err).Msg("unable to submit QC vote for next epoch")
			return
		}
		ctx.Throw(fmt.Errorf("unexpected failure to submit QC vote for next epoch: %w", err))
	}
}

// startEpochComponents starts the components for the given epoch and adds them
// to the engine's internal mapping.
// No errors are expected during normal operation.
func (e *Engine) startEpochComponents(engineCtx irrecoverable.SignalerContext, counter uint64, components *EpochComponents) error {
	epochCtx, cancel, errCh := irrecoverable.WithSignallerAndCancel(engineCtx)
	// start components using their own epoch-scoped context
	components.Start(epochCtx)
	go e.handleEpochErrors(engineCtx, errCh)

	select {
	case <-components.Ready():
		e.storeEpochComponents(counter, NewRunningEpochComponents(components, cancel))
		activeClusterIDS, err := e.activeClusterIDs()
		if err != nil {
			return fmt.Errorf("failed to get active cluster IDs: %w", err)
		}
		e.clusterIDUpdateDistributor.ActiveClustersChanged(activeClusterIDS)
		return nil
	case <-time.After(e.startupTimeout):
		cancel() // cancel the epoch-scoped context if we didn't start in time
		return fmt.Errorf("could not start epoch %d components after %s", counter, e.startupTimeout)
	}
}

// stopEpochComponents stops the components for the given epoch and removes them
// from the engine's internal mapping. If no components exist for the given epoch,
// this is a no-op and a warning is logged.
// No errors are expected during normal operation.
func (e *Engine) stopEpochComponents(counter uint64) error {
	components, exists := e.getEpochComponents(counter)
	if !exists {
		e.log.Warn().Msgf("attempted to stop non-existent epoch %d", counter)
		return nil
	}

	// stop the epoch's components by cancelling their context
	components.cancel()

	select {
	case <-components.Done():
		e.removeEpoch(counter)
		e.pools.ForEpoch(counter).Clear()
		activeClusterIDS, err := e.activeClusterIDs()
		if err != nil {
			return fmt.Errorf("failed to get active cluster IDs: %w", err)
		}
		e.clusterIDUpdateDistributor.ActiveClustersChanged(activeClusterIDS)
		return nil
	case <-time.After(e.startupTimeout):
		return fmt.Errorf("could not stop epoch %d components after %s", counter, e.startupTimeout)
	}
}

// getEpochComponents retrieves the stored (running) epoch components for the given epoch counter.
// If no epoch with the counter is stored, returns (nil, false).
// Safe for concurrent use.
func (e *Engine) getEpochComponents(counter uint64) (*RunningEpochComponents, bool) {
	e.mu.RLock()
	epoch, ok := e.epochs[counter]
	e.mu.RUnlock()
	return epoch, ok
}

// storeEpochComponents stores the given epoch components in the engine's mapping.
// Safe for concurrent use.
func (e *Engine) storeEpochComponents(counter uint64, components *RunningEpochComponents) {
	e.mu.Lock()
	e.epochs[counter] = components
	e.mu.Unlock()
}
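
// For illustration only: whenever the set of running epochs changes (components
// started or stopped above), the engine publishes the complete list of active
// cluster chain IDs through the clusterIDUpdateDistributor. A hypothetical
// subscriber's handler for this update might look like:
//
//	func (s *subscriber) ActiveClustersChanged(ids flow.ChainIDList) {
//		s.log.Info().Msgf("active clusters changed: %v", ids)
//	}
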
// removeEpoch removes the epoch components with the given counter.
// Safe for concurrent use.
func (e *Engine) removeEpoch(counter uint64) {
	e.mu.Lock()
	delete(e.epochs, counter)
	e.mu.Unlock()
}

// activeClusterIDs returns the active canonical cluster IDs for the assigned collection clusters.
// No errors are expected during normal operation.
func (e *Engine) activeClusterIDs() (flow.ChainIDList, error) {
	e.mu.RLock()
	defer e.mu.RUnlock()
	clusterIDs := make(flow.ChainIDList, 0)
	for _, epoch := range e.epochs {
		chainID, err := epoch.state.Params().ChainID() // cached, does not hit the database
		if err != nil {
			return nil, fmt.Errorf("failed to get active cluster ids: %w", err)
		}
		clusterIDs = append(clusterIDs, chainID)
	}
	return clusterIDs, nil
}
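
// Note that, per the Ready/Done documentation above, at most two epochs run
// concurrently (the current epoch plus a trailing previous epoch), so the list
// returned by activeClusterIDs contains at most two cluster chain IDs.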