github.com/koko1123/flow-go-1@v0.29.6/engine/collection/epochmgr/engine.go (about) 1 package epochmgr 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "time" 8 9 "github.com/rs/zerolog" 10 11 "github.com/koko1123/flow-go-1/consensus/hotstuff" 12 "github.com/koko1123/flow-go-1/engine" 13 "github.com/koko1123/flow-go-1/model/flow" 14 "github.com/koko1123/flow-go-1/module" 15 "github.com/koko1123/flow-go-1/module/component" 16 "github.com/koko1123/flow-go-1/module/irrecoverable" 17 "github.com/koko1123/flow-go-1/module/mempool/epochs" 18 "github.com/koko1123/flow-go-1/module/util" 19 "github.com/koko1123/flow-go-1/network" 20 "github.com/koko1123/flow-go-1/state/cluster" 21 "github.com/koko1123/flow-go-1/state/protocol" 22 "github.com/koko1123/flow-go-1/state/protocol/events" 23 ) 24 25 // DefaultStartupTimeout is the default time we wait when starting epoch 26 // components before giving up. 27 const DefaultStartupTimeout = 30 * time.Second 28 29 // ErrNotAuthorizedForEpoch is returned when we attempt to create epoch components 30 // for an epoch in which we are not an authorized network participant. This is the 31 // case for epochs during which this node is joining or leaving the network. 32 var ErrNotAuthorizedForEpoch = fmt.Errorf("we are not an authorized participant for the epoch") 33 34 // EpochComponents represents all dependencies for running an epoch. 35 type EpochComponents struct { 36 *component.ComponentManager 37 state cluster.State 38 prop network.Engine 39 sync network.Engine 40 hotstuff module.HotStuff 41 aggregator hotstuff.VoteAggregator 42 } 43 44 var _ component.Component = (*EpochComponents)(nil) 45 46 func NewEpochComponents( 47 state cluster.State, 48 prop network.Engine, 49 sync network.Engine, 50 hotstuff module.HotStuff, 51 aggregator hotstuff.VoteAggregator, 52 ) *EpochComponents { 53 components := &EpochComponents{ 54 state: state, 55 prop: prop, 56 sync: sync, 57 hotstuff: hotstuff, 58 aggregator: aggregator, 59 } 60 61 builder := component.NewComponentManagerBuilder() 62 // start new worker that will start child components and wait for them to finish 63 builder.AddWorker(func(parentCtx irrecoverable.SignalerContext, ready component.ReadyFunc) { 64 // create a separate context that is not connected to parent, reason: 65 // we want to stop vote aggregator after event loop and compliance engine have shutdown 66 ctx, cancel := context.WithCancel(context.Background()) 67 signalerCtx, _ := irrecoverable.WithSignaler(ctx) 68 // start aggregator, hotstuff will be started by compliance engine 69 aggregator.Start(signalerCtx) 70 // wait until all components start 71 <-util.AllReady(components.prop, components.sync, components.aggregator) 72 // signal that startup has finished and we are ready to go 73 ready() 74 // wait for shutdown to be commenced 75 <-parentCtx.Done() 76 // wait for compliance engine and event loop to shut down 77 <-util.AllDone(components.prop, components.sync) 78 // after event loop and engines were stopped proceed with stopping vote aggregator 79 cancel() 80 // wait until it stops 81 <-components.aggregator.Done() 82 }) 83 components.ComponentManager = builder.Build() 84 85 return components 86 } 87 88 type StartableEpochComponents struct { 89 *EpochComponents 90 signalerCtx irrecoverable.SignalerContext // used to start the component 91 cancel context.CancelFunc // used to stop the epoch components 92 } 93 94 func NewStartableEpochComponents(components *EpochComponents, signalerCtx irrecoverable.SignalerContext, cancel context.CancelFunc) *StartableEpochComponents { 95 return &StartableEpochComponents{ 96 EpochComponents: components, 97 signalerCtx: signalerCtx, 98 cancel: cancel, 99 } 100 } 101 102 // Engine is the epoch manager, which coordinates the lifecycle of other modules 103 // and processes that are epoch-dependent. The manager is responsible for 104 // spinning up engines when a new epoch is about to start and spinning down 105 // engines for an epoch that has ended. 106 type Engine struct { 107 events.Noop // satisfy protocol events consumer interface 108 109 unit *engine.Unit 110 log zerolog.Logger 111 me module.Local 112 state protocol.State 113 pools *epochs.TransactionPools // epoch-scoped transaction pools 114 factory EpochComponentsFactory // consolidates creating epoch for an epoch 115 voter module.ClusterRootQCVoter // manages process of voting for next epoch's QC 116 heightEvents events.Heights // allows subscribing to particular heights 117 irrecoverableCtx irrecoverable.SignalerContext // parent context for canceling all started epochs 118 stopComponents context.CancelFunc // used to stop all components 119 120 epochs map[uint64]*StartableEpochComponents // epoch-scoped components per epoch 121 startupTimeout time.Duration // how long we wait for epoch components to start up 122 } 123 124 func New( 125 log zerolog.Logger, 126 me module.Local, 127 state protocol.State, 128 pools *epochs.TransactionPools, 129 voter module.ClusterRootQCVoter, 130 factory EpochComponentsFactory, 131 heightEvents events.Heights, 132 ) (*Engine, error) { 133 ctx, stopComponents := context.WithCancel(context.Background()) 134 signalerCtx, _ := irrecoverable.WithSignaler(ctx) 135 136 e := &Engine{ 137 unit: engine.NewUnit(), 138 log: log.With().Str("engine", "epochmgr").Logger(), 139 me: me, 140 state: state, 141 pools: pools, 142 voter: voter, 143 factory: factory, 144 heightEvents: heightEvents, 145 epochs: make(map[uint64]*StartableEpochComponents), 146 startupTimeout: DefaultStartupTimeout, 147 irrecoverableCtx: signalerCtx, 148 stopComponents: stopComponents, 149 } 150 151 // set up epoch-scoped epoch managed by this engine for the current epoch 152 epoch := e.state.Final().Epochs().Current() 153 counter, err := epoch.Counter() 154 if err != nil { 155 return nil, fmt.Errorf("could not get epoch counter: %w", err) 156 } 157 158 components, err := e.createEpochComponents(epoch) 159 // don't set up consensus components if we aren't authorized in current epoch 160 if errors.Is(err, ErrNotAuthorizedForEpoch) { 161 return e, nil 162 } 163 if err != nil { 164 return nil, fmt.Errorf("could not create epoch components for current epoch: %w", err) 165 } 166 167 ctx, cancel := context.WithCancel(e.irrecoverableCtx) 168 signalerCtx, _ = irrecoverable.WithSignaler(ctx) 169 170 e.epochs[counter] = NewStartableEpochComponents(components, signalerCtx, cancel) 171 172 return e, nil 173 } 174 175 // Ready returns a ready channel that is closed once the engine has fully 176 // started. For proposal engine, this is true once the underlying consensus 177 // algorithm has started. 178 func (e *Engine) Ready() <-chan struct{} { 179 return e.unit.Ready(func() { 180 // Start up components for all epochs. This is typically a single epoch 181 // but can be multiple near epoch boundaries 182 epochs := make([]module.ReadyDoneAware, 0, len(e.epochs)) 183 for _, epoch := range e.epochs { 184 epochs = append(epochs, epoch) 185 epoch.Start(epoch.signalerCtx) // start every component using its own context 186 } 187 // wait for all engines to start 188 <-util.AllReady(epochs...) 189 }, func() { 190 // check the current phase on startup, in case we are in setup phase 191 // and haven't yet voted for the next root QC 192 finalSnapshot := e.state.Final() 193 phase, err := finalSnapshot.Phase() 194 if err != nil { 195 e.log.Fatal().Err(err).Msg("could not check phase") 196 return 197 } 198 if phase == flow.EpochPhaseSetup { 199 e.unit.Launch(func() { 200 e.onEpochSetupPhaseStarted(finalSnapshot.Epochs().Next()) 201 }) 202 } 203 }) 204 } 205 206 // Done returns a done channel that is closed once the engine has fully stopped. 207 func (e *Engine) Done() <-chan struct{} { 208 return e.unit.Done(func() { 209 // Stop components for all epochs. This is typically a single epoch 210 // but can be multiple near epoch boundaries 211 e.unit.Lock() 212 epochs := make([]module.ReadyDoneAware, 0, len(e.epochs)) 213 for _, epoch := range e.epochs { 214 epochs = append(epochs, epoch) 215 } 216 e.unit.Unlock() 217 e.stopComponents() // stop all components using parent context 218 <-util.AllDone(epochs...) 219 }) 220 } 221 222 // createEpochComponents instantiates and returns epoch-scoped components for 223 // the given epoch, using the configured factory. 224 // 225 // Returns ErrNotAuthorizedForEpoch if this node is not authorized in the epoch. 226 func (e *Engine) createEpochComponents(epoch protocol.Epoch) (*EpochComponents, error) { 227 228 state, prop, sync, hot, aggregator, err := e.factory.Create(epoch) 229 if err != nil { 230 return nil, fmt.Errorf("could not setup requirements for epoch (%d): %w", epoch, err) 231 } 232 233 components := NewEpochComponents(state, prop, sync, hot, aggregator) 234 return components, err 235 } 236 237 // EpochTransition handles the epoch transition protocol event. 238 func (e *Engine) EpochTransition(_ uint64, first *flow.Header) { 239 e.unit.Launch(func() { 240 err := e.onEpochTransition(first) 241 if err != nil { 242 // failing to complete epoch transition is a fatal error 243 e.log.Fatal().Err(err).Msg("failed to complete epoch transition") 244 } 245 }) 246 } 247 248 // EpochSetupPhaseStarted handles the epoch setup phase started protocol event. 249 func (e *Engine) EpochSetupPhaseStarted(_ uint64, first *flow.Header) { 250 e.unit.Launch(func() { 251 nextEpoch := e.state.AtBlockID(first.ID()).Epochs().Next() 252 e.onEpochSetupPhaseStarted(nextEpoch) 253 }) 254 } 255 256 // onEpochTransition is called when we transition to a new epoch. It arranges 257 // to shut down the last epoch's components and starts up the new epoch's. 258 func (e *Engine) onEpochTransition(first *flow.Header) error { 259 e.unit.Lock() 260 defer e.unit.Unlock() 261 262 epoch := e.state.AtBlockID(first.ID()).Epochs().Current() 263 counter, err := epoch.Counter() 264 if err != nil { 265 return fmt.Errorf("could not get epoch counter: %w", err) 266 } 267 268 // greatest block height in the previous epoch is one less than the first 269 // block in current epoch 270 lastEpochMaxHeight := first.Height - 1 271 272 log := e.log.With(). 273 Uint64("last_epoch_max_height", lastEpochMaxHeight). 274 Uint64("cur_epoch_counter", counter). 275 Logger() 276 277 // exit early and log if the epoch already exists 278 _, exists := e.epochs[counter] 279 if exists { 280 log.Warn().Msg("epoch transition: components for new epoch already setup") 281 return nil 282 } 283 284 log.Info().Msg("epoch transition: creating components for new epoch...") 285 286 // create components for new epoch 287 components, err := e.createEpochComponents(epoch) 288 // if we are not authorized in this epoch, skip starting up cluster consensus 289 if errors.Is(err, ErrNotAuthorizedForEpoch) { 290 e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight) 291 return nil 292 } 293 if err != nil { 294 return fmt.Errorf("could not create epoch components: %w", err) 295 } 296 297 // start up components 298 err = e.startEpochComponents(counter, components) 299 if err != nil { 300 return fmt.Errorf("could not start epoch components: %w", err) 301 } 302 303 log.Info().Msg("epoch transition: new epoch components started successfully") 304 305 // set up callback to stop previous epoch 306 e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight) 307 308 return nil 309 } 310 311 // prepareToStopEpochComponents registers a callback to stop the epoch with the 312 // given counter once it is no longer possible to receive transactions from that 313 // epoch. This occurs when we finalize sufficiently many blocks in the new epoch 314 // that a transaction referencing any block from the previous epoch would be 315 // considered immediately expired. 316 // 317 // Transactions referencing blocks from the previous epoch are only valid for 318 // inclusion in collections built by clusters from that epoch. Consequently, it 319 // remains possible for the previous epoch's cluster to produce valid collections 320 // until all such transactions have expired. In fact, since these transactions 321 // can NOT be included by clusters in the new epoch, we MUST continue producing 322 // these collections within the previous epoch's clusters. 323 func (e *Engine) prepareToStopEpochComponents(epochCounter, epochMaxHeight uint64) { 324 325 stopAtHeight := epochMaxHeight + flow.DefaultTransactionExpiry + 1 326 327 log := e.log.With(). 328 Uint64("stopping_epoch_max_height", epochMaxHeight). 329 Uint64("stopping_epoch_counter", epochCounter). 330 Uint64("stop_at_height", stopAtHeight). 331 Str("step", "epoch_transition"). 332 Logger() 333 334 log.Info().Msgf("preparing to stop epoch components at height %d", stopAtHeight) 335 336 e.heightEvents.OnHeight(stopAtHeight, func() { 337 e.unit.Launch(func() { 338 e.unit.Lock() 339 defer e.unit.Unlock() 340 341 log.Info().Msg("stopping components for previous epoch...") 342 343 err := e.stopEpochComponents(epochCounter) 344 if err != nil { 345 e.log.Error().Err(err).Msgf("failed to stop components for epoch %d", epochCounter) 346 return 347 } 348 349 log.Info().Msg("previous epoch components stopped successfully") 350 }) 351 }) 352 } 353 354 // onEpochSetupPhaseStarted is called either when we transition into the epoch 355 // setup phase, or when the node is restarted during the epoch setup phase. It 356 // kicks off setup tasks for the phase, in particular submitting a vote for the 357 // next epoch's root cluster QC. 358 func (e *Engine) onEpochSetupPhaseStarted(nextEpoch protocol.Epoch) { 359 360 ctx, cancel := context.WithCancel(e.unit.Ctx()) 361 defer cancel() 362 err := e.voter.Vote(ctx, nextEpoch) 363 if err != nil { 364 e.log.Error().Err(err).Msg("failed to submit QC vote for next epoch") 365 } 366 } 367 368 // startEpochComponents starts the components for the given epoch and adds them 369 // to the engine's internal mapping. 370 // 371 // CAUTION: the caller MUST acquire the engine lock. 372 func (e *Engine) startEpochComponents(counter uint64, components *EpochComponents) error { 373 374 ctx, cancel := context.WithCancel(e.irrecoverableCtx) 375 signalerCtx, _ := irrecoverable.WithSignaler(ctx) 376 377 // start component using its own context 378 components.Start(signalerCtx) 379 380 select { 381 case <-components.Ready(): 382 e.epochs[counter] = NewStartableEpochComponents(components, signalerCtx, cancel) 383 return nil 384 case <-time.After(e.startupTimeout): 385 cancel() // cancel current context if we didn't start in time 386 return fmt.Errorf("could not start epoch %d components after %s", counter, e.startupTimeout) 387 } 388 } 389 390 // stopEpochComponents stops the components for the given epoch and removes them 391 // from the engine's internal mapping. 392 // 393 // CAUTION: the caller MUST acquire the engine lock. 394 func (e *Engine) stopEpochComponents(counter uint64) error { 395 396 components, exists := e.epochs[counter] 397 if !exists { 398 return fmt.Errorf("can not stop non-existent epoch %d", counter) 399 } 400 401 // stop individual component 402 components.cancel() 403 404 select { 405 case <-components.Done(): 406 delete(e.epochs, counter) 407 e.pools.ForEpoch(counter).Clear() 408 return nil 409 case <-time.After(e.startupTimeout): 410 return fmt.Errorf("could not stop epoch %d components after %s", counter, e.startupTimeout) 411 } 412 }