github.com/koko1123/flow-go-1@v0.29.6/engine/collection/epochmgr/engine.go (about)

     1  package epochmgr
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/koko1123/flow-go-1/consensus/hotstuff"
    12  	"github.com/koko1123/flow-go-1/engine"
    13  	"github.com/koko1123/flow-go-1/model/flow"
    14  	"github.com/koko1123/flow-go-1/module"
    15  	"github.com/koko1123/flow-go-1/module/component"
    16  	"github.com/koko1123/flow-go-1/module/irrecoverable"
    17  	"github.com/koko1123/flow-go-1/module/mempool/epochs"
    18  	"github.com/koko1123/flow-go-1/module/util"
    19  	"github.com/koko1123/flow-go-1/network"
    20  	"github.com/koko1123/flow-go-1/state/cluster"
    21  	"github.com/koko1123/flow-go-1/state/protocol"
    22  	"github.com/koko1123/flow-go-1/state/protocol/events"
    23  )
    24  
    25  // DefaultStartupTimeout is the default time we wait when starting epoch
    26  // components before giving up.
    27  const DefaultStartupTimeout = 30 * time.Second
    28  
    29  // ErrNotAuthorizedForEpoch is returned when we attempt to create epoch components
    30  // for an epoch in which we are not an authorized network participant. This is the
    31  // case for epochs during which this node is joining or leaving the network.
    32  var ErrNotAuthorizedForEpoch = fmt.Errorf("we are not an authorized participant for the epoch")
    33  
// EpochComponents represents all dependencies for running an epoch.
//
// It embeds a ComponentManager, so the bundle can be started and awaited as a
// single component. The manager's worker is installed by NewEpochComponents.
type EpochComponents struct {
	*component.ComponentManager
	state      cluster.State           // stored by the constructor; not otherwise read in this file
	prop       network.Engine          // awaited for ready and done (presumably the proposal/compliance engine; confirm against the factory)
	sync       network.Engine          // awaited for ready and done (presumably the cluster sync engine; confirm against the factory)
	hotstuff   module.HotStuff         // stored only; not started in this file
	aggregator hotstuff.VoteAggregator // started explicitly by the worker and stopped last on shutdown
}

// Compile-time check that *EpochComponents satisfies component.Component.
var _ component.Component = (*EpochComponents)(nil)
    45  
    46  func NewEpochComponents(
    47  	state cluster.State,
    48  	prop network.Engine,
    49  	sync network.Engine,
    50  	hotstuff module.HotStuff,
    51  	aggregator hotstuff.VoteAggregator,
    52  ) *EpochComponents {
    53  	components := &EpochComponents{
    54  		state:      state,
    55  		prop:       prop,
    56  		sync:       sync,
    57  		hotstuff:   hotstuff,
    58  		aggregator: aggregator,
    59  	}
    60  
    61  	builder := component.NewComponentManagerBuilder()
    62  	// start new worker that will start child components and wait for them to finish
    63  	builder.AddWorker(func(parentCtx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    64  		// create a separate context that is not connected to parent, reason:
    65  		// we want to stop vote aggregator after event loop and compliance engine have shutdown
    66  		ctx, cancel := context.WithCancel(context.Background())
    67  		signalerCtx, _ := irrecoverable.WithSignaler(ctx)
    68  		// start aggregator, hotstuff will be started by compliance engine
    69  		aggregator.Start(signalerCtx)
    70  		// wait until all components start
    71  		<-util.AllReady(components.prop, components.sync, components.aggregator)
    72  		// signal that startup has finished and we are ready to go
    73  		ready()
    74  		// wait for shutdown to be commenced
    75  		<-parentCtx.Done()
    76  		// wait for compliance engine and event loop to shut down
    77  		<-util.AllDone(components.prop, components.sync)
    78  		// after event loop and engines were stopped proceed with stopping vote aggregator
    79  		cancel()
    80  		// wait until it stops
    81  		<-components.aggregator.Done()
    82  	})
    83  	components.ComponentManager = builder.Build()
    84  
    85  	return components
    86  }
    87  
// StartableEpochComponents couples a set of epoch components with the context
// they are started with and the function that cancels that context. The engine
// keeps one instance per epoch counter.
type StartableEpochComponents struct {
	*EpochComponents
	signalerCtx irrecoverable.SignalerContext // used to start the component
	cancel      context.CancelFunc            // used to stop the epoch components
}
    93  
    94  func NewStartableEpochComponents(components *EpochComponents, signalerCtx irrecoverable.SignalerContext, cancel context.CancelFunc) *StartableEpochComponents {
    95  	return &StartableEpochComponents{
    96  		EpochComponents: components,
    97  		signalerCtx:     signalerCtx,
    98  		cancel:          cancel,
    99  	}
   100  }
   101  
// Engine is the epoch manager, which coordinates the lifecycle of other modules
// and processes that are epoch-dependent. The manager is responsible for
// spinning up engines when a new epoch is about to start and spinning down
// engines for an epoch that has ended.
//
// The epochs map is modified while holding the unit's lock (see
// startEpochComponents and stopEpochComponents).
type Engine struct {
	events.Noop // satisfy protocol events consumer interface

	unit             *engine.Unit                  // provides the engine lock and the Launch/Ready/Done helpers used below
	log              zerolog.Logger                // logger tagged with engine=epochmgr
	me               module.Local                  // stored by New; not otherwise read in this file
	state            protocol.State                // protocol state used to query epochs and the current phase
	pools            *epochs.TransactionPools      // epoch-scoped transaction pools
	factory          EpochComponentsFactory        // creates the epoch-scoped components for an epoch
	voter            module.ClusterRootQCVoter     // manages process of voting for next epoch's QC
	heightEvents     events.Heights                // allows subscribing to particular heights
	irrecoverableCtx irrecoverable.SignalerContext // parent context for canceling all started epochs
	stopComponents   context.CancelFunc            // used to stop all components

	epochs         map[uint64]*StartableEpochComponents // epoch-scoped components per epoch
	startupTimeout time.Duration                        // how long we wait for epoch components to start up (also used as the stop timeout)
}
   123  
   124  func New(
   125  	log zerolog.Logger,
   126  	me module.Local,
   127  	state protocol.State,
   128  	pools *epochs.TransactionPools,
   129  	voter module.ClusterRootQCVoter,
   130  	factory EpochComponentsFactory,
   131  	heightEvents events.Heights,
   132  ) (*Engine, error) {
   133  	ctx, stopComponents := context.WithCancel(context.Background())
   134  	signalerCtx, _ := irrecoverable.WithSignaler(ctx)
   135  
   136  	e := &Engine{
   137  		unit:             engine.NewUnit(),
   138  		log:              log.With().Str("engine", "epochmgr").Logger(),
   139  		me:               me,
   140  		state:            state,
   141  		pools:            pools,
   142  		voter:            voter,
   143  		factory:          factory,
   144  		heightEvents:     heightEvents,
   145  		epochs:           make(map[uint64]*StartableEpochComponents),
   146  		startupTimeout:   DefaultStartupTimeout,
   147  		irrecoverableCtx: signalerCtx,
   148  		stopComponents:   stopComponents,
   149  	}
   150  
   151  	// set up epoch-scoped epoch managed by this engine for the current epoch
   152  	epoch := e.state.Final().Epochs().Current()
   153  	counter, err := epoch.Counter()
   154  	if err != nil {
   155  		return nil, fmt.Errorf("could not get epoch counter: %w", err)
   156  	}
   157  
   158  	components, err := e.createEpochComponents(epoch)
   159  	// don't set up consensus components if we aren't authorized in current epoch
   160  	if errors.Is(err, ErrNotAuthorizedForEpoch) {
   161  		return e, nil
   162  	}
   163  	if err != nil {
   164  		return nil, fmt.Errorf("could not create epoch components for current epoch: %w", err)
   165  	}
   166  
   167  	ctx, cancel := context.WithCancel(e.irrecoverableCtx)
   168  	signalerCtx, _ = irrecoverable.WithSignaler(ctx)
   169  
   170  	e.epochs[counter] = NewStartableEpochComponents(components, signalerCtx, cancel)
   171  
   172  	return e, nil
   173  }
   174  
   175  // Ready returns a ready channel that is closed once the engine has fully
   176  // started. For proposal engine, this is true once the underlying consensus
   177  // algorithm has started.
   178  func (e *Engine) Ready() <-chan struct{} {
   179  	return e.unit.Ready(func() {
   180  		// Start up components for all epochs. This is typically a single epoch
   181  		// but can be multiple near epoch boundaries
   182  		epochs := make([]module.ReadyDoneAware, 0, len(e.epochs))
   183  		for _, epoch := range e.epochs {
   184  			epochs = append(epochs, epoch)
   185  			epoch.Start(epoch.signalerCtx) // start every component using its own context
   186  		}
   187  		// wait for all engines to start
   188  		<-util.AllReady(epochs...)
   189  	}, func() {
   190  		// check the current phase on startup, in case we are in setup phase
   191  		// and haven't yet voted for the next root QC
   192  		finalSnapshot := e.state.Final()
   193  		phase, err := finalSnapshot.Phase()
   194  		if err != nil {
   195  			e.log.Fatal().Err(err).Msg("could not check phase")
   196  			return
   197  		}
   198  		if phase == flow.EpochPhaseSetup {
   199  			e.unit.Launch(func() {
   200  				e.onEpochSetupPhaseStarted(finalSnapshot.Epochs().Next())
   201  			})
   202  		}
   203  	})
   204  }
   205  
   206  // Done returns a done channel that is closed once the engine has fully stopped.
   207  func (e *Engine) Done() <-chan struct{} {
   208  	return e.unit.Done(func() {
   209  		// Stop components for all epochs. This is typically a single epoch
   210  		// but can be multiple near epoch boundaries
   211  		e.unit.Lock()
   212  		epochs := make([]module.ReadyDoneAware, 0, len(e.epochs))
   213  		for _, epoch := range e.epochs {
   214  			epochs = append(epochs, epoch)
   215  		}
   216  		e.unit.Unlock()
   217  		e.stopComponents() // stop all components using parent context
   218  		<-util.AllDone(epochs...)
   219  	})
   220  }
   221  
   222  // createEpochComponents instantiates and returns epoch-scoped components for
   223  // the given epoch, using the configured factory.
   224  //
   225  // Returns ErrNotAuthorizedForEpoch if this node is not authorized in the epoch.
   226  func (e *Engine) createEpochComponents(epoch protocol.Epoch) (*EpochComponents, error) {
   227  
   228  	state, prop, sync, hot, aggregator, err := e.factory.Create(epoch)
   229  	if err != nil {
   230  		return nil, fmt.Errorf("could not setup requirements for epoch (%d): %w", epoch, err)
   231  	}
   232  
   233  	components := NewEpochComponents(state, prop, sync, hot, aggregator)
   234  	return components, err
   235  }
   236  
   237  // EpochTransition handles the epoch transition protocol event.
   238  func (e *Engine) EpochTransition(_ uint64, first *flow.Header) {
   239  	e.unit.Launch(func() {
   240  		err := e.onEpochTransition(first)
   241  		if err != nil {
   242  			// failing to complete epoch transition is a fatal error
   243  			e.log.Fatal().Err(err).Msg("failed to complete epoch transition")
   244  		}
   245  	})
   246  }
   247  
   248  // EpochSetupPhaseStarted handles the epoch setup phase started protocol event.
   249  func (e *Engine) EpochSetupPhaseStarted(_ uint64, first *flow.Header) {
   250  	e.unit.Launch(func() {
   251  		nextEpoch := e.state.AtBlockID(first.ID()).Epochs().Next()
   252  		e.onEpochSetupPhaseStarted(nextEpoch)
   253  	})
   254  }
   255  
   256  // onEpochTransition is called when we transition to a new epoch. It arranges
   257  // to shut down the last epoch's components and starts up the new epoch's.
   258  func (e *Engine) onEpochTransition(first *flow.Header) error {
   259  	e.unit.Lock()
   260  	defer e.unit.Unlock()
   261  
   262  	epoch := e.state.AtBlockID(first.ID()).Epochs().Current()
   263  	counter, err := epoch.Counter()
   264  	if err != nil {
   265  		return fmt.Errorf("could not get epoch counter: %w", err)
   266  	}
   267  
   268  	// greatest block height in the previous epoch is one less than the first
   269  	// block in current epoch
   270  	lastEpochMaxHeight := first.Height - 1
   271  
   272  	log := e.log.With().
   273  		Uint64("last_epoch_max_height", lastEpochMaxHeight).
   274  		Uint64("cur_epoch_counter", counter).
   275  		Logger()
   276  
   277  	// exit early and log if the epoch already exists
   278  	_, exists := e.epochs[counter]
   279  	if exists {
   280  		log.Warn().Msg("epoch transition: components for new epoch already setup")
   281  		return nil
   282  	}
   283  
   284  	log.Info().Msg("epoch transition: creating components for new epoch...")
   285  
   286  	// create components for new epoch
   287  	components, err := e.createEpochComponents(epoch)
   288  	// if we are not authorized in this epoch, skip starting up cluster consensus
   289  	if errors.Is(err, ErrNotAuthorizedForEpoch) {
   290  		e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight)
   291  		return nil
   292  	}
   293  	if err != nil {
   294  		return fmt.Errorf("could not create epoch components: %w", err)
   295  	}
   296  
   297  	// start up components
   298  	err = e.startEpochComponents(counter, components)
   299  	if err != nil {
   300  		return fmt.Errorf("could not start epoch components: %w", err)
   301  	}
   302  
   303  	log.Info().Msg("epoch transition: new epoch components started successfully")
   304  
   305  	// set up callback to stop previous epoch
   306  	e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight)
   307  
   308  	return nil
   309  }
   310  
   311  // prepareToStopEpochComponents registers a callback to stop the epoch with the
   312  // given counter once it is no longer possible to receive transactions from that
   313  // epoch. This occurs when we finalize sufficiently many blocks in the new epoch
   314  // that a transaction referencing any block from the previous epoch would be
   315  // considered immediately expired.
   316  //
   317  // Transactions referencing blocks from the previous epoch are only valid for
   318  // inclusion in collections built by clusters from that epoch. Consequently, it
   319  // remains possible for the previous epoch's cluster to produce valid collections
   320  // until all such transactions have expired. In fact, since these transactions
   321  // can NOT be included by clusters in the new epoch, we MUST continue producing
   322  // these collections within the previous epoch's clusters.
   323  func (e *Engine) prepareToStopEpochComponents(epochCounter, epochMaxHeight uint64) {
   324  
   325  	stopAtHeight := epochMaxHeight + flow.DefaultTransactionExpiry + 1
   326  
   327  	log := e.log.With().
   328  		Uint64("stopping_epoch_max_height", epochMaxHeight).
   329  		Uint64("stopping_epoch_counter", epochCounter).
   330  		Uint64("stop_at_height", stopAtHeight).
   331  		Str("step", "epoch_transition").
   332  		Logger()
   333  
   334  	log.Info().Msgf("preparing to stop epoch components at height %d", stopAtHeight)
   335  
   336  	e.heightEvents.OnHeight(stopAtHeight, func() {
   337  		e.unit.Launch(func() {
   338  			e.unit.Lock()
   339  			defer e.unit.Unlock()
   340  
   341  			log.Info().Msg("stopping components for previous epoch...")
   342  
   343  			err := e.stopEpochComponents(epochCounter)
   344  			if err != nil {
   345  				e.log.Error().Err(err).Msgf("failed to stop components for epoch %d", epochCounter)
   346  				return
   347  			}
   348  
   349  			log.Info().Msg("previous epoch components stopped successfully")
   350  		})
   351  	})
   352  }
   353  
   354  // onEpochSetupPhaseStarted is called either when we transition into the epoch
   355  // setup phase, or when the node is restarted during the epoch setup phase. It
   356  // kicks off setup tasks for the phase, in particular submitting a vote for the
   357  // next epoch's root cluster QC.
   358  func (e *Engine) onEpochSetupPhaseStarted(nextEpoch protocol.Epoch) {
   359  
   360  	ctx, cancel := context.WithCancel(e.unit.Ctx())
   361  	defer cancel()
   362  	err := e.voter.Vote(ctx, nextEpoch)
   363  	if err != nil {
   364  		e.log.Error().Err(err).Msg("failed to submit QC vote for next epoch")
   365  	}
   366  }
   367  
   368  // startEpochComponents starts the components for the given epoch and adds them
   369  // to the engine's internal mapping.
   370  //
   371  // CAUTION: the caller MUST acquire the engine lock.
   372  func (e *Engine) startEpochComponents(counter uint64, components *EpochComponents) error {
   373  
   374  	ctx, cancel := context.WithCancel(e.irrecoverableCtx)
   375  	signalerCtx, _ := irrecoverable.WithSignaler(ctx)
   376  
   377  	// start component using its own context
   378  	components.Start(signalerCtx)
   379  
   380  	select {
   381  	case <-components.Ready():
   382  		e.epochs[counter] = NewStartableEpochComponents(components, signalerCtx, cancel)
   383  		return nil
   384  	case <-time.After(e.startupTimeout):
   385  		cancel() // cancel current context if we didn't start in time
   386  		return fmt.Errorf("could not start epoch %d components after %s", counter, e.startupTimeout)
   387  	}
   388  }
   389  
   390  // stopEpochComponents stops the components for the given epoch and removes them
   391  // from the engine's internal mapping.
   392  //
   393  // CAUTION: the caller MUST acquire the engine lock.
   394  func (e *Engine) stopEpochComponents(counter uint64) error {
   395  
   396  	components, exists := e.epochs[counter]
   397  	if !exists {
   398  		return fmt.Errorf("can not stop non-existent epoch %d", counter)
   399  	}
   400  
   401  	// stop individual component
   402  	components.cancel()
   403  
   404  	select {
   405  	case <-components.Done():
   406  		delete(e.epochs, counter)
   407  		e.pools.ForEpoch(counter).Clear()
   408  		return nil
   409  	case <-time.After(e.startupTimeout):
   410  		return fmt.Errorf("could not stop epoch %d components after %s", counter, e.startupTimeout)
   411  	}
   412  }