github.com/onflow/flow-go@v0.33.17/engine/collection/epochmgr/engine.go

package epochmgr

import (
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/epochs"
	"github.com/onflow/flow-go/module/irrecoverable"
	epochpool "github.com/onflow/flow-go/module/mempool/epochs"
	"github.com/onflow/flow-go/module/util"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/state/protocol/events"
)

// DefaultStartupTimeout is the default time we wait when starting epoch components before giving up.
const DefaultStartupTimeout = time.Minute

// ErrNotAuthorizedForEpoch is returned when we attempt to create epoch components
// for an epoch in which we are not an authorized network participant. This is the
// case for epochs during which this node is joining or leaving the network.
var ErrNotAuthorizedForEpoch = fmt.Errorf("we are not an authorized participant for the epoch")
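
// Callers check for this sentinel with errors.Is, as this engine itself does when
// creating components. A minimal sketch of the pattern used throughout this file:
//
//	components, err := e.createEpochComponents(epoch)
//	if errors.Is(err, ErrNotAuthorizedForEpoch) {
//		// this node is not a cluster participant for the epoch - skip cluster consensus
//	}
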
// Engine is the epoch manager, which coordinates the lifecycle of other modules
// and processes that are epoch-dependent. The manager is responsible for
// spinning up engines when a new epoch is about to start and spinning down
// engines for an epoch that has ended.
//
// The `epochmgr.Engine` implements the `protocol.Consumer` interface. In particular, it
// ingests the following notifications from the protocol state:
//   - EpochSetupPhaseStarted
//   - EpochTransition
//
// As part of the engine starting, it executes pending actions that should have been
// triggered by protocol events but were missed during a crash/restart. See the
// respective consumer methods for further details.
type Engine struct {
	events.Noop // satisfy protocol events consumer interface

	log            zerolog.Logger
	me             module.Local
	state          protocol.State
	pools          *epochpool.TransactionPools // epoch-scoped transaction pools
	factory        EpochComponentsFactory      // consolidates creating components for an epoch
	voter          module.ClusterRootQCVoter   // manages process of voting for next epoch's QC
	heightEvents   events.Heights              // allows subscribing to particular heights
	startupTimeout time.Duration               // how long we wait for epoch components to start up

	mu     sync.RWMutex                       // protects epochs map
	epochs map[uint64]*RunningEpochComponents // epoch-scoped components per epoch

	// internal event notifications
	epochTransitionEvents        chan *flow.Header        // sends first block of new epoch
	epochSetupPhaseStartedEvents chan *flow.Header        // sends first block of EpochSetup phase
	epochStopEvents              chan uint64              // sends counter of epoch to stop
	clusterIDUpdateDistributor   collection.ClusterEvents // sends cluster ID updates to consumers
	cm                           *component.ComponentManager
	component.Component
}

var _ component.Component = (*Engine)(nil)
var _ protocol.Consumer = (*Engine)(nil)

func New(
	log zerolog.Logger,
	me module.Local,
	state protocol.State,
	pools *epochpool.TransactionPools,
	voter module.ClusterRootQCVoter,
	factory EpochComponentsFactory,
	heightEvents events.Heights,
	clusterIDUpdateDistributor collection.ClusterEvents,
) (*Engine, error) {
	e := &Engine{
		log:                          log.With().Str("engine", "epochmgr").Logger(),
		me:                           me,
		state:                        state,
		pools:                        pools,
		voter:                        voter,
		factory:                      factory,
		heightEvents:                 heightEvents,
		epochs:                       make(map[uint64]*RunningEpochComponents),
		startupTimeout:               DefaultStartupTimeout,
		epochTransitionEvents:        make(chan *flow.Header, 1),
		epochSetupPhaseStartedEvents: make(chan *flow.Header, 1),
		epochStopEvents:              make(chan uint64, 1),
		clusterIDUpdateDistributor:   clusterIDUpdateDistributor,
	}

	e.cm = component.NewComponentManagerBuilder().
		AddWorker(e.handleEpochEvents).
		Build()
	e.Component = e.cm

	return e, nil
}

// Start starts the engine.
func (e *Engine) Start(ctx irrecoverable.SignalerContext) {
	// (1) start engine-scoped workers
	e.cm.Start(ctx)

	// (2) retrieve the protocol state as of the latest finalized block; we use this
	// state to catch up on events whose execution was missed during a crash-restart
	finalSnapshot := e.state.Final()

	// (3) check if we should attempt to vote after startup
	err := e.checkShouldVoteOnStartup(finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not vote on startup: %w", err))
	}

	// (4) start epoch-scoped components:
	// (a) set up epoch-scoped components managed by this engine for the current epoch
	err = e.checkShouldStartCurrentEpochComponentsOnStartup(ctx, finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not check or start current epoch components: %w", err))
	}

	// (b) set up epoch-scoped components for the previous epoch
	err = e.checkShouldStartPreviousEpochComponentsOnStartup(ctx, finalSnapshot)
	if err != nil {
		ctx.Throw(fmt.Errorf("could not check or start previous epoch components: %w", err))
	}
}
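
// For illustration only: a caller would construct and start the engine roughly as
// follows. Dependency construction is elided and the names (ctx, distributor, ...)
// are hypothetical; the sketch assumes irrecoverable.WithSignaler from this
// module's irrecoverable package.
//
//	eng, err := epochmgr.New(log, me, state, pools, voter, factory, heightEvents, distributor)
//	if err != nil {
//		return err
//	}
//	signalerCtx, errCh := irrecoverable.WithSignaler(ctx)
//	eng.Start(signalerCtx)
//	<-eng.Ready()
//	// errCh surfaces irrecoverable errors thrown via SignalerContext.Throw
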
// checkShouldStartCurrentEpochComponentsOnStartup checks whether we should instantiate
// consensus components for the current epoch upon startup, and if so, starts them.
// We always start current epoch consensus components, unless this node is not an
// authorized participant in the current epoch.
// No errors are expected during normal operation.
func (e *Engine) checkShouldStartCurrentEpochComponentsOnStartup(ctx irrecoverable.SignalerContext, finalSnapshot protocol.Snapshot) error {
	currentEpoch := finalSnapshot.Epochs().Current()
	currentEpochCounter, err := currentEpoch.Counter()
	if err != nil {
		return fmt.Errorf("could not get epoch counter: %w", err)
	}

	components, err := e.createEpochComponents(currentEpoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// don't set up consensus components if we aren't authorized in current epoch
			e.log.Info().Msg("node is not authorized for current epoch - skipping initializing cluster consensus")
			return nil
		}
		return fmt.Errorf("could not create epoch components: %w", err)
	}
	err = e.startEpochComponents(ctx, currentEpochCounter, components)
	if err != nil {
		// all failures to start epoch components are critical
		return fmt.Errorf("could not start epoch components: %w", err)
	}
	return nil
}

// checkShouldStartPreviousEpochComponentsOnStartup checks whether we should re-instantiate
// consensus components for the previous epoch upon startup, and if so, starts them.
// One cluster is responsible for a portion of the transactions whose reference blocks
// fall within one epoch. Since transactions may use reference blocks up to
// flow.DefaultTransactionExpiry many heights old, clusters don't shut down until this
// many blocks have been finalized past the final block of the cluster's epoch.
// No errors are expected during normal operation.
func (e *Engine) checkShouldStartPreviousEpochComponentsOnStartup(engineCtx irrecoverable.SignalerContext, finalSnapshot protocol.Snapshot) error {
	finalHeader, err := finalSnapshot.Head()
	if err != nil {
		return fmt.Errorf("[unexpected] could not get finalized header: %w", err)
	}
	finalizedHeight := finalHeader.Height

	prevEpoch := finalSnapshot.Epochs().Previous()
	prevEpochCounter, err := prevEpoch.Counter()
	if err != nil {
		if errors.Is(err, protocol.ErrNoPreviousEpoch) {
			return nil
		}
		return fmt.Errorf("[unexpected] could not get previous epoch counter: %w", err)
	}
	prevEpochFinalHeight, err := prevEpoch.FinalHeight()
	if err != nil {
		// no expected errors because we are querying a finalized snapshot
		return fmt.Errorf("[unexpected] could not get previous epoch final height: %w", err)
	}
	prevEpochClusterConsensusStopHeight := prevEpochFinalHeight + flow.DefaultTransactionExpiry + 1

	log := e.log.With().
		Uint64("finalized_height", finalizedHeight).
		Uint64("prev_epoch_counter", prevEpochCounter).
		Uint64("prev_epoch_final_height", prevEpochFinalHeight).
		Uint64("prev_epoch_cluster_stop_height", prevEpochClusterConsensusStopHeight).
		Logger()

	if finalizedHeight >= prevEpochClusterConsensusStopHeight {
		log.Debug().Msg("not re-starting previous epoch cluster consensus on startup - past stop height")
		return nil
	}

	components, err := e.createEpochComponents(prevEpoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// don't set up consensus components if we aren't authorized in previous epoch
			log.Info().Msg("node is not authorized for previous epoch - skipping re-initializing last epoch cluster consensus")
			return nil
		}
		return fmt.Errorf("[unexpected] could not create previous epoch components: %w", err)
	}
	err = e.startEpochComponents(engineCtx, prevEpochCounter, components)
	if err != nil {
		// all failures to start epoch components are critical
		return fmt.Errorf("[unexpected] could not start epoch components: %w", err)
	}
	e.prepareToStopEpochComponents(prevEpochCounter, prevEpochFinalHeight)

	log.Info().Msgf("re-started last epoch cluster consensus - will stop at height %d", prevEpochClusterConsensusStopHeight)
	return nil
}
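
// A worked example of the restart decision above, assuming flow.DefaultTransactionExpiry
// is 600 (consistent with the 600-block figure in the TODO below): if the previous
// epoch's final block has height 10,000, the stop height is 10,000 + 600 + 1 = 10,601.
// At a finalized height of 10,200 the previous epoch's cluster consensus is restarted;
// at 10,601 or above it is not.
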
// checkShouldVoteOnStartup checks whether we should vote, and if so, sends a signal
// to the worker thread responsible for voting.
// No errors are expected during normal operation.
func (e *Engine) checkShouldVoteOnStartup(finalSnapshot protocol.Snapshot) error {
	// check the current phase on startup, in case we are in setup phase
	// and haven't yet voted for the next root QC
	phase, err := finalSnapshot.Phase()
	if err != nil {
		return fmt.Errorf("could not get epoch phase for finalized snapshot: %w", err)
	}
	if phase == flow.EpochPhaseSetup {
		header, err := finalSnapshot.Head()
		if err != nil {
			return fmt.Errorf("could not get header for finalized snapshot: %w", err)
		}
		e.epochSetupPhaseStartedEvents <- header
	}
	return nil
}

// Ready returns a ready channel that is closed once the engine has fully started.
// This is true when the engine-scoped worker threads have started, and all presently
// running epoch components (max 2) have started.
func (e *Engine) Ready() <-chan struct{} {
	e.mu.RLock()
	components := make([]module.ReadyDoneAware, 0, len(e.epochs)+1)
	components = append(components, e.cm)
	for _, epoch := range e.epochs {
		components = append(components, epoch)
	}
	e.mu.RUnlock()

	return util.AllReady(components...)
}

// Done returns a done channel that is closed once the engine has fully stopped.
// This is true when the engine-scoped worker threads have stopped, and all presently
// running epoch components (max 2) have stopped.
func (e *Engine) Done() <-chan struct{} {
	e.mu.RLock()
	components := make([]module.ReadyDoneAware, 0, len(e.epochs)+1)
	components = append(components, e.cm)
	for _, epoch := range e.epochs {
		components = append(components, epoch)
	}
	e.mu.RUnlock()

	return util.AllDone(components...)
}

// createEpochComponents instantiates and returns epoch-scoped components for
// the given epoch, using the configured factory.
// Error returns:
//   - ErrNotAuthorizedForEpoch if this node is not authorized in the epoch.
func (e *Engine) createEpochComponents(epoch protocol.Epoch) (*EpochComponents, error) {
	counter, err := epoch.Counter()
	if err != nil {
		return nil, fmt.Errorf("could not get epoch counter: %w", err)
	}
	state, prop, sync, hot, voteAggregator, timeoutAggregator, messageHub, err := e.factory.Create(epoch)
	if err != nil {
		return nil, fmt.Errorf("could not setup requirements for epoch (%d): %w", counter, err)
	}

	components := NewEpochComponents(state, prop, sync, hot, voteAggregator, timeoutAggregator, messageHub)
	return components, nil
}

// EpochTransition handles the epoch transition protocol event.
// NOTE: epochmgr.Engine will not restart trailing cluster consensus instances from the
// previous epoch, therefore no need to handle dropped protocol events here (see issue below).
// TODO gracefully handle restarts in first 600 blocks of epoch https://github.com/dapperlabs/flow-go/issues/5659
func (e *Engine) EpochTransition(_ uint64, first *flow.Header) {
	e.epochTransitionEvents <- first
}

// EpochSetupPhaseStarted handles the epoch setup phase started protocol event.
// NOTE: Ready will check if we start up in the EpochSetup phase at initialization and trigger QC voting.
// This handles dropped protocol events and restarts interrupting QC voting.
func (e *Engine) EpochSetupPhaseStarted(_ uint64, first *flow.Header) {
	e.epochSetupPhaseStartedEvents <- first
}
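
// For illustration only: the two consumer methods above are invoked by the protocol
// state's event distributor. A minimal sketch of the expected wiring, assuming the
// Distributor type from state/protocol/events (names are hypothetical):
//
//	protocolEvents := events.NewDistributor()
//	protocolEvents.AddConsumer(engine)
//
// Both methods only enqueue the header on a buffered channel, so they return quickly;
// the actual work happens on the handleEpochEvents worker below.
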
// handleEpochEvents handles events relating to the epoch lifecycle:
//   - EpochTransition protocol event - we start epoch components for the starting epoch,
//     and schedule shutdown for the ending epoch
//   - EpochSetupPhaseStarted protocol event - we submit our node's vote for our cluster's
//     root block in the next epoch
//   - epochStopEvents - signalled when a previously scheduled shutdown height is reached;
//     we shut down components associated with the epoch
func (e *Engine) handleEpochEvents(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
	ready()

	for {
		select {
		case <-ctx.Done():
			return
		case firstBlock := <-e.epochTransitionEvents:
			err := e.onEpochTransition(ctx, firstBlock)
			if err != nil {
				ctx.Throw(err)
			}
		case firstBlock := <-e.epochSetupPhaseStartedEvents:
			nextEpoch := e.state.AtBlockID(firstBlock.ID()).Epochs().Next()
			e.onEpochSetupPhaseStarted(ctx, nextEpoch)
		case epochCounter := <-e.epochStopEvents:
			err := e.stopEpochComponents(epochCounter)
			if err != nil {
				ctx.Throw(err)
			}
		}
	}
}

// handleEpochErrors checks for irrecoverable errors thrown from any components from
// some epoch, and handles them. Currently, handling them means simply throwing them
// to the engine-level signaller context, which should cause the node to crash.
// In the future, we could restart the failed epoch's components instead.
// Must be run as a goroutine.
func (e *Engine) handleEpochErrors(ctx irrecoverable.SignalerContext, errCh <-chan error) {
	select {
	case <-ctx.Done():
		return
	case err := <-errCh:
		if err != nil {
			ctx.Throw(err)
		}
	}
}

// onEpochTransition is called when we transition to a new epoch. It arranges
// to shut down the last epoch's components and starts up the new epoch's.
//
// No errors are expected during normal operation.
func (e *Engine) onEpochTransition(ctx irrecoverable.SignalerContext, first *flow.Header) error {
	epoch := e.state.AtBlockID(first.ID()).Epochs().Current()
	counter, err := epoch.Counter()
	if err != nil {
		return fmt.Errorf("could not get epoch counter: %w", err)
	}

	// the greatest block height in the previous epoch is one less than the first
	// block in the current epoch
	lastEpochMaxHeight := first.Height - 1

	log := e.log.With().
		Uint64("last_epoch_max_height", lastEpochMaxHeight).
		Uint64("cur_epoch_counter", counter).
		Logger()

	// exit early and log if the epoch already exists
	_, exists := e.getEpochComponents(counter)
	if exists {
		log.Warn().Msg("epoch transition: components for new epoch already setup, exiting...")
		return nil
	}

	// register a callback to stop the just-ended epoch at the appropriate block height
	e.prepareToStopEpochComponents(counter-1, lastEpochMaxHeight)

	log.Info().Msg("epoch transition: creating components for new epoch...")

	// create components for new epoch
	components, err := e.createEpochComponents(epoch)
	if err != nil {
		if errors.Is(err, ErrNotAuthorizedForEpoch) {
			// if we are not authorized in this epoch, skip starting up cluster consensus
			log.Info().Msg("epoch transition: we are not authorized for new epoch, exiting...")
			return nil
		}
		return fmt.Errorf("could not create epoch components: %w", err)
	}

	// start up components
	err = e.startEpochComponents(ctx, counter, components)
	if err != nil {
		return fmt.Errorf("unexpected failure starting epoch components: %w", err)
	}

	log.Info().Msg("epoch transition: new epoch components started successfully")

	return nil
}

// prepareToStopEpochComponents registers a callback to stop the epoch with the
// given counter once it is no longer possible to receive transactions from that
// epoch. This occurs when we finalize sufficiently many blocks in the new epoch
// that a transaction referencing any block from the previous epoch would be
// considered immediately expired.
//
// Transactions referencing blocks from the previous epoch are only valid for
// inclusion in collections built by clusters from that epoch. Consequently, it
// remains possible for the previous epoch's cluster to produce valid collections
// until all such transactions have expired. In fact, since these transactions
// can NOT be included by clusters in the new epoch, we MUST continue producing
// these collections within the previous epoch's clusters.
func (e *Engine) prepareToStopEpochComponents(epochCounter, epochMaxHeight uint64) {
	stopAtHeight := epochMaxHeight + flow.DefaultTransactionExpiry + 1
	e.log.Info().
		Uint64("stopping_epoch_max_height", epochMaxHeight).
		Uint64("stopping_epoch_counter", epochCounter).
		Uint64("stop_at_height", stopAtHeight).
		Str("step", "epoch_transition").
		Msgf("preparing to stop epoch components at height %d", stopAtHeight)

	e.heightEvents.OnHeight(stopAtHeight, func() {
		e.epochStopEvents <- epochCounter
	})
}
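
// To make the arithmetic above concrete (assuming flow.DefaultTransactionExpiry is
// 600, the figure cited in the TODO above): if the new epoch's first block has
// height h, the ending epoch's components stop once height
//
//	(h - 1) + 600 + 1 = h + 600
//
// is finalized, i.e. 600 blocks into the new epoch.
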
// onEpochSetupPhaseStarted is called either when we transition into the epoch
// setup phase, or when the node is restarted during the epoch setup phase. It
// kicks off setup tasks for the phase, in particular submitting a vote for the
// next epoch's root cluster QC.
func (e *Engine) onEpochSetupPhaseStarted(ctx irrecoverable.SignalerContext, nextEpoch protocol.Epoch) {
	err := e.voter.Vote(ctx, nextEpoch)
	if err != nil {
		if epochs.IsClusterQCNoVoteError(err) {
			e.log.Warn().Err(err).Msg("unable to submit QC vote for next epoch")
			return
		}
		ctx.Throw(fmt.Errorf("unexpected failure to submit QC vote for next epoch: %w", err))
	}
}

// startEpochComponents starts the components for the given epoch and adds them
// to the engine's internal mapping.
// No errors are expected during normal operation.
func (e *Engine) startEpochComponents(engineCtx irrecoverable.SignalerContext, counter uint64, components *EpochComponents) error {
	epochCtx, cancel, errCh := irrecoverable.WithSignallerAndCancel(engineCtx)
	// start components using their own epoch-scoped context
	components.Start(epochCtx)
	go e.handleEpochErrors(engineCtx, errCh)

	select {
	case <-components.Ready():
		e.storeEpochComponents(counter, NewRunningEpochComponents(components, cancel))
		activeClusterIDS, err := e.activeClusterIDs()
		if err != nil {
			return fmt.Errorf("failed to get active cluster IDs: %w", err)
		}
		e.clusterIDUpdateDistributor.ActiveClustersChanged(activeClusterIDS)
		return nil
	case <-time.After(e.startupTimeout):
		cancel() // cancel the epoch-scoped context if we didn't start in time
		return fmt.Errorf("could not start epoch %d components after %s", counter, e.startupTimeout)
	}
}

// stopEpochComponents stops the components for the given epoch and removes them
// from the engine's internal mapping. If no components exist for the given epoch,
// this is a no-op and a warning is logged.
// No errors are expected during normal operation.
func (e *Engine) stopEpochComponents(counter uint64) error {
	components, exists := e.getEpochComponents(counter)
	if !exists {
		e.log.Warn().Msgf("attempted to stop non-existent epoch %d", counter)
		return nil
	}

	// stop the epoch's components by cancelling their context
	components.cancel()

	select {
	case <-components.Done():
		e.removeEpoch(counter)
		e.pools.ForEpoch(counter).Clear()
		activeClusterIDS, err := e.activeClusterIDs()
		if err != nil {
			return fmt.Errorf("failed to get active cluster IDs: %w", err)
		}
		e.clusterIDUpdateDistributor.ActiveClustersChanged(activeClusterIDS)
		return nil
	case <-time.After(e.startupTimeout):
		return fmt.Errorf("could not stop epoch %d components after %s", counter, e.startupTimeout)
	}
}

// getEpochComponents retrieves the stored (running) epoch components for the given epoch counter.
// If no epoch with the counter is stored, returns (nil, false).
// Safe for concurrent use.
func (e *Engine) getEpochComponents(counter uint64) (*RunningEpochComponents, bool) {
	e.mu.RLock()
	epoch, ok := e.epochs[counter]
	e.mu.RUnlock()
	return epoch, ok
}

// storeEpochComponents stores the given epoch components in the engine's mapping.
// Safe for concurrent use.
func (e *Engine) storeEpochComponents(counter uint64, components *RunningEpochComponents) {
	e.mu.Lock()
	e.epochs[counter] = components
	e.mu.Unlock()
}
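
// For illustration only: whenever the set of running epochs changes (components
// started or stopped above), the engine publishes the complete list of active
// cluster chain IDs through the clusterIDUpdateDistributor. A hypothetical
// subscriber's handler for this update might look like:
//
//	func (s *subscriber) ActiveClustersChanged(ids flow.ChainIDList) {
//		s.log.Info().Msgf("active clusters changed: %v", ids)
//	}
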
// removeEpoch removes the epoch components with the given counter.
// Safe for concurrent use.
func (e *Engine) removeEpoch(counter uint64) {
	e.mu.Lock()
	delete(e.epochs, counter)
	e.mu.Unlock()
}

// activeClusterIDs returns the active canonical cluster IDs for the assigned collection clusters.
// No errors are expected during normal operation.
func (e *Engine) activeClusterIDs() (flow.ChainIDList, error) {
	e.mu.RLock()
	defer e.mu.RUnlock()
	clusterIDs := make(flow.ChainIDList, 0)
	for _, epoch := range e.epochs {
		chainID, err := epoch.state.Params().ChainID() // cached, does not hit the database
		if err != nil {
			return nil, fmt.Errorf("failed to get active cluster ids: %w", err)
		}
		clusterIDs = append(clusterIDs, chainID)
	}
	return clusterIDs, nil
}
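
// Note that, per the Ready/Done documentation above, at most two epochs run
// concurrently (the current epoch plus a trailing previous epoch), so the list
// returned by activeClusterIDs contains at most two cluster chain IDs.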