github.com/onflow/flow-go@v0.33.17/consensus/hotstuff/cruisectl/block_time_controller.go (about)

     1  // Package cruisectl implements a "cruise control" system for Flow by adjusting
     2  // nodes' latest ProposalTiming in response to changes in the measured view rate and
     3  // target epoch switchover time.
     4  //
     5  // It uses a PID controller with the projected epoch switchover time as the process
     6  // variable and the set-point computed using epoch length config. The error is
     7  // the difference between the projected epoch switchover time, assuming an
     8  // ideal view time τ, and the target epoch switchover time (based on a schedule).
     9  package cruisectl
    10  
    11  import (
    12  	"fmt"
    13  	"time"
    14  
    15  	"github.com/rs/zerolog"
    16  	"go.uber.org/atomic"
    17  
    18  	"github.com/onflow/flow-go/consensus/hotstuff"
    19  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    20  	"github.com/onflow/flow-go/model/flow"
    21  	"github.com/onflow/flow-go/module"
    22  	"github.com/onflow/flow-go/module/component"
    23  	"github.com/onflow/flow-go/module/irrecoverable"
    24  	"github.com/onflow/flow-go/state/protocol"
    25  	"github.com/onflow/flow-go/state/protocol/events"
    26  )
    27  
    28  // TimedBlock represents a block, with a timestamp recording when the BlockTimeController received the block
    29  type TimedBlock struct {
    30  	Block        *model.Block
    31  	TimeObserved time.Time // timestamp when BlockTimeController received the block, per convention in UTC
    32  }
    33  
    34  // epochInfo stores data about the current and next epoch. It is updated when we enter
    35  // the first view of a new epoch, or the EpochSetup phase of the current epoch.
    36  type epochInfo struct {
    37  	curEpochFirstView     uint64
    38  	curEpochFinalView     uint64    // F[v] - the final view of the epoch
    39  	curEpochTargetEndTime time.Time // T[v] - the target end time of the current epoch
    40  	nextEpochFinalView    *uint64
    41  }
    42  
    43  // targetViewTime returns τ[v], the ideal, steady-state view time for the current epoch.
    44  // For numerical stability, we avoid repetitive conversions between seconds and time.Duration.
    45  // Instead, internally within the controller, we work with float64 in units of seconds.
    46  func (epoch *epochInfo) targetViewTime() float64 {
    47  	return epochLength.Seconds() / float64(epoch.curEpochFinalView-epoch.curEpochFirstView+1)
    48  }
    49  
    50  // fractionComplete returns the percentage of views completed of the epoch for the given curView.
    51  // curView must be within the range [curEpochFirstView, curEpochFinalView]
    52  // Returns the completion percentage as a float between [0, 1]
    53  func (epoch *epochInfo) fractionComplete(curView uint64) float64 {
    54  	return float64(curView-epoch.curEpochFirstView) / float64(epoch.curEpochFinalView-epoch.curEpochFirstView)
    55  }
    56  
    57  // BlockTimeController dynamically adjusts the ProposalTiming of this node,
    58  // based on the measured view rate of the consensus committee as a whole, in
    59  // order to achieve a desired switchover time for each epoch.
    60  // In a nutshell, the controller outputs the block time on the happy path, i.e.
    61  //   - Suppose the node is observing the parent block B0 at some time `x0`.
    62  //   - The controller determines the duration `d` of how much later the child block B1
    63  //     should be observed by the committee.
    64  //   - The controller internally memorizes the latest B0 it has seen and outputs
    65  //     the tuple `(B0, x0, d)`
    66  //
    67  // This low-level controller output `(B0, x0, d)` is wrapped into a `ProposalTiming`
    68  // interface, specifically `happyPathBlockTime` on the happy path. The purpose of the
    69  // `ProposalTiming` wrapper is to translate the raw controller output into a form
    70  // that is useful for the event handler. Edge cases, such as initialization or
    71  // EECC are implemented by other implementations of `ProposalTiming`.
    72  type BlockTimeController struct {
    73  	component.Component
    74  	protocol.Consumer // consumes protocol state events
    75  
    76  	config *Config
    77  
    78  	state   protocol.State
    79  	log     zerolog.Logger
    80  	metrics module.CruiseCtlMetrics
    81  
    82  	epochInfo              // scheduled transition view for current/next epoch
    83  	epochFallbackTriggered bool
    84  
    85  	incorporatedBlocks chan TimedBlock   // OnBlockIncorporated events, we desire these blocks to be processed in a timely manner and therefore use a small channel capacity
    86  	epochSetups        chan *flow.Header // EpochSetupPhaseStarted events (block header within setup phase)
    87  	epochFallbacks     chan struct{}     // EpochFallbackTriggered events
    88  
    89  	proportionalErr Ewma
    90  	integralErr     LeakyIntegrator
    91  
    92  	// latestProposalTiming holds the ProposalTiming that the controller generated in response to processing the latest observation
    93  	latestProposalTiming *atomic.Pointer[ProposalTiming]
    94  }
    95  
    96  var _ hotstuff.ProposalDurationProvider = (*BlockTimeController)(nil)
    97  var _ protocol.Consumer = (*BlockTimeController)(nil)
    98  var _ component.Component = (*BlockTimeController)(nil)
    99  
   100  // NewBlockTimeController returns a new BlockTimeController.
   101  func NewBlockTimeController(log zerolog.Logger, metrics module.CruiseCtlMetrics, config *Config, state protocol.State, curView uint64) (*BlockTimeController, error) {
   102  	// Initial error must be 0 unless we are making assumptions of the prior history of the proportional error `e[v]`
   103  	initProptlErr, initItgErr, initDrivErr := .0, .0, .0
   104  	proportionalErr, err := NewEwma(config.alpha(), initProptlErr)
   105  	if err != nil {
   106  		return nil, fmt.Errorf("failed to initialize EWMA for computing the proportional error: %w", err)
   107  	}
   108  	integralErr, err := NewLeakyIntegrator(config.beta(), initItgErr)
   109  	if err != nil {
   110  		return nil, fmt.Errorf("failed to initialize LeakyIntegrator for computing the integral error: %w", err)
   111  	}
   112  
   113  	ctl := &BlockTimeController{
   114  		Consumer:             events.NewNoop(),
   115  		config:               config,
   116  		log:                  log.With().Str("hotstuff", "cruise_ctl").Logger(),
   117  		metrics:              metrics,
   118  		state:                state,
   119  		incorporatedBlocks:   make(chan TimedBlock, 3),
   120  		epochSetups:          make(chan *flow.Header, 5),
   121  		epochFallbacks:       make(chan struct{}, 5),
   122  		proportionalErr:      proportionalErr,
   123  		integralErr:          integralErr,
   124  		latestProposalTiming: atomic.NewPointer[ProposalTiming](nil), // set in initProposalTiming
   125  	}
   126  	ctl.Component = component.NewComponentManagerBuilder().
   127  		AddWorker(ctl.processEventsWorkerLogic).
   128  		Build()
   129  
   130  	// initialize state
   131  	err = ctl.initEpochInfo(curView)
   132  	if err != nil {
   133  		return nil, fmt.Errorf("could not initialize epoch info: %w", err)
   134  	}
   135  	ctl.initProposalTiming(curView)
   136  
   137  	ctl.log.Debug().
   138  		Uint64("view", curView).
   139  		Msg("initialized BlockTimeController")
   140  	ctl.metrics.PIDError(initProptlErr, initItgErr, initDrivErr)
   141  	ctl.metrics.ControllerOutput(0)
   142  	ctl.metrics.TargetProposalDuration(0)
   143  
   144  	return ctl, nil
   145  }
   146  
   147  // initEpochInfo initializes the epochInfo state upon component startup.
   148  // No errors are expected during normal operation.
   149  func (ctl *BlockTimeController) initEpochInfo(curView uint64) error {
   150  	finalSnapshot := ctl.state.Final()
   151  	curEpoch := finalSnapshot.Epochs().Current()
   152  
   153  	curEpochFirstView, err := curEpoch.FirstView()
   154  	if err != nil {
   155  		return fmt.Errorf("could not initialize current epoch first view: %w", err)
   156  	}
   157  	ctl.curEpochFirstView = curEpochFirstView
   158  
   159  	curEpochFinalView, err := curEpoch.FinalView()
   160  	if err != nil {
   161  		return fmt.Errorf("could not initialize current epoch final view: %w", err)
   162  	}
   163  	ctl.curEpochFinalView = curEpochFinalView
   164  
   165  	phase, err := finalSnapshot.Phase()
   166  	if err != nil {
   167  		return fmt.Errorf("could not check snapshot phase: %w", err)
   168  	}
   169  	if phase > flow.EpochPhaseStaking {
   170  		nextEpochFinalView, err := finalSnapshot.Epochs().Next().FinalView()
   171  		if err != nil {
   172  			return fmt.Errorf("could not initialize next epoch final view: %w", err)
   173  		}
   174  		ctl.epochInfo.nextEpochFinalView = &nextEpochFinalView
   175  	}
   176  
   177  	ctl.curEpochTargetEndTime = ctl.config.TargetTransition.inferTargetEndTime(time.Now().UTC(), ctl.epochInfo.fractionComplete(curView))
   178  
   179  	epochFallbackTriggered, err := ctl.state.Params().EpochFallbackTriggered()
   180  	if err != nil {
   181  		return fmt.Errorf("could not check epoch fallback: %w", err)
   182  	}
   183  	ctl.epochFallbackTriggered = epochFallbackTriggered
   184  
   185  	return nil
   186  }
   187  
   188  // initProposalTiming initializes the ProposalTiming value upon startup.
   189  // CAUTION: Must be called after initEpochInfo.
   190  func (ctl *BlockTimeController) initProposalTiming(curView uint64) {
   191  	// When disabled, or in epoch fallback, use fallback timing (constant ProposalDuration)
   192  	if ctl.epochFallbackTriggered || !ctl.config.Enabled.Load() {
   193  		ctl.storeProposalTiming(newFallbackTiming(curView, time.Now().UTC(), ctl.config.FallbackProposalDelay.Load()))
   194  		return
   195  	}
   196  	// Otherwise, before we observe any view changes, publish blocks immediately
   197  	ctl.storeProposalTiming(newPublishImmediately(curView, time.Now().UTC()))
   198  }
   199  
   200  // storeProposalTiming stores the latest ProposalTiming
   201  // Concurrency safe.
   202  func (ctl *BlockTimeController) storeProposalTiming(proposalTiming ProposalTiming) {
   203  	ctl.latestProposalTiming.Store(&proposalTiming)
   204  }
   205  
   206  // GetProposalTiming returns the controller's latest ProposalTiming. Concurrency safe.
   207  func (ctl *BlockTimeController) GetProposalTiming() ProposalTiming {
   208  	pt := ctl.latestProposalTiming.Load()
   209  	if pt == nil { // should never happen, as we always store non-nil instances of ProposalTiming. Though, this extra check makes `GetProposalTiming` universal.
   210  		return nil
   211  	}
   212  	return *pt
   213  }
   214  
   215  func (ctl *BlockTimeController) TargetPublicationTime(proposalView uint64, timeViewEntered time.Time, parentBlockId flow.Identifier) time.Time {
   216  	return ctl.GetProposalTiming().TargetPublicationTime(proposalView, timeViewEntered, parentBlockId)
   217  }
   218  
   219  // processEventsWorkerLogic is the logic for processing events received from other components.
   220  // This method should be executed by a dedicated worker routine (not concurrency safe).
   221  func (ctl *BlockTimeController) processEventsWorkerLogic(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   222  	ready()
   223  
   224  	done := ctx.Done()
   225  	for {
   226  
   227  		// Priority 1: EpochSetup
   228  		select {
   229  		case block := <-ctl.epochSetups:
   230  			snapshot := ctl.state.AtHeight(block.Height)
   231  			err := ctl.processEpochSetupPhaseStarted(snapshot)
   232  			if err != nil {
   233  				ctl.log.Err(err).Msgf("fatal error handling EpochSetupPhaseStarted event")
   234  				ctx.Throw(err)
   235  				return
   236  			}
   237  		default:
   238  		}
   239  
   240  		// Priority 2: EpochFallbackTriggered
   241  		select {
   242  		case <-ctl.epochFallbacks:
   243  			err := ctl.processEpochFallbackTriggered()
   244  			if err != nil {
   245  				ctl.log.Err(err).Msgf("fatal error processing epoch EECC event")
   246  				ctx.Throw(err)
   247  			}
   248  		default:
   249  		}
   250  
   251  		// Priority 3: OnBlockIncorporated
   252  		select {
   253  		case <-done:
   254  			return
   255  		case block := <-ctl.incorporatedBlocks:
   256  			err := ctl.processIncorporatedBlock(block)
   257  			if err != nil {
   258  				ctl.log.Err(err).Msgf("fatal error handling OnBlockIncorporated event")
   259  				ctx.Throw(err)
   260  				return
   261  			}
   262  		case block := <-ctl.epochSetups:
   263  			snapshot := ctl.state.AtHeight(block.Height)
   264  			err := ctl.processEpochSetupPhaseStarted(snapshot)
   265  			if err != nil {
   266  				ctl.log.Err(err).Msgf("fatal error handling EpochSetupPhaseStarted event")
   267  				ctx.Throw(err)
   268  				return
   269  			}
   270  		case <-ctl.epochFallbacks:
   271  			err := ctl.processEpochFallbackTriggered()
   272  			if err != nil {
   273  				ctl.log.Err(err).Msgf("fatal error processing epoch EECC event")
   274  				ctx.Throw(err)
   275  				return
   276  			}
   277  		}
   278  	}
   279  }
   280  
   281  // processIncorporatedBlock processes `OnBlockIncorporated` events from HotStuff.
   282  // Whenever the view changes, we:
   283  //   - updates epoch info, if this is the first observed view of a new epoch
   284  //   - compute error terms, compensation function output, and new ProposalTiming
   285  //   - compute a new projected epoch end time, assuming an ideal view rate
   286  //
   287  // No errors are expected during normal operation.
   288  func (ctl *BlockTimeController) processIncorporatedBlock(tb TimedBlock) error {
   289  	// if epoch fallback is triggered, we always use fallbackProposalTiming
   290  	if ctl.epochFallbackTriggered {
   291  		return nil
   292  	}
   293  
   294  	latest := ctl.GetProposalTiming()
   295  	if tb.Block.View <= latest.ObservationView() { // we don't care about older blocks that are incorporated into the protocol state
   296  		return nil
   297  	}
   298  
   299  	err := ctl.checkForEpochTransition(tb)
   300  	if err != nil {
   301  		return fmt.Errorf("could not check for epoch transition: %w", err)
   302  	}
   303  
   304  	err = ctl.measureViewDuration(tb)
   305  	if err != nil {
   306  		return fmt.Errorf("could not measure view rate: %w", err)
   307  	}
   308  	return nil
   309  }
   310  
   311  // checkForEpochTransition updates the epochInfo to reflect an epoch transition if curView
   312  // being entered causes a transition to the next epoch. Otherwise, this is a no-op.
   313  // No errors are expected during normal operation.
   314  func (ctl *BlockTimeController) checkForEpochTransition(tb TimedBlock) error {
   315  	view := tb.Block.View
   316  	if view <= ctl.curEpochFinalView { // prevalent case: we are still within the current epoch
   317  		return nil
   318  	}
   319  
   320  	// sanity checks, since we are beyond the final view of the most recently processed epoch:
   321  	if ctl.nextEpochFinalView == nil { // final view of epoch we are entering should be known
   322  		return fmt.Errorf("cannot transition without nextEpochFinalView set")
   323  	}
   324  	if view > *ctl.nextEpochFinalView { // the block's view should be within the upcoming epoch
   325  		return fmt.Errorf("sanity check failed: curView %d is beyond both current epoch (final view %d) and next epoch (final view %d)",
   326  			view, ctl.curEpochFinalView, *ctl.nextEpochFinalView)
   327  	}
   328  
   329  	ctl.curEpochFirstView = ctl.curEpochFinalView + 1
   330  	ctl.curEpochFinalView = *ctl.nextEpochFinalView
   331  	ctl.nextEpochFinalView = nil
   332  	ctl.curEpochTargetEndTime = ctl.config.TargetTransition.inferTargetEndTime(tb.Block.Timestamp, ctl.epochInfo.fractionComplete(view))
   333  	return nil
   334  }
   335  
   336  // measureViewDuration computes a new measurement of projected epoch switchover time and error for the newly entered view.
   337  // It updates the latest ProposalTiming based on the new error.
   338  // No errors are expected during normal operation.
   339  func (ctl *BlockTimeController) measureViewDuration(tb TimedBlock) error {
   340  	view := tb.Block.View
   341  	// if the controller is disabled, we don't update measurements and instead use a fallback timing
   342  	if !ctl.config.Enabled.Load() {
   343  		fallbackDelay := ctl.config.FallbackProposalDelay.Load()
   344  		ctl.storeProposalTiming(newFallbackTiming(view, tb.TimeObserved, fallbackDelay))
   345  		ctl.log.Debug().
   346  			Uint64("cur_view", view).
   347  			Dur("fallback_proposal_delay", fallbackDelay).
   348  			Msg("controller is disabled - using fallback timing")
   349  		return nil
   350  	}
   351  
   352  	previousProposalTiming := ctl.GetProposalTiming()
   353  	previousPropErr := ctl.proportionalErr.Value()
   354  
   355  	// Compute the projected time still needed for the remaining views, assuming that we progress through the remaining views with
   356  	// the idealized target view time.
   357  	// Note the '+1' term in the computation of `viewDurationsRemaining`. This is related to our convention that the epoch begins
   358  	// (happy path) when observing the first block of the epoch. Only by observing this block, the nodes transition to the first
   359  	// view of the epoch. Up to that point, the consensus replicas remain in the last view of the previous epoch, in the state of
   360  	// "having processed the last block of the old epoch and voted for it" (happy path). Replicas remain in this state until they
   361  	// see a confirmation of the view (either QC or TC for the last view of the previous epoch).
   362  	// In accordance with this convention, observing the proposal for the last view of an epoch, marks the start of the last view.
   363  	// By observing the proposal, nodes enter the last view, verify the block, vote for it, the primary aggregates the votes,
   364  	// constructs the child (for first view of new epoch). The last view of the epoch ends, when the child proposal is published.
   365  	tau := ctl.targetViewTime()                                // τ - idealized target view time in units of seconds
   366  	viewDurationsRemaining := ctl.curEpochFinalView + 1 - view // k[v] - views remaining in current epoch
   367  	durationRemaining := ctl.curEpochTargetEndTime.Sub(tb.TimeObserved)
   368  
   369  	// Compute instantaneous error term: e[v] = k[v]·τ - T[v] i.e. the projected difference from target switchover
   370  	// and update PID controller's error terms. All UNITS in SECOND.
   371  	instErr := float64(viewDurationsRemaining)*tau - durationRemaining.Seconds()
   372  	propErr := ctl.proportionalErr.AddObservation(instErr)
   373  	itgErr := ctl.integralErr.AddObservation(instErr)
   374  	drivErr := propErr - previousPropErr
   375  
   376  	// controller output u[v] in units of second
   377  	u := propErr*ctl.config.KP + itgErr*ctl.config.KI + drivErr*ctl.config.KD
   378  
   379  	// compute the controller output for this observation
   380  	unconstrainedBlockTime := time.Duration((tau - u) * float64(time.Second)) // desired time between parent and child block, in units of seconds
   381  	proposalTiming := newHappyPathBlockTime(tb, unconstrainedBlockTime, ctl.config.TimingConfig)
   382  	constrainedBlockTime := proposalTiming.ConstrainedBlockTime()
   383  
   384  	ctl.log.Debug().
   385  		Uint64("last_observation", previousProposalTiming.ObservationView()).
   386  		Dur("duration_since_last_observation", tb.TimeObserved.Sub(previousProposalTiming.ObservationTime())).
   387  		Dur("projected_time_remaining", durationRemaining).
   388  		Uint64("view_durations_remaining", viewDurationsRemaining).
   389  		Float64("inst_err", instErr).
   390  		Float64("proportional_err", propErr).
   391  		Float64("integral_err", itgErr).
   392  		Float64("derivative_err", drivErr).
   393  		Dur("controller_output", time.Duration(u*float64(time.Second))).
   394  		Dur("unconstrained_block_time", unconstrainedBlockTime).
   395  		Dur("constrained_block_time", constrainedBlockTime).
   396  		Msg("measured error upon view change")
   397  
   398  	ctl.metrics.PIDError(propErr, itgErr, drivErr)
   399  	ctl.metrics.ControllerOutput(time.Duration(u * float64(time.Second)))
   400  	ctl.metrics.TargetProposalDuration(proposalTiming.ConstrainedBlockTime())
   401  
   402  	ctl.storeProposalTiming(proposalTiming)
   403  	return nil
   404  }
   405  
   406  // processEpochSetupPhaseStarted processes EpochSetupPhaseStarted events from the protocol state.
   407  // Whenever we enter the EpochSetup phase, we:
   408  //   - store the next epoch's final view
   409  //
   410  // No errors are expected during normal operation.
   411  func (ctl *BlockTimeController) processEpochSetupPhaseStarted(snapshot protocol.Snapshot) error {
   412  	if ctl.epochFallbackTriggered {
   413  		return nil
   414  	}
   415  
   416  	nextEpoch := snapshot.Epochs().Next()
   417  	finalView, err := nextEpoch.FinalView()
   418  	if err != nil {
   419  		return fmt.Errorf("could not get next epochInfo final view: %w", err)
   420  	}
   421  	ctl.epochInfo.nextEpochFinalView = &finalView
   422  	return nil
   423  }
   424  
   425  // processEpochFallbackTriggered processes EpochFallbackTriggered events from the protocol state.
   426  // When epoch fallback mode is triggered, we:
   427  //   - set ProposalTiming to the default value
   428  //   - set epoch fallback triggered, to disable the controller
   429  //
   430  // No errors are expected during normal operation.
   431  func (ctl *BlockTimeController) processEpochFallbackTriggered() error {
   432  	ctl.epochFallbackTriggered = true
   433  	latestFinalized, err := ctl.state.Final().Head()
   434  	if err != nil {
   435  		return fmt.Errorf("failed to retrieve latest finalized block from protocol state %w", err)
   436  	}
   437  
   438  	ctl.storeProposalTiming(newFallbackTiming(latestFinalized.View, time.Now().UTC(), ctl.config.FallbackProposalDelay.Load()))
   439  	return nil
   440  }
   441  
   442  // OnBlockIncorporated listens to notification from HotStuff about incorporating new blocks.
   443  // The event is queued for async processing by the worker. If the channel is full,
   444  // the event is discarded - since we are taking an average it doesn't matter if we
   445  // occasionally miss a sample.
   446  func (ctl *BlockTimeController) OnBlockIncorporated(block *model.Block) {
   447  	select {
   448  	case ctl.incorporatedBlocks <- TimedBlock{Block: block, TimeObserved: time.Now().UTC()}:
   449  	default:
   450  	}
   451  }
   452  
   453  // EpochSetupPhaseStarted responds to the EpochSetup phase starting for the current epoch.
   454  // The event is queued for async processing by the worker.
   455  func (ctl *BlockTimeController) EpochSetupPhaseStarted(_ uint64, first *flow.Header) {
   456  	ctl.epochSetups <- first
   457  }
   458  
   459  // EpochEmergencyFallbackTriggered responds to epoch fallback mode being triggered.
   460  func (ctl *BlockTimeController) EpochEmergencyFallbackTriggered() {
   461  	ctl.epochFallbacks <- struct{}{}
   462  }