github.com/onflow/flow-go@v0.33.17/consensus/hotstuff/cruisectl/block_time_controller.go (about) 1 // Package cruisectl implements a "cruise control" system for Flow by adjusting 2 // nodes' latest ProposalTiming in response to changes in the measured view rate and 3 // target epoch switchover time. 4 // 5 // It uses a PID controller with the projected epoch switchover time as the process 6 // variable and the set-point computed using epoch length config. The error is 7 // the difference between the projected epoch switchover time, assuming an 8 // ideal view time τ, and the target epoch switchover time (based on a schedule). 9 package cruisectl 10 11 import ( 12 "fmt" 13 "time" 14 15 "github.com/rs/zerolog" 16 "go.uber.org/atomic" 17 18 "github.com/onflow/flow-go/consensus/hotstuff" 19 "github.com/onflow/flow-go/consensus/hotstuff/model" 20 "github.com/onflow/flow-go/model/flow" 21 "github.com/onflow/flow-go/module" 22 "github.com/onflow/flow-go/module/component" 23 "github.com/onflow/flow-go/module/irrecoverable" 24 "github.com/onflow/flow-go/state/protocol" 25 "github.com/onflow/flow-go/state/protocol/events" 26 ) 27 28 // TimedBlock represents a block, with a timestamp recording when the BlockTimeController received the block 29 type TimedBlock struct { 30 Block *model.Block 31 TimeObserved time.Time // timestamp when BlockTimeController received the block, per convention in UTC 32 } 33 34 // epochInfo stores data about the current and next epoch. It is updated when we enter 35 // the first view of a new epoch, or the EpochSetup phase of the current epoch. 36 type epochInfo struct { 37 curEpochFirstView uint64 38 curEpochFinalView uint64 // F[v] - the final view of the epoch 39 curEpochTargetEndTime time.Time // T[v] - the target end time of the current epoch 40 nextEpochFinalView *uint64 41 } 42 43 // targetViewTime returns τ[v], the ideal, steady-state view time for the current epoch. 44 // For numerical stability, we avoid repetitive conversions between seconds and time.Duration. 45 // Instead, internally within the controller, we work with float64 in units of seconds. 46 func (epoch *epochInfo) targetViewTime() float64 { 47 return epochLength.Seconds() / float64(epoch.curEpochFinalView-epoch.curEpochFirstView+1) 48 } 49 50 // fractionComplete returns the percentage of views completed of the epoch for the given curView. 51 // curView must be within the range [curEpochFirstView, curEpochFinalView] 52 // Returns the completion percentage as a float between [0, 1] 53 func (epoch *epochInfo) fractionComplete(curView uint64) float64 { 54 return float64(curView-epoch.curEpochFirstView) / float64(epoch.curEpochFinalView-epoch.curEpochFirstView) 55 } 56 57 // BlockTimeController dynamically adjusts the ProposalTiming of this node, 58 // based on the measured view rate of the consensus committee as a whole, in 59 // order to achieve a desired switchover time for each epoch. 60 // In a nutshell, the controller outputs the block time on the happy path, i.e. 61 // - Suppose the node is observing the parent block B0 at some time `x0`. 62 // - The controller determines the duration `d` of how much later the child block B1 63 // should be observed by the committee. 64 // - The controller internally memorizes the latest B0 it has seen and outputs 65 // the tuple `(B0, x0, d)` 66 // 67 // This low-level controller output `(B0, x0, d)` is wrapped into a `ProposalTiming` 68 // interface, specifically `happyPathBlockTime` on the happy path. The purpose of the 69 // `ProposalTiming` wrapper is to translate the raw controller output into a form 70 // that is useful for the event handler. Edge cases, such as initialization or 71 // EECC are implemented by other implementations of `ProposalTiming`. 72 type BlockTimeController struct { 73 component.Component 74 protocol.Consumer // consumes protocol state events 75 76 config *Config 77 78 state protocol.State 79 log zerolog.Logger 80 metrics module.CruiseCtlMetrics 81 82 epochInfo // scheduled transition view for current/next epoch 83 epochFallbackTriggered bool 84 85 incorporatedBlocks chan TimedBlock // OnBlockIncorporated events, we desire these blocks to be processed in a timely manner and therefore use a small channel capacity 86 epochSetups chan *flow.Header // EpochSetupPhaseStarted events (block header within setup phase) 87 epochFallbacks chan struct{} // EpochFallbackTriggered events 88 89 proportionalErr Ewma 90 integralErr LeakyIntegrator 91 92 // latestProposalTiming holds the ProposalTiming that the controller generated in response to processing the latest observation 93 latestProposalTiming *atomic.Pointer[ProposalTiming] 94 } 95 96 var _ hotstuff.ProposalDurationProvider = (*BlockTimeController)(nil) 97 var _ protocol.Consumer = (*BlockTimeController)(nil) 98 var _ component.Component = (*BlockTimeController)(nil) 99 100 // NewBlockTimeController returns a new BlockTimeController. 101 func NewBlockTimeController(log zerolog.Logger, metrics module.CruiseCtlMetrics, config *Config, state protocol.State, curView uint64) (*BlockTimeController, error) { 102 // Initial error must be 0 unless we are making assumptions of the prior history of the proportional error `e[v]` 103 initProptlErr, initItgErr, initDrivErr := .0, .0, .0 104 proportionalErr, err := NewEwma(config.alpha(), initProptlErr) 105 if err != nil { 106 return nil, fmt.Errorf("failed to initialize EWMA for computing the proportional error: %w", err) 107 } 108 integralErr, err := NewLeakyIntegrator(config.beta(), initItgErr) 109 if err != nil { 110 return nil, fmt.Errorf("failed to initialize LeakyIntegrator for computing the integral error: %w", err) 111 } 112 113 ctl := &BlockTimeController{ 114 Consumer: events.NewNoop(), 115 config: config, 116 log: log.With().Str("hotstuff", "cruise_ctl").Logger(), 117 metrics: metrics, 118 state: state, 119 incorporatedBlocks: make(chan TimedBlock, 3), 120 epochSetups: make(chan *flow.Header, 5), 121 epochFallbacks: make(chan struct{}, 5), 122 proportionalErr: proportionalErr, 123 integralErr: integralErr, 124 latestProposalTiming: atomic.NewPointer[ProposalTiming](nil), // set in initProposalTiming 125 } 126 ctl.Component = component.NewComponentManagerBuilder(). 127 AddWorker(ctl.processEventsWorkerLogic). 128 Build() 129 130 // initialize state 131 err = ctl.initEpochInfo(curView) 132 if err != nil { 133 return nil, fmt.Errorf("could not initialize epoch info: %w", err) 134 } 135 ctl.initProposalTiming(curView) 136 137 ctl.log.Debug(). 138 Uint64("view", curView). 139 Msg("initialized BlockTimeController") 140 ctl.metrics.PIDError(initProptlErr, initItgErr, initDrivErr) 141 ctl.metrics.ControllerOutput(0) 142 ctl.metrics.TargetProposalDuration(0) 143 144 return ctl, nil 145 } 146 147 // initEpochInfo initializes the epochInfo state upon component startup. 148 // No errors are expected during normal operation. 149 func (ctl *BlockTimeController) initEpochInfo(curView uint64) error { 150 finalSnapshot := ctl.state.Final() 151 curEpoch := finalSnapshot.Epochs().Current() 152 153 curEpochFirstView, err := curEpoch.FirstView() 154 if err != nil { 155 return fmt.Errorf("could not initialize current epoch first view: %w", err) 156 } 157 ctl.curEpochFirstView = curEpochFirstView 158 159 curEpochFinalView, err := curEpoch.FinalView() 160 if err != nil { 161 return fmt.Errorf("could not initialize current epoch final view: %w", err) 162 } 163 ctl.curEpochFinalView = curEpochFinalView 164 165 phase, err := finalSnapshot.Phase() 166 if err != nil { 167 return fmt.Errorf("could not check snapshot phase: %w", err) 168 } 169 if phase > flow.EpochPhaseStaking { 170 nextEpochFinalView, err := finalSnapshot.Epochs().Next().FinalView() 171 if err != nil { 172 return fmt.Errorf("could not initialize next epoch final view: %w", err) 173 } 174 ctl.epochInfo.nextEpochFinalView = &nextEpochFinalView 175 } 176 177 ctl.curEpochTargetEndTime = ctl.config.TargetTransition.inferTargetEndTime(time.Now().UTC(), ctl.epochInfo.fractionComplete(curView)) 178 179 epochFallbackTriggered, err := ctl.state.Params().EpochFallbackTriggered() 180 if err != nil { 181 return fmt.Errorf("could not check epoch fallback: %w", err) 182 } 183 ctl.epochFallbackTriggered = epochFallbackTriggered 184 185 return nil 186 } 187 188 // initProposalTiming initializes the ProposalTiming value upon startup. 189 // CAUTION: Must be called after initEpochInfo. 190 func (ctl *BlockTimeController) initProposalTiming(curView uint64) { 191 // When disabled, or in epoch fallback, use fallback timing (constant ProposalDuration) 192 if ctl.epochFallbackTriggered || !ctl.config.Enabled.Load() { 193 ctl.storeProposalTiming(newFallbackTiming(curView, time.Now().UTC(), ctl.config.FallbackProposalDelay.Load())) 194 return 195 } 196 // Otherwise, before we observe any view changes, publish blocks immediately 197 ctl.storeProposalTiming(newPublishImmediately(curView, time.Now().UTC())) 198 } 199 200 // storeProposalTiming stores the latest ProposalTiming 201 // Concurrency safe. 202 func (ctl *BlockTimeController) storeProposalTiming(proposalTiming ProposalTiming) { 203 ctl.latestProposalTiming.Store(&proposalTiming) 204 } 205 206 // GetProposalTiming returns the controller's latest ProposalTiming. Concurrency safe. 207 func (ctl *BlockTimeController) GetProposalTiming() ProposalTiming { 208 pt := ctl.latestProposalTiming.Load() 209 if pt == nil { // should never happen, as we always store non-nil instances of ProposalTiming. Though, this extra check makes `GetProposalTiming` universal. 210 return nil 211 } 212 return *pt 213 } 214 215 func (ctl *BlockTimeController) TargetPublicationTime(proposalView uint64, timeViewEntered time.Time, parentBlockId flow.Identifier) time.Time { 216 return ctl.GetProposalTiming().TargetPublicationTime(proposalView, timeViewEntered, parentBlockId) 217 } 218 219 // processEventsWorkerLogic is the logic for processing events received from other components. 220 // This method should be executed by a dedicated worker routine (not concurrency safe). 221 func (ctl *BlockTimeController) processEventsWorkerLogic(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 222 ready() 223 224 done := ctx.Done() 225 for { 226 227 // Priority 1: EpochSetup 228 select { 229 case block := <-ctl.epochSetups: 230 snapshot := ctl.state.AtHeight(block.Height) 231 err := ctl.processEpochSetupPhaseStarted(snapshot) 232 if err != nil { 233 ctl.log.Err(err).Msgf("fatal error handling EpochSetupPhaseStarted event") 234 ctx.Throw(err) 235 return 236 } 237 default: 238 } 239 240 // Priority 2: EpochFallbackTriggered 241 select { 242 case <-ctl.epochFallbacks: 243 err := ctl.processEpochFallbackTriggered() 244 if err != nil { 245 ctl.log.Err(err).Msgf("fatal error processing epoch EECC event") 246 ctx.Throw(err) 247 } 248 default: 249 } 250 251 // Priority 3: OnBlockIncorporated 252 select { 253 case <-done: 254 return 255 case block := <-ctl.incorporatedBlocks: 256 err := ctl.processIncorporatedBlock(block) 257 if err != nil { 258 ctl.log.Err(err).Msgf("fatal error handling OnBlockIncorporated event") 259 ctx.Throw(err) 260 return 261 } 262 case block := <-ctl.epochSetups: 263 snapshot := ctl.state.AtHeight(block.Height) 264 err := ctl.processEpochSetupPhaseStarted(snapshot) 265 if err != nil { 266 ctl.log.Err(err).Msgf("fatal error handling EpochSetupPhaseStarted event") 267 ctx.Throw(err) 268 return 269 } 270 case <-ctl.epochFallbacks: 271 err := ctl.processEpochFallbackTriggered() 272 if err != nil { 273 ctl.log.Err(err).Msgf("fatal error processing epoch EECC event") 274 ctx.Throw(err) 275 return 276 } 277 } 278 } 279 } 280 281 // processIncorporatedBlock processes `OnBlockIncorporated` events from HotStuff. 282 // Whenever the view changes, we: 283 // - updates epoch info, if this is the first observed view of a new epoch 284 // - compute error terms, compensation function output, and new ProposalTiming 285 // - compute a new projected epoch end time, assuming an ideal view rate 286 // 287 // No errors are expected during normal operation. 288 func (ctl *BlockTimeController) processIncorporatedBlock(tb TimedBlock) error { 289 // if epoch fallback is triggered, we always use fallbackProposalTiming 290 if ctl.epochFallbackTriggered { 291 return nil 292 } 293 294 latest := ctl.GetProposalTiming() 295 if tb.Block.View <= latest.ObservationView() { // we don't care about older blocks that are incorporated into the protocol state 296 return nil 297 } 298 299 err := ctl.checkForEpochTransition(tb) 300 if err != nil { 301 return fmt.Errorf("could not check for epoch transition: %w", err) 302 } 303 304 err = ctl.measureViewDuration(tb) 305 if err != nil { 306 return fmt.Errorf("could not measure view rate: %w", err) 307 } 308 return nil 309 } 310 311 // checkForEpochTransition updates the epochInfo to reflect an epoch transition if curView 312 // being entered causes a transition to the next epoch. Otherwise, this is a no-op. 313 // No errors are expected during normal operation. 314 func (ctl *BlockTimeController) checkForEpochTransition(tb TimedBlock) error { 315 view := tb.Block.View 316 if view <= ctl.curEpochFinalView { // prevalent case: we are still within the current epoch 317 return nil 318 } 319 320 // sanity checks, since we are beyond the final view of the most recently processed epoch: 321 if ctl.nextEpochFinalView == nil { // final view of epoch we are entering should be known 322 return fmt.Errorf("cannot transition without nextEpochFinalView set") 323 } 324 if view > *ctl.nextEpochFinalView { // the block's view should be within the upcoming epoch 325 return fmt.Errorf("sanity check failed: curView %d is beyond both current epoch (final view %d) and next epoch (final view %d)", 326 view, ctl.curEpochFinalView, *ctl.nextEpochFinalView) 327 } 328 329 ctl.curEpochFirstView = ctl.curEpochFinalView + 1 330 ctl.curEpochFinalView = *ctl.nextEpochFinalView 331 ctl.nextEpochFinalView = nil 332 ctl.curEpochTargetEndTime = ctl.config.TargetTransition.inferTargetEndTime(tb.Block.Timestamp, ctl.epochInfo.fractionComplete(view)) 333 return nil 334 } 335 336 // measureViewDuration computes a new measurement of projected epoch switchover time and error for the newly entered view. 337 // It updates the latest ProposalTiming based on the new error. 338 // No errors are expected during normal operation. 339 func (ctl *BlockTimeController) measureViewDuration(tb TimedBlock) error { 340 view := tb.Block.View 341 // if the controller is disabled, we don't update measurements and instead use a fallback timing 342 if !ctl.config.Enabled.Load() { 343 fallbackDelay := ctl.config.FallbackProposalDelay.Load() 344 ctl.storeProposalTiming(newFallbackTiming(view, tb.TimeObserved, fallbackDelay)) 345 ctl.log.Debug(). 346 Uint64("cur_view", view). 347 Dur("fallback_proposal_delay", fallbackDelay). 348 Msg("controller is disabled - using fallback timing") 349 return nil 350 } 351 352 previousProposalTiming := ctl.GetProposalTiming() 353 previousPropErr := ctl.proportionalErr.Value() 354 355 // Compute the projected time still needed for the remaining views, assuming that we progress through the remaining views with 356 // the idealized target view time. 357 // Note the '+1' term in the computation of `viewDurationsRemaining`. This is related to our convention that the epoch begins 358 // (happy path) when observing the first block of the epoch. Only by observing this block, the nodes transition to the first 359 // view of the epoch. Up to that point, the consensus replicas remain in the last view of the previous epoch, in the state of 360 // "having processed the last block of the old epoch and voted for it" (happy path). Replicas remain in this state until they 361 // see a confirmation of the view (either QC or TC for the last view of the previous epoch). 362 // In accordance with this convention, observing the proposal for the last view of an epoch, marks the start of the last view. 363 // By observing the proposal, nodes enter the last view, verify the block, vote for it, the primary aggregates the votes, 364 // constructs the child (for first view of new epoch). The last view of the epoch ends, when the child proposal is published. 365 tau := ctl.targetViewTime() // τ - idealized target view time in units of seconds 366 viewDurationsRemaining := ctl.curEpochFinalView + 1 - view // k[v] - views remaining in current epoch 367 durationRemaining := ctl.curEpochTargetEndTime.Sub(tb.TimeObserved) 368 369 // Compute instantaneous error term: e[v] = k[v]·τ - T[v] i.e. the projected difference from target switchover 370 // and update PID controller's error terms. All UNITS in SECOND. 371 instErr := float64(viewDurationsRemaining)*tau - durationRemaining.Seconds() 372 propErr := ctl.proportionalErr.AddObservation(instErr) 373 itgErr := ctl.integralErr.AddObservation(instErr) 374 drivErr := propErr - previousPropErr 375 376 // controller output u[v] in units of second 377 u := propErr*ctl.config.KP + itgErr*ctl.config.KI + drivErr*ctl.config.KD 378 379 // compute the controller output for this observation 380 unconstrainedBlockTime := time.Duration((tau - u) * float64(time.Second)) // desired time between parent and child block, in units of seconds 381 proposalTiming := newHappyPathBlockTime(tb, unconstrainedBlockTime, ctl.config.TimingConfig) 382 constrainedBlockTime := proposalTiming.ConstrainedBlockTime() 383 384 ctl.log.Debug(). 385 Uint64("last_observation", previousProposalTiming.ObservationView()). 386 Dur("duration_since_last_observation", tb.TimeObserved.Sub(previousProposalTiming.ObservationTime())). 387 Dur("projected_time_remaining", durationRemaining). 388 Uint64("view_durations_remaining", viewDurationsRemaining). 389 Float64("inst_err", instErr). 390 Float64("proportional_err", propErr). 391 Float64("integral_err", itgErr). 392 Float64("derivative_err", drivErr). 393 Dur("controller_output", time.Duration(u*float64(time.Second))). 394 Dur("unconstrained_block_time", unconstrainedBlockTime). 395 Dur("constrained_block_time", constrainedBlockTime). 396 Msg("measured error upon view change") 397 398 ctl.metrics.PIDError(propErr, itgErr, drivErr) 399 ctl.metrics.ControllerOutput(time.Duration(u * float64(time.Second))) 400 ctl.metrics.TargetProposalDuration(proposalTiming.ConstrainedBlockTime()) 401 402 ctl.storeProposalTiming(proposalTiming) 403 return nil 404 } 405 406 // processEpochSetupPhaseStarted processes EpochSetupPhaseStarted events from the protocol state. 407 // Whenever we enter the EpochSetup phase, we: 408 // - store the next epoch's final view 409 // 410 // No errors are expected during normal operation. 411 func (ctl *BlockTimeController) processEpochSetupPhaseStarted(snapshot protocol.Snapshot) error { 412 if ctl.epochFallbackTriggered { 413 return nil 414 } 415 416 nextEpoch := snapshot.Epochs().Next() 417 finalView, err := nextEpoch.FinalView() 418 if err != nil { 419 return fmt.Errorf("could not get next epochInfo final view: %w", err) 420 } 421 ctl.epochInfo.nextEpochFinalView = &finalView 422 return nil 423 } 424 425 // processEpochFallbackTriggered processes EpochFallbackTriggered events from the protocol state. 426 // When epoch fallback mode is triggered, we: 427 // - set ProposalTiming to the default value 428 // - set epoch fallback triggered, to disable the controller 429 // 430 // No errors are expected during normal operation. 431 func (ctl *BlockTimeController) processEpochFallbackTriggered() error { 432 ctl.epochFallbackTriggered = true 433 latestFinalized, err := ctl.state.Final().Head() 434 if err != nil { 435 return fmt.Errorf("failed to retrieve latest finalized block from protocol state %w", err) 436 } 437 438 ctl.storeProposalTiming(newFallbackTiming(latestFinalized.View, time.Now().UTC(), ctl.config.FallbackProposalDelay.Load())) 439 return nil 440 } 441 442 // OnBlockIncorporated listens to notification from HotStuff about incorporating new blocks. 443 // The event is queued for async processing by the worker. If the channel is full, 444 // the event is discarded - since we are taking an average it doesn't matter if we 445 // occasionally miss a sample. 446 func (ctl *BlockTimeController) OnBlockIncorporated(block *model.Block) { 447 select { 448 case ctl.incorporatedBlocks <- TimedBlock{Block: block, TimeObserved: time.Now().UTC()}: 449 default: 450 } 451 } 452 453 // EpochSetupPhaseStarted responds to the EpochSetup phase starting for the current epoch. 454 // The event is queued for async processing by the worker. 455 func (ctl *BlockTimeController) EpochSetupPhaseStarted(_ uint64, first *flow.Header) { 456 ctl.epochSetups <- first 457 } 458 459 // EpochEmergencyFallbackTriggered responds to epoch fallback mode being triggered. 460 func (ctl *BlockTimeController) EpochEmergencyFallbackTriggered() { 461 ctl.epochFallbacks <- struct{}{} 462 }