// Source: github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/execution/ingestion/stop/stop_control.go

package stop

import (
	"errors"
	"fmt"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/coreos/go-semver/semver"
	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/execution/state"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/irrecoverable"
	"github.com/onflow/flow-go/state/protocol"
	psEvents "github.com/onflow/flow-go/state/protocol/events"
	"github.com/onflow/flow-go/storage"
)

const (
	// DefaultMaxGracefulStopDuration is a default for the maximum time to wait for a
	// graceful stop before crashing the node.
	// NOTE(review): presumably passed by callers as the maxGracefulStopDuration
	// argument of NewStopControl; it is not referenced elsewhere in this file.
	//
	// TODO: figure out an appropriate graceful stop time (is 10 min. enough?)
	DefaultMaxGracefulStopDuration = 10 * time.Minute
)

// StopControl is a specialized component used by ingestion.Engine to encapsulate
// control of stopping blocks execution.
// It is intended to work tightly with the Engine, not as a general mechanism or interface.
//
// StopControl can stop execution or crash the node at a specific block height. The stop
// height can be set manually or by the version beacon service event. This leads to some
// edge cases that are handled by the StopControl:
//
//  1. stop is already set manually and is set again manually.
//     This is considered as an attempt to move the stop height. The resulting stop
//     height is the new one. Note, the new height can be either lower or higher than
//     previous value.
//  2. stop is already set manually and is set by the version beacon.
//     The resulting stop height is the lower one.
//  3. stop is already set by the version beacon and is set manually.
//     The resulting stop height is the lower one.
//  4. stop is already set by the version beacon and is set by the version beacon.
//     This means version boundaries were edited. The resulting stop
//     height is the new one.
48 type StopControl struct { 49 unit *engine.Unit 50 maxGracefulStopDuration time.Duration 51 52 // Stop control needs to consume BlockFinalized events. 53 // adding psEvents.Noop makes it a protocol.Consumer 54 psEvents.Noop 55 sync.RWMutex 56 component.Component 57 58 blockFinalizedChan chan *flow.Header 59 60 headers StopControlHeaders 61 exeState state.ReadOnlyExecutionState 62 versionBeacons storage.VersionBeacons 63 64 // stopped is true if node should no longer be executing blocks. 65 stopped bool 66 // stopBoundary is when the node should stop. 67 stopBoundary stopBoundary 68 // nodeVersion could be nil right now. See NewStopControl. 69 nodeVersion *semver.Version 70 // last seen version beacon, used to detect version beacon changes 71 versionBeacon *flow.SealedVersionBeacon 72 // if the node should crash on version boundary from a version beacon is reached 73 crashOnVersionBoundaryReached bool 74 75 log zerolog.Logger 76 } 77 78 var _ protocol.Consumer = (*StopControl)(nil) 79 80 var NoStopHeight = uint64(math.MaxUint64) 81 82 type StopParameters struct { 83 // desired StopBeforeHeight, the first value new version should be used, 84 // so this height WON'T be executed 85 StopBeforeHeight uint64 86 87 // if the node should crash or just pause after reaching StopBeforeHeight 88 ShouldCrash bool 89 } 90 91 func (p StopParameters) Set() bool { 92 return p.StopBeforeHeight != NoStopHeight 93 } 94 95 type stopBoundarySource string 96 97 const ( 98 stopBoundarySourceManual stopBoundarySource = "manual" 99 stopBoundarySourceVersionBeacon stopBoundarySource = "versionBeacon" 100 ) 101 102 type stopBoundary struct { 103 StopParameters 104 105 // The stop control will prevent execution of blocks higher than StopBeforeHeight 106 // once this happens the stop control is affecting execution and StopParameters can 107 // no longer be changed 108 immutable bool 109 110 // This is the block ID of the block that should be executed last. 
111 stopAfterExecuting flow.Identifier 112 113 // if the stop parameters were set by the version beacon or manually 114 source stopBoundarySource 115 } 116 117 // String returns string in the format "crash@20023[stopBoundarySourceVersionBeacon]" or 118 // "stop@20023@blockID[manual]" 119 // block ID is only present if stopAfterExecuting is set 120 // the ID is from the block that should be executed last and has height one 121 // less than StopBeforeHeight 122 func (s stopBoundary) String() string { 123 if !s.Set() { 124 return "none" 125 } 126 127 sb := strings.Builder{} 128 if s.ShouldCrash { 129 sb.WriteString("crash") 130 } else { 131 sb.WriteString("stop") 132 } 133 sb.WriteString("@") 134 sb.WriteString(fmt.Sprintf("%d", s.StopBeforeHeight)) 135 136 if s.stopAfterExecuting != flow.ZeroID { 137 sb.WriteString("@") 138 sb.WriteString(s.stopAfterExecuting.String()) 139 } 140 141 sb.WriteString("[") 142 sb.WriteString(string(s.source)) 143 sb.WriteString("]") 144 145 return sb.String() 146 } 147 148 // StopControlHeaders is an interface for fetching headers 149 // Its jut a small subset of storage.Headers for comments see storage.Headers 150 type StopControlHeaders interface { 151 BlockIDByHeight(height uint64) (flow.Identifier, error) 152 } 153 154 // NewStopControl creates new StopControl. 155 // 156 // We currently have no strong guarantee that the node version is a valid semver. 157 // See build.SemverV2 for more details. That is why nil is a valid input for node version 158 // without a node version, the stop control can still be used for manual stopping. 
159 func NewStopControl( 160 unit *engine.Unit, 161 maxGracefulStopDuration time.Duration, 162 log zerolog.Logger, 163 exeState state.ReadOnlyExecutionState, 164 headers StopControlHeaders, 165 versionBeacons storage.VersionBeacons, 166 nodeVersion *semver.Version, 167 latestFinalizedBlock *flow.Header, 168 withStoppedExecution bool, 169 crashOnVersionBoundaryReached bool, 170 ) *StopControl { 171 // We should not miss block finalized events, and we should be able to handle them 172 // faster than they are produced anyway. 173 blockFinalizedChan := make(chan *flow.Header, 1000) 174 175 sc := &StopControl{ 176 unit: unit, 177 maxGracefulStopDuration: maxGracefulStopDuration, 178 log: log.With(). 179 Str("component", "stop_control"). 180 Logger(), 181 182 blockFinalizedChan: blockFinalizedChan, 183 184 exeState: exeState, 185 headers: headers, 186 nodeVersion: nodeVersion, 187 versionBeacons: versionBeacons, 188 stopped: withStoppedExecution, 189 crashOnVersionBoundaryReached: crashOnVersionBoundaryReached, 190 // the default is to never stop 191 stopBoundary: stopBoundary{ 192 StopParameters: StopParameters{ 193 StopBeforeHeight: NoStopHeight, 194 }, 195 }, 196 } 197 198 if sc.nodeVersion != nil { 199 log = log.With(). 200 Stringer("node_version", sc.nodeVersion). 201 Bool("crash_on_version_boundary_reached", 202 sc.crashOnVersionBoundaryReached). 203 Logger() 204 } 205 206 log.Info().Msgf("Created") 207 208 cm := component.NewComponentManagerBuilder() 209 cm.AddWorker(sc.processEvents) 210 cm.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 211 sc.checkInitialVersionBeacon(ctx, ready, latestFinalizedBlock) 212 }) 213 214 sc.Component = cm.Build() 215 216 // TODO: handle version beacon already indicating a stop 217 // right now the stop will happen on first BlockFinalized 218 // which is fine, but ideally we would stop right away. 219 220 return sc 221 } 222 223 // BlockFinalized is called when a block is finalized. 
224 // 225 // This is a protocol event consumer. See protocol.Consumer. 226 func (s *StopControl) BlockFinalized(h *flow.Header) { 227 s.blockFinalizedChan <- h 228 } 229 230 // processEvents is a worker that processes block finalized events. 231 func (s *StopControl) processEvents( 232 ctx irrecoverable.SignalerContext, 233 ready component.ReadyFunc, 234 ) { 235 ready() 236 237 for { 238 select { 239 case <-ctx.Done(): 240 return 241 case h := <-s.blockFinalizedChan: 242 s.blockFinalized(ctx, h) 243 } 244 } 245 } 246 247 // BlockFinalizedForTesting is used for testing only. 248 func (s *StopControl) BlockFinalizedForTesting(h *flow.Header) { 249 s.blockFinalized(irrecoverable.MockSignalerContext{}, h) 250 } 251 252 func (s *StopControl) checkInitialVersionBeacon( 253 ctx irrecoverable.SignalerContext, 254 ready component.ReadyFunc, 255 latestFinalizedBlock *flow.Header, 256 ) { 257 // component is not ready until we checked the initial version beacon 258 defer ready() 259 260 // the most straightforward way to check it is to simply pretend we just finalized the 261 // last finalized block 262 s.blockFinalized(ctx, latestFinalizedBlock) 263 264 } 265 266 // IsExecutionStopped returns true is block execution has been stopped 267 func (s *StopControl) IsExecutionStopped() bool { 268 s.RLock() 269 defer s.RUnlock() 270 271 return s.stopped 272 } 273 274 // SetStopParameters sets new stop parameters manually. 275 // 276 // Expected error returns during normal operations: 277 // - ErrCannotChangeStop: this indicates that new stop parameters cannot be set. 278 // See stop.validateStopChange. 279 func (s *StopControl) SetStopParameters( 280 stop StopParameters, 281 ) error { 282 s.Lock() 283 defer s.Unlock() 284 285 boundary := stopBoundary{ 286 StopParameters: stop, 287 source: stopBoundarySourceManual, 288 } 289 290 return s.setStopParameters(boundary) 291 } 292 293 // setStopParameters sets new stop parameters. 294 // stopBoundary is the new stop parameters. 
If nil, the stop is removed. 295 // 296 // Expected error returns during normal operations: 297 // - ErrCannotChangeStop: this indicates that new stop parameters cannot be set. 298 // See stop.validateStopChange. 299 // 300 // Caller must acquire the lock. 301 func (s *StopControl) setStopParameters( 302 stopBoundary stopBoundary, 303 ) error { 304 log := s.log.With(). 305 Stringer("old_stop", s.stopBoundary). 306 Stringer("new_stop", stopBoundary). 307 Logger() 308 309 err := s.validateStopChange(stopBoundary) 310 if err != nil { 311 log.Info().Err(err).Msg("cannot set stopHeight") 312 return err 313 } 314 315 log.Info().Msg("new stop set") 316 s.stopBoundary = stopBoundary 317 318 return nil 319 } 320 321 var ErrCannotChangeStop = errors.New("cannot change stop control stopping parameters") 322 323 // validateStopChange verifies if the stop parameters can be changed 324 // returns the error with the reason if the parameters cannot be changed. 325 // 326 // Stop parameters cannot be changed if: 327 // 1. node is already stopped 328 // 2. stop parameters are immutable (due to them already affecting execution see 329 // ShouldExecuteBlock) 330 // 3. stop parameters are already set by a different source and the new stop is later than 331 // the existing one 332 // 333 // Expected error returns during normal operations: 334 // - ErrCannotChangeStop: this indicates that new stop parameters cannot be set. 335 // 336 // Caller must acquire the lock. 337 func (s *StopControl) validateStopChange( 338 newStopBoundary stopBoundary, 339 ) error { 340 341 errf := func(reason string) error { 342 return fmt.Errorf("%s: %w", reason, ErrCannotChangeStop) 343 } 344 345 // 1. 346 if s.stopped { 347 return errf("cannot update stop parameters, already stopped") 348 } 349 350 // 2. 
351 if s.stopBoundary.immutable { 352 return errf( 353 fmt.Sprintf( 354 "cannot update stopHeight, stopping commenced for %s", 355 s.stopBoundary), 356 ) 357 } 358 359 if !s.stopBoundary.Set() { 360 // if the current stop is no stop, we can always update 361 return nil 362 } 363 364 // 3. 365 if s.stopBoundary.source == newStopBoundary.source { 366 // if the stop was set by the same source, we can always update 367 return nil 368 } 369 370 // 3. 371 // if one stop was set by the version beacon and the other one was manual 372 // we can only update if the new stop is strictly earlier 373 if newStopBoundary.StopBeforeHeight < s.stopBoundary.StopBeforeHeight { 374 return nil 375 376 } 377 // this prevents users moving the stopHeight forward when a version newStopBoundary 378 // is earlier, and prevents version beacons from moving the stopHeight forward 379 // when a manual stop is earlier. 380 return errf("cannot update stopHeight, " + 381 "new stop height is later than the current one") 382 } 383 384 // GetStopParameters returns the upcoming stop parameters or nil if no stop is set. 385 func (s *StopControl) GetStopParameters() StopParameters { 386 s.RLock() 387 defer s.RUnlock() 388 389 return s.stopBoundary.StopParameters 390 } 391 392 // ShouldExecuteBlock should be called when new block can be executed. 393 // The block should not be executed if its height is above or equal to 394 // s.stopBoundary.StopBeforeHeight. 395 // 396 // It returns a boolean indicating if the block should be executed. 397 func (s *StopControl) ShouldExecuteBlock(blockID flow.Identifier, height uint64) bool { 398 s.Lock() 399 defer s.Unlock() 400 401 // don't process anymore blocks if stopped 402 if s.stopped { 403 return false 404 } 405 406 // Skips blocks at or above requested stopHeight 407 // doing so means we have started the stopping process 408 if height < s.stopBoundary.StopBeforeHeight { 409 return true 410 } 411 412 s.log.Info(). 
413 Msgf("Skipping execution of %s at height %d"+ 414 " because stop has been requested %s", 415 blockID, 416 height, 417 s.stopBoundary) 418 419 // stopBoundary is now immutable, because it started affecting execution 420 s.stopBoundary.immutable = true 421 return false 422 } 423 424 // blockFinalized is called when a block is marked as finalized 425 // 426 // Once finalization reached stopHeight we can be sure no other fork will be valid at 427 // this height, if this block's parent has been executed, we are safe to stop. 428 // This will happen during normal execution, where blocks are executed 429 // before they are finalized. However, it is possible that EN block computation 430 // progress can fall behind. In this case, we want to crash only after the execution 431 // reached the stopHeight. 432 func (s *StopControl) blockFinalized( 433 ctx irrecoverable.SignalerContext, 434 h *flow.Header, 435 ) { 436 s.Lock() 437 defer s.Unlock() 438 439 // already stopped, nothing to do 440 if s.stopped { 441 return 442 } 443 444 // We already know the ID of the block that should be executed last nothing to do. 445 // Node is stopping. 446 if s.stopBoundary.stopAfterExecuting != flow.ZeroID { 447 return 448 } 449 450 handleErr := func(err error) { 451 s.log.Err(err). 452 Stringer("block_id", h.ID()). 453 Stringer("stop", s.stopBoundary). 454 Msg("Error in stop control BlockFinalized") 455 456 ctx.Throw(err) 457 } 458 459 s.processNewVersionBeacons(ctx, h.Height) 460 461 // we are not at the stop yet, nothing to do 462 if h.Height < s.stopBoundary.StopBeforeHeight { 463 return 464 } 465 466 parentID := h.ParentID 467 468 if h.Height != s.stopBoundary.StopBeforeHeight { 469 // we are past the stop. This can happen if stop was set before 470 // last finalized block 471 s.log.Warn(). 472 Uint64("finalization_height", h.Height). 473 Stringer("block_id", h.ID()). 474 Stringer("stop", s.stopBoundary). 
475 Msg("Block finalization already beyond stop.") 476 477 // Let's find the ID of the block that should be executed last 478 // which is the parent of the block at the stopHeight 479 finalizedID, err := s.headers.BlockIDByHeight(s.stopBoundary.StopBeforeHeight - 1) 480 if err != nil { 481 handleErr(fmt.Errorf("failed to get header by height: %w", err)) 482 return 483 } 484 parentID = finalizedID 485 } 486 487 s.stopBoundary.stopAfterExecuting = parentID 488 489 s.log.Info(). 490 Stringer("block_id", h.ID()). 491 Stringer("stop", s.stopBoundary). 492 Stringer("stop_after_executing", s.stopBoundary.stopAfterExecuting). 493 Msgf("Found ID of the block that should be executed last") 494 495 // check if the parent block has been executed then stop right away 496 executed, err := state.IsParentExecuted(s.exeState, h) 497 if err != nil { 498 handleErr(fmt.Errorf( 499 "failed to check if the block has been executed: %w", 500 err, 501 )) 502 return 503 } 504 505 if executed { 506 // we already reached the point where we should stop 507 s.stopExecution() 508 return 509 } 510 } 511 512 // OnBlockExecuted should be called after a block has finished execution 513 func (s *StopControl) OnBlockExecuted(h *flow.Header) { 514 s.Lock() 515 defer s.Unlock() 516 517 if s.stopped { 518 return 519 } 520 521 if s.stopBoundary.stopAfterExecuting != h.ID() { 522 return 523 } 524 525 // double check. Even if requested stopHeight has been changed multiple times, 526 // as long as it matches this block we are safe to terminate 527 if h.Height != s.stopBoundary.StopBeforeHeight-1 { 528 s.log.Warn(). 529 Msgf( 530 "Inconsistent stopping state. "+ 531 "Scheduled to stop after executing block ID %s and height %d, "+ 532 "but this block has a height %d. ", 533 h.ID().String(), 534 s.stopBoundary.StopBeforeHeight-1, 535 h.Height, 536 ) 537 return 538 } 539 540 s.stopExecution() 541 } 542 543 // stopExecution stops the node execution and crashes the node if ShouldCrash is true. 
544 // Caller must acquire the lock. 545 func (s *StopControl) stopExecution() { 546 log := s.log.With(). 547 Stringer("requested_stop", s.stopBoundary). 548 Uint64("last_executed_height", s.stopBoundary.StopBeforeHeight). 549 Stringer("last_executed_id", s.stopBoundary.stopAfterExecuting). 550 Logger() 551 552 s.stopped = true 553 log.Warn().Msg("Stopping as finalization reached requested stop") 554 555 if s.stopBoundary.ShouldCrash { 556 log.Info(). 557 Dur("max-graceful-stop-duration", s.maxGracefulStopDuration). 558 Msg("Attempting graceful stop as finalization reached requested stop") 559 doneChan := s.unit.Done() 560 select { 561 case <-doneChan: 562 log.Info().Msg("Engine gracefully stopped") 563 case <-time.After(s.maxGracefulStopDuration): 564 log.Info(). 565 Msg("Engine did not stop within max graceful stop duration") 566 } 567 log.Fatal().Msg("Crashing as finalization reached requested stop") 568 return 569 } 570 } 571 572 // processNewVersionBeacons processes version beacons and updates the stop control stop 573 // height if needed. 574 // 575 // When a block is finalized it is possible that a new version beacon is indexed. 576 // This new version beacon might have added/removed/moved a version boundary. 577 // The old version beacon is considered invalid, and the stop height must be updated 578 // according to the new version beacon. 579 // 580 // Caller must acquire the lock. 581 func (s *StopControl) processNewVersionBeacons( 582 ctx irrecoverable.SignalerContext, 583 height uint64, 584 ) { 585 // TODO: remove when we can guarantee that the node will always have a valid version 586 if s.nodeVersion == nil { 587 return 588 } 589 590 if s.versionBeacon != nil && s.versionBeacon.SealHeight >= height { 591 // already processed this or a higher version beacon 592 return 593 } 594 595 vb, err := s.versionBeacons.Highest(height) 596 if err != nil { 597 s.log.Err(err). 598 Uint64("height", height). 
599 Msg("Failed to get highest version beacon for stop control") 600 601 ctx.Throw( 602 fmt.Errorf( 603 "failed to get highest version beacon for stop control: %w", 604 err)) 605 return 606 } 607 608 if vb == nil { 609 // no version beacon found 610 // this is unexpected as there should always be at least the 611 // starting version beacon, but not fatal. 612 // It can happen if the node starts before bootstrap is finished. 613 // TODO: remove when we can guarantee that there will always be a version beacon 614 s.log.Info(). 615 Uint64("height", height). 616 Msg("No version beacon found for stop control") 617 return 618 } 619 620 if s.versionBeacon != nil && s.versionBeacon.SealHeight >= vb.SealHeight { 621 // we already processed this or a higher version beacon 622 return 623 } 624 625 lg := s.log.With(). 626 Str("node_version", s.nodeVersion.String()). 627 Str("beacon", vb.String()). 628 Uint64("vb_seal_height", vb.SealHeight). 629 Uint64("vb_sequence", vb.Sequence).Logger() 630 631 // this is now the last handled version beacon 632 s.versionBeacon = vb 633 634 // this is a new version beacon check what boundary it sets 635 stopHeight, err := s.getVersionBeaconStopHeight(vb) 636 if err != nil { 637 s.log.Err(err). 638 Interface("version_beacon", vb). 639 Msg("Failed to get stop height from version beacon") 640 641 ctx.Throw( 642 fmt.Errorf("failed to get stop height from version beacon: %w", err)) 643 return 644 } 645 646 lg.Info(). 647 Uint64("stop_height", stopHeight). 648 Msg("New version beacon found") 649 650 var newStop = stopBoundary{ 651 StopParameters: StopParameters{ 652 StopBeforeHeight: stopHeight, 653 ShouldCrash: s.crashOnVersionBoundaryReached, 654 }, 655 source: stopBoundarySourceVersionBeacon, 656 } 657 658 err = s.setStopParameters(newStop) 659 if err != nil { 660 // This is just informational and is expected to sometimes happen during 661 // normal operation. The causes for this are described here: validateStopChange. 662 s.log.Info(). 
663 Uint64("stop_height", stopHeight). 664 Err(err). 665 Msg("Cannot change stop boundary when detecting new version beacon") 666 } 667 } 668 669 // getVersionBeaconStopHeight returns the stop height that should be set 670 // based on the version beacon 671 // 672 // No error is expected during normal operation since the version beacon 673 // should have been validated when indexing. 674 // 675 // Caller must acquire the lock. 676 func (s *StopControl) getVersionBeaconStopHeight( 677 vb *flow.SealedVersionBeacon, 678 ) ( 679 uint64, 680 error, 681 ) { 682 // version boundaries are sorted by version 683 for _, boundary := range vb.VersionBoundaries { 684 ver, err := boundary.Semver() 685 if err != nil || ver == nil { 686 // this should never happen as we already validated the version beacon 687 // when indexing it 688 return 0, fmt.Errorf("failed to parse semver: %w", err) 689 } 690 691 // This condition can be tweaked in the future. For example if we guarantee that 692 // all nodes with the same major version have compatible execution, 693 // we can stop only on major version change. 694 if s.nodeVersion.LessThan(*ver) { 695 // we need to stop here 696 return boundary.BlockHeight, nil 697 } 698 } 699 700 // no stop boundary should be set 701 return NoStopHeight, nil 702 }