github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/execution/ingestion/core.go

package ingestion

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine/execution"
	"github.com/onflow/flow-go/engine/execution/ingestion/block_queue"
	"github.com/onflow/flow-go/engine/execution/ingestion/stop"
	"github.com/onflow/flow-go/engine/execution/state"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/irrecoverable"
	"github.com/onflow/flow-go/module/mempool/entity"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/logging"
)

// MaxProcessableBlocks is the maximum number of blocks that can be queued to be processed.
const MaxProcessableBlocks = 10000

// MaxConcurrentBlockExecutor is the maximum number of concurrent block executors.
const MaxConcurrentBlockExecutor = 5

// Core connects the execution components:
// when it receives blocks and collections, it forwards them to the block queue;
// when the block queue decides to execute blocks, it forwards them to the executor for execution;
// when the block queue decides to fetch missing collections, it forwards them to the collection fetcher;
// when a block is executed, it notifies the block queue and forwards the results to the execution state to be saved.
type Core struct {
	*component.ComponentManager

	log zerolog.Logger

	// When a block is received, it is first pushed to the processables channel, and a worker will
	// fetch its collections and forward it to the block queue.
	// Once the data is fetched and its parent block is executed, the block is ready to be executed:
	// it will be pushed to the blockExecutors channel, and a worker will execute it.
	// During startup, the throttle limits the number of blocks added to the processables channel.
	// Once caught up, the throttle allows all the remaining blocks to be added to the processables channel.
	processables   chan BlockIDHeight           // block IDs that are received and waiting to be processed
	throttle       Throttle                     // throttles the blocks added to processables during startup and catchup
	blockQueue     *block_queue.BlockQueue      // blocks that are waiting for their data to be fetched
	blockExecutors chan *entity.ExecutableBlock // blocks that are ready to be executed
	stopControl    *stop.StopControl            // decides whether to execute a block and when to stop the execution

	// data storage
	execState   state.ExecutionState
	blocks      storage.Blocks
	collections storage.Collections

	// computation, data fetching, events
	executor          BlockExecutor
	collectionFetcher CollectionFetcher
	eventConsumer     EventConsumer
	metrics           module.ExecutionMetrics
}
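// BlockExecutor executes a fully-assembled executable block and returns its
// computation result.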
type BlockExecutor interface {
	ExecuteBlock(ctx context.Context, block *entity.ExecutableBlock) (*execution.ComputationResult, error)
}

// EventConsumer is notified about computation results: once right before the
// result is saved to the execution state, and once after it has been saved.
type EventConsumer interface {
	BeforeComputationResultSaved(ctx context.Context, result *execution.ComputationResult)
	OnComputationResultSaved(ctx context.Context, result *execution.ComputationResult) string
}

// NewCore creates the ingestion core. It initializes the throttle and registers
// one worker that handles incoming blocks plus MaxConcurrentBlockExecutor workers
// that execute blocks.
func NewCore(
	logger zerolog.Logger,
	throttle Throttle,
	execState state.ExecutionState,
	stopControl *stop.StopControl,
	blocks storage.Blocks,
	collections storage.Collections,
	executor BlockExecutor,
	collectionFetcher CollectionFetcher,
	eventConsumer EventConsumer,
	metrics module.ExecutionMetrics,
) (*Core, error) {
	e := &Core{
		log:               logger.With().Str("engine", "ingestion_core").Logger(),
		processables:      make(chan BlockIDHeight, MaxProcessableBlocks),
		blockExecutors:    make(chan *entity.ExecutableBlock),
		throttle:          throttle,
		execState:         execState,
		blockQueue:        block_queue.NewBlockQueue(logger),
		stopControl:       stopControl,
		blocks:            blocks,
		collections:       collections,
		executor:          executor,
		collectionFetcher: collectionFetcher,
		eventConsumer:     eventConsumer,
		metrics:           metrics,
	}

	err := e.throttle.Init(e.processables, DefaultCatchUpThreshold)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize throttle engine: %w", err)
	}

	e.log.Info().Msgf("throttle engine initialized")

	builder := component.NewComponentManagerBuilder().AddWorker(e.launchWorkerToHandleBlocks)

	for w := 0; w < MaxConcurrentBlockExecutor; w++ {
		builder.AddWorker(e.launchWorkerToExecuteBlocks)
	}

	e.ComponentManager = builder.Build()

	return e, nil
}

func (e *Core) launchWorkerToHandleBlocks(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
	executionStopped := e.stopControl.IsExecutionStopped()

	e.log.Info().Bool("execution_stopped", executionStopped).Msgf("launching worker")

	ready()

	if executionStopped {
		return
	}

	e.launchWorkerToConsumeThrottledBlocks(ctx)
}

func (e *Core) launchWorkerToExecuteBlocks(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
	ready()
	for {
		select {
		case <-ctx.Done():
			return
		case executable := <-e.blockExecutors:
			err := e.execute(ctx, executable)
			if err != nil {
				ctx.Throw(fmt.Errorf("execution ingestion engine failed to execute block %v (%v): %w",
					executable.Block.Header.Height,
					executable.Block.ID(), err))
			}
		}
	}
}
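// OnCollection is called when a collection is received.
// Any failure to handle the collection is fatal to the engine.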
func (e *Core) OnCollection(col *flow.Collection) {
	err := e.onCollection(col)
	if err != nil {
		e.log.Fatal().Err(err).Msgf("error processing collection: %v", col.ID())
	}
}

func (e *Core) launchWorkerToConsumeThrottledBlocks(ctx irrecoverable.SignalerContext) {
	// run a worker in the background to consume
	// processable blocks which are throttled,
	// and forward them to the block queue for processing
	e.log.Info().Msgf("starting worker to consume throttled blocks")
	defer func() {
		e.log.Info().Msgf("worker to consume throttled blocks stopped")
	}()
	for {
		select {
		case <-ctx.Done():
			// if the engine has shut down, then mark the throttle as Done, which
			// will stop sending new blocks to e.processables
			err := e.throttle.Done()
			if err != nil {
				ctx.Throw(fmt.Errorf("execution ingestion engine failed to stop throttle: %w", err))
			}

			// drain the processables
			e.log.Info().Msgf("draining processables")
			close(e.processables)
			for range e.processables {
			}
			e.log.Info().Msgf("finished draining processables")
			return

		case blockIDHeight := <-e.processables:
			e.log.Debug().Hex("block_id", blockIDHeight.ID[:]).Uint64("height", blockIDHeight.Height).Msg("ingestion core processing block")
			err := e.onProcessableBlock(blockIDHeight.ID, blockIDHeight.Height)
			if err != nil {
				ctx.Throw(fmt.Errorf("execution ingestion engine failed to process block %v (height: %v): %w",
					blockIDHeight.ID, blockIDHeight.Height, err))
				return
			}
		}
	}
}

func (e *Core) onProcessableBlock(blockID flow.Identifier, height uint64) error {
	// skip if stopControl tells us to skip this block
	if !e.stopControl.ShouldExecuteBlock(blockID, height) {
		return nil
	}

	executed, err := e.execState.IsBlockExecuted(height, blockID)
	if err != nil {
		return fmt.Errorf("could not check whether block %v is executed: %w", blockID, err)
	}

	if executed {
		e.log.Debug().Hex("block_id", blockID[:]).Uint64("height", height).Msg("block has been executed already")
		return nil
	}

	block, err := e.blocks.ByID(blockID)
	if err != nil {
		return fmt.Errorf("failed to get block %s: %w", blockID, err)
	}

	missingColls, executables, err := e.enqueuBlock(block, blockID)
	if err != nil {
		return fmt.Errorf("failed to enqueue block %v: %w", blockID, err)
	}

	lg := e.log.With().
		Hex("block_id", blockID[:]).Uint64("height", height).
		Logger()
	lg.Debug().
		Int("executables", len(executables)).Msgf("executeConcurrently block is executable")
	e.executeConcurrently(executables)

	missingCount, err := e.fetch(missingColls)
	if err != nil {
		return fmt.Errorf("failed to fetch missing collections: %w", err)
	}

	lg.Debug().Int("missing_collections", missingCount).Msgf("fetch missing collections")

	return nil
}
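// enqueuBlock adds the block to the block queue. It returns the collections that
// are still missing and need to be fetched, as well as any blocks that have become
// executable now that this block is enqueued.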
func (e *Core) enqueuBlock(block *flow.Block, blockID flow.Identifier) (
	[]*block_queue.MissingCollection,
	[]*entity.ExecutableBlock,
	error,
) {
	lg := e.log.With().
		Hex("block_id", blockID[:]).
		Uint64("height", block.Header.Height).
		Logger()

	lg.Info().Msg("handling new block")

	parentCommitment, err := e.execState.StateCommitmentByBlockID(block.Header.ParentID)

	if err == nil {
		// the parent block is an executed block.
		missingColls, executables, err := e.blockQueue.HandleBlock(block, &parentCommitment)
		if err != nil {
			return nil, nil, fmt.Errorf("unexpected error while adding block to block queue: %w", err)
		}

		lg.Info().Bool("parent_is_executed", true).
			Int("missing_col", len(missingColls)).
			Int("executables", len(executables)).
			Msgf("block is enqueued")

		return missingColls, executables, nil
	}

	// handle exception
	if !errors.Is(err, storage.ErrNotFound) {
		return nil, nil, fmt.Errorf("failed to get state commitment for parent block %v of block %v (height: %v): %w",
			block.Header.ParentID, blockID, block.Header.Height, err)
	}

	// the parent block is an unexecuted block.
	// we can enqueue the block without providing the state commitment
	missingColls, executables, err := e.blockQueue.HandleBlock(block, nil)
	if err != nil {
		if !errors.Is(err, block_queue.ErrMissingParent) {
			return nil, nil, fmt.Errorf("unexpected error while adding block to block queue: %w", err)
		}

		// if the parent is missing, there are two possibilities:
		// 1) the parent was never enqueued to the block queue
		// 2) the parent was enqueued, but it has since been executed and removed from the block queue
		// In fact, only 2) is possible here. Why? Because forwardProcessableToHandler guarantees
		// that we always enqueue a block before its child, which means that by the time HandleBlock
		// is called with a block, HandleBlock must already have been called with its parent.
		// Therefore, 1) is not possible.
		// 2) is possible because the observation that the parent block is unexecuted might be
		// outdated: OnBlockExecuted might have been called concurrently in a different thread,
		// after we fetched the parent commit and before HandleBlock was called. Therefore, we
		// should re-enqueue the block with the parent commit. It's necessary to check again
		// whether the parent block is executed after the call.
		lg.Warn().Msgf(
			"block is missing parent block, re-enqueueing %v (parent: %v)",
			blockID, block.Header.ParentID,
		)

		parentCommitment, err := e.execState.StateCommitmentByBlockID(block.Header.ParentID)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to get parent state commitment when re-enqueueing block %v (parent: %v): %w",
				blockID, block.Header.ParentID, err)
		}

		// now re-enqueue the block with the parent commit
		missing, execs, err := e.blockQueue.HandleBlock(block, &parentCommitment)
		if err != nil {
			return nil, nil, fmt.Errorf("unexpected error while re-enqueueing block to block queue: %w", err)
		}

		missingColls = flow.Deduplicate(append(missingColls, missing...))
		executables = flow.Deduplicate(append(executables, execs...))
	}

	lg.Info().Bool("parent_is_executed", false).
		Int("missing_col", len(missingColls)).
		Int("executables", len(executables)).
		Msgf("block is enqueued")

	return missingColls, executables, nil
}
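// onBlockExecuted persists the computation result, marks the block as executed in
// the block queue, notifies the stop control, the event consumer and the throttle,
// and dispatches any child blocks that have become executable.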
func (e *Core) onBlockExecuted(
	ctx context.Context,
	block *entity.ExecutableBlock,
	computationResult *execution.ComputationResult,
	startedAt time.Time,
) error {
	commit := computationResult.CurrentEndState()

	e.metrics.ExecutionLastExecutedBlockHeight(block.Block.Header.Height)

	wg := sync.WaitGroup{}
	wg.Add(1)
	defer wg.Wait()

	go func() {
		defer wg.Done()
		e.eventConsumer.BeforeComputationResultSaved(ctx, computationResult)
	}()

	err := e.execState.SaveExecutionResults(ctx, computationResult)
	if err != nil {
		return fmt.Errorf("cannot persist execution state: %w", err)
	}

	blockID := block.ID()
	lg := e.log.With().
		Hex("block_id", blockID[:]).
		Uint64("height", block.Block.Header.Height).
		Logger()

	lg.Debug().Msgf("execution state saved")

	// must call OnBlockExecuted AFTER saving the execution result to storage,
	// because when enqueuing a block, we rely on execState.StateCommitmentByBlockID
	// to determine whether a block has been executed or not.
	executables, err := e.blockQueue.OnBlockExecuted(blockID, commit)
	if err != nil {
		return fmt.Errorf("unexpected error while marking block as executed: %w", err)
	}

	e.stopControl.OnBlockExecuted(block.Block.Header)

	// notify the event consumer so that it can perform tasks
	// such as broadcasting or uploading the result
	logs := e.eventConsumer.OnComputationResultSaved(ctx, computationResult)

	receipt := computationResult.ExecutionReceipt
	lg.Info().
		Int("collections", len(block.CompleteCollections)).
		Hex("parent_block", block.Block.Header.ParentID[:]).
		Int("guarantees", len(block.Block.Payload.Guarantees)).
		Hex("start_state", block.StartState[:]).
		Hex("final_state", commit[:]).
		Hex("receipt_id", logging.Entity(receipt)).
		Hex("result_id", logging.Entity(receipt.ExecutionResult)).
		Hex("execution_data_id", receipt.ExecutionResult.ExecutionDataID[:]).
		Bool("state_changed", commit != *block.StartState).
		Uint64("num_txs", nonSystemTransactionCount(receipt.ExecutionResult)).
		Int64("timeSpentInMS", time.Since(startedAt).Milliseconds()).
		Str("logs", logs). // broadcasted
		Int("executables", len(executables)).
		Msgf("block executed")

	// we ensure that child blocks are only executed after the execution result of
	// their parent block has been successfully saved to storage.
	// this guarantees OnBlockExecuted is not called with blocks in the wrong order, such as
	// OnBlockExecuted(childBlock) being called before OnBlockExecuted(parentBlock).

	e.executeConcurrently(executables)
	err = e.throttle.OnBlockExecuted(blockID, block.Block.Header.Height)
	if err != nil {
		return fmt.Errorf("failed to notify throttle that block %v has been executed: %w", blockID, err)
	}

	return nil
}
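// onCollection stores the collection if it is not already in storage and forwards
// it to the block queue.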
func (e *Core) onCollection(col *flow.Collection) error {
	colID := col.ID()
	e.log.Info().
		Hex("collection_id", colID[:]).
		Msgf("handle collection")
	// the EN might request a collection from multiple collection nodes,
	// and therefore might receive multiple copies of the same collection.
	// we only need to store it once.
	err := storeCollectionIfMissing(e.collections, col)
	if err != nil {
		return fmt.Errorf("failed to store collection %v: %w", col.ID(), err)
	}

	return e.handleCollection(colID, col)
}

func (e *Core) handleCollection(colID flow.Identifier, col *flow.Collection) error {
	// even if the collection is a duplicate, it's still worth adding it to the block queue,
	// because chances are the collection was stored before a restart and
	// is not in the queue after the restart.
	// adding it to the queue ensures we don't miss any collection.
	// since the queue's state is in memory, processing a duplicate collection is
	// a fast no-op, and won't return any executable blocks.
	executables, err := e.blockQueue.HandleCollection(col)
	if err != nil {
		return fmt.Errorf("unexpected error while adding collection to block queue: %w", err)
	}

	e.log.Debug().
		Hex("collection_id", colID[:]).
		Int("executables", len(executables)).Msgf("executeConcurrently: collection is handled, ready to execute block")

	e.executeConcurrently(executables)

	return nil
}

func storeCollectionIfMissing(collections storage.Collections, col *flow.Collection) error {
	_, err := collections.ByID(col.ID())
	if err != nil {
		if !errors.Is(err, storage.ErrNotFound) {
			return fmt.Errorf("failed to get collection %v: %w", col.ID(), err)
		}

		err := collections.Store(col)
		if err != nil {
			return fmt.Errorf("failed to store collection %v: %w", col.ID(), err)
		}
	}

	return nil
}

// executeConcurrently dispatches the executable blocks to the block executor workers.
func (e *Core) executeConcurrently(executables []*entity.ExecutableBlock) {
	for _, executable := range executables {
		select {
		case <-e.ShutdownSignal():
			// if the engine has shut down, then stop executing blocks
			return
		case e.blockExecutors <- executable:
		}
	}
}

func (e *Core) execute(ctx context.Context, executable *entity.ExecutableBlock) error {
	if !e.stopControl.ShouldExecuteBlock(executable.Block.Header.ID(), executable.Block.Header.Height) {
		return nil
	}

	e.log.Info().
		Hex("block_id", logging.Entity(executable)).
		Uint64("height", executable.Block.Header.Height).
		Int("collections", len(executable.CompleteCollections)).
		Msgf("executing block")

	startedAt := time.Now()

	result, err := e.executor.ExecuteBlock(ctx, executable)
	if err != nil {
		return fmt.Errorf("failed to execute block %v: %w", executable.Block.ID(), err)
	}

	err = e.onBlockExecuted(ctx, executable, result, startedAt)
	if err != nil {
		return fmt.Errorf("failed to handle execution result of block %v: %w", executable.Block.ID(), err)
	}

	return nil
}
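// fetch checks, for each missing collection, whether it is already in storage; stored
// collections are handled directly, and the remaining ones are requested from the
// collection fetcher. It returns the number of collections that had to be requested.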
func (e *Core) fetch(missingColls []*block_queue.MissingCollection) (int, error) {
	missingCount := 0
	for _, col := range missingColls {

		// if we have requested this collection before, it will have been stored once it
		// arrived, so check storage to see whether we have already received it.
		collection, err := e.collections.ByID(col.Guarantee.CollectionID)

		if err == nil {
			// we found the collection in storage, forward it to the handler
			err = e.handleCollection(col.Guarantee.CollectionID, collection)
			if err != nil {
				return 0, fmt.Errorf("could not handle collection: %w", err)
			}

			continue
		}

		// check if there was an exception
		if !errors.Is(err, storage.ErrNotFound) {
			return 0, fmt.Errorf("error while querying for collection: %w", err)
		}

		err = e.collectionFetcher.FetchCollection(col.BlockID, col.Height, col.Guarantee)
		if err != nil {
			return 0, fmt.Errorf("failed to fetch collection %v for block %v (height: %v): %w",
				col.Guarantee.ID(), col.BlockID, col.Height, err)
		}
		missingCount++
	}

	if missingCount > 0 {
		e.collectionFetcher.Force()
		e.metrics.ExecutionCollectionRequestSent()
	}

	return missingCount, nil
}
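// The declarations below are an editorial sketch and not part of the original file:
// a minimal, hypothetical EventConsumer implementation that ignores all notifications,
// shown only to illustrate how a consumer of computation results plugs into Core.
// The type name is illustrative and assumed not to collide with other declarations
// in this package.
type exampleNoopEventConsumer struct{}

var _ EventConsumer = (*exampleNoopEventConsumer)(nil)

// BeforeComputationResultSaved runs concurrently with persisting the execution results
// (see onBlockExecuted); this sketch does nothing.
func (*exampleNoopEventConsumer) BeforeComputationResultSaved(ctx context.Context, result *execution.ComputationResult) {
}

// OnComputationResultSaved runs after the results are saved; the returned string is
// included in Core's "block executed" log entry. This sketch returns an empty string.
func (*exampleNoopEventConsumer) OnComputationResultSaved(ctx context.Context, result *execution.ComputationResult) string {
	return ""
}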