github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/execution/ingestion/core.go

package ingestion

import (
    "context"
    "errors"
    "fmt"
    "sync"
    "time"

    "github.com/rs/zerolog"

    "github.com/onflow/flow-go/engine/execution"
    "github.com/onflow/flow-go/engine/execution/ingestion/block_queue"
    "github.com/onflow/flow-go/engine/execution/ingestion/stop"
    "github.com/onflow/flow-go/engine/execution/state"
    "github.com/onflow/flow-go/model/flow"
    "github.com/onflow/flow-go/module"
    "github.com/onflow/flow-go/module/component"
    "github.com/onflow/flow-go/module/irrecoverable"
    "github.com/onflow/flow-go/module/mempool/entity"
    "github.com/onflow/flow-go/storage"
    "github.com/onflow/flow-go/utils/logging"
)
// MaxProcessableBlocks is the maximum number of blocks that can be queued for processing
const MaxProcessableBlocks = 10000

// MaxConcurrentBlockExecutor is the maximum number of concurrent block executors
const MaxConcurrentBlockExecutor = 5

// Core connects the execution components:
// when it receives blocks and collections, it forwards them to the block queue;
// when the block queue decides to execute blocks, it forwards them to the executor for execution;
// when the block queue decides to fetch missing collections, it forwards them to the collection fetcher;
// when a block is executed, it notifies the block queue and forwards the results to the execution state to be saved.
type Core struct {
    *component.ComponentManager

    log zerolog.Logger

    // When a block is received, it is first pushed to the processables channel; a worker then
    // fetches its collections and forwards it to the block queue.
    // Once the collections are fetched and its parent block has been executed, the block is ready
    // to be executed: it is pushed to the blockExecutors channel, and a worker executes it.
    // During startup, the throttle limits the number of blocks added to the processables channel.
    // Once caught up, the throttle allows all remaining blocks to be added to the processables channel.
    processables   chan BlockIDHeight           // block IDs that are received and waiting to be processed
    throttle       Throttle                     // throttles the blocks added to processables during startup and catchup
    blockQueue     *block_queue.BlockQueue      // blocks that are waiting for their data to be fetched
    blockExecutors chan *entity.ExecutableBlock // blocks that are ready to be executed
    stopControl    *stop.StopControl            // decides whether to execute a block and when to stop execution

    // data storage
    execState   state.ExecutionState
    blocks      storage.Blocks
    collections storage.Collections

    // computation, data fetching, events
    executor          BlockExecutor
    collectionFetcher CollectionFetcher
    eventConsumer     EventConsumer
    metrics           module.ExecutionMetrics
}
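
// Informal sketch of the data flow through Core, derived from the field comments above
// (arrows read "is forwarded to"):
//
//    incoming blocks (via throttle) -> processables -> onProcessableBlock -> blockQueue
//    OnCollection -> onCollection -> blockQueue
//    blockQueue (collections complete, parent executed) -> blockExecutors -> execute -> onBlockExecuted
//
// onBlockExecuted saves the result to execState, notifies blockQueue, stopControl and throttle,
// and may release further executable child blocks back into blockExecutors.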

type BlockExecutor interface {
    ExecuteBlock(ctx context.Context, block *entity.ExecutableBlock) (*execution.ComputationResult, error)
}

type EventConsumer interface {
    BeforeComputationResultSaved(ctx context.Context, result *execution.ComputationResult)
    OnComputationResultSaved(ctx context.Context, result *execution.ComputationResult) string
}
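
// For illustration only: a minimal no-op EventConsumer (not part of this package), e.g. for
// tests where broadcasting or uploading results is irrelevant:
//
//    type noopEventConsumer struct{}
//
//    func (noopEventConsumer) BeforeComputationResultSaved(context.Context, *execution.ComputationResult) {}
//
//    func (noopEventConsumer) OnComputationResultSaved(context.Context, *execution.ComputationResult) string {
//        return ""
//    }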

func NewCore(
    logger zerolog.Logger,
    throttle Throttle,
    execState state.ExecutionState,
    stopControl *stop.StopControl,
    blocks storage.Blocks,
    collections storage.Collections,
    executor BlockExecutor,
    collectionFetcher CollectionFetcher,
    eventConsumer EventConsumer,
    metrics module.ExecutionMetrics,
) (*Core, error) {
    e := &Core{
        log:               logger.With().Str("engine", "ingestion_core").Logger(),
        processables:      make(chan BlockIDHeight, MaxProcessableBlocks),
        blockExecutors:    make(chan *entity.ExecutableBlock),
        throttle:          throttle,
        execState:         execState,
        blockQueue:        block_queue.NewBlockQueue(logger),
        stopControl:       stopControl,
        blocks:            blocks,
        collections:       collections,
        executor:          executor,
        collectionFetcher: collectionFetcher,
        eventConsumer:     eventConsumer,
        metrics:           metrics,
    }

    err := e.throttle.Init(e.processables, DefaultCatchUpThreshold)
    if err != nil {
        return nil, fmt.Errorf("failed to initialize throttle engine: %w", err)
    }

    e.log.Info().Msgf("throttle engine initialized")

    builder := component.NewComponentManagerBuilder().AddWorker(e.launchWorkerToHandleBlocks)

    for w := 0; w < MaxConcurrentBlockExecutor; w++ {
        builder.AddWorker(e.launchWorkerToExecuteBlocks)
    }

    e.ComponentManager = builder.Build()

    return e, nil
}
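
// A sketch of typical wiring, for illustration only; the variables shown (log, throttle,
// execState, and so on) and the signaler context are assumed to be set up by the caller:
//
//    core, err := NewCore(log, throttle, execState, stopControl,
//        blocks, collections, executor, fetcher, consumer, metrics)
//    if err != nil {
//        return err
//    }
//    core.Start(signalerCtx) // launches the block-handling worker and the block executors
//    <-core.Ready()
//    // ... run ...
//    cancel() // cancel the context backing signalerCtx to trigger shutdown
//    <-core.Done()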

func (e *Core) launchWorkerToHandleBlocks(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    executionStopped := e.stopControl.IsExecutionStopped()

    e.log.Info().Bool("execution_stopped", executionStopped).Msgf("launching worker")

    ready()

    if executionStopped {
        return
    }

    e.launchWorkerToConsumeThrottledBlocks(ctx)
}

func (e *Core) launchWorkerToExecuteBlocks(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    ready()
    for {
        select {
        case <-ctx.Done():
            return
        case executable := <-e.blockExecutors:
            err := e.execute(ctx, executable)
            if err != nil {
                ctx.Throw(fmt.Errorf("execution ingestion engine failed to execute block %v (%v): %w",
                    executable.Block.Header.Height,
                    executable.Block.ID(), err))
            }
        }
    }
}

func (e *Core) OnCollection(col *flow.Collection) {
    err := e.onCollection(col)
    if err != nil {
        e.log.Fatal().Err(err).Msgf("error processing collection: %v", col.ID())
    }
}

func (e *Core) launchWorkerToConsumeThrottledBlocks(ctx irrecoverable.SignalerContext) {
    // run a worker in the background to consume throttled blocks
    // from the processables channel and forward them to the block
    // queue for processing
    e.log.Info().Msgf("starting worker to consume throttled blocks")
    defer func() {
        e.log.Info().Msgf("worker to consume throttled blocks stopped")
    }()
    for {
        select {
        case <-ctx.Done():
            // if the engine has shut down, then mark the throttle as Done, which
            // will stop it from sending new blocks to e.processables
            err := e.throttle.Done()
            if err != nil {
                ctx.Throw(fmt.Errorf("execution ingestion engine failed to stop throttle: %w", err))
            }

            // drain the processables
            e.log.Info().Msgf("draining processables")
            close(e.processables)
            for range e.processables {
            }
            e.log.Info().Msgf("finished draining processables")
            return

        case blockIDHeight := <-e.processables:
            e.log.Debug().Hex("block_id", blockIDHeight.ID[:]).Uint64("height", blockIDHeight.Height).Msg("ingestion core processing block")
            err := e.onProcessableBlock(blockIDHeight.ID, blockIDHeight.Height)
            if err != nil {
                ctx.Throw(fmt.Errorf("execution ingestion engine failed to process block %v (height: %v): %w",
                    blockIDHeight.ID, blockIDHeight.Height, err))
                return
            }
        }
    }
}

func (e *Core) onProcessableBlock(blockID flow.Identifier, height uint64) error {
    // skip if stopControl tells us to skip this block
    if !e.stopControl.ShouldExecuteBlock(blockID, height) {
        return nil
    }

    executed, err := e.execState.IsBlockExecuted(height, blockID)
    if err != nil {
        return fmt.Errorf("could not check whether block %v is executed: %w", blockID, err)
    }

    if executed {
        e.log.Debug().Hex("block_id", blockID[:]).Uint64("height", height).Msg("block has been executed already")
        return nil
    }

    block, err := e.blocks.ByID(blockID)
    if err != nil {
        return fmt.Errorf("failed to get block %s: %w", blockID, err)
    }

    missingColls, executables, err := e.enqueueBlock(block, blockID)
    if err != nil {
        return fmt.Errorf("failed to enqueue block %v: %w", blockID, err)
    }

    lg := e.log.With().
        Hex("block_id", blockID[:]).Uint64("height", height).
        Logger()
    lg.Debug().
        Int("executables", len(executables)).Msg("block is executable")
    e.executeConcurrently(executables)

    missingCount, err := e.fetch(missingColls)
    if err != nil {
        return fmt.Errorf("failed to fetch missing collections: %w", err)
    }

    lg.Debug().Int("missing_collections", missingCount).Msgf("fetch missing collections")

    return nil
}

func (e *Core) enqueueBlock(block *flow.Block, blockID flow.Identifier) (
    []*block_queue.MissingCollection,
    []*entity.ExecutableBlock,
    error,
) {
    lg := e.log.With().
        Hex("block_id", blockID[:]).
        Uint64("height", block.Header.Height).
        Logger()

    lg.Info().Msg("handling new block")

    parentCommitment, err := e.execState.StateCommitmentByBlockID(block.Header.ParentID)

    if err == nil {
        // the parent block is an executed block.
        missingColls, executables, err := e.blockQueue.HandleBlock(block, &parentCommitment)
        if err != nil {
            return nil, nil, fmt.Errorf("unexpected error while adding block to block queue: %w", err)
        }

        lg.Info().Bool("parent_is_executed", true).
            Int("missing_col", len(missingColls)).
            Int("executables", len(executables)).
            Msgf("block is enqueued")

        return missingColls, executables, nil
    }

    // handle exception
    if !errors.Is(err, storage.ErrNotFound) {
        return nil, nil, fmt.Errorf("failed to get state commitment for parent block %v of block %v (height: %v): %w",
            block.Header.ParentID, blockID, block.Header.Height, err)
    }

    // the parent block is an unexecuted block.
    // we can enqueue the block without providing the state commitment
    missingColls, executables, err := e.blockQueue.HandleBlock(block, nil)
    if err != nil {
        if !errors.Is(err, block_queue.ErrMissingParent) {
            return nil, nil, fmt.Errorf("unexpected error while adding block to block queue: %w", err)
        }

        // if the parent is missing, there are two possibilities:
        // 1) the parent was never enqueued to the block queue
        // 2) the parent was enqueued, but it has since been executed and removed from the block queue
        // in fact, only 2) is possible here: forwardProcessableToHandler guarantees that a block is
        // always enqueued before its child, so by the time HandleBlock is called with a block, its
        // parent must already have been passed to HandleBlock, which rules out 1).
        // 2) is possible because the "parent is missing" observation may already be outdated:
        // OnBlockExecuted may run concurrently in another thread, after we read the parent commit
        // and before HandleBlock was called. in that case we should re-enqueue the block with the
        // parent commit, and check again afterwards whether the parent block has been executed.
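        // a possible interleaving, for illustration (T1 is this worker, T2 is a block executor worker):
        //
        //    T1: StateCommitmentByBlockID(parent) -> storage.ErrNotFound (parent not yet executed)
        //    T2: executes the parent, saves its result, blockQueue.OnBlockExecuted(parent) removes it from the queue
        //    T1: blockQueue.HandleBlock(block, nil) -> block_queue.ErrMissingParent
        //    T1: StateCommitmentByBlockID(parent) -> commitment is now found
        //    T1: blockQueue.HandleBlock(block, &parentCommitment) -> succeeds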
        lg.Warn().Msgf(
            "block is missing parent block, re-enqueueing %v (parent: %v)",
            blockID, block.Header.ParentID,
        )

        parentCommitment, err := e.execState.StateCommitmentByBlockID(block.Header.ParentID)
        if err != nil {
            return nil, nil, fmt.Errorf("failed to get parent state commitment when re-enqueueing block %v (parent: %v): %w",
                blockID, block.Header.ParentID, err)
        }

        // now re-enqueue the block with the parent commit
        missing, execs, err := e.blockQueue.HandleBlock(block, &parentCommitment)
        if err != nil {
            return nil, nil, fmt.Errorf("unexpected error while re-enqueueing block to block queue: %w", err)
        }

        missingColls = flow.Deduplicate(append(missingColls, missing...))
        executables = flow.Deduplicate(append(executables, execs...))
    }

    lg.Info().Bool("parent_is_executed", false).
        Int("missing_col", len(missingColls)).
        Int("executables", len(executables)).
        Msgf("block is enqueued")

    return missingColls, executables, nil
}

func (e *Core) onBlockExecuted(
    ctx context.Context,
    block *entity.ExecutableBlock,
    computationResult *execution.ComputationResult,
    startedAt time.Time,
) error {
    commit := computationResult.CurrentEndState()

    e.metrics.ExecutionLastExecutedBlockHeight(block.Block.Header.Height)

    wg := sync.WaitGroup{}
    wg.Add(1)
    defer wg.Wait()

    go func() {
        defer wg.Done()
        e.eventConsumer.BeforeComputationResultSaved(ctx, computationResult)
    }()

    err := e.execState.SaveExecutionResults(ctx, computationResult)
    if err != nil {
        return fmt.Errorf("cannot persist execution state: %w", err)
    }

    blockID := block.ID()
    lg := e.log.With().
        Hex("block_id", blockID[:]).
        Uint64("height", block.Block.Header.Height).
        Logger()

    lg.Debug().Msgf("execution state saved")

    // must call OnBlockExecuted AFTER saving the execution result to storage,
    // because when enqueuing a block, we rely on execState.StateCommitmentByBlockID
    // to determine whether a block has been executed or not.
    executables, err := e.blockQueue.OnBlockExecuted(blockID, commit)
    if err != nil {
        return fmt.Errorf("unexpected error while marking block as executed: %w", err)
    }

    e.stopControl.OnBlockExecuted(block.Block.Header)

    // notify the event consumer so that it can do tasks
    // such as broadcasting or uploading the result
    logs := e.eventConsumer.OnComputationResultSaved(ctx, computationResult)

    receipt := computationResult.ExecutionReceipt
    lg.Info().
        Int("collections", len(block.CompleteCollections)).
        Hex("parent_block", block.Block.Header.ParentID[:]).
        Int("guarantees", len(block.Block.Payload.Guarantees)).
        Hex("start_state", block.StartState[:]).
        Hex("final_state", commit[:]).
        Hex("receipt_id", logging.Entity(receipt)).
        Hex("result_id", logging.Entity(receipt.ExecutionResult)).
        Hex("execution_data_id", receipt.ExecutionResult.ExecutionDataID[:]).
        Bool("state_changed", commit != *block.StartState).
        Uint64("num_txs", nonSystemTransactionCount(receipt.ExecutionResult)).
        Int64("timeSpentInMS", time.Since(startedAt).Milliseconds()).
        Str("logs", logs). // broadcasted
        Int("executables", len(executables)).
        Msgf("block executed")

    // we ensure that child blocks are only executed after the execution result of
    // their parent block has been successfully saved to storage.
    // this guarantees OnBlockExecuted is not called with blocks in the wrong order, such as
    // OnBlockExecuted(childBlock) being called before OnBlockExecuted(parentBlock).

    e.executeConcurrently(executables)
    err = e.throttle.OnBlockExecuted(blockID, block.Block.Header.Height)
    if err != nil {
        return fmt.Errorf("failed to notify throttle that block %v has been executed: %w", blockID, err)
    }

    return nil
}

func (e *Core) onCollection(col *flow.Collection) error {
    colID := col.ID()
    e.log.Info().
        Hex("collection_id", colID[:]).
        Msgf("handle collection")
    // the EN might request a collection from multiple collection nodes and therefore
    // might receive multiple copies of the same collection.
    // we only need to store it once.
    err := storeCollectionIfMissing(e.collections, col)
    if err != nil {
        return fmt.Errorf("failed to store collection %v: %w", col.ID(), err)
    }

    return e.handleCollection(colID, col)
}

func (e *Core) handleCollection(colID flow.Identifier, col *flow.Collection) error {
    // if the collection is a duplicate, it is still worth adding it to the block queue,
    // because chances are the collection was stored before a restart and
    // is no longer in the queue after the restart.
    // adding it to the queue ensures we don't miss any collection.
    // since the queue's state is in memory, processing a duplicated collection is
    // a fast no-op and won't return any executable blocks.
    executables, err := e.blockQueue.HandleCollection(col)
    if err != nil {
        return fmt.Errorf("unexpected error while adding collection to block queue: %w", err)
    }

    e.log.Debug().
        Hex("collection_id", colID[:]).
        Int("executables", len(executables)).Msg("collection is handled, ready to execute block")

    e.executeConcurrently(executables)

    return nil
}

func storeCollectionIfMissing(collections storage.Collections, col *flow.Collection) error {
    _, err := collections.ByID(col.ID())
    if err != nil {
        if !errors.Is(err, storage.ErrNotFound) {
            return fmt.Errorf("failed to get collection %v: %w", col.ID(), err)
        }

        err := collections.Store(col)
        if err != nil {
            return fmt.Errorf("failed to store collection %v: %w", col.ID(), err)
        }
    }

    return nil
}

// executeConcurrently sends the executable blocks to the blockExecutors channel,
// where they are picked up and executed by the block executor workers.
func (e *Core) executeConcurrently(executables []*entity.ExecutableBlock) {
    for _, executable := range executables {
        select {
        case <-e.ShutdownSignal():
            // if the engine has shut down, then stop executing the block
            return
        case e.blockExecutors <- executable:
        }
    }
}
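
// Because blockExecutors is created unbuffered in NewCore, executeConcurrently above blocks
// until one of the MaxConcurrentBlockExecutor worker goroutines picks the block up (or shutdown
// is signalled), which naturally back-pressures block and collection handling onto execution.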

func (e *Core) execute(ctx context.Context, executable *entity.ExecutableBlock) error {
    if !e.stopControl.ShouldExecuteBlock(executable.Block.Header.ID(), executable.Block.Header.Height) {
        return nil
    }

    e.log.Info().
        Hex("block_id", logging.Entity(executable)).
        Uint64("height", executable.Block.Header.Height).
        Int("collections", len(executable.CompleteCollections)).
        Msgf("executing block")

    startedAt := time.Now()

    result, err := e.executor.ExecuteBlock(ctx, executable)
    if err != nil {
        return fmt.Errorf("failed to execute block %v: %w", executable.Block.ID(), err)
    }

    err = e.onBlockExecuted(ctx, executable, result, startedAt)
    if err != nil {
        return fmt.Errorf("failed to handle execution result of block %v: %w", executable.Block.ID(), err)
    }

    return nil
}

func (e *Core) fetch(missingColls []*block_queue.MissingCollection) (int, error) {
    missingCount := 0
    for _, col := range missingColls {
        // if we have requested this collection before, it will have been stored once received,
        // so check storage first to see whether we already have it.
        collection, err := e.collections.ByID(col.Guarantee.CollectionID)

        if err == nil {
            // we found the collection in storage, forward it to the handler
            err = e.handleCollection(col.Guarantee.CollectionID, collection)
            if err != nil {
                return 0, fmt.Errorf("could not handle collection: %w", err)
            }

            continue
        }

        // check if there was an exception
        if !errors.Is(err, storage.ErrNotFound) {
            return 0, fmt.Errorf("error while querying for collection: %w", err)
        }

        err = e.collectionFetcher.FetchCollection(col.BlockID, col.Height, col.Guarantee)
        if err != nil {
            return 0, fmt.Errorf("failed to fetch collection %v for block %v (height: %v): %w",
                col.Guarantee.ID(), col.BlockID, col.Height, err)
        }
        missingCount++
    }

    if missingCount > 0 {
        e.collectionFetcher.Force()
        e.metrics.ExecutionCollectionRequestSent()
    }

    return missingCount, nil
}
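
// Note on the fetch round trip, an informal summary of the flow in this file: FetchCollection
// only requests the missing collection from collection nodes. When a response arrives, the
// requester calls OnCollection, which stores the collection and feeds it back into the block
// queue via handleCollection, potentially releasing the blocks that were waiting on it to the
// block executors.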