github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/state_synchronization/requester/execution_data_requester.go

     1  package requester
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  	"github.com/sethvargo/go-retry"
    11  
    12  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    13  	"github.com/onflow/flow-go/engine"
    14  	"github.com/onflow/flow-go/model/flow"
    15  	"github.com/onflow/flow-go/module"
    16  	"github.com/onflow/flow-go/module/component"
    17  	"github.com/onflow/flow-go/module/executiondatasync/execution_data"
    18  	"github.com/onflow/flow-go/module/executiondatasync/execution_data/cache"
    19  	"github.com/onflow/flow-go/module/irrecoverable"
    20  	"github.com/onflow/flow-go/module/jobqueue"
    21  	"github.com/onflow/flow-go/module/state_synchronization"
    22  	"github.com/onflow/flow-go/module/state_synchronization/requester/jobs"
    23  	"github.com/onflow/flow-go/module/util"
    24  	"github.com/onflow/flow-go/state/protocol"
    25  	"github.com/onflow/flow-go/storage"
    26  	"github.com/onflow/flow-go/utils/logging"
    27  )
    28  
    29  // The ExecutionDataRequester downloads ExecutionData for sealed blocks from other participants in
    30  // the Flow network. The ExecutionData for a sealed block should always be downloadable, since a
    31  // sealed block must have been executed.
    32  //
    33  // Once the ExecutionData for a block is downloaded, the node becomes a seeder for other participants
    34  // on the network using the bitswap protocol. The downloading and seeding work is handled by the
    35  // ExecutionDataService.
    36  //
    37  // The ExecutionDataRequester internally uses a job queue to request and download each sealed block
    38  // with multiple workers. It downloads ExecutionData block by block towards the latest sealed block.
    39  // To ensure it does not miss any sealed blocks, it persists the last downloaded height and only
    40  // increments it when the next height has been downloaded. In the event of a crash, it reads the
    41  // last downloaded height and resumes processing from the next un-downloaded height.
    42  // The requester listens to block finalization events and checks whether the sealed height has
    43  // changed; if so, it creates a job for each sealed height that has not yet been downloaded.
    44  //
    45  // The requester is made up of 3 subcomponents:
    46  //
    47  // * OnBlockFinalized:     receives block finalized events from the finalization distributor and
    48  //                         forwards them to the blockConsumer.
    49  //
    50  // * blockConsumer:        is a jobqueue that receives block finalization events. On each event,
    51  //                         it checks for the latest sealed block, then uses a pool of workers to
    52  //                         download ExecutionData for each block from the network. After each
    53  //                         successful download, the blockConsumer sends a notification to the
    54  //                         notificationConsumer that a new ExecutionData is available.
    55  //
    56  // * notificationConsumer: is a jobqueue that receives ExecutionData fetched events. On each event,
    57  //                         it checks if ExecutionData for the next consecutive block height is
    58  //                         available, then uses a single worker to send notifications to registered
    59  //                         consumers.
    60  //                         The registered consumers are guaranteed to receive each sealed block,
    61  //                         in consecutive height order, at least once.
    62  //
    63  //    +------------------+      +---------------+       +----------------------+
    64  // -->| OnBlockFinalized |----->| blockConsumer |   +-->| notificationConsumer |
    65  //    +------------------+      +-------+-------+   |   +-----------+----------+
    66  //                                      |           |               |
    67  //                               +------+------+    |        +------+------+
    68  //                            xN | Worker Pool |----+     x1 | Worker Pool |----> Registered consumers
    69  //                               +-------------+             +-------------+
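        //
        // Illustrative wiring sketch (not part of the original file): how a node might construct the
        // requester and connect its inputs and outputs. The variables logger, metrics, downloader,
        // execDataCache, processedHeight, processedNotifications, protocolState, headers, and config,
        // as well as followerDistributor and both registration calls, are assumptions about the
        // surrounding flow-go code; only New and OnBlockFinalized are defined in this file.
        //
        //	distributor := NewExecutionDataDistributor()
        //
        //	requester, err := New(
        //		logger,
        //		metrics,
        //		downloader,
        //		execDataCache,
        //		processedHeight,        // storage.ConsumerProgress tracking downloaded heights
        //		processedNotifications, // storage.ConsumerProgress tracking notified heights
        //		protocolState,
        //		headers,
        //		config,
        //		distributor,
        //	)
        //	if err != nil {
        //		return fmt.Errorf("could not create execution data requester: %w", err)
        //	}
        //
        //	// feed finalization events in, and receive fetched execution data out
        //	followerDistributor.AddOnBlockFinalizedConsumer(requester.OnBlockFinalized)
        //	distributor.AddOnExecutionDataReceivedConsumer(func(data *execution_data.BlockExecutionDataEntity) {
        //		// handle execution data for the next consecutive sealed height (hypothetical handler)
        //	})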
    70  
    71  const (
    72  	// DefaultFetchTimeout is the default initial timeout for fetching ExecutionData from the
    73  	// db/network. The timeout is increased using an incremental backoff, up to MaxFetchTimeout.
    74  	DefaultFetchTimeout = 10 * time.Second
    75  
    76  	// DefaultMaxFetchTimeout is the default timeout for fetching ExecutionData from the db/network
    77  	DefaultMaxFetchTimeout = 10 * time.Minute
    78  
    79  	// DefaultRetryDelay is the default initial delay used in the exponential backoff for failed
    80  	// ExecutionData download retries
    81  	DefaultRetryDelay = 1 * time.Second
    82  
    83  	// DefaultMaxRetryDelay is the default maximum delay used in the exponential backoff for failed
    84  	// ExecutionData download retries
    85  	DefaultMaxRetryDelay = 5 * time.Minute
    86  
    87  	// DefaultMaxSearchAhead is the default max number of unsent notifications to allow before
    88  	// pausing new fetches.
    89  	DefaultMaxSearchAhead = 5000
    90  
    91  	// Number of goroutines to use for downloading new ExecutionData from the network.
    92  	fetchWorkers = 4
    93  )
    94  
    95  // ExecutionDataConfig contains configuration options for the ExecutionDataRequester
    96  type ExecutionDataConfig struct {
    97  	// The initial value to use as the last processed block height. This should be the
    98  	// first block height to sync - 1
    99  	InitialBlockHeight uint64
   100  
   101  	// Max number of unsent notifications to allow before pausing new fetches. After exceeding this
   102  	// limit, the requester will stop processing new finalized block notifications. This prevents
   103  	// unbounded memory use by the requester if it gets stuck fetching a specific height.
   104  	MaxSearchAhead uint64
   105  
   106  	// The initial timeout for fetching ExecutionData from the db/network
   107  	FetchTimeout time.Duration
   108  
   109  	// The max timeout for fetching ExecutionData from the db/network
   110  	MaxFetchTimeout time.Duration
   111  
   112  	// Exponential backoff settings for download retries
   113  	RetryDelay    time.Duration
   114  	MaxRetryDelay time.Duration
   115  }
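        // Illustrative sketch (not part of the original file): an ExecutionDataConfig built from the
        // package defaults above. firstHeightToSync is a hypothetical value; per the field comment,
        // InitialBlockHeight is the height just below the first block to sync.
        //
        //	config := ExecutionDataConfig{
        //		InitialBlockHeight: firstHeightToSync - 1,
        //		MaxSearchAhead:     DefaultMaxSearchAhead,
        //		FetchTimeout:       DefaultFetchTimeout,
        //		MaxFetchTimeout:    DefaultMaxFetchTimeout,
        //		RetryDelay:         DefaultRetryDelay,
        //		MaxRetryDelay:      DefaultMaxRetryDelay,
        //	}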
   116  
   117  type executionDataRequester struct {
   118  	component.Component
   119  	downloader execution_data.Downloader
   120  	metrics    module.ExecutionDataRequesterMetrics
   121  	config     ExecutionDataConfig
   122  	log        zerolog.Logger
   123  
   124  	// Local db objects
   125  	headers storage.Headers
   126  
   127  	executionDataReader *jobs.ExecutionDataReader
   128  
   129  	// Notifiers for queue consumers
   130  	finalizationNotifier engine.Notifier
   131  
   132  	// Job queues
   133  	blockConsumer        *jobqueue.ComponentConsumer
   134  	notificationConsumer *jobqueue.ComponentConsumer
   135  
   136  	execDataCache *cache.ExecutionDataCache
   137  	distributor   *ExecutionDataDistributor
   138  }
   139  
   140  var _ state_synchronization.ExecutionDataRequester = (*executionDataRequester)(nil)
   141  
   142  // New creates a new execution data requester component
   143  func New(
   144  	log zerolog.Logger,
   145  	edrMetrics module.ExecutionDataRequesterMetrics,
   146  	downloader execution_data.Downloader,
   147  	execDataCache *cache.ExecutionDataCache,
   148  	processedHeight storage.ConsumerProgress,
   149  	processedNotifications storage.ConsumerProgress,
   150  	state protocol.State,
   151  	headers storage.Headers,
   152  	cfg ExecutionDataConfig,
   153  	distributor *ExecutionDataDistributor,
   154  ) (state_synchronization.ExecutionDataRequester, error) {
   155  	e := &executionDataRequester{
   156  		log:                  log.With().Str("component", "execution_data_requester").Logger(),
   157  		downloader:           downloader,
   158  		execDataCache:        execDataCache,
   159  		metrics:              edrMetrics,
   160  		headers:              headers,
   161  		config:               cfg,
   162  		finalizationNotifier: engine.NewNotifier(),
   163  		distributor:          distributor,
   164  	}
   165  
   166  	executionDataNotifier := engine.NewNotifier()
   167  
   168  	// jobqueue Jobs object that tracks sealed blocks by height. This is used by the blockConsumer
   169  	// to get a sequential list of sealed blocks.
   170  	sealedBlockReader := jobqueue.NewSealedBlockHeaderReader(state, headers)
   171  
   172  	// blockConsumer ensures every sealed block's execution data is downloaded.
   173  	// It listens to block finalization events from `finalizationNotifier`, then checks if there
   174  	// are new sealed blocks with `sealedBlockReader`. If there are, it starts workers to process
   175  	// them with `processBlockJob`, which fetches execution data. At most `fetchWorkers` workers
   176  	// will be created for concurrent processing. When a sealed block's execution data has been
   177  	// downloaded, it updates and persists the highest consecutive downloaded height with
   178  	// `processedHeight`. That way, if the node crashes, it reads the `processedHeight` and resumes
   179  	// from `processedHeight + 1`. If the database is empty, `InitialBlockHeight` is used to
   180  	// initialize the last processed height. Once the execution data is fetched and stored, it
   181  	// notifies `executionDataNotifier`.
   182  	blockConsumer, err := jobqueue.NewComponentConsumer(
   183  		e.log.With().Str("module", "block_consumer").Logger(),
   184  		e.finalizationNotifier.Channel(), // to listen to finalization events to find newly sealed blocks
   185  		processedHeight,                  // read and persist the downloaded height
   186  		sealedBlockReader,                // read sealed blocks by height
   187  		e.config.InitialBlockHeight,      // initial "last processed" height for empty db
   188  		e.processBlockJob,                // process the sealed block job to download its execution data
   189  		fetchWorkers,                     // the number of concurrent workers
   190  		e.config.MaxSearchAhead,          // max number of unsent notifications to allow before pausing new fetches
   191  	)
   192  	if err != nil {
   193  		return nil, fmt.Errorf("failed to create block consumer: %w", err)
   194  	}
   195  	e.blockConsumer = blockConsumer
   196  
   197  	// notifies the notificationConsumer when new ExecutionData blobs are available
   198  	// SetPostNotifier will notify executionDataNotifier AFTER e.blockConsumer.LastProcessedIndex is updated.
   199  	// Even though this does not guarantee a notification for every height, the notificationConsumer is
   200  	// still guaranteed to process every height at least once, because it discovers new jobs using the
   201  	// executionDataReader, which finds new heights via e.blockConsumer.LastProcessedIndex.
   202  	e.blockConsumer.SetPostNotifier(func(module.JobID) { executionDataNotifier.Notify() })
   203  
   204  	// jobqueue Jobs object that tracks downloaded execution data by height. This is used by the
   205  	// notificationConsumer to get downloaded execution data from storage.
   206  	e.executionDataReader = jobs.NewExecutionDataReader(
   207  		e.execDataCache,
   208  		e.config.FetchTimeout,
   209  		// method to get the highest consecutive height whose execution data has been downloaded.
   210  		// It is used here by the notification job consumer to discover new jobs.
   211  		// Note: we don't want to notify the notificationConsumer for a block whose execution data
   212  		// has not been downloaded yet.
   213  		func() (uint64, error) {
   214  			return e.blockConsumer.LastProcessedIndex(), nil
   215  		},
   216  	)
   217  
   218  	// notificationConsumer consumes `OnExecutionDataFetched` events, and ensures its consumers
   219  	// receive these events in consecutive block height order.
   220  	// It listens to events from `executionDataNotifier`, which are delivered when a block's
   221  	// execution data is downloaded and stored, and checks the `executionDataCache` to find
   222  	// whether the next un-processed consecutive height is available.
   223  	// To determine the next un-processed consecutive height, it reads the latest consecutive
   224  	// height from `processedNotifications`, which is persisted in storage to be crash-resistant.
   225  	// When a new consecutive height is available, it calls `processNotificationJob` to notify all
   226  	// consumers registered with `e.distributor`.
   227  	// Note: the registered consumers are guaranteed to receive at least one `OnExecutionDataFetched`
   228  	// event for each sealed block, in consecutive block height order.
   229  	e.notificationConsumer, err = jobqueue.NewComponentConsumer(
   230  		e.log.With().Str("module", "notification_consumer").Logger(),
   231  		executionDataNotifier.Channel(), // listen for notifications from the block consumer
   232  		processedNotifications,          // read and persist the notified height
   233  		e.executionDataReader,           // read execution data by height
   234  		e.config.InitialBlockHeight,     // initial "last processed" height for empty db
   235  		e.processNotificationJob,        // process the job to send notifications for an execution data
   236  		1,                               // use a single worker to ensure notification is delivered in consecutive order
   237  		0,                               // search ahead limit controlled by worker count
   238  	)
   239  	if err != nil {
   240  		return nil, fmt.Errorf("failed to create notification consumer: %w", err)
   241  	}
   242  
   243  	e.Component = component.NewComponentManagerBuilder().
   244  		AddWorker(e.runBlockConsumer).
   245  		AddWorker(e.runNotificationConsumer).
   246  		Build()
   247  
   248  	return e, nil
   249  }
   250  
   251  // OnBlockFinalized accepts block finalization notifications from the FollowerDistributor
   252  func (e *executionDataRequester) OnBlockFinalized(*model.Block) {
   253  	e.finalizationNotifier.Notify()
   254  }
   255  
   256  // HighestConsecutiveHeight returns the highest consecutive block height for which ExecutionData
   257  // has been received.
   258  // This method must only be called after the component is Ready. If it is called early, an error is returned.
   259  func (e *executionDataRequester) HighestConsecutiveHeight() (uint64, error) {
   260  	select {
   261  	case <-e.blockConsumer.Ready():
   262  	default:
   263  		// LastProcessedIndex is not meaningful until the component has completed startup
   264  		return 0, fmt.Errorf("HighestConsecutiveHeight must not be called before the component is ready")
   265  	}
   266  
   267  	return e.blockConsumer.LastProcessedIndex(), nil
   268  }
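        // Illustrative usage sketch (not part of the original file): callers wait for the requester to
        // become ready before querying. req and logger are assumed; req is a
        // state_synchronization.ExecutionDataRequester, and Ready comes from the embedded component.Component.
        //
        //	<-req.Ready()
        //	height, err := req.HighestConsecutiveHeight()
        //	if err != nil {
        //		return fmt.Errorf("requester not ready: %w", err)
        //	}
        //	logger.Info().Uint64("height", height).Msg("execution data available up to height")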
   269  
   270  // runBlockConsumer runs the blockConsumer component
   271  func (e *executionDataRequester) runBlockConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   272  	err := util.WaitClosed(ctx, e.downloader.Ready())
   273  	if err != nil {
   274  		return // context cancelled
   275  	}
   276  
   277  	err = util.WaitClosed(ctx, e.notificationConsumer.Ready())
   278  	if err != nil {
   279  		return // context cancelled
   280  	}
   281  
   282  	e.blockConsumer.Start(ctx)
   283  
   284  	err = util.WaitClosed(ctx, e.blockConsumer.Ready())
   285  	if err == nil {
   286  		ready()
   287  	}
   288  
   289  	<-e.blockConsumer.Done()
   290  }
   291  
   292  // runNotificationConsumer runs the notificationConsumer component
   293  func (e *executionDataRequester) runNotificationConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   294  	e.executionDataReader.AddContext(ctx)
   295  	e.notificationConsumer.Start(ctx)
   296  
   297  	err := util.WaitClosed(ctx, e.notificationConsumer.Ready())
   298  	if err == nil {
   299  		ready()
   300  	}
   301  
   302  	<-e.notificationConsumer.Done()
   303  }
   304  
   305  // Fetch Worker Methods
   306  
   307  // processBlockJob consumes jobs from the blockConsumer and attempts to download an ExecutionData
   308  // for the given block height.
   309  func (e *executionDataRequester) processBlockJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
   310  	// convert job into a block entry
   311  	header, err := jobqueue.JobToBlockHeader(job)
   312  	if err != nil {
   313  		ctx.Throw(fmt.Errorf("failed to convert job to block: %w", err))
   314  	}
   315  
   316  	err = e.processSealedHeight(ctx, header.ID(), header.Height)
   317  	if err == nil {
   318  		jobComplete()
   319  		return
   320  	}
   321  
   322  	// all errors are thrown as irrecoverable errors except context cancellation and invalid blobs.
   323  	// Invalid blobs are logged and never marked complete, which will halt downloads once maxSearchAhead
   324  	// is reached.
   325  	e.log.Error().Err(err).Str("job_id", string(job.ID())).Msg("error encountered while processing block job")
   326  }
   327  
   328  // processSealedHeight downloads ExecutionData for the given block height.
   329  // If the download fails, it retries indefinitely using exponential backoff, unless the blob is invalid.
   330  func (e *executionDataRequester) processSealedHeight(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64) error {
   331  	backoff := retry.NewExponential(e.config.RetryDelay)
   332  	backoff = retry.WithCappedDuration(e.config.MaxRetryDelay, backoff)
   333  	backoff = retry.WithJitterPercent(15, backoff)
   334  
   335  	// bitswap always waits for either all data to be received or a timeout, even if it encountered an error.
   336  	// Use an incremental backoff for the timeout so we do faster initial retries, then allow for more
   337  	// time in case data is large or there is network congestion (see the sketch after this function).
   338  	timeout := retry.NewExponential(e.config.FetchTimeout)
   339  	timeout = retry.WithCappedDuration(e.config.MaxFetchTimeout, timeout)
   340  
   341  	attempt := 0
   342  	return retry.Do(ctx, backoff, func(context.Context) error {
   343  		if attempt > 0 {
   344  			e.log.Debug().
   345  				Str("block_id", blockID.String()).
   346  				Uint64("height", height).
   347  				Uint64("attempt", uint64(attempt)).
   348  				Msgf("retrying download")
   349  
   350  			e.metrics.FetchRetried()
   351  		}
   352  		attempt++
   353  
   354  		// download execution data for the block
   355  		fetchTimeout, _ := timeout.Next()
   356  		err := e.processFetchRequest(ctx, blockID, height, fetchTimeout)
   357  
   358  		// don't retry if the blob was invalid
   359  		if isInvalidBlobError(err) {
   360  			return err
   361  		}
   362  
   363  		return retry.RetryableError(err)
   364  	})
   365  }
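        // Illustrative sketch (not part of the original file) of the per-attempt fetch timeout used above:
        // it grows exponentially from FetchTimeout and is capped at MaxFetchTimeout, independently of the
        // jittered delay between retries. The printed values assume the package defaults.
        //
        //	timeout := retry.WithCappedDuration(DefaultMaxFetchTimeout, retry.NewExponential(DefaultFetchTimeout))
        //	for i := 0; i < 8; i++ {
        //		d, _ := timeout.Next()
        //		fmt.Println(d) // roughly 10s, 20s, 40s, ... capped at 10m0s
        //	}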
   366  
   367  func (e *executionDataRequester) processFetchRequest(parentCtx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64, fetchTimeout time.Duration) error {
   368  	logger := e.log.With().
   369  		Str("block_id", blockID.String()).
   370  		Uint64("height", height).
   371  		Logger()
   372  
   373  	logger.Debug().Msg("processing fetch request")
   374  
   375  	start := time.Now()
   376  	e.metrics.ExecutionDataFetchStarted()
   377  
   378  	logger.Debug().Msg("downloading execution data")
   379  
   380  	ctx, cancel := context.WithTimeout(parentCtx, fetchTimeout)
   381  	defer cancel()
   382  
   383  	execData, err := e.execDataCache.ByBlockID(ctx, blockID)
   384  
   385  	e.metrics.ExecutionDataFetchFinished(time.Since(start), err == nil, height)
   386  
   387  	if isInvalidBlobError(err) {
   388  		// This means an execution result was sealed with an invalid execution data id (invalid data).
   389  		// Eventually, verification nodes will verify that the execution data is valid, and not sign the receipt
   390  		logger.Error().Err(err).Msg("HALTING REQUESTER: invalid execution data found")
   391  
   392  		return err
   393  	}
   394  
   395  	// Some or all of the blobs were missing or corrupt; retry.
   396  	if isBlobNotFoundError(err) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
   397  		logger.Error().Err(err).Msg("failed to get execution data for block")
   398  
   399  		return err
   400  	}
   401  
   402  	// Any other error is unexpected
   403  	if err != nil {
   404  		logger.Error().Err(err).Msg("unexpected error fetching execution data")
   405  
   406  		parentCtx.Throw(err)
   407  	}
   408  
   409  	logger.Info().
   410  		Hex("execution_data_id", logging.ID(execData.ID())).
   411  		Msg("execution data fetched")
   412  
   413  	return nil
   414  }
   415  
   416  // Notification Worker Methods
   417  
   418  func (e *executionDataRequester) processNotificationJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
   419  	// convert job into a block entry
   420  	entry, err := jobs.JobToBlockEntry(job)
   421  	if err != nil {
   422  		ctx.Throw(fmt.Errorf("failed to convert job to entry: %w", err))
   423  	}
   424  
   425  	e.log.Debug().
   426  		Hex("block_id", logging.ID(entry.BlockID)).
   427  		Uint64("height", entry.Height).
   428  		Msgf("notifying for block")
   429  
   430  	// send notifications
   431  	e.distributor.OnExecutionDataReceived(entry.ExecutionData)
   432  	jobComplete()
   433  
   434  	e.metrics.NotificationSent(entry.Height)
   435  }
   436  
   437  func isInvalidBlobError(err error) bool {
   438  	var malformedDataError *execution_data.MalformedDataError
   439  	var blobSizeLimitExceededError *execution_data.BlobSizeLimitExceededError
   440  	return errors.As(err, &malformedDataError) ||
   441  		errors.As(err, &blobSizeLimitExceededError)
   442  }
   443  
   444  func isBlobNotFoundError(err error) bool {
   445  	var blobNotFoundError *execution_data.BlobNotFoundError
   446  	return errors.As(err, &blobNotFoundError)
   447  }