github.com/koko1123/flow-go-1@v0.29.6/module/state_synchronization/requester/execution_data_requester.go (about)

     1  package requester
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/rs/zerolog"
    11  	"github.com/sethvargo/go-retry"
    12  
    13  	"github.com/koko1123/flow-go-1/consensus/hotstuff/model"
    14  	"github.com/koko1123/flow-go-1/engine"
    15  	"github.com/koko1123/flow-go-1/model/flow"
    16  	"github.com/koko1123/flow-go-1/module"
    17  	"github.com/koko1123/flow-go-1/module/component"
    18  	"github.com/koko1123/flow-go-1/module/executiondatasync/execution_data"
    19  	"github.com/koko1123/flow-go-1/module/irrecoverable"
    20  	"github.com/koko1123/flow-go-1/module/jobqueue"
    21  	"github.com/koko1123/flow-go-1/module/state_synchronization"
    22  	"github.com/koko1123/flow-go-1/module/state_synchronization/requester/jobs"
    23  	"github.com/koko1123/flow-go-1/module/util"
    24  	"github.com/koko1123/flow-go-1/state/protocol"
    25  	"github.com/koko1123/flow-go-1/storage"
    26  )
    27  
    28  // The ExecutionDataRequester downloads ExecutionData for sealed blocks from other participants in
    29  // the flow network. The ExecutionData for a sealed block should always be downloadable, since a
    30  // sealed block must have been executed.
    31  //
    32  // Once the ExecutionData for a block is downloaded, the node becomes a seeder for other participants
    33  // on the network using the bitswap protocol. The downloading and seeding work is handled by the
    34  // ExecutionDataService.
    35  //
    36  // The ExecutionDataRequester internally uses a job queue to request and download each sealed block
    37  // with multiple workers. It downloads ExecutionData block by block towards the latest sealed block.
    38  // In order to ensure it does not miss any sealed block to download, it persists the last downloaded
    39  // height, and only increments it when the next height has been downloaded. In the event of a crash
    40  // failure, it will read the last downloaded height, and process from the next un-downloaded height.
    41  // The requester listens to block finalization events and checks whether the sealed height has
    42  // changed. If it has, it creates a job for each sealed height that has not been downloaded yet.
    43  //
    44  // The requester is made up of 3 subcomponents:
    45  //
    46  // * OnBlockFinalized:     receives block finalized events from the finalization distributor and
    47  //                         forwards them to the blockConsumer.
    48  //
    49  // * blockConsumer:        is a jobqueue that receives block finalization events. On each event,
    50  //                         it checks for the latest sealed block, then uses a pool of workers to
    51  //                         download ExecutionData for each block from the network. After each
    52  //                         successful download, the blockConsumer sends a notification to the
    53  //                         notificationConsumer that a new ExecutionData is available.
    54  //
    55  // * notificationConsumer: is a jobqueue that receives ExecutionData fetched events. On each event,
    56  //                         it checks if ExecutionData for the next consecutive block height is
    57  //                         available, then uses a single worker to send notifications to registered
    58  //                         consumers.
    59  //                         the registered consumers are guaranteed to receive each sealed block in
    60  //                         consecutive height at least once.
    61  //
    62  //    +------------------+      +---------------+       +----------------------+
    63  // -->| OnBlockFinalized |----->| blockConsumer |   +-->| notificationConsumer |
    64  //    +------------------+      +-------+-------+   |   +-----------+----------+
    65  //                                      |           |               |
    66  //                               +------+------+    |        +------+------+
    67  //                            xN | Worker Pool |----+     x1 | Worker Pool |----> Registered consumers
    68  //                               +-------------+             +-------------+
    69  
const (
	// DefaultFetchTimeout is the default initial timeout for fetching ExecutionData from the
	// db/network. The requester grows the per-attempt timeout with an exponential backoff that is
	// capped at the configured MaxFetchTimeout.
	DefaultFetchTimeout = 10 * time.Second

	// DefaultMaxFetchTimeout is the default maximum timeout for fetching ExecutionData from the
	// db/network.
	DefaultMaxFetchTimeout = 10 * time.Minute

	// DefaultRetryDelay is the default initial delay used in the exponential backoff for failed
	// ExecutionData download retries.
	DefaultRetryDelay = 1 * time.Second

	// DefaultMaxRetryDelay is the default maximum delay used in the exponential backoff for failed
	// ExecutionData download retries.
	DefaultMaxRetryDelay = 5 * time.Minute

	// DefaultMaxSearchAhead is the default max number of unsent notifications to allow before
	// pausing new fetches.
	DefaultMaxSearchAhead = 5000

	// fetchWorkers is the number of goroutines to use for downloading new ExecutionData from the
	// network.
	fetchWorkers = 4
)
    93  
// ExecutionDataConfig contains configuration options for the ExecutionDataRequester.
type ExecutionDataConfig struct {
	// The initial value to use as the last processed block height. This should be the
	// first block height to sync - 1. It is used by both the block consumer and the
	// notification consumer when no progress has been persisted yet.
	InitialBlockHeight uint64

	// Max number of unsent notifications to allow before pausing new fetches. After exceeding this
	// limit, the requester will stop processing new finalized block notifications. This prevents
	// unbounded memory use by the requester if it gets stuck fetching a specific height.
	MaxSearchAhead uint64

	// The initial timeout for fetching ExecutionData from the db/network. It is the starting
	// value of the per-attempt timeout, which grows exponentially on each retry.
	FetchTimeout time.Duration

	// The max timeout for fetching ExecutionData from the db/network. The per-attempt timeout
	// is capped at this value.
	MaxFetchTimeout time.Duration

	// Exponential backoff settings for download retries: RetryDelay is the initial delay
	// between attempts and MaxRetryDelay is the cap applied to that delay.
	RetryDelay    time.Duration
	MaxRetryDelay time.Duration
}
   115  
// executionDataRequester downloads ExecutionData for sealed blocks and notifies registered
// consumers once the data for each height is available. It is assembled by New.
type executionDataRequester struct {
	// Component is set to cm in New, so the requester's lifecycle is that of its component manager.
	component.Component
	cm         *component.ComponentManager
	downloader execution_data.Downloader
	metrics    module.ExecutionDataRequesterMetrics
	config     ExecutionDataConfig
	log        zerolog.Logger

	// Local db objects
	headers storage.Headers
	results storage.ExecutionResults
	seals   storage.Seals

	// executionDataReader is the job source of the notificationConsumer.
	executionDataReader *jobs.ExecutionDataReader

	// Notifiers for queue consumers
	// finalizationNotifier is signalled by OnBlockFinalized and wakes the blockConsumer.
	finalizationNotifier engine.Notifier

	// Job queues
	// blockConsumer runs processBlockJob; notificationConsumer runs processNotificationJob.
	blockConsumer        *jobqueue.ComponentConsumer
	notificationConsumer *jobqueue.ComponentConsumer

	// List of callbacks to call when ExecutionData is successfully fetched for a block
	consumers []state_synchronization.ExecutionDataReceivedCallback

	// consumerMu guards consumers: written under Lock when registering a callback, read under
	// RLock when notifying.
	consumerMu sync.RWMutex
}

// Compile-time check that executionDataRequester implements the ExecutionDataRequester interface.
var _ state_synchronization.ExecutionDataRequester = (*executionDataRequester)(nil)
   145  
// New creates a new execution data requester component.
//
// It wires up two job queue consumers and registers one worker per consumer with a component
// manager; nothing is started until the returned component is started.
//
// Parameters:
//   - log: base logger; a "component" field is added to it.
//   - edrMetrics: metrics collector for fetches, retries and notifications.
//   - downloader: used to download ExecutionData.
//   - processedHeight: persisted progress of the blockConsumer.
//   - processedNotifications: persisted progress of the notificationConsumer.
//   - state, headers: used to build the reader of sealed block headers.
//   - headers, results, seals: also handed to the execution data reader; results and seals are
//     used to resolve a block's execution result before downloading.
//   - cfg: requester configuration (initial height, search-ahead limit, timeouts, retry delays).
func New(
	log zerolog.Logger,
	edrMetrics module.ExecutionDataRequesterMetrics,
	downloader execution_data.Downloader,
	processedHeight storage.ConsumerProgress,
	processedNotifications storage.ConsumerProgress,
	state protocol.State,
	headers storage.Headers,
	results storage.ExecutionResults,
	seals storage.Seals,
	cfg ExecutionDataConfig,
) state_synchronization.ExecutionDataRequester {
	e := &executionDataRequester{
		log:                  log.With().Str("component", "execution_data_requester").Logger(),
		downloader:           downloader,
		metrics:              edrMetrics,
		headers:              headers,
		results:              results,
		seals:                seals,
		config:               cfg,
		finalizationNotifier: engine.NewNotifier(),
	}

	// executionDataNotifier is signalled by the blockConsumer and wakes the notificationConsumer.
	executionDataNotifier := engine.NewNotifier()

	// jobqueue Jobs object that tracks sealed blocks by height. This is used by the blockConsumer
	// to get a sequential list of sealed blocks.
	sealedBlockReader := jobqueue.NewSealedBlockHeaderReader(state, headers)

	// blockConsumer ensures every sealed block's execution data is downloaded.
	// It listens to block finalization events from `finalizationNotifier`, then checks if there
	// are new sealed blocks with `sealedBlockReader`. If there are, it starts workers to process
	// them with `processBlockJob`, which fetches execution data. At most `fetchWorkers` workers
	// will be created for concurrent processing. When a sealed block's execution data has been
	// downloaded, it updates and persists the highest consecutive downloaded height with
	// `processedHeight`. That way, if the node crashes, it reads the `processedHeight` and resumes
	// from `processedHeight + 1`. If the database is empty, `InitialBlockHeight` from the config
	// is used to init the last processed height. Once the execution data is fetched and stored,
	// it notifies `executionDataNotifier`.
	e.blockConsumer = jobqueue.NewComponentConsumer(
		e.log.With().Str("module", "block_consumer").Logger(),
		e.finalizationNotifier.Channel(), // to listen to finalization events to find newly sealed blocks
		processedHeight,                  // read and persist the downloaded height
		sealedBlockReader,                // read sealed blocks by height
		e.config.InitialBlockHeight,      // initial "last processed" height for empty db
		e.processBlockJob,                // process the sealed block job to download its execution data
		fetchWorkers,                     // the number of concurrent workers
		e.config.MaxSearchAhead,          // max number of unsent notifications to allow before pausing new fetches
	)
	// notifies notificationConsumer when new ExecutionData blobs are available
	// SetPostNotifier will notify executionDataNotifier AFTER e.blockConsumer.LastProcessedIndex is updated.
	// Even though it doesn't guarantee to notify for every height at least once, the notificationConsumer is
	// able to guarantee to process every height at least once, because the notificationConsumer finds new jobs
	// using executionDataReader, which finds new heights using e.blockConsumer.LastProcessedIndex
	e.blockConsumer.SetPostNotifier(func(module.JobID) { executionDataNotifier.Notify() })

	// jobqueue Jobs object tracks downloaded execution data by height. This is used by the
	// notificationConsumer to get downloaded execution data from storage.
	// It must be created after e.blockConsumer, since it is handed the consumer's
	// LastProcessedIndex method.
	e.executionDataReader = jobs.NewExecutionDataReader(
		e.downloader,
		e.headers,
		e.results,
		e.seals,
		e.config.FetchTimeout,
		// method to get highest consecutive height that has downloaded execution data. it is used
		// here by the notification job consumer to discover new jobs.
		// Note: we don't want to notify notificationConsumer for a block if it has not downloaded
		// execution data yet.
		e.blockConsumer.LastProcessedIndex,
	)

	// notificationConsumer consumes `OnExecutionDataFetched` events, and ensures its consumer
	// receives this event in consecutive block height order.
	// It listens to events from `executionDataNotifier`, which are delivered when
	// a block's execution data is downloaded and stored, and checks the `executionDataReader` to
	// find if the next un-processed consecutive height is available.
	// To know the height of the next un-processed consecutive height, it reads the latest
	// consecutive height in `processedNotifications`, which is persisted in storage to be
	// crash-resistant.
	// When a new consecutive height is available, it calls `processNotificationJob` to notify all the
	// `e.consumers`.
	// Note: the `e.consumers` will be guaranteed to receive at least one `OnExecutionDataFetched` event
	// for each sealed block in consecutive block height order.
	e.notificationConsumer = jobqueue.NewComponentConsumer(
		e.log.With().Str("module", "notification_consumer").Logger(),
		executionDataNotifier.Channel(), // listen for notifications from the block consumer
		processedNotifications,          // read and persist the notified height
		e.executionDataReader,           // read execution data by height
		e.config.InitialBlockHeight,     // initial "last processed" height for empty db
		e.processNotificationJob,        // process the job to send notifications for an execution data
		1,                               // use a single worker to ensure notification is delivered in consecutive order
		0,                               // search ahead limit controlled by worker count
	)

	// One worker per consumer; each worker starts its consumer and blocks until it is done.
	builder := component.NewComponentManagerBuilder().
		AddWorker(e.runBlockConsumer).
		AddWorker(e.runNotificationConsumer)

	e.cm = builder.Build()
	e.Component = e.cm

	return e
}
   249  
   250  // OnBlockFinalized accepts block finalization notifications from the FinalizationDistributor
   251  func (e *executionDataRequester) OnBlockFinalized(*model.Block) {
   252  	e.finalizationNotifier.Notify()
   253  }
   254  
   255  // AddOnExecutionDataFetchedConsumer adds a callback to be called when a new ExecutionData is received
   256  // Callback Implementations must:
   257  //   - be concurrency safe
   258  //   - be non-blocking
   259  //   - handle repetition of the same events (with some processing overhead).
   260  func (e *executionDataRequester) AddOnExecutionDataFetchedConsumer(fn state_synchronization.ExecutionDataReceivedCallback) {
   261  	e.consumerMu.Lock()
   262  	defer e.consumerMu.Unlock()
   263  
   264  	e.consumers = append(e.consumers, fn)
   265  }
   266  
   267  // runBlockConsumer runs the blockConsumer component
   268  func (e *executionDataRequester) runBlockConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   269  	err := util.WaitClosed(ctx, e.downloader.Ready())
   270  	if err != nil {
   271  		return // context cancelled
   272  	}
   273  
   274  	err = util.WaitClosed(ctx, e.notificationConsumer.Ready())
   275  	if err != nil {
   276  		return // context cancelled
   277  	}
   278  
   279  	e.blockConsumer.Start(ctx)
   280  
   281  	err = util.WaitClosed(ctx, e.blockConsumer.Ready())
   282  	if err == nil {
   283  		ready()
   284  	}
   285  
   286  	<-e.blockConsumer.Done()
   287  }
   288  
   289  // runNotificationConsumer runs the notificationConsumer component
   290  func (e *executionDataRequester) runNotificationConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   291  	e.executionDataReader.AddContext(ctx)
   292  	e.notificationConsumer.Start(ctx)
   293  
   294  	err := util.WaitClosed(ctx, e.notificationConsumer.Ready())
   295  	if err == nil {
   296  		ready()
   297  	}
   298  
   299  	<-e.notificationConsumer.Done()
   300  }
   301  
   302  // Fetch Worker Methods
   303  
   304  // processBlockJob consumes jobs from the blockConsumer and attempts to download an ExecutionData
   305  // for the given block height.
   306  func (e *executionDataRequester) processBlockJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
   307  	// convert job into a block entry
   308  	header, err := jobqueue.JobToBlockHeader(job)
   309  	if err != nil {
   310  		ctx.Throw(fmt.Errorf("failed to convert job to block: %w", err))
   311  	}
   312  
   313  	err = e.processSealedHeight(ctx, header.ID(), header.Height)
   314  	if err == nil {
   315  		jobComplete()
   316  		return
   317  	}
   318  
   319  	// errors are thrown as irrecoverable errors except context cancellation, and invalid blobs
   320  	// invalid blobs are logged, and never completed, which will halt downloads after maxSearchAhead
   321  	// is reached.
   322  	e.log.Error().Err(err).Str("job_id", string(job.ID())).Msg("error encountered while processing block job")
   323  }
   324  
   325  // processSealedHeight downloads ExecutionData for the given block height.
   326  // If the download fails, it will retry forever, using exponential backoff.
   327  func (e *executionDataRequester) processSealedHeight(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64) error {
   328  	backoff := retry.NewExponential(e.config.RetryDelay)
   329  	backoff = retry.WithCappedDuration(e.config.MaxRetryDelay, backoff)
   330  	backoff = retry.WithJitterPercent(15, backoff)
   331  
   332  	// bitswap always waits for either all data to be received or a timeout, even if it encountered an error.
   333  	// use an incremental backoff for the timeout so we do faster initial retries, then allow for more
   334  	// time in case data is large or there is network congestion.
   335  	timeout := retry.NewExponential(e.config.FetchTimeout)
   336  	timeout = retry.WithCappedDuration(e.config.MaxFetchTimeout, timeout)
   337  
   338  	attempt := 0
   339  	return retry.Do(ctx, backoff, func(context.Context) error {
   340  		if attempt > 0 {
   341  			e.log.Debug().
   342  				Str("block_id", blockID.String()).
   343  				Uint64("height", height).
   344  				Uint64("attempt", uint64(attempt)).
   345  				Msgf("retrying download")
   346  
   347  			e.metrics.FetchRetried()
   348  		}
   349  		attempt++
   350  
   351  		// download execution data for the block
   352  		fetchTimeout, _ := timeout.Next()
   353  		err := e.processFetchRequest(ctx, blockID, height, fetchTimeout)
   354  
   355  		// don't retry if the blob was invalid
   356  		if isInvalidBlobError(err) {
   357  			return err
   358  		}
   359  
   360  		return retry.RetryableError(err)
   361  	})
   362  }
   363  
   364  func (e *executionDataRequester) processFetchRequest(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64, fetchTimeout time.Duration) error {
   365  	logger := e.log.With().
   366  		Str("block_id", blockID.String()).
   367  		Uint64("height", height).
   368  		Logger()
   369  
   370  	logger.Debug().Msg("processing fetch request")
   371  
   372  	seal, err := e.seals.FinalizedSealForBlock(blockID)
   373  	if err != nil {
   374  		ctx.Throw(fmt.Errorf("failed to get seal for block %s: %w", blockID, err))
   375  	}
   376  
   377  	result, err := e.results.ByID(seal.ResultID)
   378  	if err != nil {
   379  		ctx.Throw(fmt.Errorf("failed to lookup execution result for block %s: %w", blockID, err))
   380  	}
   381  
   382  	logger = logger.With().Str("execution_data_id", result.ExecutionDataID.String()).Logger()
   383  
   384  	start := time.Now()
   385  	e.metrics.ExecutionDataFetchStarted()
   386  
   387  	logger.Debug().Msg("downloading execution data")
   388  
   389  	_, err = e.fetchExecutionData(ctx, result.ExecutionDataID, fetchTimeout)
   390  
   391  	e.metrics.ExecutionDataFetchFinished(time.Since(start), err == nil, height)
   392  
   393  	if isInvalidBlobError(err) {
   394  		// This means an execution result was sealed with an invalid execution data id (invalid data).
   395  		// Eventually, verification nodes will verify that the execution data is valid, and not sign the receipt
   396  		logger.Error().Err(err).Msg("HALTING REQUESTER: invalid execution data found")
   397  
   398  		return err
   399  	}
   400  
   401  	// Some or all of the blob was missing or corrupt. retry
   402  	if isBlobNotFoundError(err) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
   403  		logger.Error().Err(err).Msg("failed to get execution data for block")
   404  
   405  		return err
   406  	}
   407  
   408  	// Any other error is unexpected
   409  	if err != nil {
   410  		logger.Error().Err(err).Msg("unexpected error fetching execution data")
   411  
   412  		ctx.Throw(err)
   413  	}
   414  
   415  	logger.Info().Msg("execution data fetched")
   416  
   417  	return nil
   418  }
   419  
   420  // fetchExecutionData fetches the ExecutionData by its ID, and times out if fetchTimeout is exceeded
   421  func (e *executionDataRequester) fetchExecutionData(signalerCtx irrecoverable.SignalerContext, executionDataID flow.Identifier, fetchTimeout time.Duration) (*execution_data.BlockExecutionData, error) {
   422  	ctx, cancel := context.WithTimeout(signalerCtx, fetchTimeout)
   423  	defer cancel()
   424  
   425  	// Get the data from the network
   426  	// this is a blocking call, won't be unblocked until either hitting error (including timeout) or
   427  	// the data is received
   428  	executionData, err := e.downloader.Download(ctx, executionDataID)
   429  
   430  	if err != nil {
   431  		return nil, err
   432  	}
   433  
   434  	return executionData, nil
   435  }
   436  
   437  // Notification Worker Methods
   438  
   439  func (e *executionDataRequester) processNotificationJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
   440  	// convert job into a block entry
   441  	entry, err := jobs.JobToBlockEntry(job)
   442  	if err != nil {
   443  		ctx.Throw(fmt.Errorf("failed to convert job to entry: %w", err))
   444  	}
   445  
   446  	e.processNotification(ctx, entry.Height, entry.ExecutionData)
   447  	jobComplete()
   448  }
   449  
   450  func (e *executionDataRequester) processNotification(ctx irrecoverable.SignalerContext, height uint64, executionData *execution_data.BlockExecutionData) {
   451  	e.log.Debug().Msgf("notifying for block %d", height)
   452  
   453  	// send notifications
   454  	e.notifyConsumers(executionData)
   455  
   456  	e.metrics.NotificationSent(height)
   457  }
   458  
   459  func (e *executionDataRequester) notifyConsumers(executionData *execution_data.BlockExecutionData) {
   460  	e.consumerMu.RLock()
   461  	defer e.consumerMu.RUnlock()
   462  
   463  	for _, fn := range e.consumers {
   464  		fn(executionData)
   465  	}
   466  }
   467  
   468  func isInvalidBlobError(err error) bool {
   469  	var malformedDataError *execution_data.MalformedDataError
   470  	var blobSizeLimitExceededError *execution_data.BlobSizeLimitExceededError
   471  	return errors.As(err, &malformedDataError) ||
   472  		errors.As(err, &blobSizeLimitExceededError)
   473  }
   474  
   475  func isBlobNotFoundError(err error) bool {
   476  	var blobNotFoundError *execution_data.BlobNotFoundError
   477  	return errors.As(err, &blobNotFoundError)
   478  }