github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/state_synchronization/requester/execution_data_requester.go

package requester

import (
    "context"
    "errors"
    "fmt"
    "time"

    "github.com/rs/zerolog"
    "github.com/sethvargo/go-retry"

    "github.com/onflow/flow-go/consensus/hotstuff/model"
    "github.com/onflow/flow-go/engine"
    "github.com/onflow/flow-go/model/flow"
    "github.com/onflow/flow-go/module"
    "github.com/onflow/flow-go/module/component"
    "github.com/onflow/flow-go/module/executiondatasync/execution_data"
    "github.com/onflow/flow-go/module/executiondatasync/execution_data/cache"
    "github.com/onflow/flow-go/module/irrecoverable"
    "github.com/onflow/flow-go/module/jobqueue"
    "github.com/onflow/flow-go/module/state_synchronization"
    "github.com/onflow/flow-go/module/state_synchronization/requester/jobs"
    "github.com/onflow/flow-go/module/util"
    "github.com/onflow/flow-go/state/protocol"
    "github.com/onflow/flow-go/storage"
    "github.com/onflow/flow-go/utils/logging"
)

// The ExecutionDataRequester downloads ExecutionData for sealed blocks from other participants in
// the flow network. The ExecutionData for a sealed block should always be downloadable, since a
// sealed block must have been executed.
//
// Once the ExecutionData for a block is downloaded, the node becomes a seeder for other participants
// on the network using the bitswap protocol. The downloading and seeding work is handled by the
// ExecutionDataService.
//
// The ExecutionDataRequester internally uses a job queue to request and download each sealed block
// with multiple workers. It downloads ExecutionData block by block towards the latest sealed block.
// To ensure it does not miss any sealed block, it persists the last downloaded height and only
// increments it when the next height has been downloaded. In the event of a crash, it reads the
// last downloaded height and resumes processing from the next un-downloaded height.
// The requester listens for block finalization events and checks whether the sealed height has
// changed; if it has, it creates a job for each sealed height that has not yet been downloaded.
//
// The requester is made up of 3 subcomponents:
//
// * OnBlockFinalized:     receives block finalized events from the finalization distributor and
//                         forwards them to the blockConsumer.
//
// * blockConsumer:        is a jobqueue that receives block finalization events. On each event,
//                         it checks for the latest sealed block, then uses a pool of workers to
//                         download ExecutionData for each block from the network. After each
//                         successful download, the blockConsumer sends a notification to the
//                         notificationConsumer that a new ExecutionData is available.
//
// * notificationConsumer: is a jobqueue that receives ExecutionData fetched events. On each event,
//                         it checks if ExecutionData for the next consecutive block height is
//                         available, then uses a single worker to send notifications to registered
//                         consumers. The registered consumers are guaranteed to receive each sealed
//                         block, in consecutive height order, at least once.
//
//     +------------------+      +---------------+       +----------------------+
//  -->| OnBlockFinalized |----->| blockConsumer |   +-->| notificationConsumer |
//     +------------------+      +-------+-------+   |   +-----------+----------+
//                                       |           |               |
//                                +------+------+    |        +------+------+
//                             xN | Worker Pool |----+     x1 | Worker Pool |----> Registered consumers
//                                +-------------+            +-------------+
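// A hedged wiring sketch from a caller's perspective. All lowercase dependency names (log,
// metrics, downloader, execDataCache, processedHeight, processedNotifications, state, headers,
// cfg, followerDistributor) are placeholders assumed to be constructed elsewhere; cfg is an
// ExecutionDataConfig (an example literal follows the struct definition below). The
// NewExecutionDataDistributor constructor and the follower distributor's
// AddOnBlockFinalizedConsumer hook are named from convention, not defined in this file.
//
//    distributor := requester.NewExecutionDataDistributor()
//
//    edr, err := requester.New(
//        log, metrics, downloader, execDataCache,
//        processedHeight, processedNotifications,
//        state, headers, cfg, distributor,
//    )
//    if err != nil {
//        return fmt.Errorf("could not create execution data requester: %w", err)
//    }
//
//    // deliver block finalization events to the requester
//    followerDistributor.AddOnBlockFinalizedConsumer(edr.OnBlockFinalized)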
const (
    // DefaultFetchTimeout is the default initial timeout for fetching ExecutionData from the
    // db/network. The timeout is increased using an incremental backoff until MaxFetchTimeout.
    DefaultFetchTimeout = 10 * time.Second

    // DefaultMaxFetchTimeout is the default maximum timeout for fetching ExecutionData from the
    // db/network
    DefaultMaxFetchTimeout = 10 * time.Minute

    // DefaultRetryDelay is the default initial delay used in the exponential backoff for failed
    // ExecutionData download retries
    DefaultRetryDelay = 1 * time.Second

    // DefaultMaxRetryDelay is the default maximum delay used in the exponential backoff for failed
    // ExecutionData download retries
    DefaultMaxRetryDelay = 5 * time.Minute

    // DefaultMaxSearchAhead is the default max number of unsent notifications to allow before
    // pausing new fetches.
    DefaultMaxSearchAhead = 5000

    // fetchWorkers is the number of goroutines to use for downloading new ExecutionData from the
    // network.
    fetchWorkers = 4
)

// ExecutionDataConfig contains configuration options for the ExecutionDataRequester
type ExecutionDataConfig struct {
    // The initial value to use as the last processed block height. This should be the
    // first block height to sync - 1
    InitialBlockHeight uint64

    // Max number of unsent notifications to allow before pausing new fetches. After exceeding this
    // limit, the requester will stop processing new finalized block notifications. This prevents
    // unbounded memory use by the requester if it gets stuck fetching a specific height.
    MaxSearchAhead uint64

    // The initial timeout for fetching ExecutionData from the db/network
    FetchTimeout time.Duration

    // The max timeout for fetching ExecutionData from the db/network
    MaxFetchTimeout time.Duration

    // Exponential backoff settings for download retries
    RetryDelay    time.Duration
    MaxRetryDelay time.Duration
}
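// A hedged example of a configuration assembled from the package defaults. rootHeight is a
// placeholder for the height the node should treat as already processed (syncing then begins at
// rootHeight + 1); it is not defined in this file.
//
//    cfg := requester.ExecutionDataConfig{
//        InitialBlockHeight: rootHeight,
//        MaxSearchAhead:     requester.DefaultMaxSearchAhead,
//        FetchTimeout:       requester.DefaultFetchTimeout,
//        MaxFetchTimeout:    requester.DefaultMaxFetchTimeout,
//        RetryDelay:         requester.DefaultRetryDelay,
//        MaxRetryDelay:      requester.DefaultMaxRetryDelay,
//    }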
type executionDataRequester struct {
    component.Component
    downloader execution_data.Downloader
    metrics    module.ExecutionDataRequesterMetrics
    config     ExecutionDataConfig
    log        zerolog.Logger

    // Local db objects
    headers storage.Headers

    executionDataReader *jobs.ExecutionDataReader

    // Notifiers for queue consumers
    finalizationNotifier engine.Notifier

    // Job queues
    blockConsumer        *jobqueue.ComponentConsumer
    notificationConsumer *jobqueue.ComponentConsumer

    execDataCache *cache.ExecutionDataCache
    distributor   *ExecutionDataDistributor
}

var _ state_synchronization.ExecutionDataRequester = (*executionDataRequester)(nil)

// New creates a new execution data requester component
func New(
    log zerolog.Logger,
    edrMetrics module.ExecutionDataRequesterMetrics,
    downloader execution_data.Downloader,
    execDataCache *cache.ExecutionDataCache,
    processedHeight storage.ConsumerProgress,
    processedNotifications storage.ConsumerProgress,
    state protocol.State,
    headers storage.Headers,
    cfg ExecutionDataConfig,
    distributor *ExecutionDataDistributor,
) (state_synchronization.ExecutionDataRequester, error) {
    e := &executionDataRequester{
        log:                  log.With().Str("component", "execution_data_requester").Logger(),
        downloader:           downloader,
        execDataCache:        execDataCache,
        metrics:              edrMetrics,
        headers:              headers,
        config:               cfg,
        finalizationNotifier: engine.NewNotifier(),
        distributor:          distributor,
    }

    executionDataNotifier := engine.NewNotifier()

    // jobqueue Jobs object that tracks sealed blocks by height. This is used by the blockConsumer
    // to get a sequential list of sealed blocks.
    sealedBlockReader := jobqueue.NewSealedBlockHeaderReader(state, headers)

    // blockConsumer ensures every sealed block's execution data is downloaded.
    // It listens to block finalization events from `finalizationNotifier`, then checks if there
    // are new sealed blocks with `sealedBlockReader`. If there are, it starts workers to process
    // them with `processBlockJob`, which fetches execution data. At most `fetchWorkers` workers
    // will be created for concurrent processing. When a sealed block's execution data has been
    // downloaded, it updates and persists the highest consecutive downloaded height with
    // `processedHeight`. That way, if the node crashes, it reads `processedHeight` and resumes
    // from `processedHeight + 1`. If the database is empty, `InitialBlockHeight` is used to
    // initialize the last processed height. Once the execution data is fetched and stored, it
    // notifies `executionDataNotifier`.
    blockConsumer, err := jobqueue.NewComponentConsumer(
        e.log.With().Str("module", "block_consumer").Logger(),
        e.finalizationNotifier.Channel(), // listen for finalization events to find newly sealed blocks
        processedHeight,                  // read and persist the downloaded height
        sealedBlockReader,                // read sealed blocks by height
        e.config.InitialBlockHeight,      // initial "last processed" height for empty db
        e.processBlockJob,                // process the sealed block job to download its execution data
        fetchWorkers,                     // the number of concurrent workers
        e.config.MaxSearchAhead,          // max number of unsent notifications to allow before pausing new fetches
    )
    if err != nil {
        return nil, fmt.Errorf("failed to create block consumer: %w", err)
    }
    e.blockConsumer = blockConsumer

    // notifies notificationConsumer when new ExecutionData blobs are available
    // SetPostNotifier will notify executionDataNotifier AFTER e.blockConsumer.LastProcessedIndex is updated.
    // Even though the notifier does not guarantee a notification for every height, the
    // notificationConsumer is still guaranteed to process every height at least once, because it
    // finds new jobs using executionDataReader, which in turn discovers new heights using
    // e.blockConsumer.LastProcessedIndex.
    e.blockConsumer.SetPostNotifier(func(module.JobID) { executionDataNotifier.Notify() })

    // jobqueue Jobs object that tracks downloaded execution data by height. This is used by the
    // notificationConsumer to get downloaded execution data from storage.
    e.executionDataReader = jobs.NewExecutionDataReader(
        e.execDataCache,
        e.config.FetchTimeout,
        // method to get the highest consecutive height that has downloaded execution data. It is
        // used here by the notification job consumer to discover new jobs.
        // Note: we don't want to notify notificationConsumer for a block if it has not downloaded
        // execution data yet.
        func() (uint64, error) {
            return e.blockConsumer.LastProcessedIndex(), nil
        },
    )

    // notificationConsumer consumes `OnExecutionDataFetched` events, and ensures its consumers
    // receive these events in consecutive block height order.
    // It listens to events from `executionDataNotifier`, which are delivered when a block's
    // execution data is downloaded and stored, and checks the `execDataCache` to determine
    // whether the next unprocessed consecutive height is available.
    // The next unprocessed consecutive height is derived from the latest processed height in
    // `processedNotifications`, which is persisted in storage so that progress survives a crash.
    // When a new consecutive height is available, it calls `processNotificationJob` to notify all
    // consumers registered with `e.distributor`.
    // Note: registered consumers are guaranteed to receive at least one `OnExecutionDataFetched`
    // event for each sealed block, in consecutive block height order.
    e.notificationConsumer, err = jobqueue.NewComponentConsumer(
        e.log.With().Str("module", "notification_consumer").Logger(),
        executionDataNotifier.Channel(), // listen for notifications from the block consumer
        processedNotifications,          // read and persist the notified height
        e.executionDataReader,           // read execution data by height
        e.config.InitialBlockHeight,     // initial "last processed" height for empty db
        e.processNotificationJob,        // process the job to send notifications for an execution data
        1,                               // use a single worker to ensure notifications are delivered in consecutive order
        0,                               // search ahead limit controlled by worker count
    )
    if err != nil {
        return nil, fmt.Errorf("failed to create notification consumer: %w", err)
    }

    e.Component = component.NewComponentManagerBuilder().
        AddWorker(e.runBlockConsumer).
        AddWorker(e.runNotificationConsumer).
        Build()

    return e, nil
}
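// A minimal lifecycle sketch for the component returned by New, assuming edr and distributor were
// created as in the wiring sketch above. irrecoverable.WithSignaler and the distributor's
// AddOnExecutionDataReceivedConsumer method are the conventional flow-go entry points; they are
// assumptions here, not definitions from this file.
//
//    // register a consumer before starting, so no notifications are missed
//    distributor.AddOnExecutionDataReceivedConsumer(func(ed *execution_data.BlockExecutionDataEntity) {
//        log.Info().Hex("block_id", logging.ID(ed.BlockID)).Msg("execution data received")
//    })
//
//    signalerCtx, errChan := irrecoverable.WithSignaler(ctx)
//    edr.Start(signalerCtx) // starts the block and notification consumers
//    <-edr.Ready()
//
//    // safe to query once the component is ready; monitor errChan for irrecoverable errors
//    height, err := edr.HighestConsecutiveHeight()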
// OnBlockFinalized accepts block finalization notifications from the FollowerDistributor
func (e *executionDataRequester) OnBlockFinalized(*model.Block) {
    e.finalizationNotifier.Notify()
}

// HighestConsecutiveHeight returns the highest consecutive block height for which ExecutionData
// has been received.
// This method must only be called after the component is Ready. If it is called early, an error
// is returned.
func (e *executionDataRequester) HighestConsecutiveHeight() (uint64, error) {
    select {
    case <-e.blockConsumer.Ready():
    default:
        // LastProcessedIndex is not meaningful until the component has completed startup
        return 0, fmt.Errorf("HighestConsecutiveHeight must not be called before the component is ready")
    }

    return e.blockConsumer.LastProcessedIndex(), nil
}

// runBlockConsumer runs the blockConsumer component
func (e *executionDataRequester) runBlockConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    // only start fetching once the downloader and the notification consumer are ready, so every
    // downloaded height can be notified
    err := util.WaitClosed(ctx, e.downloader.Ready())
    if err != nil {
        return // context cancelled
    }

    err = util.WaitClosed(ctx, e.notificationConsumer.Ready())
    if err != nil {
        return // context cancelled
    }

    e.blockConsumer.Start(ctx)

    err = util.WaitClosed(ctx, e.blockConsumer.Ready())
    if err == nil {
        ready()
    }

    <-e.blockConsumer.Done()
}

// runNotificationConsumer runs the notificationConsumer component
func (e *executionDataRequester) runNotificationConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    e.executionDataReader.AddContext(ctx)
    e.notificationConsumer.Start(ctx)

    err := util.WaitClosed(ctx, e.notificationConsumer.Ready())
    if err == nil {
        ready()
    }

    <-e.notificationConsumer.Done()
}
// Fetch Worker Methods

// processBlockJob consumes jobs from the blockConsumer and attempts to download the ExecutionData
// for the given block height.
func (e *executionDataRequester) processBlockJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
    // convert job into a block entry
    header, err := jobqueue.JobToBlockHeader(job)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to convert job to block: %w", err))
    }

    err = e.processSealedHeight(ctx, header.ID(), header.Height)
    if err == nil {
        jobComplete()
        return
    }

    // Errors are thrown as irrecoverable errors, except for context cancellation and invalid
    // blobs. Invalid blobs are logged and their jobs are never marked complete, which halts
    // downloads once MaxSearchAhead is reached.
    e.log.Error().Err(err).Str("job_id", string(job.ID())).Msg("error encountered while processing block job")
}

// processSealedHeight downloads ExecutionData for the given block height.
// If the download fails, it will retry forever, using exponential backoff.
func (e *executionDataRequester) processSealedHeight(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64) error {
    backoff := retry.NewExponential(e.config.RetryDelay)
    backoff = retry.WithCappedDuration(e.config.MaxRetryDelay, backoff)
    backoff = retry.WithJitterPercent(15, backoff)

    // bitswap always waits for either all data to be received or a timeout, even if it encountered
    // an error. Use an incremental backoff for the timeout so we do faster initial retries, then
    // allow for more time in case data is large or there is network congestion.
    timeout := retry.NewExponential(e.config.FetchTimeout)
    timeout = retry.WithCappedDuration(e.config.MaxFetchTimeout, timeout)

    attempt := 0
    return retry.Do(ctx, backoff, func(context.Context) error {
        if attempt > 0 {
            e.log.Debug().
                Str("block_id", blockID.String()).
                Uint64("height", height).
                Uint64("attempt", uint64(attempt)).
                Msgf("retrying download")

            e.metrics.FetchRetried()
        }
        attempt++

        // download execution data for the block
        fetchTimeout, _ := timeout.Next()
        err := e.processFetchRequest(ctx, blockID, height, fetchTimeout)

        // don't retry if the blob was invalid
        if isInvalidBlobError(err) {
            return err
        }

        return retry.RetryableError(err)
    })
}
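// A small sketch of the two backoff sequences used above, assuming the package defaults. The
// per-attempt fetch timeout doubles from FetchTimeout up to MaxFetchTimeout, while the delay
// between attempts doubles (with 15% jitter) from RetryDelay up to MaxRetryDelay.
//
//    timeout := retry.NewExponential(DefaultFetchTimeout)
//    timeout = retry.WithCappedDuration(DefaultMaxFetchTimeout, timeout)
//    for i := 0; i < 8; i++ {
//        d, _ := timeout.Next()
//        fmt.Println(d) // a doubling sequence starting at 10s, capped at 10m
//    }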
// processFetchRequest performs a single attempt to download the ExecutionData for the given
// block, bounded by fetchTimeout.
func (e *executionDataRequester) processFetchRequest(parentCtx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64, fetchTimeout time.Duration) error {
    logger := e.log.With().
        Str("block_id", blockID.String()).
        Uint64("height", height).
        Logger()

    logger.Debug().Msg("processing fetch request")

    start := time.Now()
    e.metrics.ExecutionDataFetchStarted()

    logger.Debug().Msg("downloading execution data")

    ctx, cancel := context.WithTimeout(parentCtx, fetchTimeout)
    defer cancel()

    execData, err := e.execDataCache.ByBlockID(ctx, blockID)

    e.metrics.ExecutionDataFetchFinished(time.Since(start), err == nil, height)

    if isInvalidBlobError(err) {
        // This means an execution result was sealed with an invalid execution data id (invalid data).
        // Eventually, verification nodes will verify that the execution data is valid, and will not
        // sign the receipt.
        logger.Error().Err(err).Msg("HALTING REQUESTER: invalid execution data found")

        return err
    }

    // Some or all of the blobs were missing or corrupt; return the error so the fetch is retried.
    if isBlobNotFoundError(err) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
        logger.Error().Err(err).Msg("failed to get execution data for block")

        return err
    }

    // Any other error is unexpected
    if err != nil {
        logger.Error().Err(err).Msg("unexpected error fetching execution data")

        parentCtx.Throw(err)
    }

    logger.Info().
        Hex("execution_data_id", logging.ID(execData.ID())).
        Msg("execution data fetched")

    return nil
}
// Notification Worker Methods

// processNotificationJob notifies registered consumers that ExecutionData for a new consecutive
// block height is available, then marks the job complete.
func (e *executionDataRequester) processNotificationJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
    // convert job into a block entry
    entry, err := jobs.JobToBlockEntry(job)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to convert job to entry: %w", err))
    }

    e.log.Debug().
        Hex("block_id", logging.ID(entry.BlockID)).
        Uint64("height", entry.Height).
        Msgf("notifying for block")

    // send notifications
    e.distributor.OnExecutionDataReceived(entry.ExecutionData)
    jobComplete()

    e.metrics.NotificationSent(entry.Height)
}

// isInvalidBlobError returns true if the error (or any error it wraps) indicates the downloaded
// data was malformed or exceeded the blob size limit.
func isInvalidBlobError(err error) bool {
    var malformedDataError *execution_data.MalformedDataError
    var blobSizeLimitExceededError *execution_data.BlobSizeLimitExceededError
    return errors.As(err, &malformedDataError) ||
        errors.As(err, &blobSizeLimitExceededError)
}

// isBlobNotFoundError returns true if the error (or any error it wraps) indicates one or more
// blobs could not be found on the network.
func isBlobNotFoundError(err error) bool {
    var blobNotFoundError *execution_data.BlobNotFoundError
    return errors.As(err, &blobNotFoundError)
}