github.com/koko1123/flow-go-1@v0.29.6/module/state_synchronization/requester/execution_data_requester.go

package requester

import (
    "context"
    "errors"
    "fmt"
    "sync"
    "time"

    "github.com/rs/zerolog"
    "github.com/sethvargo/go-retry"

    "github.com/koko1123/flow-go-1/consensus/hotstuff/model"
    "github.com/koko1123/flow-go-1/engine"
    "github.com/koko1123/flow-go-1/model/flow"
    "github.com/koko1123/flow-go-1/module"
    "github.com/koko1123/flow-go-1/module/component"
    "github.com/koko1123/flow-go-1/module/executiondatasync/execution_data"
    "github.com/koko1123/flow-go-1/module/irrecoverable"
    "github.com/koko1123/flow-go-1/module/jobqueue"
    "github.com/koko1123/flow-go-1/module/state_synchronization"
    "github.com/koko1123/flow-go-1/module/state_synchronization/requester/jobs"
    "github.com/koko1123/flow-go-1/module/util"
    "github.com/koko1123/flow-go-1/state/protocol"
    "github.com/koko1123/flow-go-1/storage"
)

// The ExecutionDataRequester downloads ExecutionData for sealed blocks from other participants in
// the flow network. The ExecutionData for a sealed block should always be downloadable, since a
// sealed block must have been executed.
//
// Once the ExecutionData for a block is downloaded, the node becomes a seeder for other participants
// on the network using the bitswap protocol. The downloading and seeding work is handled by the
// ExecutionDataService.
//
// The ExecutionDataRequester internally uses a job queue to request and download each sealed block
// with multiple workers. It downloads ExecutionData block by block towards the latest sealed block.
// To ensure it does not miss any sealed block, it persists the last downloaded height, and only
// increments it once the next height has been downloaded. In the event of a crash failure, it reads
// the last downloaded height and resumes processing from the next un-downloaded height.
// The requester listens for block finalization events and checks whether the sealed height has
// changed; if it has, it creates a job for each un-downloaded sealed height.
//
// The requester is made up of 3 subcomponents:
//
// * OnBlockFinalized:     receives block finalized events from the finalization distributor and
//                         forwards them to the blockConsumer.
//
// * blockConsumer:        is a jobqueue that receives block finalization events. On each event,
//                         it checks for the latest sealed block, then uses a pool of workers to
//                         download ExecutionData for each block from the network. After each
//                         successful download, the blockConsumer sends a notification to the
//                         notificationConsumer that a new ExecutionData is available.
//
// * notificationConsumer: is a jobqueue that receives ExecutionData fetched events. On each event,
//                         it checks if ExecutionData for the next consecutive block height is
//                         available, then uses a single worker to send notifications to registered
//                         consumers. The registered consumers are guaranteed to receive each sealed
//                         block in consecutive height order at least once.
//
//    +------------------+      +---------------+       +----------------------+
// -->| OnBlockFinalized |----->| blockConsumer |   +-->| notificationConsumer |
//    +------------------+      +-------+-------+   |   +-----------+----------+
//                                      |           |               |
//                               +------+------+    |        +------+------+
//                            xN | Worker Pool |----+     x1 | Worker Pool |----> Registered consumers
//                               +-------------+             +-------------+
//
// An illustrative wiring sketch (exampleRequesterWiring) is included at the end of this file.

const (
    // DefaultFetchTimeout is the default initial timeout for fetching ExecutionData from the
    // db/network. The timeout is increased using an incremental backoff until MaxFetchTimeout is reached.
    DefaultFetchTimeout = 10 * time.Second

    // DefaultMaxFetchTimeout is the default maximum timeout for fetching ExecutionData from the db/network
    DefaultMaxFetchTimeout = 10 * time.Minute

    // DefaultRetryDelay is the default initial delay used in the exponential backoff for failed
    // ExecutionData download retries
    DefaultRetryDelay = 1 * time.Second

    // DefaultMaxRetryDelay is the default maximum delay used in the exponential backoff for failed
    // ExecutionData download retries
    DefaultMaxRetryDelay = 5 * time.Minute

    // DefaultMaxSearchAhead is the default max number of unsent notifications to allow before
    // pausing new fetches.
    DefaultMaxSearchAhead = 5000

    // Number of goroutines to use for downloading new ExecutionData from the network.
    fetchWorkers = 4
)

// ExecutionDataConfig contains configuration options for the ExecutionDataRequester
type ExecutionDataConfig struct {
    // The initial value to use as the last processed block height. This should be the
    // first block height to sync - 1
    InitialBlockHeight uint64

    // Max number of unsent notifications to allow before pausing new fetches. After exceeding this
    // limit, the requester will stop processing new finalized block notifications. This prevents
    // unbounded memory use by the requester if it gets stuck fetching a specific height.
    MaxSearchAhead uint64

    // The initial timeout for fetching ExecutionData from the db/network
    FetchTimeout time.Duration

    // The max timeout for fetching ExecutionData from the db/network
    MaxFetchTimeout time.Duration

    // Exponential backoff settings for download retries
    RetryDelay    time.Duration
    MaxRetryDelay time.Duration
}

type executionDataRequester struct {
    component.Component
    cm         *component.ComponentManager
    downloader execution_data.Downloader
    metrics    module.ExecutionDataRequesterMetrics
    config     ExecutionDataConfig
    log        zerolog.Logger

    // Local db objects
    headers storage.Headers
    results storage.ExecutionResults
    seals   storage.Seals

    executionDataReader *jobs.ExecutionDataReader

    // Notifiers for queue consumers
    finalizationNotifier engine.Notifier

    // Job queues
    blockConsumer        *jobqueue.ComponentConsumer
    notificationConsumer *jobqueue.ComponentConsumer

    // List of callbacks to call when ExecutionData is successfully fetched for a block
    consumers []state_synchronization.ExecutionDataReceivedCallback

    consumerMu sync.RWMutex
}

var _ state_synchronization.ExecutionDataRequester = (*executionDataRequester)(nil)

// New creates a new execution data requester component
func New(
    log zerolog.Logger,
    edrMetrics module.ExecutionDataRequesterMetrics,
    downloader execution_data.Downloader,
    processedHeight storage.ConsumerProgress,
    processedNotifications storage.ConsumerProgress,
    state protocol.State,
    headers storage.Headers,
    results storage.ExecutionResults,
    seals storage.Seals,
    cfg ExecutionDataConfig,
) state_synchronization.ExecutionDataRequester {
    e := &executionDataRequester{
        log:                  log.With().Str("component", "execution_data_requester").Logger(),
        downloader:           downloader,
        metrics:              edrMetrics,
        headers:              headers,
        results:              results,
        seals:                seals,
        config:               cfg,
        finalizationNotifier: engine.NewNotifier(),
    }

    executionDataNotifier := engine.NewNotifier()

    // jobqueue Jobs object that tracks sealed blocks by height. This is used by the blockConsumer
    // to get a sequential list of sealed blocks.
    sealedBlockReader := jobqueue.NewSealedBlockHeaderReader(state, headers)

    // blockConsumer ensures every sealed block's execution data is downloaded.
    // It listens to block finalization events from `finalizationNotifier`, then checks whether there
    // are new sealed blocks with `sealedBlockReader`. If there are, it starts workers to process
    // them with `processBlockJob`, which fetches execution data. At most `fetchWorkers` workers
    // will be created for concurrent processing. When a sealed block's execution data has been
    // downloaded, it updates and persists the highest consecutive downloaded height with
    // `processedHeight`. That way, if the node crashes, it reads the `processedHeight` and resumes
    // from `processedHeight + 1`. If the database is empty, the configured InitialBlockHeight
    // (typically the root height) is used to initialize the last processed height. Once the
    // execution data is fetched and stored, it notifies `executionDataNotifier`.
    e.blockConsumer = jobqueue.NewComponentConsumer(
        e.log.With().Str("module", "block_consumer").Logger(),
        e.finalizationNotifier.Channel(), // listen to finalization events to find newly sealed blocks
        processedHeight,                  // read and persist the downloaded height
        sealedBlockReader,                // read sealed blocks by height
        e.config.InitialBlockHeight,      // initial "last processed" height for an empty db
        e.processBlockJob,                // process the sealed block job to download its execution data
        fetchWorkers,                     // the number of concurrent workers
        e.config.MaxSearchAhead,          // max number of unsent notifications to allow before pausing new fetches
    )

    // notifies notificationConsumer when new ExecutionData blobs are available
    // SetPostNotifier will notify executionDataNotifier AFTER e.blockConsumer.LastProcessedIndex is updated.
    // Even though it is not guaranteed to fire for every height, the notificationConsumer is still
    // guaranteed to process every height at least once, because the notificationConsumer finds new jobs
    // using executionDataReader, which discovers new heights using e.blockConsumer.LastProcessedIndex.
    e.blockConsumer.SetPostNotifier(func(module.JobID) { executionDataNotifier.Notify() })

    // jobqueue Jobs object that tracks downloaded execution data by height. This is used by the
    // notificationConsumer to get downloaded execution data from storage.
    e.executionDataReader = jobs.NewExecutionDataReader(
        e.downloader,
        e.headers,
        e.results,
        e.seals,
        e.config.FetchTimeout,
        // method to get the highest consecutive height that has downloaded execution data. It is used
        // here by the notification job consumer to discover new jobs.
        // Note: we don't want to notify the notificationConsumer for a block if its execution data
        // has not been downloaded yet.
        e.blockConsumer.LastProcessedIndex,
    )

    // notificationConsumer consumes `OnExecutionDataFetched` events, and ensures its consumers
    // receive these events in consecutive block height order.
    // It listens to events from `executionDataNotifier`, which are delivered when a block's
    // execution data is downloaded and stored, and checks `executionDataReader` to find whether
    // the next un-processed consecutive height is available.
    // To determine the next un-processed consecutive height, it reads the latest consecutive height
    // from `processedNotifications`, which is persisted in storage to be crash-resistant.
    // When a new consecutive height is available, it calls `processNotificationJob` to notify all the
    // `e.consumers`.
    // Note: the `e.consumers` are guaranteed to receive at least one `OnExecutionDataFetched` event
    // for each sealed block, in consecutive block height order.
    e.notificationConsumer = jobqueue.NewComponentConsumer(
        e.log.With().Str("module", "notification_consumer").Logger(),
        executionDataNotifier.Channel(), // listen for notifications from the block consumer
        processedNotifications,          // read and persist the notified height
        e.executionDataReader,           // read execution data by height
        e.config.InitialBlockHeight,     // initial "last processed" height for an empty db
        e.processNotificationJob,        // process the job to send notifications for an execution data
        1,                               // use a single worker to ensure notifications are delivered in consecutive order
        0,                               // search ahead limit controlled by worker count
    )

    builder := component.NewComponentManagerBuilder().
        AddWorker(e.runBlockConsumer).
        AddWorker(e.runNotificationConsumer)

    e.cm = builder.Build()
    e.Component = e.cm

    return e
}

// OnBlockFinalized accepts block finalization notifications from the FinalizationDistributor
func (e *executionDataRequester) OnBlockFinalized(*model.Block) {
    e.finalizationNotifier.Notify()
}

// AddOnExecutionDataFetchedConsumer adds a callback to be called when a new ExecutionData is received.
// Callback implementations must:
//   - be concurrency safe
//   - be non-blocking
//   - handle repetition of the same events (with some processing overhead).
//
// See exampleNonBlockingCallback at the end of this file for an illustrative sketch.
func (e *executionDataRequester) AddOnExecutionDataFetchedConsumer(fn state_synchronization.ExecutionDataReceivedCallback) {
    e.consumerMu.Lock()
    defer e.consumerMu.Unlock()

    e.consumers = append(e.consumers, fn)
}

// runBlockConsumer runs the blockConsumer component
func (e *executionDataRequester) runBlockConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    err := util.WaitClosed(ctx, e.downloader.Ready())
    if err != nil {
        return // context cancelled
    }

    err = util.WaitClosed(ctx, e.notificationConsumer.Ready())
    if err != nil {
        return // context cancelled
    }

    e.blockConsumer.Start(ctx)

    err = util.WaitClosed(ctx, e.blockConsumer.Ready())
    if err == nil {
        ready()
    }

    <-e.blockConsumer.Done()
}

// runNotificationConsumer runs the notificationConsumer component
func (e *executionDataRequester) runNotificationConsumer(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    e.executionDataReader.AddContext(ctx)
    e.notificationConsumer.Start(ctx)

    err := util.WaitClosed(ctx, e.notificationConsumer.Ready())
    if err == nil {
        ready()
    }

    <-e.notificationConsumer.Done()
}

// Fetch Worker Methods

// processBlockJob consumes jobs from the blockConsumer and attempts to download an ExecutionData
// for the given block height.
func (e *executionDataRequester) processBlockJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
    // convert job into a block entry
    header, err := jobqueue.JobToBlockHeader(job)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to convert job to block: %w", err))
    }

    err = e.processSealedHeight(ctx, header.ID(), header.Height)
    if err == nil {
        jobComplete()
        return
    }

    // Errors are thrown as irrecoverable errors, except for context cancellation and invalid blobs.
    // Invalid blobs are logged and never completed, which will halt downloads after MaxSearchAhead
    // is reached.
    e.log.Error().Err(err).Str("job_id", string(job.ID())).Msg("error encountered while processing block job")
}

// processSealedHeight downloads ExecutionData for the given block height.
// If the download fails, it will retry forever, using exponential backoff.
func (e *executionDataRequester) processSealedHeight(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64) error {
    backoff := retry.NewExponential(e.config.RetryDelay)
    backoff = retry.WithCappedDuration(e.config.MaxRetryDelay, backoff)
    backoff = retry.WithJitterPercent(15, backoff)

    // bitswap always waits for either all data to be received or a timeout, even if it encountered an error.
    // use an incremental backoff for the timeout so we do faster initial retries, then allow for more
    // time in case the data is large or there is network congestion.
    timeout := retry.NewExponential(e.config.FetchTimeout)
    timeout = retry.WithCappedDuration(e.config.MaxFetchTimeout, timeout)

    attempt := 0
    return retry.Do(ctx, backoff, func(context.Context) error {
        if attempt > 0 {
            e.log.Debug().
                Str("block_id", blockID.String()).
                Uint64("height", height).
                Uint64("attempt", uint64(attempt)).
                Msgf("retrying download")

            e.metrics.FetchRetried()
        }
        attempt++

        // download execution data for the block
        fetchTimeout, _ := timeout.Next()
        err := e.processFetchRequest(ctx, blockID, height, fetchTimeout)

        // don't retry if the blob was invalid
        if isInvalidBlobError(err) {
            return err
        }

        return retry.RetryableError(err)
    })
}

func (e *executionDataRequester) processFetchRequest(ctx irrecoverable.SignalerContext, blockID flow.Identifier, height uint64, fetchTimeout time.Duration) error {
    logger := e.log.With().
        Str("block_id", blockID.String()).
        Uint64("height", height).
        Logger()

    logger.Debug().Msg("processing fetch request")

    seal, err := e.seals.FinalizedSealForBlock(blockID)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to get seal for block %s: %w", blockID, err))
    }

    result, err := e.results.ByID(seal.ResultID)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to lookup execution result for block %s: %w", blockID, err))
    }

    logger = logger.With().Str("execution_data_id", result.ExecutionDataID.String()).Logger()

    start := time.Now()
    e.metrics.ExecutionDataFetchStarted()

    logger.Debug().Msg("downloading execution data")

    _, err = e.fetchExecutionData(ctx, result.ExecutionDataID, fetchTimeout)

    e.metrics.ExecutionDataFetchFinished(time.Since(start), err == nil, height)

    if isInvalidBlobError(err) {
        // This means an execution result was sealed with an invalid execution data id (invalid data).
        // Eventually, verification nodes will verify that the execution data is valid, and not sign the receipt
        logger.Error().Err(err).Msg("HALTING REQUESTER: invalid execution data found")

        return err
    }

    // Some or all of the blob was missing or corrupt. Retry.
    if isBlobNotFoundError(err) || errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
        logger.Error().Err(err).Msg("failed to get execution data for block")

        return err
    }

    // Any other error is unexpected
    if err != nil {
        logger.Error().Err(err).Msg("unexpected error fetching execution data")

        ctx.Throw(err)
    }

    logger.Info().Msg("execution data fetched")

    return nil
}

// fetchExecutionData fetches the ExecutionData by its ID, and times out if fetchTimeout is exceeded
func (e *executionDataRequester) fetchExecutionData(signalerCtx irrecoverable.SignalerContext, executionDataID flow.Identifier, fetchTimeout time.Duration) (*execution_data.BlockExecutionData, error) {
    ctx, cancel := context.WithTimeout(signalerCtx, fetchTimeout)
    defer cancel()

    // Get the data from the network.
    // This is a blocking call; it does not return until either an error occurs (including a timeout)
    // or the data is received.
    executionData, err := e.downloader.Download(ctx, executionDataID)
    if err != nil {
        return nil, err
    }

    return executionData, nil
}

// Notification Worker Methods

func (e *executionDataRequester) processNotificationJob(ctx irrecoverable.SignalerContext, job module.Job, jobComplete func()) {
    // convert job into a block entry
    entry, err := jobs.JobToBlockEntry(job)
    if err != nil {
        ctx.Throw(fmt.Errorf("failed to convert job to entry: %w", err))
    }

    e.processNotification(ctx, entry.Height, entry.ExecutionData)
    jobComplete()
}

func (e *executionDataRequester) processNotification(ctx irrecoverable.SignalerContext, height uint64, executionData *execution_data.BlockExecutionData) {
    e.log.Debug().Msgf("notifying for block %d", height)

    // send notifications
    e.notifyConsumers(executionData)

    e.metrics.NotificationSent(height)
}

func (e *executionDataRequester) notifyConsumers(executionData *execution_data.BlockExecutionData) {
    e.consumerMu.RLock()
    defer e.consumerMu.RUnlock()

    for _, fn := range e.consumers {
        fn(executionData)
    }
}

func isInvalidBlobError(err error) bool {
    var malformedDataError *execution_data.MalformedDataError
    var blobSizeLimitExceededError *execution_data.BlobSizeLimitExceededError
    return errors.As(err, &malformedDataError) ||
        errors.As(err, &blobSizeLimitExceededError)
}

func isBlobNotFoundError(err error) bool {
    var blobNotFoundError *execution_data.BlobNotFoundError
    return errors.As(err, &blobNotFoundError)
}
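
// exampleRequesterWiring is a hypothetical sketch, not part of the original file, showing how a node
// might construct and start the requester described in the package documentation above. All
// dependencies (logger, metrics, downloader, progress trackers, protocol state, and storages) are
// assumed to be supplied by the caller, and the config values simply reuse the package defaults.
// It assumes the ExecutionDataRequester interface embeds component.Component (as the embedded
// Component field of executionDataRequester suggests), so Start/Ready are available on the result.
func exampleRequesterWiring(
    ctx irrecoverable.SignalerContext,
    log zerolog.Logger,
    edrMetrics module.ExecutionDataRequesterMetrics,
    downloader execution_data.Downloader,
    processedHeight storage.ConsumerProgress,
    processedNotifications storage.ConsumerProgress,
    state protocol.State,
    headers storage.Headers,
    results storage.ExecutionResults,
    seals storage.Seals,
    rootHeight uint64,
) state_synchronization.ExecutionDataRequester {
    requester := New(
        log,
        edrMetrics,
        downloader,
        processedHeight,
        processedNotifications,
        state,
        headers,
        results,
        seals,
        ExecutionDataConfig{
            InitialBlockHeight: rootHeight, // last processed height = first height to sync - 1
            MaxSearchAhead:     DefaultMaxSearchAhead,
            FetchTimeout:       DefaultFetchTimeout,
            MaxFetchTimeout:    DefaultMaxFetchTimeout,
            RetryDelay:         DefaultRetryDelay,
            MaxRetryDelay:      DefaultMaxRetryDelay,
        },
    )

    // register a callback that is invoked, in consecutive height order and at least once per
    // sealed block, whenever new ExecutionData has been downloaded
    requester.AddOnExecutionDataFetchedConsumer(func(_ *execution_data.BlockExecutionData) {
        log.Info().Msg("execution data received")
    })

    // start the component with an irrecoverable signaler context and wait until it is ready
    // before relying on notifications
    requester.Start(ctx)
    <-requester.Ready()

    return requester
}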
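
// exampleNonBlockingCallback is another hypothetical sketch, illustrating the callback requirements
// documented on AddOnExecutionDataFetchedConsumer: the returned callback is concurrency safe, never
// blocks the requester's single notification worker, and tolerates duplicate deliveries. It hands
// each notification off to a caller-provided buffered channel and drops it if the buffer is full,
// so downstream consumers must either tolerate gaps or size the buffer accordingly.
func exampleNonBlockingCallback(out chan<- *execution_data.BlockExecutionData) state_synchronization.ExecutionDataReceivedCallback {
    return func(ed *execution_data.BlockExecutionData) {
        select {
        case out <- ed: // hand off to a downstream worker without blocking
        default:
            // buffer full: drop rather than stall the notification worker
        }
    }
}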