github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/common/requester/engine.go (about) 1 package requester 2 3 import ( 4 "fmt" 5 "math" 6 "time" 7 8 "github.com/rs/zerolog" 9 "github.com/vmihailenco/msgpack" 10 "go.uber.org/atomic" 11 12 "github.com/onflow/flow-go/engine" 13 "github.com/onflow/flow-go/model/flow" 14 "github.com/onflow/flow-go/model/flow/filter" 15 "github.com/onflow/flow-go/model/messages" 16 "github.com/onflow/flow-go/module" 17 "github.com/onflow/flow-go/module/metrics" 18 "github.com/onflow/flow-go/network" 19 "github.com/onflow/flow-go/network/channels" 20 "github.com/onflow/flow-go/state/protocol" 21 "github.com/onflow/flow-go/utils/logging" 22 "github.com/onflow/flow-go/utils/rand" 23 ) 24 25 // HandleFunc is a function provided to the requester engine to handle an entity 26 // once it has been retrieved from a provider. The function should be non-blocking 27 // and errors should be handled internally within the function. 28 type HandleFunc func(originID flow.Identifier, entity flow.Entity) 29 30 // CreateFunc is a function that creates a `flow.Entity` with an underlying type 31 // so that we can properly decode entities transmitted over the network. 32 type CreateFunc func() flow.Entity 33 34 // Engine is a generic requester engine, handling the requesting of entities 35 // on the flow network. It is the `request` part of the request-reply 36 // pattern provided by the pair of generic exchange engines. 37 type Engine struct { 38 unit *engine.Unit 39 log zerolog.Logger 40 cfg Config 41 metrics module.EngineMetrics 42 me module.Local 43 state protocol.State 44 con network.Conduit 45 channel channels.Channel 46 selector flow.IdentityFilter[flow.Identity] 47 create CreateFunc 48 handle HandleFunc 49 50 // changing the following state variables must be guarded by unit.Lock() 51 items map[flow.Identifier]*Item 52 requests map[uint64]*messages.EntityRequest 53 forcedDispatchOngoing *atomic.Bool // to ensure only trigger dispatching logic once at any time 54 } 55 56 // New creates a new requester engine, operating on the provided network channel, and requesting entities from a node 57 // within the set obtained by applying the provided selector filter. The options allow customization of the parameters 58 // related to the batch and retry logic. 59 func New(log zerolog.Logger, metrics module.EngineMetrics, net network.EngineRegistry, me module.Local, state protocol.State, 60 channel channels.Channel, selector flow.IdentityFilter[flow.Identity], create CreateFunc, options ...OptionFunc) (*Engine, error) { 61 62 // initialize the default config 63 cfg := Config{ 64 BatchThreshold: 32, 65 BatchInterval: time.Second, 66 RetryInitial: 4 * time.Second, 67 RetryFunction: RetryGeometric(2), 68 RetryMaximum: 2 * time.Minute, 69 RetryAttempts: math.MaxUint32, 70 ValidateStaking: true, 71 } 72 73 // apply the custom option parameters 74 for _, option := range options { 75 option(&cfg) 76 } 77 78 // check validity of retry function 79 interval := cfg.RetryFunction(time.Second) 80 if interval < time.Second { 81 return nil, fmt.Errorf("invalid retry function (interval must always increase)") 82 } 83 84 // check validity of maximum interval 85 if cfg.RetryMaximum < cfg.RetryInitial { 86 return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)") 87 } 88 89 // make sure we don't send requests from self 90 selector = filter.And( 91 selector, 92 filter.Not(filter.HasNodeID[flow.Identity](me.NodeID())), 93 filter.Not(filter.HasParticipationStatus(flow.EpochParticipationStatusEjected)), 94 ) 95 96 // make sure we only send requests to nodes that are active in the current epoch and have positive weight 97 if cfg.ValidateStaking { 98 selector = filter.And( 99 selector, 100 filter.HasInitialWeight[flow.Identity](true), 101 filter.HasParticipationStatus(flow.EpochParticipationStatusActive), 102 ) 103 } 104 105 // initialize the propagation engine with its dependencies 106 e := &Engine{ 107 unit: engine.NewUnit(), 108 log: log.With().Str("engine", "requester").Logger(), 109 cfg: cfg, 110 metrics: metrics, 111 me: me, 112 state: state, 113 channel: channel, 114 selector: selector, 115 create: create, 116 handle: nil, 117 items: make(map[flow.Identifier]*Item), // holds all pending items 118 requests: make(map[uint64]*messages.EntityRequest), // holds all sent requests 119 forcedDispatchOngoing: atomic.NewBool(false), 120 } 121 122 // register the engine with the network layer and store the conduit 123 con, err := net.Register(channels.Channel(channel), e) 124 if err != nil { 125 return nil, fmt.Errorf("could not register engine: %w", err) 126 } 127 e.con = con 128 129 return e, nil 130 } 131 132 // WithHandle sets the handle function of the requester, which is how it processes 133 // returned entities. The engine can not be started without setting the handle 134 // function. It is done in a separate call so that the requester can be injected 135 // into engines upon construction, and then provide a handle function to the 136 // requester from that engine itself. 137 func (e *Engine) WithHandle(handle HandleFunc) { 138 e.handle = handle 139 } 140 141 // Ready returns a ready channel that is closed once the engine has fully 142 // started. For consensus engine, this is true once the underlying consensus 143 // algorithm has started. 144 func (e *Engine) Ready() <-chan struct{} { 145 if e.handle == nil { 146 panic("must initialize requester engine with handler") 147 } 148 e.unit.Launch(e.poll) 149 return e.unit.Ready() 150 } 151 152 // Done returns a done channel that is closed once the engine has fully stopped. 153 // For the consensus engine, we wait for hotstuff to finish. 154 func (e *Engine) Done() <-chan struct{} { 155 return e.unit.Done() 156 } 157 158 // SubmitLocal submits an message originating on the local node. 159 func (e *Engine) SubmitLocal(message interface{}) { 160 e.unit.Launch(func() { 161 err := e.process(e.me.NodeID(), message) 162 if err != nil { 163 engine.LogError(e.log, err) 164 } 165 }) 166 } 167 168 // Submit submits the given message from the node with the given origin ID 169 // for processing in a non-blocking manner. It returns instantly and logs 170 // a potential processing error internally when done. 171 func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) { 172 e.unit.Launch(func() { 173 err := e.Process(channel, originID, message) 174 if err != nil { 175 engine.LogError(e.log, err) 176 } 177 }) 178 } 179 180 // ProcessLocal processes an message originating on the local node. 181 func (e *Engine) ProcessLocal(message interface{}) error { 182 return e.unit.Do(func() error { 183 return e.process(e.me.NodeID(), message) 184 }) 185 } 186 187 // Process processes the given message from the node with the given origin ID in 188 // a blocking manner. It returns the potential processing error when done. 189 func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error { 190 return e.unit.Do(func() error { 191 return e.process(originID, message) 192 }) 193 } 194 195 // EntityByID adds an entity to the list of entities to be requested from the 196 // provider. It is idempotent, meaning that adding the same entity to the 197 // requester engine multiple times has no effect, unless the item has 198 // expired due to too many requests and has thus been deleted from the 199 // list. The provided selector will be applied to the set of valid providers on top 200 // of the global selector injected upon construction. It allows for finer-grained 201 // control over which subset of providers to request a given entity from, such as 202 // selection of a collection cluster. Use `filter.Any` if no additional selection 203 // is required. Checks integrity of response to make sure that we got entity that we were requesting. 204 func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter[flow.Identity]) { 205 e.addEntityRequest(entityID, selector, true) 206 } 207 208 // Query will request data through the request engine backing the interface. 209 // The additional selector will be applied to the subset 210 // of valid providers for the data and allows finer-grained control 211 // over which providers to request data from. Doesn't perform integrity check 212 // can be used to get entities without knowing their ID. 213 func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter[flow.Identity]) { 214 e.addEntityRequest(key, selector, false) 215 } 216 217 func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter[flow.Identity], checkIntegrity bool) { 218 e.unit.Lock() 219 defer e.unit.Unlock() 220 221 // check if we already have an item for this entity 222 _, duplicate := e.items[entityID] 223 if duplicate { 224 return 225 } 226 227 // otherwise, add a new item to the list 228 item := &Item{ 229 EntityID: entityID, 230 NumAttempts: 0, 231 LastRequested: time.Time{}, 232 RetryAfter: e.cfg.RetryInitial, 233 ExtraSelector: selector, 234 checkIntegrity: checkIntegrity, 235 } 236 e.items[entityID] = item 237 } 238 239 // Force will force the requester engine to dispatch all currently 240 // valid batch requests. 241 func (e *Engine) Force() { 242 // exit early in case a forced dispatch is currently ongoing 243 if e.forcedDispatchOngoing.Load() { 244 return 245 } 246 247 // using Launch to ensure the caller won't be blocked 248 e.unit.Launch(func() { 249 // using atomic bool to ensure there is at most one caller would trigger dispatching requests 250 if e.forcedDispatchOngoing.CompareAndSwap(false, true) { 251 count := uint(0) 252 for { 253 dispatched, err := e.dispatchRequest() 254 if err != nil { 255 e.log.Error().Err(err).Msg("could not dispatch requests") 256 break 257 } 258 if !dispatched { 259 e.log.Debug().Uint("requests", count).Msg("forced request dispatch") 260 break 261 } 262 count++ 263 } 264 e.forcedDispatchOngoing.Store(false) 265 } 266 }) 267 } 268 269 func (e *Engine) poll() { 270 ticker := time.NewTicker(e.cfg.BatchInterval) 271 272 PollLoop: 273 for { 274 select { 275 case <-e.unit.Quit(): 276 break PollLoop 277 278 case <-ticker.C: 279 if e.forcedDispatchOngoing.Load() { 280 return 281 } 282 283 dispatched, err := e.dispatchRequest() 284 if err != nil { 285 e.log.Error().Err(err).Msg("could not dispatch requests") 286 continue PollLoop 287 } 288 if dispatched { 289 e.log.Debug().Uint("requests", 1).Msg("regular request dispatch") 290 } 291 } 292 } 293 294 ticker.Stop() 295 } 296 297 // dispatchRequest dispatches a subset of requests (selection based on internal heuristic). 298 // While `dispatchRequest` sends a request (covering some but not necessarily all items), 299 // if and only if there is something to request. In other words it cannot happen that 300 // `dispatchRequest` sends no request, but there is something to be requested. 301 // The boolean return value indicates whether a request was dispatched at all. 302 func (e *Engine) dispatchRequest() (bool, error) { 303 304 e.unit.Lock() 305 defer e.unit.Unlock() 306 307 e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities") 308 309 // get the current top-level set of valid providers 310 providers, err := e.state.Final().Identities(e.selector) 311 if err != nil { 312 return false, fmt.Errorf("could not get providers: %w", err) 313 } 314 315 // randomize order of items, so that they can be requested in different order each time 316 rndItems := make([]flow.Identifier, 0, len(e.items)) 317 for k := range e.items { 318 rndItems = append(rndItems, e.items[k].EntityID) 319 } 320 err = rand.Shuffle(uint(len(rndItems)), func(i, j uint) { 321 rndItems[i], rndItems[j] = rndItems[j], rndItems[i] 322 }) 323 if err != nil { 324 return false, fmt.Errorf("shuffle failed: %w", err) 325 } 326 327 // go through each item and decide if it should be requested again 328 now := time.Now().UTC() 329 var providerID flow.Identifier 330 var entityIDs []flow.Identifier 331 for _, entityID := range rndItems { 332 item := e.items[entityID] 333 334 // if the item should not be requested yet, ignore 335 cutoff := item.LastRequested.Add(item.RetryAfter) 336 if cutoff.After(now) { 337 continue 338 } 339 340 // if the item reached maximum amount of retries, drop 341 if item.NumAttempts >= e.cfg.RetryAttempts { 342 e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID max amount of retries reached") 343 delete(e.items, entityID) 344 continue 345 } 346 347 // if the provider has already been chosen, check if this item 348 // can be requested from the same provider; otherwise skip it 349 // for now, so it will be part of the next batch request 350 if providerID != flow.ZeroID { 351 overlap := providers.Filter(filter.And( 352 filter.HasNodeID[flow.Identity](providerID), 353 item.ExtraSelector, 354 )) 355 if len(overlap) == 0 { 356 continue 357 } 358 } 359 360 // if no provider has been chosen yet, choose from restricted set 361 // NOTE: a single item can not permanently block requests going 362 // out when no providers are available for it, because the iteration 363 // order is random and will skip the item most of the times 364 // when other items are available 365 if providerID == flow.ZeroID { 366 providers = providers.Filter(item.ExtraSelector) 367 if len(providers) == 0 { 368 return false, fmt.Errorf("no valid providers available") 369 } 370 id, err := providers.Sample(1) 371 if err != nil { 372 return false, fmt.Errorf("sampling failed: %w", err) 373 } 374 providerID = id[0].NodeID 375 } 376 377 // add item to list and set retry parameters 378 // NOTE: we add the retry interval to the last requested timestamp, 379 // rather than using the current timestamp, in order to conserve a 380 // more even distribution of timestamps over time, which should lead 381 // to a more even distribution of entities over batch requests 382 entityIDs = append(entityIDs, entityID) 383 item.NumAttempts++ 384 item.LastRequested = now 385 item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter) 386 387 // make sure the interval is within parameters 388 if item.RetryAfter < e.cfg.RetryInitial { 389 item.RetryAfter = e.cfg.RetryInitial 390 } 391 if item.RetryAfter > e.cfg.RetryMaximum { 392 item.RetryAfter = e.cfg.RetryMaximum 393 } 394 395 // if we reached the maximum size for a batch, bail 396 if uint(len(entityIDs)) >= e.cfg.BatchThreshold { 397 break 398 } 399 } 400 401 // if there are no items to request, return 402 if len(entityIDs) == 0 { 403 return false, nil 404 } 405 406 nonce, err := rand.Uint64() 407 if err != nil { 408 return false, fmt.Errorf("nonce generation failed: %w", err) 409 } 410 411 // create a batch request, send it and store it for reference 412 req := &messages.EntityRequest{ 413 Nonce: nonce, 414 EntityIDs: entityIDs, 415 } 416 417 requestStart := time.Now() 418 419 if e.log.Debug().Enabled() { 420 e.log.Debug(). 421 Hex("provider", logging.ID(providerID)). 422 Uint64("nonce", req.Nonce). 423 Int("num_selected", len(entityIDs)). 424 Strs("entities", logging.IDs(entityIDs)). 425 Msg("sending entity request") 426 } 427 428 err = e.con.Unicast(req, providerID) 429 if err != nil { 430 return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err) 431 } 432 e.requests[req.Nonce] = req 433 434 if e.log.Debug().Enabled() { 435 e.log.Debug(). 436 Hex("provider", logging.ID(providerID)). 437 Uint64("nonce", req.Nonce). 438 Strs("entities", logging.IDs(entityIDs)). 439 TimeDiff("duration", time.Now(), requestStart). 440 Msg("entity request sent") 441 } 442 443 // NOTE: we forget about requests after the expiry of the shortest retry time 444 // from the entities in the list; this means that we purge requests aggressively. 445 // However, most requests should be responded to on the first attempt and clearing 446 // these up only removes the ability to instantly retry upon partial responses, so 447 // it won't affect much. 448 go func() { 449 <-time.After(e.cfg.RetryInitial) 450 451 e.unit.Lock() 452 defer e.unit.Unlock() 453 delete(e.requests, req.Nonce) 454 }() 455 456 e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest) 457 e.log.Debug(). 458 Uint64("nonce", req.Nonce). 459 Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()). 460 Msg("entity request sent") 461 462 return true, nil 463 } 464 465 // process processes events for the propagation engine on the consensus node. 466 func (e *Engine) process(originID flow.Identifier, message interface{}) error { 467 468 e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse) 469 defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse) 470 471 switch msg := message.(type) { 472 case *messages.EntityResponse: 473 return e.onEntityResponse(originID, msg) 474 default: 475 return engine.NewInvalidInputErrorf("invalid message type (%T)", message) 476 } 477 } 478 479 func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error { 480 lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger() 481 482 lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received") 483 484 if e.cfg.ValidateStaking { 485 486 // check that the response comes from a valid provider 487 providers, err := e.state.Final().Identities(filter.And( 488 e.selector, 489 filter.HasNodeID[flow.Identity](originID), 490 )) 491 if err != nil { 492 return fmt.Errorf("could not get providers: %w", err) 493 } 494 if len(providers) == 0 { 495 return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID) 496 } 497 } 498 499 if e.log.Debug().Enabled() { 500 e.log.Debug(). 501 Hex("provider", logging.ID(originID)). 502 Strs("entities", logging.IDs(res.EntityIDs)). 503 Uint64("nonce", res.Nonce). 504 Msg("onEntityResponse entries received") 505 } 506 507 e.unit.Lock() 508 defer e.unit.Unlock() 509 510 // build a list of needed entities; if not available, process anyway, 511 // but in that case we can't re-queue missing items 512 needed := make(map[flow.Identifier]struct{}) 513 req, exists := e.requests[res.Nonce] 514 if exists { 515 delete(e.requests, req.Nonce) 516 for _, entityID := range req.EntityIDs { 517 needed[entityID] = struct{}{} 518 } 519 } 520 521 // ensure the response is correctly formed 522 if len(res.Blobs) != len(res.EntityIDs) { 523 return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs)) 524 } 525 526 // process each entity in the response 527 // NOTE: this requires engines to be somewhat idempotent, which is a good 528 // thing, as it increases the robustness of their code 529 for i := 0; i < len(res.Blobs); i++ { 530 blob := res.Blobs[i] 531 entityID := res.EntityIDs[i] 532 533 // the entity might already have been returned in another response 534 item, exists := e.items[entityID] 535 if !exists { 536 lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items skipping") 537 continue 538 } 539 540 // create the entity with underlying concrete type and decode blob 541 entity := e.create() 542 err := msgpack.Unmarshal(blob, &entity) 543 if err != nil { 544 return fmt.Errorf("could not decode entity: %w", err) 545 } 546 547 if item.checkIntegrity { 548 actualEntityID := entity.ID() 549 // validate that we got correct entity, exactly what we were expecting 550 if entityID != actualEntityID { 551 lg.Error(). 552 Hex("stated_entity_id", logging.ID(entityID)). 553 Hex("provided_entity", logging.ID(actualEntityID)). 554 Bool(logging.KeySuspicious, true). 555 Msg("provided entity does not match stated ID") 556 continue 557 } 558 } 559 560 // remove from needed items and pending items 561 delete(needed, entityID) 562 delete(e.items, entityID) 563 564 // process the entity 565 go e.handle(originID, entity) 566 } 567 568 // requeue requested entities that have not been delivered in the response 569 // NOTE: this logic allows a provider to send back an empty response to 570 // indicate that none of the requested entities are available, thus allowing 571 // the requester engine to immediately request them from another provider 572 for entityID := range needed { 573 574 // it's possible the item is unavailable, if it was already received 575 // in another response 576 item, exists := e.items[entityID] 577 if !exists { 578 // the entity could have been received in another request 579 continue 580 } 581 582 // we set the timestamp to zero, so that the item will be included 583 // in the next batch regardless of retry interval 584 item.LastRequested = time.Time{} 585 } 586 587 return nil 588 }