// github.com/onflow/flow-go@v0.33.17/engine/common/requester/engine.go

package requester

import (
	"fmt"
	"math"
	"time"

	"github.com/rs/zerolog"
	"github.com/vmihailenco/msgpack"
	"go.uber.org/atomic"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/utils/logging"
	"github.com/onflow/flow-go/utils/rand"
)

// HandleFunc is a function provided to the requester engine to handle an entity
// once it has been retrieved from a provider. The function should be non-blocking
// and errors should be handled internally within the function.
type HandleFunc func(originID flow.Identifier, entity flow.Entity)

// CreateFunc is a function that creates a `flow.Entity` with an underlying type
// so that we can properly decode entities transmitted over the network.
type CreateFunc func() flow.Entity

// Engine is a generic requester engine, handling the requesting of entities
// on the flow network. It is the `request` part of the request-reply
// pattern provided by the pair of generic exchange engines.
type Engine struct {
	unit     *engine.Unit
	log      zerolog.Logger
	cfg      Config
	metrics  module.EngineMetrics
	me       module.Local
	state    protocol.State
	con      network.Conduit
	channel  channels.Channel
	selector flow.IdentityFilter
	create   CreateFunc
	handle   HandleFunc

	// changes to the following state variables must be guarded by unit.Lock()
	items                 map[flow.Identifier]*Item
	requests              map[uint64]*messages.EntityRequest
	forcedDispatchOngoing *atomic.Bool // ensures the dispatch logic is triggered by at most one caller at a time
}
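
// As an illustrative sketch (not part of this file), a CreateFunc/HandleFunc
// pair for requesting collections might look as follows; the concrete type
// returned by the CreateFunc determines how response blobs are decoded, and
// the HandleFunc must not block, so it hands the entity off to a channel:
//
//	create := func() flow.Entity { return &flow.Collection{} }
//
//	received := make(chan *flow.Collection, 128)
//	handle := func(originID flow.Identifier, entity flow.Entity) {
//		collection, ok := entity.(*flow.Collection)
//		if !ok {
//			return // wrong type; errors are handled internally, not returned
//		}
//		received <- collection // non-blocking as long as the buffer has room
//	}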

// New creates a new requester engine, operating on the provided network channel, and requesting entities from a node
// within the set obtained by applying the provided selector filter. The options allow customization of the parameters
// related to the batch and retry logic.
func New(log zerolog.Logger, metrics module.EngineMetrics, net network.EngineRegistry, me module.Local, state protocol.State,
	channel channels.Channel, selector flow.IdentityFilter, create CreateFunc, options ...OptionFunc) (*Engine, error) {

	// initialize the default config
	cfg := Config{
		BatchThreshold:  32,
		BatchInterval:   time.Second,
		RetryInitial:    4 * time.Second,
		RetryFunction:   RetryGeometric(2),
		RetryMaximum:    2 * time.Minute,
		RetryAttempts:   math.MaxUint32,
		ValidateStaking: true,
	}

	// apply the custom option parameters
	for _, option := range options {
		option(&cfg)
	}

	// check validity of retry function
	interval := cfg.RetryFunction(time.Second)
	if interval < time.Second {
		return nil, fmt.Errorf("invalid retry function (interval must always increase)")
	}

	// check validity of maximum interval
	if cfg.RetryMaximum < cfg.RetryInitial {
		return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)")
	}

	// make sure we don't send requests to ourselves or to ejected nodes
	selector = filter.And(
		selector,
		filter.Not(filter.HasNodeID(me.NodeID())),
		filter.Not(filter.Ejected),
	)

	// make sure we don't send requests to unauthorized nodes
	if cfg.ValidateStaking {
		selector = filter.And(
			selector,
			filter.HasWeight(true),
		)
	}

	// initialize the requester engine with its dependencies
	e := &Engine{
		unit:                  engine.NewUnit(),
		log:                   log.With().Str("engine", "requester").Logger(),
		cfg:                   cfg,
		metrics:               metrics,
		me:                    me,
		state:                 state,
		channel:               channel,
		selector:              selector,
		create:                create,
		handle:                nil,
		items:                 make(map[flow.Identifier]*Item),          // holds all pending items
		requests:              make(map[uint64]*messages.EntityRequest), // holds all sent requests
		forcedDispatchOngoing: atomic.NewBool(false),
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channel, e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	return e, nil
}

// WithHandle sets the handle function of the requester, which is how it processes
// returned entities. The engine cannot be started without setting the handle
// function. It is done in a separate call so that the requester can be injected
// into engines upon construction, and then provide a handle function to the
// requester from that engine itself.
func (e *Engine) WithHandle(handle HandleFunc) *Engine {
	e.handle = handle
	return e
}

// Ready returns a ready channel that is closed once the engine has fully
// started. It panics if the handle function has not been set, as the engine
// cannot process responses without one.
func (e *Engine) Ready() <-chan struct{} {
	if e.handle == nil {
		panic("must initialize requester engine with handler")
	}
	e.unit.Launch(e.poll)
	return e.unit.Ready()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	return e.unit.Done()
}
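
// As an illustrative sketch (hypothetical wiring; it assumes this package's
// option constructors, e.g. a WithBatchThreshold OptionFunc, and a handler
// like the one sketched above), construction and startup look as follows:
//
//	requester, err := New(log, metrics, net, me, state,
//		channels.RequestCollections,
//		filter.HasRole(flow.RoleCollection),
//		func() flow.Entity { return &flow.Collection{} },
//		WithBatchThreshold(16),
//	)
//	if err != nil {
//		return fmt.Errorf("could not create requester: %w", err)
//	}
//	requester.WithHandle(handle)
//	<-requester.Ready() // panics if WithHandle was never called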

// SubmitLocal submits a message originating on the local node.
func (e *Engine) SubmitLocal(message interface{}) {
	e.unit.Launch(func() {
		err := e.process(e.me.NodeID(), message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// Submit submits the given message from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) {
	e.unit.Launch(func() {
		err := e.Process(channel, originID, message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// ProcessLocal processes a message originating on the local node.
func (e *Engine) ProcessLocal(message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(e.me.NodeID(), message)
	})
}

// Process processes the given message from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(originID, message)
	})
}

// EntityByID adds an entity to the list of entities to be requested from the
// provider. It is idempotent, meaning that adding the same entity to the
// requester engine multiple times has no effect, unless the item has
// expired due to too many requests and has thus been deleted from the
// list. The provided selector will be applied to the set of valid providers on top
// of the global selector injected upon construction. It allows for finer-grained
// control over which subset of providers to request a given entity from, such as
// selection of a collection cluster. Use `filter.Any` if no additional selection
// is required. The response is integrity-checked to make sure the returned
// entity matches the requested entity ID.
func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(entityID, selector, true)
}

// Query will request data through the request engine backing the interface.
// The additional selector will be applied to the subset
// of valid providers for the data and allows finer-grained control
// over which providers to request data from. It does not perform an integrity
// check, so it can be used to get entities without knowing their ID.
func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(key, selector, false)
}

func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter, checkIntegrity bool) {
	e.unit.Lock()
	defer e.unit.Unlock()

	// check if we already have an item for this entity
	_, duplicate := e.items[entityID]
	if duplicate {
		return
	}

	// otherwise, add a new item to the list
	item := &Item{
		EntityID:       entityID,
		NumAttempts:    0,
		LastRequested:  time.Time{},
		RetryAfter:     e.cfg.RetryInitial,
		ExtraSelector:  selector,
		checkIntegrity: checkIntegrity,
	}
	e.items[entityID] = item
}
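
// As an illustrative sketch, both request flavors side by side (clusterIDs is
// a hypothetical list of node IDs forming the responsible collection cluster):
//
//	// integrity-checked: the response must contain an entity whose ID
//	// matches collectionID, requested only from the cluster members
//	requester.EntityByID(collectionID, filter.HasNodeID(clusterIDs...))
//
//	// no integrity check: the key does not need to match the returned
//	// entity's ID, and any valid provider may answer
//	requester.Query(key, filter.Any)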

// Force will force the requester engine to dispatch all currently
// valid batch requests.
func (e *Engine) Force() {
	// exit early in case a forced dispatch is currently ongoing
	if e.forcedDispatchOngoing.Load() {
		return
	}

	// using Launch to ensure the caller won't be blocked
	e.unit.Launch(func() {
		// the atomic bool ensures that at most one caller triggers the dispatch loop at a time
		if e.forcedDispatchOngoing.CompareAndSwap(false, true) {
			count := uint(0)
			for {
				dispatched, err := e.dispatchRequest()
				if err != nil {
					e.log.Error().Err(err).Msg("could not dispatch requests")
					break
				}
				if !dispatched {
					e.log.Debug().Uint("requests", count).Msg("forced request dispatch")
					break
				}
				count++
			}
			e.forcedDispatchOngoing.Store(false)
		}
	})
}

func (e *Engine) poll() {
	ticker := time.NewTicker(e.cfg.BatchInterval)
	defer ticker.Stop()

PollLoop:
	for {
		select {
		case <-e.unit.Quit():
			break PollLoop

		case <-ticker.C:
			// skip this tick if a forced dispatch is currently draining the queue;
			// returning here would permanently stop the polling goroutine
			if e.forcedDispatchOngoing.Load() {
				continue PollLoop
			}

			dispatched, err := e.dispatchRequest()
			if err != nil {
				e.log.Error().Err(err).Msg("could not dispatch requests")
				continue PollLoop
			}
			if dispatched {
				e.log.Debug().Uint("requests", 1).Msg("regular request dispatch")
			}
		}
	}
}
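
// The Load/CompareAndSwap pair above implements a single-flight guard: many
// callers may invoke Force concurrently, but only one drains the queue. A
// minimal standalone sketch of the same pattern (hypothetical helper):
//
//	var busy = atomic.NewBool(false) // go.uber.org/atomic
//
//	func runExclusive(work func()) {
//		if !busy.CompareAndSwap(false, true) {
//			return // another caller is already running the work
//		}
//		defer busy.Store(false)
//		work()
//	}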

// dispatchRequest dispatches a single batch request, selecting a subset of the
// pending items based on an internal heuristic. It sends a request (covering
// some, but not necessarily all, items) if and only if at least one item is
// currently eligible for requesting; it can never send no request while
// eligible items remain. The boolean return value indicates whether a request
// was dispatched at all.
func (e *Engine) dispatchRequest() (bool, error) {

	e.unit.Lock()
	defer e.unit.Unlock()

	e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities")

	// get the current top-level set of valid providers
	providers, err := e.state.Final().Identities(e.selector)
	if err != nil {
		return false, fmt.Errorf("could not get providers: %w", err)
	}

	// randomize the order of items, so that they can be requested in a different order each time
	rndItems := make([]flow.Identifier, 0, len(e.items))
	for k := range e.items {
		rndItems = append(rndItems, e.items[k].EntityID)
	}
	err = rand.Shuffle(uint(len(rndItems)), func(i, j uint) {
		rndItems[i], rndItems[j] = rndItems[j], rndItems[i]
	})
	if err != nil {
		return false, fmt.Errorf("shuffle failed: %w", err)
	}

	// go through each item and decide if it should be requested again
	now := time.Now().UTC()
	var providerID flow.Identifier
	var entityIDs []flow.Identifier
	for _, entityID := range rndItems {
		item := e.items[entityID]

		// if the item should not be requested yet, ignore
		cutoff := item.LastRequested.Add(item.RetryAfter)
		if cutoff.After(now) {
			continue
		}

		// if the item reached the maximum number of retries, drop it
		if item.NumAttempts >= e.cfg.RetryAttempts {
			e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID, max number of retries reached")
			delete(e.items, entityID)
			continue
		}

		// if the provider has already been chosen, check if this item
		// can be requested from the same provider; otherwise skip it
		// for now, so it will be part of the next batch request
		if providerID != flow.ZeroID {
			overlap := providers.Filter(filter.And(
				filter.HasNodeID(providerID),
				item.ExtraSelector,
			))
			if len(overlap) == 0 {
				continue
			}
		}

		// if no provider has been chosen yet, choose from the restricted set
		// NOTE: a single item cannot permanently block requests going
		// out when no providers are available for it, because the iteration
		// order is random and will skip the item most of the time
		// when other items are available
		if providerID == flow.ZeroID {
			providers = providers.Filter(item.ExtraSelector)
			if len(providers) == 0 {
				return false, fmt.Errorf("no valid providers available")
			}
			id, err := providers.Sample(1)
			if err != nil {
				return false, fmt.Errorf("sampling failed: %w", err)
			}
			providerID = id[0].NodeID
		}

		// add the item to the list and set the retry parameters
		// NOTE: we add the retry interval to the last requested timestamp,
		// rather than using the current timestamp, in order to conserve a
		// more even distribution of timestamps over time, which should lead
		// to a more even distribution of entities over batch requests
		entityIDs = append(entityIDs, entityID)
		item.NumAttempts++
		item.LastRequested = now
		item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter)

		// make sure the interval is within parameters
		if item.RetryAfter < e.cfg.RetryInitial {
			item.RetryAfter = e.cfg.RetryInitial
		}
		if item.RetryAfter > e.cfg.RetryMaximum {
			item.RetryAfter = e.cfg.RetryMaximum
		}

		// if we reached the maximum size for a batch, bail
		if uint(len(entityIDs)) >= e.cfg.BatchThreshold {
			break
		}
	}

	// if there are no items to request, return
	if len(entityIDs) == 0 {
		return false, nil
	}

	nonce, err := rand.Uint64()
	if err != nil {
		return false, fmt.Errorf("nonce generation failed: %w", err)
	}

	// create a batch request, send it and store it for reference
	req := &messages.EntityRequest{
		Nonce:     nonce,
		EntityIDs: entityIDs,
	}

	requestStart := time.Now()

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Int("num_selected", len(entityIDs)).
			Strs("entities", logging.IDs(entityIDs)).
			Msg("sending entity request")
	}

	err = e.con.Unicast(req, providerID)
	if err != nil {
		return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err)
	}
	e.requests[req.Nonce] = req

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Strs("entities", logging.IDs(entityIDs)).
			TimeDiff("duration", time.Now(), requestStart).
			Msg("entity request sent")
	}

	// NOTE: we forget about requests after the expiry of the shortest retry time
	// from the entities in the list; this means that we purge requests aggressively.
	// However, most requests should be responded to on the first attempt and clearing
	// these up only removes the ability to instantly retry upon partial responses, so
	// it won't affect much.
	go func() {
		<-time.After(e.cfg.RetryInitial)

		e.unit.Lock()
		defer e.unit.Unlock()
		delete(e.requests, req.Nonce)
	}()

	e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest)
	e.log.Debug().
		Uint64("nonce", req.Nonce).
		Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()).
		Msg("entity request sent")

	return true, nil
}
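
// Worked example of the retry bookkeeping above, using the defaults from New
// (RetryInitial = 4s, RetryFunction = RetryGeometric(2), RetryMaximum = 2m):
// a fresh item carries RetryAfter = 4s and a zero LastRequested timestamp, so
// it is eligible immediately. After each dispatch the interval doubles and is
// then clamped to the configured bounds, yielding successive waits of
//
//	8s, 16s, 32s, 64s, 2m (128s clamped to RetryMaximum), 2m, ...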
460 Msg("entity request sent") 461 462 return true, nil 463 } 464 465 // process processes events for the propagation engine on the consensus node. 466 func (e *Engine) process(originID flow.Identifier, message interface{}) error { 467 468 e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse) 469 defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse) 470 471 switch msg := message.(type) { 472 case *messages.EntityResponse: 473 return e.onEntityResponse(originID, msg) 474 default: 475 return engine.NewInvalidInputErrorf("invalid message type (%T)", message) 476 } 477 } 478 479 func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error { 480 lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger() 481 482 lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received") 483 484 if e.cfg.ValidateStaking { 485 486 // check that the response comes from a valid provider 487 providers, err := e.state.Final().Identities(filter.And( 488 e.selector, 489 filter.HasNodeID(originID), 490 )) 491 if err != nil { 492 return fmt.Errorf("could not get providers: %w", err) 493 } 494 if len(providers) == 0 { 495 return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID) 496 } 497 } 498 499 if e.log.Debug().Enabled() { 500 e.log.Debug(). 501 Hex("provider", logging.ID(originID)). 502 Strs("entities", logging.IDs(res.EntityIDs)). 503 Uint64("nonce", res.Nonce). 504 Msg("onEntityResponse entries received") 505 } 506 507 e.unit.Lock() 508 defer e.unit.Unlock() 509 510 // build a list of needed entities; if not available, process anyway, 511 // but in that case we can't re-queue missing items 512 needed := make(map[flow.Identifier]struct{}) 513 req, exists := e.requests[res.Nonce] 514 if exists { 515 delete(e.requests, req.Nonce) 516 for _, entityID := range req.EntityIDs { 517 needed[entityID] = struct{}{} 518 } 519 } 520 521 // ensure the response is correctly formed 522 if len(res.Blobs) != len(res.EntityIDs) { 523 return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs)) 524 } 525 526 // process each entity in the response 527 // NOTE: this requires engines to be somewhat idempotent, which is a good 528 // thing, as it increases the robustness of their code 529 for i := 0; i < len(res.Blobs); i++ { 530 blob := res.Blobs[i] 531 entityID := res.EntityIDs[i] 532 533 // the entity might already have been returned in another response 534 item, exists := e.items[entityID] 535 if !exists { 536 lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items skipping") 537 continue 538 } 539 540 // create the entity with underlying concrete type and decode blob 541 entity := e.create() 542 err := msgpack.Unmarshal(blob, &entity) 543 if err != nil { 544 return fmt.Errorf("could not decode entity: %w", err) 545 } 546 547 if item.checkIntegrity { 548 actualEntityID := entity.ID() 549 // validate that we got correct entity, exactly what we were expecting 550 if entityID != actualEntityID { 551 lg.Error(). 552 Hex("stated_entity_id", logging.ID(entityID)). 553 Hex("provided_entity", logging.ID(actualEntityID)). 554 Bool(logging.KeySuspicious, true). 
555 Msg("provided entity does not match stated ID") 556 continue 557 } 558 } 559 560 // remove from needed items and pending items 561 delete(needed, entityID) 562 delete(e.items, entityID) 563 564 // process the entity 565 go e.handle(originID, entity) 566 } 567 568 // requeue requested entities that have not been delivered in the response 569 // NOTE: this logic allows a provider to send back an empty response to 570 // indicate that none of the requested entities are available, thus allowing 571 // the requester engine to immediately request them from another provider 572 for entityID := range needed { 573 574 // it's possible the item is unavailable, if it was already received 575 // in another response 576 item, exists := e.items[entityID] 577 if !exists { 578 // the entity could have been received in another request 579 continue 580 } 581 582 // we set the timestamp to zero, so that the item will be included 583 // in the next batch regardless of retry interval 584 item.LastRequested = time.Time{} 585 } 586 587 return nil 588 }