// github.com/koko1123/flow-go-1@v0.29.6/engine/common/requester/engine.go

package requester

import (
	"fmt"
	"math"
	"math/rand"
	"time"

	"github.com/rs/zerolog"
	"github.com/vmihailenco/msgpack"
	"go.uber.org/atomic"

	"github.com/koko1123/flow-go-1/engine"
	"github.com/koko1123/flow-go-1/model/flow"
	"github.com/koko1123/flow-go-1/model/flow/filter"
	"github.com/koko1123/flow-go-1/model/messages"
	"github.com/koko1123/flow-go-1/module"
	"github.com/koko1123/flow-go-1/module/metrics"
	"github.com/koko1123/flow-go-1/network"
	"github.com/koko1123/flow-go-1/network/channels"
	"github.com/koko1123/flow-go-1/state/protocol"
	"github.com/koko1123/flow-go-1/utils/logging"
)

// HandleFunc is a function provided to the requester engine to handle an entity
// once it has been retrieved from a provider. The function should be non-blocking
// and errors should be handled internally within the function.
type HandleFunc func(originID flow.Identifier, entity flow.Entity)

// CreateFunc is a function that creates a `flow.Entity` with an underlying type
// so that we can properly decode entities transmitted over the network.
type CreateFunc func() flow.Entity

// Engine is a generic requester engine, handling the requesting of entities
// on the Flow network. It is the `request` part of the request-reply
// pattern provided by the pair of generic exchange engines.
type Engine struct {
	unit     *engine.Unit
	log      zerolog.Logger
	cfg      Config
	metrics  module.EngineMetrics
	me       module.Local
	state    protocol.State
	con      network.Conduit
	channel  channels.Channel
	selector flow.IdentityFilter
	create   CreateFunc
	handle   HandleFunc

	// changing the following state variables must be guarded by unit.Lock()
	items                 map[flow.Identifier]*Item
	requests              map[uint64]*messages.EntityRequest
	forcedDispatchOngoing *atomic.Bool // ensures at most one goroutine triggers the dispatching logic at a time
	rng                   *rand.Rand
}
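
// As an illustration (hypothetical caller code, not part of this engine), a
// CreateFunc/HandleFunc pair for collections might look roughly like this:
//
//	create := func() flow.Entity {
//		return &flow.Collection{} // concrete type the response blob is decoded into
//	}
//	handle := func(originID flow.Identifier, entity flow.Entity) {
//		collection := entity.(*flow.Collection)
//		go ingest(originID, collection) // ingest is a made-up sink; handle must not block
//	}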

// New creates a new requester engine, operating on the provided network channel, and requesting entities from a node
// within the set obtained by applying the provided selector filter. The options allow customization of the parameters
// related to the batch and retry logic.
func New(log zerolog.Logger, metrics module.EngineMetrics, net network.Network, me module.Local, state protocol.State,
	channel channels.Channel, selector flow.IdentityFilter, create CreateFunc, options ...OptionFunc) (*Engine, error) {

	// initialize the default config
	cfg := Config{
		BatchThreshold:  32,
		BatchInterval:   time.Second,
		RetryInitial:    4 * time.Second,
		RetryFunction:   RetryGeometric(2),
		RetryMaximum:    2 * time.Minute,
		RetryAttempts:   math.MaxUint32,
		ValidateStaking: true,
	}

	// apply the custom option parameters
	for _, option := range options {
		option(&cfg)
	}

	// check validity of retry function
	interval := cfg.RetryFunction(time.Second)
	if interval < time.Second {
		return nil, fmt.Errorf("invalid retry function (interval must always increase)")
	}

	// check validity of maximum interval
	if cfg.RetryMaximum < cfg.RetryInitial {
		return nil, fmt.Errorf("invalid retry maximum (must not be smaller than initial interval)")
	}

	// make sure we don't send requests from self
	selector = filter.And(
		selector,
		filter.Not(filter.HasNodeID(me.NodeID())),
		filter.Not(filter.Ejected),
	)

	// make sure we don't send requests to unauthorized nodes
	if cfg.ValidateStaking {
		selector = filter.And(
			selector,
			filter.HasWeight(true),
		)
	}

	// initialize the requester engine with its dependencies
	e := &Engine{
		unit:                  engine.NewUnit(),
		log:                   log.With().Str("engine", "requester").Logger(),
		cfg:                   cfg,
		metrics:               metrics,
		me:                    me,
		state:                 state,
		channel:               channel,
		selector:              selector,
		create:                create,
		handle:                nil,
		items:                 make(map[flow.Identifier]*Item),          // holds all pending items
		requests:              make(map[uint64]*messages.EntityRequest), // holds all sent requests
		forcedDispatchOngoing: atomic.NewBool(false),
		rng:                   rand.New(rand.NewSource(time.Now().UnixNano())),
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channel, e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	return e, nil
}

// WithHandle sets the handle function of the requester, which is how it processes
// returned entities. The engine cannot be started without setting the handle
// function. It is done in a separate call so that the requester can be injected
// into engines upon construction, and then provide a handle function to the
// requester from that engine itself.
func (e *Engine) WithHandle(handle HandleFunc) *Engine {
	e.handle = handle
	return e
}

// Ready returns a ready channel that is closed once the engine has fully
// started. For the requester engine, this is the case once the polling
// routine has been launched; it panics if no handle function has been set.
func (e *Engine) Ready() <-chan struct{} {
	if e.handle == nil {
		panic("must initialize requester engine with handler")
	}
	e.unit.Launch(e.poll)
	return e.unit.Ready()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	return e.unit.Done()
}
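
// A minimal construction sketch (hypothetical wiring; net, me, state, and
// onEntity come from the node builder, and the With* option helpers are
// assumed from this package's config):
//
//	requester, err := New(log, metrics, net, me, state,
//		channels.RequestCollections, filter.Any,
//		func() flow.Entity { return &flow.Collection{} },
//		WithBatchInterval(500*time.Millisecond),
//	)
//	if err != nil {
//		return nil, err
//	}
//	requester.WithHandle(onEntity)
//	<-requester.Ready()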

// SubmitLocal submits a message originating on the local node.
func (e *Engine) SubmitLocal(message interface{}) {
	e.unit.Launch(func() {
		err := e.process(e.me.NodeID(), message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// Submit submits the given message from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, message interface{}) {
	e.unit.Launch(func() {
		err := e.Process(channel, originID, message)
		if err != nil {
			engine.LogError(e.log, err)
		}
	})
}

// ProcessLocal processes a message originating on the local node.
func (e *Engine) ProcessLocal(message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(e.me.NodeID(), message)
	})
}

// Process processes the given message from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	return e.unit.Do(func() error {
		return e.process(originID, message)
	})
}

// EntityByID adds an entity to the list of entities to be requested from the
// provider. It is idempotent, meaning that adding the same entity to the
// requester engine multiple times has no effect, unless the item has
// expired due to too many requests and has thus been deleted from the
// list. The provided selector will be applied to the set of valid providers on top
// of the global selector injected upon construction. It allows for finer-grained
// control over which subset of providers to request a given entity from, such as
// selection of a collection cluster. Use `filter.Any` if no additional selection
// is required. The integrity of the response is checked to make sure we got the
// entity we were requesting.
func (e *Engine) EntityByID(entityID flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(entityID, selector, true)
}

// Query will request data through the request engine backing the interface.
// The additional selector will be applied to the subset of valid providers
// for the data and allows finer-grained control over which providers to
// request data from. It does not perform an integrity check, so it can be
// used to get entities without knowing their ID.
func (e *Engine) Query(key flow.Identifier, selector flow.IdentityFilter) {
	e.addEntityRequest(key, selector, false)
}

func (e *Engine) addEntityRequest(entityID flow.Identifier, selector flow.IdentityFilter, checkIntegrity bool) {
	e.unit.Lock()
	defer e.unit.Unlock()

	// check if we already have an item for this entity
	_, duplicate := e.items[entityID]
	if duplicate {
		return
	}

	// otherwise, add a new item to the list
	item := &Item{
		EntityID:       entityID,
		NumAttempts:    0,
		LastRequested:  time.Time{},
		RetryAfter:     e.cfg.RetryInitial,
		ExtraSelector:  selector,
		checkIntegrity: checkIntegrity,
	}
	e.items[entityID] = item
}
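
// The two request flavours side by side (illustrative caller code; guarantors
// is assumed to be the identity list of a collection cluster):
//
//	// request a specific collection; the returned entity's ID is verified
//	requester.EntityByID(collectionID, filter.HasNodeID(guarantors.NodeIDs()...))
//
//	// request by key from any valid provider, without an integrity check
//	requester.Query(resultKey, filter.Any)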

// Force will force the requester engine to dispatch all currently
// valid batch requests.
func (e *Engine) Force() {
	// exit early in case a forced dispatch is currently ongoing
	if e.forcedDispatchOngoing.Load() {
		return
	}

	// using Launch to ensure the caller won't be blocked
	e.unit.Launch(func() {
		// the atomic bool ensures that at most one caller triggers the dispatching of requests
		if e.forcedDispatchOngoing.CompareAndSwap(false, true) {
			count := uint(0)
			for {
				dispatched, err := e.dispatchRequest()
				if err != nil {
					e.log.Error().Err(err).Msg("could not dispatch requests")
					break
				}
				if !dispatched {
					e.log.Debug().Uint("requests", count).Msg("forced request dispatch")
					break
				}
				count++
			}
			e.forcedDispatchOngoing.Store(false)
		}
	})
}

func (e *Engine) poll() {
	ticker := time.NewTicker(e.cfg.BatchInterval)
	defer ticker.Stop()

PollLoop:
	for {
		select {
		case <-e.unit.Quit():
			break PollLoop

		case <-ticker.C:
			// skip this tick if a forced dispatch is already in progress
			if e.forcedDispatchOngoing.Load() {
				continue PollLoop
			}

			dispatched, err := e.dispatchRequest()
			if err != nil {
				e.log.Error().Err(err).Msg("could not dispatch requests")
				continue PollLoop
			}
			if dispatched {
				e.log.Debug().Uint("requests", 1).Msg("regular request dispatch")
			}
		}
	}
}
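
// A worked illustration of the backoff applied by dispatchRequest below, using
// the default configuration (RetryInitial 4s, RetryGeometric(2), RetryMaximum 2m):
// an item's RetryAfter evolves across dispatches as
//
//	4s -> 8s -> 16s -> 32s -> 64s -> 128s (clamped to 120s) -> 120s -> ...
//
// so an undelivered entity is re-requested with exponentially growing gaps,
// capped at two minutes.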

// dispatchRequest dispatches a subset of the pending items (selection based on
// an internal heuristic). `dispatchRequest` sends a request, covering some but
// not necessarily all items, if and only if there is something to request.
// In other words, it cannot happen that `dispatchRequest` sends no request
// while there is something to be requested. The boolean return value indicates
// whether a request was dispatched at all.
func (e *Engine) dispatchRequest() (bool, error) {

	e.unit.Lock()
	defer e.unit.Unlock()

	e.log.Debug().Int("num_entities", len(e.items)).Msg("selecting entities")

	// get the current top-level set of valid providers
	providers, err := e.state.Final().Identities(e.selector)
	if err != nil {
		return false, fmt.Errorf("could not get providers: %w", err)
	}

	// randomize the order of items, so that they can be requested in a different order each time
	rndItems := make([]flow.Identifier, 0, len(e.items))
	for k := range e.items {
		rndItems = append(rndItems, e.items[k].EntityID)
	}
	e.rng.Shuffle(len(rndItems), func(i, j int) { rndItems[i], rndItems[j] = rndItems[j], rndItems[i] })

	// go through each item and decide if it should be requested again
	now := time.Now().UTC()
	var providerID flow.Identifier
	var entityIDs []flow.Identifier
	for _, entityID := range rndItems {
		item := e.items[entityID]

		// if the item should not be requested yet, ignore
		cutoff := item.LastRequested.Add(item.RetryAfter)
		if cutoff.After(now) {
			continue
		}

		// if the item has reached the maximum number of retries, drop it
		if item.NumAttempts >= e.cfg.RetryAttempts {
			e.log.Debug().Str("entity_id", entityID.String()).Msg("dropping entity ID, max number of retries reached")
			delete(e.items, entityID)
			continue
		}

		// if the provider has already been chosen, check if this item
		// can be requested from the same provider; otherwise skip it
		// for now, so it will be part of the next batch request
		if providerID != flow.ZeroID {
			overlap := providers.Filter(filter.And(
				filter.HasNodeID(providerID),
				item.ExtraSelector,
			))
			if len(overlap) == 0 {
				continue
			}
		}

		// if no provider has been chosen yet, choose from the restricted set
		// NOTE: a single item cannot permanently block requests from going
		// out when no providers are available for it, because the iteration
		// order is random and will skip the item most of the time
		// when other items are available
		if providerID == flow.ZeroID {
			providers = providers.Filter(item.ExtraSelector)
			if len(providers) == 0 {
				return false, fmt.Errorf("no valid providers available")
			}
			providerID = providers.Sample(1)[0].NodeID
		}

		// add the item to the list and set the retry parameters
		// NOTE: we add the retry interval to the last requested timestamp,
		// rather than using the current timestamp, in order to preserve a
		// more even distribution of timestamps over time, which should lead
		// to a more even distribution of entities over batch requests
		entityIDs = append(entityIDs, entityID)
		item.NumAttempts++
		item.LastRequested = now
		item.RetryAfter = e.cfg.RetryFunction(item.RetryAfter)

		// make sure the interval is within parameters
		if item.RetryAfter < e.cfg.RetryInitial {
			item.RetryAfter = e.cfg.RetryInitial
		}
		if item.RetryAfter > e.cfg.RetryMaximum {
			item.RetryAfter = e.cfg.RetryMaximum
		}

		// if we reached the maximum size for a batch, bail
		if uint(len(entityIDs)) >= e.cfg.BatchThreshold {
			break
		}
	}

	// if there are no items to request, return
	if len(entityIDs) == 0 {
		return false, nil
	}

	// create a batch request, send it and store it for reference
	req := &messages.EntityRequest{
		Nonce:     e.rng.Uint64(),
		EntityIDs: entityIDs,
	}

	requestStart := time.Now()

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Int("num_selected", len(entityIDs)).
			Strs("entities", logging.IDs(entityIDs)).
			Msg("sending entity request")
	}

	err = e.con.Unicast(req, providerID)
	if err != nil {
		return true, fmt.Errorf("could not send request for entities %v: %w", logging.IDs(entityIDs), err)
	}
	e.requests[req.Nonce] = req

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(providerID)).
			Uint64("nonce", req.Nonce).
			Strs("entities", logging.IDs(entityIDs)).
			TimeDiff("duration", time.Now(), requestStart).
			Msg("entity request sent")
	}

	// NOTE: we forget about requests after the expiry of the shortest retry time
	// from the entities in the list; this means that we purge requests aggressively.
	// However, most requests should be responded to on the first attempt and clearing
	// these up only removes the ability to instantly retry upon partial responses, so
	// it won't affect much.
	go func() {
		<-time.After(e.cfg.RetryInitial)

		e.unit.Lock()
		defer e.unit.Unlock()
		delete(e.requests, req.Nonce)
	}()

	e.metrics.MessageSent(e.channel.String(), metrics.MessageEntityRequest)
	e.log.Debug().
		Uint64("nonce", req.Nonce).
		Strs("entity_ids", flow.IdentifierList(req.EntityIDs).Strings()).
		Msg("entity request sent")

	return true, nil
}
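
// The request/response pairing this engine relies on (illustrative sketch;
// field values are made up): a provider echoes the request nonce and returns
// blobs index-aligned with the entity IDs it actually serves, which is exactly
// what onEntityResponse below checks:
//
//	req := &messages.EntityRequest{Nonce: 42, EntityIDs: ids}
//	res := &messages.EntityResponse{
//		Nonce:     req.Nonce, // echoed, so the response can be matched to the request
//		EntityIDs: served,    // the subset of req.EntityIDs actually available
//		Blobs:     blobs,     // msgpack-encoded entities, same order as EntityIDs
//	}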

// process processes messages for the requester engine; the only accepted
// message type is an entity response.
func (e *Engine) process(originID flow.Identifier, message interface{}) error {

	e.metrics.MessageReceived(e.channel.String(), metrics.MessageEntityResponse)
	defer e.metrics.MessageHandled(e.channel.String(), metrics.MessageEntityResponse)

	switch msg := message.(type) {
	case *messages.EntityResponse:
		return e.onEntityResponse(originID, msg)
	default:
		return engine.NewInvalidInputErrorf("invalid message type (%T)", message)
	}
}
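
// For reference, the decode in onEntityResponse below is the inverse of the
// encoding a provider would apply (illustrative sketch; the collection entity
// is an assumption):
//
//	blob, _ := msgpack.Marshal(collection)  // provider side: entity -> blob
//	entity := e.create()                    // e.g. &flow.Collection{}
//	err := msgpack.Unmarshal(blob, &entity) // requester side: blob -> entity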

func (e *Engine) onEntityResponse(originID flow.Identifier, res *messages.EntityResponse) error {
	lg := e.log.With().Str("origin_id", originID.String()).Uint64("nonce", res.Nonce).Logger()

	lg.Debug().Strs("entity_ids", flow.IdentifierList(res.EntityIDs).Strings()).Msg("entity response received")

	if e.cfg.ValidateStaking {

		// check that the response comes from a valid provider
		providers, err := e.state.Final().Identities(filter.And(
			e.selector,
			filter.HasNodeID(originID),
		))
		if err != nil {
			return fmt.Errorf("could not get providers: %w", err)
		}
		if len(providers) == 0 {
			return engine.NewInvalidInputErrorf("invalid provider origin (%x)", originID)
		}
	}

	if e.log.Debug().Enabled() {
		e.log.Debug().
			Hex("provider", logging.ID(originID)).
			Strs("entities", logging.IDs(res.EntityIDs)).
			Uint64("nonce", res.Nonce).
			Msg("onEntityResponse entries received")
	}

	e.unit.Lock()
	defer e.unit.Unlock()

	// build a list of needed entities; if not available, process anyway,
	// but in that case we can't re-queue missing items
	needed := make(map[flow.Identifier]struct{})
	req, exists := e.requests[res.Nonce]
	if exists {
		delete(e.requests, req.Nonce)
		for _, entityID := range req.EntityIDs {
			needed[entityID] = struct{}{}
		}
	}

	// ensure the response is correctly formed
	if len(res.Blobs) != len(res.EntityIDs) {
		return engine.NewInvalidInputErrorf("invalid response with %d blobs, %d IDs", len(res.Blobs), len(res.EntityIDs))
	}

	// process each entity in the response
	// NOTE: this requires engines to be somewhat idempotent, which is a good
	// thing, as it increases the robustness of their code
	for i := 0; i < len(res.Blobs); i++ {
		blob := res.Blobs[i]
		entityID := res.EntityIDs[i]

		// the entity might already have been returned in another response
		item, exists := e.items[entityID]
		if !exists {
			lg.Debug().Hex("entity_id", logging.ID(entityID)).Msg("entity not in items, skipping")
			continue
		}

		// create the entity with the underlying concrete type and decode the blob
		entity := e.create()
		err := msgpack.Unmarshal(blob, &entity)
		if err != nil {
			return fmt.Errorf("could not decode entity: %w", err)
		}

		if item.checkIntegrity {
			actualEntityID := entity.ID()
			// validate that we got the correct entity, exactly what we were expecting
			if entityID != actualEntityID {
				lg.Error().
					Hex("stated_entity_id", logging.ID(entityID)).
					Hex("provided_entity", logging.ID(actualEntityID)).
					Bool(logging.KeySuspicious, true).
					Msg("provided entity does not match stated ID")
				continue
			}
		}

		// remove from needed items and pending items
		delete(needed, entityID)
		delete(e.items, entityID)

		// process the entity
		go e.handle(originID, entity)
	}

	// requeue requested entities that have not been delivered in the response
	// NOTE: this logic allows a provider to send back an empty response to
	// indicate that none of the requested entities are available, thus allowing
	// the requester engine to immediately request them from another provider
	for entityID := range needed {

		// the item may no longer exist if the entity was already received
		// in response to another request
		item, exists := e.items[entityID]
		if !exists {
			continue
		}

		// we set the timestamp to zero, so that the item will be included
		// in the next batch regardless of the retry interval
		item.LastRequested = time.Time{}
	}

	return nil
}
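
// Note on the requeue above (illustrative caller code, not part of this file):
// zeroing LastRequested makes an undelivered item pass the cutoff check on the
// very next dispatch. A caller that does not want to wait for the next tick of
// BatchInterval can trigger that dispatch explicitly:
//
//	requester.EntityByID(missingID, filter.Any) // no-op while the item is still pending
//	requester.Force()                           // dispatch all eligible items immediately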