// Package requester implements the verification node's chunk data pack
// requester engine: it receives chunk data pack requests, dispatches them to
// execution nodes, and passes received chunk data packs to a registered handler.
package requester

import (
	"context"
	"fmt"
	"time"

	"github.com/rs/zerolog"
	"go.opentelemetry.io/otel/attribute"
	"golang.org/x/exp/rand"

	"github.com/koko1123/flow-go-1/engine"
	"github.com/koko1123/flow-go-1/engine/verification/fetcher"
	"github.com/koko1123/flow-go-1/model/flow"
	"github.com/koko1123/flow-go-1/model/messages"
	"github.com/koko1123/flow-go-1/model/verification"
	"github.com/koko1123/flow-go-1/module"
	"github.com/koko1123/flow-go-1/module/mempool"
	"github.com/koko1123/flow-go-1/module/trace"
	"github.com/koko1123/flow-go-1/network"
	"github.com/koko1123/flow-go-1/network/channels"
	"github.com/koko1123/flow-go-1/state/protocol"
	"github.com/koko1123/flow-go-1/utils/logging"
)

const (
	// DefaultRequestInterval is the time interval that requester engine tries requesting chunk data packs.
	DefaultRequestInterval = 1000 * time.Millisecond

	// DefaultBackoffMultiplier is the base of exponent in exponential backoff multiplier for backing off requests for chunk data packs.
	DefaultBackoffMultiplier = float64(2)

	// DefaultBackoffMinInterval is the minimum time interval a chunk data pack request waits before dispatching.
	DefaultBackoffMinInterval = 1000 * time.Millisecond

	// DefaultBackoffMaxInterval is the maximum time interval a chunk data pack request waits before dispatching.
	DefaultBackoffMaxInterval = 1 * time.Minute

	// DefaultRequestTargets is the maximum number of execution nodes a chunk data pack request is dispatched to.
	DefaultRequestTargets = 2
)

// Engine implements a ChunkDataPackRequester that is responsible for receiving chunk data pack requests,
// dispatching them to the execution nodes, receiving the requested chunk data packs from execution nodes,
// and passing them to the registered handler.
46 type Engine struct { 47 // common 48 log zerolog.Logger 49 unit *engine.Unit 50 state protocol.State // used to check the last sealed height. 51 con network.Conduit // used to send the chunk data request, and receive the response. 52 53 // monitoring 54 tracer module.Tracer 55 metrics module.VerificationMetrics 56 57 // output interfaces 58 handler fetcher.ChunkDataPackHandler // contains callbacks for handling received chunk data packs. 59 60 // internal logic 61 retryInterval time.Duration // determines time in milliseconds for retrying chunk data requests. 62 requestTargets uint64 // maximum number of execution nodes being asked for a chunk data pack. 63 pendingRequests mempool.ChunkRequests // used to track requested chunks. 64 reqQualifierFunc RequestQualifierFunc // used to decide whether to dispatch a request at a certain cycle. 65 reqUpdaterFunc mempool.ChunkRequestHistoryUpdaterFunc // used to atomically update chunk request info on mempool. 66 } 67 68 func New(log zerolog.Logger, 69 state protocol.State, 70 net network.Network, 71 tracer module.Tracer, 72 metrics module.VerificationMetrics, 73 pendingRequests mempool.ChunkRequests, 74 retryInterval time.Duration, 75 reqQualifierFunc RequestQualifierFunc, 76 reqUpdaterFunc mempool.ChunkRequestHistoryUpdaterFunc, 77 requestTargets uint64) (*Engine, error) { 78 79 e := &Engine{ 80 log: log.With().Str("engine", "requester").Logger(), 81 unit: engine.NewUnit(), 82 state: state, 83 tracer: tracer, 84 metrics: metrics, 85 retryInterval: retryInterval, 86 requestTargets: requestTargets, 87 pendingRequests: pendingRequests, 88 reqUpdaterFunc: reqUpdaterFunc, 89 reqQualifierFunc: reqQualifierFunc, 90 } 91 92 con, err := net.Register(channels.RequestChunks, e) 93 if err != nil { 94 return nil, fmt.Errorf("could not register chunk data pack provider engine: %w", err) 95 } 96 e.con = con 97 98 return e, nil 99 } 100 101 func (e *Engine) WithChunkDataPackHandler(handler fetcher.ChunkDataPackHandler) { 102 e.handler 
= handler 103 } 104 105 // SubmitLocal submits an event originating on the local node. 106 func (e *Engine) SubmitLocal(event interface{}) { 107 e.log.Fatal().Msg("engine is not supposed to be invoked on SubmitLocal") 108 } 109 110 // Submit submits the given event from the node with the given origin ID 111 // for processing in a non-blocking manner. It returns instantly and logs 112 // a potential processing error internally when done. 113 func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) { 114 e.unit.Launch(func() { 115 err := e.Process(channel, originID, event) 116 if err != nil { 117 engine.LogError(e.log, err) 118 } 119 }) 120 } 121 122 // ProcessLocal processes an event originating on the local node. 123 func (e *Engine) ProcessLocal(event interface{}) error { 124 return fmt.Errorf("should not invoke ProcessLocal of Match engine, use Process instead") 125 } 126 127 // Process processes the given event from the node with the given origin ID in 128 // a blocking manner. It returns the potential processing error when done. 129 func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error { 130 return e.unit.Do(func() error { 131 return e.process(originID, event) 132 }) 133 } 134 135 // Ready initializes the engine and returns a channel that is closed when the initialization is done. 136 func (e *Engine) Ready() <-chan struct{} { 137 delay := time.Duration(0) 138 // run a periodic check to retry requesting chunk data packs. 139 // if onTimer takes longer than retryInterval, the next call will be blocked until the previous 140 // call has finished. 141 // That being said, there won't be two onTimer running in parallel. See test cases for LaunchPeriodically. 
142 e.unit.LaunchPeriodically(e.onTimer, e.retryInterval, delay) 143 return e.unit.Ready() 144 } 145 146 // Done terminates the engine and returns a channel that is closed when the termination is done 147 func (e *Engine) Done() <-chan struct{} { 148 return e.unit.Done() 149 } 150 151 // process receives and submits an event to the engine for processing. 152 // It returns an error so the engine will not propagate an event unless 153 // it is successfully processed by the engine. 154 // The origin ID indicates the node which originally submitted the event to 155 // the peer-to-peer network. 156 func (e *Engine) process(originID flow.Identifier, event interface{}) error { 157 switch resource := event.(type) { 158 case *messages.ChunkDataResponse: 159 e.handleChunkDataPackWithTracing(originID, &resource.ChunkDataPack) 160 default: 161 return fmt.Errorf("invalid event type (%T)", event) 162 } 163 164 return nil 165 } 166 167 // handleChunkDataPackWithTracing encapsulates the logic of handling a chunk data pack with tracing enabled. 168 func (e *Engine) handleChunkDataPackWithTracing(originID flow.Identifier, chunkDataPack *flow.ChunkDataPack) { 169 // TODO: change this to block level as well 170 if chunkDataPack.Collection != nil { 171 span, _ := e.tracer.StartCollectionSpan(e.unit.Ctx(), chunkDataPack.Collection.ID(), trace.VERRequesterHandleChunkDataResponse) 172 span.SetAttributes(attribute.String("chunk_id", chunkDataPack.ChunkID.String())) 173 defer span.End() 174 } 175 e.handleChunkDataPack(originID, chunkDataPack) 176 } 177 178 // handleChunkDataPack sends the received chunk data pack to the registered handler, and cleans up its request status. 179 func (e *Engine) handleChunkDataPack(originID flow.Identifier, chunkDataPack *flow.ChunkDataPack) { 180 chunkID := chunkDataPack.ChunkID 181 lg := e.log.With(). 182 Hex("chunk_id", logging.ID(chunkID)). 
183 Logger() 184 185 if chunkDataPack.Collection != nil { 186 collectionID := chunkDataPack.Collection.ID() 187 lg = lg.With().Hex("collection_id", logging.ID(collectionID)).Logger() 188 } 189 lg.Debug().Msg("chunk data pack received") 190 191 e.metrics.OnChunkDataPackResponseReceivedFromNetworkByRequester() 192 193 // makes sure we still need this chunk, and we will not process duplicate chunk data packs. 194 locators, removed := e.pendingRequests.PopAll(chunkID) 195 if !removed { 196 lg.Debug().Msg("chunk request status not found in mempool to be removed, dropping chunk") 197 return 198 } 199 200 for _, locator := range locators { 201 response := verification.ChunkDataPackResponse{ 202 Locator: *locator, 203 Cdp: chunkDataPack, 204 } 205 206 e.handler.HandleChunkDataPack(originID, &response) 207 e.metrics.OnChunkDataPackSentToFetcher() 208 lg.Info(). 209 Hex("result_id", logging.ID(locator.ResultID)). 210 Uint64("chunk_index", locator.Index). 211 Msg("successfully sent the chunk data pack to the handler") 212 } 213 } 214 215 // Request receives a chunk data pack request and adds it into the pending requests mempool. 216 func (e *Engine) Request(request *verification.ChunkDataPackRequest) { 217 added := e.pendingRequests.Add(request) 218 219 e.metrics.OnChunkDataPackRequestReceivedByRequester() 220 221 e.log.Info(). 222 Hex("chunk_id", logging.ID(request.ChunkID)). 223 Uint64("block_height", request.Height). 224 Int("agree_executors", len(request.Agrees)). 225 Int("disagree_executors", len(request.Disagrees)). 226 Bool("added_to_pending_requests", added). 227 Msg("chunk data pack request arrived") 228 } 229 230 // onTimer should run periodically, it goes through all pending requests, and requests their chunk data pack. 231 // It also retries the chunk data request if the data hasn't been received for a while. 
232 func (e *Engine) onTimer() { 233 pendingReqs := e.pendingRequests.All() 234 235 // keeps maximum attempts made on chunk data packs of the next unsealed height for telemetry 236 maxAttempts := uint64(0) 237 238 e.log.Debug(). 239 Int("total", len(pendingReqs)). 240 Msg("start processing all pending chunk data requests") 241 242 lastSealed, err := e.state.Sealed().Head() 243 if err != nil { 244 e.log.Fatal(). 245 Err(err). 246 Msg("could not determine whether block has been sealed") 247 } 248 249 for _, request := range pendingReqs { 250 attempts := e.handleChunkDataPackRequestWithTracing(request, lastSealed.Height) 251 if attempts > maxAttempts && request.Height == lastSealed.Height+uint64(1) { 252 maxAttempts = attempts 253 } 254 } 255 256 e.metrics.SetMaxChunkDataPackAttemptsForNextUnsealedHeightAtRequester(maxAttempts) 257 } 258 259 // handleChunkDataPackRequestWithTracing encapsulates the logic of dispatching chunk data request in network with tracing enabled. 260 func (e *Engine) handleChunkDataPackRequestWithTracing(request *verification.ChunkDataPackRequestInfo, lastSealedHeight uint64) uint64 { 261 // TODO (Ramtin) - enable tracing later 262 ctx := e.unit.Ctx() 263 return e.handleChunkDataPackRequest(ctx, request, lastSealedHeight) 264 } 265 266 // handleChunkDataPackRequest encapsulates the logic of dispatching the chunk data pack request to the network. 267 // The return value determines number of times this request has been dispatched. 268 func (e *Engine) handleChunkDataPackRequest(ctx context.Context, request *verification.ChunkDataPackRequestInfo, lastSealedHeight uint64) uint64 { 269 lg := e.log.With(). 270 Hex("chunk_id", logging.ID(request.ChunkID)). 271 Uint64("block_height", request.Height). 
272 Logger() 273 274 // if block has been sealed, then we can finish 275 if request.Height <= lastSealedHeight { 276 locators, removed := e.pendingRequests.PopAll(request.ChunkID) 277 278 if !removed { 279 lg.Debug().Msg("chunk request status not found in mempool to be removed, drops requesting chunk of a sealed block") 280 return 0 281 } 282 283 for _, locator := range locators { 284 e.handler.NotifyChunkDataPackSealed(locator.Index, locator.ResultID) 285 lg.Info(). 286 Hex("result_id", logging.ID(locator.ResultID)). 287 Uint64("chunk_index", locator.Index). 288 Msg("drops requesting chunk of a sealed block") 289 } 290 291 return 0 292 } 293 294 qualified := e.canDispatchRequest(request.ChunkID) 295 if !qualified { 296 lg.Debug().Msg("chunk data pack request is not qualified for dispatching at this round") 297 return 0 298 } 299 300 err := e.requestChunkDataPackWithTracing(ctx, request) 301 if err != nil { 302 lg.Error().Err(err).Msg("could not request chunk data pack") 303 return 0 304 } 305 306 attempts, lastAttempt, retryAfter, updated := e.onRequestDispatched(request.ChunkID) 307 lg.Info(). 308 Bool("pending_request_updated", updated). 309 Uint64("attempts_made", attempts). 310 Time("last_attempt", lastAttempt). 311 Dur("retry_after", retryAfter). 312 Msg("chunk data pack requested") 313 314 return attempts 315 } 316 317 // requestChunkDataPack dispatches request for the chunk data pack to the execution nodes. 318 func (e *Engine) requestChunkDataPackWithTracing(ctx context.Context, request *verification.ChunkDataPackRequestInfo) error { 319 var err error 320 e.tracer.WithSpanFromContext(ctx, trace.VERRequesterDispatchChunkDataRequest, func() { 321 err = e.requestChunkDataPack(request) 322 }) 323 return err 324 } 325 326 // requestChunkDataPack dispatches request for the chunk data pack to the execution nodes. 
327 func (e *Engine) requestChunkDataPack(request *verification.ChunkDataPackRequestInfo) error { 328 req := &messages.ChunkDataRequest{ 329 ChunkID: request.ChunkID, 330 Nonce: rand.Uint64(), // prevent the request from being deduplicated by the receiver 331 } 332 333 // publishes the chunk data request to the network 334 targetIDs := request.SampleTargets(int(e.requestTargets)) 335 err := e.con.Publish(req, targetIDs...) 336 if err != nil { 337 return fmt.Errorf("could not publish chunk data pack request for chunk (id=%s): %w", request.ChunkID, err) 338 } 339 340 return nil 341 } 342 343 // canDispatchRequest returns whether chunk data request for this chunk ID can be dispatched. 344 func (e *Engine) canDispatchRequest(chunkID flow.Identifier) bool { 345 attempts, lastAttempt, retryAfter, exists := e.pendingRequests.RequestHistory(chunkID) 346 if !exists { 347 return false 348 } 349 350 return e.reqQualifierFunc(attempts, lastAttempt, retryAfter) 351 } 352 353 // onRequestDispatched encapsulates the logic of updating the chunk data request post a successful dispatch. 354 func (e *Engine) onRequestDispatched(chunkID flow.Identifier) (uint64, time.Time, time.Duration, bool) { 355 e.metrics.OnChunkDataPackRequestDispatchedInNetworkByRequester() 356 return e.pendingRequests.UpdateRequestHistory(chunkID, e.reqUpdaterFunc) 357 }