github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/collection/synchronization/engine.go

package synchronization

import (
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/go-multierror"
	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	commonsync "github.com/onflow/flow-go/engine/common/synchronization"
	"github.com/onflow/flow-go/model/chainsync"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	synccore "github.com/onflow/flow-go/module/chainsync"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/rand"
)

// defaultSyncResponseQueueCapacity maximum capacity of sync responses queue
const defaultSyncResponseQueueCapacity = 500

// defaultBlockResponseQueueCapacity maximum capacity of block responses queue
const defaultBlockResponseQueueCapacity = 500

// Engine is the synchronization engine, responsible for synchronizing chain state.
type Engine struct {
	unit         *engine.Unit
	lm           *lifecycle.LifecycleManager
	log          zerolog.Logger
	metrics      module.EngineMetrics
	me           module.Local
	participants flow.IdentitySkeletonList
	con          network.Conduit
	comp         collection.Compliance // compliance layer engine

	pollInterval time.Duration
	scanInterval time.Duration
	core         module.SyncCore
	state        cluster.State

	requestHandler *RequestHandlerEngine // component responsible for handling requests

	pendingSyncResponses   engine.MessageStore    // message store for *message.SyncResponse
	pendingBlockResponses  engine.MessageStore    // message store for *message.BlockResponse
	responseMessageHandler *engine.MessageHandler // message handler responsible for response processing
}
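
// The engine splits cluster synchronization into two halves: outbound work
// (pollHeight and sendRequests, driven by checkLoop) together with inbound
// request handling (delegated to the RequestHandlerEngine), and inbound
// response handling, where SyncResponse and ClusterBlockResponse messages are
// buffered in the FIFO stores above and drained asynchronously by
// responseProcessingLoop.
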
// New creates a new cluster chain synchronization engine.
func New(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	participants flow.IdentitySkeletonList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp collection.Compliance,
	core module.SyncCore,
	opts ...commonsync.OptionFunc,
) (*Engine, error) {

	opt := commonsync.DefaultConfig()
	for _, f := range opts {
		f(opt)
	}

	if comp == nil {
		return nil, fmt.Errorf("must initialize synchronization engine with a compliance engine")
	}

	// initialize the synchronization engine with its dependencies
	e := &Engine{
		unit:         engine.NewUnit(),
		lm:           lifecycle.NewLifecycleManager(),
		log:          log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics:      metrics,
		me:           me,
		participants: participants.Filter(filter.Not(filter.HasNodeID[flow.IdentitySkeleton](me.NodeID()))),
		comp:         comp,
		core:         core,
		pollInterval: opt.PollInterval,
		scanInterval: opt.ScanInterval,
		state:        state,
	}

	err := e.setupResponseMessageHandler()
	if err != nil {
		return nil, fmt.Errorf("could not setup message handler: %w", err)
	}
	chainID := state.Params().ChainID()

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channels.SyncCluster(chainID), e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	e.requestHandler = NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)

	return e, nil
}

// setupResponseMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED responses.
func (e *Engine) setupResponseMessageHandler() error {
	syncResponseQueue, err := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for sync responses: %w", err)
	}

	e.pendingSyncResponses = &engine.FifoMessageStore{
		FifoQueue: syncResponseQueue,
	}

	blockResponseQueue, err := fifoqueue.NewFifoQueue(defaultBlockResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for block responses: %w", err)
	}

	e.pendingBlockResponses = &engine.FifoMessageStore{
		FifoQueue: blockResponseQueue,
	}

	// define message queueing behaviour
	e.responseMessageHandler = engine.NewMessageHandler(
		e.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
				}
				return ok
			},
			Store: e.pendingSyncResponses,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
				}
				return ok
			},
			Store: e.pendingBlockResponses,
		},
	)

	return nil
}

// Ready returns a ready channel that is closed once the engine has fully started.
func (e *Engine) Ready() <-chan struct{} {
	e.lm.OnStart(func() {
		e.unit.Launch(e.checkLoop)
		e.unit.Launch(e.responseProcessingLoop)
		// wait for request handler to startup
		<-e.requestHandler.Ready()
	})
	return e.lm.Started()
}
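
// A minimal construction-and-startup sketch (illustrative only; the variable
// names below are hypothetical, and the real wiring typically lives in the
// node bootstrap code):
//
//	sync, err := synchronization.New(log, engineMetrics, networkRegistry, me,
//		clusterParticipants, clusterState, clusterBlocks, complianceEngine, syncCore)
//	if err != nil {
//		return err
//	}
//	<-sync.Ready() // also waits for the embedded RequestHandlerEngine to start
//	...
//	<-sync.Done()
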
// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
		// signal the request handler to shutdown
		requestHandlerDone := e.requestHandler.Done()
		// wait for request sending and response processing routines to exit
		<-e.unit.Done()
		// wait for request handler shutdown to complete
		<-requestHandlerDone
	})
	return e.lm.Stopped()
}

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

// process processes events for the synchronization engine.
// Error returns:
//   - IncompatibleInputTypeError if input has unexpected type
//   - All other errors are potential symptoms of internal state corruption or bugs (fatal).
func (e *Engine) process(originID flow.Identifier, event interface{}) error {
	switch event.(type) {
	case *messages.RangeRequest, *messages.BatchRequest, *messages.SyncRequest:
		return e.requestHandler.process(originID, event)
	case *messages.SyncResponse, *messages.ClusterBlockResponse:
		return e.responseMessageHandler.Process(originID, event)
	default:
		return fmt.Errorf("received input with type %T from %x: %w", event, originID[:], engine.IncompatibleInputTypeError)
	}
}

// responseProcessingLoop is a separate goroutine that performs processing of queued responses.
func (e *Engine) responseProcessingLoop() {
	notifier := e.responseMessageHandler.GetNotifier()
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-notifier:
			e.processAvailableResponses()
		}
	}
}
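
// Responses take an asynchronous path: responseMessageHandler.Process (called
// from process above) only stores the message in the matching FIFO queue and
// signals the notifier, and this loop then drains the queues on its own
// goroutine via processAvailableResponses. Requests, by contrast, are handed
// straight to the RequestHandlerEngine via requestHandler.process.
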
// processAvailableResponses processes pending responses, driving them from the networking layer to the business logic.
func (e *Engine) processAvailableResponses() {
	for {
		select {
		case <-e.unit.Quit():
			return
		default:
		}

		msg, ok := e.pendingSyncResponses.Get()
		if ok {
			e.onSyncResponse(msg.OriginID, msg.Payload.(*messages.SyncResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
			continue
		}

		msg, ok = e.pendingBlockResponses.Get()
		if ok {
			e.onBlockResponse(msg.OriginID, msg.Payload.(*messages.ClusterBlockResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
			continue
		}

		// when there are no more messages in the queues, go back to the outer loop
		// and wait for the next incoming message to arrive.
		return
	}
}

// onSyncResponse processes a synchronization response.
func (e *Engine) onSyncResponse(originID flow.Identifier, res *messages.SyncResponse) {
	final, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}
	e.core.HandleHeight(final, res.Height)
}

// onBlockResponse processes a response containing a specifically requested block.
func (e *Engine) onBlockResponse(originID flow.Identifier, res *messages.ClusterBlockResponse) {
	// process the blocks one by one
	for _, block := range res.Blocks {
		header := block.Header
		if !e.core.HandleBlock(&header) {
			continue
		}
		synced := flow.Slashable[*messages.ClusterBlockProposal]{
			OriginID: originID,
			Message: &messages.ClusterBlockProposal{
				Block: block,
			},
		}
		// forward the block to the compliance engine for validation and processing
		e.comp.OnSyncedClusterBlock(synced)
	}
}

// checkLoop will regularly scan for items that need requesting.
func (e *Engine) checkLoop() {
	pollChan := make(<-chan time.Time)
	if e.pollInterval > 0 {
		poll := time.NewTicker(e.pollInterval)
		pollChan = poll.C
		defer poll.Stop()
	}
	scan := time.NewTicker(e.scanInterval)

CheckLoop:
	for {
		// give the quit channel priority to be selected
		select {
		case <-e.unit.Quit():
			break CheckLoop
		default:
		}

		select {
		case <-e.unit.Quit():
			break CheckLoop
		case <-pollChan:
			e.pollHeight()
		case <-scan.C:
			final, err := e.state.Final().Head()
			if err != nil {
				e.log.Fatal().Err(err).Msg("could not get last finalized header")
				continue
			}
			ranges, batches := e.core.ScanPending(final)
			e.sendRequests(ranges, batches)
		}
	}

	// some minor cleanup
	scan.Stop()
}
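
// Note that checkLoop disables active polling when pollInterval is zero or
// negative: pollChan then remains the freshly made receive-only channel that
// nothing ever sends on, so only the scan ticker can fire. Polling
// (pollHeight) asks peers for their latest finalized height, while scanning
// (ScanPending followed by sendRequests) re-requests the ranges and batches
// that the sync core still considers outstanding.
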
// pollHeight sends a synchronization request to a random subset
// (synccore.DefaultPollNodes) of the other cluster participants.
func (e *Engine) pollHeight() {
	head, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}

	nonce, err := rand.Uint64()
	if err != nil {
		// TODO: this error should be returned by pollHeight()
		// it is logged for now since the only error possible is related to a failure
		// of the system entropy generation. Such error is going to cause failures in other
		// components where it's handled properly and will lead to crashing the module.
		e.log.Error().Err(err).Msg("nonce generation failed during pollHeight")
		return
	}

	// send the request for synchronization
	req := &messages.SyncRequest{
		Nonce:  nonce,
		Height: head.Height,
	}
	err = e.con.Multicast(req, synccore.DefaultPollNodes, e.participants.NodeIDs()...)
	if err != nil && !errors.Is(err, network.EmptyTargetList) {
		e.log.Warn().Err(err).Msg("sending sync request to poll heights failed")
		return
	}
	e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
}

// sendRequests sends a request for each range and batch to a random subset of
// the other cluster participants.
func (e *Engine) sendRequests(ranges []chainsync.Range, batches []chainsync.Batch) {
	var errs *multierror.Error

	for _, ran := range ranges {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// it is logged for now since the only error possible is related to a failure
			// of the system entropy generation. Such error is going to cause failures in other
			// components where it's handled properly and will lead to crashing the module.
			e.log.Error().Err(err).Msg("nonce generation failed during range request")
			return
		}
		req := &messages.RangeRequest{
			Nonce:      nonce,
			FromHeight: ran.From,
			ToHeight:   ran.To,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit range request: %w", err))
			continue
		}
		e.log.Debug().
			Uint64("range_from", req.FromHeight).
			Uint64("range_to", req.ToHeight).
			Uint64("range_nonce", req.Nonce).
			Msg("range requested")
		e.core.RangeRequested(ran)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
	}

	for _, batch := range batches {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// it is logged for now since the only error possible is related to a failure
			// of the system entropy generation. Such error is going to cause failures in other
			// components where it's handled properly and will lead to crashing the module.
			e.log.Error().Err(err).Msg("nonce generation failed during batch request")
			return
		}
		req := &messages.BatchRequest{
			Nonce:    nonce,
			BlockIDs: batch.BlockIDs,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit batch request: %w", err))
			continue
		}
		e.core.BatchRequested(batch)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
	}

	if err := errs.ErrorOrNil(); err != nil {
		e.log.Warn().Err(err).Msg("sending range and batch requests failed")
	}
}
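
// Both request paths above fan out through Conduit.Multicast, which delivers
// the message to a random subset (of at most the given size) of the supplied
// target IDs; an EmptyTargetList error is treated as benign in pollHeight
// because it simply means no eligible peers were available at the time.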