github.com/onflow/flow-go@v0.33.17/engine/collection/synchronization/engine.go

// (c) 2019 Dapper Labs - ALL RIGHTS RESERVED

package synchronization

import (
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/go-multierror"
	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	commonsync "github.com/onflow/flow-go/engine/common/synchronization"
	"github.com/onflow/flow-go/model/chainsync"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	synccore "github.com/onflow/flow-go/module/chainsync"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/rand"
)

// defaultSyncResponseQueueCapacity is the maximum capacity of the sync responses queue.
const defaultSyncResponseQueueCapacity = 500

// defaultBlockResponseQueueCapacity is the maximum capacity of the block responses queue.
const defaultBlockResponseQueueCapacity = 500

// Engine is the synchronization engine, responsible for synchronizing chain state.
type Engine struct {
	unit         *engine.Unit
	lm           *lifecycle.LifecycleManager
	log          zerolog.Logger
	metrics      module.EngineMetrics
	me           module.Local
	participants flow.IdentityList
	con          network.Conduit
	comp         collection.Compliance // compliance layer engine

	pollInterval time.Duration
	scanInterval time.Duration
	core         module.SyncCore
	state        cluster.State

	requestHandler *RequestHandlerEngine // component responsible for handling requests

	pendingSyncResponses   engine.MessageStore    // message store for *messages.SyncResponse
	pendingBlockResponses  engine.MessageStore    // message store for *messages.ClusterBlockResponse
	responseMessageHandler *engine.MessageHandler // message handler responsible for response processing
}
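// Message flow, as implemented below (a summary of this file, not extra API):
//
//	SyncRequest / RangeRequest / BatchRequest ──▶ requestHandler (RequestHandlerEngine)
//	SyncResponse                              ──▶ pendingSyncResponses  ──┐
//	ClusterBlockResponse                      ──▶ pendingBlockResponses ──┴──▶ responseProcessingLoop
//
// Requests are answered directly by the request handler; responses are queued
// in bounded FIFO stores (capacity 500 each) and drained by a dedicated
// goroutine, so a burst of responses cannot block the networking layer.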
// New creates a new cluster chain synchronization engine.
func New(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	participants flow.IdentityList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp collection.Compliance,
	core module.SyncCore,
	opts ...commonsync.OptionFunc,
) (*Engine, error) {

	opt := commonsync.DefaultConfig()
	for _, f := range opts {
		f(opt)
	}

	if comp == nil {
		return nil, fmt.Errorf("must initialize synchronization engine with compliance engine")
	}

	// initialize the synchronization engine with its dependencies
	e := &Engine{
		unit:         engine.NewUnit(),
		lm:           lifecycle.NewLifecycleManager(),
		log:          log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics:      metrics,
		me:           me,
		participants: participants.Filter(filter.Not(filter.HasNodeID(me.NodeID()))),
		comp:         comp,
		core:         core,
		pollInterval: opt.PollInterval,
		scanInterval: opt.ScanInterval,
		state:        state,
	}

	err := e.setupResponseMessageHandler()
	if err != nil {
		return nil, fmt.Errorf("could not setup message handler: %w", err)
	}

	chainID, err := state.Params().ChainID()
	if err != nil {
		return nil, fmt.Errorf("could not get chain ID: %w", err)
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channels.SyncCluster(chainID), e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	e.requestHandler = NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)

	return e, nil
}

// setupResponseMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED responses.
func (e *Engine) setupResponseMessageHandler() error {
	syncResponseQueue, err := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for sync responses: %w", err)
	}

	e.pendingSyncResponses = &engine.FifoMessageStore{
		FifoQueue: syncResponseQueue,
	}

	blockResponseQueue, err := fifoqueue.NewFifoQueue(defaultBlockResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for block responses: %w", err)
	}

	e.pendingBlockResponses = &engine.FifoMessageStore{
		FifoQueue: blockResponseQueue,
	}

	// define message queueing behaviour
	e.responseMessageHandler = engine.NewMessageHandler(
		e.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
				}
				return ok
			},
			Store: e.pendingSyncResponses,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
				}
				return ok
			},
			Store: e.pendingBlockResponses,
		},
	)

	return nil
}
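// A minimal construction sketch. The identifiers netwk, myself, clusterState,
// clusterBlocks, compliance and syncCore are hypothetical stand-ins for
// dependencies wired up elsewhere in the node, and WithPollInterval is assumed
// to be one of the commonsync.OptionFunc options:
//
//	eng, err := New(
//	    logger,                     // zerolog.Logger
//	    metrics.NewNoopCollector(), // module.EngineMetrics (no-op for the sketch)
//	    netwk,                      // network.EngineRegistry
//	    myself,                     // module.Local
//	    participants,               // flow.IdentityList of cluster members
//	    clusterState,               // cluster.State
//	    clusterBlocks,              // storage.ClusterBlocks
//	    compliance,                 // collection.Compliance
//	    syncCore,                   // module.SyncCore
//	    commonsync.WithPollInterval(2*time.Second),
//	)
//	if err != nil {
//	    logger.Fatal().Err(err).Msg("could not create cluster sync engine")
//	}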
// Ready returns a ready channel that is closed once the engine has fully started.
func (e *Engine) Ready() <-chan struct{} {
	e.lm.OnStart(func() {
		e.unit.Launch(e.checkLoop)
		e.unit.Launch(e.responseProcessingLoop)
		// wait for the request handler to start up
		<-e.requestHandler.Ready()
	})
	return e.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
		// signal the request handler to shut down
		requestHandlerDone := e.requestHandler.Done()
		// wait for request sending and response processing routines to exit
		<-e.unit.Done()
		// wait for request handler shutdown to complete
		<-requestHandlerDone
	})
	return e.lm.Stopped()
}

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

// process processes events for the synchronization engine.
// Error returns:
//   - IncompatibleInputTypeError if the input has an unexpected type
//   - all other errors are potential symptoms of internal state corruption or bugs (fatal)
func (e *Engine) process(originID flow.Identifier, event interface{}) error {
	switch event.(type) {
	case *messages.RangeRequest, *messages.BatchRequest, *messages.SyncRequest:
		return e.requestHandler.process(originID, event)
	case *messages.SyncResponse, *messages.ClusterBlockResponse:
		return e.responseMessageHandler.Process(originID, event)
	default:
		return fmt.Errorf("received input with type %T from %x: %w", event, originID[:], engine.IncompatibleInputTypeError)
	}
}

// responseProcessingLoop is a separate goroutine that processes queued responses.
func (e *Engine) responseProcessingLoop() {
	notifier := e.responseMessageHandler.GetNotifier()
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-notifier:
			e.processAvailableResponses()
		}
	}
}
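// Lifecycle usage sketch (assuming an eng built as in the construction example
// above): Ready starts the two internal loops and blocks until the request
// handler is up; Done stops the loops first, then waits for the request handler.
//
//	<-eng.Ready() // engine is now polling, scanning and serving requests
//	// ... node runs ...
//	<-eng.Done()  // all engine goroutines have exited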
// processAvailableResponses processes pending events, driving them from the
// networking layer to the business logic.
func (e *Engine) processAvailableResponses() {
	for {
		select {
		case <-e.unit.Quit():
			return
		default:
		}

		msg, ok := e.pendingSyncResponses.Get()
		if ok {
			e.onSyncResponse(msg.OriginID, msg.Payload.(*messages.SyncResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
			continue
		}

		msg, ok = e.pendingBlockResponses.Get()
		if ok {
			e.onBlockResponse(msg.OriginID, msg.Payload.(*messages.ClusterBlockResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
			continue
		}

		// when there are no more messages in the queues, return to the outer
		// loop to wait for the next incoming message to arrive.
		return
	}
}

// onSyncResponse processes a synchronization response.
func (e *Engine) onSyncResponse(originID flow.Identifier, res *messages.SyncResponse) {
	final, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}
	e.core.HandleHeight(final, res.Height)
}

// onBlockResponse processes a response containing a specifically requested block.
func (e *Engine) onBlockResponse(originID flow.Identifier, res *messages.ClusterBlockResponse) {
	// process the blocks one by one
	for _, block := range res.Blocks {
		header := block.Header
		if !e.core.HandleBlock(&header) {
			continue
		}
		synced := flow.Slashable[*messages.ClusterBlockProposal]{
			OriginID: originID,
			Message: &messages.ClusterBlockProposal{
				Block: block,
			},
		}
		// forward the block to the compliance engine for validation and processing
		e.comp.OnSyncedClusterBlock(synced)
	}
}

// checkLoop will regularly scan for items that need requesting.
func (e *Engine) checkLoop() {
	pollChan := make(<-chan time.Time)
	if e.pollInterval > 0 {
		poll := time.NewTicker(e.pollInterval)
		pollChan = poll.C
		defer poll.Stop()
	}
	scan := time.NewTicker(e.scanInterval)
	defer scan.Stop()

CheckLoop:
	for {
		// give the quit channel priority to be selected
		select {
		case <-e.unit.Quit():
			break CheckLoop
		default:
		}

		select {
		case <-e.unit.Quit():
			break CheckLoop
		case <-pollChan:
			e.pollHeight()
		case <-scan.C:
			final, err := e.state.Final().Head()
			if err != nil {
				e.log.Error().Err(err).Msg("could not get last finalized header")
				continue
			}
			ranges, batches := e.core.ScanPending(final)
			e.sendRequests(ranges, batches)
		}
	}
}
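// The two-stage select in checkLoop is a quit-priority pattern: a plain select
// picks pseudo-randomly among ready cases, so a busy ticker could starve
// shutdown. A non-blocking check of the quit channel before the blocking
// select avoids that. In isolation (a generic sketch, not flow-go API):
//
//	select {
//	case <-quit:
//	    return
//	default: // quit not signalled, fall through
//	}
//	select {
//	case <-quit:
//	    return
//	case <-tick.C:
//	    doWork()
//	}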
// pollHeight will send a synchronization request to three random nodes.
func (e *Engine) pollHeight() {
	head, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}

	nonce, err := rand.Uint64()
	if err != nil {
		// TODO: this error should be returned by pollHeight()
		// it is logged for now since the only possible error relates to a failure
		// of the system entropy generation. Such an error will cause failures in
		// other components where it is handled properly and will lead to crashing
		// the module.
		e.log.Error().Err(err).Msg("nonce generation failed during pollHeight")
		return
	}

	// send the request for synchronization
	req := &messages.SyncRequest{
		Nonce:  nonce,
		Height: head.Height,
	}
	err = e.con.Multicast(req, synccore.DefaultPollNodes, e.participants.NodeIDs()...)
	if err != nil && !errors.Is(err, network.EmptyTargetList) {
		e.log.Warn().Err(err).Msg("sending sync request to poll heights failed")
		return
	}
	e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
}

// sendRequests sends a request for each range and batch using consensus participants from the last finalized snapshot.
func (e *Engine) sendRequests(ranges []chainsync.Range, batches []chainsync.Batch) {
	var errs *multierror.Error

	for _, ran := range ranges {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// it is logged for now since the only possible error relates to a failure
			// of the system entropy generation. Such an error will cause failures in
			// other components where it is handled properly and will lead to crashing
			// the module.
			e.log.Error().Err(err).Msg("nonce generation failed during range request")
			return
		}
		req := &messages.RangeRequest{
			Nonce:      nonce,
			FromHeight: ran.From,
			ToHeight:   ran.To,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit range request: %w", err))
			continue
		}
		e.log.Debug().
			Uint64("range_from", req.FromHeight).
			Uint64("range_to", req.ToHeight).
			Uint64("range_nonce", req.Nonce).
			Msg("range requested")
		e.core.RangeRequested(ran)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
	}

	for _, batch := range batches {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// (same reasoning as for the range requests above)
			e.log.Error().Err(err).Msg("nonce generation failed during batch request")
			return
		}
		req := &messages.BatchRequest{
			Nonce:    nonce,
			BlockIDs: batch.BlockIDs,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit batch request: %w", err))
			continue
		}
		e.core.BatchRequested(batch)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
	}

	if err := errs.ErrorOrNil(); err != nil {
		e.log.Warn().Err(err).Msg("sending range and batch requests failed")
	}
}
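// sendRequests aggregates per-request Multicast failures with go-multierror
// instead of aborting on the first one, so a single unreachable peer does not
// cancel the remaining range and batch requests. The pattern, in isolation
// (a generic sketch; send and jobs are hypothetical):
//
//	var errs *multierror.Error
//	for _, job := range jobs {
//	    if err := send(job); err != nil {
//	        errs = multierror.Append(errs, fmt.Errorf("job failed: %w", err))
//	        continue // keep going; collect all failures
//	    }
//	}
//	if err := errs.ErrorOrNil(); err != nil {
//	    log.Warn().Err(err).Msg("some requests failed")
//	}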