github.com/koko1123/flow-go-1@v0.29.6/engine/collection/synchronization/engine.go

// (c) 2019 Dapper Labs - ALL RIGHTS RESERVED

package synchronization

import (
	"fmt"
	"math/rand"
	"time"

	"github.com/hashicorp/go-multierror"
	"github.com/rs/zerolog"

	"github.com/koko1123/flow-go-1/engine"
	"github.com/koko1123/flow-go-1/engine/common/fifoqueue"
	commonsync "github.com/koko1123/flow-go-1/engine/common/synchronization"
	"github.com/koko1123/flow-go-1/model/chainsync"
	"github.com/koko1123/flow-go-1/model/events"
	"github.com/koko1123/flow-go-1/model/flow"
	"github.com/koko1123/flow-go-1/model/flow/filter"
	"github.com/koko1123/flow-go-1/model/messages"
	"github.com/koko1123/flow-go-1/module"
	synccore "github.com/koko1123/flow-go-1/module/chainsync"
	"github.com/koko1123/flow-go-1/module/lifecycle"
	"github.com/koko1123/flow-go-1/module/metrics"
	"github.com/koko1123/flow-go-1/network"
	"github.com/koko1123/flow-go-1/network/channels"
	"github.com/koko1123/flow-go-1/state/cluster"
	"github.com/koko1123/flow-go-1/storage"
)

// defaultSyncResponseQueueCapacity is the maximum capacity of the sync responses queue.
const defaultSyncResponseQueueCapacity = 500

// defaultBlockResponseQueueCapacity is the maximum capacity of the block responses queue.
const defaultBlockResponseQueueCapacity = 500

// Engine is the synchronization engine, responsible for synchronizing chain state.
type Engine struct {
	unit         *engine.Unit
	lm           *lifecycle.LifecycleManager
	log          zerolog.Logger
	metrics      module.EngineMetrics
	me           module.Local
	participants flow.IdentityList
	con          network.Conduit
	comp         network.Engine // compliance layer engine

	pollInterval time.Duration
	scanInterval time.Duration
	core         module.SyncCore
	state        cluster.State

	requestHandler *RequestHandlerEngine // component responsible for handling requests

	pendingSyncResponses   engine.MessageStore    // message store for *messages.SyncResponse
	pendingBlockResponses  engine.MessageStore    // message store for *messages.ClusterBlockResponse
	responseMessageHandler *engine.MessageHandler // message handler responsible for response processing
}
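// The response path through this engine looks roughly as follows (a sketch
// for orientation, not code from this package):
//
//	network -> Process() -> responseMessageHandler
//	                          |-> pendingSyncResponses  -> onSyncResponse()  -> core.HandleHeight()
//	                          |-> pendingBlockResponses -> onBlockResponse() -> comp.SubmitLocal()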
// New creates a new cluster chain synchronization engine.
func New(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.Network,
	me module.Local,
	participants flow.IdentityList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp network.Engine,
	core module.SyncCore,
	opts ...commonsync.OptionFunc,
) (*Engine, error) {

	opt := commonsync.DefaultConfig()
	for _, f := range opts {
		f(opt)
	}

	if comp == nil {
		return nil, fmt.Errorf("must initialize synchronization engine with comp engine")
	}

	// initialize the synchronization engine with its dependencies
	e := &Engine{
		unit:         engine.NewUnit(),
		lm:           lifecycle.NewLifecycleManager(),
		log:          log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics:      metrics,
		me:           me,
		participants: participants.Filter(filter.Not(filter.HasNodeID(me.NodeID()))),
		comp:         comp,
		core:         core,
		pollInterval: opt.PollInterval,
		scanInterval: opt.ScanInterval,
		state:        state,
	}

	err := e.setupResponseMessageHandler()
	if err != nil {
		return nil, fmt.Errorf("could not setup message handler: %w", err)
	}

	chainID, err := state.Params().ChainID()
	if err != nil {
		return nil, fmt.Errorf("could not get chain ID: %w", err)
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channels.SyncCluster(chainID), e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	e.requestHandler = NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)

	return e, nil
}

// setupResponseMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED responses.
func (e *Engine) setupResponseMessageHandler() error {
	syncResponseQueue, err := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for sync responses: %w", err)
	}

	e.pendingSyncResponses = &engine.FifoMessageStore{
		FifoQueue: syncResponseQueue,
	}

	blockResponseQueue, err := fifoqueue.NewFifoQueue(defaultBlockResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for block responses: %w", err)
	}

	e.pendingBlockResponses = &engine.FifoMessageStore{
		FifoQueue: blockResponseQueue,
	}

	// define message queueing behaviour
	e.responseMessageHandler = engine.NewMessageHandler(
		e.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
				}
				return ok
			},
			Store: e.pendingSyncResponses,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
				}
				return ok
			},
			Store: e.pendingBlockResponses,
		},
	)

	return nil
}
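// A minimal construction-and-startup sketch, assuming the dependencies (log,
// metrics, net, me, participants, state, blocks, comp, core) are wired up
// elsewhere; WithScanInterval is the option name used by
// engine/common/synchronization, but treat it as an assumption here:
//
//	eng, err := New(log, metrics, net, me, participants, state, blocks, comp, core,
//		commonsync.WithScanInterval(2*time.Second), // assumed option name
//	)
//	if err != nil {
//		return fmt.Errorf("could not create cluster sync engine: %w", err)
//	}
//	<-eng.Ready() // blocks until both loops and the request handler are running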
// Ready returns a ready channel that is closed once the engine has fully started.
func (e *Engine) Ready() <-chan struct{} {
	e.lm.OnStart(func() {
		e.unit.Launch(e.checkLoop)
		e.unit.Launch(e.responseProcessingLoop)
		// wait for the request handler to start up
		<-e.requestHandler.Ready()
	})
	return e.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
		// signal the request handler to shut down
		requestHandlerDone := e.requestHandler.Done()
		// wait for the request sending and response processing routines to exit
		<-e.unit.Done()
		// wait for the request handler shutdown to complete
		<-requestHandlerDone
	})
	return e.lm.Stopped()
}

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

// process processes events for the synchronization engine.
// Error returns:
//   - IncompatibleInputTypeError if the input has an unexpected type
//   - all other errors are potential symptoms of internal state corruption or bugs (fatal)
func (e *Engine) process(originID flow.Identifier, event interface{}) error {
	switch event.(type) {
	case *messages.RangeRequest, *messages.BatchRequest, *messages.SyncRequest:
		return e.requestHandler.process(originID, event)
	case *messages.SyncResponse, *messages.ClusterBlockResponse:
		return e.responseMessageHandler.Process(originID, event)
	default:
		return fmt.Errorf("received input with type %T from %x: %w", event, originID[:], engine.IncompatibleInputTypeError)
	}
}

// responseProcessingLoop is a separate goroutine that performs processing of queued responses.
func (e *Engine) responseProcessingLoop() {
	notifier := e.responseMessageHandler.GetNotifier()
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-notifier:
			e.processAvailableResponses()
		}
	}
}
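// Note that Process never runs response business logic inline: the
// MessageHandler stores the payload into the matching FIFO and signals the
// notifier, and this loop drains the queues on its own goroutine. One
// wake-up is roughly equivalent to (illustrative pseudocode only):
//
//	<-notifier                      // a response was queued by Process
//	for each msg drained via Get(): // both FIFOs, sync responses first
//		onSyncResponse(...) or onBlockResponse(...)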
// processAvailableResponses processes pending events, driving them from the networking layer to the business logic.
func (e *Engine) processAvailableResponses() {
	for {
		select {
		case <-e.unit.Quit():
			return
		default:
		}

		msg, ok := e.pendingSyncResponses.Get()
		if ok {
			e.onSyncResponse(msg.OriginID, msg.Payload.(*messages.SyncResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
			continue
		}

		msg, ok = e.pendingBlockResponses.Get()
		if ok {
			e.onBlockResponse(msg.OriginID, msg.Payload.(*messages.ClusterBlockResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
			continue
		}

		// when there are no more messages in the queue, return so the processing
		// loop can wait for the next incoming message to arrive.
		return
	}
}

// onSyncResponse processes a synchronization response.
func (e *Engine) onSyncResponse(originID flow.Identifier, res *messages.SyncResponse) {
	final, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}
	e.core.HandleHeight(final, res.Height)
}

// onBlockResponse processes a response containing a specifically requested block.
func (e *Engine) onBlockResponse(originID flow.Identifier, res *messages.ClusterBlockResponse) {
	// process the blocks one by one
	for _, block := range res.Blocks {
		if !e.core.HandleBlock(&block.Header) {
			continue
		}
		synced := &events.SyncedClusterBlock{
			OriginID: originID,
			Block:    block,
		}
		e.comp.SubmitLocal(synced)
	}
}

// checkLoop will regularly scan for items that need requesting.
func (e *Engine) checkLoop() {
	pollChan := make(<-chan time.Time)
	if e.pollInterval > 0 {
		poll := time.NewTicker(e.pollInterval)
		pollChan = poll.C
		defer poll.Stop()
	}
	scan := time.NewTicker(e.scanInterval)

CheckLoop:
	for {
		// give the quit channel priority to be selected
		select {
		case <-e.unit.Quit():
			break CheckLoop
		default:
		}

		select {
		case <-e.unit.Quit():
			break CheckLoop
		case <-pollChan:
			e.pollHeight()
		case <-scan.C:
			final, err := e.state.Final().Head()
			if err != nil {
				e.log.Fatal().Err(err).Msg("could not get last finalized header")
				continue
			}
			ranges, batches := e.core.ScanPending(final)
			e.sendRequests(ranges, batches)
		}
	}

	// some minor cleanup
	scan.Stop()
}

// pollHeight will send a synchronization request to three random nodes.
func (e *Engine) pollHeight() {
	head, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}

	// send the request for synchronization
	req := &messages.SyncRequest{
		Nonce:  rand.Uint64(),
		Height: head.Height,
	}
	err = e.con.Multicast(req, synccore.DefaultPollNodes, e.participants.NodeIDs()...)
	if err != nil {
		e.log.Warn().Err(err).Msg("sending sync request to poll heights failed")
		return
	}
	e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
}
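// Timing behaviour of checkLoop, assuming the intervals come from
// commonsync.DefaultConfig() (the concrete durations are that package's
// concern, not this file's):
//
//	every pollInterval: pollHeight() multicasts a SyncRequest carrying our
//	                    finalized height to synccore.DefaultPollNodes peers
//	every scanInterval: ScanPending(final) yields outstanding ranges and
//	                    batches, which sendRequests below fans out
//
// Setting pollInterval to zero disables polling entirely: pollChan is then a
// channel that is never sent on, so its select case never fires.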
// sendRequests sends a request for each range and batch using consensus participants from last finalized snapshot.
func (e *Engine) sendRequests(ranges []chainsync.Range, batches []chainsync.Batch) {
	var errs *multierror.Error

	for _, ran := range ranges {
		req := &messages.RangeRequest{
			Nonce:      rand.Uint64(),
			FromHeight: ran.From,
			ToHeight:   ran.To,
		}
		err := e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit range request: %w", err))
			continue
		}
		e.log.Debug().
			Uint64("range_from", req.FromHeight).
			Uint64("range_to", req.ToHeight).
			Uint64("range_nonce", req.Nonce).
			Msg("range requested")
		e.core.RangeRequested(ran)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
	}

	for _, batch := range batches {
		req := &messages.BatchRequest{
			Nonce:    rand.Uint64(),
			BlockIDs: batch.BlockIDs,
		}
		err := e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit batch request: %w", err))
			continue
		}
		e.core.BatchRequested(batch)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
	}

	if err := errs.ErrorOrNil(); err != nil {
		e.log.Warn().Err(err).Msg("sending range and batch requests failed")
	}
}
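// As a concrete illustration of the fan-out above (the heights and IDs are
// made up): ScanPending returning the range [10, 20] plus a batch of two
// block IDs results in two multicasts, each to
// synccore.DefaultBlockRequestNodes randomly sampled peers:
//
//	&messages.RangeRequest{Nonce: <rand>, FromHeight: 10, ToHeight: 20}
//	&messages.BatchRequest{Nonce: <rand>, BlockIDs: []flow.Identifier{id1, id2}}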