github.com/amazechain/amc@v0.1.3/internal/sync/initial-sync/blocks_queue.go

package initialsync

import (
    "context"
    "errors"
    "time"

    "github.com/amazechain/amc/api/protocol/types_pb"
    "github.com/amazechain/amc/common"
    "github.com/amazechain/amc/internal/p2p"
    amcsync "github.com/amazechain/amc/internal/sync"
    "github.com/holiman/uint256"

    "github.com/libp2p/go-libp2p/core/peer"
)

const (
    // queueStopCallTimeout is the time allowed for the queue to release resources when quitting.
    queueStopCallTimeout = 1 * time.Second
    // pollingInterval defines how often the state machine checks for new events.
    pollingInterval = 200 * time.Millisecond
    // staleEpochTimeout is a period after which an epoch's state is considered stale.
    staleEpochTimeout = 1 * time.Second
    // skippedMachineTimeout is a period after which a skipped machine is considered stuck
    // and is reset (if the machine is the last one, then all machines are reset and a search
    // for the skipped slot or backtracking takes place).
    skippedMachineTimeout = 10 * staleEpochTimeout
    // lookaheadSteps is a limit on how many forward steps are loaded into the queue.
    // Each step is managed by an assigned finite state machine. Must be >= 2.
    lookaheadSteps = 2
    // noRequiredPeersErrMaxRetries defines the number of retries when no required peers are found.
    noRequiredPeersErrMaxRetries = 1000
    // noRequiredPeersErrRefreshInterval defines the interval for which the queue is paused
    // before making the next attempt to obtain data.
    noRequiredPeersErrRefreshInterval = 15 * time.Second
    // maxResetAttempts is the number of times a stale FSM is reset before backtracking is triggered.
    maxResetAttempts = 4
    // startBackSlots defines the number of slots before the current head at which the initial
    // machine starts. This adds robustness in case normal sync set the head to an orphaned
    // block: starting earlier and re-fetching blocks allows the chain to be reorganized.
    //startBackSlots = 32
)

var (
    errQueueCtxIsDone             = errors.New("queue's context is done, reinitialize")
    errQueueTakesTooLongToStop    = errors.New("queue takes too long to stop")
    errInvalidInitialState        = errors.New("invalid initial state")
    errInputNotFetchRequestParams = errors.New("input data is not type *fetchRequestParams")
    errNoRequiredPeers            = errors.New("no peers with required blocks are found")
)

// syncMode specifies the sync mode type.
type syncMode uint8

const (
    modeStopOnFinalizedEpoch syncMode = iota
    modeNonConstrained
)

// blocksQueueConfig is a config to set up the block queue service.
type blocksQueueConfig struct {
    blocksFetcher          *blocksFetcher
    chain                  common.IBlockChain
    highestExpectedBlockNr *uint256.Int
    p2p                    p2p.P2P
    mode                   syncMode
}
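
// Usage sketch (illustrative, not part of the original file): the caller supplies
// the chain, the p2p service, and a target block number; blocksFetcher may be left
// nil, in which case newBlocksQueue constructs one itself. The names chain, p2pSvc,
// and target are assumed to be provided by the surrounding code.
//
//    queue := newBlocksQueue(ctx, &blocksQueueConfig{
//        chain:                  chain,  // common.IBlockChain implementation
//        p2p:                    p2pSvc, // p2p.P2P service
//        highestExpectedBlockNr: target, // block number at which constrained sync stops
//        mode:                   modeStopOnFinalizedEpoch,
//    })
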
// blocksQueue is a priority queue that serves as an intermediary between block fetchers (producers)
// and the block processing goroutine (consumer). The consumer can rely on the order of incoming blocks.
type blocksQueue struct {
    ctx                    context.Context
    cancel                 context.CancelFunc
    smm                    *stateMachineManager
    blocksFetcher          *blocksFetcher
    chain                  common.IBlockChain
    highestExpectedBlockNr *uint256.Int
    mode                   syncMode
    exitConditions         struct {
        noRequiredPeersErrRetries int
    }
    fetchedData chan *blocksQueueFetchedData // output channel for ready blocks
    quit        chan struct{}                // termination notifier
}

// blocksQueueFetchedData is a data container that is returned from the queue on each step.
type blocksQueueFetchedData struct {
    pid    peer.ID
    blocks []*types_pb.Block
}

// newBlocksQueue creates an initialized priority queue.
func newBlocksQueue(ctx context.Context, cfg *blocksQueueConfig) *blocksQueue {
    ctx, cancel := context.WithCancel(ctx)

    blocksFetcher := cfg.blocksFetcher
    if blocksFetcher == nil {
        blocksFetcher = newBlocksFetcher(ctx, &blocksFetcherConfig{
            chain: cfg.chain,
            p2p:   cfg.p2p,
        })
    }
    highestExpectedBlockNr := cfg.highestExpectedBlockNr

    // Override the fetcher's sync mode.
    blocksFetcher.mode = cfg.mode

    queue := &blocksQueue{
        ctx:                    ctx,
        cancel:                 cancel,
        highestExpectedBlockNr: highestExpectedBlockNr,
        blocksFetcher:          blocksFetcher,
        chain:                  cfg.chain,
        mode:                   cfg.mode,
        fetchedData:            make(chan *blocksQueueFetchedData, 1),
        quit:                   make(chan struct{}),
    }

    // Configure state machines.
    queue.smm = newStateMachineManager()
    queue.smm.addEventHandler(eventTick, stateNew, queue.onScheduleEvent(ctx))
    queue.smm.addEventHandler(eventDataReceived, stateScheduled, queue.onDataReceivedEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateDataParsed, queue.onReadyToSendEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateSkipped, queue.onProcessSkippedEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateSent, queue.onCheckStaleEvent(ctx))

    return queue
}

// start boots up the queue processing.
func (q *blocksQueue) start() error {
    select {
    case <-q.ctx.Done():
        return errQueueCtxIsDone
    default:
        go q.loop()
        return nil
    }
}

// stop terminates all queue operations.
func (q *blocksQueue) stop() error {
    q.cancel()
    select {
    case <-q.quit:
        return nil
    case <-time.After(queueStopCallTimeout):
        return errQueueTakesTooLongToStop
    }
}
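
// Consumer sketch (illustrative): downstream code drains fetchedData until loop()
// closes the channel on shutdown. processBlocks is a hypothetical caller-side
// helper; the queue itself only guarantees that blocks arrive in ascending order.
//
//    for data := range queue.fetchedData {
//        if err := processBlocks(ctx, data.pid, data.blocks); err != nil {
//            log.Debug("Block processing failed", "err", err)
//        }
//    }
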
// loop is the main queue loop.
func (q *blocksQueue) loop() {
    defer close(q.quit)

    defer func() {
        q.blocksFetcher.stop()
        close(q.fetchedData)
    }()

    if err := q.blocksFetcher.start(); err != nil {
        log.Debug("Cannot start blocks provider", "err", err)
    }

    // Define initial state machines.
    // currentblock update?
    startBlockNr := new(uint256.Int).AddUint64(q.chain.CurrentBlock().Number64(), 1)
    blocksPerRequest := q.blocksFetcher.blocksPerPeriod
    for i := startBlockNr.Clone(); i.Cmp(new(uint256.Int).AddUint64(startBlockNr, blocksPerRequest*lookaheadSteps)) == -1; i = i.AddUint64(i, blocksPerRequest) {
        q.smm.addStateMachine(i)
    }

    ticker := time.NewTicker(pollingInterval)
    defer ticker.Stop()
    for {
        if waitHighestExpectedBlockNr(q) {
            continue
        }

        log.Trace("tick",
            "highestExpectedBlockNr", q.highestExpectedBlockNr,
            "ourBlockNr", q.chain.CurrentBlock().Number64(),
            "state", q.smm.String(),
        )

        select {
        case <-ticker.C:
            for _, key := range q.smm.keys {
                fsm := q.smm.machines[key.Uint64()]
                if err := fsm.trigger(eventTick, nil); err != nil {
                    log.Debug("Cannot trigger event",
                        "highestExpectedBlockNr", q.highestExpectedBlockNr,
                        "noRequiredPeersErrRetries", q.exitConditions.noRequiredPeersErrRetries,
                        "event", eventTick,
                        "start", fsm.start,
                        "error", err.Error(),
                    )
                    if errors.Is(err, errNoRequiredPeers) {
                        forceExit := q.exitConditions.noRequiredPeersErrRetries > noRequiredPeersErrMaxRetries
                        if forceExit {
                            q.cancel()
                        } else {
                            q.exitConditions.noRequiredPeersErrRetries++
                            log.Debug("Waiting for finalized peers")
                            time.Sleep(noRequiredPeersErrRefreshInterval)
                        }
                        continue
                    }
                }
                // Do garbage collection, and advance the sliding window forward.
                if q.chain.CurrentBlock().Number64().Cmp(new(uint256.Int).AddUint64(fsm.start, blocksPerRequest-1)) >= 0 {
                    highestStartSlot, err := q.smm.highestStartSlot()
                    if err != nil {
                        log.Debug("Cannot obtain highest epoch state number", "err", err)
                        continue
                    }
                    if err := q.smm.removeStateMachine(fsm.start); err != nil {
                        log.Debug("Cannot remove state machine", "err", err)
                    }
                    if len(q.smm.machines) < lookaheadSteps {
                        q.smm.addStateMachine(new(uint256.Int).AddUint64(highestStartSlot, blocksPerRequest))
                    }
                }
            }
        case response, ok := <-q.blocksFetcher.requestResponses():
            if !ok {
                log.Debug("Fetcher closed output channel")
                q.cancel()
                return
            }
            // Update the state of the epoch for which data is received.
            if fsm, ok := q.smm.findStateMachine(response.start); ok {
                if err := fsm.trigger(eventDataReceived, response); err != nil {
                    log.Debug("Cannot process event",
                        "event", eventDataReceived,
                        "start", fsm.start,
                        "error", err.Error(),
                    )
                    fsm.setState(stateNew)
                    continue
                }
            }
        case <-q.ctx.Done():
            log.Debug("Context closed, exiting goroutine (blocks queue)")
            return
        }
    }
}

func waitHighestExpectedBlockNr(q *blocksQueue) bool {
    // Re-check the highest expected blockNr when we approach the chain's head.
    if q.chain.CurrentBlock().Number64().Cmp(q.highestExpectedBlockNr) >= 0 {
        // By the time initial sync is complete, the highest block number may have increased; re-check.
        targetBlockNr := q.blocksFetcher.bestFinalizedBlockNr()
        if q.highestExpectedBlockNr.Cmp(targetBlockNr) == -1 {
            q.highestExpectedBlockNr = targetBlockNr
            return true
        }
        log.Debug("Highest expected blockNr reached", "blockNr", targetBlockNr)
        q.cancel()
    }
    return false
}
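
// Sliding-window walkthrough (illustrative; assumes blocksPerPeriod = 64): with the
// current head at block 100, loop() seeds machines with start blocks 101 and 165
// (lookaheadSteps = 2, one request of 64 blocks per machine). Once the head reaches
// block 164 (start+blocksPerRequest-1 of the first machine), that machine is garbage
// collected and a replacement is appended at 229 (highest start + blocksPerRequest),
// sliding the window forward without ever holding more than lookaheadSteps machines.
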
// onScheduleEvent is an event called on newly arrived epochs. It transforms the state to scheduled.
func (q *blocksQueue) onScheduleEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if m.state != stateNew {
            return m.state, errInvalidInitialState
        }
        if m.start.Cmp(q.highestExpectedBlockNr) == 1 {
            m.setState(stateSkipped)
            return m.state, errBlockNrIsTooHigh
        }
        blocksPerRequest := q.blocksFetcher.blocksPerPeriod
        if q.highestExpectedBlockNr.Cmp(new(uint256.Int).AddUint64(m.start, blocksPerRequest)) < 0 {
            // Trim the request so that it does not overshoot the highest expected block.
            blocksPerRequest = new(uint256.Int).Sub(q.highestExpectedBlockNr, m.start).Uint64() + 1
        }
        if err := q.blocksFetcher.scheduleRequest(ctx, m.start, blocksPerRequest); err != nil {
            return m.state, err
        }
        return stateScheduled, nil
    }
}

// onDataReceivedEvent is an event called when data is received from the fetcher.
func (q *blocksQueue) onDataReceivedEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateScheduled {
            return m.state, errInvalidInitialState
        }
        response, ok := in.(*fetchRequestResponse)
        if !ok {
            return m.state, errInputNotFetchRequestParams
        }
        if response.err != nil {
            switch response.err {
            //todo
            case errBlockNrIsTooHigh:
                // The current window is already too big, re-request previous epochs.
                for _, fsm := range q.smm.machines {
                    if fsm.start.Cmp(response.start) == -1 && fsm.state == stateSkipped {
                        fsm.setState(stateNew)
                    }
                }
            case amcsync.ErrInvalidFetchedData:
                // The peer returned invalid data, penalize it.
                q.blocksFetcher.p2p.Peers().Scorers().BadResponsesScorer().Increment(response.pid)
                log.Debug("Peer is penalized for invalid blocks", "pid", response.pid)
            }
            return m.state, response.err
        }
        m.pid = response.pid
        m.blocks = response.blocks
        return stateDataParsed, nil
    }
}

// onReadyToSendEvent is an event called to allow epochs with available blocks to send them downstream.
func (q *blocksQueue) onReadyToSendEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateDataParsed {
            return m.state, errInvalidInitialState
        }

        if len(m.blocks) == 0 {
            return stateSkipped, nil
        }

        send := func() (stateID, error) {
            data := &blocksQueueFetchedData{
                pid:    m.pid,
                blocks: m.blocks,
            }
            select {
            case <-ctx.Done():
                return m.state, ctx.Err()
            case q.fetchedData <- data:
            }
            return stateSent, nil
        }

        // Make sure that we send epochs in the correct order.
        // If the machine is the first one (has the lowest start block), send.
        if m.isFirst() {
            return send()
        }

        // Make sure that the previous epoch is already processed.
        for _, fsm := range q.smm.machines {
            // Review only previous slots.
            if fsm.start.Cmp(m.start) == -1 {
                switch fsm.state {
                case stateNew, stateScheduled, stateDataParsed:
                    return m.state, nil
                }
            }
        }

        return send()
    }
}
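
// State transition summary, as wired in newBlocksQueue (each arrow marks the state
// a handler returns on success; error paths keep or reset the current state):
//
//    stateNew        --eventTick---------> stateScheduled  (onScheduleEvent)
//    stateScheduled  --eventDataReceived-> stateDataParsed (onDataReceivedEvent)
//    stateDataParsed --eventTick---------> stateSent or stateSkipped (onReadyToSendEvent)
//    stateSkipped    --eventTick---------> stateNew or a window reset (onProcessSkippedEvent)
//    stateSent       --eventTick---------> stateSkipped once stale (onCheckStaleEvent)
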
// onProcessSkippedEvent is an event triggered on skipped machines, allowing handlers to
// extend the lookahead window when progress is not possible otherwise.
func (q *blocksQueue) onProcessSkippedEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateSkipped {
            return m.state, errInvalidInitialState
        }

        // Only the highest epoch with the skipped state can trigger an extension.
        if !m.isLast() {
            // When a state machine stays in the skipped state for too long, reset it.
            if time.Since(m.updated) > skippedMachineTimeout {
                return stateNew, nil
            }
            return m.state, nil
        }

        // Make sure that all machines are in the skipped state, i.e. the manager cannot progress
        // without a reset or without moving the last machine's start block forward (in an attempt
        // to find the next non-skipped block).
        if !q.smm.allMachinesInState(stateSkipped) {
            return m.state, nil
        }

        // Check if we have enough peers to progress, or whether sync needs to halt
        // (due to no peers being available).
        //bestFinalizedSlot := q.blocksFetcher.bestFinalizedBlockNr()
        if q.blocksFetcher.bestFinalizedBlockNr().Cmp(q.chain.CurrentBlock().Number64()) >= 0 {
            return stateSkipped, errNoRequiredPeers
        }

        // All machines are skipped, the FSMs need a reset.
        startBlockNr := new(uint256.Int).AddUint64(q.chain.CurrentBlock().Number64(), 1)

        //todo q.blocksFetcher.findFork(ctx, startSlot)

        return stateSkipped, q.resetFromBlockNr(ctx, startBlockNr)
    }
}

// onCheckStaleEvent is an event that allows marking stale epochs,
// so that they can be re-processed.
func (_ *blocksQueue) onCheckStaleEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateSent {
            return m.state, errInvalidInitialState
        }

        // Break out immediately if the bucket is not stale.
        if time.Since(m.updated) < staleEpochTimeout {
            return m.state, nil
        }

        return stateSkipped, nil
    }
}
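
// End-to-end lifecycle sketch (illustrative; handleBlocks is a hypothetical consumer
// callback). start() launches loop() in its own goroutine, the range loop ends when
// loop() closes fetchedData, and stop() bounds shutdown by queueStopCallTimeout:
//
//    queue := newBlocksQueue(ctx, cfg)
//    if err := queue.start(); err != nil {
//        return err
//    }
//    for data := range queue.fetchedData {
//        handleBlocks(data)
//    }
//    if err := queue.stop(); err != nil {
//        log.Debug("Queue stop", "err", err)
//    }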