github.com/prysmaticlabs/prysm@v1.4.4/beacon-chain/sync/initial-sync/blocks_queue.go

package initialsync

import (
    "context"
    "errors"
    "time"

    "github.com/libp2p/go-libp2p-core/peer"
    types "github.com/prysmaticlabs/eth2-types"
    "github.com/prysmaticlabs/prysm/beacon-chain/core/helpers"
    "github.com/prysmaticlabs/prysm/beacon-chain/db"
    "github.com/prysmaticlabs/prysm/beacon-chain/p2p"
    beaconsync "github.com/prysmaticlabs/prysm/beacon-chain/sync"
    "github.com/prysmaticlabs/prysm/proto/interfaces"
    "github.com/sirupsen/logrus"
)

const (
    // queueStopCallTimeout is the time allowed for the queue to release resources when quitting.
    queueStopCallTimeout = 1 * time.Second
    // pollingInterval defines how often the state machine needs to check for new events.
    pollingInterval = 200 * time.Millisecond
    // staleEpochTimeout is a period after which an epoch's state is considered stale.
    staleEpochTimeout = 1 * time.Second
    // skippedMachineTimeout is a period after which a skipped machine is considered stuck
    // and is reset (if the machine is the last one, then all machines are reset and a search
    // for a skipped slot or backtracking takes place).
    skippedMachineTimeout = 10 * staleEpochTimeout
    // lookaheadSteps is a limit on how many forward steps are loaded into the queue.
    // Each step is managed by an assigned finite state machine. Must be >= 2.
    lookaheadSteps = 8
    // noRequiredPeersErrMaxRetries defines the number of retries when no required peers are found.
    noRequiredPeersErrMaxRetries = 1000
    // noRequiredPeersErrRefreshInterval defines the interval for which the queue is paused before
    // making the next attempt to obtain data.
    noRequiredPeersErrRefreshInterval = 15 * time.Second
    // maxResetAttempts is the number of times a stale FSM is reset before backtracking is triggered.
    maxResetAttempts = 4
    // startBackSlots defines the number of slots before the current head that marks the start
    // position of the initial machine. This adds robustness in case normal sync has set the head
    // to an orphaned block: starting earlier and re-fetching blocks allows the chain to be reorganized.
    startBackSlots = 32
)

var (
    errQueueCtxIsDone             = errors.New("queue's context is done, reinitialize")
    errQueueTakesTooLongToStop    = errors.New("queue takes too long to stop")
    errInvalidInitialState        = errors.New("invalid initial state")
    errInputNotFetchRequestParams = errors.New("input data is not type *fetchRequestParams")
    errNoRequiredPeers            = errors.New("no peers with required blocks are found")
)

const (
    modeStopOnFinalizedEpoch syncMode = iota
    modeNonConstrained
)

// syncMode specifies the sync mode type.
type syncMode uint8

// blocksQueueConfig is a config to set up the block queue service.
type blocksQueueConfig struct {
    blocksFetcher       *blocksFetcher
    chain               blockchainService
    highestExpectedSlot types.Slot
    p2p                 p2p.P2P
    db                  db.ReadOnlyDatabase
    mode                syncMode
}

// blocksQueue is a priority queue that serves as an intermediary between block fetchers (producers)
// and the block processing goroutine (consumer). The consumer can rely on the order of incoming blocks.
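//
// A minimal wiring sketch (an illustration only, not the actual call site;
// chainService, p2pService, and beaconDB stand in for the service's real
// dependencies):
//
//    queue := newBlocksQueue(ctx, &blocksQueueConfig{
//        chain: chainService, // blockchainService implementation (placeholder)
//        p2p:   p2pService,   // p2p.P2P implementation (placeholder)
//        db:    beaconDB,     // db.ReadOnlyDatabase (placeholder)
//        mode:  modeStopOnFinalizedEpoch,
//    })
//    if err := queue.start(); err != nil {
//        // handle startup error
//    }
//    for data := range queue.fetchedData {
//        // Process data.blocks; data.pid identifies the serving peer.
//    }
//    if err := queue.stop(); err != nil {
//        // queue took too long to stop
//    }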
type blocksQueue struct {
    ctx                 context.Context
    cancel              context.CancelFunc
    smm                 *stateMachineManager
    blocksFetcher       *blocksFetcher
    chain               blockchainService
    highestExpectedSlot types.Slot
    mode                syncMode
    exitConditions      struct {
        noRequiredPeersErrRetries int
    }
    fetchedData chan *blocksQueueFetchedData // output channel for ready blocks
    staleEpochs map[types.Epoch]uint8        // counter to keep track of stale FSMs
    quit        chan struct{}                // termination notifier
}

// blocksQueueFetchedData is a data container that is returned from the queue on each step.
type blocksQueueFetchedData struct {
    pid    peer.ID
    blocks []interfaces.SignedBeaconBlock
}

// newBlocksQueue creates an initialized priority queue.
func newBlocksQueue(ctx context.Context, cfg *blocksQueueConfig) *blocksQueue {
    ctx, cancel := context.WithCancel(ctx)

    blocksFetcher := cfg.blocksFetcher
    if blocksFetcher == nil {
        blocksFetcher = newBlocksFetcher(ctx, &blocksFetcherConfig{
            chain: cfg.chain,
            p2p:   cfg.p2p,
            db:    cfg.db,
        })
    }
    highestExpectedSlot := cfg.highestExpectedSlot
    if highestExpectedSlot == 0 {
        if cfg.mode == modeStopOnFinalizedEpoch {
            highestExpectedSlot = blocksFetcher.bestFinalizedSlot()
        } else {
            highestExpectedSlot = blocksFetcher.bestNonFinalizedSlot()
        }
    }

    // Override fetcher's sync mode.
    blocksFetcher.mode = cfg.mode

    queue := &blocksQueue{
        ctx:                 ctx,
        cancel:              cancel,
        highestExpectedSlot: highestExpectedSlot,
        blocksFetcher:       blocksFetcher,
        chain:               cfg.chain,
        mode:                cfg.mode,
        fetchedData:         make(chan *blocksQueueFetchedData, 1),
        quit:                make(chan struct{}),
        staleEpochs:         make(map[types.Epoch]uint8),
    }

    // Configure state machines.
    queue.smm = newStateMachineManager()
    queue.smm.addEventHandler(eventTick, stateNew, queue.onScheduleEvent(ctx))
    queue.smm.addEventHandler(eventDataReceived, stateScheduled, queue.onDataReceivedEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateDataParsed, queue.onReadyToSendEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateSkipped, queue.onProcessSkippedEvent(ctx))
    queue.smm.addEventHandler(eventTick, stateSent, queue.onCheckStaleEvent(ctx))

    return queue
}

// start boots up the queue processing.
func (q *blocksQueue) start() error {
    select {
    case <-q.ctx.Done():
        return errQueueCtxIsDone
    default:
        go q.loop()
        return nil
    }
}

// stop terminates all queue operations.
func (q *blocksQueue) stop() error {
    q.cancel()
    select {
    case <-q.quit:
        return nil
    case <-time.After(queueStopCallTimeout):
        return errQueueTakesTooLongToStop
    }
}

// loop is the main queue loop.
func (q *blocksQueue) loop() {
    defer close(q.quit)

    defer func() {
        q.blocksFetcher.stop()
        close(q.fetchedData)
    }()

    if err := q.blocksFetcher.start(); err != nil {
        log.WithError(err).Debug("Can not start blocks provider")
    }

    // Define initial state machines.
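    // As a worked example, assuming the fetcher's blocksPerSecond equals the
    // default batch size of 64: with lookaheadSteps = 8 the loop below creates
    // eight machines, one per 64-slot span, covering slots [startSlot, startSlot+512).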
    startSlot := q.chain.HeadSlot()
    if startSlot > startBackSlots {
        startSlot -= startBackSlots
    }
    blocksPerRequest := q.blocksFetcher.blocksPerSecond
    for i := startSlot; i < startSlot.Add(blocksPerRequest*lookaheadSteps); i += types.Slot(blocksPerRequest) {
        q.smm.addStateMachine(i)
    }

    ticker := time.NewTicker(pollingInterval)
    defer ticker.Stop()
    for {
        // Check the highest expected slot when we approach the chain's head slot.
        if q.chain.HeadSlot() >= q.highestExpectedSlot {
            // By the time initial sync is complete, the highest slot may have increased, so re-check.
            if q.mode == modeStopOnFinalizedEpoch {
                if q.highestExpectedSlot < q.blocksFetcher.bestFinalizedSlot() {
                    q.highestExpectedSlot = q.blocksFetcher.bestFinalizedSlot()
                    continue
                }
            } else {
                if q.highestExpectedSlot < q.blocksFetcher.bestNonFinalizedSlot() {
                    q.highestExpectedSlot = q.blocksFetcher.bestNonFinalizedSlot()
                    continue
                }
            }
            log.WithField("slot", q.highestExpectedSlot).Debug("Highest expected slot reached")
            q.cancel()
        }

        log.WithFields(logrus.Fields{
            "highestExpectedSlot": q.highestExpectedSlot,
            "headSlot":            q.chain.HeadSlot(),
            "state":               q.smm.String(),
            "staleEpoch":          q.staleEpochs,
        }).Trace("tick")

        select {
        case <-ticker.C:
            for _, key := range q.smm.keys {
                fsm := q.smm.machines[key]
                if err := fsm.trigger(eventTick, nil); err != nil {
                    log.WithFields(logrus.Fields{
                        "highestExpectedSlot":       q.highestExpectedSlot,
                        "noRequiredPeersErrRetries": q.exitConditions.noRequiredPeersErrRetries,
                        "event":                     eventTick,
                        "epoch":                     helpers.SlotToEpoch(fsm.start),
                        "start":                     fsm.start,
                        "error":                     err.Error(),
                    }).Debug("Can not trigger event")
                    if errors.Is(err, errNoRequiredPeers) {
                        forceExit := q.exitConditions.noRequiredPeersErrRetries > noRequiredPeersErrMaxRetries
                        if q.mode == modeStopOnFinalizedEpoch || forceExit {
                            q.cancel()
                        } else {
                            q.exitConditions.noRequiredPeersErrRetries++
                            log.Debug("Waiting for finalized peers")
                            time.Sleep(noRequiredPeersErrRefreshInterval)
                        }
                        continue
                    }
                }
                // Do garbage collection, and advance the sliding window forward.
                if q.chain.HeadSlot() >= fsm.start.Add(blocksPerRequest-1) {
                    highestStartSlot, err := q.smm.highestStartSlot()
                    if err != nil {
                        log.WithError(err).Debug("Cannot obtain highest epoch state number")
                        continue
                    }
                    if err := q.smm.removeStateMachine(fsm.start); err != nil {
                        log.WithError(err).Debug("Can not remove state machine")
                    }
                    if len(q.smm.machines) < lookaheadSteps {
                        q.smm.addStateMachine(highestStartSlot.Add(blocksPerRequest))
                    }
                }
            }
        case response, ok := <-q.blocksFetcher.requestResponses():
            if !ok {
                log.Debug("Fetcher closed output channel")
                q.cancel()
                return
            }
            // Update the state of the epoch for which data is received.
            if fsm, ok := q.smm.findStateMachine(response.start); ok {
                if err := fsm.trigger(eventDataReceived, response); err != nil {
                    log.WithFields(logrus.Fields{
                        "event": eventDataReceived,
                        "epoch": helpers.SlotToEpoch(fsm.start),
                        "error": err.Error(),
                    }).Debug("Can not process event")
                    fsm.setState(stateNew)
                    continue
                }
            }
        case <-q.ctx.Done():
            log.Debug("Context closed, exiting goroutine (blocks queue)")
            return
        }
    }
}
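
// The handlers below implement the per-machine life cycle wired up in newBlocksQueue.
// A summary of the happy-path transitions (on errors a machine stays in, or is reset
// to, an earlier state):
//
//    stateNew        --eventTick---------> stateScheduled  (onScheduleEvent)
//    stateScheduled  --eventDataReceived-> stateDataParsed (onDataReceivedEvent)
//    stateDataParsed --eventTick---------> stateSent       (onReadyToSendEvent)
//    stateSent       --eventTick---------> stateSkipped    (onCheckStaleEvent, when stale)
//    stateSkipped    --eventTick---------> reset/extension (onProcessSkippedEvent)
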
// onScheduleEvent is an event called on newly arrived epochs. Transforms state to scheduled.
func (q *blocksQueue) onScheduleEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if m.state != stateNew {
            return m.state, errInvalidInitialState
        }
        if m.start > q.highestExpectedSlot {
            m.setState(stateSkipped)
            return m.state, errSlotIsTooHigh
        }
        blocksPerRequest := q.blocksFetcher.blocksPerSecond
        if err := q.blocksFetcher.scheduleRequest(ctx, m.start, blocksPerRequest); err != nil {
            return m.state, err
        }
        return stateScheduled, nil
    }
}

// onDataReceivedEvent is an event called when data is received from the fetcher.
func (q *blocksQueue) onDataReceivedEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateScheduled {
            return m.state, errInvalidInitialState
        }
        response, ok := in.(*fetchRequestResponse)
        if !ok {
            return m.state, errInputNotFetchRequestParams
        }
        if response.err != nil {
            switch response.err {
            case errSlotIsTooHigh:
                // Current window is already too big, re-request previous epochs.
                for _, fsm := range q.smm.machines {
                    if fsm.start < response.start && fsm.state == stateSkipped {
                        fsm.setState(stateNew)
                    }
                }
            case beaconsync.ErrInvalidFetchedData:
                // Peer returned invalid data, penalize.
                q.blocksFetcher.p2p.Peers().Scorers().BadResponsesScorer().Increment(m.pid)
                log.WithField("pid", response.pid).Debug("Peer is penalized for invalid blocks")
            }
            return m.state, response.err
        }
        m.pid = response.pid
        m.blocks = response.blocks
        return stateDataParsed, nil
    }
}

// onReadyToSendEvent is an event called to allow epochs with available blocks to send them downstream.
func (q *blocksQueue) onReadyToSendEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateDataParsed {
            return m.state, errInvalidInitialState
        }

        if len(m.blocks) == 0 {
            return stateSkipped, nil
        }

        send := func() (stateID, error) {
            data := &blocksQueueFetchedData{
                pid:    m.pid,
                blocks: m.blocks,
            }
            select {
            case <-ctx.Done():
                return m.state, ctx.Err()
            case q.fetchedData <- data:
            }
            return stateSent, nil
        }

        // Make sure that we send epochs in the correct order.
        // If the machine is the first one (has the lowest start slot), send.
        if m.isFirst() {
            return send()
        }

        // Make sure that previous epochs are already processed.
        for _, fsm := range q.smm.machines {
            // Review only previous slots.
            if fsm.start < m.start {
                switch fsm.state {
                case stateNew, stateScheduled, stateDataParsed:
                    return m.state, nil
                }
            }
        }

        return send()
    }
}

// onProcessSkippedEvent is an event triggered on skipped machines, allowing handlers to
// extend the lookahead window when progress is not possible otherwise.
func (q *blocksQueue) onProcessSkippedEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateSkipped {
            return m.state, errInvalidInitialState
        }
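
        // From here the handler escalates in stages: a non-last stuck machine is simply
        // reset; once every machine is skipped, sync either halts on missing peers or
        // resets the whole window (probing alternative forks in non-constrained mode
        // after maxResetAttempts resets of the same epoch).
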
        // Only the highest epoch with skipped state can trigger extension.
        if !m.isLast() {
            // When a state machine stays in the skipped state for too long, reset it.
            if time.Since(m.updated) > skippedMachineTimeout {
                return stateNew, nil
            }
            return m.state, nil
        }

        // Make sure that all machines are in the skipped state, i.e. the manager cannot
        // progress without a reset or without moving the last machine's start slot forward
        // (in an attempt to find the next non-skipped block).
        if !q.smm.allMachinesInState(stateSkipped) {
            return m.state, nil
        }

        // Check if we have enough peers to progress, or sync needs to halt (due to no peers available).
        bestFinalizedSlot := q.blocksFetcher.bestFinalizedSlot()
        if q.mode == modeStopOnFinalizedEpoch {
            if bestFinalizedSlot <= q.chain.HeadSlot() {
                return stateSkipped, errNoRequiredPeers
            }
        } else {
            if q.blocksFetcher.bestNonFinalizedSlot() <= q.chain.HeadSlot() {
                return stateSkipped, errNoRequiredPeers
            }
        }

        // All machines are skipped, FSMs need a reset.
        startSlot := q.chain.HeadSlot() + 1
        if q.mode == modeNonConstrained && startSlot > bestFinalizedSlot {
            q.staleEpochs[helpers.SlotToEpoch(startSlot)]++
            // If FSMs have been reset enough times, try to explore alternative forks.
            if q.staleEpochs[helpers.SlotToEpoch(startSlot)] >= maxResetAttempts {
                delete(q.staleEpochs, helpers.SlotToEpoch(startSlot))
                fork, err := q.blocksFetcher.findFork(ctx, startSlot)
                if err == nil {
                    return stateSkipped, q.resetFromFork(ctx, fork)
                }
                log.WithFields(logrus.Fields{
                    "epoch": helpers.SlotToEpoch(startSlot),
                    "error": err.Error(),
                }).Debug("Can not explore alternative branches")
            }
        }
        return stateSkipped, q.resetFromSlot(ctx, startSlot)
    }
}

// onCheckStaleEvent is an event that allows marking epochs as stale,
// so that they can be re-processed.
func (q *blocksQueue) onCheckStaleEvent(ctx context.Context) eventHandlerFn {
    return func(m *stateMachine, in interface{}) (stateID, error) {
        if ctx.Err() != nil {
            return m.state, ctx.Err()
        }
        if m.state != stateSent {
            return m.state, errInvalidInitialState
        }

        // Break out immediately if the bucket is not stale.
        if time.Since(m.updated) < staleEpochTimeout {
            return m.state, nil
        }

        return stateSkipped, nil
    }
}
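
// A note on the timing constants above: a machine in stateSent is re-checked every
// pollingInterval (200ms); once more than staleEpochTimeout (1s) passes without an
// update, onCheckStaleEvent flips it to stateSkipped, and a non-last machine stuck
// in stateSkipped for over skippedMachineTimeout (10s) is reset to stateNew by
// onProcessSkippedEvent.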