github.com/prysmaticlabs/prysm@v1.4.4/beacon-chain/sync/initial-sync/blocks_queue.go

     1  package initialsync
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"time"
     7  
     8  	"github.com/libp2p/go-libp2p-core/peer"
     9  	types "github.com/prysmaticlabs/eth2-types"
    10  	"github.com/prysmaticlabs/prysm/beacon-chain/core/helpers"
    11  	"github.com/prysmaticlabs/prysm/beacon-chain/db"
    12  	"github.com/prysmaticlabs/prysm/beacon-chain/p2p"
    13  	beaconsync "github.com/prysmaticlabs/prysm/beacon-chain/sync"
    14  	"github.com/prysmaticlabs/prysm/proto/interfaces"
    15  	"github.com/sirupsen/logrus"
    16  )
    17  
    18  const (
    19  	// queueStopCallTimeout is the time allowed for the queue to release resources when quitting.
    20  	queueStopCallTimeout = 1 * time.Second
    21  	// pollingInterval defines how often the state machine needs to check for new events.
    22  	pollingInterval = 200 * time.Millisecond
    23  	// staleEpochTimeout is a period after which an epoch's state is considered stale.
    24  	staleEpochTimeout = 1 * time.Second
    25  	// skippedMachineTimeout is a period after which a skipped machine is considered stuck
    26  	// and is reset (if the machine is the last one, then all machines are reset and a search for
    27  	// a skipped slot or backtracking takes place).
    28  	skippedMachineTimeout = 10 * staleEpochTimeout
    29  	// lookaheadSteps is a limit on how many forward steps are loaded into the queue.
    30  	// Each step is managed by an assigned finite state machine. Must be >= 2.
    31  	lookaheadSteps = 8
    32  	// noRequiredPeersErrMaxRetries defines the number of retries when no required peers are found.
    33  	noRequiredPeersErrMaxRetries = 1000
    34  	// noRequiredPeersErrRefreshInterval defines the interval for which the queue is paused before
    35  	// making the next attempt to obtain data.
    36  	noRequiredPeersErrRefreshInterval = 15 * time.Second
    37  	// maxResetAttempts is the number of times a stale FSM is reset before backtracking is triggered.
    38  	maxResetAttempts = 4
    39  	// startBackSlots defines the number of slots before the current head from which the initial
    40  	// machine starts. This adds robustness in case normal sync has set the head to an orphaned
    41  	// block: starting earlier and re-fetching blocks allows the chain to reorganize.
    42  	startBackSlots = 32
    43  )
    44  
    45  var (
    46  	errQueueCtxIsDone             = errors.New("queue's context is done, reinitialize")
    47  	errQueueTakesTooLongToStop    = errors.New("queue takes too long to stop")
    48  	errInvalidInitialState        = errors.New("invalid initial state")
    49  	errInputNotFetchRequestParams = errors.New("input data is not type *fetchRequestParams")
    50  	errNoRequiredPeers            = errors.New("no peers with required blocks are found")
    51  )
    52  
    53  const (
    54  	modeStopOnFinalizedEpoch syncMode = iota
    55  	modeNonConstrained
    56  )
    57  
    58  // syncMode specifies the sync mode type.
    59  type syncMode uint8
    60  
    61  // blocksQueueConfig is a config to set up the block queue service.
    62  type blocksQueueConfig struct {
    63  	blocksFetcher       *blocksFetcher
    64  	chain               blockchainService
    65  	highestExpectedSlot types.Slot
    66  	p2p                 p2p.P2P
    67  	db                  db.ReadOnlyDatabase
    68  	mode                syncMode
    69  }
    70  
    71  // blocksQueue is a priority queue that serves as an intermediary between block fetchers (producers)
    72  // and the block processing goroutine (consumer). The consumer can rely on the order of incoming blocks.
    73  type blocksQueue struct {
    74  	ctx                 context.Context
    75  	cancel              context.CancelFunc
    76  	smm                 *stateMachineManager
    77  	blocksFetcher       *blocksFetcher
    78  	chain               blockchainService
    79  	highestExpectedSlot types.Slot
    80  	mode                syncMode
    81  	exitConditions      struct {
    82  		noRequiredPeersErrRetries int
    83  	}
    84  	fetchedData chan *blocksQueueFetchedData // output channel for ready blocks
    85  	staleEpochs map[types.Epoch]uint8        // counter to keep track of stale FSMs
    86  	quit        chan struct{}                // termination notifier
    87  }
    88  
    89  // blocksQueueFetchedData is a data container that is returned from a queue on each step.
    90  type blocksQueueFetchedData struct {
    91  	pid    peer.ID
    92  	blocks []interfaces.SignedBeaconBlock
    93  }
    94  
    95  // newBlocksQueue creates an initialized priority queue.
    96  func newBlocksQueue(ctx context.Context, cfg *blocksQueueConfig) *blocksQueue {
    97  	ctx, cancel := context.WithCancel(ctx)
    98  
    99  	blocksFetcher := cfg.blocksFetcher
   100  	if blocksFetcher == nil {
   101  		blocksFetcher = newBlocksFetcher(ctx, &blocksFetcherConfig{
   102  			chain: cfg.chain,
   103  			p2p:   cfg.p2p,
   104  			db:    cfg.db,
   105  		})
   106  	}
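        	// If no target slot is provided, derive it from connected peers: the best finalized slot
        	// for modeStopOnFinalizedEpoch, the best non-finalized slot otherwise.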
   107  	highestExpectedSlot := cfg.highestExpectedSlot
   108  	if highestExpectedSlot == 0 {
   109  		if cfg.mode == modeStopOnFinalizedEpoch {
   110  			highestExpectedSlot = blocksFetcher.bestFinalizedSlot()
   111  		} else {
   112  			highestExpectedSlot = blocksFetcher.bestNonFinalizedSlot()
   113  		}
   114  	}
   115  
   116  	// Override fetcher's sync mode.
   117  	blocksFetcher.mode = cfg.mode
   118  
   119  	queue := &blocksQueue{
   120  		ctx:                 ctx,
   121  		cancel:              cancel,
   122  		highestExpectedSlot: highestExpectedSlot,
   123  		blocksFetcher:       blocksFetcher,
   124  		chain:               cfg.chain,
   125  		mode:                cfg.mode,
   126  		fetchedData:         make(chan *blocksQueueFetchedData, 1),
   127  		quit:                make(chan struct{}),
   128  		staleEpochs:         make(map[types.Epoch]uint8),
   129  	}
   130  
   131  	// Configure state machines.
   132  	queue.smm = newStateMachineManager()
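        	// Registered handlers drive the transitions: new -> scheduled (fetch request sent),
        	// scheduled -> dataParsed (response received), dataParsed -> sent (blocks pushed to consumer),
        	// while skipped and sent (stale check) states are re-evaluated on every tick.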
   133  	queue.smm.addEventHandler(eventTick, stateNew, queue.onScheduleEvent(ctx))
   134  	queue.smm.addEventHandler(eventDataReceived, stateScheduled, queue.onDataReceivedEvent(ctx))
   135  	queue.smm.addEventHandler(eventTick, stateDataParsed, queue.onReadyToSendEvent(ctx))
   136  	queue.smm.addEventHandler(eventTick, stateSkipped, queue.onProcessSkippedEvent(ctx))
   137  	queue.smm.addEventHandler(eventTick, stateSent, queue.onCheckStaleEvent(ctx))
   138  
   139  	return queue
   140  }
   141  
   142  // start boots up the queue processing.
   143  func (q *blocksQueue) start() error {
   144  	select {
   145  	case <-q.ctx.Done():
   146  		return errQueueCtxIsDone
   147  	default:
   148  		go q.loop()
   149  		return nil
   150  	}
   151  }
   152  
   153  // stop terminates all queue operations.
   154  func (q *blocksQueue) stop() error {
   155  	q.cancel()
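        	// Wait for the main loop to close the quit channel, but do not block longer than queueStopCallTimeout.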
   156  	select {
   157  	case <-q.quit:
   158  		return nil
   159  	case <-time.After(queueStopCallTimeout):
   160  		return errQueueTakesTooLongToStop
   161  	}
   162  }
   163  
   164  // loop is the main queue loop.
   165  func (q *blocksQueue) loop() {
   166  	defer close(q.quit)
   167  
   168  	defer func() {
   169  		q.blocksFetcher.stop()
   170  		close(q.fetchedData)
   171  	}()
   172  
   173  	if err := q.blocksFetcher.start(); err != nil {
   174  		log.WithError(err).Debug("Can not start blocks provider")
   175  	}
   176  
   177  	// Define initial state machines.
   178  	startSlot := q.chain.HeadSlot()
   179  	if startSlot > startBackSlots {
   180  		startSlot -= startBackSlots
   181  	}
   182  	blocksPerRequest := q.blocksFetcher.blocksPerSecond
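        	// Spawn lookaheadSteps machines, each responsible for a contiguous window of blocksPerRequest slots.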
   183  	for i := startSlot; i < startSlot.Add(blocksPerRequest*lookaheadSteps); i += types.Slot(blocksPerRequest) {
   184  		q.smm.addStateMachine(i)
   185  	}
   186  
   187  	ticker := time.NewTicker(pollingInterval)
   188  	defer ticker.Stop()
   189  	for {
   190  		// Re-check the highest expected slot once the chain's head slot has caught up to it.
   191  		if q.chain.HeadSlot() >= q.highestExpectedSlot {
   192  			// By the time initial sync is complete, the highest slot may have increased; re-check.
   193  			if q.mode == modeStopOnFinalizedEpoch {
   194  				if q.highestExpectedSlot < q.blocksFetcher.bestFinalizedSlot() {
   195  					q.highestExpectedSlot = q.blocksFetcher.bestFinalizedSlot()
   196  					continue
   197  				}
   198  			} else {
   199  				if q.highestExpectedSlot < q.blocksFetcher.bestNonFinalizedSlot() {
   200  					q.highestExpectedSlot = q.blocksFetcher.bestNonFinalizedSlot()
   201  					continue
   202  				}
   203  			}
   204  			log.WithField("slot", q.highestExpectedSlot).Debug("Highest expected slot reached")
   205  			q.cancel()
   206  		}
   207  
   208  		log.WithFields(logrus.Fields{
   209  			"highestExpectedSlot": q.highestExpectedSlot,
   210  			"headSlot":            q.chain.HeadSlot(),
   211  			"state":               q.smm.String(),
   212  			"staleEpoch":          q.staleEpochs,
   213  		}).Trace("tick")
   214  
   215  		select {
   216  		case <-ticker.C:
   217  			for _, key := range q.smm.keys {
   218  				fsm := q.smm.machines[key]
   219  				if err := fsm.trigger(eventTick, nil); err != nil {
   220  					log.WithFields(logrus.Fields{
   221  						"highestExpectedSlot":       q.highestExpectedSlot,
   222  						"noRequiredPeersErrRetries": q.exitConditions.noRequiredPeersErrRetries,
   223  						"event":                     eventTick,
   224  						"epoch":                     helpers.SlotToEpoch(fsm.start),
   225  						"start":                     fsm.start,
   226  						"error":                     err.Error(),
   227  					}).Debug("Can not trigger event")
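        					// Without suitable peers the queue cannot progress: stop when syncing up to the
        					// finalized epoch (or after too many retries), otherwise pause and retry later.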
   228  					if errors.Is(err, errNoRequiredPeers) {
   229  						forceExit := q.exitConditions.noRequiredPeersErrRetries > noRequiredPeersErrMaxRetries
   230  						if q.mode == modeStopOnFinalizedEpoch || forceExit {
   231  							q.cancel()
   232  						} else {
   233  							q.exitConditions.noRequiredPeersErrRetries++
   234  							log.Debug("Waiting for finalized peers")
   235  							time.Sleep(noRequiredPeersErrRefreshInterval)
   236  						}
   237  						continue
   238  					}
   239  				}
   240  				// Do garbage collection, and advance sliding window forward.
   241  				if q.chain.HeadSlot() >= fsm.start.Add(blocksPerRequest-1) {
   242  					highestStartSlot, err := q.smm.highestStartSlot()
   243  					if err != nil {
   244  						log.WithError(err).Debug("Cannot obtain highest epoch state number")
   245  						continue
   246  					}
   247  					if err := q.smm.removeStateMachine(fsm.start); err != nil {
   248  						log.WithError(err).Debug("Can not remove state machine")
   249  					}
   250  					if len(q.smm.machines) < lookaheadSteps {
   251  						q.smm.addStateMachine(highestStartSlot.Add(blocksPerRequest))
   252  					}
   253  				}
   254  			}
   255  		case response, ok := <-q.blocksFetcher.requestResponses():
   256  			if !ok {
   257  				log.Debug("Fetcher closed output channel")
   258  				q.cancel()
   259  				return
   260  			}
   261  			// Update state of an epoch for which data is received.
   262  			if fsm, ok := q.smm.findStateMachine(response.start); ok {
   263  				if err := fsm.trigger(eventDataReceived, response); err != nil {
   264  					log.WithFields(logrus.Fields{
   265  						"event": eventDataReceived,
   266  						"epoch": helpers.SlotToEpoch(fsm.start),
   267  						"error": err.Error(),
   268  					}).Debug("Can not process event")
   269  					fsm.setState(stateNew)
   270  					continue
   271  				}
   272  			}
   273  		case <-q.ctx.Done():
   274  			log.Debug("Context closed, exiting goroutine (blocks queue)")
   275  			return
   276  		}
   277  	}
   278  }
   279  
   280  // onScheduleEvent is an event called on newly arrived epochs. Transforms state to scheduled.
   281  func (q *blocksQueue) onScheduleEvent(ctx context.Context) eventHandlerFn {
   282  	return func(m *stateMachine, in interface{}) (stateID, error) {
   283  		if m.state != stateNew {
   284  			return m.state, errInvalidInitialState
   285  		}
   286  		if m.start > q.highestExpectedSlot {
   287  			m.setState(stateSkipped)
   288  			return m.state, errSlotIsTooHigh
   289  		}
   290  		blocksPerRequest := q.blocksFetcher.blocksPerSecond
   291  		if err := q.blocksFetcher.scheduleRequest(ctx, m.start, blocksPerRequest); err != nil {
   292  			return m.state, err
   293  		}
   294  		return stateScheduled, nil
   295  	}
   296  }
   297  
   298  // onDataReceivedEvent is an event called when data is received from the fetcher.
   299  func (q *blocksQueue) onDataReceivedEvent(ctx context.Context) eventHandlerFn {
   300  	return func(m *stateMachine, in interface{}) (stateID, error) {
   301  		if ctx.Err() != nil {
   302  			return m.state, ctx.Err()
   303  		}
   304  		if m.state != stateScheduled {
   305  			return m.state, errInvalidInitialState
   306  		}
   307  		response, ok := in.(*fetchRequestResponse)
   308  		if !ok {
   309  			return m.state, errInputNotFetchRequestParams
   310  		}
   311  		if response.err != nil {
   312  			switch response.err {
   313  			case errSlotIsTooHigh:
   314  				// Current window is already too big, re-request previous epochs.
   315  				for _, fsm := range q.smm.machines {
   316  					if fsm.start < response.start && fsm.state == stateSkipped {
   317  						fsm.setState(stateNew)
   318  					}
   319  				}
   320  			case beaconsync.ErrInvalidFetchedData:
   321  				// Peer returned invalid data, penalize.
   322  				q.blocksFetcher.p2p.Peers().Scorers().BadResponsesScorer().Increment(response.pid)
   323  				log.WithField("pid", response.pid).Debug("Peer is penalized for invalid blocks")
   324  			}
   325  			return m.state, response.err
   326  		}
   327  		m.pid = response.pid
   328  		m.blocks = response.blocks
   329  		return stateDataParsed, nil
   330  	}
   331  }
   332  
   333  // onReadyToSendEvent is an event called to allow epochs with available blocks to send them downstream.
   334  func (q *blocksQueue) onReadyToSendEvent(ctx context.Context) eventHandlerFn {
   335  	return func(m *stateMachine, in interface{}) (stateID, error) {
   336  		if ctx.Err() != nil {
   337  			return m.state, ctx.Err()
   338  		}
   339  		if m.state != stateDataParsed {
   340  			return m.state, errInvalidInitialState
   341  		}
   342  
   343  		if len(m.blocks) == 0 {
   344  			return stateSkipped, nil
   345  		}
   346  
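        		// send forwards the machine's blocks to the consumer channel, aborting if the context is canceled.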
   347  		send := func() (stateID, error) {
   348  			data := &blocksQueueFetchedData{
   349  				pid:    m.pid,
   350  				blocks: m.blocks,
   351  			}
   352  			select {
   353  			case <-ctx.Done():
   354  				return m.state, ctx.Err()
   355  			case q.fetchedData <- data:
   356  			}
   357  			return stateSent, nil
   358  		}
   359  
   360  		// Make sure that we send epochs in the correct order.
   361  		// If the machine is the first one (has the lowest start slot), send immediately.
   362  		if m.isFirst() {
   363  			return send()
   364  		}
   365  
   366  		// Make sure that all previous epochs are already processed.
   367  		for _, fsm := range q.smm.machines {
   368  			// Review only previous slots.
   369  			if fsm.start < m.start {
   370  				switch fsm.state {
   371  				case stateNew, stateScheduled, stateDataParsed:
   372  					return m.state, nil
   373  				}
   374  			}
   375  		}
   376  
   377  		return send()
   378  	}
   379  }
   380  
   381  // onProcessSkippedEvent is an event triggered on skipped machines, allowing handlers to
   382  // extend the lookahead window in cases where progress is not otherwise possible.
   383  func (q *blocksQueue) onProcessSkippedEvent(ctx context.Context) eventHandlerFn {
   384  	return func(m *stateMachine, in interface{}) (stateID, error) {
   385  		if ctx.Err() != nil {
   386  			return m.state, ctx.Err()
   387  		}
   388  		if m.state != stateSkipped {
   389  			return m.state, errInvalidInitialState
   390  		}
   391  
   392  		// Only the highest epoch with skipped state can trigger extension.
   393  		if !m.isLast() {
   394  			// When a state machine stays in the skipped state for too long, reset it.
   395  			if time.Since(m.updated) > skippedMachineTimeout {
   396  				return stateNew, nil
   397  			}
   398  			return m.state, nil
   399  		}
   400  
   401  		// Make sure that all machines are in the skipped state, i.e. the manager cannot progress without a reset
   402  		// or without moving the last machine's start slot forward (in an attempt to find the next non-skipped slot).
   403  		if !q.smm.allMachinesInState(stateSkipped) {
   404  			return m.state, nil
   405  		}
   406  
   407  		// Check if we have enough peers to progress, or sync needs to halt (due to no peers available).
   408  		bestFinalizedSlot := q.blocksFetcher.bestFinalizedSlot()
   409  		if q.mode == modeStopOnFinalizedEpoch {
   410  			if bestFinalizedSlot <= q.chain.HeadSlot() {
   411  				return stateSkipped, errNoRequiredPeers
   412  			}
   413  		} else {
   414  			if q.blocksFetcher.bestNonFinalizedSlot() <= q.chain.HeadSlot() {
   415  				return stateSkipped, errNoRequiredPeers
   416  			}
   417  		}
   418  
   419  		// All machines are skipped, FSMs need reset.
   420  		startSlot := q.chain.HeadSlot() + 1
   421  		if q.mode == modeNonConstrained && startSlot > bestFinalizedSlot {
   422  			q.staleEpochs[helpers.SlotToEpoch(startSlot)]++
   423  			// If FSMs have been reset enough times, try to explore alternative forks.
   424  			if q.staleEpochs[helpers.SlotToEpoch(startSlot)] >= maxResetAttempts {
   425  				delete(q.staleEpochs, helpers.SlotToEpoch(startSlot))
   426  				fork, err := q.blocksFetcher.findFork(ctx, startSlot)
   427  				if err == nil {
   428  					return stateSkipped, q.resetFromFork(ctx, fork)
   429  				}
   430  				log.WithFields(logrus.Fields{
   431  					"epoch": helpers.SlotToEpoch(startSlot),
   432  					"error": err.Error(),
   433  				}).Debug("Can not explore alternative branches")
   434  			}
   435  		}
   436  		return stateSkipped, q.resetFromSlot(ctx, startSlot)
   437  	}
   438  }
   439  
   440  // onCheckStaleEvent is an event that allows stale epochs to be marked,
   441  // so that they can be re-processed.
   442  func (q *blocksQueue) onCheckStaleEvent(ctx context.Context) eventHandlerFn {
   443  	return func(m *stateMachine, in interface{}) (stateID, error) {
   444  		if ctx.Err() != nil {
   445  			return m.state, ctx.Err()
   446  		}
   447  		if m.state != stateSent {
   448  			return m.state, errInvalidInitialState
   449  		}
   450  
   451  		// Break out immediately if the machine is not yet stale.
   452  		if time.Since(m.updated) < staleEpochTimeout {
   453  			return m.state, nil
   454  		}
   455  
   456  		return stateSkipped, nil
   457  	}
   458  }