github.com/amazechain/amc@v0.1.3/internal/sync/initial-sync/blocks_queue.go

package initialsync

import (
	"context"
	"errors"
	"time"

	"github.com/amazechain/amc/api/protocol/types_pb"
	"github.com/amazechain/amc/common"
	"github.com/amazechain/amc/internal/p2p"
	amcsync "github.com/amazechain/amc/internal/sync"
	"github.com/holiman/uint256"
	"github.com/libp2p/go-libp2p/core/peer"
)

const (
	// queueStopCallTimeout is the time allowed for the queue to release resources when quitting.
	queueStopCallTimeout = 1 * time.Second
	// pollingInterval defines how often the state machine needs to check for new events.
	pollingInterval = 200 * time.Millisecond
	// staleEpochTimeout is a period after which an epoch's state is considered stale.
	staleEpochTimeout = 1 * time.Second
	// skippedMachineTimeout is a period after which a skipped machine is considered stuck
	// and is reset (if the machine is the last one, then all machines are reset and a search
	// for a skipped slot or backtracking takes place).
	skippedMachineTimeout = 10 * staleEpochTimeout
	// lookaheadSteps is a limit on how many forward steps are loaded into the queue.
	// Each step is managed by an assigned finite state machine. Must be >= 2.
	lookaheadSteps = 2
	// noRequiredPeersErrMaxRetries defines the number of retries when no required peers are found.
	noRequiredPeersErrMaxRetries = 1000
	// noRequiredPeersErrRefreshInterval defines the interval for which the queue will be paused
	// before making the next attempt to obtain data.
	noRequiredPeersErrRefreshInterval = 15 * time.Second
	// maxResetAttempts is the number of times a stale FSM is reset before backtracking is triggered.
	maxResetAttempts = 4
	// startBackSlots defines the number of slots before the current head at which the initial
	// machine starts. This allows more robustness in case normal sync sets the head to some
	// orphaned block: in that case starting earlier and re-fetching blocks allows the chain to be reorganized.
	//startBackSlots = 32
)

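// A quick sanity check on the retry constants above (simple arithmetic, no
// assumptions beyond this file): with noRequiredPeersErrMaxRetries = 1000
// attempts spaced noRequiredPeersErrRefreshInterval = 15s apart, the queue
// waits up to roughly 4h10m for suitable peers before cancelling itself.
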
var (
	errQueueCtxIsDone             = errors.New("queue's context is done, reinitialize")
	errQueueTakesTooLongToStop    = errors.New("queue takes too long to stop")
	errInvalidInitialState        = errors.New("invalid initial state")
	errInputNotFetchRequestParams = errors.New("input data is not type *fetchRequestParams")
	errNoRequiredPeers            = errors.New("no peers with required blocks are found")
)

const (
	modeStopOnFinalizedEpoch syncMode = iota
	modeNonConstrained
)

// syncMode specifies the sync mode type.
type syncMode uint8

// blocksQueueConfig is a config to set up the block queue service.
type blocksQueueConfig struct {
	blocksFetcher          *blocksFetcher
	chain                  common.IBlockChain
	highestExpectedBlockNr *uint256.Int
	p2p                    p2p.P2P
	mode                   syncMode
}

// blocksQueue is a priority queue that serves as an intermediary between block fetchers (producers)
// and the block processing goroutine (consumer). The consumer can rely on the order of incoming blocks.
type blocksQueue struct {
	ctx                    context.Context
	cancel                 context.CancelFunc
	smm                    *stateMachineManager
	blocksFetcher          *blocksFetcher
	chain                  common.IBlockChain
	highestExpectedBlockNr *uint256.Int
	mode                   syncMode
	exitConditions         struct {
		noRequiredPeersErrRetries int
	}
	fetchedData chan *blocksQueueFetchedData // output channel for ready blocks
	quit        chan struct{}                // termination notifier
}

// blocksQueueFetchedData is a data container that is returned from the queue on each step.
type blocksQueueFetchedData struct {
	pid    peer.ID
	blocks []*types_pb.Block
}

// newBlocksQueue creates an initialized priority queue.
func newBlocksQueue(ctx context.Context, cfg *blocksQueueConfig) *blocksQueue {
	ctx, cancel := context.WithCancel(ctx)

	blocksFetcher := cfg.blocksFetcher
	if blocksFetcher == nil {
		blocksFetcher = newBlocksFetcher(ctx, &blocksFetcherConfig{
			chain: cfg.chain,
			p2p:   cfg.p2p,
		})
	}
	highestExpectedBlockNr := cfg.highestExpectedBlockNr

	// Override fetcher's sync mode.
	blocksFetcher.mode = cfg.mode

	queue := &blocksQueue{
		ctx:                    ctx,
		cancel:                 cancel,
		highestExpectedBlockNr: highestExpectedBlockNr,
		blocksFetcher:          blocksFetcher,
		chain:                  cfg.chain,
		mode:                   cfg.mode,
		fetchedData:            make(chan *blocksQueueFetchedData, 1),
		quit:                   make(chan struct{}),
	}

	// Configure state machines.
	queue.smm = newStateMachineManager()
	queue.smm.addEventHandler(eventTick, stateNew, queue.onScheduleEvent(ctx))
	queue.smm.addEventHandler(eventDataReceived, stateScheduled, queue.onDataReceivedEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateDataParsed, queue.onReadyToSendEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateSkipped, queue.onProcessSkippedEvent(ctx))
	queue.smm.addEventHandler(eventTick, stateSent, queue.onCheckStaleEvent(ctx))

	return queue
}

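// The registrations above, read together with the handlers' return values
// below, give each machine the following lifecycle:
//
//	stateNew -(eventTick)-> stateScheduled -(eventDataReceived)-> stateDataParsed
//	stateDataParsed -(eventTick)-> stateSent, or stateSkipped if no blocks arrived
//	stateSent -(eventTick, after staleEpochTimeout)-> stateSkipped
//	stateSkipped -(eventTick)-> stateNew (reset), or a window reset via resetFromBlockNr
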
// start boots up the queue processing.
func (q *blocksQueue) start() error {
	select {
	case <-q.ctx.Done():
		return errQueueCtxIsDone
	default:
		go q.loop()
		return nil
	}
}

// stop terminates all queue operations.
func (q *blocksQueue) stop() error {
	q.cancel()
	select {
	case <-q.quit:
		return nil
	case <-time.After(queueStopCallTimeout):
		return errQueueTakesTooLongToStop
	}
}

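// A typical consumer drives the queue roughly as follows (a minimal sketch with
// placeholder variables chain, p2pService and target; the real call site lives
// in the service that owns initial sync, and error handling is elided):
//
//	queue := newBlocksQueue(ctx, &blocksQueueConfig{
//		chain:                  chain,      // common.IBlockChain implementation
//		p2p:                    p2pService, // p2p.P2P implementation
//		highestExpectedBlockNr: target,     // sync target, *uint256.Int
//		mode:                   modeStopOnFinalizedEpoch,
//	})
//	if err := queue.start(); err != nil {
//		return err
//	}
//	for data := range queue.fetchedData {
//		// Import data.blocks (already ordered) received from peer data.pid.
//	}
//	return queue.stop()
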
// loop is the main queue loop.
func (q *blocksQueue) loop() {
	defer close(q.quit)

	defer func() {
		q.blocksFetcher.stop()
		close(q.fetchedData)
	}()

	if err := q.blocksFetcher.start(); err != nil {
		log.Debug("Can not start blocks provider", "err", err)
	}

	// Define initial state machines.
	// TODO: does the current block need to be refreshed here?
	startBlockNr := new(uint256.Int).AddUint64(q.chain.CurrentBlock().Number64(), 1)
	blocksPerRequest := q.blocksFetcher.blocksPerPeriod
	for i := startBlockNr.Clone(); i.Cmp(new(uint256.Int).AddUint64(startBlockNr, blocksPerRequest*lookaheadSteps)) == -1; i = i.AddUint64(i, blocksPerRequest) {
		q.smm.addStateMachine(i)
	}

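	// To illustrate the window above with round numbers (64 blocks per request
	// is an illustrative value, not necessarily the configured default): with
	// the head at block 100, machines are created for start blocks 101 and 165,
	// i.e. lookaheadSteps (2) machines covering one request-sized range each.
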
	ticker := time.NewTicker(pollingInterval)
	defer ticker.Stop()
	for {
		if waitHighestExpectedBlockNr(q) {
			continue
		}

		log.Trace("tick",
			"highestExpectedBlockNr", q.highestExpectedBlockNr,
			"ourBlockNr", q.chain.CurrentBlock().Number64(),
			"state", q.smm.String(),
		)

		select {
		case <-ticker.C:
			for _, key := range q.smm.keys {
				fsm := q.smm.machines[key.Uint64()]
				if err := fsm.trigger(eventTick, nil); err != nil {
					log.Debug("Can not trigger event",
						"highestExpectedBlockNr", q.highestExpectedBlockNr,
						"noRequiredPeersErrRetries", q.exitConditions.noRequiredPeersErrRetries,
						"event", eventTick,
						"start", fsm.start,
						"error", err.Error(),
					)
					if errors.Is(err, errNoRequiredPeers) {
						forceExit := q.exitConditions.noRequiredPeersErrRetries > noRequiredPeersErrMaxRetries
						if forceExit {
							q.cancel()
						} else {
							q.exitConditions.noRequiredPeersErrRetries++
							log.Debug("Waiting for finalized peers")
							time.Sleep(noRequiredPeersErrRefreshInterval)
						}
						continue
					}
				}
				// Do garbage collection, and advance the sliding window forward.
				if q.chain.CurrentBlock().Number64().Cmp(new(uint256.Int).AddUint64(fsm.start, blocksPerRequest-1)) >= 0 {
					highestStartSlot, err := q.smm.highestStartSlot()
					if err != nil {
						log.Debug("Cannot obtain highest epoch state number", "err", err)
						continue
					}
					if err := q.smm.removeStateMachine(fsm.start); err != nil {
						log.Debug("Can not remove state machine", "err", err)
					}
					if len(q.smm.machines) < lookaheadSteps {
						q.smm.addStateMachine(new(uint256.Int).AddUint64(highestStartSlot, blocksPerRequest))
					}
				}
			}
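			// Continuing the illustrative numbers from above: once the head
			// reaches block 164 (= 101 + 64 - 1), the machine starting at 101
			// is removed and, since fewer than lookaheadSteps machines remain,
			// a new one is added at 229 (= 165 + 64), sliding the window forward.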
		case response, ok := <-q.blocksFetcher.requestResponses():
			if !ok {
				log.Debug("Fetcher closed output channel")
				q.cancel()
				return
			}
			// Update the state of the machine for which data was received.
			if fsm, ok := q.smm.findStateMachine(response.start); ok {
				if err := fsm.trigger(eventDataReceived, response); err != nil {
					log.Debug("Can not process event",
						"event", eventDataReceived,
						"start", fsm.start,
						"error", err.Error(),
					)
					fsm.setState(stateNew)
					continue
				}
			}
		case <-q.ctx.Done():
			log.Debug("Context closed, exiting goroutine (blocks queue)")
			return
		}
	}
}

func waitHighestExpectedBlockNr(q *blocksQueue) bool {
	// Check the highest expected blockNr when we approach the chain's head.
	if q.chain.CurrentBlock().Number64().Cmp(q.highestExpectedBlockNr) >= 0 {
		// By the time initial sync is complete, the highest known block number may have increased, so re-check.
		targetBlockNr := q.blocksFetcher.bestFinalizedBlockNr()
		if q.highestExpectedBlockNr.Cmp(targetBlockNr) == -1 {
			q.highestExpectedBlockNr = targetBlockNr
			return true
		}
		log.Debug("Highest expected blockNr reached", "blockNr", targetBlockNr)
		q.cancel()
	}
	return false
}

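// To make the re-check concrete (illustrative numbers): if the queue started
// with highestExpectedBlockNr = 1000 and peers report a best block of 1100 by
// the time the local head reaches 1000, the target is extended to 1100 and
// syncing continues; if peers report nothing higher, the queue cancels itself.
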
// onScheduleEvent is an event handler called on newly created machines. Transitions state to scheduled.
func (q *blocksQueue) onScheduleEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if m.state != stateNew {
			return m.state, errInvalidInitialState
		}
		if m.start.Cmp(q.highestExpectedBlockNr) == 1 {
			m.setState(stateSkipped)
			return m.state, errBlockNrIsTooHigh
		}
		blocksPerRequest := q.blocksFetcher.blocksPerPeriod
		if q.highestExpectedBlockNr.Cmp(new(uint256.Int).AddUint64(m.start, blocksPerRequest)) < 0 {
			blocksPerRequest = new(uint256.Int).Sub(q.highestExpectedBlockNr, m.start).Uint64() + 1
		}
		if err := q.blocksFetcher.scheduleRequest(ctx, m.start, blocksPerRequest); err != nil {
			return m.state, err
		}
		return stateScheduled, nil
	}
}

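// A quick check of the request-size clamp in onScheduleEvent (illustrative
// numbers): with highestExpectedBlockNr = 100, m.start = 90 and a default
// request size of 64, the request is trimmed to 100 - 90 + 1 = 11 blocks, so
// the fetcher is never asked for blocks beyond the sync target.
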
// onDataReceivedEvent is an event handler called when data is received from the fetcher.
func (q *blocksQueue) onDataReceivedEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateScheduled {
			return m.state, errInvalidInitialState
		}
		response, ok := in.(*fetchRequestResponse)
		if !ok {
			return m.state, errInputNotFetchRequestParams
		}
		if response.err != nil {
			switch response.err {
			//todo
			case errBlockNrIsTooHigh:
				// Current window is already too big, re-request earlier ranges.
				for _, fsm := range q.smm.machines {
					if fsm.start.Cmp(response.start) == -1 && fsm.state == stateSkipped {
						fsm.setState(stateNew)
					}
				}
			case amcsync.ErrInvalidFetchedData:
				// Peer returned invalid data, penalize.
				q.blocksFetcher.p2p.Peers().Scorers().BadResponsesScorer().Increment(response.pid)
				log.Debug("Peer is penalized for invalid blocks", "pid", response.pid)
			}
			return m.state, response.err
		}
		m.pid = response.pid
		m.blocks = response.blocks
		return stateDataParsed, nil
	}
}

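// Note on the penalty path above: the bad-response scorer is incremented for
// response.pid, the peer that actually produced the invalid response. m.pid is
// only assigned once a response succeeds, so at that point it may still hold a
// stale or zero peer ID.
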
// onReadyToSendEvent is an event handler that allows machines with available blocks to send them downstream.
func (q *blocksQueue) onReadyToSendEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateDataParsed {
			return m.state, errInvalidInitialState
		}

		if len(m.blocks) == 0 {
			return stateSkipped, nil
		}

		send := func() (stateID, error) {
			data := &blocksQueueFetchedData{
				pid:    m.pid,
				blocks: m.blocks,
			}
			select {
			case <-ctx.Done():
				return m.state, ctx.Err()
			case q.fetchedData <- data:
			}
			return stateSent, nil
		}

		// Make sure that we send batches in the correct order.
		// If the machine is the first one (has the lowest start block), send.
		if m.isFirst() {
			return send()
		}

		// Make sure that all previous machines have already been processed.
		for _, fsm := range q.smm.machines {
			// Review only machines that cover earlier blocks.
			if fsm.start.Cmp(m.start) == -1 {
				switch fsm.state {
				case stateNew, stateScheduled, stateDataParsed:
					return m.state, nil
				}
			}
		}

		return send()
	}
}

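// Ordering example (reusing the illustrative window of machines at 101 and
// 165): if the machine at 165 has parsed blocks while the machine at 101 is
// still in stateScheduled, the 165 machine returns without sending; only after
// the 101 machine reaches stateSent (or stateSkipped) are the blocks for 165
// pushed into the fetchedData channel.
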
// onProcessSkippedEvent is an event handler triggered on skipped machines, allowing handlers to
// extend the lookahead window in cases where progress is not possible otherwise.
func (q *blocksQueue) onProcessSkippedEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateSkipped {
			return m.state, errInvalidInitialState
		}

		// Only the last machine (the one with the highest start block) in skipped state can trigger extension.
		if !m.isLast() {
			// When a state machine stays in the skipped state for too long, reset it.
			if time.Since(m.updated) > skippedMachineTimeout {
				return stateNew, nil
			}
			return m.state, nil
		}

		// Make sure that all machines are in the skipped state, i.e. the manager cannot progress without
		// a reset or without moving the last machine's start block forward (in an attempt to find the next
		// non-skipped block).
		if !q.smm.allMachinesInState(stateSkipped) {
			return m.state, nil
		}

		// If peers claim to be ahead of our head yet every machine ended up skipped, then no peer has
		// actually served the required blocks: report errNoRequiredPeers so the main loop pauses and retries.
		//bestFinalizedSlot := q.blocksFetcher.bestFinalizedBlockNr()
		if q.blocksFetcher.bestFinalizedBlockNr().Cmp(q.chain.CurrentBlock().Number64()) >= 0 {
			return stateSkipped, errNoRequiredPeers
		}

		// All machines are skipped, FSMs need a reset.
		startBlockNr := new(uint256.Int).AddUint64(q.chain.CurrentBlock().Number64(), 1)

		//todo q.blocksFetcher.findFork(ctx, startSlot)

		return stateSkipped, q.resetFromBlockNr(ctx, startBlockNr)
	}
}

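// resetFromBlockNr lives elsewhere in this package; judging from this call
// site, it discards the current machines and rebuilds the lookahead window
// starting just above the chain head, mirroring the setup at the top of
// loop(). (A reading inferred from the call site only.)
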
// onCheckStaleEvent is an event handler that marks stale machines,
// so that they can be re-processed.
func (_ *blocksQueue) onCheckStaleEvent(ctx context.Context) eventHandlerFn {
	return func(m *stateMachine, in interface{}) (stateID, error) {
		if ctx.Err() != nil {
			return m.state, ctx.Err()
		}
		if m.state != stateSent {
			return m.state, errInvalidInitialState
		}

		// Break out immediately if the machine is not stale.
		if time.Since(m.updated) < staleEpochTimeout {
			return m.state, nil
		}

		return stateSkipped, nil
	}
}