github.com/palisadeinc/bor@v0.0.0-20230615125219-ab7196213d15/eth/downloader/skeleton.go

     1  // Copyright 2021 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  package downloader
    18  
    19  import (
    20  	"encoding/json"
    21  	"errors"
    22  	"math/rand"
    23  	"sort"
    24  	"time"
    25  
    26  	"github.com/ethereum/go-ethereum/common"
    27  	"github.com/ethereum/go-ethereum/core/rawdb"
    28  	"github.com/ethereum/go-ethereum/core/types"
    29  	"github.com/ethereum/go-ethereum/eth/protocols/eth"
    30  	"github.com/ethereum/go-ethereum/ethdb"
    31  	"github.com/ethereum/go-ethereum/log"
    32  )
    33  
    34  // scratchHeaders is the number of headers to store in a scratch space to allow
    35  // concurrent downloads. A header is about 0.5KB in size, so there is no worry
    36  // about using too much memory. The only catch is that we can only validate gaps
     37  // after they're linked to the head, so the bigger the scratch space, the larger
     38  // the potential for invalid headers.
    39  //
    40  // The current scratch space of 131072 headers is expected to use 64MB RAM.
    41  const scratchHeaders = 131072
    42  
     43  // requestHeaders is the number of headers to request from a remote peer in a single
    44  // network packet. Although the skeleton downloader takes into consideration peer
    45  // capacities when picking idlers, the packet size was decided to remain constant
    46  // since headers are relatively small and it's easier to work with fixed batches
    47  // vs. dynamic interval fillings.
    48  const requestHeaders = 512
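
         // With the defaults above, the scratch space is carved into
         // scratchHeaders/requestHeaders = 131072/512 = 256 fixed-size batch
         // slots, each of which can be assigned to a different idle peer.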
    49  
    50  // errSyncLinked is an internal helper error to signal that the current sync
     51  // cycle linked up to the genesis block, thus the skeleton syncer should ping
     52  // the backfiller to resume. Since we already have that logic on sync start,
     53  // piggy-back on that instead of having 2 entrypoints.
    54  var errSyncLinked = errors.New("sync linked")
    55  
    56  // errSyncMerged is an internal helper error to signal that the current sync
    57  // cycle merged with a previously aborted subchain, thus the skeleton syncer
    58  // should abort and restart with the new state.
    59  var errSyncMerged = errors.New("sync merged")
    60  
    61  // errSyncReorged is an internal helper error to signal that the head chain of
    62  // the current sync cycle was (partially) reorged, thus the skeleton syncer
    63  // should abort and restart with the new state.
    64  var errSyncReorged = errors.New("sync reorged")
    65  
    66  // errTerminated is returned if the sync mechanism was terminated for this run of
    67  // the process. This is usually the case when Geth is shutting down and some events
    68  // might still be propagating.
    69  var errTerminated = errors.New("terminated")
    70  
    71  // errReorgDenied is returned if an attempt is made to extend the beacon chain
    72  // with a new header, but it does not link up to the existing sync.
    73  var errReorgDenied = errors.New("non-forced head reorg denied")
    74  
    75  func init() {
    76  	// Tuning parameters is nice, but the scratch space must be assignable in
    77  	// full to peers. It's a useless cornercase to support a dangling half-group.
    78  	if scratchHeaders%requestHeaders != 0 {
    79  		panic("Please make scratchHeaders divisible by requestHeaders")
    80  	}
    81  }
    82  
    83  // subchain is a contiguous header chain segment that is backed by the database,
    84  // but may not be linked to the live chain. The skeleton downloader may produce
    85  // a new one of these every time it is restarted until the subchain grows large
    86  // enough to connect with a previous subchain.
    87  //
    88  // The subchains use the exact same database namespace and are not disjoint from
    89  // each other. As such, extending one to overlap the other entails reducing the
    90  // second one first. This combined buffer model is used to avoid having to move
    91  // data on disk when two subchains are joined together.
    92  type subchain struct {
    93  	Head uint64      // Block number of the newest header in the subchain
    94  	Tail uint64      // Block number of the oldest header in the subchain
    95  	Next common.Hash // Block hash of the next oldest header in the subchain
    96  }
    97  
    98  // skeletonProgress is a database entry to allow suspending and resuming a chain
    99  // sync. As the skeleton header chain is downloaded backwards, restarts can and
   100  // will produce temporarily disjoint subchains. There is no way to restart a
   101  // suspended skeleton sync without prior knowledge of all prior suspension points.
   102  type skeletonProgress struct {
   103  	Subchains []*subchain // Disjoint subchains downloaded until now
   104  }
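
         // For illustration only (all values made up, hashes truncated), a progress
         // entry with two disjoint subchains marshals to JSON roughly as:
         //
         //	{"Subchains":[{"Head":17000000,"Tail":16995000,"Next":"0x49c1…"},
         //	              {"Head":16990000,"Tail":16980000,"Next":"0xd4e5…"}]}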
   105  
   106  // headUpdate is a notification that the beacon sync should switch to a new target.
    107  // The update can either force the target change, or merely attempt to extend it
    108  // and fail if that's not possible.
   109  type headUpdate struct {
   110  	header *types.Header // Header to update the sync target to
   111  	force  bool          // Whether to force the update or only extend if possible
   112  	errc   chan error    // Channel to signal acceptance of the new head
   113  }
   114  
   115  // headerRequest tracks a pending header request to ensure responses are to
   116  // actual requests and to validate any security constraints.
   117  //
   118  // Concurrency note: header requests and responses are handled concurrently from
   119  // the main runloop to allow Keccak256 hash verifications on the peer's thread and
    120  // to drop the peer on an invalid response. The request struct must contain all
    121  // the data to construct the response without accessing runloop internals (i.e.
    122  // subchains). The head number is only included to allow the runloop to match a
    123  // response to the task being synced without having yet another set of maps.
   124  type headerRequest struct {
   125  	peer string // Peer to which this request is assigned
   126  	id   uint64 // Request ID of this request
   127  
   128  	deliver chan *headerResponse // Channel to deliver successful response on
   129  	revert  chan *headerRequest  // Channel to deliver request failure on
   130  	cancel  chan struct{}        // Channel to track sync cancellation
   131  	stale   chan struct{}        // Channel to signal the request was dropped
   132  
   133  	head uint64 // Head number of the requested batch of headers
   134  }
   135  
   136  // headerResponse is an already verified remote response to a header request.
   137  type headerResponse struct {
   138  	peer    *peerConnection // Peer from which this response originates
   139  	reqid   uint64          // Request ID that this response fulfils
   140  	headers []*types.Header // Chain of headers
   141  }
   142  
   143  // backfiller is a callback interface through which the skeleton sync can tell
   144  // the downloader that it should suspend or resume backfilling on specific head
   145  // events (e.g. suspend on forks or gaps, resume on successful linkups).
   146  type backfiller interface {
   147  	// suspend requests the backfiller to abort any running full or snap sync
   148  	// based on the skeleton chain as it might be invalid. The backfiller should
   149  	// gracefully handle multiple consecutive suspends without a resume, even
    150  	// on initial startup.
   151  	suspend()
   152  
    153  	// resume requests the backfiller to start running full or snap sync based on
   154  	// the skeleton chain as it has successfully been linked. Appending new heads
   155  	// to the end of the chain will not result in suspend/resume cycles.
   156  	resume()
   157  }
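
         // A minimal sketch of a conforming implementation (hypothetical, for tests
         // or documentation; the real backfiller lives in the downloader):
         //
         //	type noopFiller struct{}
         //
         //	func (f *noopFiller) suspend() { log.Debug("Backfilling suspended") }
         //	func (f *noopFiller) resume()  { log.Debug("Backfilling resumed") }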
   158  
   159  // skeleton represents a header chain synchronized after the merge where blocks
   160  // aren't validated any more via PoW in a forward fashion, rather are dictated
   161  // and extended at the head via the beacon chain and backfilled on the original
   162  // Ethereum block sync protocol.
   163  //
   164  // Since the skeleton is grown backwards from head to genesis, it is handled as
   165  // a separate entity, not mixed in with the logical sequential transition of the
   166  // blocks. Once the skeleton is connected to an existing, validated chain, the
   167  // headers will be moved into the main downloader for filling and execution.
   168  //
    169  // As opposed to the original Ethereum block synchronization which is trustless (and
   170  // uses a master peer to minimize the attack surface), post-merge block sync starts
   171  // from a trusted head. As such, there is no need for a master peer any more and
   172  // headers can be requested fully concurrently (though some batches might be
   173  // discarded if they don't link up correctly).
   174  //
   175  // Although a skeleton is part of a sync cycle, it is not recreated, rather stays
   176  // alive throughout the lifetime of the downloader. This allows it to be extended
   177  // concurrently with the sync cycle, since extensions arrive from an API surface,
   178  // not from within (vs. legacy Ethereum sync).
   179  //
   180  // Since the skeleton tracks the entire header chain until it is consumed by the
   181  // forward block filling, it needs 0.5KB/block storage. At current mainnet sizes
   182  // this is only possible with a disk backend. Since the skeleton is separate from
   183  // the node's header chain, storing the headers ephemerally until sync finishes
   184  // is wasted disk IO, but it's a price we're going to pay to keep things simple
   185  // for now.
   186  type skeleton struct {
   187  	db     ethdb.Database // Database backing the skeleton
   188  	filler backfiller     // Chain syncer suspended/resumed by head events
   189  
   190  	peers *peerSet                   // Set of peers we can sync from
   191  	idles map[string]*peerConnection // Set of idle peers in the current sync cycle
   192  	drop  peerDropFn                 // Drops a peer for misbehaving
   193  
   194  	progress *skeletonProgress // Sync progress tracker for resumption and metrics
   195  	started  time.Time         // Timestamp when the skeleton syncer was created
   196  	logged   time.Time         // Timestamp when progress was last logged to the user
   197  	pulled   uint64            // Number of headers downloaded in this run
   198  
   199  	scratchSpace  []*types.Header // Scratch space to accumulate headers in (first = recent)
   200  	scratchOwners []string        // Peer IDs owning chunks of the scratch space (pend or delivered)
   201  	scratchHead   uint64          // Block number of the first item in the scratch space
   202  
   203  	requests map[uint64]*headerRequest // Header requests currently running
   204  
   205  	headEvents chan *headUpdate // Notification channel for new heads
   206  	terminate  chan chan error  // Termination channel to abort sync
    207  	terminated chan struct{}    // Channel to signal that the syncer is dead
   208  
   209  	// Callback hooks used during testing
    210  	syncStarting func() // callback triggered after a sync cycle is initialized but before it is started
   211  }
   212  
   213  // newSkeleton creates a new sync skeleton that tracks a potentially dangling
   214  // header chain until it's linked into an existing set of blocks.
   215  func newSkeleton(db ethdb.Database, peers *peerSet, drop peerDropFn, filler backfiller) *skeleton {
   216  	sk := &skeleton{
   217  		db:         db,
   218  		filler:     filler,
   219  		peers:      peers,
   220  		drop:       drop,
   221  		requests:   make(map[uint64]*headerRequest),
   222  		headEvents: make(chan *headUpdate),
   223  		terminate:  make(chan chan error),
   224  		terminated: make(chan struct{}),
   225  	}
   226  	go sk.startup()
   227  	return sk
   228  }
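
         // Illustrative wiring (assumes db, peers and drop are provided by the
         // surrounding downloader; filler is any backfiller implementation):
         //
         //	sk := newSkeleton(db, peers, drop, filler)
         //	// The syncer now idles until the first forced head arrives via Sync.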
   229  
   230  // startup is an initial background loop which waits for an event to start or
   231  // tear the syncer down. This is required to make the skeleton sync loop once
   232  // per process but at the same time not start before the beacon chain announces
   233  // a new (existing) head.
   234  func (s *skeleton) startup() {
   235  	// Close a notification channel so anyone sending us events will know if the
   236  	// sync loop was torn down for good.
   237  	defer close(s.terminated)
   238  
   239  	// Wait for startup or teardown. This wait might loop a few times if a beacon
   240  	// client requests sync head extensions, but not forced reorgs (i.e. they are
   241  	// giving us new payloads without setting a starting head initially).
   242  	for {
   243  		select {
   244  		case errc := <-s.terminate:
   245  			// No head was announced but Geth is shutting down
   246  			errc <- nil
   247  			return
   248  
   249  		case event := <-s.headEvents:
   250  			// New head announced, start syncing to it, looping every time a current
   251  			// cycle is terminated due to a chain event (head reorg, old chain merge).
   252  			if !event.force {
   253  				event.errc <- errors.New("forced head needed for startup")
   254  				continue
   255  			}
   256  			event.errc <- nil // forced head accepted for startup
   257  			head := event.header
   258  			s.started = time.Now()
   259  
   260  			for {
   261  				// If the sync cycle terminated or was terminated, propagate up when
   262  				// higher layers request termination. There's no fancy explicit error
   263  				// signalling as the sync loop should never terminate (TM).
   264  				newhead, err := s.sync(head)
   265  				switch {
   266  				case err == errSyncLinked:
   267  					// Sync cycle linked up to the genesis block. Tear down the loop
    268  				// and restart it so it can properly notify the backfiller. Don't
    269  				// account for a new head.
   270  					head = nil
   271  
   272  				case err == errSyncMerged:
   273  					// Subchains were merged, we just need to reinit the internal
    274  				// state to continue on the tail of the merged chain. Don't
    275  				// announce a new head.
   276  					head = nil
   277  
   278  				case err == errSyncReorged:
   279  					// The subchain being synced got modified at the head in a
   280  					// way that requires resyncing it. Restart sync with the new
   281  					// head to force a cleanup.
   282  					head = newhead
   283  
   284  				case err == errTerminated:
   285  					// Sync was requested to be terminated from within, stop and
   286  					// return (no need to pass a message, was already done internally)
   287  					return
   288  
   289  				default:
   290  					// Sync either successfully terminated or failed with an unhandled
   291  					// error. Abort and wait until Geth requests a termination.
   292  					errc := <-s.terminate
   293  					errc <- err
   294  					return
   295  				}
   296  			}
   297  		}
   298  	}
   299  }
   300  
   301  // Terminate tears down the syncer indefinitely.
   302  func (s *skeleton) Terminate() error {
   303  	// Request termination and fetch any errors
   304  	errc := make(chan error)
   305  	s.terminate <- errc
   306  	err := <-errc
   307  
   308  	// Wait for full shutdown (not necessary, but cleaner)
   309  	<-s.terminated
   310  	return err
   311  }
   312  
   313  // Sync starts or resumes a previous sync cycle to download and maintain a reverse
   314  // header chain starting at the head and leading towards genesis to an available
   315  // ancestor.
   316  //
    317  // This method does not block until the sync finishes; it only waits until the
    318  // syncer has accepted the fed header. What the syncer does with it is its problem.
   319  func (s *skeleton) Sync(head *types.Header, force bool) error {
   320  	log.Trace("New skeleton head announced", "number", head.Number, "hash", head.Hash(), "force", force)
   321  	errc := make(chan error)
   322  
   323  	select {
   324  	case s.headEvents <- &headUpdate{header: head, force: force, errc: errc}:
   325  		return <-errc
   326  	case <-s.terminated:
   327  		return errTerminated
   328  	}
   329  }
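
         // Hypothetical call pattern, mirroring the startup logic above: the very
         // first head must be forced, later heads may be gentle extensions.
         //
         //	if err := sk.Sync(head, true); err != nil { // initial sync target
         //		return err
         //	}
         //	if err := sk.Sync(next, false); err == errReorgDenied {
         //		// next didn't link up to the current sync; either force it with
         //		// Sync(next, true) or drop the update.
         //	}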
   330  
   331  // sync is the internal version of Sync that executes a single sync cycle, either
   332  // until some termination condition is reached, or until the current cycle merges
   333  // with a previously aborted run.
   334  func (s *skeleton) sync(head *types.Header) (*types.Header, error) {
   335  	// If we're continuing a previous merge interrupt, just access the existing
   336  	// old state without initing from disk.
   337  	if head == nil {
   338  		head = rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[0].Head)
   339  	} else {
    340  		// Otherwise, initialize the sync, trimming any previous leftovers until
   341  		// we're consistent with the newly requested chain head
   342  		s.initSync(head)
   343  	}
   344  	// Create the scratch space to fill with concurrently downloaded headers
   345  	s.scratchSpace = make([]*types.Header, scratchHeaders)
   346  	defer func() { s.scratchSpace = nil }() // don't hold on to references after sync
   347  
   348  	s.scratchOwners = make([]string, scratchHeaders/requestHeaders)
   349  	defer func() { s.scratchOwners = nil }() // don't hold on to references after sync
   350  
   351  	s.scratchHead = s.progress.Subchains[0].Tail - 1 // tail must not be 0!
   352  
   353  	// If the sync is already done, resume the backfiller. When the loop stops,
   354  	// terminate the backfiller too.
   355  	linked := len(s.progress.Subchains) == 1 &&
   356  		rawdb.HasBody(s.db, s.progress.Subchains[0].Next, s.scratchHead) &&
   357  		rawdb.HasReceipts(s.db, s.progress.Subchains[0].Next, s.scratchHead)
   358  	if linked {
   359  		s.filler.resume()
   360  	}
   361  	defer s.filler.suspend()
   362  
   363  	// Create a set of unique channels for this sync cycle. We need these to be
   364  	// ephemeral so a data race doesn't accidentally deliver something stale on
   365  	// a persistent channel across syncs (yup, this happened)
   366  	var (
   367  		requestFails = make(chan *headerRequest)
   368  		responses    = make(chan *headerResponse)
   369  	)
   370  	cancel := make(chan struct{})
   371  	defer close(cancel)
   372  
   373  	log.Debug("Starting reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead)
   374  
   375  	// Whether sync completed or not, disregard any future packets
   376  	defer func() {
   377  		log.Debug("Terminating reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead)
   378  		s.requests = make(map[uint64]*headerRequest)
   379  	}()
   380  
   381  	// Start tracking idle peers for task assignments
   382  	peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection
   383  
   384  	peeringSub := s.peers.SubscribeEvents(peering)
   385  	defer peeringSub.Unsubscribe()
   386  
   387  	s.idles = make(map[string]*peerConnection)
   388  	for _, peer := range s.peers.AllPeers() {
   389  		s.idles[peer.id] = peer
   390  	}
    391  	// Notify any tester listening for startup events
   392  	if s.syncStarting != nil {
   393  		s.syncStarting()
   394  	}
   395  	for {
   396  		// Something happened, try to assign new tasks to any idle peers
   397  		if !linked {
   398  			s.assignTasks(responses, requestFails, cancel)
   399  		}
   400  		// Wait for something to happen
   401  		select {
   402  		case event := <-peering:
   403  			// A peer joined or left, the tasks queue and allocations need to be
   404  			// checked for potential assignment or reassignment
   405  			peerid := event.peer.id
   406  			if event.join {
   407  				log.Debug("Joining skeleton peer", "id", peerid)
   408  				s.idles[peerid] = event.peer
   409  			} else {
   410  				log.Debug("Leaving skeleton peer", "id", peerid)
   411  				s.revertRequests(peerid)
   412  				delete(s.idles, peerid)
   413  			}
   414  
   415  		case errc := <-s.terminate:
   416  			errc <- nil
   417  			return nil, errTerminated
   418  
   419  		case event := <-s.headEvents:
   420  			// New head was announced, try to integrate it. If successful, nothing
   421  			// needs to be done as the head simply extended the last range. For now
   422  			// we don't seamlessly integrate reorgs to keep things simple. If the
   423  			// network starts doing many mini reorgs, it might be worthwhile handling
   424  			// a limited depth without an error.
   425  			if reorged := s.processNewHead(event.header, event.force); reorged {
   426  				// If a reorg is needed, and we're forcing the new head, signal
   427  				// the syncer to tear down and start over. Otherwise, drop the
   428  				// non-force reorg.
   429  				if event.force {
   430  					event.errc <- nil // forced head reorg accepted
   431  					return event.header, errSyncReorged
   432  				}
   433  				event.errc <- errReorgDenied
   434  				continue
   435  			}
   436  			event.errc <- nil // head extension accepted
   437  
   438  			// New head was integrated into the skeleton chain. If the backfiller
   439  			// is still running, it will pick it up. If it already terminated,
   440  			// a new cycle needs to be spun up.
   441  			if linked {
   442  				s.filler.resume()
   443  			}
   444  
   445  		case req := <-requestFails:
   446  			s.revertRequest(req)
   447  
   448  		case res := <-responses:
    449  		// Process the batch of headers. If through processing we managed to
   450  			// link the current subchain to a previously downloaded one, abort the
   451  			// sync and restart with the merged subchains.
   452  			//
   453  			// If we managed to link to the existing local chain or genesis block,
   454  			// abort sync altogether.
   455  			linked, merged := s.processResponse(res)
   456  			if linked {
   457  				log.Debug("Beacon sync linked to local chain")
   458  				return nil, errSyncLinked
   459  			}
   460  			if merged {
   461  				log.Debug("Beacon sync merged subchains")
   462  				return nil, errSyncMerged
   463  			}
   464  			// We still have work to do, loop and repeat
   465  		}
   466  	}
   467  }
   468  
   469  // initSync attempts to get the skeleton sync into a consistent state wrt any
    470  // past state on disk and the newly requested head to sync to. The caller must
    471  // pass a non-nil head; resuming from a previous head is handled by sync itself.
   472  func (s *skeleton) initSync(head *types.Header) {
   473  	// Extract the head number, we'll need it all over
   474  	number := head.Number.Uint64()
   475  
   476  	// Retrieve the previously saved sync progress
   477  	if status := rawdb.ReadSkeletonSyncStatus(s.db); len(status) > 0 {
   478  		s.progress = new(skeletonProgress)
   479  		if err := json.Unmarshal(status, s.progress); err != nil {
   480  			log.Error("Failed to decode skeleton sync status", "err", err)
   481  		} else {
   482  			// Previous sync was available, print some continuation logs
   483  			for _, subchain := range s.progress.Subchains {
   484  				log.Debug("Restarting skeleton subchain", "head", subchain.Head, "tail", subchain.Tail)
   485  			}
   486  			// Create a new subchain for the head (unless the last can be extended),
   487  			// trimming anything it would overwrite
   488  			headchain := &subchain{
   489  				Head: number,
   490  				Tail: number,
   491  				Next: head.ParentHash,
   492  			}
   493  			for len(s.progress.Subchains) > 0 {
   494  				// If the last chain is above the new head, delete altogether
   495  				lastchain := s.progress.Subchains[0]
   496  				if lastchain.Tail >= headchain.Tail {
   497  					log.Debug("Dropping skeleton subchain", "head", lastchain.Head, "tail", lastchain.Tail)
   498  					s.progress.Subchains = s.progress.Subchains[1:]
   499  					continue
   500  				}
   501  				// Otherwise truncate the last chain if needed and abort trimming
   502  				if lastchain.Head >= headchain.Tail {
   503  					log.Debug("Trimming skeleton subchain", "oldhead", lastchain.Head, "newhead", headchain.Tail-1, "tail", lastchain.Tail)
   504  					lastchain.Head = headchain.Tail - 1
   505  				}
   506  				break
   507  			}
   508  			// If the last subchain can be extended, we're lucky. Otherwise create
   509  			// a new subchain sync task.
   510  			var extended bool
   511  			if n := len(s.progress.Subchains); n > 0 {
   512  				lastchain := s.progress.Subchains[0]
   513  				if lastchain.Head == headchain.Tail-1 {
   514  					lasthead := rawdb.ReadSkeletonHeader(s.db, lastchain.Head)
   515  					if lasthead.Hash() == head.ParentHash {
   516  						log.Debug("Extended skeleton subchain with new head", "head", headchain.Tail, "tail", lastchain.Tail)
   517  						lastchain.Head = headchain.Tail
   518  						extended = true
   519  					}
   520  				}
   521  			}
   522  			if !extended {
   523  				log.Debug("Created new skeleton subchain", "head", number, "tail", number)
   524  				s.progress.Subchains = append([]*subchain{headchain}, s.progress.Subchains...)
   525  			}
   526  			// Update the database with the new sync stats and insert the new
   527  			// head header. We won't delete any trimmed skeleton headers since
   528  			// those will be outside the index space of the many subchains and
   529  			// the database space will be reclaimed eventually when processing
   530  			// blocks above the current head (TODO(karalabe): don't forget).
   531  			batch := s.db.NewBatch()
   532  
   533  			rawdb.WriteSkeletonHeader(batch, head)
   534  			s.saveSyncStatus(batch)
   535  
   536  			if err := batch.Write(); err != nil {
   537  				log.Crit("Failed to write skeleton sync status", "err", err)
   538  			}
   539  			return
   540  		}
   541  	}
    542  	// Either we've failed to decode the previous state, or there was none. Start
   543  	// a fresh sync with a single subchain represented by the currently sent
   544  	// chain head.
   545  	s.progress = &skeletonProgress{
   546  		Subchains: []*subchain{
   547  			{
   548  				Head: number,
   549  				Tail: number,
   550  				Next: head.ParentHash,
   551  			},
   552  		},
   553  	}
   554  	batch := s.db.NewBatch()
   555  
   556  	rawdb.WriteSkeletonHeader(batch, head)
   557  	s.saveSyncStatus(batch)
   558  
   559  	if err := batch.Write(); err != nil {
   560  		log.Crit("Failed to write initial skeleton sync status", "err", err)
   561  	}
   562  	log.Debug("Created initial skeleton subchain", "head", number, "tail", number)
   563  }
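
         // A worked example of the trimming rules above (numbers made up): given a
         // stored subchain {Head: 99, Tail: 50} and a new head at number 90, the
         // subchain is first trimmed to {Head: 89, Tail: 50}; then, if header 89's
         // hash matches the new head's parent hash, it is extended to {Head: 90,
         // Tail: 50}, otherwise a fresh {Head: 90, Tail: 90} subchain is prepended.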
   564  
   565  // saveSyncStatus marshals the remaining sync tasks into leveldb.
   566  func (s *skeleton) saveSyncStatus(db ethdb.KeyValueWriter) {
   567  	status, err := json.Marshal(s.progress)
   568  	if err != nil {
   569  		panic(err) // This can only fail during implementation
   570  	}
   571  	rawdb.WriteSkeletonSyncStatus(db, status)
   572  }
   573  
   574  // processNewHead does the internal shuffling for a new head marker and either
   575  // accepts and integrates it into the skeleton or requests a reorg. Upon reorg,
   576  // the syncer will tear itself down and restart with a fresh head. It is simpler
   577  // to reconstruct the sync state than to mutate it and hope for the best.
   578  func (s *skeleton) processNewHead(head *types.Header, force bool) bool {
    579  	// If the header cannot be inserted without interruption, signal a reorg to
    580  	// the outer loop so it can tear down the skeleton sync and restart it
   581  	number := head.Number.Uint64()
   582  
   583  	lastchain := s.progress.Subchains[0]
   584  	if lastchain.Tail >= number {
   585  		if force {
   586  			log.Warn("Beacon chain reorged", "tail", lastchain.Tail, "newHead", number)
   587  		}
   588  		return true
   589  	}
   590  	if lastchain.Head+1 < number {
   591  		if force {
   592  			log.Warn("Beacon chain gapped", "head", lastchain.Head, "newHead", number)
   593  		}
   594  		return true
   595  	}
   596  	if parent := rawdb.ReadSkeletonHeader(s.db, number-1); parent.Hash() != head.ParentHash {
   597  		if force {
   598  			log.Warn("Beacon chain forked", "ancestor", parent.Number, "hash", parent.Hash(), "want", head.ParentHash)
   599  		}
   600  		return true
   601  	}
   602  	// New header seems to be in the last subchain range. Unwind any extra headers
   603  	// from the chain tip and insert the new head. We won't delete any trimmed
   604  	// skeleton headers since those will be outside the index space of the many
   605  	// subchains and the database space will be reclaimed eventually when processing
   606  	// blocks above the current head (TODO(karalabe): don't forget).
   607  	batch := s.db.NewBatch()
   608  
   609  	rawdb.WriteSkeletonHeader(batch, head)
   610  	lastchain.Head = number
   611  	s.saveSyncStatus(batch)
   612  
   613  	if err := batch.Write(); err != nil {
   614  		log.Crit("Failed to write skeleton sync status", "err", err)
   615  	}
   616  	return false
   617  }
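
         // Concretely (numbers made up): with Subchains[0] = {Head: 100, Tail: 50},
         // a new head numbered 50 or below trips the reorg path, one numbered 102 or
         // above trips the gap path, and one at 101 whose parent hash mismatches the
         // stored header 100 trips the fork path. Only a head in (50, 101] with a
         // matching parent hash is integrated without a restart.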
   618  
   619  // assignTasks attempts to match idle peers to pending header retrievals.
   620  func (s *skeleton) assignTasks(success chan *headerResponse, fail chan *headerRequest, cancel chan struct{}) {
    621  	// Sort the peers by download capacity to use faster ones if many are available
   622  	idlers := &peerCapacitySort{
   623  		peers: make([]*peerConnection, 0, len(s.idles)),
   624  		caps:  make([]int, 0, len(s.idles)),
   625  	}
   626  	targetTTL := s.peers.rates.TargetTimeout()
   627  	for _, peer := range s.idles {
   628  		idlers.peers = append(idlers.peers, peer)
   629  		idlers.caps = append(idlers.caps, s.peers.rates.Capacity(peer.id, eth.BlockHeadersMsg, targetTTL))
   630  	}
   631  	if len(idlers.peers) == 0 {
   632  		return
   633  	}
   634  	sort.Sort(idlers)
   635  
   636  	// Find header regions not yet downloading and fill them
   637  	for task, owner := range s.scratchOwners {
   638  		// If we're out of idle peers, stop assigning tasks
   639  		if len(idlers.peers) == 0 {
   640  			return
   641  		}
   642  		// Skip any tasks already filling
   643  		if owner != "" {
   644  			continue
   645  		}
   646  		// If we've reached the genesis, stop assigning tasks
   647  		if uint64(task*requestHeaders) >= s.scratchHead {
   648  			return
   649  		}
   650  		// Found a task and have peers available, assign it
   651  		idle := idlers.peers[0]
   652  
   653  		idlers.peers = idlers.peers[1:]
   654  		idlers.caps = idlers.caps[1:]
   655  
   656  		// Matched a pending task to an idle peer, allocate a unique request id
   657  		var reqid uint64
   658  		for {
   659  			reqid = uint64(rand.Int63())
   660  			if reqid == 0 {
   661  				continue
   662  			}
   663  			if _, ok := s.requests[reqid]; ok {
   664  				continue
   665  			}
   666  			break
   667  		}
   668  		// Generate the network query and send it to the peer
   669  		req := &headerRequest{
   670  			peer:    idle.id,
   671  			id:      reqid,
   672  			deliver: success,
   673  			revert:  fail,
   674  			cancel:  cancel,
   675  			stale:   make(chan struct{}),
   676  			head:    s.scratchHead - uint64(task*requestHeaders),
   677  		}
   678  		s.requests[reqid] = req
   679  		delete(s.idles, idle.id)
   680  
    681  		// Send the request off to the peer on its own goroutine
   682  		go s.executeTask(idle, req)
   683  
   684  		// Inject the request into the task to block further assignments
   685  		s.scratchOwners[task] = idle.id
   686  	}
   687  }
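
         // The scratch task index maps to a batch head via req.head = scratchHead -
         // task*requestHeaders. For example (numbers made up), with scratchHead =
         // 10000, task 0 requests headers 10000 down to 9489 and task 1 requests
         // 9488 down to 8977, each as one reverse batch of requestHeaders = 512.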
   688  
   689  // executeTask executes a single fetch request, blocking until either a result
    690  // arrives or a timeout/cancellation is triggered. The method should be run
   691  // on its own goroutine and will deliver on the requested channels.
   692  func (s *skeleton) executeTask(peer *peerConnection, req *headerRequest) {
   693  	start := time.Now()
   694  	resCh := make(chan *eth.Response)
   695  
   696  	// Figure out how many headers to fetch. Usually this will be a full batch,
   697  	// but for the very tail of the chain, trim the request to the number left.
   698  	// Since nodes may or may not return the genesis header for a batch request,
   699  	// don't even request it. The parent hash of block #1 is enough to link.
   700  	requestCount := requestHeaders
   701  	if req.head < requestHeaders {
   702  		requestCount = int(req.head)
   703  	}
   704  	peer.log.Trace("Fetching skeleton headers", "from", req.head, "count", requestCount)
   705  	netreq, err := peer.peer.RequestHeadersByNumber(req.head, requestCount, 0, true, resCh)
   706  	if err != nil {
   707  		peer.log.Trace("Failed to request headers", "err", err)
   708  		s.scheduleRevertRequest(req)
   709  		return
   710  	}
   711  	defer netreq.Close()
   712  
   713  	// Wait until the response arrives, the request is cancelled or times out
   714  	ttl := s.peers.rates.TargetTimeout()
   715  
   716  	timeoutTimer := time.NewTimer(ttl)
   717  	defer timeoutTimer.Stop()
   718  
   719  	select {
   720  	case <-req.cancel:
   721  		peer.log.Debug("Header request cancelled")
   722  		s.scheduleRevertRequest(req)
   723  
   724  	case <-timeoutTimer.C:
   725  		// Header retrieval timed out, update the metrics
   726  		peer.log.Warn("Header request timed out, dropping peer", "elapsed", ttl)
   727  		headerTimeoutMeter.Mark(1)
   728  		s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, 0, 0)
   729  		s.scheduleRevertRequest(req)
   730  
   731  		// At this point we either need to drop the offending peer, or we need a
   732  		// mechanism to allow waiting for the response and not cancel it. For now
    733  		// let's go with dropping since the header sizes are deterministic and the
   734  		// beacon sync runs exclusive (downloader is idle) so there should be no
   735  		// other load to make timeouts probable. If we notice that timeouts happen
   736  		// more often than we'd like, we can introduce a tracker for the requests
   737  		// gone stale and monitor them. However, in that case too, we need a way
   738  		// to protect against malicious peers never responding, so it would need
   739  		// a second, hard-timeout mechanism.
   740  		s.drop(peer.id)
   741  
   742  	case res := <-resCh:
   743  		// Headers successfully retrieved, update the metrics
   744  		headers := *res.Res.(*eth.BlockHeadersPacket)
   745  
   746  		headerReqTimer.Update(time.Since(start))
   747  		s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, res.Time, len(headers))
   748  
   749  		// Cross validate the headers with the requests
   750  		switch {
   751  		case len(headers) == 0:
   752  			// No headers were delivered, reject the response and reschedule
   753  			peer.log.Debug("No headers delivered")
   754  			res.Done <- errors.New("no headers delivered")
   755  			s.scheduleRevertRequest(req)
   756  
   757  		case headers[0].Number.Uint64() != req.head:
   758  			// Header batch anchored at non-requested number
   759  			peer.log.Debug("Invalid header response head", "have", headers[0].Number, "want", req.head)
   760  			res.Done <- errors.New("invalid header batch anchor")
   761  			s.scheduleRevertRequest(req)
   762  
   763  		case req.head >= requestHeaders && len(headers) != requestHeaders:
   764  			// Invalid number of non-genesis headers delivered, reject the response and reschedule
   765  			peer.log.Debug("Invalid non-genesis header count", "have", len(headers), "want", requestHeaders)
   766  			res.Done <- errors.New("not enough non-genesis headers delivered")
   767  			s.scheduleRevertRequest(req)
   768  
   769  		case req.head < requestHeaders && uint64(len(headers)) != req.head:
   770  			// Invalid number of genesis headers delivered, reject the response and reschedule
   771  			peer.log.Debug("Invalid genesis header count", "have", len(headers), "want", headers[0].Number.Uint64())
   772  			res.Done <- errors.New("not enough genesis headers delivered")
   773  			s.scheduleRevertRequest(req)
   774  
   775  		default:
   776  			// Packet seems structurally valid, check hash progression and if it
   777  			// is correct too, deliver for storage
   778  			for i := 0; i < len(headers)-1; i++ {
   779  				if headers[i].ParentHash != headers[i+1].Hash() {
   780  					peer.log.Debug("Invalid hash progression", "index", i, "wantparenthash", headers[i].ParentHash, "haveparenthash", headers[i+1].Hash())
   781  					res.Done <- errors.New("invalid hash progression")
   782  					s.scheduleRevertRequest(req)
   783  					return
   784  				}
   785  			}
   786  			// Hash chain is valid. The delivery might still be junk as we're
   787  			// downloading batches concurrently (so no way to link the headers
   788  			// until gaps are filled); in that case, we'll nuke the peer when
   789  			// we detect the fault.
   790  			res.Done <- nil
   791  
   792  			select {
   793  			case req.deliver <- &headerResponse{
   794  				peer:    peer,
   795  				reqid:   req.id,
   796  				headers: headers,
   797  			}:
   798  			case <-req.cancel:
   799  			}
   800  		}
   801  	}
   802  }
   803  
    804  // revertRequests locates all the currently pending requests from a particular
   805  // peer and reverts them, rescheduling for others to fulfill.
   806  func (s *skeleton) revertRequests(peer string) {
    807  	// Gather the requests first, since reverting them mutates the tracked set
   808  	var requests []*headerRequest
   809  	for _, req := range s.requests {
   810  		if req.peer == peer {
   811  			requests = append(requests, req)
   812  		}
   813  	}
   814  	// Revert all the requests matching the peer
   815  	for _, req := range requests {
   816  		s.revertRequest(req)
   817  	}
   818  }
   819  
   820  // scheduleRevertRequest asks the event loop to clean up a request and return
   821  // all failed retrieval tasks to the scheduler for reassignment.
   822  func (s *skeleton) scheduleRevertRequest(req *headerRequest) {
   823  	select {
   824  	case req.revert <- req:
   825  		// Sync event loop notified
   826  	case <-req.cancel:
   827  		// Sync cycle got cancelled
   828  	case <-req.stale:
   829  		// Request already reverted
   830  	}
   831  }
   832  
   833  // revertRequest cleans up a request and returns all failed retrieval tasks to
   834  // the scheduler for reassignment.
   835  //
   836  // Note, this needs to run on the event runloop thread to reschedule to idle peers.
   837  // On peer threads, use scheduleRevertRequest.
   838  func (s *skeleton) revertRequest(req *headerRequest) {
   839  	log.Trace("Reverting header request", "peer", req.peer, "reqid", req.id)
   840  	select {
   841  	case <-req.stale:
   842  		log.Trace("Header request already reverted", "peer", req.peer, "reqid", req.id)
   843  		return
   844  	default:
   845  	}
   846  	close(req.stale)
   847  
   848  	// Remove the request from the tracked set
   849  	delete(s.requests, req.id)
   850  
    851  	// Mark the task as not-pending, ready for rescheduling
   853  	s.scratchOwners[(s.scratchHead-req.head)/requestHeaders] = ""
   854  }
   855  
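         // processResponse integrates a batch of verified headers into the scratch
         // space and consumes any completed batches that extend the head subchain,
         // reporting whether the sync became linked to the local chain (linked) and
         // whether two subchains were joined together in the process (merged).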
   856  func (s *skeleton) processResponse(res *headerResponse) (linked bool, merged bool) {
   857  	res.peer.log.Trace("Processing header response", "head", res.headers[0].Number, "hash", res.headers[0].Hash(), "count", len(res.headers))
   858  
    859  	// Whether or not the response is valid, we can mark the peer as idle and notify
   860  	// the scheduler to assign a new task. If the response is invalid, we'll
   861  	// drop the peer in a bit.
   862  	s.idles[res.peer.id] = res.peer
   863  
   864  	// Ensure the response is for a valid request
   865  	if _, ok := s.requests[res.reqid]; !ok {
   866  		// Some internal accounting is broken. A request either times out or it
   867  		// gets fulfilled successfully. It should not be possible to deliver a
   868  		// response to a non-existing request.
   869  		res.peer.log.Error("Unexpected header packet")
   870  		return false, false
   871  	}
   872  	delete(s.requests, res.reqid)
   873  
   874  	// Insert the delivered headers into the scratch space independent of the
   875  	// content or continuation; those will be validated in a moment
   876  	head := res.headers[0].Number.Uint64()
   877  	copy(s.scratchSpace[s.scratchHead-head:], res.headers)
   878  
   879  	// If there's still a gap in the head of the scratch space, abort
   880  	if s.scratchSpace[0] == nil {
   881  		return false, false
   882  	}
   883  	// Try to consume any head headers, validating the boundary conditions
   884  	batch := s.db.NewBatch()
   885  	for s.scratchSpace[0] != nil {
   886  		// Next batch of headers available, cross-reference with the subchain
   887  		// we are extending and either accept or discard
   888  		if s.progress.Subchains[0].Next != s.scratchSpace[0].Hash() {
    889  			// Print a log message to track what's going on
   890  			tail := s.progress.Subchains[0].Tail
   891  			want := s.progress.Subchains[0].Next
   892  			have := s.scratchSpace[0].Hash()
   893  
   894  			log.Warn("Invalid skeleton headers", "peer", s.scratchOwners[0], "number", tail-1, "want", want, "have", have)
   895  
   896  			// The peer delivered junk, or at least not the subchain we are
   897  			// syncing to. Free up the scratch space and assignment, reassign
   898  			// and drop the original peer.
   899  			for i := 0; i < requestHeaders; i++ {
   900  				s.scratchSpace[i] = nil
   901  			}
   902  			s.drop(s.scratchOwners[0])
   903  			s.scratchOwners[0] = ""
   904  			break
   905  		}
   906  		// Scratch delivery matches required subchain, deliver the batch of
   907  		// headers and push the subchain forward
   908  		var consumed int
   909  		for _, header := range s.scratchSpace[:requestHeaders] {
   910  			if header != nil { // nil when the genesis is reached
   911  				consumed++
   912  
   913  				rawdb.WriteSkeletonHeader(batch, header)
   914  				s.pulled++
   915  
   916  				s.progress.Subchains[0].Tail--
   917  				s.progress.Subchains[0].Next = header.ParentHash
   918  
   919  				// If we've reached an existing block in the chain, stop retrieving
   920  				// headers. Note, if we want to support light clients with the same
   921  				// code we'd need to switch here based on the downloader mode. That
   922  				// said, there's no such functionality for now, so don't complicate.
   923  				//
   924  				// In the case of full sync it would be enough to check for the body,
   925  				// but even a full syncing node will generate a receipt once block
   926  				// processing is done, so it's just one more "needless" check.
   927  				var (
   928  					hasBody    = rawdb.HasBody(s.db, header.ParentHash, header.Number.Uint64()-1)
   929  					hasReceipt = rawdb.HasReceipts(s.db, header.ParentHash, header.Number.Uint64()-1)
   930  				)
   931  				if hasBody && hasReceipt {
   932  					linked = true
   933  					break
   934  				}
   935  			}
   936  		}
   937  		head := s.progress.Subchains[0].Head
   938  		tail := s.progress.Subchains[0].Tail
   939  		next := s.progress.Subchains[0].Next
   940  
   941  		log.Trace("Primary subchain extended", "head", head, "tail", tail, "next", next)
   942  
   943  		// If the beacon chain was linked to the local chain, completely swap out
   944  		// all internal progress and abort header synchronization.
   945  		if linked {
   946  			// Note, linking into the local chain should also mean that there are
   947  			// no leftover subchains, but just in case there's some junk due to
   948  			// strange conditions or bugs, clean up all internal state.
   949  			if len(s.progress.Subchains) > 1 {
   950  				log.Error("Cleaning up leftovers after beacon link")
   951  				s.progress.Subchains = s.progress.Subchains[:1]
   952  			}
   953  			break
   954  		}
   955  		// Batch of headers consumed, shift the download window forward
   956  		copy(s.scratchSpace, s.scratchSpace[requestHeaders:])
   957  		for i := 0; i < requestHeaders; i++ {
   958  			s.scratchSpace[scratchHeaders-i-1] = nil
   959  		}
   960  		copy(s.scratchOwners, s.scratchOwners[1:])
   961  		s.scratchOwners[scratchHeaders/requestHeaders-1] = ""
   962  
   963  		s.scratchHead -= uint64(consumed)
   964  
   965  		// If the subchain extended into the next subchain, we need to handle
   966  		// the overlap. Since there could be many overlaps (come on), do this
   967  		// in a loop.
   968  		for len(s.progress.Subchains) > 1 && s.progress.Subchains[1].Head >= s.progress.Subchains[0].Tail {
   969  			// Extract some stats from the second subchain
   970  			head := s.progress.Subchains[1].Head
   971  			tail := s.progress.Subchains[1].Tail
   972  			next := s.progress.Subchains[1].Next
   973  
   974  			// Since we just overwrote part of the next subchain, we need to trim
   975  			// its head independent of matching or mismatching content
   976  			if s.progress.Subchains[1].Tail >= s.progress.Subchains[0].Tail {
   977  				// Fully overwritten, get rid of the subchain as a whole
   978  				log.Debug("Previous subchain fully overwritten", "head", head, "tail", tail, "next", next)
   979  				s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...)
   980  				continue
    981  			}
    982  			// Partially overwritten, trim the head to the overwritten size
    983  			log.Debug("Previous subchain partially overwritten", "head", head, "tail", tail, "next", next)
    984  			s.progress.Subchains[1].Head = s.progress.Subchains[0].Tail - 1
   986  			// If the old subchain is an extension of the new one, merge the two
   987  			// and let the skeleton syncer restart (to clean internal state)
   988  			if rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[1].Head).Hash() == s.progress.Subchains[0].Next {
   989  				log.Debug("Previous subchain merged", "head", head, "tail", tail, "next", next)
   990  				s.progress.Subchains[0].Tail = s.progress.Subchains[1].Tail
   991  				s.progress.Subchains[0].Next = s.progress.Subchains[1].Next
   992  
   993  				s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...)
   994  				merged = true
   995  			}
   996  		}
   997  		// If subchains were merged, all further available headers in the scratch
   998  		// space are invalid since we skipped ahead. Stop processing the scratch
   999  		// space to avoid dropping peers thinking they delivered invalid data.
  1000  		if merged {
  1001  			break
  1002  		}
  1003  	}
  1004  	s.saveSyncStatus(batch)
  1005  	if err := batch.Write(); err != nil {
  1006  		log.Crit("Failed to write skeleton headers and progress", "err", err)
  1007  	}
  1008  	// Print a progress report making the UX a bit nicer
  1009  	left := s.progress.Subchains[0].Tail - 1
  1010  	if linked {
  1011  		left = 0
  1012  	}
  1013  	if time.Since(s.logged) > 8*time.Second || left == 0 {
  1014  		s.logged = time.Now()
  1015  
  1016  		if s.pulled == 0 {
  1017  			log.Info("Beacon sync starting", "left", left)
  1018  		} else {
  1019  			eta := float64(time.Since(s.started)) / float64(s.pulled) * float64(left)
  1020  			log.Info("Syncing beacon headers", "downloaded", s.pulled, "left", left, "eta", common.PrettyDuration(eta))
  1021  		}
  1022  	}
  1023  	return linked, merged
  1024  }
  1025  
  1026  // Bounds retrieves the current head and tail tracked by the skeleton syncer.
  1027  // This method is used by the backfiller, whose life cycle is controlled by the
  1028  // skeleton syncer.
  1029  //
  1030  // Note, the method will not use the internal state of the skeleton, but will
  1031  // rather blindly pull stuff from the database. This is fine, because the back-
  1032  // filler will only run when the skeleton chain is fully downloaded and stable.
  1033  // There might be new heads appended, but those are atomic from the perspective
  1034  // of this method. Any head reorg will first tear down the backfiller and only
  1035  // then make the modification.
  1036  func (s *skeleton) Bounds() (head *types.Header, tail *types.Header, err error) {
  1037  	// Read the current sync progress from disk and figure out the current head.
  1038  	// Although there's a lot of error handling here, these are mostly as sanity
  1039  	// checks to avoid crashing if a programming error happens. These should not
  1040  	// happen in live code.
  1041  	status := rawdb.ReadSkeletonSyncStatus(s.db)
  1042  	if len(status) == 0 {
  1043  		return nil, nil, errors.New("beacon sync not yet started")
  1044  	}
  1045  	progress := new(skeletonProgress)
  1046  	if err := json.Unmarshal(status, progress); err != nil {
  1047  		return nil, nil, err
  1048  	}
  1049  	head = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Head)
  1050  	tail = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Tail)
  1051  
  1052  	return head, tail, nil
  1053  }
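
         // Illustrative use from a backfiller-style caller (hypothetical):
         //
         //	head, tail, err := sk.Bounds()
         //	if err != nil {
         //		return err // beacon sync not started yet
         //	}
         //	// Fill bodies and receipts for the range [tail.Number, head.Number].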
  1054  
  1055  // Header retrieves a specific header tracked by the skeleton syncer. This method
  1056  // is meant to be used by the backfiller, whose life cycle is controlled by the
  1057  // skeleton syncer.
  1058  //
  1059  // Note, outside the permitted runtimes, this method might return nil results and
  1060  // subsequent calls might return headers from different chains.
  1061  func (s *skeleton) Header(number uint64) *types.Header {
  1062  	return rawdb.ReadSkeletonHeader(s.db, number)
  1063  }