github.com/ethereum-optimism/optimism@v1.7.2/op-node/p2p/sync.go

     1  package p2p
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/binary"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"math/big"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	"github.com/golang/snappy"
    16  	"github.com/hashicorp/golang-lru/v2/simplelru"
    17  	"github.com/libp2p/go-libp2p/core/network"
    18  	"github.com/libp2p/go-libp2p/core/peer"
    19  	"github.com/libp2p/go-libp2p/core/protocol"
    20  	"golang.org/x/time/rate"
    21  
    22  	"github.com/ethereum/go-ethereum"
    23  	"github.com/ethereum/go-ethereum/common"
    24  	"github.com/ethereum/go-ethereum/log"
    25  
    26  	"github.com/ethereum-optimism/optimism/op-node/rollup"
    27  	"github.com/ethereum-optimism/optimism/op-service/eth"
    28  )
    29  
    30  // StreamCtxFn provides a new context to use when handling stream requests
    31  type StreamCtxFn func() context.Context
    32  
    33  // Note: the mocknet in testing does not support read/write stream timeouts; the timeouts are only applied if available.
    34  // Rate-limits always apply, and make sure the request/response throughput is not too fast, rather than too slow.
    35  const (
    36  	// timeout for opening a req-resp stream to another peer. This may involve some protocol negotiation.
    37  	streamTimeout = time.Second * 5
    38  	// timeout for writing the request as client. Can be as long as serverReadRequestTimeout
    39  	clientWriteRequestTimeout = time.Second * 10
    40  	// timeout for reading a response of a serving peer as client. Can be as long as serverWriteChunkTimeout
    41  	clientReadResponseTimeout = time.Second * 10
    42  	// timeout for reading the request content, deny the request if it cannot be fully read in time
    43  	serverReadRequestTimeout = time.Second * 10
    44  	// timeout for writing a single response message chunk
    45  	// (if a future response consists of multiple chunks, reset the writing timeout per chunk)
    46  	serverWriteChunkTimeout = time.Second * 10
    47  	// after the rate-limit reservation hits the max throttle delay, give up on serving a request and just close the stream
    48  	maxThrottleDelay = time.Second * 20
    49  	// Do not serve more than 20 requests per second
    50  	globalServerBlocksRateLimit rate.Limit = 20
    51  	// Allows a burst of 2x our rate limit
    52  	globalServerBlocksBurst = 40
    53  	// Do not serve more than 4 requests per second to the same peer, so we can serve other peers at the same time
    54  	peerServerBlocksRateLimit rate.Limit = 4
    55  	// Allow a peer to request 30s of blocks at once
    56  	peerServerBlocksBurst = 15
    57  	// If the client hits a request error, it counts as a lot of rate-limit tokens for syncing from that peer:
    58  	// we'd rather sync from other servers. We'll try again later,
    59  	// and eventually kick the peer based on degraded scoring if it's really not serving us well.
    60  	// TODO(CLI-4009): Use a backoff rather than this mechanism.
    61  	clientErrRateCost = peerServerBlocksBurst
    62  )
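
        // For reference, assuming the typical 2-second L2 block time implied by the comments above:
        // peerServerBlocksBurst = 15 blocks * 2s = 30s of chain, and globalServerBlocksBurst = 40 is
        // exactly 2x globalServerBlocksRateLimit = 20. A failed client request consumes
        // clientErrRateCost = 15 tokens from the 4-per-second per-peer limiter, i.e. up to roughly
        // 4 seconds of back-off before that peer is asked for another block.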
    63  
    64  func PayloadByNumberProtocolID(l2ChainID *big.Int) protocol.ID {
    65  	return protocol.ID(fmt.Sprintf("/opstack/req/payload_by_number/%d/0", l2ChainID))
    66  }
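
        // For reference: on OP Mainnet (L2 chain ID 10) this evaluates to
        // "/opstack/req/payload_by_number/10/0".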
    67  
    68  type requestHandlerFn func(ctx context.Context, log log.Logger, stream network.Stream)
    69  
    70  func MakeStreamHandler(resourcesCtx context.Context, log log.Logger, fn requestHandlerFn) network.StreamHandler {
    71  	return func(stream network.Stream) {
    72  		log := log.New("peer", stream.Conn().ID(), "remote", stream.Conn().RemoteMultiaddr())
    73  		defer func() {
    74  			if err := recover(); err != nil {
    75  				log.Error("p2p server request handling panic", "err", err, "protocol", stream.Protocol())
    76  			}
    77  		}()
    78  		defer stream.Close()
    79  		fn(resourcesCtx, log, stream)
    80  	}
    81  }
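
        // Example (sketch, under the assumption of a libp2p host value h and an extra import of
        // "github.com/libp2p/go-libp2p/core/host"): MakeStreamHandler is meant to be registered on
        // the host, so that every inbound stream gets panic recovery and a guaranteed Close:
        //
        //	func registerSyncServer(resourcesCtx context.Context, log log.Logger, h host.Host, cfg *rollup.Config, srv *ReqRespServer) {
        //		h.SetStreamHandler(
        //			PayloadByNumberProtocolID(cfg.L2ChainID),
        //			MakeStreamHandler(resourcesCtx, log, srv.HandleSyncRequest),
        //		)
        //	}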
    82  
    83  type newStreamFn func(ctx context.Context, peerId peer.ID, protocolId ...protocol.ID) (network.Stream, error)
    84  
    85  type receivePayloadFn func(ctx context.Context, from peer.ID, payload *eth.ExecutionPayloadEnvelope) error
    86  
    87  type rangeRequest struct {
    88  	start uint64
    89  	end   eth.L2BlockRef
    90  }
    91  
    92  type syncResult struct {
    93  	payload *eth.ExecutionPayloadEnvelope
    94  	peer    peer.ID
    95  }
    96  
    97  type peerRequest struct {
    98  	num uint64
    99  
   100  	complete *atomic.Bool
   101  }
   102  
   103  type inFlightCheck struct {
   104  	num uint64
   105  
   106  	result chan bool
   107  }
   108  
   109  type SyncClientMetrics interface {
   110  	ClientPayloadByNumberEvent(num uint64, resultCode byte, duration time.Duration)
   111  	PayloadsQuarantineSize(n int)
   112  }
   113  
   114  type SyncPeerScorer interface {
   115  	onValidResponse(id peer.ID)
   116  	onResponseError(id peer.ID)
   117  	onRejectedPayload(id peer.ID)
   118  }
   119  
   120  // SyncClient implements a reverse chain sync with a minimal interface:
   121  // signal the desired range, and receive blocks within this range back.
   122  // Through parent-hash verification, received blocks are all ensured to be part of the canonical chain at one point,
   123  // but it is up to the user to organize and process the results further.
   124  //
   125  // For the sync-client to retrieve any data, peers must be added with AddPeer(id), and removed upon disconnect with RemovePeer(id).
   126  // The client is started with Start(), and may be started before or after changing any peers.
   127  //
   128  // ### Stages
   129  //
   130  // The sync mechanism is implemented as follows:
   131  // - User sends range request: blocks on sync main loop (with ctx timeout)
   132  // - Main loop processes range request (from high to low), dividing block requests by number between parallel peers.
   133  //   - The high part of the range has a known block-hash, and is marked as trusted.
   134  //   - Once there are no more peers available for buffering requests, we stop the range request processing.
   135  //   - Every request buffered for a peer is tracked as in-flight, by block number.
   136  //   - In-flight requests are not repeated
   137  //   - Requests for data that's already in the quarantine are not repeated
   138  //   - Data already in the quarantine that is trusted is attempted to be promoted.
   139  //
   140  // - Peers each have their own routine for processing requests.
   141  //   - They fetch the requested block by number, parse and validate it, and then send it back to the main loop
   142  //   - If a peer fails to fetch or process the block, or fails to send it back to the main loop within the timeout,
   143  //     then doRequest returns an error and marks the in-flight request as completed.
   144  //
   145  // - Main loop receives results synchronously with the range requests
   146  //   - The result is removed from in-flight tracker
   147  //   - The result is added to the quarantine
   148  //   - If we trust the hash, we try to promote the result.
   149  //
   150  // ### Concepts
   151  //
   152  // The main concepts are:
   153  // - Quarantine: an LRU that stores the latest fetched block data, by hash as well as an extra index by number.
   154  //
   155  //   - Quarantine eviction: upon regular LRU eviction, or explicit removal (when we learn data is not canonical),
   156  //     the sync result is removed from quarantine without being forwarded to the receiver.
   157  //     The peer that provided the data may be down-scored for providing un-utilized data if the data
   158  //     is not trusted during eviction.
   159  //
   160  // - Trusted data: data becomes trusted in two ways:
   161  //   - The hash / parent-hash of the sync target is marked as trusted.
   162  //   - The parent-hash of any promoted data is marked as trusted.
   163  //
   164  // - The trusted data is maintained in an LRU: we only care about recently accessed blocks.
   165  //
   166  //   - Result promotion: content from the quarantine is "promoted" when we find the blockhash is trusted.
   167  //     The data is removed from the quarantine, and forwarded to the receiver.
   168  //
   169  // ### Usage
   170  //
   171  // The user is expected to request the range of blocks between its existing chain head,
   172  // and a trusted future block-hash as reference to sync towards.
   173  // Upon receiving results from the sync-client, the user should adjust down its sync-target
   174  // based on the received results, to avoid duplicating work when re-requesting an updated range.
   175  // Range requests should still be repeated eventually however, as the sync client will give up on syncing a large range
   176  // when it's too busy syncing.
   177  //
   178  // The rationale for this approach is that this sync mechanism is primarily intended
   179  // for quickly filling gaps between an existing chain and a gossip chain, and not for very long block ranges.
   180  // Syncing in the execution-layer (through snap-sync) is more appropriate for long ranges.
   181  // If the user does sync a long range of blocks through this mechanism,
   182  // it does end up traversing through the chain, but receives the blocks in reverse order.
   183  // It is up to the user to persist the blocks for later processing, or drop & resync them if persistence is limited.
   184  type SyncClient struct {
   185  	log log.Logger
   186  
   187  	cfg *rollup.Config
   188  
   189  	metrics   SyncClientMetrics
   190  	appScorer SyncPeerScorer
   191  
   192  	newStreamFn     newStreamFn
   193  	payloadByNumber protocol.ID
   194  
   195  	peersLock sync.Mutex
   196  	// syncing worker per peer
   197  	peers map[peer.ID]context.CancelFunc
   198  
   199  	// trusted blocks are, or have been, canonical at one point.
   200  	// Everything that's trusted is acceptable to pass to the sync receiver,
   201  	// but we target to just sync the blocks of the latest canonical view of the chain.
   202  	trusted *simplelru.LRU[common.Hash, struct{}]
   203  
   204  	// quarantine is an LRU of untrusted results: blocks that could not be verified yet
   205  	quarantine *simplelru.LRU[common.Hash, syncResult]
   206  	// quarantineByNum indexes the quarantine contents by number.
   207  	// No duplicates here, only the latest quarantine write is indexed.
   208  	// This map is cleared upon evictions of items from the quarantine LRU
   209  	quarantineByNum map[uint64]common.Hash
   210  
   211  	// inFlight requests are not repeated
   212  	inFlight map[uint64]*atomic.Bool
   213  
   214  	requests       chan rangeRequest
   215  	peerRequests   chan peerRequest
   216  	inFlightChecks chan inFlightCheck
   217  
   218  	results chan syncResult
   219  
   220  	receivePayload receivePayloadFn
   221  
   222  	// Global rate limiter for all peers.
   223  	globalRL *rate.Limiter
   224  
   225  	// resource context: all peers and mainLoop tasks inherit this, and start shutting down once resCancel() is called.
   226  	resCtx    context.Context
   227  	resCancel context.CancelFunc
   228  
   229  	// wait group: wait for the resources to close. Adding to this is only safe if the peersLock is held.
   230  	wg sync.WaitGroup
   231  
   232  	// Don't allow anything to be added to the wait-group while, or after, we are shutting down.
   233  	// This is protected by peersLock.
   234  	closingPeers bool
   235  }
   236  
   237  func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rcv receivePayloadFn, metrics SyncClientMetrics, appScorer SyncPeerScorer) *SyncClient {
   238  	ctx, cancel := context.WithCancel(context.Background())
   239  
   240  	c := &SyncClient{
   241  		log:             log,
   242  		cfg:             cfg,
   243  		metrics:         metrics,
   244  		appScorer:       appScorer,
   245  		newStreamFn:     newStream,
   246  		payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID),
   247  		peers:           make(map[peer.ID]context.CancelFunc),
   248  		quarantineByNum: make(map[uint64]common.Hash),
   249  		inFlight:        make(map[uint64]*atomic.Bool),
   250  		requests:        make(chan rangeRequest), // blocking
   251  		peerRequests:    make(chan peerRequest, 128),
   252  		results:         make(chan syncResult, 128),
   253  		inFlightChecks:  make(chan inFlightCheck, 128),
   254  		globalRL:        rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst),
   255  		resCtx:          ctx,
   256  		resCancel:       cancel,
   257  		receivePayload:  rcv,
   258  	}
   259  	// never errors with positive LRU cache size
   260  	// TODO(CLI-3733): if we had an LRU based on total payload size, instead of payload count,
   261  	//  we could safely buffer more data in the happy case.
   262  	q, _ := simplelru.NewLRU[common.Hash, syncResult](100, c.onQuarantineEvict)
   263  	c.quarantine = q
   264  	trusted, _ := simplelru.NewLRU[common.Hash, struct{}](10000, nil)
   265  	c.trusted = trusted
   266  	return c
   267  }
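
        // Example (sketch): the lifecycle described in the SyncClient doc comment above. cfg, logger,
        // receive, metrics and scorer are assumed to exist; h is an assumed libp2p host, whose
        // NewStream method already satisfies the newStreamFn signature.
        //
        //	client := NewSyncClient(logger, cfg, h.NewStream, receive, metrics, scorer)
        //	client.Start()
        //	client.AddPeer(id) // for every peer that connects
        //	// request the gap between the local unsafe head and a trusted sync target:
        //	if err := client.RequestL2Range(ctx, localUnsafeHead, trustedTarget); err != nil {
        //		logger.Warn("sync client is too busy for a new range request", "err", err)
        //	}
        //	client.RemovePeer(id) // on disconnect
        //	_ = client.Close()    // on shutdown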
   268  
   269  func (s *SyncClient) Start() {
   270  	s.peersLock.Lock()
   271  	s.wg.Add(1)
   272  	s.peersLock.Unlock()
   273  	go s.mainLoop()
   274  }
   275  
   276  func (s *SyncClient) AddPeer(id peer.ID) {
   277  	s.peersLock.Lock()
   278  	defer s.peersLock.Unlock()
   279  	if s.closingPeers {
   280  		return
   281  	}
   282  	if _, ok := s.peers[id]; ok {
   283  		s.log.Warn("cannot register peer for sync duties, peer was already registered", "peer", id)
   284  		return
   285  	}
   286  	s.wg.Add(1)
   287  	// add new peer routine
   288  	ctx, cancel := context.WithCancel(s.resCtx)
   289  	s.peers[id] = cancel
   290  	go s.peerLoop(ctx, id)
   291  }
   292  
   293  func (s *SyncClient) RemovePeer(id peer.ID) {
   294  	s.peersLock.Lock()
   295  	defer s.peersLock.Unlock()
   296  	cancel, ok := s.peers[id]
   297  	if !ok {
   298  		s.log.Warn("cannot remove peer from sync duties, peer was not registered", "peer", id)
   299  		return
   300  	}
   301  	cancel() // the peer loop exits and cleans itself up once its context is cancelled
   302  	delete(s.peers, id)
   303  }
   304  
   305  // Close will shut down the sync client and all attached work, and block until shutdown is complete.
   306  // This will block if Start() has not created the main background loop.
   307  func (s *SyncClient) Close() error {
   308  	s.peersLock.Lock()
   309  	s.closingPeers = true
   310  	s.peersLock.Unlock()
   311  	s.resCancel()
   312  	s.wg.Wait()
   313  	return nil
   314  }
   315  
   316  func (s *SyncClient) RequestL2Range(ctx context.Context, start, end eth.L2BlockRef) error {
   317  	if end == (eth.L2BlockRef{}) {
   318  		s.log.Debug("P2P sync client received range signal, but cannot sync open-ended chain: need sync target to verify blocks through parent-hashes", "start", start)
   319  		return nil
   320  	}
   321  	// synchronize requests with the main loop for state access
   322  	select {
   323  	case s.requests <- rangeRequest{start: start.Number, end: end}:
   324  		return nil
   325  	case <-ctx.Done():
   326  		return fmt.Errorf("too busy with P2P results/requests: %w", ctx.Err())
   327  	}
   328  }
   329  
   330  const (
   331  	maxRequestScheduling = time.Second * 3
   332  	maxResultProcessing  = time.Second * 3
   333  )
   334  
   335  func (s *SyncClient) mainLoop() {
   336  	defer s.wg.Done()
   337  	for {
   338  		select {
   339  		case req := <-s.requests:
   340  			ctx, cancel := context.WithTimeout(s.resCtx, maxRequestScheduling)
   341  			s.onRangeRequest(ctx, req)
   342  			cancel()
   343  		case res := <-s.results:
   344  			ctx, cancel := context.WithTimeout(s.resCtx, maxResultProcessing)
   345  			s.onResult(ctx, res)
   346  			cancel()
   347  		case check := <-s.inFlightChecks:
   348  			s.log.Info("Checking in flight", "num", check.num)
   349  			complete, ok := s.inFlight[check.num]
   350  			if !ok {
   351  				check.result <- false
   352  			} else {
   353  				check.result <- !complete.Load()
   354  			}
   355  		case <-s.resCtx.Done():
   356  			s.log.Info("stopped P2P req-resp L2 block sync client")
   357  			return
   358  		}
   359  	}
   360  }
   361  
   362  func (s *SyncClient) isInFlight(ctx context.Context, num uint64) (bool, error) {
   363  	check := inFlightCheck{num: num, result: make(chan bool, 1)}
   364  	select {
   365  	case s.inFlightChecks <- check:
   366  	case <-ctx.Done():
   367  		return false, errors.New("context cancelled when publishing in flight check")
   368  	}
   369  	select {
   370  	case res := <-check.result:
   371  		return res, nil
   372  	case <-ctx.Done():
   373  		return false, errors.New("context cancelled while waiting for in flight check response")
   374  	}
   375  }
   376  
   377  // onRangeRequest is exclusively called by the main loop, and thus has direct access to the request bookkeeping state.
   378  // This function transforms requested block ranges into work for each peer.
   379  func (s *SyncClient) onRangeRequest(ctx context.Context, req rangeRequest) {
   380  	// add req head to trusted set of blocks
   381  	s.trusted.Add(req.end.Hash, struct{}{})
   382  	s.trusted.Add(req.end.ParentHash, struct{}{})
   383  
   384  	log := s.log.New("target", req.start, "end", req.end)
   385  
   386  	// clean up the completed in-flight requests
   387  	for k, v := range s.inFlight {
   388  		if v.Load() {
   389  			delete(s.inFlight, k)
   390  		}
   391  	}
   392  
   393  	// Now try to fetch lower numbers than current end, to traverse back towards the updated start.
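        	// For example, with req.start = 100 and req.end.Number = 105 this schedules requests
        	// for blocks 104, 103, 102 and 101, then stops once num <= req.start: block 105 is the
        	// trusted range end itself, and block 100 is the caller's existing head.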
   394  	for i := uint64(0); ; i++ {
   395  		num := req.end.Number - 1 - i
   396  		if num <= req.start {
   397  			return
   398  		}
   399  		// check if we have something in quarantine already
   400  		if h, ok := s.quarantineByNum[num]; ok {
   401  			if s.trusted.Contains(h) { // if we trust it, try to promote it.
   402  				s.tryPromote(h)
   403  			}
   404  			// Don't fetch things that we have a candidate for already.
   405  			// We'll evict it from quarantine by finding a conflict, or if we sync enough other blocks
   406  			continue
   407  		}
   408  
   409  		if _, ok := s.inFlight[num]; ok {
   410  			log.Debug("request still in-flight, not rescheduling sync request", "num", num)
   411  			continue // request still in flight
   412  		}
   413  		pr := peerRequest{num: num, complete: new(atomic.Bool)}
   414  
   415  		log.Debug("Scheduling P2P block request", "num", num)
   416  		// schedule number
   417  		select {
   418  		case s.peerRequests <- pr:
   419  			s.inFlight[num] = pr.complete
   420  		case <-ctx.Done():
   421  			log.Info("did not schedule full P2P sync range", "current", num, "err", ctx.Err())
   422  			return
   423  		default: // peers may all be busy processing requests already
   424  			log.Info("no peers ready to handle more P2P block requests for L2 block history", "current", num)
   425  			return
   426  		}
   427  	}
   428  }
   429  
   430  func (s *SyncClient) onQuarantineEvict(key common.Hash, value syncResult) {
   431  	delete(s.quarantineByNum, uint64(value.payload.ExecutionPayload.BlockNumber))
   432  	s.metrics.PayloadsQuarantineSize(s.quarantine.Len())
   433  	if !s.trusted.Contains(key) {
   434  		s.log.Debug("evicting untrusted payload from quarantine", "id", value.payload.ExecutionPayload.ID(), "peer", value.peer)
   435  		// Down-score peer for having provided us a bad block that never turned out to be canonical
   436  		s.appScorer.onRejectedPayload(value.peer)
   437  	} else {
   438  		s.log.Debug("evicting trusted payload from quarantine", "id", value.payload.ExecutionPayload.ID(), "peer", value.peer)
   439  	}
   440  }
   441  
   442  func (s *SyncClient) tryPromote(h common.Hash) {
   443  	parentRes, ok := s.quarantine.Get(h)
   444  	if ok {
   445  		// Simply reschedule the result, to get it (and possibly its parents) out of quarantine without recursion.
   446  		// s.results is buffered, but skip the promotion if the channel is full as it would cause a deadlock.
   447  		select {
   448  		case s.results <- parentRes:
   449  		default:
   450  			s.log.Debug("failed to signal block for promotion: sync client is too busy", "h", h)
   451  		}
   452  	} else {
   453  		s.log.Debug("cannot find block in quarantine, nothing to promote", "h", h)
   454  	}
   455  }
   456  
   457  func (s *SyncClient) promote(ctx context.Context, res syncResult) {
   458  	s.log.Debug("promoting p2p sync result", "payload", res.payload.ExecutionPayload.ID(), "peer", res.peer)
   459  
   460  	if err := s.receivePayload(ctx, res.peer, res.payload); err != nil {
   461  		s.log.Warn("failed to promote payload, receiver error", "err", err)
   462  		return
   463  	}
   464  	s.trusted.Add(res.payload.ExecutionPayload.BlockHash, struct{}{})
   465  	if s.quarantine.Remove(res.payload.ExecutionPayload.BlockHash) {
   466  		s.log.Debug("promoted previously p2p-synced block from quarantine to main", "id", res.payload.ExecutionPayload.ID())
   467  	} else {
   468  		s.log.Debug("promoted new p2p-synced block to main", "id", res.payload.ExecutionPayload.ID())
   469  	}
   470  
   471  	// Mark parent block as trusted, so that we can promote it once we receive it / find it
   472  	s.trusted.Add(res.payload.ExecutionPayload.ParentHash, struct{}{})
   473  
   474  	// Try to promote the parent block too, if any: previous unverifiable data may now be canonical
   475  	s.tryPromote(res.payload.ExecutionPayload.ParentHash)
   476  
   477  	// In case we don't have the parent, and what we have in quarantine is wrong,
   478  	// clear what we buffered in favor of fetching something else.
   479  	if h, ok := s.quarantineByNum[uint64(res.payload.ExecutionPayload.BlockNumber)-1]; ok {
   480  		s.quarantine.Remove(h)
   481  	}
   482  }
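
        // Note that because promote marks the parent hash as trusted and then calls tryPromote on it,
        // a single promotion can drain a whole chain of quarantined blocks: if block N is promoted and
        // block N-1 already sits in quarantine under that parent hash, it is rescheduled onto s.results
        // and promoted next, and so on down the range.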
   483  
   484  // onResult is exclusively called by the main loop, and thus has direct access to the request bookkeeping state.
   485  // This function always moves the result into quarantine first, and promotes it if its block hash is already trusted.
   486  func (s *SyncClient) onResult(ctx context.Context, res syncResult) {
   487  	payload := res.payload.ExecutionPayload
   488  	s.log.Debug("processing p2p sync result", "payload", payload.ID(), "peer", res.peer)
   489  	// Clean up the in-flight request, we have a result now.
   490  	delete(s.inFlight, uint64(payload.BlockNumber))
   491  	// Always put it in quarantine first. If promotion fails because the receiver is too busy, this functions as a cache.
   492  	s.quarantine.Add(payload.BlockHash, res)
   493  	s.quarantineByNum[uint64(payload.BlockNumber)] = payload.BlockHash
   494  	s.metrics.PayloadsQuarantineSize(s.quarantine.Len())
   495  	// If we know this block is canonical, then promote it
   496  	if s.trusted.Contains(payload.BlockHash) {
   497  		s.promote(ctx, res)
   498  	}
   499  }
   500  
   501  // peerLoop for syncing from a single peer
   502  func (s *SyncClient) peerLoop(ctx context.Context, id peer.ID) {
   503  	defer func() {
   504  		s.peersLock.Lock()
   505  		delete(s.peers, id) // clean up
   506  		s.log.Debug("stopped syncing loop of peer", "id", id)
   507  		s.wg.Done()
   508  		s.peersLock.Unlock()
   509  	}()
   510  
   511  	log := s.log.New("peer", id)
   512  	log.Info("Starting P2P sync client event loop")
   513  
   514  	// Implement the same rate limits as the server does per-peer,
   515  	// so we are not too aggressive toward the server.
   516  	rl := rate.NewLimiter(peerServerBlocksRateLimit, peerServerBlocksBurst)
   517  
   518  	for {
   519  		// wait for a global allocation to be available
   520  		if err := s.globalRL.Wait(ctx); err != nil {
   521  			return
   522  		}
   523  		// wait for peer to be available for more work
   524  		if err := rl.Wait(ctx); err != nil {
   525  			return
   526  		}
   527  
   528  		// once the peer is available, wait for a sync request.
   529  		select {
   530  		case pr := <-s.peerRequests:
   531  			// We already established the peer is available w.r.t. rate-limiting,
   532  			// and this is the only loop over this peer, so we can request now.
   533  			start := time.Now()
   534  			err := s.doRequest(ctx, id, pr.num)
   535  			if err != nil {
   536  				// mark as complete if there's an error: we are not sending any result and can complete immediately.
   537  				pr.complete.Store(true)
   538  				log.Warn("failed p2p sync request", "num", pr.num, "err", err)
   539  				s.appScorer.onResponseError(id)
   540  				// If we hit an error, then count it as many requests.
   541  				// We'd like to avoid making more requests for a while, to back off.
   542  				if err := rl.WaitN(ctx, clientErrRateCost); err != nil {
   543  					return
   544  				}
   545  			} else {
   546  				log.Debug("completed p2p sync request", "num", pr.num)
   547  				s.appScorer.onValidResponse(id)
   548  			}
   549  			took := time.Since(start)
   550  
   551  			resultCode := byte(0)
   552  			if err != nil {
   553  				if re, ok := err.(requestResultErr); ok {
   554  					resultCode = re.ResultCode()
   555  				} else {
   556  					resultCode = 1
   557  				}
   558  			}
   559  			s.metrics.ClientPayloadByNumberEvent(pr.num, resultCode, took)
   560  		case <-ctx.Done():
   561  			return
   562  		}
   563  	}
   564  }
   565  
   566  type requestResultErr byte
   567  
   568  func (r requestResultErr) Error() string {
   569  	return fmt.Sprintf("peer failed to serve request with code %d", uint8(r))
   570  }
   571  
   572  func (r requestResultErr) ResultCode() byte {
   573  	return byte(r)
   574  }
   575  
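        // Summary of the wire format implemented by doRequest below (and mirrored by
        // ReqRespServer.handleSyncRequest on the serving side):
        //
        //	request:  the block number as a little-endian uint64 (8 bytes)
        //	response: 1 byte result code (0 = success, anything else becomes a requestResultErr),
        //	          4 bytes little-endian uint32 version (0 = pre-Ecotone ExecutionPayload,
        //	          1 = ExecutionPayloadEnvelope), followed by the SSZ-encoded payload compressed
        //	          with framed snappy; reads are capped at maxGossipSize before and after decompression.
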
   576  func (s *SyncClient) doRequest(ctx context.Context, id peer.ID, expectedBlockNum uint64) error {
   577  	// open stream to peer
   578  	reqCtx, reqCancel := context.WithTimeout(ctx, streamTimeout)
   579  	str, err := s.newStreamFn(reqCtx, id, s.payloadByNumber)
   580  	reqCancel()
   581  	if err != nil {
   582  		return fmt.Errorf("failed to open stream: %w", err)
   583  	}
   584  	defer str.Close()
   585  	// set write timeout (if available)
   586  	_ = str.SetWriteDeadline(time.Now().Add(clientWriteRequestTimeout))
   587  	if err := binary.Write(str, binary.LittleEndian, expectedBlockNum); err != nil {
   588  		return fmt.Errorf("failed to write request (%d): %w", expectedBlockNum, err)
   589  	}
   590  	if err := str.CloseWrite(); err != nil {
   591  		return fmt.Errorf("failed to close writer side while making request: %w", err)
   592  	}
   593  
   594  	// set read timeout (if available)
   595  	_ = str.SetReadDeadline(time.Now().Add(clientReadResponseTimeout))
   596  
   597  	// Limit input, as well as output.
   598  	// Compression may otherwise continue to read ignored data for a small output,
   599  	// or output more data than desired (zip-bomb)
   600  	r := io.LimitReader(str, maxGossipSize)
   601  	var result [1]byte
   602  	if _, err := io.ReadFull(r, result[:]); err != nil {
   603  		return fmt.Errorf("failed to read result part of response: %w", err)
   604  	}
   605  	if res := result[0]; res != 0 {
   606  		return requestResultErr(res)
   607  	}
   608  	var versionData [4]byte
   609  	if _, err := io.ReadFull(r, versionData[:]); err != nil {
   610  		return fmt.Errorf("failed to read version part of response: %w", err)
   611  	}
   612  	version := binary.LittleEndian.Uint32(versionData[:])
   613  	if version != 0 && version != 1 {
   614  		return fmt.Errorf("unrecognized version: %d", version)
   615  	}
   616  	// payload is SSZ encoded with Snappy framed compression
   617  	r = snappy.NewReader(r)
   618  	r = io.LimitReader(r, maxGossipSize)
   619  	// We cannot stream straight into the SSZ decoder, since we need the scope of the SSZ payload.
   620  	// The server does not prepend it, nor would we trust a claimed length anyway, so we buffer the data we get.
   621  	data, err := io.ReadAll(r)
   622  	if err != nil {
   623  		return fmt.Errorf("failed to read response: %w", err)
   624  	}
   625  
   626  	envelope := &eth.ExecutionPayloadEnvelope{}
   627  
   628  	if version == 0 {
   629  		expectedBlockTime := s.cfg.TimestampForBlock(expectedBlockNum)
   630  		envelope, err = s.readExecutionPayload(data, expectedBlockTime)
   631  		if err != nil {
   632  			return err
   633  		}
   634  	} else if version == 1 {
   635  		if err := envelope.UnmarshalSSZ(uint32(len(data)), bytes.NewReader(data)); err != nil {
   636  			return fmt.Errorf("failed to decode execution payload envelope response: %w", err)
   637  		}
   638  	} else {
   639  		panic(fmt.Errorf("should have already filtered by version, but got: %d", version))
   640  	}
   641  
   642  	if err := str.CloseRead(); err != nil {
   643  		return fmt.Errorf("failed to close reading side: %w", err)
   644  	}
   645  	if err := verifyBlock(envelope, expectedBlockNum); err != nil {
   646  		return fmt.Errorf("received execution payload is invalid: %w", err)
   647  	}
   648  	select {
   649  	case s.results <- syncResult{payload: envelope, peer: id}:
   650  	case <-ctx.Done():
   651  		return fmt.Errorf("failed to process response, sync client is too busy: %w", ctx.Err())
   652  	}
   653  	return nil
   654  }
   655  
   656  func (s *SyncClient) readExecutionPayload(data []byte, expectedTime uint64) (*eth.ExecutionPayloadEnvelope, error) {
   657  	blockVersion := eth.BlockV1
   658  	if s.cfg.IsCanyon(expectedTime) {
   659  		blockVersion = eth.BlockV2
   660  	}
   661  
   662  	var res eth.ExecutionPayload
   663  	if err := res.UnmarshalSSZ(blockVersion, uint32(len(data)), bytes.NewReader(data)); err != nil {
   664  		return nil, fmt.Errorf("failed to decode response: %w", err)
   665  	}
   666  
   667  	return &eth.ExecutionPayloadEnvelope{ExecutionPayload: &res}, nil
   668  }
   669  
   670  func verifyBlock(envelope *eth.ExecutionPayloadEnvelope, expectedNum uint64) error {
   671  	payload := envelope.ExecutionPayload
   672  
   673  	// verify L2 block
   674  	if expectedNum != uint64(payload.BlockNumber) {
   675  		return fmt.Errorf("received execution payload for block %d, but expected block %d", payload.BlockNumber, expectedNum)
   676  	}
   677  	actual, ok := envelope.CheckBlockHash()
   678  	if !ok { // payload itself contains bad block hash
   679  		return fmt.Errorf("received execution payload for block %d with bad block hash %s, expected %s", expectedNum, payload.BlockHash, actual)
   680  	}
   681  	return nil
   682  }
   683  
   684  // peerStat maintains rate-limiting data of a peer that requests blocks from us.
   685  type peerStat struct {
   686  	// Requests tokenizes each request to sync
   687  	Requests *rate.Limiter
   688  }
   689  
   690  type L2Chain interface {
   691  	PayloadByNumber(ctx context.Context, number uint64) (*eth.ExecutionPayloadEnvelope, error)
   692  }
   693  
   694  type ReqRespServerMetrics interface {
   695  	ServerPayloadByNumberEvent(num uint64, resultCode byte, duration time.Duration)
   696  }
   697  
   698  type ReqRespServer struct {
   699  	cfg *rollup.Config
   700  
   701  	l2 L2Chain
   702  
   703  	metrics ReqRespServerMetrics
   704  
   705  	peerRateLimits *simplelru.LRU[peer.ID, *peerStat]
   706  	peerStatsLock  sync.Mutex
   707  
   708  	globalRequestsRL *rate.Limiter
   709  }
   710  
   711  func NewReqRespServer(cfg *rollup.Config, l2 L2Chain, metrics ReqRespServerMetrics) *ReqRespServer {
   712  	// We should never allow over 1000 different peers to churn through quickly,
   713  	// so it's fine to prune rate-limit details past this.
   714  
   715  	peerRateLimits, _ := simplelru.NewLRU[peer.ID, *peerStat](1000, nil)
   716  	globalRequestsRL := rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst)
   717  
   718  	return &ReqRespServer{
   719  		cfg:              cfg,
   720  		l2:               l2,
   721  		metrics:          metrics,
   722  		peerRateLimits:   peerRateLimits,
   723  		globalRequestsRL: globalRequestsRL,
   724  	}
   725  }
   726  
   727  // HandleSyncRequest is a stream handler function for the L2 unsafe-payloads alt-sync protocol.
   728  // See MakeStreamHandler to transform this into a LibP2P handler function.
   729  //
   730  // Note that the same peer may open parallel streams.
   731  //
   732  // The caller must Close the stream.
   733  func (srv *ReqRespServer) HandleSyncRequest(ctx context.Context, log log.Logger, stream network.Stream) {
   734  	// the served block number (req below) may stay 0 if we fail to decode the request
   735  	start := time.Now()
   736  
   737  	// We wait as long as necessary; we throttle the peer instead of disconnecting,
   738  	// unless the delay reaches a threshold that is unreasonable to wait for.
   739  	ctx, cancel := context.WithTimeout(ctx, maxThrottleDelay)
   740  	req, err := srv.handleSyncRequest(ctx, stream)
   741  	cancel()
   742  
   743  	resultCode := byte(0)
   744  	if err != nil {
   745  		log.Warn("failed to serve p2p sync request", "req", req, "err", err)
   746  		if errors.Is(err, ethereum.NotFound) {
   747  			resultCode = 1
   748  		} else if errors.Is(err, invalidRequestErr) {
   749  			resultCode = 2
   750  		} else {
   751  			resultCode = 3
   752  		}
   753  		// try to write error code, so the other peer can understand the reason for failure.
   754  		_, _ = stream.Write([]byte{resultCode})
   755  	} else {
   756  		log.Debug("successfully served sync response", "req", req)
   757  	}
   758  	srv.metrics.ServerPayloadByNumberEvent(req, resultCode, time.Since(start))
   759  }
   760  
   761  var invalidRequestErr = errors.New("invalid request")
   762  
   763  func (srv *ReqRespServer) handleSyncRequest(ctx context.Context, stream network.Stream) (uint64, error) {
   764  	peerId := stream.Conn().RemotePeer()
   765  
   766  	// take a token from the global rate-limiter,
   767  	// to make sure there's not too much concurrent server work between different peers.
   768  	if err := srv.globalRequestsRL.Wait(ctx); err != nil {
   769  		return 0, fmt.Errorf("timed out waiting for global sync rate limit: %w", err)
   770  	}
   771  
   772  	// find rate limiting data of peer, or add otherwise
   773  	srv.peerStatsLock.Lock()
   774  	ps, _ := srv.peerRateLimits.Get(peerId)
   775  	if ps == nil {
   776  		ps = &peerStat{
   777  			Requests: rate.NewLimiter(peerServerBlocksRateLimit, peerServerBlocksBurst),
   778  		}
   779  		srv.peerRateLimits.Add(peerId, ps)
   780  		ps.Requests.Reserve() // count the hit, but make it delay the next request rather than immediately waiting
   781  	} else {
   782  		// Only wait if it's an existing peer, otherwise the instant rate-limit Wait call always errors.
   783  
   784  		// If the requester thinks we're taking too long, then it's their problem and they can disconnect.
   785  		// We'll disconnect ourselves only when failing to read/write,
   786  		// if the work is invalid (range validation), or when individual sub tasks timeout.
   787  		if err := ps.Requests.Wait(ctx); err != nil {
   788  			return 0, fmt.Errorf("timed out waiting for peer sync rate limit: %w", err)
   789  		}
   790  	}
   791  	srv.peerStatsLock.Unlock()
   792  
   793  	// Set read deadline, if available
   794  	_ = stream.SetReadDeadline(time.Now().Add(serverReadRequestTimeout))
   795  
   796  	// Read the request
   797  	var req uint64
   798  	if err := binary.Read(stream, binary.LittleEndian, &req); err != nil {
   799  		return 0, fmt.Errorf("failed to read requested block number: %w", err)
   800  	}
   801  	if err := stream.CloseRead(); err != nil {
   802  		return req, fmt.Errorf("failed to close reading-side of a P2P sync request call: %w", err)
   803  	}
   804  
   805  	// Check the request is within the expected range of blocks
   806  	if req < srv.cfg.Genesis.L2.Number {
   807  		return req, fmt.Errorf("cannot serve request for L2 block %d before genesis %d: %w", req, srv.cfg.Genesis.L2.Number, invalidRequestErr)
   808  	}
   809  	max, err := srv.cfg.TargetBlockNumber(uint64(time.Now().Unix()))
   810  	if err != nil {
   811  		return req, fmt.Errorf("cannot determine max target block number to verify request: %w", invalidRequestErr)
   812  	}
   813  	if req > max {
   814  		return req, fmt.Errorf("cannot serve request for L2 block %d after max expected block (%v): %w", req, max, invalidRequestErr)
   815  	}
   816  
   817  	envelope, err := srv.l2.PayloadByNumber(ctx, req)
   818  	if err != nil {
   819  		if errors.Is(err, ethereum.NotFound) {
   820  			return req, fmt.Errorf("peer requested unknown block by number: %w", err)
   821  		} else {
   822  			return req, fmt.Errorf("failed to retrieve payload to serve to peer: %w", err)
   823  		}
   824  	}
   825  
   826  	// We set write deadline, if available, to safely write without blocking on a throttling peer connection
   827  	_ = stream.SetWriteDeadline(time.Now().Add(serverWriteChunkTimeout))
   828  
   829  	w := snappy.NewBufferedWriter(stream)
   830  
   831  	if srv.cfg.IsEcotone(uint64(envelope.ExecutionPayload.Timestamp)) {
   832  		// 0 - resultCode: success = 0
   833  		// 1:5 - version: 1 (little endian)
   834  		tmp := [5]byte{0, 1, 0, 0, 0}
   835  		if _, err := stream.Write(tmp[:]); err != nil {
   836  			return req, fmt.Errorf("failed to write response header data: %w", err)
   837  		}
   838  		if _, err := envelope.MarshalSSZ(w); err != nil {
   839  			return req, fmt.Errorf("failed to write payload to sync response: %w", err)
   840  		}
   841  	} else {
   842  		// 0 - resultCode: success = 0
   843  		// 1:5 - version: 0
   844  		var tmp [5]byte
   845  		if _, err := stream.Write(tmp[:]); err != nil {
   846  			return req, fmt.Errorf("failed to write response header data: %w", err)
   847  		}
   848  		if _, err := envelope.ExecutionPayload.MarshalSSZ(w); err != nil {
   849  			return req, fmt.Errorf("failed to write payload to sync response: %w", err)
   850  		}
   851  	}
   852  
   853  	if err := w.Close(); err != nil {
   854  		return req, fmt.Errorf("failed to finish writing payload to sync response: %w", err)
   855  	}
   856  
   857  	return req, nil
   858  }