github.com/ethereum-optimism/optimism@v1.7.2/op-node/p2p/sync.go

package p2p

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math/big"
	"sync"
	"sync/atomic"
	"time"

	"github.com/golang/snappy"
	"github.com/hashicorp/golang-lru/v2/simplelru"
	"github.com/libp2p/go-libp2p/core/network"
	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/core/protocol"
	"golang.org/x/time/rate"

	"github.com/ethereum/go-ethereum"
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/log"

	"github.com/ethereum-optimism/optimism/op-node/rollup"
	"github.com/ethereum-optimism/optimism/op-service/eth"
)

// StreamCtxFn provides a new context to use when handling stream requests
type StreamCtxFn func() context.Context

// Note: the mocknet in testing does not support read/write stream timeouts; the timeouts are only applied if available.
// Rate-limits always apply, and make sure the request/response throughput is not too fast, rather than too slow.
const (
	// timeout for opening a req-resp stream to another peer. This may involve some protocol negotiation.
	streamTimeout = time.Second * 5
	// timeout for writing the request as client. Can be as long as serverReadRequestTimeout
	clientWriteRequestTimeout = time.Second * 10
	// timeout for reading a response of a serving peer as client. Can be as long as serverWriteChunkTimeout
	clientReadResponsetimeout = time.Second * 10
	// timeout for reading the request content, deny the request if it cannot be fully read in time
	serverReadRequestTimeout = time.Second * 10
	// timeout for writing a single response message chunk
	// (if a future response consists of multiple chunks, reset the writing timeout per chunk)
	serverWriteChunkTimeout = time.Second * 10
	// after the rate-limit reservation hits the max throttle delay, give up on serving a request and just close the stream
	maxThrottleDelay = time.Second * 20
	// Do not serve more than 20 requests per second
	globalServerBlocksRateLimit rate.Limit = 20
	// Allows a burst of 2x our rate limit
	globalServerBlocksBurst = 40
	// Do not serve more than 4 requests per second to the same peer, so we can serve other peers at the same time
	peerServerBlocksRateLimit rate.Limit = 4
	// Allow a peer to request 30s of blocks at once
	peerServerBlocksBurst = 15
	// If the client hits a request error, it counts as a lot of rate-limit tokens for syncing from that peer:
	// we would rather sync from other servers. We'll try again later,
	// and eventually kick the peer based on degraded scoring if it's really not serving us well.
	// TODO(CLI-4009): Use a backoff rather than this mechanism.
	clientErrRateCost = peerServerBlocksBurst
)
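// Illustrative sketch, not part of the upstream file: how the per-peer limits above pace
// a syncing client. With the implied 2s L2 block time, the burst of 15 covers roughly 30s
// worth of blocks; once the burst is spent, Wait paces requests at 4 per second. The
// request callback is a hypothetical stand-in for the client's doRequest.
func examplePeerRateLimiterPacing(ctx context.Context, request func(num uint64) error) error {
	rl := rate.NewLimiter(peerServerBlocksRateLimit, peerServerBlocksBurst)
	for num := uint64(100); num > 80; num-- {
		// Blocks until a token is available (immediate within the initial burst).
		if err := rl.Wait(ctx); err != nil {
			return err
		}
		if err := request(num); err != nil {
			// Mirror the client behavior below: a failed request consumes extra tokens,
			// so we back off from this peer for a while.
			if err := rl.WaitN(ctx, clientErrRateCost); err != nil {
				return err
			}
		}
	}
	return nil
}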
func PayloadByNumberProtocolID(l2ChainID *big.Int) protocol.ID {
	return protocol.ID(fmt.Sprintf("/opstack/req/payload_by_number/%d/0", l2ChainID))
}
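// Illustrative, not part of the upstream file: for OP Mainnet (L2 chain ID 10) the
// protocol ID above resolves to "/opstack/req/payload_by_number/10/0".
var examplePayloadByNumberProtocolID = PayloadByNumberProtocolID(big.NewInt(10))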
type requestHandlerFn func(ctx context.Context, log log.Logger, stream network.Stream)

func MakeStreamHandler(resourcesCtx context.Context, log log.Logger, fn requestHandlerFn) network.StreamHandler {
	return func(stream network.Stream) {
		log := log.New("peer", stream.Conn().ID(), "remote", stream.Conn().RemoteMultiaddr())
		defer func() {
			if err := recover(); err != nil {
				log.Error("p2p server request handling panic", "err", err, "protocol", stream.Protocol())
			}
		}()
		defer stream.Close()
		fn(resourcesCtx, log, stream)
	}
}

type newStreamFn func(ctx context.Context, peerId peer.ID, protocolId ...protocol.ID) (network.Stream, error)

type receivePayloadFn func(ctx context.Context, from peer.ID, payload *eth.ExecutionPayloadEnvelope) error

type rangeRequest struct {
	start uint64
	end   eth.L2BlockRef
}

type syncResult struct {
	payload *eth.ExecutionPayloadEnvelope
	peer    peer.ID
}

type peerRequest struct {
	num uint64

	complete *atomic.Bool
}

type inFlightCheck struct {
	num uint64

	result chan bool
}

type SyncClientMetrics interface {
	ClientPayloadByNumberEvent(num uint64, resultCode byte, duration time.Duration)
	PayloadsQuarantineSize(n int)
}

type SyncPeerScorer interface {
	onValidResponse(id peer.ID)
	onResponseError(id peer.ID)
	onRejectedPayload(id peer.ID)
}

// SyncClient implements a reverse chain sync with a minimal interface:
// signal the desired range, and receive blocks within this range back.
// Through parent-hash verification, received blocks are all ensured to be part of the canonical chain at one point,
// but it is up to the user to organize and process the results further.
//
// For the sync-client to retrieve any data, peers must be added with AddPeer(id), and removed upon disconnect with RemovePeer(id).
// The client is started with Start(), and may be started before or after changing any peers.
//
// ### Stages
//
// The sync mechanism is implemented as follows:
//   - User sends range request: blocks on sync main loop (with ctx timeout)
//   - Main loop processes range request (from high to low), dividing block requests by number between parallel peers.
//     - The high part of the range has a known block-hash, and is marked as trusted.
//     - Once there are no more peers available for buffering requests, we stop the range request processing.
//     - Every request buffered for a peer is tracked as in-flight, by block number.
//     - In-flight requests are not repeated.
//     - Requests for data that's already in the quarantine are not repeated.
//     - Data already in the quarantine that is trusted is attempted to be promoted.
//
//   - Peers each have their own routine for processing requests.
//     - They fetch the requested block by number, parse and validate it, and then send it back to the main loop.
//     - If peers fail to fetch or process it, or fail to send it back to the main loop within timeout,
//       then doRequest returns an error. It then marks the in-flight request as completed.
//
//   - Main loop receives results synchronously with the range requests.
//     - The result is removed from the in-flight tracker.
//     - The result is added to the quarantine.
//     - If we trust the hash, we try to promote the result.
//
// ### Concepts
//
// The main concepts are:
//   - Quarantine: an LRU that stores the latest fetched block data, by hash as well as an extra index by number.
//
//   - Quarantine eviction: upon regular LRU eviction, or explicit removal (when we learn data is not canonical),
//     the sync result is removed from quarantine without being forwarded to the receiver.
//     The peer that provided the data may be down-scored for providing un-utilized data if the data
//     is not trusted during eviction.
//
//   - Trusted data: data becomes trusted in 2 ways:
//     - The hash / parent-hash of the sync target is marked as trusted.
//     - The parent-hash of any promoted data is marked as trusted.
//
//   - The trusted data is maintained in an LRU: we only care about the recently accessed blocks.
//
//   - Result promotion: content from the quarantine is "promoted" when we find the block-hash is trusted.
//     The data is removed from the quarantine, and forwarded to the receiver.
//
// ### Usage
//
// The user is expected to request the range of blocks between its existing chain head
// and a trusted future block-hash as reference to sync towards.
// Upon receiving results from the sync-client, the user should adjust down its sync-target
// based on the received results, to avoid duplicating work when re-requesting an updated range.
// Range requests should still be repeated eventually, however, as the sync client will give up on syncing a large range
// when it's too busy syncing.
// (An illustrative usage sketch follows NewSyncClient below.)
//
// The rationale for this approach is that this sync mechanism is primarily intended
// for quickly filling gaps between an existing chain and a gossip chain, and not for very long block ranges.
// Syncing in the execution-layer (through snap-sync) is more appropriate for long ranges.
// If the user does sync a long range of blocks through this mechanism,
// it does end up traversing through the chain, but receives the blocks in reverse order.
// It is up to the user to persist the blocks for later processing, or drop & resync them if persistence is limited.
type SyncClient struct {
	log log.Logger

	cfg *rollup.Config

	metrics   SyncClientMetrics
	appScorer SyncPeerScorer

	newStreamFn     newStreamFn
	payloadByNumber protocol.ID

	peersLock sync.Mutex
	// syncing worker per peer
	peers map[peer.ID]context.CancelFunc

	// trusted blocks are, or have been, canonical at one point.
	// Everything that's trusted is acceptable to pass to the sync receiver,
	// but we target to just sync the blocks of the latest canonical view of the chain.
	trusted *simplelru.LRU[common.Hash, struct{}]

	// quarantine is an LRU of untrusted results: blocks that could not be verified yet
	quarantine *simplelru.LRU[common.Hash, syncResult]
	// quarantineByNum indexes the quarantine contents by number.
	// No duplicates here, only the latest quarantine write is indexed.
	// This map is cleared upon evictions of items from the quarantine LRU.
	quarantineByNum map[uint64]common.Hash

	// inFlight requests are not repeated
	inFlight map[uint64]*atomic.Bool

	requests       chan rangeRequest
	peerRequests   chan peerRequest
	inFlightChecks chan inFlightCheck

	results chan syncResult

	receivePayload receivePayloadFn

	// Global rate limiter for all peers.
	globalRL *rate.Limiter

	// resource context: all peers and mainLoop tasks inherit this, and start shutting down once resCancel() is called.
	resCtx    context.Context
	resCancel context.CancelFunc

	// wait group: wait for the resources to close. Adding to this is only safe if the peersLock is held.
	wg sync.WaitGroup

	// Don't allow anything to be added to the wait-group while, or after, we are shutting down.
	// This is protected by peersLock.
	closingPeers bool
}
func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rcv receivePayloadFn, metrics SyncClientMetrics, appScorer SyncPeerScorer) *SyncClient {
	ctx, cancel := context.WithCancel(context.Background())

	c := &SyncClient{
		log:             log,
		cfg:             cfg,
		metrics:         metrics,
		appScorer:       appScorer,
		newStreamFn:     newStream,
		payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID),
		peers:           make(map[peer.ID]context.CancelFunc),
		quarantineByNum: make(map[uint64]common.Hash),
		inFlight:        make(map[uint64]*atomic.Bool),
		requests:        make(chan rangeRequest), // blocking
		peerRequests:    make(chan peerRequest, 128),
		results:         make(chan syncResult, 128),
		inFlightChecks:  make(chan inFlightCheck, 128),
		globalRL:        rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst),
		resCtx:          ctx,
		resCancel:       cancel,
		receivePayload:  rcv,
	}
	// never errors with positive LRU cache size
	// TODO(CLI-3733): if we had an LRU based on total payloads size, instead of payload count,
	// we can safely buffer more data in the happy case.
	q, _ := simplelru.NewLRU[common.Hash, syncResult](100, c.onQuarantineEvict)
	c.quarantine = q
	trusted, _ := simplelru.NewLRU[common.Hash, struct{}](10000, nil)
	c.trusted = trusted
	return c
}
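// Illustrative usage sketch, not part of the upstream file. The newStream plumbing, metrics,
// scorer, peer list and start/end refs are assumed to come from the caller's p2p and chain
// setup; the receiver callback here just logs the payloads it is handed.
func exampleSyncClientUsage(ctx context.Context, logger log.Logger, cfg *rollup.Config,
	newStream newStreamFn, metrics SyncClientMetrics, scorer SyncPeerScorer,
	peers []peer.ID, start, end eth.L2BlockRef) error {
	receive := func(ctx context.Context, from peer.ID, payload *eth.ExecutionPayloadEnvelope) error {
		logger.Info("received p2p-synced payload", "id", payload.ExecutionPayload.ID(), "peer", from)
		return nil // a real caller would persist or process the block here
	}
	cl := NewSyncClient(logger, cfg, newStream, receive, metrics, scorer)
	cl.Start()
	defer cl.Close()
	for _, id := range peers {
		cl.AddPeer(id)
	}
	// start is the local head, end a trusted target to sync towards;
	// blocks arrive in reverse order via the receive callback above.
	return cl.RequestL2Range(ctx, start, end)
}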
func (s *SyncClient) Start() {
	s.peersLock.Lock()
	s.wg.Add(1)
	s.peersLock.Unlock()
	go s.mainLoop()
}

func (s *SyncClient) AddPeer(id peer.ID) {
	s.peersLock.Lock()
	defer s.peersLock.Unlock()
	if s.closingPeers {
		return
	}
	if _, ok := s.peers[id]; ok {
		s.log.Warn("cannot register peer for sync duties, peer was already registered", "peer", id)
		return
	}
	s.wg.Add(1)
	// add new peer routine
	ctx, cancel := context.WithCancel(s.resCtx)
	s.peers[id] = cancel
	go s.peerLoop(ctx, id)
}

func (s *SyncClient) RemovePeer(id peer.ID) {
	s.peersLock.Lock()
	defer s.peersLock.Unlock()
	cancel, ok := s.peers[id]
	if !ok {
		s.log.Warn("cannot remove peer from sync duties, peer was not registered", "peer", id)
		return
	}
	cancel() // once loop exits
	delete(s.peers, id)
}

// Close will shut down the sync client and all attached work, and block until shutdown is complete.
// This will block if Start() has not created the main background loop.
func (s *SyncClient) Close() error {
	s.peersLock.Lock()
	s.closingPeers = true
	s.peersLock.Unlock()
	s.resCancel()
	s.wg.Wait()
	return nil
}

func (s *SyncClient) RequestL2Range(ctx context.Context, start, end eth.L2BlockRef) error {
	if end == (eth.L2BlockRef{}) {
		s.log.Debug("P2P sync client received range signal, but cannot sync open-ended chain: need sync target to verify blocks through parent-hashes", "start", start)
		return nil
	}
	// synchronize requests with the main loop for state access
	select {
	case s.requests <- rangeRequest{start: start.Number, end: end}:
		return nil
	case <-ctx.Done():
		return fmt.Errorf("too busy with P2P results/requests: %w", ctx.Err())
	}
}

const (
	maxRequestScheduling = time.Second * 3
	maxResultProcessing  = time.Second * 3
)

func (s *SyncClient) mainLoop() {
	defer s.wg.Done()
	for {
		select {
		case req := <-s.requests:
			ctx, cancel := context.WithTimeout(s.resCtx, maxRequestScheduling)
			s.onRangeRequest(ctx, req)
			cancel()
		case res := <-s.results:
			ctx, cancel := context.WithTimeout(s.resCtx, maxResultProcessing)
			s.onResult(ctx, res)
			cancel()
		case check := <-s.inFlightChecks:
			s.log.Info("Checking in flight", "num", check.num)
			complete, ok := s.inFlight[check.num]
			if !ok {
				check.result <- false
			} else {
				check.result <- !complete.Load()
			}
		case <-s.resCtx.Done():
			s.log.Info("stopped P2P req-resp L2 block sync client")
			return
		}
	}
}

func (s *SyncClient) isInFlight(ctx context.Context, num uint64) (bool, error) {
	check := inFlightCheck{num: num, result: make(chan bool, 1)}
	select {
	case s.inFlightChecks <- check:
	case <-ctx.Done():
		return false, errors.New("context cancelled when publishing in flight check")
	}
	select {
	case res := <-check.result:
		return res, nil
	case <-ctx.Done():
		return false, errors.New("context cancelled while waiting for in flight check response")
	}
}
// onRangeRequest is exclusively called by the main loop, and thus has direct access to the request bookkeeping state.
// This function transforms requested block ranges into work for each peer.
func (s *SyncClient) onRangeRequest(ctx context.Context, req rangeRequest) {
	// add req head to trusted set of blocks
	s.trusted.Add(req.end.Hash, struct{}{})
	s.trusted.Add(req.end.ParentHash, struct{}{})

	log := s.log.New("target", req.start, "end", req.end)

	// clean up the completed in-flight requests
	for k, v := range s.inFlight {
		if v.Load() {
			delete(s.inFlight, k)
		}
	}

	// Now try to fetch lower numbers than the current end, to traverse back towards the updated start.
	for i := uint64(0); ; i++ {
		num := req.end.Number - 1 - i
		if num <= req.start {
			return
		}
		// check if we have something in quarantine already
		if h, ok := s.quarantineByNum[num]; ok {
			if s.trusted.Contains(h) { // if we trust it, try to promote it.
				s.tryPromote(h)
			}
			// Don't fetch things that we have a candidate for already.
			// We'll evict it from quarantine by finding a conflict, or if we sync enough other blocks.
			continue
		}

		if _, ok := s.inFlight[num]; ok {
			log.Debug("request still in-flight, not rescheduling sync request", "num", num)
			continue // request still in flight
		}
		pr := peerRequest{num: num, complete: new(atomic.Bool)}

		log.Debug("Scheduling P2P block request", "num", num)
		// schedule number
		select {
		case s.peerRequests <- pr:
			s.inFlight[num] = pr.complete
		case <-ctx.Done():
			log.Info("did not schedule full P2P sync range", "current", num, "err", ctx.Err())
			return
		default: // peers may all be busy processing requests already
			log.Info("no peers ready to handle block requests for more P2P requests for L2 block history", "current", num)
			return
		}
	}
}
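// Worked example (not part of the upstream file): for a request with start=100 and
// end.Number=110, the loop above schedules block numbers 109 down to 101. Both ends are
// excluded: the start block is already held locally, and the end block is the trusted
// target whose hash and parent-hash were just added to the trusted set for verification.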
func (s *SyncClient) onQuarantineEvict(key common.Hash, value syncResult) {
	delete(s.quarantineByNum, uint64(value.payload.ExecutionPayload.BlockNumber))
	s.metrics.PayloadsQuarantineSize(s.quarantine.Len())
	if !s.trusted.Contains(key) {
		s.log.Debug("evicting untrusted payload from quarantine", "id", value.payload.ExecutionPayload.ID(), "peer", value.peer)
		// Down-score peer for having provided us a bad block that never turned out to be canonical
		s.appScorer.onRejectedPayload(value.peer)
	} else {
		s.log.Debug("evicting trusted payload from quarantine", "id", value.payload.ExecutionPayload.ID(), "peer", value.peer)
	}
}

func (s *SyncClient) tryPromote(h common.Hash) {
	parentRes, ok := s.quarantine.Get(h)
	if ok {
		// Simply reschedule the result, to get it (and possibly its parents) out of quarantine without recursion.
		// s.results is buffered, but skip the promotion if the channel is full, as it would cause a deadlock.
		select {
		case s.results <- parentRes:
		default:
			s.log.Debug("failed to signal block for promotion: sync client is too busy", "h", h)
		}
	} else {
		s.log.Debug("cannot find block in quarantine, nothing to promote", "h", h)
	}
}

func (s *SyncClient) promote(ctx context.Context, res syncResult) {
	s.log.Debug("promoting p2p sync result", "payload", res.payload.ExecutionPayload.ID(), "peer", res.peer)

	if err := s.receivePayload(ctx, res.peer, res.payload); err != nil {
		s.log.Warn("failed to promote payload, receiver error", "err", err)
		return
	}
	s.trusted.Add(res.payload.ExecutionPayload.BlockHash, struct{}{})
	if s.quarantine.Remove(res.payload.ExecutionPayload.BlockHash) {
		s.log.Debug("promoted previously p2p-synced block from quarantine to main", "id", res.payload.ExecutionPayload.ID())
	} else {
		s.log.Debug("promoted new p2p-synced block to main", "id", res.payload.ExecutionPayload.ID())
	}

	// Mark the parent block as trusted, so that we can promote it once we receive it / find it
	s.trusted.Add(res.payload.ExecutionPayload.ParentHash, struct{}{})

	// Try to promote the parent block too, if any: previously unverifiable data may now be canonical
	s.tryPromote(res.payload.ExecutionPayload.ParentHash)

	// In case we don't have the parent, and what we have in quarantine is wrong,
	// clear what we buffered in favor of fetching something else.
	if h, ok := s.quarantineByNum[uint64(res.payload.ExecutionPayload.BlockNumber)-1]; ok {
		s.quarantine.Remove(h)
	}
}

// onResult is exclusively called by the main loop, and thus has direct access to the request bookkeeping state.
// This function verifies if the result is canonical, and either promotes the result or moves it into quarantine.
func (s *SyncClient) onResult(ctx context.Context, res syncResult) {
	payload := res.payload.ExecutionPayload
	s.log.Debug("processing p2p sync result", "payload", payload.ID(), "peer", res.peer)
	// Clean up the in-flight request, we have a result now.
	delete(s.inFlight, uint64(payload.BlockNumber))
	// Always put it in quarantine first. If promotion fails because the receiver is too busy, this functions as a cache.
	s.quarantine.Add(payload.BlockHash, res)
	s.quarantineByNum[uint64(payload.BlockNumber)] = payload.BlockHash
	s.metrics.PayloadsQuarantineSize(s.quarantine.Len())
	// If we know this block is canonical, then promote it
	if s.trusted.Contains(payload.BlockHash) {
		s.promote(ctx, res)
	}
}
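// Worked example (not part of the upstream file): if block N is promoted, its parent-hash
// becomes trusted, so a quarantined block N-1 with that hash is promoted on the next pass,
// and so on. The quarantine therefore drains in reverse-chain order as results are verified.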
// peerLoop for syncing from a single peer
func (s *SyncClient) peerLoop(ctx context.Context, id peer.ID) {
	defer func() {
		s.peersLock.Lock()
		delete(s.peers, id) // clean up
		s.log.Debug("stopped syncing loop of peer", "id", id)
		s.wg.Done()
		s.peersLock.Unlock()
	}()

	log := s.log.New("peer", id)
	log.Info("Starting P2P sync client event loop")

	// Implement the same rate limits as the server does per-peer,
	// so we aren't too aggressive towards the server.
	rl := rate.NewLimiter(peerServerBlocksRateLimit, peerServerBlocksBurst)

	for {
		// wait for a global allocation to be available
		if err := s.globalRL.Wait(ctx); err != nil {
			return
		}
		// wait for peer to be available for more work
		if err := rl.Wait(ctx); err != nil {
			return
		}

		// once the peer is available, wait for a sync request.
		select {
		case pr := <-s.peerRequests:
			// We already established the peer is available w.r.t. rate-limiting,
			// and this is the only loop over this peer, so we can request now.
			start := time.Now()
			err := s.doRequest(ctx, id, pr.num)
			if err != nil {
				// mark as complete if there's an error: we are not sending any result and can complete immediately.
				pr.complete.Store(true)
				log.Warn("failed p2p sync request", "num", pr.num, "err", err)
				s.appScorer.onResponseError(id)
				// If we hit an error, then count it as many requests.
				// We'd like to avoid making more requests for a while, to back off.
				if err := rl.WaitN(ctx, clientErrRateCost); err != nil {
					return
				}
			} else {
				log.Debug("completed p2p sync request", "num", pr.num)
				s.appScorer.onValidResponse(id)
			}
			took := time.Since(start)

			resultCode := byte(0)
			if err != nil {
				if re, ok := err.(requestResultErr); ok {
					resultCode = re.ResultCode()
				} else {
					resultCode = 1
				}
			}
			s.metrics.ClientPayloadByNumberEvent(pr.num, resultCode, took)
		case <-ctx.Done():
			return
		}
	}
}

type requestResultErr byte

func (r requestResultErr) Error() string {
	return fmt.Sprintf("peer failed to serve request with code %d", uint8(r))
}

func (r requestResultErr) ResultCode() byte {
	return byte(r)
}
func (s *SyncClient) doRequest(ctx context.Context, id peer.ID, expectedBlockNum uint64) error {
	// open stream to peer
	reqCtx, reqCancel := context.WithTimeout(ctx, streamTimeout)
	str, err := s.newStreamFn(reqCtx, id, s.payloadByNumber)
	reqCancel()
	if err != nil {
		return fmt.Errorf("failed to open stream: %w", err)
	}
	defer str.Close()
	// set write timeout (if available)
	_ = str.SetWriteDeadline(time.Now().Add(clientWriteRequestTimeout))
	if err := binary.Write(str, binary.LittleEndian, expectedBlockNum); err != nil {
		return fmt.Errorf("failed to write request (%d): %w", expectedBlockNum, err)
	}
	if err := str.CloseWrite(); err != nil {
		return fmt.Errorf("failed to close writer side while making request: %w", err)
	}

	// set read timeout (if available)
	_ = str.SetReadDeadline(time.Now().Add(clientReadResponsetimeout))

	// Limit input, as well as output.
	// Compression may otherwise continue to read ignored data for a small output,
	// or output more data than desired (zip-bomb).
	r := io.LimitReader(str, maxGossipSize)
	var result [1]byte
	if _, err := io.ReadFull(r, result[:]); err != nil {
		return fmt.Errorf("failed to read result part of response: %w", err)
	}
	if res := result[0]; res != 0 {
		return requestResultErr(res)
	}
	var versionData [4]byte
	if _, err := io.ReadFull(r, versionData[:]); err != nil {
		return fmt.Errorf("failed to read version part of response: %w", err)
	}
	version := binary.LittleEndian.Uint32(versionData[:])
	if version != 0 && version != 1 {
		return fmt.Errorf("unrecognized version: %d", version)
	}
	// payload is SSZ encoded with Snappy framed compression
	r = snappy.NewReader(r)
	r = io.LimitReader(r, maxGossipSize)
	// We cannot stream straight into the SSZ decoder, since we need the scope of the SSZ payload.
	// The server does not prepend it, nor would we trust a claimed length anyway, so we buffer the data we get.
	data, err := io.ReadAll(r)
	if err != nil {
		return fmt.Errorf("failed to read response: %w", err)
	}

	envelope := &eth.ExecutionPayloadEnvelope{}

	if version == 0 {
		expectedBlockTime := s.cfg.TimestampForBlock(expectedBlockNum)
		envelope, err = s.readExecutionPayload(data, expectedBlockTime)
		if err != nil {
			return err
		}
	} else if version == 1 {
		if err := envelope.UnmarshalSSZ(uint32(len(data)), bytes.NewReader(data)); err != nil {
			return fmt.Errorf("failed to decode execution payload envelope response: %w", err)
		}
	} else {
		panic(fmt.Errorf("should have already filtered by version, but got: %d", version))
	}

	if err := str.CloseRead(); err != nil {
		return fmt.Errorf("failed to close reading side")
	}
	if err := verifyBlock(envelope, expectedBlockNum); err != nil {
		return fmt.Errorf("received execution payload is invalid: %w", err)
	}
	select {
	case s.results <- syncResult{payload: envelope, peer: id}:
	case <-ctx.Done():
		return fmt.Errorf("failed to process response, sync client is too busy: %w", ctx.Err())
	}
	return nil
}
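// Illustrative sketch, not part of the upstream file: the response framing consumed by
// doRequest above, decoded from an in-memory byte slice instead of a stream. The frame is
// a 1-byte result code, a 4-byte little-endian version, then the snappy-framed SSZ payload.
func exampleDecodeResponseFrame(frame []byte) (resultCode byte, version uint32, sszPayload []byte, err error) {
	r := io.LimitReader(bytes.NewReader(frame), maxGossipSize)
	var result [1]byte
	if _, err = io.ReadFull(r, result[:]); err != nil {
		return 0, 0, nil, err
	}
	var versionData [4]byte
	if _, err = io.ReadFull(r, versionData[:]); err != nil {
		return 0, 0, nil, err
	}
	version = binary.LittleEndian.Uint32(versionData[:])
	// The SSZ payload is wrapped in snappy framed compression, and limited again on the
	// decompressed side to guard against zip-bombs.
	sszPayload, err = io.ReadAll(io.LimitReader(snappy.NewReader(r), maxGossipSize))
	return result[0], version, sszPayload, err
}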
func (s *SyncClient) readExecutionPayload(data []byte, expectedTime uint64) (*eth.ExecutionPayloadEnvelope, error) {
	blockVersion := eth.BlockV1
	if s.cfg.IsCanyon(expectedTime) {
		blockVersion = eth.BlockV2
	}

	var res eth.ExecutionPayload
	if err := res.UnmarshalSSZ(blockVersion, uint32(len(data)), bytes.NewReader(data)); err != nil {
		return nil, fmt.Errorf("failed to decode response: %w", err)
	}

	return &eth.ExecutionPayloadEnvelope{ExecutionPayload: &res}, nil
}

func verifyBlock(envelope *eth.ExecutionPayloadEnvelope, expectedNum uint64) error {
	payload := envelope.ExecutionPayload

	// verify L2 block
	if expectedNum != uint64(payload.BlockNumber) {
		return fmt.Errorf("received execution payload for block %d, but expected block %d", payload.BlockNumber, expectedNum)
	}
	actual, ok := envelope.CheckBlockHash()
	if !ok { // payload itself contains bad block hash
		return fmt.Errorf("received execution payload for block %d with bad block hash %s, expected %s", expectedNum, payload.BlockHash, actual)
	}
	return nil
}

// peerStat maintains rate-limiting data of a peer that requests blocks from us.
type peerStat struct {
	// Requests tokenizes each request to sync
	Requests *rate.Limiter
}

type L2Chain interface {
	PayloadByNumber(ctx context.Context, number uint64) (*eth.ExecutionPayloadEnvelope, error)
}

type ReqRespServerMetrics interface {
	ServerPayloadByNumberEvent(num uint64, resultCode byte, duration time.Duration)
}

type ReqRespServer struct {
	cfg *rollup.Config

	l2 L2Chain

	metrics ReqRespServerMetrics

	peerRateLimits *simplelru.LRU[peer.ID, *peerStat]
	peerStatsLock  sync.Mutex

	globalRequestsRL *rate.Limiter
}

func NewReqRespServer(cfg *rollup.Config, l2 L2Chain, metrics ReqRespServerMetrics) *ReqRespServer {
	// We should never allow over 1000 different peers to churn through quickly,
	// so it's fine to prune rate-limit details past this.
	peerRateLimits, _ := simplelru.NewLRU[peer.ID, *peerStat](1000, nil)
	globalRequestsRL := rate.NewLimiter(globalServerBlocksRateLimit, globalServerBlocksBurst)

	return &ReqRespServer{
		cfg:              cfg,
		l2:               l2,
		metrics:          metrics,
		peerRateLimits:   peerRateLimits,
		globalRequestsRL: globalRequestsRL,
	}
}
// HandleSyncRequest is a stream handler function to register the L2 unsafe payloads alt-sync protocol.
// See MakeStreamHandler to transform this into a LibP2P handler function.
//
// Note that the same peer may open parallel streams.
//
// The caller must Close the stream.
func (srv *ReqRespServer) HandleSyncRequest(ctx context.Context, log log.Logger, stream network.Stream) {
	// may stay 0 if we fail to decode the request
	start := time.Now()

	// We wait as long as necessary; we throttle the peer instead of disconnecting,
	// unless the delay reaches a threshold that is unreasonable to wait for.
	ctx, cancel := context.WithTimeout(ctx, maxThrottleDelay)
	req, err := srv.handleSyncRequest(ctx, stream)
	cancel()

	resultCode := byte(0)
	if err != nil {
		log.Warn("failed to serve p2p sync request", "req", req, "err", err)
		if errors.Is(err, ethereum.NotFound) {
			resultCode = 1
		} else if errors.Is(err, invalidRequestErr) {
			resultCode = 2
		} else {
			resultCode = 3
		}
		// try to write the error code, so the other peer can understand the reason for failure.
		_, _ = stream.Write([]byte{resultCode})
	} else {
		log.Debug("successfully served sync response", "req", req)
	}
	srv.metrics.ServerPayloadByNumberEvent(req, 0, time.Since(start))
}

var invalidRequestErr = errors.New("invalid request")

func (srv *ReqRespServer) handleSyncRequest(ctx context.Context, stream network.Stream) (uint64, error) {
	peerId := stream.Conn().RemotePeer()

	// take a token from the global rate-limiter,
	// to make sure there's not too much concurrent server work between different peers.
	if err := srv.globalRequestsRL.Wait(ctx); err != nil {
		return 0, fmt.Errorf("timed out waiting for global sync rate limit: %w", err)
	}

	// find rate limiting data of peer, or add otherwise
	srv.peerStatsLock.Lock()
	ps, _ := srv.peerRateLimits.Get(peerId)
	if ps == nil {
		ps = &peerStat{
			Requests: rate.NewLimiter(peerServerBlocksRateLimit, peerServerBlocksBurst),
		}
		srv.peerRateLimits.Add(peerId, ps)
		ps.Requests.Reserve() // count the hit, but make it delay the next request rather than immediately waiting
	} else {
		// Only wait if it's an existing peer, otherwise the instant rate-limit Wait call always errors.

		// If the requester thinks we're taking too long, then it's their problem and they can disconnect.
		// We'll disconnect ourselves only when failing to read/write,
		// if the work is invalid (range validation), or when individual sub-tasks timeout.
		if err := ps.Requests.Wait(ctx); err != nil {
			return 0, fmt.Errorf("timed out waiting for per-peer sync rate limit: %w", err)
		}
	}
	srv.peerStatsLock.Unlock()

	// Set read deadline, if available
	_ = stream.SetReadDeadline(time.Now().Add(serverReadRequestTimeout))

	// Read the request
	var req uint64
	if err := binary.Read(stream, binary.LittleEndian, &req); err != nil {
		return 0, fmt.Errorf("failed to read requested block number: %w", err)
	}
	if err := stream.CloseRead(); err != nil {
		return req, fmt.Errorf("failed to close reading-side of a P2P sync request call: %w", err)
	}

	// Check the request is within the expected range of blocks
	if req < srv.cfg.Genesis.L2.Number {
		return req, fmt.Errorf("cannot serve request for L2 block %d before genesis %d: %w", req, srv.cfg.Genesis.L2.Number, invalidRequestErr)
	}
	max, err := srv.cfg.TargetBlockNumber(uint64(time.Now().Unix()))
	if err != nil {
		return req, fmt.Errorf("cannot determine max target block number to verify request: %w", invalidRequestErr)
	}
	if req > max {
		return req, fmt.Errorf("cannot serve request for L2 block %d after max expected block (%v): %w", req, max, invalidRequestErr)
	}

	envelope, err := srv.l2.PayloadByNumber(ctx, req)
	if err != nil {
		if errors.Is(err, ethereum.NotFound) {
			return req, fmt.Errorf("peer requested unknown block by number: %w", err)
		} else {
			return req, fmt.Errorf("failed to retrieve payload to serve to peer: %w", err)
		}
	}

	// We set the write deadline, if available, to safely write without blocking on a throttling peer connection
	_ = stream.SetWriteDeadline(time.Now().Add(serverWriteChunkTimeout))

	w := snappy.NewBufferedWriter(stream)

	if srv.cfg.IsEcotone(uint64(envelope.ExecutionPayload.Timestamp)) {
		// 0 - resultCode: success = 0
		// 1:5 - version: 1 (little endian)
		tmp := [5]byte{0, 1, 0, 0, 0}
		if _, err := stream.Write(tmp[:]); err != nil {
			return req, fmt.Errorf("failed to write response header data: %w", err)
		}
		if _, err := envelope.MarshalSSZ(w); err != nil {
			return req, fmt.Errorf("failed to write payload to sync response: %w", err)
		}
	} else {
		// 0 - resultCode: success = 0
		// 1:5 - version: 0
		var tmp [5]byte
		if _, err := stream.Write(tmp[:]); err != nil {
			return req, fmt.Errorf("failed to write response header data: %w", err)
		}
		if _, err := envelope.ExecutionPayload.MarshalSSZ(w); err != nil {
			return req, fmt.Errorf("failed to write payload to sync response: %w", err)
		}
	}

	if err := w.Close(); err != nil {
		return req, fmt.Errorf("failed to finish writing payload to sync response: %w", err)
	}

	return req, nil
}
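// Illustrative sketch, not part of the upstream file: registering the server side of the
// protocol. setStreamHandler stands in for (host.Host).SetStreamHandler from libp2p, taken
// as a parameter here to keep the sketch free of extra imports.
func exampleRegisterSyncServer(resourcesCtx context.Context, logger log.Logger, cfg *rollup.Config,
	l2 L2Chain, m ReqRespServerMetrics, setStreamHandler func(protocol.ID, network.StreamHandler)) {
	srv := NewReqRespServer(cfg, l2, m)
	setStreamHandler(
		PayloadByNumberProtocolID(cfg.L2ChainID),
		MakeStreamHandler(resourcesCtx, logger, srv.HandleSyncRequest),
	)
}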