github.com/palisadeinc/bor@v0.0.0-20230615125219-ab7196213d15/eth/downloader/skeleton.go (about) 1 // Copyright 2021 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "encoding/json" 21 "errors" 22 "math/rand" 23 "sort" 24 "time" 25 26 "github.com/ethereum/go-ethereum/common" 27 "github.com/ethereum/go-ethereum/core/rawdb" 28 "github.com/ethereum/go-ethereum/core/types" 29 "github.com/ethereum/go-ethereum/eth/protocols/eth" 30 "github.com/ethereum/go-ethereum/ethdb" 31 "github.com/ethereum/go-ethereum/log" 32 ) 33 34 // scratchHeaders is the number of headers to store in a scratch space to allow 35 // concurrent downloads. A header is about 0.5KB in size, so there is no worry 36 // about using too much memory. The only catch is that we can only validate gaps 37 // afer they're linked to the head, so the bigger the scratch space, the larger 38 // potential for invalid headers. 39 // 40 // The current scratch space of 131072 headers is expected to use 64MB RAM. 41 const scratchHeaders = 131072 42 43 // requestHeaders is the number of header to request from a remote peer in a single 44 // network packet. 
Although the skeleton downloader takes into consideration peer 45 // capacities when picking idlers, the packet size was decided to remain constant 46 // since headers are relatively small and it's easier to work with fixed batches 47 // vs. dynamic interval fillings. 48 const requestHeaders = 512 49 50 // errSyncLinked is an internal helper error to signal that the current sync 51 // cycle linked up to the genesis block, this the skeleton syncer should ping 52 // the backfiller to resume. Since we already have that logic on sync start, 53 // piggie-back on that instead of 2 entrypoints. 54 var errSyncLinked = errors.New("sync linked") 55 56 // errSyncMerged is an internal helper error to signal that the current sync 57 // cycle merged with a previously aborted subchain, thus the skeleton syncer 58 // should abort and restart with the new state. 59 var errSyncMerged = errors.New("sync merged") 60 61 // errSyncReorged is an internal helper error to signal that the head chain of 62 // the current sync cycle was (partially) reorged, thus the skeleton syncer 63 // should abort and restart with the new state. 64 var errSyncReorged = errors.New("sync reorged") 65 66 // errTerminated is returned if the sync mechanism was terminated for this run of 67 // the process. This is usually the case when Geth is shutting down and some events 68 // might still be propagating. 69 var errTerminated = errors.New("terminated") 70 71 // errReorgDenied is returned if an attempt is made to extend the beacon chain 72 // with a new header, but it does not link up to the existing sync. 73 var errReorgDenied = errors.New("non-forced head reorg denied") 74 75 func init() { 76 // Tuning parameters is nice, but the scratch space must be assignable in 77 // full to peers. It's a useless cornercase to support a dangling half-group. 
78 if scratchHeaders%requestHeaders != 0 { 79 panic("Please make scratchHeaders divisible by requestHeaders") 80 } 81 } 82 83 // subchain is a contiguous header chain segment that is backed by the database, 84 // but may not be linked to the live chain. The skeleton downloader may produce 85 // a new one of these every time it is restarted until the subchain grows large 86 // enough to connect with a previous subchain. 87 // 88 // The subchains use the exact same database namespace and are not disjoint from 89 // each other. As such, extending one to overlap the other entails reducing the 90 // second one first. This combined buffer model is used to avoid having to move 91 // data on disk when two subchains are joined together. 92 type subchain struct { 93 Head uint64 // Block number of the newest header in the subchain 94 Tail uint64 // Block number of the oldest header in the subchain 95 Next common.Hash // Block hash of the next oldest header in the subchain 96 } 97 98 // skeletonProgress is a database entry to allow suspending and resuming a chain 99 // sync. As the skeleton header chain is downloaded backwards, restarts can and 100 // will produce temporarily disjoint subchains. There is no way to restart a 101 // suspended skeleton sync without prior knowledge of all prior suspension points. 102 type skeletonProgress struct { 103 Subchains []*subchain // Disjoint subchains downloaded until now 104 } 105 106 // headUpdate is a notification that the beacon sync should switch to a new target. 107 // The update might request whether to forcefully change the target, or only try to 108 // extend it and fail if it's not possible. 
109 type headUpdate struct { 110 header *types.Header // Header to update the sync target to 111 force bool // Whether to force the update or only extend if possible 112 errc chan error // Channel to signal acceptance of the new head 113 } 114 115 // headerRequest tracks a pending header request to ensure responses are to 116 // actual requests and to validate any security constraints. 117 // 118 // Concurrency note: header requests and responses are handled concurrently from 119 // the main runloop to allow Keccak256 hash verifications on the peer's thread and 120 // to drop on invalid response. The request struct must contain all the data to 121 // construct the response without accessing runloop internals (i.e. subchains). 122 // That is only included to allow the runloop to match a response to the task being 123 // synced without having yet another set of maps. 124 type headerRequest struct { 125 peer string // Peer to which this request is assigned 126 id uint64 // Request ID of this request 127 128 deliver chan *headerResponse // Channel to deliver successful response on 129 revert chan *headerRequest // Channel to deliver request failure on 130 cancel chan struct{} // Channel to track sync cancellation 131 stale chan struct{} // Channel to signal the request was dropped 132 133 head uint64 // Head number of the requested batch of headers 134 } 135 136 // headerResponse is an already verified remote response to a header request. 137 type headerResponse struct { 138 peer *peerConnection // Peer from which this response originates 139 reqid uint64 // Request ID that this response fulfils 140 headers []*types.Header // Chain of headers 141 } 142 143 // backfiller is a callback interface through which the skeleton sync can tell 144 // the downloader that it should suspend or resume backfilling on specific head 145 // events (e.g. suspend on forks or gaps, resume on successful linkups). 
type backfiller interface {
	// suspend asks the backfiller to abort any running full or snap sync that
	// is based on the skeleton chain, since that chain might be invalid.
	// Implementations should gracefully tolerate several consecutive suspends
	// without a resume in between, even on initial startup.
	suspend()

	// resume asks the backfiller to start running full or snap sync based on
	// the skeleton chain, as it has been successfully linked. Appending new
	// heads to the end of the chain will not result in suspend/resume cycles.
	resume()
}

// skeleton represents a header chain synchronized after the merge where blocks
// aren't validated any more via PoW in a forward fashion, rather are dictated
// and extended at the head via the beacon chain and backfilled on the original
// Ethereum block sync protocol.
//
// Since the skeleton is grown backwards from head to genesis, it is handled as
// a separate entity, not mixed in with the logical sequential transition of the
// blocks. Once the skeleton is connected to an existing, validated chain, the
// headers will be moved into the main downloader for filling and execution.
//
// Opposed to the original Ethereum block synchronization which is trustless (and
// uses a master peer to minimize the attack surface), post-merge block sync starts
// from a trusted head. As such, there is no need for a master peer any more and
// headers can be requested fully concurrently (though some batches might be
// discarded if they don't link up correctly).
//
// Although a skeleton is part of a sync cycle, it is not recreated, rather stays
// alive throughout the lifetime of the downloader. This allows it to be extended
// concurrently with the sync cycle, since extensions arrive from an API surface,
// not from within (vs. legacy Ethereum sync).
179 // 180 // Since the skeleton tracks the entire header chain until it is consumed by the 181 // forward block filling, it needs 0.5KB/block storage. At current mainnet sizes 182 // this is only possible with a disk backend. Since the skeleton is separate from 183 // the node's header chain, storing the headers ephemerally until sync finishes 184 // is wasted disk IO, but it's a price we're going to pay to keep things simple 185 // for now. 186 type skeleton struct { 187 db ethdb.Database // Database backing the skeleton 188 filler backfiller // Chain syncer suspended/resumed by head events 189 190 peers *peerSet // Set of peers we can sync from 191 idles map[string]*peerConnection // Set of idle peers in the current sync cycle 192 drop peerDropFn // Drops a peer for misbehaving 193 194 progress *skeletonProgress // Sync progress tracker for resumption and metrics 195 started time.Time // Timestamp when the skeleton syncer was created 196 logged time.Time // Timestamp when progress was last logged to the user 197 pulled uint64 // Number of headers downloaded in this run 198 199 scratchSpace []*types.Header // Scratch space to accumulate headers in (first = recent) 200 scratchOwners []string // Peer IDs owning chunks of the scratch space (pend or delivered) 201 scratchHead uint64 // Block number of the first item in the scratch space 202 203 requests map[uint64]*headerRequest // Header requests currently running 204 205 headEvents chan *headUpdate // Notification channel for new heads 206 terminate chan chan error // Termination channel to abort sync 207 terminated chan struct{} // Channel to signal that the syner is dead 208 209 // Callback hooks used during testing 210 syncStarting func() // callback triggered after a sync cycle is inited but before started 211 } 212 213 // newSkeleton creates a new sync skeleton that tracks a potentially dangling 214 // header chain until it's linked into an existing set of blocks. 
215 func newSkeleton(db ethdb.Database, peers *peerSet, drop peerDropFn, filler backfiller) *skeleton { 216 sk := &skeleton{ 217 db: db, 218 filler: filler, 219 peers: peers, 220 drop: drop, 221 requests: make(map[uint64]*headerRequest), 222 headEvents: make(chan *headUpdate), 223 terminate: make(chan chan error), 224 terminated: make(chan struct{}), 225 } 226 go sk.startup() 227 return sk 228 } 229 230 // startup is an initial background loop which waits for an event to start or 231 // tear the syncer down. This is required to make the skeleton sync loop once 232 // per process but at the same time not start before the beacon chain announces 233 // a new (existing) head. 234 func (s *skeleton) startup() { 235 // Close a notification channel so anyone sending us events will know if the 236 // sync loop was torn down for good. 237 defer close(s.terminated) 238 239 // Wait for startup or teardown. This wait might loop a few times if a beacon 240 // client requests sync head extensions, but not forced reorgs (i.e. they are 241 // giving us new payloads without setting a starting head initially). 242 for { 243 select { 244 case errc := <-s.terminate: 245 // No head was announced but Geth is shutting down 246 errc <- nil 247 return 248 249 case event := <-s.headEvents: 250 // New head announced, start syncing to it, looping every time a current 251 // cycle is terminated due to a chain event (head reorg, old chain merge). 252 if !event.force { 253 event.errc <- errors.New("forced head needed for startup") 254 continue 255 } 256 event.errc <- nil // forced head accepted for startup 257 head := event.header 258 s.started = time.Now() 259 260 for { 261 // If the sync cycle terminated or was terminated, propagate up when 262 // higher layers request termination. There's no fancy explicit error 263 // signalling as the sync loop should never terminate (TM). 
264 newhead, err := s.sync(head) 265 switch { 266 case err == errSyncLinked: 267 // Sync cycle linked up to the genesis block. Tear down the loop 268 // and restart it so, it can properly notify the backfiller. Don't 269 // account a new head. 270 head = nil 271 272 case err == errSyncMerged: 273 // Subchains were merged, we just need to reinit the internal 274 // start to continue on the tail of the merged chain. Don't 275 // announce a new head, 276 head = nil 277 278 case err == errSyncReorged: 279 // The subchain being synced got modified at the head in a 280 // way that requires resyncing it. Restart sync with the new 281 // head to force a cleanup. 282 head = newhead 283 284 case err == errTerminated: 285 // Sync was requested to be terminated from within, stop and 286 // return (no need to pass a message, was already done internally) 287 return 288 289 default: 290 // Sync either successfully terminated or failed with an unhandled 291 // error. Abort and wait until Geth requests a termination. 292 errc := <-s.terminate 293 errc <- err 294 return 295 } 296 } 297 } 298 } 299 } 300 301 // Terminate tears down the syncer indefinitely. 302 func (s *skeleton) Terminate() error { 303 // Request termination and fetch any errors 304 errc := make(chan error) 305 s.terminate <- errc 306 err := <-errc 307 308 // Wait for full shutdown (not necessary, but cleaner) 309 <-s.terminated 310 return err 311 } 312 313 // Sync starts or resumes a previous sync cycle to download and maintain a reverse 314 // header chain starting at the head and leading towards genesis to an available 315 // ancestor. 316 // 317 // This method does not block, rather it just waits until the syncer receives the 318 // fed header. What the syncer does with it is the syncer's problem. 
319 func (s *skeleton) Sync(head *types.Header, force bool) error { 320 log.Trace("New skeleton head announced", "number", head.Number, "hash", head.Hash(), "force", force) 321 errc := make(chan error) 322 323 select { 324 case s.headEvents <- &headUpdate{header: head, force: force, errc: errc}: 325 return <-errc 326 case <-s.terminated: 327 return errTerminated 328 } 329 } 330 331 // sync is the internal version of Sync that executes a single sync cycle, either 332 // until some termination condition is reached, or until the current cycle merges 333 // with a previously aborted run. 334 func (s *skeleton) sync(head *types.Header) (*types.Header, error) { 335 // If we're continuing a previous merge interrupt, just access the existing 336 // old state without initing from disk. 337 if head == nil { 338 head = rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[0].Head) 339 } else { 340 // Otherwise, initialize the sync, trimming and previous leftovers until 341 // we're consistent with the newly requested chain head 342 s.initSync(head) 343 } 344 // Create the scratch space to fill with concurrently downloaded headers 345 s.scratchSpace = make([]*types.Header, scratchHeaders) 346 defer func() { s.scratchSpace = nil }() // don't hold on to references after sync 347 348 s.scratchOwners = make([]string, scratchHeaders/requestHeaders) 349 defer func() { s.scratchOwners = nil }() // don't hold on to references after sync 350 351 s.scratchHead = s.progress.Subchains[0].Tail - 1 // tail must not be 0! 352 353 // If the sync is already done, resume the backfiller. When the loop stops, 354 // terminate the backfiller too. 355 linked := len(s.progress.Subchains) == 1 && 356 rawdb.HasBody(s.db, s.progress.Subchains[0].Next, s.scratchHead) && 357 rawdb.HasReceipts(s.db, s.progress.Subchains[0].Next, s.scratchHead) 358 if linked { 359 s.filler.resume() 360 } 361 defer s.filler.suspend() 362 363 // Create a set of unique channels for this sync cycle. 
We need these to be 364 // ephemeral so a data race doesn't accidentally deliver something stale on 365 // a persistent channel across syncs (yup, this happened) 366 var ( 367 requestFails = make(chan *headerRequest) 368 responses = make(chan *headerResponse) 369 ) 370 cancel := make(chan struct{}) 371 defer close(cancel) 372 373 log.Debug("Starting reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead) 374 375 // Whether sync completed or not, disregard any future packets 376 defer func() { 377 log.Debug("Terminating reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead) 378 s.requests = make(map[uint64]*headerRequest) 379 }() 380 381 // Start tracking idle peers for task assignments 382 peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection 383 384 peeringSub := s.peers.SubscribeEvents(peering) 385 defer peeringSub.Unsubscribe() 386 387 s.idles = make(map[string]*peerConnection) 388 for _, peer := range s.peers.AllPeers() { 389 s.idles[peer.id] = peer 390 } 391 // Nofity any tester listening for startup events 392 if s.syncStarting != nil { 393 s.syncStarting() 394 } 395 for { 396 // Something happened, try to assign new tasks to any idle peers 397 if !linked { 398 s.assignTasks(responses, requestFails, cancel) 399 } 400 // Wait for something to happen 401 select { 402 case event := <-peering: 403 // A peer joined or left, the tasks queue and allocations need to be 404 // checked for potential assignment or reassignment 405 peerid := event.peer.id 406 if event.join { 407 log.Debug("Joining skeleton peer", "id", peerid) 408 s.idles[peerid] = event.peer 409 } else { 410 log.Debug("Leaving skeleton peer", "id", peerid) 411 s.revertRequests(peerid) 412 delete(s.idles, peerid) 413 } 414 415 case errc := <-s.terminate: 416 errc <- nil 417 return nil, errTerminated 418 419 case event := <-s.headEvents: 420 // New head was announced, try to integrate it. 
If successful, nothing 421 // needs to be done as the head simply extended the last range. For now 422 // we don't seamlessly integrate reorgs to keep things simple. If the 423 // network starts doing many mini reorgs, it might be worthwhile handling 424 // a limited depth without an error. 425 if reorged := s.processNewHead(event.header, event.force); reorged { 426 // If a reorg is needed, and we're forcing the new head, signal 427 // the syncer to tear down and start over. Otherwise, drop the 428 // non-force reorg. 429 if event.force { 430 event.errc <- nil // forced head reorg accepted 431 return event.header, errSyncReorged 432 } 433 event.errc <- errReorgDenied 434 continue 435 } 436 event.errc <- nil // head extension accepted 437 438 // New head was integrated into the skeleton chain. If the backfiller 439 // is still running, it will pick it up. If it already terminated, 440 // a new cycle needs to be spun up. 441 if linked { 442 s.filler.resume() 443 } 444 445 case req := <-requestFails: 446 s.revertRequest(req) 447 448 case res := <-responses: 449 // Process the batch of headers. If though processing we managed to 450 // link the current subchain to a previously downloaded one, abort the 451 // sync and restart with the merged subchains. 452 // 453 // If we managed to link to the existing local chain or genesis block, 454 // abort sync altogether. 455 linked, merged := s.processResponse(res) 456 if linked { 457 log.Debug("Beacon sync linked to local chain") 458 return nil, errSyncLinked 459 } 460 if merged { 461 log.Debug("Beacon sync merged subchains") 462 return nil, errSyncMerged 463 } 464 // We still have work to do, loop and repeat 465 } 466 } 467 } 468 469 // initSync attempts to get the skeleton sync into a consistent state wrt any 470 // past state on disk and the newly requested head to sync to. If the new head 471 // is nil, the method will return and continue from the previous head. 
472 func (s *skeleton) initSync(head *types.Header) { 473 // Extract the head number, we'll need it all over 474 number := head.Number.Uint64() 475 476 // Retrieve the previously saved sync progress 477 if status := rawdb.ReadSkeletonSyncStatus(s.db); len(status) > 0 { 478 s.progress = new(skeletonProgress) 479 if err := json.Unmarshal(status, s.progress); err != nil { 480 log.Error("Failed to decode skeleton sync status", "err", err) 481 } else { 482 // Previous sync was available, print some continuation logs 483 for _, subchain := range s.progress.Subchains { 484 log.Debug("Restarting skeleton subchain", "head", subchain.Head, "tail", subchain.Tail) 485 } 486 // Create a new subchain for the head (unless the last can be extended), 487 // trimming anything it would overwrite 488 headchain := &subchain{ 489 Head: number, 490 Tail: number, 491 Next: head.ParentHash, 492 } 493 for len(s.progress.Subchains) > 0 { 494 // If the last chain is above the new head, delete altogether 495 lastchain := s.progress.Subchains[0] 496 if lastchain.Tail >= headchain.Tail { 497 log.Debug("Dropping skeleton subchain", "head", lastchain.Head, "tail", lastchain.Tail) 498 s.progress.Subchains = s.progress.Subchains[1:] 499 continue 500 } 501 // Otherwise truncate the last chain if needed and abort trimming 502 if lastchain.Head >= headchain.Tail { 503 log.Debug("Trimming skeleton subchain", "oldhead", lastchain.Head, "newhead", headchain.Tail-1, "tail", lastchain.Tail) 504 lastchain.Head = headchain.Tail - 1 505 } 506 break 507 } 508 // If the last subchain can be extended, we're lucky. Otherwise create 509 // a new subchain sync task. 
510 var extended bool 511 if n := len(s.progress.Subchains); n > 0 { 512 lastchain := s.progress.Subchains[0] 513 if lastchain.Head == headchain.Tail-1 { 514 lasthead := rawdb.ReadSkeletonHeader(s.db, lastchain.Head) 515 if lasthead.Hash() == head.ParentHash { 516 log.Debug("Extended skeleton subchain with new head", "head", headchain.Tail, "tail", lastchain.Tail) 517 lastchain.Head = headchain.Tail 518 extended = true 519 } 520 } 521 } 522 if !extended { 523 log.Debug("Created new skeleton subchain", "head", number, "tail", number) 524 s.progress.Subchains = append([]*subchain{headchain}, s.progress.Subchains...) 525 } 526 // Update the database with the new sync stats and insert the new 527 // head header. We won't delete any trimmed skeleton headers since 528 // those will be outside the index space of the many subchains and 529 // the database space will be reclaimed eventually when processing 530 // blocks above the current head (TODO(karalabe): don't forget). 531 batch := s.db.NewBatch() 532 533 rawdb.WriteSkeletonHeader(batch, head) 534 s.saveSyncStatus(batch) 535 536 if err := batch.Write(); err != nil { 537 log.Crit("Failed to write skeleton sync status", "err", err) 538 } 539 return 540 } 541 } 542 // Either we've failed to decode the previus state, or there was none. Start 543 // a fresh sync with a single subchain represented by the currently sent 544 // chain head. 545 s.progress = &skeletonProgress{ 546 Subchains: []*subchain{ 547 { 548 Head: number, 549 Tail: number, 550 Next: head.ParentHash, 551 }, 552 }, 553 } 554 batch := s.db.NewBatch() 555 556 rawdb.WriteSkeletonHeader(batch, head) 557 s.saveSyncStatus(batch) 558 559 if err := batch.Write(); err != nil { 560 log.Crit("Failed to write initial skeleton sync status", "err", err) 561 } 562 log.Debug("Created initial skeleton subchain", "head", number, "tail", number) 563 } 564 565 // saveSyncStatus marshals the remaining sync tasks into leveldb. 
566 func (s *skeleton) saveSyncStatus(db ethdb.KeyValueWriter) { 567 status, err := json.Marshal(s.progress) 568 if err != nil { 569 panic(err) // This can only fail during implementation 570 } 571 rawdb.WriteSkeletonSyncStatus(db, status) 572 } 573 574 // processNewHead does the internal shuffling for a new head marker and either 575 // accepts and integrates it into the skeleton or requests a reorg. Upon reorg, 576 // the syncer will tear itself down and restart with a fresh head. It is simpler 577 // to reconstruct the sync state than to mutate it and hope for the best. 578 func (s *skeleton) processNewHead(head *types.Header, force bool) bool { 579 // If the header cannot be inserted without interruption, return an error for 580 // the outer loop to tear down the skeleton sync and restart it 581 number := head.Number.Uint64() 582 583 lastchain := s.progress.Subchains[0] 584 if lastchain.Tail >= number { 585 if force { 586 log.Warn("Beacon chain reorged", "tail", lastchain.Tail, "newHead", number) 587 } 588 return true 589 } 590 if lastchain.Head+1 < number { 591 if force { 592 log.Warn("Beacon chain gapped", "head", lastchain.Head, "newHead", number) 593 } 594 return true 595 } 596 if parent := rawdb.ReadSkeletonHeader(s.db, number-1); parent.Hash() != head.ParentHash { 597 if force { 598 log.Warn("Beacon chain forked", "ancestor", parent.Number, "hash", parent.Hash(), "want", head.ParentHash) 599 } 600 return true 601 } 602 // New header seems to be in the last subchain range. Unwind any extra headers 603 // from the chain tip and insert the new head. We won't delete any trimmed 604 // skeleton headers since those will be outside the index space of the many 605 // subchains and the database space will be reclaimed eventually when processing 606 // blocks above the current head (TODO(karalabe): don't forget). 
607 batch := s.db.NewBatch() 608 609 rawdb.WriteSkeletonHeader(batch, head) 610 lastchain.Head = number 611 s.saveSyncStatus(batch) 612 613 if err := batch.Write(); err != nil { 614 log.Crit("Failed to write skeleton sync status", "err", err) 615 } 616 return false 617 } 618 619 // assignTasks attempts to match idle peers to pending header retrievals. 620 func (s *skeleton) assignTasks(success chan *headerResponse, fail chan *headerRequest, cancel chan struct{}) { 621 // Sort the peers by download capacity to use faster ones if many available 622 idlers := &peerCapacitySort{ 623 peers: make([]*peerConnection, 0, len(s.idles)), 624 caps: make([]int, 0, len(s.idles)), 625 } 626 targetTTL := s.peers.rates.TargetTimeout() 627 for _, peer := range s.idles { 628 idlers.peers = append(idlers.peers, peer) 629 idlers.caps = append(idlers.caps, s.peers.rates.Capacity(peer.id, eth.BlockHeadersMsg, targetTTL)) 630 } 631 if len(idlers.peers) == 0 { 632 return 633 } 634 sort.Sort(idlers) 635 636 // Find header regions not yet downloading and fill them 637 for task, owner := range s.scratchOwners { 638 // If we're out of idle peers, stop assigning tasks 639 if len(idlers.peers) == 0 { 640 return 641 } 642 // Skip any tasks already filling 643 if owner != "" { 644 continue 645 } 646 // If we've reached the genesis, stop assigning tasks 647 if uint64(task*requestHeaders) >= s.scratchHead { 648 return 649 } 650 // Found a task and have peers available, assign it 651 idle := idlers.peers[0] 652 653 idlers.peers = idlers.peers[1:] 654 idlers.caps = idlers.caps[1:] 655 656 // Matched a pending task to an idle peer, allocate a unique request id 657 var reqid uint64 658 for { 659 reqid = uint64(rand.Int63()) 660 if reqid == 0 { 661 continue 662 } 663 if _, ok := s.requests[reqid]; ok { 664 continue 665 } 666 break 667 } 668 // Generate the network query and send it to the peer 669 req := &headerRequest{ 670 peer: idle.id, 671 id: reqid, 672 deliver: success, 673 revert: fail, 674 cancel: 
cancel, 675 stale: make(chan struct{}), 676 head: s.scratchHead - uint64(task*requestHeaders), 677 } 678 s.requests[reqid] = req 679 delete(s.idles, idle.id) 680 681 // Generate the network query and send it to the peer 682 go s.executeTask(idle, req) 683 684 // Inject the request into the task to block further assignments 685 s.scratchOwners[task] = idle.id 686 } 687 } 688 689 // executeTask executes a single fetch request, blocking until either a result 690 // arrives or a timeouts / cancellation is triggered. The method should be run 691 // on its own goroutine and will deliver on the requested channels. 692 func (s *skeleton) executeTask(peer *peerConnection, req *headerRequest) { 693 start := time.Now() 694 resCh := make(chan *eth.Response) 695 696 // Figure out how many headers to fetch. Usually this will be a full batch, 697 // but for the very tail of the chain, trim the request to the number left. 698 // Since nodes may or may not return the genesis header for a batch request, 699 // don't even request it. The parent hash of block #1 is enough to link. 
700 requestCount := requestHeaders 701 if req.head < requestHeaders { 702 requestCount = int(req.head) 703 } 704 peer.log.Trace("Fetching skeleton headers", "from", req.head, "count", requestCount) 705 netreq, err := peer.peer.RequestHeadersByNumber(req.head, requestCount, 0, true, resCh) 706 if err != nil { 707 peer.log.Trace("Failed to request headers", "err", err) 708 s.scheduleRevertRequest(req) 709 return 710 } 711 defer netreq.Close() 712 713 // Wait until the response arrives, the request is cancelled or times out 714 ttl := s.peers.rates.TargetTimeout() 715 716 timeoutTimer := time.NewTimer(ttl) 717 defer timeoutTimer.Stop() 718 719 select { 720 case <-req.cancel: 721 peer.log.Debug("Header request cancelled") 722 s.scheduleRevertRequest(req) 723 724 case <-timeoutTimer.C: 725 // Header retrieval timed out, update the metrics 726 peer.log.Warn("Header request timed out, dropping peer", "elapsed", ttl) 727 headerTimeoutMeter.Mark(1) 728 s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, 0, 0) 729 s.scheduleRevertRequest(req) 730 731 // At this point we either need to drop the offending peer, or we need a 732 // mechanism to allow waiting for the response and not cancel it. For now 733 // lets go with dropping since the header sizes are deterministic and the 734 // beacon sync runs exclusive (downloader is idle) so there should be no 735 // other load to make timeouts probable. If we notice that timeouts happen 736 // more often than we'd like, we can introduce a tracker for the requests 737 // gone stale and monitor them. However, in that case too, we need a way 738 // to protect against malicious peers never responding, so it would need 739 // a second, hard-timeout mechanism. 
740 s.drop(peer.id) 741 742 case res := <-resCh: 743 // Headers successfully retrieved, update the metrics 744 headers := *res.Res.(*eth.BlockHeadersPacket) 745 746 headerReqTimer.Update(time.Since(start)) 747 s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, res.Time, len(headers)) 748 749 // Cross validate the headers with the requests 750 switch { 751 case len(headers) == 0: 752 // No headers were delivered, reject the response and reschedule 753 peer.log.Debug("No headers delivered") 754 res.Done <- errors.New("no headers delivered") 755 s.scheduleRevertRequest(req) 756 757 case headers[0].Number.Uint64() != req.head: 758 // Header batch anchored at non-requested number 759 peer.log.Debug("Invalid header response head", "have", headers[0].Number, "want", req.head) 760 res.Done <- errors.New("invalid header batch anchor") 761 s.scheduleRevertRequest(req) 762 763 case req.head >= requestHeaders && len(headers) != requestHeaders: 764 // Invalid number of non-genesis headers delivered, reject the response and reschedule 765 peer.log.Debug("Invalid non-genesis header count", "have", len(headers), "want", requestHeaders) 766 res.Done <- errors.New("not enough non-genesis headers delivered") 767 s.scheduleRevertRequest(req) 768 769 case req.head < requestHeaders && uint64(len(headers)) != req.head: 770 // Invalid number of genesis headers delivered, reject the response and reschedule 771 peer.log.Debug("Invalid genesis header count", "have", len(headers), "want", headers[0].Number.Uint64()) 772 res.Done <- errors.New("not enough genesis headers delivered") 773 s.scheduleRevertRequest(req) 774 775 default: 776 // Packet seems structurally valid, check hash progression and if it 777 // is correct too, deliver for storage 778 for i := 0; i < len(headers)-1; i++ { 779 if headers[i].ParentHash != headers[i+1].Hash() { 780 peer.log.Debug("Invalid hash progression", "index", i, "wantparenthash", headers[i].ParentHash, "haveparenthash", headers[i+1].Hash()) 781 res.Done <- 
errors.New("invalid hash progression") 782 s.scheduleRevertRequest(req) 783 return 784 } 785 } 786 // Hash chain is valid. The delivery might still be junk as we're 787 // downloading batches concurrently (so no way to link the headers 788 // until gaps are filled); in that case, we'll nuke the peer when 789 // we detect the fault. 790 res.Done <- nil 791 792 select { 793 case req.deliver <- &headerResponse{ 794 peer: peer, 795 reqid: req.id, 796 headers: headers, 797 }: 798 case <-req.cancel: 799 } 800 } 801 } 802 } 803 804 // revertRequests locates all the currently pending reuqests from a particular 805 // peer and reverts them, rescheduling for others to fulfill. 806 func (s *skeleton) revertRequests(peer string) { 807 // Gather the requests first, revertals need the lock too 808 var requests []*headerRequest 809 for _, req := range s.requests { 810 if req.peer == peer { 811 requests = append(requests, req) 812 } 813 } 814 // Revert all the requests matching the peer 815 for _, req := range requests { 816 s.revertRequest(req) 817 } 818 } 819 820 // scheduleRevertRequest asks the event loop to clean up a request and return 821 // all failed retrieval tasks to the scheduler for reassignment. 822 func (s *skeleton) scheduleRevertRequest(req *headerRequest) { 823 select { 824 case req.revert <- req: 825 // Sync event loop notified 826 case <-req.cancel: 827 // Sync cycle got cancelled 828 case <-req.stale: 829 // Request already reverted 830 } 831 } 832 833 // revertRequest cleans up a request and returns all failed retrieval tasks to 834 // the scheduler for reassignment. 835 // 836 // Note, this needs to run on the event runloop thread to reschedule to idle peers. 837 // On peer threads, use scheduleRevertRequest. 
838 func (s *skeleton) revertRequest(req *headerRequest) { 839 log.Trace("Reverting header request", "peer", req.peer, "reqid", req.id) 840 select { 841 case <-req.stale: 842 log.Trace("Header request already reverted", "peer", req.peer, "reqid", req.id) 843 return 844 default: 845 } 846 close(req.stale) 847 848 // Remove the request from the tracked set 849 delete(s.requests, req.id) 850 851 // Remove the request from the tracked set and mark the task as not-pending, 852 // ready for resheduling 853 s.scratchOwners[(s.scratchHead-req.head)/requestHeaders] = "" 854 } 855 856 func (s *skeleton) processResponse(res *headerResponse) (linked bool, merged bool) { 857 res.peer.log.Trace("Processing header response", "head", res.headers[0].Number, "hash", res.headers[0].Hash(), "count", len(res.headers)) 858 859 // Whether the response is valid, we can mark the peer as idle and notify 860 // the scheduler to assign a new task. If the response is invalid, we'll 861 // drop the peer in a bit. 862 s.idles[res.peer.id] = res.peer 863 864 // Ensure the response is for a valid request 865 if _, ok := s.requests[res.reqid]; !ok { 866 // Some internal accounting is broken. A request either times out or it 867 // gets fulfilled successfully. It should not be possible to deliver a 868 // response to a non-existing request. 
869 res.peer.log.Error("Unexpected header packet") 870 return false, false 871 } 872 delete(s.requests, res.reqid) 873 874 // Insert the delivered headers into the scratch space independent of the 875 // content or continuation; those will be validated in a moment 876 head := res.headers[0].Number.Uint64() 877 copy(s.scratchSpace[s.scratchHead-head:], res.headers) 878 879 // If there's still a gap in the head of the scratch space, abort 880 if s.scratchSpace[0] == nil { 881 return false, false 882 } 883 // Try to consume any head headers, validating the boundary conditions 884 batch := s.db.NewBatch() 885 for s.scratchSpace[0] != nil { 886 // Next batch of headers available, cross-reference with the subchain 887 // we are extending and either accept or discard 888 if s.progress.Subchains[0].Next != s.scratchSpace[0].Hash() { 889 // Print a log messages to track what's going on 890 tail := s.progress.Subchains[0].Tail 891 want := s.progress.Subchains[0].Next 892 have := s.scratchSpace[0].Hash() 893 894 log.Warn("Invalid skeleton headers", "peer", s.scratchOwners[0], "number", tail-1, "want", want, "have", have) 895 896 // The peer delivered junk, or at least not the subchain we are 897 // syncing to. Free up the scratch space and assignment, reassign 898 // and drop the original peer. 899 for i := 0; i < requestHeaders; i++ { 900 s.scratchSpace[i] = nil 901 } 902 s.drop(s.scratchOwners[0]) 903 s.scratchOwners[0] = "" 904 break 905 } 906 // Scratch delivery matches required subchain, deliver the batch of 907 // headers and push the subchain forward 908 var consumed int 909 for _, header := range s.scratchSpace[:requestHeaders] { 910 if header != nil { // nil when the genesis is reached 911 consumed++ 912 913 rawdb.WriteSkeletonHeader(batch, header) 914 s.pulled++ 915 916 s.progress.Subchains[0].Tail-- 917 s.progress.Subchains[0].Next = header.ParentHash 918 919 // If we've reached an existing block in the chain, stop retrieving 920 // headers. 
Note, if we want to support light clients with the same 921 // code we'd need to switch here based on the downloader mode. That 922 // said, there's no such functionality for now, so don't complicate. 923 // 924 // In the case of full sync it would be enough to check for the body, 925 // but even a full syncing node will generate a receipt once block 926 // processing is done, so it's just one more "needless" check. 927 var ( 928 hasBody = rawdb.HasBody(s.db, header.ParentHash, header.Number.Uint64()-1) 929 hasReceipt = rawdb.HasReceipts(s.db, header.ParentHash, header.Number.Uint64()-1) 930 ) 931 if hasBody && hasReceipt { 932 linked = true 933 break 934 } 935 } 936 } 937 head := s.progress.Subchains[0].Head 938 tail := s.progress.Subchains[0].Tail 939 next := s.progress.Subchains[0].Next 940 941 log.Trace("Primary subchain extended", "head", head, "tail", tail, "next", next) 942 943 // If the beacon chain was linked to the local chain, completely swap out 944 // all internal progress and abort header synchronization. 945 if linked { 946 // Note, linking into the local chain should also mean that there are 947 // no leftover subchains, but just in case there's some junk due to 948 // strange conditions or bugs, clean up all internal state. 949 if len(s.progress.Subchains) > 1 { 950 log.Error("Cleaning up leftovers after beacon link") 951 s.progress.Subchains = s.progress.Subchains[:1] 952 } 953 break 954 } 955 // Batch of headers consumed, shift the download window forward 956 copy(s.scratchSpace, s.scratchSpace[requestHeaders:]) 957 for i := 0; i < requestHeaders; i++ { 958 s.scratchSpace[scratchHeaders-i-1] = nil 959 } 960 copy(s.scratchOwners, s.scratchOwners[1:]) 961 s.scratchOwners[scratchHeaders/requestHeaders-1] = "" 962 963 s.scratchHead -= uint64(consumed) 964 965 // If the subchain extended into the next subchain, we need to handle 966 // the overlap. Since there could be many overlaps (come on), do this 967 // in a loop. 
968 for len(s.progress.Subchains) > 1 && s.progress.Subchains[1].Head >= s.progress.Subchains[0].Tail { 969 // Extract some stats from the second subchain 970 head := s.progress.Subchains[1].Head 971 tail := s.progress.Subchains[1].Tail 972 next := s.progress.Subchains[1].Next 973 974 // Since we just overwrote part of the next subchain, we need to trim 975 // its head independent of matching or mismatching content 976 if s.progress.Subchains[1].Tail >= s.progress.Subchains[0].Tail { 977 // Fully overwritten, get rid of the subchain as a whole 978 log.Debug("Previous subchain fully overwritten", "head", head, "tail", tail, "next", next) 979 s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...) 980 continue 981 } else { 982 // Partially overwritten, trim the head to the overwritten size 983 log.Debug("Previous subchain partially overwritten", "head", head, "tail", tail, "next", next) 984 s.progress.Subchains[1].Head = s.progress.Subchains[0].Tail - 1 985 } 986 // If the old subchain is an extension of the new one, merge the two 987 // and let the skeleton syncer restart (to clean internal state) 988 if rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[1].Head).Hash() == s.progress.Subchains[0].Next { 989 log.Debug("Previous subchain merged", "head", head, "tail", tail, "next", next) 990 s.progress.Subchains[0].Tail = s.progress.Subchains[1].Tail 991 s.progress.Subchains[0].Next = s.progress.Subchains[1].Next 992 993 s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...) 994 merged = true 995 } 996 } 997 // If subchains were merged, all further available headers in the scratch 998 // space are invalid since we skipped ahead. Stop processing the scratch 999 // space to avoid dropping peers thinking they delivered invalid data. 
1000 if merged { 1001 break 1002 } 1003 } 1004 s.saveSyncStatus(batch) 1005 if err := batch.Write(); err != nil { 1006 log.Crit("Failed to write skeleton headers and progress", "err", err) 1007 } 1008 // Print a progress report making the UX a bit nicer 1009 left := s.progress.Subchains[0].Tail - 1 1010 if linked { 1011 left = 0 1012 } 1013 if time.Since(s.logged) > 8*time.Second || left == 0 { 1014 s.logged = time.Now() 1015 1016 if s.pulled == 0 { 1017 log.Info("Beacon sync starting", "left", left) 1018 } else { 1019 eta := float64(time.Since(s.started)) / float64(s.pulled) * float64(left) 1020 log.Info("Syncing beacon headers", "downloaded", s.pulled, "left", left, "eta", common.PrettyDuration(eta)) 1021 } 1022 } 1023 return linked, merged 1024 } 1025 1026 // Bounds retrieves the current head and tail tracked by the skeleton syncer. 1027 // This method is used by the backfiller, whose life cycle is controlled by the 1028 // skeleton syncer. 1029 // 1030 // Note, the method will not use the internal state of the skeleton, but will 1031 // rather blindly pull stuff from the database. This is fine, because the back- 1032 // filler will only run when the skeleton chain is fully downloaded and stable. 1033 // There might be new heads appended, but those are atomic from the perspective 1034 // of this method. Any head reorg will first tear down the backfiller and only 1035 // then make the modification. 1036 func (s *skeleton) Bounds() (head *types.Header, tail *types.Header, err error) { 1037 // Read the current sync progress from disk and figure out the current head. 1038 // Although there's a lot of error handling here, these are mostly as sanity 1039 // checks to avoid crashing if a programming error happens. These should not 1040 // happen in live code. 
1041 status := rawdb.ReadSkeletonSyncStatus(s.db) 1042 if len(status) == 0 { 1043 return nil, nil, errors.New("beacon sync not yet started") 1044 } 1045 progress := new(skeletonProgress) 1046 if err := json.Unmarshal(status, progress); err != nil { 1047 return nil, nil, err 1048 } 1049 head = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Head) 1050 tail = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Tail) 1051 1052 return head, tail, nil 1053 } 1054 1055 // Header retrieves a specific header tracked by the skeleton syncer. This method 1056 // is meant to be used by the backfiller, whose life cycle is controlled by the 1057 // skeleton syncer. 1058 // 1059 // Note, outside the permitted runtimes, this method might return nil results and 1060 // subsequent calls might return headers from different chains. 1061 func (s *skeleton) Header(number uint64) *types.Header { 1062 return rawdb.ReadSkeletonHeader(s.db, number) 1063 }