github.com/carter-ya/go-ethereum@v0.0.0-20230628080049-d2309be3983b/eth/downloader/skeleton.go (about) 1 // Copyright 2022 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "encoding/json" 21 "errors" 22 "fmt" 23 "math/rand" 24 "sort" 25 "time" 26 27 "github.com/ethereum/go-ethereum/common" 28 "github.com/ethereum/go-ethereum/core/rawdb" 29 "github.com/ethereum/go-ethereum/core/types" 30 "github.com/ethereum/go-ethereum/eth/protocols/eth" 31 "github.com/ethereum/go-ethereum/ethdb" 32 "github.com/ethereum/go-ethereum/log" 33 ) 34 35 // scratchHeaders is the number of headers to store in a scratch space to allow 36 // concurrent downloads. A header is about 0.5KB in size, so there is no worry 37 // about using too much memory. The only catch is that we can only validate gaps 38 // afer they're linked to the head, so the bigger the scratch space, the larger 39 // potential for invalid headers. 40 // 41 // The current scratch space of 131072 headers is expected to use 64MB RAM. 42 const scratchHeaders = 131072 43 44 // requestHeaders is the number of header to request from a remote peer in a single 45 // network packet. 
Although the skeleton downloader takes into consideration peer 46 // capacities when picking idlers, the packet size was decided to remain constant 47 // since headers are relatively small and it's easier to work with fixed batches 48 // vs. dynamic interval fillings. 49 const requestHeaders = 512 50 51 // errSyncLinked is an internal helper error to signal that the current sync 52 // cycle linked up to the genesis block, this the skeleton syncer should ping 53 // the backfiller to resume. Since we already have that logic on sync start, 54 // piggy-back on that instead of 2 entrypoints. 55 var errSyncLinked = errors.New("sync linked") 56 57 // errSyncMerged is an internal helper error to signal that the current sync 58 // cycle merged with a previously aborted subchain, thus the skeleton syncer 59 // should abort and restart with the new state. 60 var errSyncMerged = errors.New("sync merged") 61 62 // errSyncReorged is an internal helper error to signal that the head chain of 63 // the current sync cycle was (partially) reorged, thus the skeleton syncer 64 // should abort and restart with the new state. 65 var errSyncReorged = errors.New("sync reorged") 66 67 // errTerminated is returned if the sync mechanism was terminated for this run of 68 // the process. This is usually the case when Geth is shutting down and some events 69 // might still be propagating. 70 var errTerminated = errors.New("terminated") 71 72 // errReorgDenied is returned if an attempt is made to extend the beacon chain 73 // with a new header, but it does not link up to the existing sync. 74 var errReorgDenied = errors.New("non-forced head reorg denied") 75 76 func init() { 77 // Tuning parameters is nice, but the scratch space must be assignable in 78 // full to peers. It's a useless cornercase to support a dangling half-group. 
79 if scratchHeaders%requestHeaders != 0 { 80 panic("Please make scratchHeaders divisible by requestHeaders") 81 } 82 } 83 84 // subchain is a contiguous header chain segment that is backed by the database, 85 // but may not be linked to the live chain. The skeleton downloader may produce 86 // a new one of these every time it is restarted until the subchain grows large 87 // enough to connect with a previous subchain. 88 // 89 // The subchains use the exact same database namespace and are not disjoint from 90 // each other. As such, extending one to overlap the other entails reducing the 91 // second one first. This combined buffer model is used to avoid having to move 92 // data on disk when two subchains are joined together. 93 type subchain struct { 94 Head uint64 // Block number of the newest header in the subchain 95 Tail uint64 // Block number of the oldest header in the subchain 96 Next common.Hash // Block hash of the next oldest header in the subchain 97 } 98 99 // skeletonProgress is a database entry to allow suspending and resuming a chain 100 // sync. As the skeleton header chain is downloaded backwards, restarts can and 101 // will produce temporarily disjoint subchains. There is no way to restart a 102 // suspended skeleton sync without prior knowledge of all prior suspension points. 103 type skeletonProgress struct { 104 Subchains []*subchain // Disjoint subchains downloaded until now 105 } 106 107 // headUpdate is a notification that the beacon sync should switch to a new target. 108 // The update might request whether to forcefully change the target, or only try to 109 // extend it and fail if it's not possible. 
type headUpdate struct {
	header *types.Header // Header to update the sync target to
	force  bool          // Whether to force the update or only extend if possible
	errc   chan error    // Channel to signal acceptance (nil) or rejection (non-nil error) of the new head
}

// headerRequest tracks a pending header request to ensure responses are to
// actual requests and to validate any security constraints.
//
// Concurrency note: header requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. subchains).
// That data is only included to allow the runloop to match a response to the task
// being synced without having yet another set of maps.
type headerRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	deliver chan *headerResponse // Channel to deliver successful response on
	revert  chan *headerRequest  // Channel to deliver request failure on
	cancel  chan struct{}        // Channel to track sync cancellation
	stale   chan struct{}        // Channel to signal the request was dropped (closed once reverted)

	head uint64 // Head number of the requested batch of headers (number of the first header expected in the reply)
}

// headerResponse is an already verified remote response to a header request.
type headerResponse struct {
	peer    *peerConnection // Peer from which this response originates
	reqid   uint64          // Request ID that this response fulfils
	headers []*types.Header // Chain of headers, newest first (each entry's parent is the following entry)
}

// backfiller is a callback interface through which the skeleton sync can tell
// the downloader that it should suspend or resume backfilling on specific head
// events (e.g. suspend on forks or gaps, resume on successful linkups).
147 type backfiller interface { 148 // suspend requests the backfiller to abort any running full or snap sync 149 // based on the skeleton chain as it might be invalid. The backfiller should 150 // gracefully handle multiple consecutive suspends without a resume, even 151 // on initial startup. 152 // 153 // The method should return the last block header that has been successfully 154 // backfilled, or nil if the backfiller was not resumed. 155 suspend() *types.Header 156 157 // resume requests the backfiller to start running fill or snap sync based on 158 // the skeleton chain as it has successfully been linked. Appending new heads 159 // to the end of the chain will not result in suspend/resume cycles. 160 // leaking too much sync logic out to the filler. 161 resume() 162 } 163 164 // skeleton represents a header chain synchronized after the merge where blocks 165 // aren't validated any more via PoW in a forward fashion, rather are dictated 166 // and extended at the head via the beacon chain and backfilled on the original 167 // Ethereum block sync protocol. 168 // 169 // Since the skeleton is grown backwards from head to genesis, it is handled as 170 // a separate entity, not mixed in with the logical sequential transition of the 171 // blocks. Once the skeleton is connected to an existing, validated chain, the 172 // headers will be moved into the main downloader for filling and execution. 173 // 174 // Opposed to the original Ethereum block synchronization which is trustless (and 175 // uses a master peer to minimize the attack surface), post-merge block sync starts 176 // from a trusted head. As such, there is no need for a master peer any more and 177 // headers can be requested fully concurrently (though some batches might be 178 // discarded if they don't link up correctly). 179 // 180 // Although a skeleton is part of a sync cycle, it is not recreated, rather stays 181 // alive throughout the lifetime of the downloader. 
This allows it to be extended 182 // concurrently with the sync cycle, since extensions arrive from an API surface, 183 // not from within (vs. legacy Ethereum sync). 184 // 185 // Since the skeleton tracks the entire header chain until it is consumed by the 186 // forward block filling, it needs 0.5KB/block storage. At current mainnet sizes 187 // this is only possible with a disk backend. Since the skeleton is separate from 188 // the node's header chain, storing the headers ephemerally until sync finishes 189 // is wasted disk IO, but it's a price we're going to pay to keep things simple 190 // for now. 191 type skeleton struct { 192 db ethdb.Database // Database backing the skeleton 193 filler backfiller // Chain syncer suspended/resumed by head events 194 195 peers *peerSet // Set of peers we can sync from 196 idles map[string]*peerConnection // Set of idle peers in the current sync cycle 197 drop peerDropFn // Drops a peer for misbehaving 198 199 progress *skeletonProgress // Sync progress tracker for resumption and metrics 200 started time.Time // Timestamp when the skeleton syncer was created 201 logged time.Time // Timestamp when progress was last logged to the user 202 pulled uint64 // Number of headers downloaded in this run 203 204 scratchSpace []*types.Header // Scratch space to accumulate headers in (first = recent) 205 scratchOwners []string // Peer IDs owning chunks of the scratch space (pend or delivered) 206 scratchHead uint64 // Block number of the first item in the scratch space 207 208 requests map[uint64]*headerRequest // Header requests currently running 209 210 headEvents chan *headUpdate // Notification channel for new heads 211 terminate chan chan error // Termination channel to abort sync 212 terminated chan struct{} // Channel to signal that the syncer is dead 213 214 // Callback hooks used during testing 215 syncStarting func() // callback triggered after a sync cycle is inited but before started 216 } 217 218 // newSkeleton creates a 
new sync skeleton that tracks a potentially dangling 219 // header chain until it's linked into an existing set of blocks. 220 func newSkeleton(db ethdb.Database, peers *peerSet, drop peerDropFn, filler backfiller) *skeleton { 221 sk := &skeleton{ 222 db: db, 223 filler: filler, 224 peers: peers, 225 drop: drop, 226 requests: make(map[uint64]*headerRequest), 227 headEvents: make(chan *headUpdate), 228 terminate: make(chan chan error), 229 terminated: make(chan struct{}), 230 } 231 go sk.startup() 232 return sk 233 } 234 235 // startup is an initial background loop which waits for an event to start or 236 // tear the syncer down. This is required to make the skeleton sync loop once 237 // per process but at the same time not start before the beacon chain announces 238 // a new (existing) head. 239 func (s *skeleton) startup() { 240 // Close a notification channel so anyone sending us events will know if the 241 // sync loop was torn down for good. 242 defer close(s.terminated) 243 244 // Wait for startup or teardown. This wait might loop a few times if a beacon 245 // client requests sync head extensions, but not forced reorgs (i.e. they are 246 // giving us new payloads without setting a starting head initially). 247 for { 248 select { 249 case errc := <-s.terminate: 250 // No head was announced but Geth is shutting down 251 errc <- nil 252 return 253 254 case event := <-s.headEvents: 255 // New head announced, start syncing to it, looping every time a current 256 // cycle is terminated due to a chain event (head reorg, old chain merge). 257 if !event.force { 258 event.errc <- errors.New("forced head needed for startup") 259 continue 260 } 261 event.errc <- nil // forced head accepted for startup 262 head := event.header 263 s.started = time.Now() 264 265 for { 266 // If the sync cycle terminated or was terminated, propagate up when 267 // higher layers request termination. 
There's no fancy explicit error 268 // signalling as the sync loop should never terminate (TM). 269 newhead, err := s.sync(head) 270 switch { 271 case err == errSyncLinked: 272 // Sync cycle linked up to the genesis block. Tear down the loop 273 // and restart it so, it can properly notify the backfiller. Don't 274 // account a new head. 275 head = nil 276 277 case err == errSyncMerged: 278 // Subchains were merged, we just need to reinit the internal 279 // start to continue on the tail of the merged chain. Don't 280 // announce a new head, 281 head = nil 282 283 case err == errSyncReorged: 284 // The subchain being synced got modified at the head in a 285 // way that requires resyncing it. Restart sync with the new 286 // head to force a cleanup. 287 head = newhead 288 289 case err == errTerminated: 290 // Sync was requested to be terminated from within, stop and 291 // return (no need to pass a message, was already done internally) 292 return 293 294 default: 295 // Sync either successfully terminated or failed with an unhandled 296 // error. Abort and wait until Geth requests a termination. 297 errc := <-s.terminate 298 errc <- err 299 return 300 } 301 } 302 } 303 } 304 } 305 306 // Terminate tears down the syncer indefinitely. 307 func (s *skeleton) Terminate() error { 308 // Request termination and fetch any errors 309 errc := make(chan error) 310 s.terminate <- errc 311 err := <-errc 312 313 // Wait for full shutdown (not necessary, but cleaner) 314 <-s.terminated 315 return err 316 } 317 318 // Sync starts or resumes a previous sync cycle to download and maintain a reverse 319 // header chain starting at the head and leading towards genesis to an available 320 // ancestor. 321 // 322 // This method does not block, rather it just waits until the syncer receives the 323 // fed header. What the syncer does with it is the syncer's problem. 
324 func (s *skeleton) Sync(head *types.Header, force bool) error { 325 log.Trace("New skeleton head announced", "number", head.Number, "hash", head.Hash(), "force", force) 326 errc := make(chan error) 327 328 select { 329 case s.headEvents <- &headUpdate{header: head, force: force, errc: errc}: 330 return <-errc 331 case <-s.terminated: 332 return errTerminated 333 } 334 } 335 336 // sync is the internal version of Sync that executes a single sync cycle, either 337 // until some termination condition is reached, or until the current cycle merges 338 // with a previously aborted run. 339 func (s *skeleton) sync(head *types.Header) (*types.Header, error) { 340 // If we're continuing a previous merge interrupt, just access the existing 341 // old state without initing from disk. 342 if head == nil { 343 head = rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[0].Head) 344 } else { 345 // Otherwise, initialize the sync, trimming and previous leftovers until 346 // we're consistent with the newly requested chain head 347 s.initSync(head) 348 } 349 // Create the scratch space to fill with concurrently downloaded headers 350 s.scratchSpace = make([]*types.Header, scratchHeaders) 351 defer func() { s.scratchSpace = nil }() // don't hold on to references after sync 352 353 s.scratchOwners = make([]string, scratchHeaders/requestHeaders) 354 defer func() { s.scratchOwners = nil }() // don't hold on to references after sync 355 356 s.scratchHead = s.progress.Subchains[0].Tail - 1 // tail must not be 0! 357 358 // If the sync is already done, resume the backfiller. When the loop stops, 359 // terminate the backfiller too. 
360 linked := len(s.progress.Subchains) == 1 && 361 rawdb.HasHeader(s.db, s.progress.Subchains[0].Next, s.scratchHead) && 362 rawdb.HasBody(s.db, s.progress.Subchains[0].Next, s.scratchHead) && 363 rawdb.HasReceipts(s.db, s.progress.Subchains[0].Next, s.scratchHead) 364 if linked { 365 s.filler.resume() 366 } 367 defer func() { 368 if filled := s.filler.suspend(); filled != nil { 369 // If something was filled, try to delete stale sync helpers. If 370 // unsuccessful, warn the user, but not much else we can do (it's 371 // a programming error, just let users report an issue and don't 372 // choke in the meantime). 373 if err := s.cleanStales(filled); err != nil { 374 log.Error("Failed to clean stale beacon headers", "err", err) 375 } 376 } 377 }() 378 // Create a set of unique channels for this sync cycle. We need these to be 379 // ephemeral so a data race doesn't accidentally deliver something stale on 380 // a persistent channel across syncs (yup, this happened) 381 var ( 382 requestFails = make(chan *headerRequest) 383 responses = make(chan *headerResponse) 384 ) 385 cancel := make(chan struct{}) 386 defer close(cancel) 387 388 log.Debug("Starting reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead) 389 390 // Whether sync completed or not, disregard any future packets 391 defer func() { 392 log.Debug("Terminating reverse header sync cycle", "head", head.Number, "hash", head.Hash(), "cont", s.scratchHead) 393 s.requests = make(map[uint64]*headerRequest) 394 }() 395 396 // Start tracking idle peers for task assignments 397 peering := make(chan *peeringEvent, 64) // arbitrary buffer, just some burst protection 398 399 peeringSub := s.peers.SubscribeEvents(peering) 400 defer peeringSub.Unsubscribe() 401 402 s.idles = make(map[string]*peerConnection) 403 for _, peer := range s.peers.AllPeers() { 404 s.idles[peer.id] = peer 405 } 406 // Nofity any tester listening for startup events 407 if s.syncStarting != nil { 408 
s.syncStarting() 409 } 410 for { 411 // Something happened, try to assign new tasks to any idle peers 412 if !linked { 413 s.assignTasks(responses, requestFails, cancel) 414 } 415 // Wait for something to happen 416 select { 417 case event := <-peering: 418 // A peer joined or left, the tasks queue and allocations need to be 419 // checked for potential assignment or reassignment 420 peerid := event.peer.id 421 if event.join { 422 log.Debug("Joining skeleton peer", "id", peerid) 423 s.idles[peerid] = event.peer 424 } else { 425 log.Debug("Leaving skeleton peer", "id", peerid) 426 s.revertRequests(peerid) 427 delete(s.idles, peerid) 428 } 429 430 case errc := <-s.terminate: 431 errc <- nil 432 return nil, errTerminated 433 434 case event := <-s.headEvents: 435 // New head was announced, try to integrate it. If successful, nothing 436 // needs to be done as the head simply extended the last range. For now 437 // we don't seamlessly integrate reorgs to keep things simple. If the 438 // network starts doing many mini reorgs, it might be worthwhile handling 439 // a limited depth without an error. 440 if reorged := s.processNewHead(event.header, event.force); reorged { 441 // If a reorg is needed, and we're forcing the new head, signal 442 // the syncer to tear down and start over. Otherwise, drop the 443 // non-force reorg. 444 if event.force { 445 event.errc <- nil // forced head reorg accepted 446 return event.header, errSyncReorged 447 } 448 event.errc <- errReorgDenied 449 continue 450 } 451 event.errc <- nil // head extension accepted 452 453 // New head was integrated into the skeleton chain. If the backfiller 454 // is still running, it will pick it up. If it already terminated, 455 // a new cycle needs to be spun up. 456 if linked { 457 s.filler.resume() 458 } 459 460 case req := <-requestFails: 461 s.revertRequest(req) 462 463 case res := <-responses: 464 // Process the batch of headers. 
If though processing we managed to 465 // link the current subchain to a previously downloaded one, abort the 466 // sync and restart with the merged subchains. 467 // 468 // If we managed to link to the existing local chain or genesis block, 469 // abort sync altogether. 470 linked, merged := s.processResponse(res) 471 if linked { 472 log.Debug("Beacon sync linked to local chain") 473 return nil, errSyncLinked 474 } 475 if merged { 476 log.Debug("Beacon sync merged subchains") 477 return nil, errSyncMerged 478 } 479 // We still have work to do, loop and repeat 480 } 481 } 482 } 483 484 // initSync attempts to get the skeleton sync into a consistent state wrt any 485 // past state on disk and the newly requested head to sync to. If the new head 486 // is nil, the method will return and continue from the previous head. 487 func (s *skeleton) initSync(head *types.Header) { 488 // Extract the head number, we'll need it all over 489 number := head.Number.Uint64() 490 491 // Retrieve the previously saved sync progress 492 if status := rawdb.ReadSkeletonSyncStatus(s.db); len(status) > 0 { 493 s.progress = new(skeletonProgress) 494 if err := json.Unmarshal(status, s.progress); err != nil { 495 log.Error("Failed to decode skeleton sync status", "err", err) 496 } else { 497 // Previous sync was available, print some continuation logs 498 for _, subchain := range s.progress.Subchains { 499 log.Debug("Restarting skeleton subchain", "head", subchain.Head, "tail", subchain.Tail) 500 } 501 // Create a new subchain for the head (unless the last can be extended), 502 // trimming anything it would overwrite 503 headchain := &subchain{ 504 Head: number, 505 Tail: number, 506 Next: head.ParentHash, 507 } 508 for len(s.progress.Subchains) > 0 { 509 // If the last chain is above the new head, delete altogether 510 lastchain := s.progress.Subchains[0] 511 if lastchain.Tail >= headchain.Tail { 512 log.Debug("Dropping skeleton subchain", "head", lastchain.Head, "tail", lastchain.Tail) 513 
s.progress.Subchains = s.progress.Subchains[1:] 514 continue 515 } 516 // Otherwise truncate the last chain if needed and abort trimming 517 if lastchain.Head >= headchain.Tail { 518 log.Debug("Trimming skeleton subchain", "oldhead", lastchain.Head, "newhead", headchain.Tail-1, "tail", lastchain.Tail) 519 lastchain.Head = headchain.Tail - 1 520 } 521 break 522 } 523 // If the last subchain can be extended, we're lucky. Otherwise create 524 // a new subchain sync task. 525 var extended bool 526 if n := len(s.progress.Subchains); n > 0 { 527 lastchain := s.progress.Subchains[0] 528 if lastchain.Head == headchain.Tail-1 { 529 lasthead := rawdb.ReadSkeletonHeader(s.db, lastchain.Head) 530 if lasthead.Hash() == head.ParentHash { 531 log.Debug("Extended skeleton subchain with new head", "head", headchain.Tail, "tail", lastchain.Tail) 532 lastchain.Head = headchain.Tail 533 extended = true 534 } 535 } 536 } 537 if !extended { 538 log.Debug("Created new skeleton subchain", "head", number, "tail", number) 539 s.progress.Subchains = append([]*subchain{headchain}, s.progress.Subchains...) 540 } 541 // Update the database with the new sync stats and insert the new 542 // head header. We won't delete any trimmed skeleton headers since 543 // those will be outside the index space of the many subchains and 544 // the database space will be reclaimed eventually when processing 545 // blocks above the current head (TODO(karalabe): don't forget). 546 batch := s.db.NewBatch() 547 548 rawdb.WriteSkeletonHeader(batch, head) 549 s.saveSyncStatus(batch) 550 551 if err := batch.Write(); err != nil { 552 log.Crit("Failed to write skeleton sync status", "err", err) 553 } 554 return 555 } 556 } 557 // Either we've failed to decode the previous state, or there was none. Start 558 // a fresh sync with a single subchain represented by the currently sent 559 // chain head. 
560 s.progress = &skeletonProgress{ 561 Subchains: []*subchain{ 562 { 563 Head: number, 564 Tail: number, 565 Next: head.ParentHash, 566 }, 567 }, 568 } 569 batch := s.db.NewBatch() 570 571 rawdb.WriteSkeletonHeader(batch, head) 572 s.saveSyncStatus(batch) 573 574 if err := batch.Write(); err != nil { 575 log.Crit("Failed to write initial skeleton sync status", "err", err) 576 } 577 log.Debug("Created initial skeleton subchain", "head", number, "tail", number) 578 } 579 580 // saveSyncStatus marshals the remaining sync tasks into leveldb. 581 func (s *skeleton) saveSyncStatus(db ethdb.KeyValueWriter) { 582 status, err := json.Marshal(s.progress) 583 if err != nil { 584 panic(err) // This can only fail during implementation 585 } 586 rawdb.WriteSkeletonSyncStatus(db, status) 587 } 588 589 // processNewHead does the internal shuffling for a new head marker and either 590 // accepts and integrates it into the skeleton or requests a reorg. Upon reorg, 591 // the syncer will tear itself down and restart with a fresh head. It is simpler 592 // to reconstruct the sync state than to mutate it and hope for the best. 593 func (s *skeleton) processNewHead(head *types.Header, force bool) bool { 594 // If the header cannot be inserted without interruption, return an error for 595 // the outer loop to tear down the skeleton sync and restart it 596 number := head.Number.Uint64() 597 598 lastchain := s.progress.Subchains[0] 599 if lastchain.Tail >= number { 600 // If the chain is down to a single beacon header, and it is re-announced 601 // once more, ignore it instead of tearing down sync for a noop. 
602 if lastchain.Head == lastchain.Tail { 603 if current := rawdb.ReadSkeletonHeader(s.db, number); current.Hash() == head.Hash() { 604 return false 605 } 606 } 607 // Not a noop / double head announce, abort with a reorg 608 if force { 609 log.Warn("Beacon chain reorged", "tail", lastchain.Tail, "head", lastchain.Head, "newHead", number) 610 } 611 return true 612 } 613 if lastchain.Head+1 < number { 614 if force { 615 log.Warn("Beacon chain gapped", "head", lastchain.Head, "newHead", number) 616 } 617 return true 618 } 619 if parent := rawdb.ReadSkeletonHeader(s.db, number-1); parent.Hash() != head.ParentHash { 620 if force { 621 log.Warn("Beacon chain forked", "ancestor", parent.Number, "hash", parent.Hash(), "want", head.ParentHash) 622 } 623 return true 624 } 625 // New header seems to be in the last subchain range. Unwind any extra headers 626 // from the chain tip and insert the new head. We won't delete any trimmed 627 // skeleton headers since those will be outside the index space of the many 628 // subchains and the database space will be reclaimed eventually when processing 629 // blocks above the current head (TODO(karalabe): don't forget). 630 batch := s.db.NewBatch() 631 632 rawdb.WriteSkeletonHeader(batch, head) 633 lastchain.Head = number 634 s.saveSyncStatus(batch) 635 636 if err := batch.Write(); err != nil { 637 log.Crit("Failed to write skeleton sync status", "err", err) 638 } 639 return false 640 } 641 642 // assignTasks attempts to match idle peers to pending header retrievals. 
643 func (s *skeleton) assignTasks(success chan *headerResponse, fail chan *headerRequest, cancel chan struct{}) { 644 // Sort the peers by download capacity to use faster ones if many available 645 idlers := &peerCapacitySort{ 646 peers: make([]*peerConnection, 0, len(s.idles)), 647 caps: make([]int, 0, len(s.idles)), 648 } 649 targetTTL := s.peers.rates.TargetTimeout() 650 for _, peer := range s.idles { 651 idlers.peers = append(idlers.peers, peer) 652 idlers.caps = append(idlers.caps, s.peers.rates.Capacity(peer.id, eth.BlockHeadersMsg, targetTTL)) 653 } 654 if len(idlers.peers) == 0 { 655 return 656 } 657 sort.Sort(idlers) 658 659 // Find header regions not yet downloading and fill them 660 for task, owner := range s.scratchOwners { 661 // If we're out of idle peers, stop assigning tasks 662 if len(idlers.peers) == 0 { 663 return 664 } 665 // Skip any tasks already filling 666 if owner != "" { 667 continue 668 } 669 // If we've reached the genesis, stop assigning tasks 670 if uint64(task*requestHeaders) >= s.scratchHead { 671 return 672 } 673 // Found a task and have peers available, assign it 674 idle := idlers.peers[0] 675 676 idlers.peers = idlers.peers[1:] 677 idlers.caps = idlers.caps[1:] 678 679 // Matched a pending task to an idle peer, allocate a unique request id 680 var reqid uint64 681 for { 682 reqid = uint64(rand.Int63()) 683 if reqid == 0 { 684 continue 685 } 686 if _, ok := s.requests[reqid]; ok { 687 continue 688 } 689 break 690 } 691 // Generate the network query and send it to the peer 692 req := &headerRequest{ 693 peer: idle.id, 694 id: reqid, 695 deliver: success, 696 revert: fail, 697 cancel: cancel, 698 stale: make(chan struct{}), 699 head: s.scratchHead - uint64(task*requestHeaders), 700 } 701 s.requests[reqid] = req 702 delete(s.idles, idle.id) 703 704 // Generate the network query and send it to the peer 705 go s.executeTask(idle, req) 706 707 // Inject the request into the task to block further assignments 708 s.scratchOwners[task] = 
idle.id 709 } 710 } 711 712 // executeTask executes a single fetch request, blocking until either a result 713 // arrives or a timeouts / cancellation is triggered. The method should be run 714 // on its own goroutine and will deliver on the requested channels. 715 func (s *skeleton) executeTask(peer *peerConnection, req *headerRequest) { 716 start := time.Now() 717 resCh := make(chan *eth.Response) 718 719 // Figure out how many headers to fetch. Usually this will be a full batch, 720 // but for the very tail of the chain, trim the request to the number left. 721 // Since nodes may or may not return the genesis header for a batch request, 722 // don't even request it. The parent hash of block #1 is enough to link. 723 requestCount := requestHeaders 724 if req.head < requestHeaders { 725 requestCount = int(req.head) 726 } 727 peer.log.Trace("Fetching skeleton headers", "from", req.head, "count", requestCount) 728 netreq, err := peer.peer.RequestHeadersByNumber(req.head, requestCount, 0, true, resCh) 729 if err != nil { 730 peer.log.Trace("Failed to request headers", "err", err) 731 s.scheduleRevertRequest(req) 732 return 733 } 734 defer netreq.Close() 735 736 // Wait until the response arrives, the request is cancelled or times out 737 ttl := s.peers.rates.TargetTimeout() 738 739 timeoutTimer := time.NewTimer(ttl) 740 defer timeoutTimer.Stop() 741 742 select { 743 case <-req.cancel: 744 peer.log.Debug("Header request cancelled") 745 s.scheduleRevertRequest(req) 746 747 case <-timeoutTimer.C: 748 // Header retrieval timed out, update the metrics 749 peer.log.Warn("Header request timed out, dropping peer", "elapsed", ttl) 750 headerTimeoutMeter.Mark(1) 751 s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, 0, 0) 752 s.scheduleRevertRequest(req) 753 754 // At this point we either need to drop the offending peer, or we need a 755 // mechanism to allow waiting for the response and not cancel it. 
For now 756 // lets go with dropping since the header sizes are deterministic and the 757 // beacon sync runs exclusive (downloader is idle) so there should be no 758 // other load to make timeouts probable. If we notice that timeouts happen 759 // more often than we'd like, we can introduce a tracker for the requests 760 // gone stale and monitor them. However, in that case too, we need a way 761 // to protect against malicious peers never responding, so it would need 762 // a second, hard-timeout mechanism. 763 s.drop(peer.id) 764 765 case res := <-resCh: 766 // Headers successfully retrieved, update the metrics 767 headers := *res.Res.(*eth.BlockHeadersPacket) 768 769 headerReqTimer.Update(time.Since(start)) 770 s.peers.rates.Update(peer.id, eth.BlockHeadersMsg, res.Time, len(headers)) 771 772 // Cross validate the headers with the requests 773 switch { 774 case len(headers) == 0: 775 // No headers were delivered, reject the response and reschedule 776 peer.log.Debug("No headers delivered") 777 res.Done <- errors.New("no headers delivered") 778 s.scheduleRevertRequest(req) 779 780 case headers[0].Number.Uint64() != req.head: 781 // Header batch anchored at non-requested number 782 peer.log.Debug("Invalid header response head", "have", headers[0].Number, "want", req.head) 783 res.Done <- errors.New("invalid header batch anchor") 784 s.scheduleRevertRequest(req) 785 786 case req.head >= requestHeaders && len(headers) != requestHeaders: 787 // Invalid number of non-genesis headers delivered, reject the response and reschedule 788 peer.log.Debug("Invalid non-genesis header count", "have", len(headers), "want", requestHeaders) 789 res.Done <- errors.New("not enough non-genesis headers delivered") 790 s.scheduleRevertRequest(req) 791 792 case req.head < requestHeaders && uint64(len(headers)) != req.head: 793 // Invalid number of genesis headers delivered, reject the response and reschedule 794 peer.log.Debug("Invalid genesis header count", "have", len(headers), 
"want", headers[0].Number.Uint64()) 795 res.Done <- errors.New("not enough genesis headers delivered") 796 s.scheduleRevertRequest(req) 797 798 default: 799 // Packet seems structurally valid, check hash progression and if it 800 // is correct too, deliver for storage 801 for i := 0; i < len(headers)-1; i++ { 802 if headers[i].ParentHash != headers[i+1].Hash() { 803 peer.log.Debug("Invalid hash progression", "index", i, "wantparenthash", headers[i].ParentHash, "haveparenthash", headers[i+1].Hash()) 804 res.Done <- errors.New("invalid hash progression") 805 s.scheduleRevertRequest(req) 806 return 807 } 808 } 809 // Hash chain is valid. The delivery might still be junk as we're 810 // downloading batches concurrently (so no way to link the headers 811 // until gaps are filled); in that case, we'll nuke the peer when 812 // we detect the fault. 813 res.Done <- nil 814 815 select { 816 case req.deliver <- &headerResponse{ 817 peer: peer, 818 reqid: req.id, 819 headers: headers, 820 }: 821 case <-req.cancel: 822 } 823 } 824 } 825 } 826 827 // revertRequests locates all the currently pending requests from a particular 828 // peer and reverts them, rescheduling for others to fulfill. 829 func (s *skeleton) revertRequests(peer string) { 830 // Gather the requests first, revertals need the lock too 831 var requests []*headerRequest 832 for _, req := range s.requests { 833 if req.peer == peer { 834 requests = append(requests, req) 835 } 836 } 837 // Revert all the requests matching the peer 838 for _, req := range requests { 839 s.revertRequest(req) 840 } 841 } 842 843 // scheduleRevertRequest asks the event loop to clean up a request and return 844 // all failed retrieval tasks to the scheduler for reassignment. 
845 func (s *skeleton) scheduleRevertRequest(req *headerRequest) { 846 select { 847 case req.revert <- req: 848 // Sync event loop notified 849 case <-req.cancel: 850 // Sync cycle got cancelled 851 case <-req.stale: 852 // Request already reverted 853 } 854 } 855 856 // revertRequest cleans up a request and returns all failed retrieval tasks to 857 // the scheduler for reassignment. 858 // 859 // Note, this needs to run on the event runloop thread to reschedule to idle peers. 860 // On peer threads, use scheduleRevertRequest. 861 func (s *skeleton) revertRequest(req *headerRequest) { 862 log.Trace("Reverting header request", "peer", req.peer, "reqid", req.id) 863 select { 864 case <-req.stale: 865 log.Trace("Header request already reverted", "peer", req.peer, "reqid", req.id) 866 return 867 default: 868 } 869 close(req.stale) 870 871 // Remove the request from the tracked set 872 delete(s.requests, req.id) 873 874 // Remove the request from the tracked set and mark the task as not-pending, 875 // ready for rescheduling 876 s.scratchOwners[(s.scratchHead-req.head)/requestHeaders] = "" 877 } 878 879 func (s *skeleton) processResponse(res *headerResponse) (linked bool, merged bool) { 880 res.peer.log.Trace("Processing header response", "head", res.headers[0].Number, "hash", res.headers[0].Hash(), "count", len(res.headers)) 881 882 // Whether the response is valid, we can mark the peer as idle and notify 883 // the scheduler to assign a new task. If the response is invalid, we'll 884 // drop the peer in a bit. 885 s.idles[res.peer.id] = res.peer 886 887 // Ensure the response is for a valid request 888 if _, ok := s.requests[res.reqid]; !ok { 889 // Some internal accounting is broken. A request either times out or it 890 // gets fulfilled successfully. It should not be possible to deliver a 891 // response to a non-existing request. 
892 res.peer.log.Error("Unexpected header packet") 893 return false, false 894 } 895 delete(s.requests, res.reqid) 896 897 // Insert the delivered headers into the scratch space independent of the 898 // content or continuation; those will be validated in a moment 899 head := res.headers[0].Number.Uint64() 900 copy(s.scratchSpace[s.scratchHead-head:], res.headers) 901 902 // If there's still a gap in the head of the scratch space, abort 903 if s.scratchSpace[0] == nil { 904 return false, false 905 } 906 // Try to consume any head headers, validating the boundary conditions 907 batch := s.db.NewBatch() 908 for s.scratchSpace[0] != nil { 909 // Next batch of headers available, cross-reference with the subchain 910 // we are extending and either accept or discard 911 if s.progress.Subchains[0].Next != s.scratchSpace[0].Hash() { 912 // Print a log messages to track what's going on 913 tail := s.progress.Subchains[0].Tail 914 want := s.progress.Subchains[0].Next 915 have := s.scratchSpace[0].Hash() 916 917 log.Warn("Invalid skeleton headers", "peer", s.scratchOwners[0], "number", tail-1, "want", want, "have", have) 918 919 // The peer delivered junk, or at least not the subchain we are 920 // syncing to. Free up the scratch space and assignment, reassign 921 // and drop the original peer. 922 for i := 0; i < requestHeaders; i++ { 923 s.scratchSpace[i] = nil 924 } 925 s.drop(s.scratchOwners[0]) 926 s.scratchOwners[0] = "" 927 break 928 } 929 // Scratch delivery matches required subchain, deliver the batch of 930 // headers and push the subchain forward 931 var consumed int 932 for _, header := range s.scratchSpace[:requestHeaders] { 933 if header != nil { // nil when the genesis is reached 934 consumed++ 935 936 rawdb.WriteSkeletonHeader(batch, header) 937 s.pulled++ 938 939 s.progress.Subchains[0].Tail-- 940 s.progress.Subchains[0].Next = header.ParentHash 941 942 // If we've reached an existing block in the chain, stop retrieving 943 // headers. 
Note, if we want to support light clients with the same 944 // code we'd need to switch here based on the downloader mode. That 945 // said, there's no such functionality for now, so don't complicate. 946 // 947 // In the case of full sync it would be enough to check for the body, 948 // but even a full syncing node will generate a receipt once block 949 // processing is done, so it's just one more "needless" check. 950 // 951 // The weird cascading checks are done to minimize the database reads. 952 linked = rawdb.HasHeader(s.db, header.ParentHash, header.Number.Uint64()-1) && 953 rawdb.HasBody(s.db, header.ParentHash, header.Number.Uint64()-1) && 954 rawdb.HasReceipts(s.db, header.ParentHash, header.Number.Uint64()-1) 955 if linked { 956 break 957 } 958 } 959 } 960 head := s.progress.Subchains[0].Head 961 tail := s.progress.Subchains[0].Tail 962 next := s.progress.Subchains[0].Next 963 964 log.Trace("Primary subchain extended", "head", head, "tail", tail, "next", next) 965 966 // If the beacon chain was linked to the local chain, completely swap out 967 // all internal progress and abort header synchronization. 968 if linked { 969 // Linking into the local chain should also mean that there are no 970 // leftover subchains, but in the case of importing the blocks via 971 // the engine API, we will not push the subchains forward. This will 972 // lead to a gap between an old sync cycle and a future one. 973 if subchains := len(s.progress.Subchains); subchains > 1 { 974 switch { 975 // If there are only 2 subchains - the current one and an older 976 // one - and the old one consists of a single block, then it's 977 // the expected new sync cycle after some propagated blocks. Log 978 // it for debugging purposes, explicitly clean and don't escalate. 
979 case subchains == 2 && s.progress.Subchains[1].Head == s.progress.Subchains[1].Tail: 980 log.Debug("Cleaning previous beacon sync state", "head", s.progress.Subchains[1].Head) 981 rawdb.DeleteSkeletonHeader(batch, s.progress.Subchains[1].Head) 982 s.progress.Subchains = s.progress.Subchains[:1] 983 984 // If we have more than one header or more than one leftover chain, 985 // the syncer's internal state is corrupted. Do try to fix it, but 986 // be very vocal about the fault. 987 default: 988 var context []interface{} 989 990 for i := range s.progress.Subchains[1:] { 991 context = append(context, fmt.Sprintf("stale_head_%d", i+1)) 992 context = append(context, s.progress.Subchains[i+1].Head) 993 context = append(context, fmt.Sprintf("stale_tail_%d", i+1)) 994 context = append(context, s.progress.Subchains[i+1].Tail) 995 context = append(context, fmt.Sprintf("stale_next_%d", i+1)) 996 context = append(context, s.progress.Subchains[i+1].Next) 997 } 998 log.Error("Cleaning spurious beacon sync leftovers", context...) 999 s.progress.Subchains = s.progress.Subchains[:1] 1000 1001 // Note, here we didn't actually delete the headers at all, 1002 // just the metadata. We could implement a cleanup mechanism, 1003 // but further modifying corrupted state is kind of asking 1004 // for it. Unless there's a good enough reason to risk it, 1005 // better to live with the small database junk. 1006 } 1007 } 1008 break 1009 } 1010 // Batch of headers consumed, shift the download window forward 1011 copy(s.scratchSpace, s.scratchSpace[requestHeaders:]) 1012 for i := 0; i < requestHeaders; i++ { 1013 s.scratchSpace[scratchHeaders-i-1] = nil 1014 } 1015 copy(s.scratchOwners, s.scratchOwners[1:]) 1016 s.scratchOwners[scratchHeaders/requestHeaders-1] = "" 1017 1018 s.scratchHead -= uint64(consumed) 1019 1020 // If the subchain extended into the next subchain, we need to handle 1021 // the overlap. Since there could be many overlaps (come on), do this 1022 // in a loop. 
1023 for len(s.progress.Subchains) > 1 && s.progress.Subchains[1].Head >= s.progress.Subchains[0].Tail { 1024 // Extract some stats from the second subchain 1025 head := s.progress.Subchains[1].Head 1026 tail := s.progress.Subchains[1].Tail 1027 next := s.progress.Subchains[1].Next 1028 1029 // Since we just overwrote part of the next subchain, we need to trim 1030 // its head independent of matching or mismatching content 1031 if s.progress.Subchains[1].Tail >= s.progress.Subchains[0].Tail { 1032 // Fully overwritten, get rid of the subchain as a whole 1033 log.Debug("Previous subchain fully overwritten", "head", head, "tail", tail, "next", next) 1034 s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...) 1035 continue 1036 } else { 1037 // Partially overwritten, trim the head to the overwritten size 1038 log.Debug("Previous subchain partially overwritten", "head", head, "tail", tail, "next", next) 1039 s.progress.Subchains[1].Head = s.progress.Subchains[0].Tail - 1 1040 } 1041 // If the old subchain is an extension of the new one, merge the two 1042 // and let the skeleton syncer restart (to clean internal state) 1043 if rawdb.ReadSkeletonHeader(s.db, s.progress.Subchains[1].Head).Hash() == s.progress.Subchains[0].Next { 1044 log.Debug("Previous subchain merged", "head", head, "tail", tail, "next", next) 1045 s.progress.Subchains[0].Tail = s.progress.Subchains[1].Tail 1046 s.progress.Subchains[0].Next = s.progress.Subchains[1].Next 1047 1048 s.progress.Subchains = append(s.progress.Subchains[:1], s.progress.Subchains[2:]...) 1049 merged = true 1050 } 1051 } 1052 // If subchains were merged, all further available headers in the scratch 1053 // space are invalid since we skipped ahead. Stop processing the scratch 1054 // space to avoid dropping peers thinking they delivered invalid data. 
1055 if merged { 1056 break 1057 } 1058 } 1059 s.saveSyncStatus(batch) 1060 if err := batch.Write(); err != nil { 1061 log.Crit("Failed to write skeleton headers and progress", "err", err) 1062 } 1063 // Print a progress report making the UX a bit nicer 1064 left := s.progress.Subchains[0].Tail - 1 1065 if linked { 1066 left = 0 1067 } 1068 if time.Since(s.logged) > 8*time.Second || left == 0 { 1069 s.logged = time.Now() 1070 1071 if s.pulled == 0 { 1072 log.Info("Beacon sync starting", "left", left) 1073 } else { 1074 eta := float64(time.Since(s.started)) / float64(s.pulled) * float64(left) 1075 log.Info("Syncing beacon headers", "downloaded", s.pulled, "left", left, "eta", common.PrettyDuration(eta)) 1076 } 1077 } 1078 return linked, merged 1079 } 1080 1081 // cleanStales removes previously synced beacon headers that have become stale 1082 // due to the downloader backfilling past the tracked tail. 1083 func (s *skeleton) cleanStales(filled *types.Header) error { 1084 number := filled.Number.Uint64() 1085 log.Trace("Cleaning stale beacon headers", "filled", number, "hash", filled.Hash()) 1086 1087 // If the filled header is below the linked subchain, something's 1088 // corrupted internally. Report and error and refuse to do anything. 1089 if number < s.progress.Subchains[0].Tail { 1090 return fmt.Errorf("filled header below beacon header tail: %d < %d", number, s.progress.Subchains[0].Tail) 1091 } 1092 // Subchain seems trimmable, push the tail forward up to the last 1093 // filled header and delete everything before it - if available. In 1094 // case we filled past the head, recreate the subchain with a new 1095 // head to keep it consistent with the data on disk. 
1096 var ( 1097 start = s.progress.Subchains[0].Tail // start deleting from the first known header 1098 end = number // delete until the requested threshold 1099 ) 1100 s.progress.Subchains[0].Tail = number 1101 s.progress.Subchains[0].Next = filled.ParentHash 1102 1103 if s.progress.Subchains[0].Head < number { 1104 // If more headers were filled than available, push the entire 1105 // subchain forward to keep tracking the node's block imports 1106 end = s.progress.Subchains[0].Head + 1 // delete the entire original range, including the head 1107 s.progress.Subchains[0].Head = number // assign a new head (tail is already assigned to this) 1108 } 1109 // Execute the trimming and the potential rewiring of the progress 1110 batch := s.db.NewBatch() 1111 1112 if end != number { 1113 // The entire original skeleton chain was deleted and a new one 1114 // defined. Make sure the new single-header chain gets pushed to 1115 // disk to keep internal state consistent. 1116 rawdb.WriteSkeletonHeader(batch, filled) 1117 } 1118 s.saveSyncStatus(batch) 1119 for n := start; n < end; n++ { 1120 // If the batch grew too big, flush it and continue with a new batch. 1121 // The catch is that the sync metadata needs to reflect the actually 1122 // flushed state, so temporarily change the subchain progress and 1123 // revert after the flush. 
1124 if batch.ValueSize() >= ethdb.IdealBatchSize { 1125 tmpTail := s.progress.Subchains[0].Tail 1126 tmpNext := s.progress.Subchains[0].Next 1127 1128 s.progress.Subchains[0].Tail = n 1129 s.progress.Subchains[0].Next = rawdb.ReadSkeletonHeader(s.db, n).ParentHash 1130 s.saveSyncStatus(batch) 1131 1132 if err := batch.Write(); err != nil { 1133 log.Crit("Failed to write beacon trim data", "err", err) 1134 } 1135 batch.Reset() 1136 1137 s.progress.Subchains[0].Tail = tmpTail 1138 s.progress.Subchains[0].Next = tmpNext 1139 s.saveSyncStatus(batch) 1140 } 1141 rawdb.DeleteSkeletonHeader(batch, n) 1142 } 1143 if err := batch.Write(); err != nil { 1144 log.Crit("Failed to write beacon trim data", "err", err) 1145 } 1146 return nil 1147 } 1148 1149 // Bounds retrieves the current head and tail tracked by the skeleton syncer. 1150 // This method is used by the backfiller, whose life cycle is controlled by the 1151 // skeleton syncer. 1152 // 1153 // Note, the method will not use the internal state of the skeleton, but will 1154 // rather blindly pull stuff from the database. This is fine, because the back- 1155 // filler will only run when the skeleton chain is fully downloaded and stable. 1156 // There might be new heads appended, but those are atomic from the perspective 1157 // of this method. Any head reorg will first tear down the backfiller and only 1158 // then make the modification. 1159 func (s *skeleton) Bounds() (head *types.Header, tail *types.Header, err error) { 1160 // Read the current sync progress from disk and figure out the current head. 1161 // Although there's a lot of error handling here, these are mostly as sanity 1162 // checks to avoid crashing if a programming error happens. These should not 1163 // happen in live code. 
1164 status := rawdb.ReadSkeletonSyncStatus(s.db) 1165 if len(status) == 0 { 1166 return nil, nil, errors.New("beacon sync not yet started") 1167 } 1168 progress := new(skeletonProgress) 1169 if err := json.Unmarshal(status, progress); err != nil { 1170 return nil, nil, err 1171 } 1172 head = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Head) 1173 tail = rawdb.ReadSkeletonHeader(s.db, progress.Subchains[0].Tail) 1174 1175 return head, tail, nil 1176 } 1177 1178 // Header retrieves a specific header tracked by the skeleton syncer. This method 1179 // is meant to be used by the backfiller, whose life cycle is controlled by the 1180 // skeleton syncer. 1181 // 1182 // Note, outside the permitted runtimes, this method might return nil results and 1183 // subsequent calls might return headers from different chains. 1184 func (s *skeleton) Header(number uint64) *types.Header { 1185 return rawdb.ReadSkeletonHeader(s.db, number) 1186 }