github.com/cryptotooltop/go-ethereum@v0.0.0-20231103184714-151d1922f3e5/eth/downloader/statesync.go (about) 1 // Copyright 2017 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "fmt" 21 "sync" 22 "time" 23 24 "golang.org/x/crypto/sha3" 25 26 "github.com/scroll-tech/go-ethereum/common" 27 "github.com/scroll-tech/go-ethereum/core/rawdb" 28 "github.com/scroll-tech/go-ethereum/core/state" 29 "github.com/scroll-tech/go-ethereum/crypto" 30 "github.com/scroll-tech/go-ethereum/ethdb" 31 "github.com/scroll-tech/go-ethereum/log" 32 "github.com/scroll-tech/go-ethereum/trie" 33 ) 34 35 // stateReq represents a batch of state fetch requests grouped together into 36 // a single data retrieval network packet. 37 type stateReq struct { 38 nItems uint16 // Number of items requested for download (max is 384, so uint16 is sufficient) 39 trieTasks map[common.Hash]*trieTask // Trie node download tasks to track previous attempts 40 codeTasks map[common.Hash]*codeTask // Byte code download tasks to track previous attempts 41 timeout time.Duration // Maximum round trip time for this to complete 42 timer *time.Timer // Timer to fire when the RTT timeout expires 43 peer *peerConnection // Peer that we're requesting from 44 delivered time.Time // Time when the packet was delivered (independent when we process it) 45 response [][]byte // Response data of the peer (nil for timeouts) 46 dropped bool // Flag whether the peer dropped off early 47 } 48 49 // timedOut returns if this request timed out. 50 func (req *stateReq) timedOut() bool { 51 return req.response == nil 52 } 53 54 // stateSyncStats is a collection of progress stats to report during a state trie 55 // sync to RPC requests as well as to display in user logs. 56 type stateSyncStats struct { 57 processed uint64 // Number of state entries processed 58 duplicate uint64 // Number of state entries downloaded twice 59 unexpected uint64 // Number of non-requested state entries received 60 pending uint64 // Number of still pending state entries 61 } 62 63 // syncState starts downloading state with the given root hash. 64 func (d *Downloader) syncState(root common.Hash) *stateSync { 65 // Create the state sync 66 s := newStateSync(d, root) 67 select { 68 case d.stateSyncStart <- s: 69 // If we tell the statesync to restart with a new root, we also need 70 // to wait for it to actually also start -- when old requests have timed 71 // out or been delivered 72 <-s.started 73 case <-d.quitCh: 74 s.err = errCancelStateFetch 75 close(s.done) 76 } 77 return s 78 } 79 80 // stateFetcher manages the active state sync and accepts requests 81 // on its behalf. 82 func (d *Downloader) stateFetcher() { 83 for { 84 select { 85 case s := <-d.stateSyncStart: 86 for next := s; next != nil; { 87 next = d.runStateSync(next) 88 } 89 case <-d.stateCh: 90 // Ignore state responses while no sync is running. 91 case <-d.quitCh: 92 return 93 } 94 } 95 } 96 97 // runStateSync runs a state synchronisation until it completes or another root 98 // hash is requested to be switched over to. 99 func (d *Downloader) runStateSync(s *stateSync) *stateSync { 100 var ( 101 active = make(map[string]*stateReq) // Currently in-flight requests 102 finished []*stateReq // Completed or failed requests 103 timeout = make(chan *stateReq) // Timed out active requests 104 ) 105 log.Trace("State sync starting", "root", s.root) 106 107 defer func() { 108 // Cancel active request timers on exit. Also set peers to idle so they're 109 // available for the next sync. 110 for _, req := range active { 111 req.timer.Stop() 112 req.peer.SetNodeDataIdle(int(req.nItems), time.Now()) 113 } 114 }() 115 go s.run() 116 defer s.Cancel() 117 118 // Listen for peer departure events to cancel assigned tasks 119 peerDrop := make(chan *peerConnection, 1024) 120 peerSub := s.d.peers.SubscribePeerDrops(peerDrop) 121 defer peerSub.Unsubscribe() 122 123 for { 124 // Enable sending of the first buffered element if there is one. 125 var ( 126 deliverReq *stateReq 127 deliverReqCh chan *stateReq 128 ) 129 if len(finished) > 0 { 130 deliverReq = finished[0] 131 deliverReqCh = s.deliver 132 } 133 134 select { 135 // The stateSync lifecycle: 136 case next := <-d.stateSyncStart: 137 d.spindownStateSync(active, finished, timeout, peerDrop) 138 return next 139 140 case <-s.done: 141 d.spindownStateSync(active, finished, timeout, peerDrop) 142 return nil 143 144 // Send the next finished request to the current sync: 145 case deliverReqCh <- deliverReq: 146 // Shift out the first request, but also set the emptied slot to nil for GC 147 copy(finished, finished[1:]) 148 finished[len(finished)-1] = nil 149 finished = finished[:len(finished)-1] 150 151 // Handle incoming state packs: 152 case pack := <-d.stateCh: 153 // Discard any data not requested (or previously timed out) 154 req := active[pack.PeerId()] 155 if req == nil { 156 log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items()) 157 continue 158 } 159 // Finalize the request and queue up for processing 160 req.timer.Stop() 161 req.response = pack.(*statePack).states 162 req.delivered = time.Now() 163 164 finished = append(finished, req) 165 delete(active, pack.PeerId()) 166 167 // Handle dropped peer connections: 168 case p := <-peerDrop: 169 // Skip if no request is currently pending 170 req := active[p.id] 171 if req == nil { 172 continue 173 } 174 // Finalize the request and queue up for processing 175 req.timer.Stop() 176 req.dropped = true 177 req.delivered = time.Now() 178 179 finished = append(finished, req) 180 delete(active, p.id) 181 182 // Handle timed-out requests: 183 case req := <-timeout: 184 // If the peer is already requesting something else, ignore the stale timeout. 185 // This can happen when the timeout and the delivery happens simultaneously, 186 // causing both pathways to trigger. 187 if active[req.peer.id] != req { 188 continue 189 } 190 req.delivered = time.Now() 191 // Move the timed out data back into the download queue 192 finished = append(finished, req) 193 delete(active, req.peer.id) 194 195 // Track outgoing state requests: 196 case req := <-d.trackStateReq: 197 // If an active request already exists for this peer, we have a problem. In 198 // theory the trie node schedule must never assign two requests to the same 199 // peer. In practice however, a peer might receive a request, disconnect and 200 // immediately reconnect before the previous times out. In this case the first 201 // request is never honored, alas we must not silently overwrite it, as that 202 // causes valid requests to go missing and sync to get stuck. 203 if old := active[req.peer.id]; old != nil { 204 log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id) 205 // Move the previous request to the finished set 206 old.timer.Stop() 207 old.dropped = true 208 old.delivered = time.Now() 209 finished = append(finished, old) 210 } 211 // Start a timer to notify the sync loop if the peer stalled. 212 req.timer = time.AfterFunc(req.timeout, func() { 213 timeout <- req 214 }) 215 active[req.peer.id] = req 216 } 217 } 218 } 219 220 // spindownStateSync 'drains' the outstanding requests; some will be delivered and other 221 // will time out. This is to ensure that when the next stateSync starts working, all peers 222 // are marked as idle and de facto _are_ idle. 223 func (d *Downloader) spindownStateSync(active map[string]*stateReq, finished []*stateReq, timeout chan *stateReq, peerDrop chan *peerConnection) { 224 log.Trace("State sync spinning down", "active", len(active), "finished", len(finished)) 225 for len(active) > 0 { 226 var ( 227 req *stateReq 228 reason string 229 ) 230 select { 231 // Handle (drop) incoming state packs: 232 case pack := <-d.stateCh: 233 req = active[pack.PeerId()] 234 reason = "delivered" 235 // Handle dropped peer connections: 236 case p := <-peerDrop: 237 req = active[p.id] 238 reason = "peerdrop" 239 // Handle timed-out requests: 240 case req = <-timeout: 241 reason = "timeout" 242 } 243 if req == nil { 244 continue 245 } 246 req.peer.log.Trace("State peer marked idle (spindown)", "req.items", int(req.nItems), "reason", reason) 247 req.timer.Stop() 248 delete(active, req.peer.id) 249 req.peer.SetNodeDataIdle(int(req.nItems), time.Now()) 250 } 251 // The 'finished' set contains deliveries that we were going to pass to processing. 252 // Those are now moot, but we still need to set those peers as idle, which would 253 // otherwise have been done after processing 254 for _, req := range finished { 255 req.peer.SetNodeDataIdle(int(req.nItems), time.Now()) 256 } 257 } 258 259 // stateSync schedules requests for downloading a particular state trie defined 260 // by a given state root. 261 type stateSync struct { 262 d *Downloader // Downloader instance to access and manage current peerset 263 264 root common.Hash // State root currently being synced 265 sched *trie.Sync // State trie sync scheduler defining the tasks 266 keccak crypto.KeccakState // Keccak256 hasher to verify deliveries with 267 268 trieTasks map[common.Hash]*trieTask // Set of trie node tasks currently queued for retrieval 269 codeTasks map[common.Hash]*codeTask // Set of byte code tasks currently queued for retrieval 270 271 numUncommitted int 272 bytesUncommitted int 273 274 started chan struct{} // Started is signalled once the sync loop starts 275 276 deliver chan *stateReq // Delivery channel multiplexing peer responses 277 cancel chan struct{} // Channel to signal a termination request 278 cancelOnce sync.Once // Ensures cancel only ever gets called once 279 done chan struct{} // Channel to signal termination completion 280 err error // Any error hit during sync (set before completion) 281 } 282 283 // trieTask represents a single trie node download task, containing a set of 284 // peers already attempted retrieval from to detect stalled syncs and abort. 285 type trieTask struct { 286 path [][]byte 287 attempts map[string]struct{} 288 } 289 290 // codeTask represents a single byte code download task, containing a set of 291 // peers already attempted retrieval from to detect stalled syncs and abort. 292 type codeTask struct { 293 attempts map[string]struct{} 294 } 295 296 // newStateSync creates a new state trie download scheduler. This method does not 297 // yet start the sync. The user needs to call run to initiate. 298 func newStateSync(d *Downloader, root common.Hash) *stateSync { 299 return &stateSync{ 300 d: d, 301 root: root, 302 sched: state.NewStateSync(root, d.stateDB, d.stateBloom, nil), 303 keccak: sha3.NewLegacyKeccak256().(crypto.KeccakState), 304 trieTasks: make(map[common.Hash]*trieTask), 305 codeTasks: make(map[common.Hash]*codeTask), 306 deliver: make(chan *stateReq), 307 cancel: make(chan struct{}), 308 done: make(chan struct{}), 309 started: make(chan struct{}), 310 } 311 } 312 313 // run starts the task assignment and response processing loop, blocking until 314 // it finishes, and finally notifying any goroutines waiting for the loop to 315 // finish. 316 func (s *stateSync) run() { 317 close(s.started) 318 if s.d.snapSync { 319 s.err = s.d.SnapSyncer.Sync(s.root, s.cancel) 320 } else { 321 s.err = s.loop() 322 } 323 close(s.done) 324 } 325 326 // Wait blocks until the sync is done or canceled. 327 func (s *stateSync) Wait() error { 328 <-s.done 329 return s.err 330 } 331 332 // Cancel cancels the sync and waits until it has shut down. 333 func (s *stateSync) Cancel() error { 334 s.cancelOnce.Do(func() { 335 close(s.cancel) 336 }) 337 return s.Wait() 338 } 339 340 // loop is the main event loop of a state trie sync. It it responsible for the 341 // assignment of new tasks to peers (including sending it to them) as well as 342 // for the processing of inbound data. Note, that the loop does not directly 343 // receive data from peers, rather those are buffered up in the downloader and 344 // pushed here async. The reason is to decouple processing from data receipt 345 // and timeouts. 346 func (s *stateSync) loop() (err error) { 347 // Listen for new peer events to assign tasks to them 348 newPeer := make(chan *peerConnection, 1024) 349 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 350 defer peerSub.Unsubscribe() 351 defer func() { 352 cerr := s.commit(true) 353 if err == nil { 354 err = cerr 355 } 356 }() 357 358 // Keep assigning new tasks until the sync completes or aborts 359 for s.sched.Pending() > 0 { 360 if err = s.commit(false); err != nil { 361 return err 362 } 363 s.assignTasks() 364 // Tasks assigned, wait for something to happen 365 select { 366 case <-newPeer: 367 // New peer arrived, try to assign it download tasks 368 369 case <-s.cancel: 370 return errCancelStateFetch 371 372 case <-s.d.cancelCh: 373 return errCanceled 374 375 case req := <-s.deliver: 376 // Response, disconnect or timeout triggered, drop the peer if stalling 377 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut()) 378 if req.nItems <= 2 && !req.dropped && req.timedOut() { 379 // 2 items are the minimum requested, if even that times out, we've no use of 380 // this peer at the moment. 381 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 382 if s.d.dropPeer == nil { 383 // The dropPeer method is nil when `--copydb` is used for a local copy. 384 // Timeouts can occur if e.g. compaction hits at the wrong time, and can be ignored 385 req.peer.log.Warn("Downloader wants to drop peer, but peerdrop-function is not set", "peer", req.peer.id) 386 } else { 387 s.d.dropPeer(req.peer.id) 388 389 // If this peer was the master peer, abort sync immediately 390 s.d.cancelLock.RLock() 391 master := req.peer.id == s.d.cancelPeer 392 s.d.cancelLock.RUnlock() 393 394 if master { 395 s.d.cancel() 396 return errTimeout 397 } 398 } 399 } 400 // Process all the received blobs and check for stale delivery 401 delivered, err := s.process(req) 402 req.peer.SetNodeDataIdle(delivered, req.delivered) 403 if err != nil { 404 log.Warn("Node data write error", "err", err) 405 return err 406 } 407 } 408 } 409 return nil 410 } 411 412 func (s *stateSync) commit(force bool) error { 413 if !force && s.bytesUncommitted < ethdb.IdealBatchSize { 414 return nil 415 } 416 start := time.Now() 417 b := s.d.stateDB.NewBatch() 418 if err := s.sched.Commit(b); err != nil { 419 return err 420 } 421 if err := b.Write(); err != nil { 422 return fmt.Errorf("DB write error: %v", err) 423 } 424 s.updateStats(s.numUncommitted, 0, 0, time.Since(start)) 425 s.numUncommitted = 0 426 s.bytesUncommitted = 0 427 return nil 428 } 429 430 // assignTasks attempts to assign new tasks to all idle peers, either from the 431 // batch currently being retried, or fetching new data from the trie sync itself. 432 func (s *stateSync) assignTasks() { 433 // Iterate over all idle peers and try to assign them state fetches 434 peers, _ := s.d.peers.NodeDataIdlePeers() 435 for _, p := range peers { 436 // Assign a batch of fetches proportional to the estimated latency/bandwidth 437 cap := p.NodeDataCapacity(s.d.peers.rates.TargetRoundTrip()) 438 req := &stateReq{peer: p, timeout: s.d.peers.rates.TargetTimeout()} 439 440 nodes, _, codes := s.fillTasks(cap, req) 441 442 // If the peer was assigned tasks to fetch, send the network request 443 if len(nodes)+len(codes) > 0 { 444 req.peer.log.Trace("Requesting batch of state data", "nodes", len(nodes), "codes", len(codes), "root", s.root) 445 select { 446 case s.d.trackStateReq <- req: 447 req.peer.FetchNodeData(append(nodes, codes...)) // Unified retrieval under eth/6x 448 case <-s.cancel: 449 case <-s.d.cancelCh: 450 } 451 } 452 } 453 } 454 455 // fillTasks fills the given request object with a maximum of n state download 456 // tasks to send to the remote peer. 457 func (s *stateSync) fillTasks(n int, req *stateReq) (nodes []common.Hash, paths []trie.SyncPath, codes []common.Hash) { 458 // Refill available tasks from the scheduler. 459 if fill := n - (len(s.trieTasks) + len(s.codeTasks)); fill > 0 { 460 nodes, paths, codes := s.sched.Missing(fill) 461 for i, hash := range nodes { 462 s.trieTasks[hash] = &trieTask{ 463 path: paths[i], 464 attempts: make(map[string]struct{}), 465 } 466 } 467 for _, hash := range codes { 468 s.codeTasks[hash] = &codeTask{ 469 attempts: make(map[string]struct{}), 470 } 471 } 472 } 473 // Find tasks that haven't been tried with the request's peer. Prefer code 474 // over trie nodes as those can be written to disk and forgotten about. 475 nodes = make([]common.Hash, 0, n) 476 paths = make([]trie.SyncPath, 0, n) 477 codes = make([]common.Hash, 0, n) 478 479 req.trieTasks = make(map[common.Hash]*trieTask, n) 480 req.codeTasks = make(map[common.Hash]*codeTask, n) 481 482 for hash, t := range s.codeTasks { 483 // Stop when we've gathered enough requests 484 if len(nodes)+len(codes) == n { 485 break 486 } 487 // Skip any requests we've already tried from this peer 488 if _, ok := t.attempts[req.peer.id]; ok { 489 continue 490 } 491 // Assign the request to this peer 492 t.attempts[req.peer.id] = struct{}{} 493 codes = append(codes, hash) 494 req.codeTasks[hash] = t 495 delete(s.codeTasks, hash) 496 } 497 for hash, t := range s.trieTasks { 498 // Stop when we've gathered enough requests 499 if len(nodes)+len(codes) == n { 500 break 501 } 502 // Skip any requests we've already tried from this peer 503 if _, ok := t.attempts[req.peer.id]; ok { 504 continue 505 } 506 // Assign the request to this peer 507 t.attempts[req.peer.id] = struct{}{} 508 509 nodes = append(nodes, hash) 510 paths = append(paths, t.path) 511 512 req.trieTasks[hash] = t 513 delete(s.trieTasks, hash) 514 } 515 req.nItems = uint16(len(nodes) + len(codes)) 516 return nodes, paths, codes 517 } 518 519 // process iterates over a batch of delivered state data, injecting each item 520 // into a running state sync, re-queuing any items that were requested but not 521 // delivered. Returns whether the peer actually managed to deliver anything of 522 // value, and any error that occurred. 523 func (s *stateSync) process(req *stateReq) (int, error) { 524 // Collect processing stats and update progress if valid data was received 525 duplicate, unexpected, successful := 0, 0, 0 526 527 defer func(start time.Time) { 528 if duplicate > 0 || unexpected > 0 { 529 s.updateStats(0, duplicate, unexpected, time.Since(start)) 530 } 531 }(time.Now()) 532 533 // Iterate over all the delivered data and inject one-by-one into the trie 534 for _, blob := range req.response { 535 hash, err := s.processNodeData(blob) 536 switch err { 537 case nil: 538 s.numUncommitted++ 539 s.bytesUncommitted += len(blob) 540 successful++ 541 case trie.ErrNotRequested: 542 unexpected++ 543 case trie.ErrAlreadyProcessed: 544 duplicate++ 545 default: 546 return successful, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 547 } 548 // Delete from both queues (one delivery is enough for the syncer) 549 delete(req.trieTasks, hash) 550 delete(req.codeTasks, hash) 551 } 552 // Put unfulfilled tasks back into the retry queue 553 npeers := s.d.peers.Len() 554 for hash, task := range req.trieTasks { 555 // If the node did deliver something, missing items may be due to a protocol 556 // limit or a previous timeout + delayed delivery. Both cases should permit 557 // the node to retry the missing items (to avoid single-peer stalls). 558 if len(req.response) > 0 || req.timedOut() { 559 delete(task.attempts, req.peer.id) 560 } 561 // If we've requested the node too many times already, it may be a malicious 562 // sync where nobody has the right data. Abort. 563 if len(task.attempts) >= npeers { 564 return successful, fmt.Errorf("trie node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 565 } 566 // Missing item, place into the retry queue. 567 s.trieTasks[hash] = task 568 } 569 for hash, task := range req.codeTasks { 570 // If the node did deliver something, missing items may be due to a protocol 571 // limit or a previous timeout + delayed delivery. Both cases should permit 572 // the node to retry the missing items (to avoid single-peer stalls). 573 if len(req.response) > 0 || req.timedOut() { 574 delete(task.attempts, req.peer.id) 575 } 576 // If we've requested the node too many times already, it may be a malicious 577 // sync where nobody has the right data. Abort. 578 if len(task.attempts) >= npeers { 579 return successful, fmt.Errorf("byte code %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 580 } 581 // Missing item, place into the retry queue. 582 s.codeTasks[hash] = task 583 } 584 return successful, nil 585 } 586 587 // processNodeData tries to inject a trie node data blob delivered from a remote 588 // peer into the state trie, returning whether anything useful was written or any 589 // error occurred. 590 func (s *stateSync) processNodeData(blob []byte) (common.Hash, error) { 591 res := trie.SyncResult{Data: blob} 592 s.keccak.Reset() 593 s.keccak.Write(blob) 594 s.keccak.Read(res.Hash[:]) 595 err := s.sched.Process(res) 596 return res.Hash, err 597 } 598 599 // updateStats bumps the various state sync progress counters and displays a log 600 // message for the user to see. 601 func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) { 602 s.d.syncStatsLock.Lock() 603 defer s.d.syncStatsLock.Unlock() 604 605 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 606 s.d.syncStatsState.processed += uint64(written) 607 s.d.syncStatsState.duplicate += uint64(duplicate) 608 s.d.syncStatsState.unexpected += uint64(unexpected) 609 610 if written > 0 || duplicate > 0 || unexpected > 0 { 611 log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "trieretry", len(s.trieTasks), "coderetry", len(s.codeTasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 612 } 613 if written > 0 { 614 rawdb.WriteFastTrieProgress(s.d.stateDB, s.d.syncStatsState.processed) 615 } 616 }