github.com/bearnetworkchain/go-bearnetwork@v1.10.19-0.20220604150648-d63890c2e42b/les/downloader/statesync.go

// Copyright 2017 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package downloader

import (
	"fmt"
	"sync"
	"time"

	"github.com/bearnetworkchain/go-bearnetwork/common"
	"github.com/bearnetworkchain/go-bearnetwork/core/state"
	"github.com/bearnetworkchain/go-bearnetwork/crypto"
	"github.com/bearnetworkchain/go-bearnetwork/ethdb"
	"github.com/bearnetworkchain/go-bearnetwork/log"
	"github.com/bearnetworkchain/go-bearnetwork/trie"
	"golang.org/x/crypto/sha3"
)

// stateReq represents a batch of state fetch requests grouped together into
// a single data retrieval network packet.
type stateReq struct {
	nItems    uint16                    // Number of items requested for download (max is 384, so uint16 is sufficient)
	trieTasks map[common.Hash]*trieTask // Trie node download tasks to track previous attempts
	codeTasks map[common.Hash]*codeTask // Byte code download tasks to track previous attempts
	timeout   time.Duration             // Maximum round trip time for this to complete
	timer     *time.Timer               // Timer to fire when the RTT timeout expires
	peer      *peerConnection           // Peer that we're requesting from
	delivered time.Time                 // Time when the packet was delivered (independent of when we process it)
	response  [][]byte                  // Response data of the peer (nil for timeouts)
	dropped   bool                      // Flag whether the peer dropped off early
}

// timedOut reports whether this request timed out.
func (req *stateReq) timedOut() bool {
	return req.response == nil
}

// stateSyncStats is a collection of progress stats to report during a state trie
// sync to RPC requests as well as to display in user logs.
type stateSyncStats struct {
	processed  uint64 // Number of state entries processed
	duplicate  uint64 // Number of state entries downloaded twice
	unexpected uint64 // Number of non-requested state entries received
	pending    uint64 // Number of still pending state entries
}

// syncState starts downloading state with the given root hash.
func (d *Downloader) syncState(root common.Hash) *stateSync {
	// Create the state sync
	s := newStateSync(d, root)
	select {
	case d.stateSyncStart <- s:
		// If we tell the statesync to restart with a new root, we also need
		// to wait for it to actually start -- when old requests have timed
		// out or been delivered
		<-s.started
	case <-d.quitCh:
		s.err = errCancelStateFetch
		close(s.done)
	}
	return s
}
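
// Usage sketch (illustrative only, not part of the original file): a caller
// that owns a Downloader would typically kick off a sync for a target root
// and block on its completion. The variables d and root are hypothetical.
//
//	sync := d.syncState(root)
//	defer sync.Cancel()
//	if err := sync.Wait(); err != nil {
//		log.Error("State sync failed", "err", err)
//	}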

// stateFetcher manages the active state sync and accepts requests
// on its behalf.
func (d *Downloader) stateFetcher() {
	for {
		select {
		case s := <-d.stateSyncStart:
			for next := s; next != nil; {
				next = d.runStateSync(next)
			}
		case <-d.stateCh:
			// Ignore state responses while no sync is running.
		case <-d.quitCh:
			return
		}
	}
}
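
// The inner loop above is a root-handover pattern: runStateSync returns the
// next *stateSync when a switch to a new root is requested mid-flight, or nil
// once the current sync finishes. A minimal standalone sketch of the same
// idiom (hypothetical job type, not part of the original file):
//
//	for job := first; job != nil; {
//		job = run(job) // returns the replacement job, or nil when done
//	}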

// runStateSync runs a state synchronisation until it completes or another root
// hash is requested to be switched over to.
func (d *Downloader) runStateSync(s *stateSync) *stateSync {
	var (
		active   = make(map[string]*stateReq) // Currently in-flight requests
		finished []*stateReq                  // Completed or failed requests
		timeout  = make(chan *stateReq)       // Timed out active requests
	)
	log.Trace("State sync starting", "root", s.root)

	defer func() {
		// Cancel active request timers on exit. Also set peers to idle so they're
		// available for the next sync.
		for _, req := range active {
			req.timer.Stop()
			req.peer.SetNodeDataIdle(int(req.nItems), time.Now())
		}
	}()
	go s.run()
	defer s.Cancel()

	// Listen for peer departure events to cancel assigned tasks
	peerDrop := make(chan *peerConnection, 1024)
	peerSub := s.d.peers.SubscribePeerDrops(peerDrop)
	defer peerSub.Unsubscribe()

	for {
		// Enable sending of the first buffered element if there is one.
		var (
			deliverReq   *stateReq
			deliverReqCh chan *stateReq
		)
		if len(finished) > 0 {
			deliverReq = finished[0]
			deliverReqCh = s.deliver
		}

		select {
		// The stateSync lifecycle:
		case next := <-d.stateSyncStart:
			d.spindownStateSync(active, finished, timeout, peerDrop)
			return next

		case <-s.done:
			d.spindownStateSync(active, finished, timeout, peerDrop)
			return nil

		// Send the next finished request to the current sync:
		case deliverReqCh <- deliverReq:
			// Shift out the first request, but also set the emptied slot to nil for GC
			copy(finished, finished[1:])
			finished[len(finished)-1] = nil
			finished = finished[:len(finished)-1]

		// Handle incoming state packs:
		case pack := <-d.stateCh:
			// Discard any data not requested (or previously timed out)
			req := active[pack.PeerId()]
			if req == nil {
				log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items())
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.response = pack.(*statePack).states
			req.delivered = time.Now()

			finished = append(finished, req)
			delete(active, pack.PeerId())

		// Handle dropped peer connections:
		case p := <-peerDrop:
			// Skip if no request is currently pending
			req := active[p.id]
			if req == nil {
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.dropped = true
			req.delivered = time.Now()

			finished = append(finished, req)
			delete(active, p.id)

		// Handle timed-out requests:
		case req := <-timeout:
			// If the peer is already requesting something else, ignore the stale timeout.
			// This can happen when the timeout and the delivery happen simultaneously,
			// causing both pathways to trigger.
			if active[req.peer.id] != req {
				continue
			}
			req.delivered = time.Now()
			// Move the timed out data back into the download queue
			finished = append(finished, req)
			delete(active, req.peer.id)

		// Track outgoing state requests:
		case req := <-d.trackStateReq:
			// If an active request already exists for this peer, we have a problem. In
			// theory the trie node schedule must never assign two requests to the same
			// peer. In practice however, a peer might receive a request, disconnect and
			// immediately reconnect before the previous one times out. In this case the
			// first request is never honored, yet we must not silently overwrite it, as
			// that causes valid requests to go missing and sync to get stuck.
			if old := active[req.peer.id]; old != nil {
				log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id)
				// Move the previous request to the finished set
				old.timer.Stop()
				old.dropped = true
				old.delivered = time.Now()
				finished = append(finished, old)
			}
			// Start a timer to notify the sync loop if the peer stalled.
			req.timer = time.AfterFunc(req.timeout, func() {
				timeout <- req
			})
			active[req.peer.id] = req
		}
	}
}

// spindownStateSync 'drains' the outstanding requests; some will be delivered
// and others will time out. This ensures that when the next stateSync starts
// working, all peers are marked as idle and de facto _are_ idle.
func (d *Downloader) spindownStateSync(active map[string]*stateReq, finished []*stateReq, timeout chan *stateReq, peerDrop chan *peerConnection) {
	log.Trace("State sync spinning down", "active", len(active), "finished", len(finished))
	for len(active) > 0 {
		var (
			req    *stateReq
			reason string
		)
		select {
		// Handle (drop) incoming state packs:
		case pack := <-d.stateCh:
			req = active[pack.PeerId()]
			reason = "delivered"
		// Handle dropped peer connections:
		case p := <-peerDrop:
			req = active[p.id]
			reason = "peerdrop"
		// Handle timed-out requests:
		case req = <-timeout:
			reason = "timeout"
		}
		if req == nil {
			continue
		}
		req.peer.log.Trace("State peer marked idle (spindown)", "req.items", int(req.nItems), "reason", reason)
		req.timer.Stop()
		delete(active, req.peer.id)
		req.peer.SetNodeDataIdle(int(req.nItems), time.Now())
	}
	// The 'finished' set contains deliveries that we were going to pass to processing.
	// Those are now moot, but we still need to set those peers as idle, which would
	// otherwise have been done after processing
	for _, req := range finished {
		req.peer.SetNodeDataIdle(int(req.nItems), time.Now())
	}
}
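
// The delivery branch above dequeues from 'finished' with a copy-and-shrink
// rather than a plain finished = finished[1:], so the vacated slot in the
// backing array is nil'ed out and the dequeued request becomes eligible for
// garbage collection. A standalone sketch of the idiom (hypothetical queue,
// not part of the original file):
//
//	copy(queue, queue[1:])       // shift everything left by one
//	queue[len(queue)-1] = nil    // drop the stale tail reference for GC
//	queue = queue[:len(queue)-1] // shrink the slice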

// stateSync schedules requests for downloading a particular state trie defined
// by a given state root.
type stateSync struct {
	d *Downloader // Downloader instance to access and manage current peerset

	root   common.Hash        // State root currently being synced
	sched  *trie.Sync         // State trie sync scheduler defining the tasks
	keccak crypto.KeccakState // Keccak256 hasher to verify deliveries with

	trieTasks map[common.Hash]*trieTask // Set of trie node tasks currently queued for retrieval
	codeTasks map[common.Hash]*codeTask // Set of byte code tasks currently queued for retrieval

	numUncommitted   int
	bytesUncommitted int

	started chan struct{} // Started is signalled once the sync loop starts

	deliver    chan *stateReq // Delivery channel multiplexing peer responses
	cancel     chan struct{}  // Channel to signal a termination request
	cancelOnce sync.Once      // Ensures cancel only ever gets called once
	done       chan struct{}  // Channel to signal termination completion
	err        error          // Any error hit during sync (set before completion)
}

// trieTask represents a single trie node download task, containing a set of
// peers already attempted retrieval from to detect stalled syncs and abort.
type trieTask struct {
	path     [][]byte
	attempts map[string]struct{}
}

// codeTask represents a single byte code download task, containing a set of
// peers already attempted retrieval from to detect stalled syncs and abort.
type codeTask struct {
	attempts map[string]struct{}
}

// newStateSync creates a new state trie download scheduler. This method does not
// yet start the sync. The user needs to call run to initiate.
func newStateSync(d *Downloader, root common.Hash) *stateSync {
	return &stateSync{
		d:         d,
		root:      root,
		sched:     state.NewStateSync(root, d.stateDB, nil),
		keccak:    sha3.NewLegacyKeccak256().(crypto.KeccakState),
		trieTasks: make(map[common.Hash]*trieTask),
		codeTasks: make(map[common.Hash]*codeTask),
		deliver:   make(chan *stateReq),
		cancel:    make(chan struct{}),
		done:      make(chan struct{}),
		started:   make(chan struct{}),
	}
}

// run starts the task assignment and response processing loop, blocking until
// it finishes, and finally notifying any goroutines waiting for the loop to
// finish.
func (s *stateSync) run() {
	close(s.started)
	if s.d.snapSync {
		s.err = s.d.SnapSyncer.Sync(s.root, s.cancel)
	} else {
		s.err = s.loop()
	}
	close(s.done)
}

// Wait blocks until the sync is done or canceled.
func (s *stateSync) Wait() error {
	<-s.done
	return s.err
}

// Cancel cancels the sync and waits until it has shut down.
func (s *stateSync) Cancel() error {
	s.cancelOnce.Do(func() {
		close(s.cancel)
	})
	return s.Wait()
}
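
// Cancel above combines sync.Once with close(chan) so a shutdown signal can
// be broadcast to any number of listeners while staying safe to call more
// than once (a double close of a channel panics). A minimal standalone
// sketch of the idiom (hypothetical worker type, not part of the original
// file):
//
//	type worker struct {
//		quit chan struct{}
//		once sync.Once
//	}
//
//	func (w *worker) stop() {
//		w.once.Do(func() { close(w.quit) })
//	}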

// loop is the main event loop of a state trie sync. It is responsible for the
// assignment of new tasks to peers (including sending them out) as well as for
// the processing of inbound data. Note that the loop does not directly receive
// data from peers, rather those are buffered up in the downloader and pushed
// here asynchronously. The reason is to decouple processing from data receipt
// and timeouts.
func (s *stateSync) loop() (err error) {
	// Listen for new peer events to assign tasks to them
	newPeer := make(chan *peerConnection, 1024)
	peerSub := s.d.peers.SubscribeNewPeers(newPeer)
	defer peerSub.Unsubscribe()
	defer func() {
		cerr := s.commit(true)
		if err == nil {
			err = cerr
		}
	}()

	// Keep assigning new tasks until the sync completes or aborts
	for s.sched.Pending() > 0 {
		if err = s.commit(false); err != nil {
			return err
		}
		s.assignTasks()
		// Tasks assigned, wait for something to happen
		select {
		case <-newPeer:
			// New peer arrived, try to assign it download tasks

		case <-s.cancel:
			return errCancelStateFetch

		case <-s.d.cancelCh:
			return errCanceled

		case req := <-s.deliver:
			// Response, disconnect or timeout triggered, drop the peer if stalling
			log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut())
			if req.nItems <= 2 && !req.dropped && req.timedOut() {
				// 2 items are the minimum requested; if even that times out, we've no use
				// for this peer at the moment.
				log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id)
				if s.d.dropPeer == nil {
					// The dropPeer method is nil when `--copydb` is used for a local copy.
					// Timeouts can occur if e.g. compaction hits at the wrong time, and can be ignored
					req.peer.log.Warn("Downloader wants to drop peer, but peerdrop-function is not set", "peer", req.peer.id)
				} else {
					s.d.dropPeer(req.peer.id)

					// If this peer was the master peer, abort sync immediately
					s.d.cancelLock.RLock()
					master := req.peer.id == s.d.cancelPeer
					s.d.cancelLock.RUnlock()

					if master {
						s.d.cancel()
						return errTimeout
					}
				}
			}
			// Process all the received blobs and check for stale delivery
			delivered, err := s.process(req)
			req.peer.SetNodeDataIdle(delivered, req.delivered)
			if err != nil {
				log.Warn("Node data write error", "err", err)
				return err
			}
		}
	}
	return nil
}

// commit flushes any trie data accumulated by the scheduler into the state
// database, either when forced or once enough data has piled up to fill an
// ideally-sized write batch.
func (s *stateSync) commit(force bool) error {
	if !force && s.bytesUncommitted < ethdb.IdealBatchSize {
		return nil
	}
	start := time.Now()
	b := s.d.stateDB.NewBatch()
	if err := s.sched.Commit(b); err != nil {
		return err
	}
	if err := b.Write(); err != nil {
		return fmt.Errorf("DB write error: %v", err)
	}
	s.updateStats(s.numUncommitted, 0, 0, time.Since(start))
	s.numUncommitted = 0
	s.bytesUncommitted = 0
	return nil
}
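
// commit above follows the usual ethdb batching idiom: buffer writes until
// they reach ethdb.IdealBatchSize, then flush them in one disk operation. A
// generic standalone sketch of that idiom (hypothetical db and pending map,
// not part of the original file):
//
//	batch := db.NewBatch()
//	for hash, blob := range pending { // pending: map[common.Hash][]byte
//		if err := batch.Put(hash[:], blob); err != nil {
//			return err
//		}
//		if batch.ValueSize() >= ethdb.IdealBatchSize {
//			if err := batch.Write(); err != nil {
//				return err
//			}
//			batch.Reset()
//		}
//	}
//	return batch.Write()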

// assignTasks attempts to assign new tasks to all idle peers, either from the
// batch currently being retried, or fetching new data from the trie sync itself.
func (s *stateSync) assignTasks() {
	// Iterate over all idle peers and try to assign them state fetches
	peers, _ := s.d.peers.NodeDataIdlePeers()
	for _, p := range peers {
		// Assign a batch of fetches proportional to the estimated latency/bandwidth
		cap := p.NodeDataCapacity(s.d.peers.rates.TargetRoundTrip())
		req := &stateReq{peer: p, timeout: s.d.peers.rates.TargetTimeout()}

		nodes, _, codes := s.fillTasks(cap, req)

		// If the peer was assigned tasks to fetch, send the network request
		if len(nodes)+len(codes) > 0 {
			req.peer.log.Trace("Requesting batch of state data", "nodes", len(nodes), "codes", len(codes), "root", s.root)
			select {
			case s.d.trackStateReq <- req:
				req.peer.FetchNodeData(append(nodes, codes...)) // Unified retrieval under eth/6x
			case <-s.cancel:
			case <-s.d.cancelCh:
			}
		}
	}
}

// fillTasks fills the given request object with a maximum of n state download
// tasks to send to the remote peer.
func (s *stateSync) fillTasks(n int, req *stateReq) (nodes []common.Hash, paths []trie.SyncPath, codes []common.Hash) {
	// Refill available tasks from the scheduler.
	if fill := n - (len(s.trieTasks) + len(s.codeTasks)); fill > 0 {
		nodes, paths, codes := s.sched.Missing(fill)
		for i, hash := range nodes {
			s.trieTasks[hash] = &trieTask{
				path:     paths[i],
				attempts: make(map[string]struct{}),
			}
		}
		for _, hash := range codes {
			s.codeTasks[hash] = &codeTask{
				attempts: make(map[string]struct{}),
			}
		}
	}
	// Find tasks that haven't been tried with the request's peer. Prefer code
	// over trie nodes as those can be written to disk and forgotten about.
	nodes = make([]common.Hash, 0, n)
	paths = make([]trie.SyncPath, 0, n)
	codes = make([]common.Hash, 0, n)

	req.trieTasks = make(map[common.Hash]*trieTask, n)
	req.codeTasks = make(map[common.Hash]*codeTask, n)

	for hash, t := range s.codeTasks {
		// Stop when we've gathered enough requests
		if len(nodes)+len(codes) == n {
			break
		}
		// Skip any requests we've already tried from this peer
		if _, ok := t.attempts[req.peer.id]; ok {
			continue
		}
		// Assign the request to this peer
		t.attempts[req.peer.id] = struct{}{}
		codes = append(codes, hash)
		req.codeTasks[hash] = t
		delete(s.codeTasks, hash)
	}
	for hash, t := range s.trieTasks {
		// Stop when we've gathered enough requests
		if len(nodes)+len(codes) == n {
			break
		}
		// Skip any requests we've already tried from this peer
		if _, ok := t.attempts[req.peer.id]; ok {
			continue
		}
		// Assign the request to this peer
		t.attempts[req.peer.id] = struct{}{}

		nodes = append(nodes, hash)
		paths = append(paths, t.path)

		req.trieTasks[hash] = t
		delete(s.trieTasks, hash)
	}
	req.nItems = uint16(len(nodes) + len(codes))
	return nodes, paths, codes
}
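
// Each task records the peers it has been tried against in a set, so a retry
// is never sent to the same peer twice and a task that has exhausted every
// connected peer can be detected (see process below). A standalone sketch of
// the set idiom (hypothetical peerID/numPeers values, not part of the
// original file):
//
//	attempts := make(map[string]struct{})
//	if _, tried := attempts[peerID]; !tried {
//		attempts[peerID] = struct{}{} // mark and assign to this peer
//	}
//	exhausted := len(attempts) >= numPeers // nobody left to ask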

// process iterates over a batch of delivered state data, injecting each item
// into a running state sync, re-queuing any items that were requested but not
// delivered. Returns the number of items the peer actually managed to deliver,
// and any error that occurred.
func (s *stateSync) process(req *stateReq) (int, error) {
	// Collect processing stats and update progress if valid data was received
	duplicate, unexpected, successful := 0, 0, 0

	defer func(start time.Time) {
		if duplicate > 0 || unexpected > 0 {
			s.updateStats(0, duplicate, unexpected, time.Since(start))
		}
	}(time.Now())

	// Iterate over all the delivered data and inject one-by-one into the trie
	for _, blob := range req.response {
		hash, err := s.processNodeData(blob)
		switch err {
		case nil:
			s.numUncommitted++
			s.bytesUncommitted += len(blob)
			successful++
		case trie.ErrNotRequested:
			unexpected++
		case trie.ErrAlreadyProcessed:
			duplicate++
		default:
			return successful, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err)
		}
		// Delete from both queues (one delivery is enough for the syncer)
		delete(req.trieTasks, hash)
		delete(req.codeTasks, hash)
	}
	// Put unfulfilled tasks back into the retry queue
	npeers := s.d.peers.Len()
	for hash, task := range req.trieTasks {
		// If the node did deliver something, missing items may be due to a protocol
		// limit or a previous timeout + delayed delivery. Both cases should permit
		// the node to retry the missing items (to avoid single-peer stalls).
		if len(req.response) > 0 || req.timedOut() {
			delete(task.attempts, req.peer.id)
		}
		// If we've requested the node too many times already, it may be a malicious
		// sync where nobody has the right data. Abort.
		if len(task.attempts) >= npeers {
			return successful, fmt.Errorf("trie node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers)
		}
		// Missing item, place into the retry queue.
		s.trieTasks[hash] = task
	}
	for hash, task := range req.codeTasks {
		// If the node did deliver something, missing items may be due to a protocol
		// limit or a previous timeout + delayed delivery. Both cases should permit
		// the node to retry the missing items (to avoid single-peer stalls).
		if len(req.response) > 0 || req.timedOut() {
			delete(task.attempts, req.peer.id)
		}
		// If we've requested the node too many times already, it may be a malicious
		// sync where nobody has the right data. Abort.
		if len(task.attempts) >= npeers {
			return successful, fmt.Errorf("byte code %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers)
		}
		// Missing item, place into the retry queue.
		s.codeTasks[hash] = task
	}
	return successful, nil
}

// processNodeData tries to inject a trie node data blob delivered from a remote
// peer into the state trie, returning whether anything useful was written or any
// error occurred.
func (s *stateSync) processNodeData(blob []byte) (common.Hash, error) {
	res := trie.SyncResult{Data: blob}
	s.keccak.Reset()
	s.keccak.Write(blob)
	s.keccak.Read(res.Hash[:])
	err := s.sched.Process(res)
	return res.Hash, err
}
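
// processNodeData verifies deliveries by content hash: the keccak256 of the
// blob is computed locally and used as the result key that trie.Sync.Process
// matches against the requested hashes. crypto.KeccakState exposes the hash
// sponge's Read method, which squeezes out the digest without the extra
// allocation Sum would make. A standalone sketch (not part of the original
// file):
//
//	var h common.Hash
//	hasher := sha3.NewLegacyKeccak256().(crypto.KeccakState)
//	hasher.Write(blob)
//	hasher.Read(h[:]) // cheaper than h = common.BytesToHash(hasher.Sum(nil))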

// updateStats bumps the various state sync progress counters and displays a log
// message for the user to see.
func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) {
	s.d.syncStatsLock.Lock()
	defer s.d.syncStatsLock.Unlock()

	s.d.syncStatsState.pending = uint64(s.sched.Pending())
	s.d.syncStatsState.processed += uint64(written)
	s.d.syncStatsState.duplicate += uint64(duplicate)
	s.d.syncStatsState.unexpected += uint64(unexpected)

	if written > 0 || duplicate > 0 || unexpected > 0 {
		log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "trieretry", len(s.trieTasks), "coderetry", len(s.codeTasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected)
	}
	if written > 0 {
		//rawdb.WriteFastTrieProgress(s.d.stateDB, s.d.syncStatsState.processed)
	}
}