github.com/trizin/go-ethereum@v1.9.7/eth/downloader/statesync.go (about) 1 // Copyright 2017 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "fmt" 21 "hash" 22 "sync" 23 "time" 24 25 "github.com/ethereum/go-ethereum/common" 26 "github.com/ethereum/go-ethereum/core/rawdb" 27 "github.com/ethereum/go-ethereum/core/state" 28 "github.com/ethereum/go-ethereum/ethdb" 29 "github.com/ethereum/go-ethereum/log" 30 "github.com/ethereum/go-ethereum/trie" 31 "golang.org/x/crypto/sha3" 32 ) 33 34 // stateReq represents a batch of state fetch requests grouped together into 35 // a single data retrieval network packet. 36 type stateReq struct { 37 items []common.Hash // Hashes of the state items to download 38 tasks map[common.Hash]*stateTask // Download tasks to track previous attempts 39 timeout time.Duration // Maximum round trip time for this to complete 40 timer *time.Timer // Timer to fire when the RTT timeout expires 41 peer *peerConnection // Peer that we're requesting from 42 response [][]byte // Response data of the peer (nil for timeouts) 43 dropped bool // Flag whether the peer dropped off early 44 } 45 46 // timedOut returns if this request timed out. 47 func (req *stateReq) timedOut() bool { 48 return req.response == nil 49 } 50 51 // stateSyncStats is a collection of progress stats to report during a state trie 52 // sync to RPC requests as well as to display in user logs. 53 type stateSyncStats struct { 54 processed uint64 // Number of state entries processed 55 duplicate uint64 // Number of state entries downloaded twice 56 unexpected uint64 // Number of non-requested state entries received 57 pending uint64 // Number of still pending state entries 58 } 59 60 // syncState starts downloading state with the given root hash. 61 func (d *Downloader) syncState(root common.Hash) *stateSync { 62 // Create the state sync 63 s := newStateSync(d, root) 64 select { 65 case d.stateSyncStart <- s: 66 case <-d.quitCh: 67 s.err = errCancelStateFetch 68 close(s.done) 69 } 70 return s 71 } 72 73 // stateFetcher manages the active state sync and accepts requests 74 // on its behalf. 75 func (d *Downloader) stateFetcher() { 76 for { 77 select { 78 case s := <-d.stateSyncStart: 79 for next := s; next != nil; { 80 next = d.runStateSync(next) 81 } 82 case <-d.stateCh: 83 // Ignore state responses while no sync is running. 84 case <-d.quitCh: 85 return 86 } 87 } 88 } 89 90 // runStateSync runs a state synchronisation until it completes or another root 91 // hash is requested to be switched over to. 92 func (d *Downloader) runStateSync(s *stateSync) *stateSync { 93 var ( 94 active = make(map[string]*stateReq) // Currently in-flight requests 95 finished []*stateReq // Completed or failed requests 96 timeout = make(chan *stateReq) // Timed out active requests 97 ) 98 defer func() { 99 // Cancel active request timers on exit. Also set peers to idle so they're 100 // available for the next sync. 101 for _, req := range active { 102 req.timer.Stop() 103 req.peer.SetNodeDataIdle(len(req.items)) 104 } 105 }() 106 // Run the state sync. 107 go s.run() 108 defer s.Cancel() 109 110 // Listen for peer departure events to cancel assigned tasks 111 peerDrop := make(chan *peerConnection, 1024) 112 peerSub := s.d.peers.SubscribePeerDrops(peerDrop) 113 defer peerSub.Unsubscribe() 114 115 for { 116 // Enable sending of the first buffered element if there is one. 117 var ( 118 deliverReq *stateReq 119 deliverReqCh chan *stateReq 120 ) 121 if len(finished) > 0 { 122 deliverReq = finished[0] 123 deliverReqCh = s.deliver 124 } 125 126 select { 127 // The stateSync lifecycle: 128 case next := <-d.stateSyncStart: 129 return next 130 131 case <-s.done: 132 return nil 133 134 // Send the next finished request to the current sync: 135 case deliverReqCh <- deliverReq: 136 // Shift out the first request, but also set the emptied slot to nil for GC 137 copy(finished, finished[1:]) 138 finished[len(finished)-1] = nil 139 finished = finished[:len(finished)-1] 140 141 // Handle incoming state packs: 142 case pack := <-d.stateCh: 143 // Discard any data not requested (or previously timed out) 144 req := active[pack.PeerId()] 145 if req == nil { 146 log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items()) 147 continue 148 } 149 // Finalize the request and queue up for processing 150 req.timer.Stop() 151 req.response = pack.(*statePack).states 152 153 finished = append(finished, req) 154 delete(active, pack.PeerId()) 155 156 // Handle dropped peer connections: 157 case p := <-peerDrop: 158 // Skip if no request is currently pending 159 req := active[p.id] 160 if req == nil { 161 continue 162 } 163 // Finalize the request and queue up for processing 164 req.timer.Stop() 165 req.dropped = true 166 167 finished = append(finished, req) 168 delete(active, p.id) 169 170 // Handle timed-out requests: 171 case req := <-timeout: 172 // If the peer is already requesting something else, ignore the stale timeout. 173 // This can happen when the timeout and the delivery happens simultaneously, 174 // causing both pathways to trigger. 175 if active[req.peer.id] != req { 176 continue 177 } 178 // Move the timed out data back into the download queue 179 finished = append(finished, req) 180 delete(active, req.peer.id) 181 182 // Track outgoing state requests: 183 case req := <-d.trackStateReq: 184 // If an active request already exists for this peer, we have a problem. In 185 // theory the trie node schedule must never assign two requests to the same 186 // peer. In practice however, a peer might receive a request, disconnect and 187 // immediately reconnect before the previous times out. In this case the first 188 // request is never honored, alas we must not silently overwrite it, as that 189 // causes valid requests to go missing and sync to get stuck. 190 if old := active[req.peer.id]; old != nil { 191 log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id) 192 193 // Make sure the previous one doesn't get siletly lost 194 old.timer.Stop() 195 old.dropped = true 196 197 finished = append(finished, old) 198 } 199 // Start a timer to notify the sync loop if the peer stalled. 200 req.timer = time.AfterFunc(req.timeout, func() { 201 select { 202 case timeout <- req: 203 case <-s.done: 204 // Prevent leaking of timer goroutines in the unlikely case where a 205 // timer is fired just before exiting runStateSync. 206 } 207 }) 208 active[req.peer.id] = req 209 } 210 } 211 } 212 213 // stateSync schedules requests for downloading a particular state trie defined 214 // by a given state root. 215 type stateSync struct { 216 d *Downloader // Downloader instance to access and manage current peerset 217 218 sched *trie.Sync // State trie sync scheduler defining the tasks 219 keccak hash.Hash // Keccak256 hasher to verify deliveries with 220 tasks map[common.Hash]*stateTask // Set of tasks currently queued for retrieval 221 222 numUncommitted int 223 bytesUncommitted int 224 225 deliver chan *stateReq // Delivery channel multiplexing peer responses 226 cancel chan struct{} // Channel to signal a termination request 227 cancelOnce sync.Once // Ensures cancel only ever gets called once 228 done chan struct{} // Channel to signal termination completion 229 err error // Any error hit during sync (set before completion) 230 } 231 232 // stateTask represents a single trie node download task, containing a set of 233 // peers already attempted retrieval from to detect stalled syncs and abort. 234 type stateTask struct { 235 attempts map[string]struct{} 236 } 237 238 // newStateSync creates a new state trie download scheduler. This method does not 239 // yet start the sync. The user needs to call run to initiate. 240 func newStateSync(d *Downloader, root common.Hash) *stateSync { 241 return &stateSync{ 242 d: d, 243 sched: state.NewStateSync(root, d.stateDB, d.stateBloom), 244 keccak: sha3.NewLegacyKeccak256(), 245 tasks: make(map[common.Hash]*stateTask), 246 deliver: make(chan *stateReq), 247 cancel: make(chan struct{}), 248 done: make(chan struct{}), 249 } 250 } 251 252 // run starts the task assignment and response processing loop, blocking until 253 // it finishes, and finally notifying any goroutines waiting for the loop to 254 // finish. 255 func (s *stateSync) run() { 256 s.err = s.loop() 257 close(s.done) 258 } 259 260 // Wait blocks until the sync is done or canceled. 261 func (s *stateSync) Wait() error { 262 <-s.done 263 return s.err 264 } 265 266 // Cancel cancels the sync and waits until it has shut down. 267 func (s *stateSync) Cancel() error { 268 s.cancelOnce.Do(func() { close(s.cancel) }) 269 return s.Wait() 270 } 271 272 // loop is the main event loop of a state trie sync. It it responsible for the 273 // assignment of new tasks to peers (including sending it to them) as well as 274 // for the processing of inbound data. Note, that the loop does not directly 275 // receive data from peers, rather those are buffered up in the downloader and 276 // pushed here async. The reason is to decouple processing from data receipt 277 // and timeouts. 278 func (s *stateSync) loop() (err error) { 279 // Listen for new peer events to assign tasks to them 280 newPeer := make(chan *peerConnection, 1024) 281 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 282 defer peerSub.Unsubscribe() 283 defer func() { 284 cerr := s.commit(true) 285 if err == nil { 286 err = cerr 287 } 288 }() 289 290 // Keep assigning new tasks until the sync completes or aborts 291 for s.sched.Pending() > 0 { 292 if err = s.commit(false); err != nil { 293 return err 294 } 295 s.assignTasks() 296 // Tasks assigned, wait for something to happen 297 select { 298 case <-newPeer: 299 // New peer arrived, try to assign it download tasks 300 301 case <-s.cancel: 302 return errCancelStateFetch 303 304 case <-s.d.cancelCh: 305 return errCanceled 306 307 case req := <-s.deliver: 308 // Response, disconnect or timeout triggered, drop the peer if stalling 309 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut()) 310 if len(req.items) <= 2 && !req.dropped && req.timedOut() { 311 // 2 items are the minimum requested, if even that times out, we've no use of 312 // this peer at the moment. 313 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 314 if s.d.dropPeer == nil { 315 // The dropPeer method is nil when `--copydb` is used for a local copy. 316 // Timeouts can occur if e.g. compaction hits at the wrong time, and can be ignored 317 req.peer.log.Warn("Downloader wants to drop peer, but peerdrop-function is not set", "peer", req.peer.id) 318 } else { 319 s.d.dropPeer(req.peer.id) 320 321 // If this peer was the master peer, abort sync immediately 322 s.d.cancelLock.RLock() 323 master := req.peer.id == s.d.cancelPeer 324 s.d.cancelLock.RUnlock() 325 326 if master { 327 s.d.cancel() 328 return errTimeout 329 } 330 } 331 } 332 // Process all the received blobs and check for stale delivery 333 delivered, err := s.process(req) 334 if err != nil { 335 log.Warn("Node data write error", "err", err) 336 return err 337 } 338 req.peer.SetNodeDataIdle(delivered) 339 } 340 } 341 return nil 342 } 343 344 func (s *stateSync) commit(force bool) error { 345 if !force && s.bytesUncommitted < ethdb.IdealBatchSize { 346 return nil 347 } 348 start := time.Now() 349 b := s.d.stateDB.NewBatch() 350 if err := s.sched.Commit(b); err != nil { 351 return err 352 } 353 if err := b.Write(); err != nil { 354 return fmt.Errorf("DB write error: %v", err) 355 } 356 s.updateStats(s.numUncommitted, 0, 0, time.Since(start)) 357 s.numUncommitted = 0 358 s.bytesUncommitted = 0 359 return nil 360 } 361 362 // assignTasks attempts to assign new tasks to all idle peers, either from the 363 // batch currently being retried, or fetching new data from the trie sync itself. 364 func (s *stateSync) assignTasks() { 365 // Iterate over all idle peers and try to assign them state fetches 366 peers, _ := s.d.peers.NodeDataIdlePeers() 367 for _, p := range peers { 368 // Assign a batch of fetches proportional to the estimated latency/bandwidth 369 cap := p.NodeDataCapacity(s.d.requestRTT()) 370 req := &stateReq{peer: p, timeout: s.d.requestTTL()} 371 s.fillTasks(cap, req) 372 373 // If the peer was assigned tasks to fetch, send the network request 374 if len(req.items) > 0 { 375 req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items)) 376 select { 377 case s.d.trackStateReq <- req: 378 req.peer.FetchNodeData(req.items) 379 case <-s.cancel: 380 case <-s.d.cancelCh: 381 } 382 } 383 } 384 } 385 386 // fillTasks fills the given request object with a maximum of n state download 387 // tasks to send to the remote peer. 388 func (s *stateSync) fillTasks(n int, req *stateReq) { 389 // Refill available tasks from the scheduler. 390 if len(s.tasks) < n { 391 new := s.sched.Missing(n - len(s.tasks)) 392 for _, hash := range new { 393 s.tasks[hash] = &stateTask{make(map[string]struct{})} 394 } 395 } 396 // Find tasks that haven't been tried with the request's peer. 397 req.items = make([]common.Hash, 0, n) 398 req.tasks = make(map[common.Hash]*stateTask, n) 399 for hash, t := range s.tasks { 400 // Stop when we've gathered enough requests 401 if len(req.items) == n { 402 break 403 } 404 // Skip any requests we've already tried from this peer 405 if _, ok := t.attempts[req.peer.id]; ok { 406 continue 407 } 408 // Assign the request to this peer 409 t.attempts[req.peer.id] = struct{}{} 410 req.items = append(req.items, hash) 411 req.tasks[hash] = t 412 delete(s.tasks, hash) 413 } 414 } 415 416 // process iterates over a batch of delivered state data, injecting each item 417 // into a running state sync, re-queuing any items that were requested but not 418 // delivered. Returns whether the peer actually managed to deliver anything of 419 // value, and any error that occurred. 420 func (s *stateSync) process(req *stateReq) (int, error) { 421 // Collect processing stats and update progress if valid data was received 422 duplicate, unexpected, successful := 0, 0, 0 423 424 defer func(start time.Time) { 425 if duplicate > 0 || unexpected > 0 { 426 s.updateStats(0, duplicate, unexpected, time.Since(start)) 427 } 428 }(time.Now()) 429 430 // Iterate over all the delivered data and inject one-by-one into the trie 431 for _, blob := range req.response { 432 _, hash, err := s.processNodeData(blob) 433 switch err { 434 case nil: 435 s.numUncommitted++ 436 s.bytesUncommitted += len(blob) 437 successful++ 438 case trie.ErrNotRequested: 439 unexpected++ 440 case trie.ErrAlreadyProcessed: 441 duplicate++ 442 default: 443 return successful, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 444 } 445 delete(req.tasks, hash) 446 } 447 // Put unfulfilled tasks back into the retry queue 448 npeers := s.d.peers.Len() 449 for hash, task := range req.tasks { 450 // If the node did deliver something, missing items may be due to a protocol 451 // limit or a previous timeout + delayed delivery. Both cases should permit 452 // the node to retry the missing items (to avoid single-peer stalls). 453 if len(req.response) > 0 || req.timedOut() { 454 delete(task.attempts, req.peer.id) 455 } 456 // If we've requested the node too many times already, it may be a malicious 457 // sync where nobody has the right data. Abort. 458 if len(task.attempts) >= npeers { 459 return successful, fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 460 } 461 // Missing item, place into the retry queue. 462 s.tasks[hash] = task 463 } 464 return successful, nil 465 } 466 467 // processNodeData tries to inject a trie node data blob delivered from a remote 468 // peer into the state trie, returning whether anything useful was written or any 469 // error occurred. 470 func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) { 471 res := trie.SyncResult{Data: blob} 472 s.keccak.Reset() 473 s.keccak.Write(blob) 474 s.keccak.Sum(res.Hash[:0]) 475 committed, _, err := s.sched.Process([]trie.SyncResult{res}) 476 return committed, res.Hash, err 477 } 478 479 // updateStats bumps the various state sync progress counters and displays a log 480 // message for the user to see. 481 func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) { 482 s.d.syncStatsLock.Lock() 483 defer s.d.syncStatsLock.Unlock() 484 485 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 486 s.d.syncStatsState.processed += uint64(written) 487 s.d.syncStatsState.duplicate += uint64(duplicate) 488 s.d.syncStatsState.unexpected += uint64(unexpected) 489 490 if written > 0 || duplicate > 0 || unexpected > 0 { 491 log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 492 } 493 if written > 0 { 494 rawdb.WriteFastTrieProgress(s.d.stateDB, s.d.syncStatsState.processed) 495 } 496 }