github.com/SmartMeshFoundation/Spectrum@v0.0.0-20220621030607-452a266fee1e/eth/downloader/statesync.go (about) 1 // Copyright 2017 The Spectrum Authors 2 // This file is part of the Spectrum library. 3 // 4 // The Spectrum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The Spectrum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the Spectrum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "fmt" 21 "hash" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 "github.com/SmartMeshFoundation/Spectrum/common" 27 "github.com/SmartMeshFoundation/Spectrum/core/state" 28 "github.com/SmartMeshFoundation/Spectrum/crypto/sha3" 29 "github.com/SmartMeshFoundation/Spectrum/ethdb" 30 "github.com/SmartMeshFoundation/Spectrum/log" 31 "github.com/SmartMeshFoundation/Spectrum/trie" 32 ) 33 34 // stateReq represents a batch of state fetch requests groupped together into 35 // a single data retrieval network packet. 36 type stateReq struct { 37 items []common.Hash // Hashes of the state items to download 38 tasks map[common.Hash]*stateTask // Download tasks to track previous attempts 39 timeout time.Duration // Maximum round trip time for this to complete 40 timer *time.Timer // Timer to fire when the RTT timeout expires 41 peer *peerConnection // Peer that we're requesting from 42 response [][]byte // Response data of the peer (nil for timeouts) 43 dropped bool // Flag whether the peer dropped off early 44 } 45 46 // timedOut returns if this request timed out. 47 func (req *stateReq) timedOut() bool { 48 return req.response == nil 49 } 50 51 // stateSyncStats is a collection of progress stats to report during a state trie 52 // sync to RPC requests as well as to display in user logs. 53 type stateSyncStats struct { 54 processed uint64 // Number of state entries processed 55 duplicate uint64 // Number of state entries downloaded twice 56 unexpected uint64 // Number of non-requested state entries received 57 pending uint64 // Number of still pending state entries 58 } 59 60 // syncState starts downloading state with the given root hash. 61 func (d *Downloader) syncState(root common.Hash) *stateSync { 62 s := newStateSync(d, root) 63 select { 64 case d.stateSyncStart <- s: 65 case <-d.quitCh: 66 s.err = errCancelStateFetch 67 close(s.done) 68 } 69 return s 70 } 71 72 // stateFetcher manages the active state sync and accepts requests 73 // on its behalf. 74 func (d *Downloader) stateFetcher() { 75 for { 76 select { 77 case s := <-d.stateSyncStart: 78 for next := s; next != nil; { 79 next = d.runStateSync(next) 80 } 81 case <-d.stateCh: 82 // Ignore state responses while no sync is running. 83 case <-d.quitCh: 84 return 85 } 86 } 87 } 88 89 // runStateSync runs a state synchronisation until it completes or another root 90 // hash is requested to be switched over to. 91 func (d *Downloader) runStateSync(s *stateSync) *stateSync { 92 var ( 93 active = make(map[string]*stateReq) // Currently in-flight requests 94 finished []*stateReq // Completed or failed requests 95 timeout = make(chan *stateReq) // Timed out active requests 96 ) 97 defer func() { 98 // Cancel active request timers on exit. Also set peers to idle so they're 99 // available for the next sync. 100 for _, req := range active { 101 req.timer.Stop() 102 req.peer.SetNodeDataIdle(len(req.items)) 103 } 104 }() 105 // Run the state sync. 106 go s.run() 107 defer s.Cancel() 108 109 // Listen for peer departure events to cancel assigned tasks 110 peerDrop := make(chan *peerConnection, 1024) 111 peerSub := s.d.peers.SubscribePeerDrops(peerDrop) 112 defer peerSub.Unsubscribe() 113 114 for { 115 // Enable sending of the first buffered element if there is one. 116 var ( 117 deliverReq *stateReq 118 deliverReqCh chan *stateReq 119 ) 120 if len(finished) > 0 { 121 deliverReq = finished[0] 122 deliverReqCh = s.deliver 123 } 124 125 select { 126 // The stateSync lifecycle: 127 case next := <-d.stateSyncStart: 128 return next 129 130 case <-s.done: 131 return nil 132 133 // Send the next finished request to the current sync: 134 case deliverReqCh <- deliverReq: 135 finished = append(finished[:0], finished[1:]...) 136 137 // Handle incoming state packs: 138 case pack := <-d.stateCh: 139 // Discard any data not requested (or previsouly timed out) 140 req := active[pack.PeerId()] 141 if req == nil { 142 log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items()) 143 continue 144 } 145 // Finalize the request and queue up for processing 146 req.timer.Stop() 147 req.response = pack.(*statePack).states 148 149 finished = append(finished, req) 150 delete(active, pack.PeerId()) 151 152 // Handle dropped peer connections: 153 case p := <-peerDrop: 154 // Skip if no request is currently pending 155 req := active[p.id] 156 if req == nil { 157 continue 158 } 159 // Finalize the request and queue up for processing 160 req.timer.Stop() 161 req.dropped = true 162 163 finished = append(finished, req) 164 delete(active, p.id) 165 166 // Handle timed-out requests: 167 case req := <-timeout: 168 // If the peer is already requesting something else, ignore the stale timeout. 169 // This can happen when the timeout and the delivery happens simultaneously, 170 // causing both pathways to trigger. 171 if active[req.peer.id] != req { 172 continue 173 } 174 // Move the timed out data back into the download queue 175 finished = append(finished, req) 176 delete(active, req.peer.id) 177 178 // Track outgoing state requests: 179 case req := <-d.trackStateReq: 180 // If an active request already exists for this peer, we have a problem. In 181 // theory the trie node schedule must never assign two requests to the same 182 // peer. In practive however, a peer might receive a request, disconnect and 183 // immediately reconnect before the previous times out. In this case the first 184 // request is never honored, alas we must not silently overwrite it, as that 185 // causes valid requests to go missing and sync to get stuck. 186 if old := active[req.peer.id]; old != nil { 187 log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id) 188 189 // Make sure the previous one doesn't get siletly lost 190 old.timer.Stop() 191 old.dropped = true 192 193 finished = append(finished, old) 194 } 195 // Start a timer to notify the sync loop if the peer stalled. 196 req.timer = time.AfterFunc(req.timeout, func() { 197 select { 198 case timeout <- req: 199 case <-s.done: 200 // Prevent leaking of timer goroutines in the unlikely case where a 201 // timer is fired just before exiting runStateSync. 202 } 203 }) 204 active[req.peer.id] = req 205 } 206 } 207 } 208 209 // stateSync schedules requests for downloading a particular state trie defined 210 // by a given state root. 211 type stateSync struct { 212 d *Downloader // Downloader instance to access and manage current peerset 213 214 sched *trie.TrieSync // State trie sync scheduler defining the tasks 215 keccak hash.Hash // Keccak256 hasher to verify deliveries with 216 tasks map[common.Hash]*stateTask // Set of tasks currently queued for retrieval 217 218 numUncommitted int 219 bytesUncommitted int 220 221 deliver chan *stateReq // Delivery channel multiplexing peer responses 222 cancel chan struct{} // Channel to signal a termination request 223 cancelOnce sync.Once // Ensures cancel only ever gets called once 224 done chan struct{} // Channel to signal termination completion 225 err error // Any error hit during sync (set before completion) 226 } 227 228 // stateTask represents a single trie node download taks, containing a set of 229 // peers already attempted retrieval from to detect stalled syncs and abort. 230 type stateTask struct { 231 attempts map[string]struct{} 232 } 233 234 // newStateSync creates a new state trie download scheduler. This method does not 235 // yet start the sync. The user needs to call run to initiate. 236 func newStateSync(d *Downloader, root common.Hash) *stateSync { 237 return &stateSync{ 238 d: d, 239 sched: state.NewStateSync(root, d.stateDB), 240 keccak: sha3.NewKeccak256(), 241 tasks: make(map[common.Hash]*stateTask), 242 deliver: make(chan *stateReq), 243 cancel: make(chan struct{}), 244 done: make(chan struct{}), 245 } 246 } 247 248 // run starts the task assignment and response processing loop, blocking until 249 // it finishes, and finally notifying any goroutines waiting for the loop to 250 // finish. 251 func (s *stateSync) run() { 252 s.err = s.loop() 253 close(s.done) 254 } 255 256 // Wait blocks until the sync is done or canceled. 257 func (s *stateSync) Wait() error { 258 <-s.done 259 return s.err 260 } 261 262 // Cancel cancels the sync and waits until it has shut down. 263 func (s *stateSync) Cancel() error { 264 s.cancelOnce.Do(func() { close(s.cancel) }) 265 return s.Wait() 266 } 267 268 // loop is the main event loop of a state trie sync. It it responsible for the 269 // assignment of new tasks to peers (including sending it to them) as well as 270 // for the processing of inbound data. Note, that the loop does not directly 271 // receive data from peers, rather those are buffered up in the downloader and 272 // pushed here async. The reason is to decouple processing from data receipt 273 // and timeouts. 274 func (s *stateSync) loop() error { 275 // Listen for new peer events to assign tasks to them 276 newPeer := make(chan *peerConnection, 1024) 277 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 278 defer peerSub.Unsubscribe() 279 280 // Keep assigning new tasks until the sync completes or aborts 281 for s.sched.Pending() > 0 { 282 if err := s.commit(false); err != nil { 283 return err 284 } 285 s.assignTasks() 286 // Tasks assigned, wait for something to happen 287 select { 288 case <-newPeer: 289 // New peer arrived, try to assign it download tasks 290 291 case <-s.cancel: 292 return errCancelStateFetch 293 294 case req := <-s.deliver: 295 // Response, disconnect or timeout triggered, drop the peer if stalling 296 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut()) 297 if len(req.items) <= 2 && !req.dropped && req.timedOut() { 298 // 2 items are the minimum requested, if even that times out, we've no use of 299 // this peer at the moment. 300 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 301 s.d.dropPeer(req.peer.id) 302 } 303 var bytes int 304 for _, dd := range req.response { 305 bytes += len(dd) 306 } 307 log.Debug("<<stateSync.loop.deliver>>", "peer", req.peer.id, "response_len", len(req.response), "byte_len", bytes) 308 // Process all the received blobs and check for stale delivery 309 stale, err := s.process(req) 310 if err != nil { 311 log.Warn("Node data write error", "err", err) 312 return err 313 } 314 // The the delivery contains requested data, mark the node idle (otherwise it's a timed out delivery) 315 if !stale { 316 req.peer.SetNodeDataIdle(len(req.response)) 317 } 318 } 319 } 320 return s.commit(true) 321 } 322 323 func (s *stateSync) commit(force bool) error { 324 if !force && s.bytesUncommitted < ethdb.IdealBatchSize { 325 return nil 326 } 327 start := time.Now() 328 b := s.d.stateDB.NewBatch() 329 s.sched.Commit(b) 330 if err := b.Write(); err != nil { 331 return fmt.Errorf("DB write error: %v", err) 332 } 333 s.updateStats(s.numUncommitted, 0, 0, time.Since(start)) 334 s.numUncommitted = 0 335 s.bytesUncommitted = 0 336 return nil 337 } 338 339 // assignTasks attempts to assing new tasks to all idle peers, either from the 340 // batch currently being retried, or fetching new data from the trie sync itself. 341 func (s *stateSync) assignTasks() { 342 // Iterate over all idle peers and try to assign them state fetches 343 peers, _ := s.d.peers.NodeDataIdlePeers() 344 for _, p := range peers { 345 // Assign a batch of fetches proportional to the estimated latency/bandwidth 346 cap := p.NodeDataCapacity(s.d.requestRTT()) 347 req := &stateReq{peer: p, timeout: s.d.requestTTL()} 348 s.fillTasks(cap, req) 349 350 // If the peer was assigned tasks to fetch, send the network request 351 if len(req.items) > 0 { 352 req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items)) 353 select { 354 case s.d.trackStateReq <- req: 355 req.peer.FetchNodeData(req.items) 356 case <-s.cancel: 357 } 358 } 359 } 360 } 361 362 // fillTasks fills the given request object with a maximum of n state download 363 // tasks to send to the remote peer. 364 func (s *stateSync) fillTasks(n int, req *stateReq) { 365 // Refill available tasks from the scheduler. 366 if len(s.tasks) < n { 367 new := s.sched.Missing(n - len(s.tasks)) 368 for _, hash := range new { 369 s.tasks[hash] = &stateTask{make(map[string]struct{})} 370 } 371 } 372 // Find tasks that haven't been tried with the request's peer. 373 req.items = make([]common.Hash, 0, n) 374 req.tasks = make(map[common.Hash]*stateTask, n) 375 for hash, t := range s.tasks { 376 // Stop when we've gathered enough requests 377 if len(req.items) == n { 378 break 379 } 380 // Skip any requests we've already tried from this peer 381 if _, ok := t.attempts[req.peer.id]; ok { 382 continue 383 } 384 // Assign the request to this peer 385 t.attempts[req.peer.id] = struct{}{} 386 req.items = append(req.items, hash) 387 req.tasks[hash] = t 388 delete(s.tasks, hash) 389 } 390 } 391 392 // process iterates over a batch of delivered state data, injecting each item 393 // into a running state sync, re-queuing any items that were requested but not 394 // delivered. 395 func (s *stateSync) process(req *stateReq) (bool, error) { 396 // Collect processing stats and update progress if valid data was received 397 duplicate, unexpected := 0, 0 398 399 defer func(start time.Time) { 400 if duplicate > 0 || unexpected > 0 { 401 s.updateStats(0, duplicate, unexpected, time.Since(start)) 402 } 403 }(time.Now()) 404 405 // Iterate over all the delivered data and inject one-by-one into the trie 406 progress, stale := false, len(req.response) > 0 407 408 for _, blob := range req.response { 409 prog, hash, err := s.processNodeData(blob) 410 switch err { 411 case nil: 412 s.numUncommitted++ 413 s.bytesUncommitted += len(blob) 414 progress = progress || prog 415 case trie.ErrNotRequested: 416 unexpected++ 417 case trie.ErrAlreadyProcessed: 418 duplicate++ 419 default: 420 return stale, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 421 } 422 // If the node delivered a requested item, mark the delivery non-stale 423 if _, ok := req.tasks[hash]; ok { 424 delete(req.tasks, hash) 425 stale = false 426 } 427 } 428 // If we're inside the critical section, reset fail counter since we progressed. 429 if progress && atomic.LoadUint32(&s.d.fsPivotFails) > 1 { 430 log.Trace("Fast-sync progressed, resetting fail counter", "previous", atomic.LoadUint32(&s.d.fsPivotFails)) 431 atomic.StoreUint32(&s.d.fsPivotFails, 1) // Don't ever reset to 0, as that will unlock the pivot block 432 } 433 434 // Put unfulfilled tasks back into the retry queue 435 npeers := s.d.peers.Len() 436 for hash, task := range req.tasks { 437 // If the node did deliver something, missing items may be due to a protocol 438 // limit or a previous timeout + delayed delivery. Both cases should permit 439 // the node to retry the missing items (to avoid single-peer stalls). 440 if len(req.response) > 0 || req.timedOut() { 441 delete(task.attempts, req.peer.id) 442 } 443 // If we've requested the node too many times already, it may be a malicious 444 // sync where nobody has the right data. Abort. 445 if len(task.attempts) >= npeers { 446 return stale, fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 447 } 448 // Missing item, place into the retry queue. 449 s.tasks[hash] = task 450 } 451 return stale, nil 452 } 453 454 // processNodeData tries to inject a trie node data blob delivered from a remote 455 // peer into the state trie, returning whether anything useful was written or any 456 // error occurred. 457 func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) { 458 res := trie.SyncResult{Data: blob} 459 s.keccak.Reset() 460 s.keccak.Write(blob) 461 s.keccak.Sum(res.Hash[:0]) 462 committed, _, err := s.sched.Process([]trie.SyncResult{res}) 463 return committed, res.Hash, err 464 } 465 466 // updateStats bumps the various state sync progress counters and displays a log 467 // message for the user to see. 468 func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) { 469 s.d.syncStatsLock.Lock() 470 defer s.d.syncStatsLock.Unlock() 471 472 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 473 s.d.syncStatsState.processed += uint64(written) 474 s.d.syncStatsState.duplicate += uint64(duplicate) 475 s.d.syncStatsState.unexpected += uint64(unexpected) 476 477 if written > 0 || duplicate > 0 || unexpected > 0 { 478 log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 479 } 480 }