github.com/vantum/vantum@v0.0.0-20180815184342-fe37d5f7a990/eth/downloader/statesync.go (about) 1 // Copyright 2017 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "fmt" 21 "hash" 22 "sync" 23 "time" 24 25 "github.com/vantum/vantum/common" 26 "github.com/vantum/vantum/core/state" 27 "github.com/vantum/vantum/crypto/sha3" 28 "github.com/vantum/vantum/ethdb" 29 "github.com/vantum/vantum/log" 30 "github.com/vantum/vantum/trie" 31 ) 32 33 // stateReq represents a batch of state fetch requests groupped together into 34 // a single data retrieval network packet. 35 type stateReq struct { 36 items []common.Hash // Hashes of the state items to download 37 tasks map[common.Hash]*stateTask // Download tasks to track previous attempts 38 timeout time.Duration // Maximum round trip time for this to complete 39 timer *time.Timer // Timer to fire when the RTT timeout expires 40 peer *peerConnection // Peer that we're requesting from 41 response [][]byte // Response data of the peer (nil for timeouts) 42 dropped bool // Flag whether the peer dropped off early 43 } 44 45 // timedOut returns if this request timed out. 46 func (req *stateReq) timedOut() bool { 47 return req.response == nil 48 } 49 50 // stateSyncStats is a collection of progress stats to report during a state trie 51 // sync to RPC requests as well as to display in user logs. 52 type stateSyncStats struct { 53 processed uint64 // Number of state entries processed 54 duplicate uint64 // Number of state entries downloaded twice 55 unexpected uint64 // Number of non-requested state entries received 56 pending uint64 // Number of still pending state entries 57 } 58 59 // syncState starts downloading state with the given root hash. 60 func (d *Downloader) syncState(root common.Hash) *stateSync { 61 s := newStateSync(d, root) 62 select { 63 case d.stateSyncStart <- s: 64 case <-d.quitCh: 65 s.err = errCancelStateFetch 66 close(s.done) 67 } 68 return s 69 } 70 71 // stateFetcher manages the active state sync and accepts requests 72 // on its behalf. 73 func (d *Downloader) stateFetcher() { 74 for { 75 select { 76 case s := <-d.stateSyncStart: 77 for next := s; next != nil; { 78 next = d.runStateSync(next) 79 } 80 case <-d.stateCh: 81 // Ignore state responses while no sync is running. 82 case <-d.quitCh: 83 return 84 } 85 } 86 } 87 88 // runStateSync runs a state synchronisation until it completes or another root 89 // hash is requested to be switched over to. 90 func (d *Downloader) runStateSync(s *stateSync) *stateSync { 91 var ( 92 active = make(map[string]*stateReq) // Currently in-flight requests 93 finished []*stateReq // Completed or failed requests 94 timeout = make(chan *stateReq) // Timed out active requests 95 ) 96 defer func() { 97 // Cancel active request timers on exit. Also set peers to idle so they're 98 // available for the next sync. 99 for _, req := range active { 100 req.timer.Stop() 101 req.peer.SetNodeDataIdle(len(req.items)) 102 } 103 }() 104 // Run the state sync. 105 go s.run() 106 defer s.Cancel() 107 108 // Listen for peer departure events to cancel assigned tasks 109 peerDrop := make(chan *peerConnection, 1024) 110 peerSub := s.d.peers.SubscribePeerDrops(peerDrop) 111 defer peerSub.Unsubscribe() 112 113 for { 114 // Enable sending of the first buffered element if there is one. 115 var ( 116 deliverReq *stateReq 117 deliverReqCh chan *stateReq 118 ) 119 if len(finished) > 0 { 120 deliverReq = finished[0] 121 deliverReqCh = s.deliver 122 } 123 124 select { 125 // The stateSync lifecycle: 126 case next := <-d.stateSyncStart: 127 return next 128 129 case <-s.done: 130 return nil 131 132 // Send the next finished request to the current sync: 133 case deliverReqCh <- deliverReq: 134 // Shift out the first request, but also set the emptied slot to nil for GC 135 copy(finished, finished[1:]) 136 finished[len(finished)-1] = nil 137 finished = finished[:len(finished)-1] 138 139 // Handle incoming state packs: 140 case pack := <-d.stateCh: 141 // Discard any data not requested (or previsouly timed out) 142 req := active[pack.PeerId()] 143 if req == nil { 144 log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items()) 145 continue 146 } 147 // Finalize the request and queue up for processing 148 req.timer.Stop() 149 req.response = pack.(*statePack).states 150 151 finished = append(finished, req) 152 delete(active, pack.PeerId()) 153 154 // Handle dropped peer connections: 155 case p := <-peerDrop: 156 // Skip if no request is currently pending 157 req := active[p.id] 158 if req == nil { 159 continue 160 } 161 // Finalize the request and queue up for processing 162 req.timer.Stop() 163 req.dropped = true 164 165 finished = append(finished, req) 166 delete(active, p.id) 167 168 // Handle timed-out requests: 169 case req := <-timeout: 170 // If the peer is already requesting something else, ignore the stale timeout. 171 // This can happen when the timeout and the delivery happens simultaneously, 172 // causing both pathways to trigger. 173 if active[req.peer.id] != req { 174 continue 175 } 176 // Move the timed out data back into the download queue 177 finished = append(finished, req) 178 delete(active, req.peer.id) 179 180 // Track outgoing state requests: 181 case req := <-d.trackStateReq: 182 // If an active request already exists for this peer, we have a problem. In 183 // theory the trie node schedule must never assign two requests to the same 184 // peer. In practive however, a peer might receive a request, disconnect and 185 // immediately reconnect before the previous times out. In this case the first 186 // request is never honored, alas we must not silently overwrite it, as that 187 // causes valid requests to go missing and sync to get stuck. 188 if old := active[req.peer.id]; old != nil { 189 log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id) 190 191 // Make sure the previous one doesn't get siletly lost 192 old.timer.Stop() 193 old.dropped = true 194 195 finished = append(finished, old) 196 } 197 // Start a timer to notify the sync loop if the peer stalled. 198 req.timer = time.AfterFunc(req.timeout, func() { 199 select { 200 case timeout <- req: 201 case <-s.done: 202 // Prevent leaking of timer goroutines in the unlikely case where a 203 // timer is fired just before exiting runStateSync. 204 } 205 }) 206 active[req.peer.id] = req 207 } 208 } 209 } 210 211 // stateSync schedules requests for downloading a particular state trie defined 212 // by a given state root. 213 type stateSync struct { 214 d *Downloader // Downloader instance to access and manage current peerset 215 216 sched *trie.TrieSync // State trie sync scheduler defining the tasks 217 keccak hash.Hash // Keccak256 hasher to verify deliveries with 218 tasks map[common.Hash]*stateTask // Set of tasks currently queued for retrieval 219 220 numUncommitted int 221 bytesUncommitted int 222 223 deliver chan *stateReq // Delivery channel multiplexing peer responses 224 cancel chan struct{} // Channel to signal a termination request 225 cancelOnce sync.Once // Ensures cancel only ever gets called once 226 done chan struct{} // Channel to signal termination completion 227 err error // Any error hit during sync (set before completion) 228 } 229 230 // stateTask represents a single trie node download taks, containing a set of 231 // peers already attempted retrieval from to detect stalled syncs and abort. 232 type stateTask struct { 233 attempts map[string]struct{} 234 } 235 236 // newStateSync creates a new state trie download scheduler. This method does not 237 // yet start the sync. The user needs to call run to initiate. 238 func newStateSync(d *Downloader, root common.Hash) *stateSync { 239 return &stateSync{ 240 d: d, 241 sched: state.NewStateSync(root, d.stateDB), 242 keccak: sha3.NewKeccak256(), 243 tasks: make(map[common.Hash]*stateTask), 244 deliver: make(chan *stateReq), 245 cancel: make(chan struct{}), 246 done: make(chan struct{}), 247 } 248 } 249 250 // run starts the task assignment and response processing loop, blocking until 251 // it finishes, and finally notifying any goroutines waiting for the loop to 252 // finish. 253 func (s *stateSync) run() { 254 s.err = s.loop() 255 close(s.done) 256 } 257 258 // Wait blocks until the sync is done or canceled. 259 func (s *stateSync) Wait() error { 260 <-s.done 261 return s.err 262 } 263 264 // Cancel cancels the sync and waits until it has shut down. 265 func (s *stateSync) Cancel() error { 266 s.cancelOnce.Do(func() { close(s.cancel) }) 267 return s.Wait() 268 } 269 270 // loop is the main event loop of a state trie sync. It it responsible for the 271 // assignment of new tasks to peers (including sending it to them) as well as 272 // for the processing of inbound data. Note, that the loop does not directly 273 // receive data from peers, rather those are buffered up in the downloader and 274 // pushed here async. The reason is to decouple processing from data receipt 275 // and timeouts. 276 func (s *stateSync) loop() error { 277 // Listen for new peer events to assign tasks to them 278 newPeer := make(chan *peerConnection, 1024) 279 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 280 defer peerSub.Unsubscribe() 281 282 // Keep assigning new tasks until the sync completes or aborts 283 for s.sched.Pending() > 0 { 284 if err := s.commit(false); err != nil { 285 return err 286 } 287 s.assignTasks() 288 // Tasks assigned, wait for something to happen 289 select { 290 case <-newPeer: 291 // New peer arrived, try to assign it download tasks 292 293 case <-s.cancel: 294 return errCancelStateFetch 295 296 case <-s.d.cancelCh: 297 return errCancelStateFetch 298 299 case req := <-s.deliver: 300 // Response, disconnect or timeout triggered, drop the peer if stalling 301 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut()) 302 if len(req.items) <= 2 && !req.dropped && req.timedOut() { 303 // 2 items are the minimum requested, if even that times out, we've no use of 304 // this peer at the moment. 305 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 306 s.d.dropPeer(req.peer.id) 307 } 308 // Process all the received blobs and check for stale delivery 309 if err := s.process(req); err != nil { 310 log.Warn("Node data write error", "err", err) 311 return err 312 } 313 req.peer.SetNodeDataIdle(len(req.response)) 314 } 315 } 316 return s.commit(true) 317 } 318 319 func (s *stateSync) commit(force bool) error { 320 if !force && s.bytesUncommitted < ethdb.IdealBatchSize { 321 return nil 322 } 323 start := time.Now() 324 b := s.d.stateDB.NewBatch() 325 s.sched.Commit(b) 326 if err := b.Write(); err != nil { 327 return fmt.Errorf("DB write error: %v", err) 328 } 329 s.updateStats(s.numUncommitted, 0, 0, time.Since(start)) 330 s.numUncommitted = 0 331 s.bytesUncommitted = 0 332 return nil 333 } 334 335 // assignTasks attempts to assing new tasks to all idle peers, either from the 336 // batch currently being retried, or fetching new data from the trie sync itself. 337 func (s *stateSync) assignTasks() { 338 // Iterate over all idle peers and try to assign them state fetches 339 peers, _ := s.d.peers.NodeDataIdlePeers() 340 for _, p := range peers { 341 // Assign a batch of fetches proportional to the estimated latency/bandwidth 342 cap := p.NodeDataCapacity(s.d.requestRTT()) 343 req := &stateReq{peer: p, timeout: s.d.requestTTL()} 344 s.fillTasks(cap, req) 345 346 // If the peer was assigned tasks to fetch, send the network request 347 if len(req.items) > 0 { 348 req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items)) 349 select { 350 case s.d.trackStateReq <- req: 351 req.peer.FetchNodeData(req.items) 352 case <-s.cancel: 353 case <-s.d.cancelCh: 354 } 355 } 356 } 357 } 358 359 // fillTasks fills the given request object with a maximum of n state download 360 // tasks to send to the remote peer. 361 func (s *stateSync) fillTasks(n int, req *stateReq) { 362 // Refill available tasks from the scheduler. 363 if len(s.tasks) < n { 364 new := s.sched.Missing(n - len(s.tasks)) 365 for _, hash := range new { 366 s.tasks[hash] = &stateTask{make(map[string]struct{})} 367 } 368 } 369 // Find tasks that haven't been tried with the request's peer. 370 req.items = make([]common.Hash, 0, n) 371 req.tasks = make(map[common.Hash]*stateTask, n) 372 for hash, t := range s.tasks { 373 // Stop when we've gathered enough requests 374 if len(req.items) == n { 375 break 376 } 377 // Skip any requests we've already tried from this peer 378 if _, ok := t.attempts[req.peer.id]; ok { 379 continue 380 } 381 // Assign the request to this peer 382 t.attempts[req.peer.id] = struct{}{} 383 req.items = append(req.items, hash) 384 req.tasks[hash] = t 385 delete(s.tasks, hash) 386 } 387 } 388 389 // process iterates over a batch of delivered state data, injecting each item 390 // into a running state sync, re-queuing any items that were requested but not 391 // delivered. 392 func (s *stateSync) process(req *stateReq) error { 393 // Collect processing stats and update progress if valid data was received 394 duplicate, unexpected := 0, 0 395 396 defer func(start time.Time) { 397 if duplicate > 0 || unexpected > 0 { 398 s.updateStats(0, duplicate, unexpected, time.Since(start)) 399 } 400 }(time.Now()) 401 402 // Iterate over all the delivered data and inject one-by-one into the trie 403 progress := false 404 405 for _, blob := range req.response { 406 prog, hash, err := s.processNodeData(blob) 407 switch err { 408 case nil: 409 s.numUncommitted++ 410 s.bytesUncommitted += len(blob) 411 progress = progress || prog 412 case trie.ErrNotRequested: 413 unexpected++ 414 case trie.ErrAlreadyProcessed: 415 duplicate++ 416 default: 417 return fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 418 } 419 if _, ok := req.tasks[hash]; ok { 420 delete(req.tasks, hash) 421 } 422 } 423 // Put unfulfilled tasks back into the retry queue 424 npeers := s.d.peers.Len() 425 for hash, task := range req.tasks { 426 // If the node did deliver something, missing items may be due to a protocol 427 // limit or a previous timeout + delayed delivery. Both cases should permit 428 // the node to retry the missing items (to avoid single-peer stalls). 429 if len(req.response) > 0 || req.timedOut() { 430 delete(task.attempts, req.peer.id) 431 } 432 // If we've requested the node too many times already, it may be a malicious 433 // sync where nobody has the right data. Abort. 434 if len(task.attempts) >= npeers { 435 return fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 436 } 437 // Missing item, place into the retry queue. 438 s.tasks[hash] = task 439 } 440 return nil 441 } 442 443 // processNodeData tries to inject a trie node data blob delivered from a remote 444 // peer into the state trie, returning whether anything useful was written or any 445 // error occurred. 446 func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) { 447 res := trie.SyncResult{Data: blob} 448 s.keccak.Reset() 449 s.keccak.Write(blob) 450 s.keccak.Sum(res.Hash[:0]) 451 committed, _, err := s.sched.Process([]trie.SyncResult{res}) 452 return committed, res.Hash, err 453 } 454 455 // updateStats bumps the various state sync progress counters and displays a log 456 // message for the user to see. 457 func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) { 458 s.d.syncStatsLock.Lock() 459 defer s.d.syncStatsLock.Unlock() 460 461 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 462 s.d.syncStatsState.processed += uint64(written) 463 s.d.syncStatsState.duplicate += uint64(duplicate) 464 s.d.syncStatsState.unexpected += uint64(unexpected) 465 466 if written > 0 || duplicate > 0 || unexpected > 0 { 467 log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 468 } 469 }