github.com/bamzi/go-ethereum@v1.6.7-0.20170704111104-138f26c93af1/eth/downloader/statesync.go (about) 1 // Copyright 2017 The go-ethereum Authors 2 // This file is part of the go-ethereum library. 3 // 4 // The go-ethereum library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The go-ethereum library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 16 17 package downloader 18 19 import ( 20 "fmt" 21 "hash" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 "github.com/ethereum/go-ethereum/common" 27 "github.com/ethereum/go-ethereum/core/state" 28 "github.com/ethereum/go-ethereum/crypto/sha3" 29 "github.com/ethereum/go-ethereum/log" 30 "github.com/ethereum/go-ethereum/trie" 31 ) 32 33 // stateReq represents a batch of state fetch requests groupped together into 34 // a single data retrieval network packet. 35 type stateReq struct { 36 items []common.Hash // Hashes of the state items to download 37 tasks map[common.Hash]*stateTask // Download tasks to track previous attempts 38 timeout time.Duration // Maximum round trip time for this to complete 39 timer *time.Timer // Timer to fire when the RTT timeout expires 40 peer *peer // Peer that we're requesting from 41 response [][]byte // Response data of the peer (nil for timeouts) 42 } 43 44 // timedOut returns if this request timed out. 45 func (req *stateReq) timedOut() bool { 46 return req.response == nil 47 } 48 49 // stateSyncStats is a collection of progress stats to report during a state trie 50 // sync to RPC requests as well as to display in user logs. 51 type stateSyncStats struct { 52 processed uint64 // Number of state entries processed 53 duplicate uint64 // Number of state entries downloaded twice 54 unexpected uint64 // Number of non-requested state entries received 55 pending uint64 // Number of still pending state entries 56 } 57 58 // syncState starts downloading state with the given root hash. 59 func (d *Downloader) syncState(root common.Hash) *stateSync { 60 s := newStateSync(d, root) 61 select { 62 case d.stateSyncStart <- s: 63 case <-d.quitCh: 64 s.err = errCancelStateFetch 65 close(s.done) 66 } 67 return s 68 } 69 70 // stateFetcher manages the active state sync and accepts requests 71 // on its behalf. 72 func (d *Downloader) stateFetcher() { 73 for { 74 select { 75 case s := <-d.stateSyncStart: 76 for next := s; next != nil; { 77 next = d.runStateSync(next) 78 } 79 case <-d.stateCh: 80 // Ignore state responses while no sync is running. 81 case <-d.quitCh: 82 return 83 } 84 } 85 } 86 87 // runStateSync runs a state synchronisation until it completes or another root 88 // hash is requested to be switched over to. 89 func (d *Downloader) runStateSync(s *stateSync) *stateSync { 90 var ( 91 active = make(map[string]*stateReq) // Currently in-flight requests 92 finished []*stateReq // Completed or failed requests 93 timeout = make(chan *stateReq) // Timed out active requests 94 ) 95 defer func() { 96 // Cancel active request timers on exit. Also set peers to idle so they're 97 // available for the next sync. 98 for _, req := range active { 99 req.timer.Stop() 100 req.peer.SetNodeDataIdle(len(req.items)) 101 } 102 }() 103 // Run the state sync. 104 go s.run() 105 defer s.Cancel() 106 107 for { 108 // Enable sending of the first buffered element if there is one. 109 var ( 110 deliverReq *stateReq 111 deliverReqCh chan *stateReq 112 ) 113 if len(finished) > 0 { 114 deliverReq = finished[0] 115 deliverReqCh = s.deliver 116 } 117 118 select { 119 // The stateSync lifecycle: 120 case next := <-d.stateSyncStart: 121 return next 122 123 case <-s.done: 124 return nil 125 126 // Send the next finished request to the current sync: 127 case deliverReqCh <- deliverReq: 128 finished = append(finished[:0], finished[1:]...) 129 130 // Handle incoming state packs: 131 case pack := <-d.stateCh: 132 // Discard any data not requested (or previsouly timed out) 133 req := active[pack.PeerId()] 134 if req == nil { 135 log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items()) 136 continue 137 } 138 // Finalize the request and queue up for processing 139 req.timer.Stop() 140 req.response = pack.(*statePack).states 141 142 finished = append(finished, req) 143 delete(active, pack.PeerId()) 144 145 // Handle timed-out requests: 146 case req := <-timeout: 147 // If the peer is already requesting something else, ignore the stale timeout. 148 // This can happen when the timeout and the delivery happens simultaneously, 149 // causing both pathways to trigger. 150 if active[req.peer.id] != req { 151 continue 152 } 153 // Move the timed out data back into the download queue 154 finished = append(finished, req) 155 delete(active, req.peer.id) 156 157 // Track outgoing state requests: 158 case req := <-d.trackStateReq: 159 // If an active request already exists for this peer, we have a problem. In 160 // theory the trie node schedule must never assign two requests to the same 161 // peer. In practive however, a peer might receive a request, disconnect and 162 // immediately reconnect before the previous times out. In this case the first 163 // request is never honored, alas we must not silently overwrite it, as that 164 // causes valid requests to go missing and sync to get stuck. 165 if old := active[req.peer.id]; old != nil { 166 log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id) 167 168 // Make sure the previous one doesn't get siletly lost 169 finished = append(finished, old) 170 } 171 // Start a timer to notify the sync loop if the peer stalled. 172 req.timer = time.AfterFunc(req.timeout, func() { 173 select { 174 case timeout <- req: 175 case <-s.done: 176 // Prevent leaking of timer goroutines in the unlikely case where a 177 // timer is fired just before exiting runStateSync. 178 } 179 }) 180 active[req.peer.id] = req 181 } 182 } 183 } 184 185 // stateSync schedules requests for downloading a particular state trie defined 186 // by a given state root. 187 type stateSync struct { 188 d *Downloader // Downloader instance to access and manage current peerset 189 190 sched *state.StateSync // State trie sync scheduler defining the tasks 191 keccak hash.Hash // Keccak256 hasher to verify deliveries with 192 tasks map[common.Hash]*stateTask // Set of tasks currently queued for retrieval 193 194 deliver chan *stateReq // Delivery channel multiplexing peer responses 195 cancel chan struct{} // Channel to signal a termination request 196 cancelOnce sync.Once // Ensures cancel only ever gets called once 197 done chan struct{} // Channel to signal termination completion 198 err error // Any error hit during sync (set before completion) 199 } 200 201 // stateTask represents a single trie node download taks, containing a set of 202 // peers already attempted retrieval from to detect stalled syncs and abort. 203 type stateTask struct { 204 attempts map[string]struct{} 205 } 206 207 // newStateSync creates a new state trie download scheduler. This method does not 208 // yet start the sync. The user needs to call run to initiate. 209 func newStateSync(d *Downloader, root common.Hash) *stateSync { 210 return &stateSync{ 211 d: d, 212 sched: state.NewStateSync(root, d.stateDB), 213 keccak: sha3.NewKeccak256(), 214 tasks: make(map[common.Hash]*stateTask), 215 deliver: make(chan *stateReq), 216 cancel: make(chan struct{}), 217 done: make(chan struct{}), 218 } 219 } 220 221 // run starts the task assignment and response processing loop, blocking until 222 // it finishes, and finally notifying any goroutines waiting for the loop to 223 // finish. 224 func (s *stateSync) run() { 225 s.err = s.loop() 226 close(s.done) 227 } 228 229 // Wait blocks until the sync is done or canceled. 230 func (s *stateSync) Wait() error { 231 <-s.done 232 return s.err 233 } 234 235 // Cancel cancels the sync and waits until it has shut down. 236 func (s *stateSync) Cancel() error { 237 s.cancelOnce.Do(func() { close(s.cancel) }) 238 return s.Wait() 239 } 240 241 // loop is the main event loop of a state trie sync. It it responsible for the 242 // assignment of new tasks to peers (including sending it to them) as well as 243 // for the processing of inbound data. Note, that the loop does not directly 244 // receive data from peers, rather those are buffered up in the downloader and 245 // pushed here async. The reason is to decouple processing from data receipt 246 // and timeouts. 247 func (s *stateSync) loop() error { 248 // Listen for new peer events to assign tasks to them 249 newPeer := make(chan *peer, 1024) 250 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 251 defer peerSub.Unsubscribe() 252 253 // Keep assigning new tasks until the sync completes or aborts 254 for s.sched.Pending() > 0 { 255 if err := s.assignTasks(); err != nil { 256 return err 257 } 258 // Tasks assigned, wait for something to happen 259 select { 260 case <-newPeer: 261 // New peer arrived, try to assign it download tasks 262 263 case <-s.cancel: 264 return errCancelStateFetch 265 266 case req := <-s.deliver: 267 // Response or timeout triggered, drop the peer if stalling 268 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "timeout", req.timedOut()) 269 if len(req.items) <= 2 && req.timedOut() { 270 // 2 items are the minimum requested, if even that times out, we've no use of 271 // this peer at the moment. 272 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 273 s.d.dropPeer(req.peer.id) 274 } 275 // Process all the received blobs and check for stale delivery 276 stale, err := s.process(req) 277 if err != nil { 278 log.Warn("Node data write error", "err", err) 279 return err 280 } 281 // The the delivery contains requested data, mark the node idle (otherwise it's a timed out delivery) 282 if !stale { 283 req.peer.SetNodeDataIdle(len(req.response)) 284 } 285 } 286 } 287 return nil 288 } 289 290 // assignTasks attempts to assing new tasks to all idle peers, either from the 291 // batch currently being retried, or fetching new data from the trie sync itself. 292 func (s *stateSync) assignTasks() error { 293 // Iterate over all idle peers and try to assign them state fetches 294 peers, _ := s.d.peers.NodeDataIdlePeers() 295 for _, p := range peers { 296 // Assign a batch of fetches proportional to the estimated latency/bandwidth 297 cap := p.NodeDataCapacity(s.d.requestRTT()) 298 req := &stateReq{peer: p, timeout: s.d.requestTTL()} 299 s.fillTasks(cap, req) 300 301 // If the peer was assigned tasks to fetch, send the network request 302 if len(req.items) > 0 { 303 req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items)) 304 305 select { 306 case s.d.trackStateReq <- req: 307 req.peer.FetchNodeData(req.items) 308 case <-s.cancel: 309 } 310 } 311 } 312 return nil 313 } 314 315 // fillTasks fills the given request object with a maximum of n state download 316 // tasks to send to the remote peer. 317 func (s *stateSync) fillTasks(n int, req *stateReq) { 318 // Refill available tasks from the scheduler. 319 if len(s.tasks) < n { 320 new := s.sched.Missing(n - len(s.tasks)) 321 for _, hash := range new { 322 s.tasks[hash] = &stateTask{make(map[string]struct{})} 323 } 324 } 325 // Find tasks that haven't been tried with the request's peer. 326 req.items = make([]common.Hash, 0, n) 327 req.tasks = make(map[common.Hash]*stateTask, n) 328 for hash, t := range s.tasks { 329 // Stop when we've gathered enough requests 330 if len(req.items) == n { 331 break 332 } 333 // Skip any requests we've already tried from this peer 334 if _, ok := t.attempts[req.peer.id]; ok { 335 continue 336 } 337 // Assign the request to this peer 338 t.attempts[req.peer.id] = struct{}{} 339 req.items = append(req.items, hash) 340 req.tasks[hash] = t 341 delete(s.tasks, hash) 342 } 343 } 344 345 // process iterates over a batch of delivered state data, injecting each item 346 // into a running state sync, re-queuing any items that were requested but not 347 // delivered. 348 func (s *stateSync) process(req *stateReq) (bool, error) { 349 // Collect processing stats and update progress if valid data was received 350 processed, written, duplicate, unexpected := 0, 0, 0, 0 351 352 defer func(start time.Time) { 353 if processed+written+duplicate+unexpected > 0 { 354 s.updateStats(processed, written, duplicate, unexpected, time.Since(start)) 355 } 356 }(time.Now()) 357 358 // Iterate over all the delivered data and inject one-by-one into the trie 359 progress, stale := false, len(req.response) > 0 360 361 for _, blob := range req.response { 362 prog, hash, err := s.processNodeData(blob) 363 switch err { 364 case nil: 365 processed++ 366 case trie.ErrNotRequested: 367 unexpected++ 368 case trie.ErrAlreadyProcessed: 369 duplicate++ 370 default: 371 return stale, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 372 } 373 if prog { 374 progress = true 375 } 376 // If the node delivered a requested item, mark the delivery non-stale 377 if _, ok := req.tasks[hash]; ok { 378 delete(req.tasks, hash) 379 stale = false 380 } 381 } 382 // If some data managed to hit the database, flush and reset failure counters 383 if progress { 384 // Flush any accumulated data out to disk 385 batch := s.d.stateDB.NewBatch() 386 387 count, err := s.sched.Commit(batch) 388 if err != nil { 389 return stale, err 390 } 391 if err := batch.Write(); err != nil { 392 return stale, err 393 } 394 written = count 395 396 // If we're inside the critical section, reset fail counter since we progressed 397 if atomic.LoadUint32(&s.d.fsPivotFails) > 1 { 398 log.Trace("Fast-sync progressed, resetting fail counter", "previous", atomic.LoadUint32(&s.d.fsPivotFails)) 399 atomic.StoreUint32(&s.d.fsPivotFails, 1) // Don't ever reset to 0, as that will unlock the pivot block 400 } 401 } 402 // Put unfulfilled tasks back into the retry queue 403 npeers := s.d.peers.Len() 404 405 for hash, task := range req.tasks { 406 // If the node did deliver something, missing items may be due to a protocol 407 // limit or a previous timeout + delayed delivery. Both cases should permit 408 // the node to retry the missing items (to avoid single-peer stalls). 409 if len(req.response) > 0 || req.timedOut() { 410 delete(task.attempts, req.peer.id) 411 } 412 // If we've requested the node too many times already, it may be a malicious 413 // sync where nobody has the right data. Abort. 414 if len(task.attempts) >= npeers { 415 return stale, fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 416 } 417 // Missing item, place into the retry queue. 418 s.tasks[hash] = task 419 } 420 return stale, nil 421 } 422 423 // processNodeData tries to inject a trie node data blob delivered from a remote 424 // peer into the state trie, returning whether anything useful was written or any 425 // error occurred. 426 func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) { 427 res := trie.SyncResult{Data: blob} 428 429 s.keccak.Reset() 430 s.keccak.Write(blob) 431 s.keccak.Sum(res.Hash[:0]) 432 433 committed, _, err := s.sched.Process([]trie.SyncResult{res}) 434 return committed, res.Hash, err 435 } 436 437 // updateStats bumps the various state sync progress counters and displays a log 438 // message for the user to see. 439 func (s *stateSync) updateStats(processed, written, duplicate, unexpected int, duration time.Duration) { 440 s.d.syncStatsLock.Lock() 441 defer s.d.syncStatsLock.Unlock() 442 443 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 444 s.d.syncStatsState.processed += uint64(processed) 445 s.d.syncStatsState.duplicate += uint64(duplicate) 446 s.d.syncStatsState.unexpected += uint64(unexpected) 447 448 log.Info("Imported new state entries", "count", processed, "flushed", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 449 }