// Origin: github.com/sberex/go-sberex@v1.8.2-0.20181113200658-ed96ac38f7d7/eth/downloader/statesync.go

// This file is part of the go-sberex library. The go-sberex library is
// free software: you can redistribute it and/or modify it under the terms
// of the GNU Lesser General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// The go-sberex library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License <http://www.gnu.org/licenses/> for more details.

package downloader

import (
	"fmt"
	"hash"
	"sync"
	"time"

	"github.com/Sberex/go-sberex/common"
	"github.com/Sberex/go-sberex/core/state"
	"github.com/Sberex/go-sberex/crypto/sha3"
	"github.com/Sberex/go-sberex/ethdb"
	"github.com/Sberex/go-sberex/log"
	"github.com/Sberex/go-sberex/trie"
)

// stateReq represents a batch of state fetch requests grouped together into
// a single data retrieval network packet.
type stateReq struct {
	items    []common.Hash              // Hashes of the state items to download
	tasks    map[common.Hash]*stateTask // Download tasks to track previous attempts
	timeout  time.Duration              // Maximum round trip time for this to complete
	timer    *time.Timer                // Timer to fire when the RTT timeout expires
	peer     *peerConnection            // Peer that we're requesting from
	response [][]byte                   // Response data of the peer (nil for timeouts)
	dropped  bool                       // Flag whether the peer dropped off early
}

// timedOut reports whether no response was recorded for this request, i.e.
// whether response is still nil. A request finalized because its peer dropped
// also carries a nil response, so callers that need to tell the two cases
// apart must check the dropped flag as well.
func (req *stateReq) timedOut() bool {
	return req.response == nil
}

// stateSyncStats is a collection of progress stats to report during a state trie
// sync to RPC requests as well as to display in user logs.
type stateSyncStats struct {
	processed  uint64 // Number of state entries processed
	duplicate  uint64 // Number of state entries downloaded twice
	unexpected uint64 // Number of non-requested state entries received
	pending    uint64 // Number of still pending state entries
}

// syncState starts downloading state with the given root hash.
//
// The freshly created sync is handed over on the stateSyncStart channel. If
// the downloader quits before anybody picks it up, the sync is marked as
// failed with errCancelStateFetch and its done channel is closed so that
// waiters are released immediately.
func (d *Downloader) syncState(root common.Hash) *stateSync {
	s := newStateSync(d, root)
	select {
	case d.stateSyncStart <- s:
	case <-d.quitCh:
		s.err = errCancelStateFetch
		close(s.done)
	}
	return s
}

// stateFetcher manages the active state sync and accepts requests
// on its behalf. It loops until quitCh fires.
func (d *Downloader) stateFetcher() {
	for {
		select {
		case s := <-d.stateSyncStart:
			// runStateSync returns the sync that should replace the current
			// one (or nil when there is none), so keep chaining until done.
			for next := s; next != nil; {
				next = d.runStateSync(next)
			}
		case <-d.stateCh:
			// Ignore state responses while no sync is running.
		case <-d.quitCh:
			return
		}
	}
}

// runStateSync runs a state synchronisation until it completes or another root
// hash is requested to be switched over to.
//
// It returns the sync received on stateSyncStart that should take over, or nil
// if the current sync finished on its own. On return the current sync is
// cancelled and waited for, and every still in-flight request has its timer
// stopped and its peer marked idle again.
func (d *Downloader) runStateSync(s *stateSync) *stateSync {
	var (
		active   = make(map[string]*stateReq) // Currently in-flight requests
		finished []*stateReq                  // Completed or failed requests
		timeout  = make(chan *stateReq)       // Timed out active requests
	)
	defer func() {
		// Cancel active request timers on exit. Also set peers to idle so they're
		// available for the next sync.
		for _, req := range active {
			req.timer.Stop()
			req.peer.SetNodeDataIdle(len(req.items))
		}
	}()
	// Run the state sync.
	go s.run()
	defer s.Cancel()

	// Listen for peer departure events to cancel assigned tasks
	peerDrop := make(chan *peerConnection, 1024)
	peerSub := s.d.peers.SubscribePeerDrops(peerDrop)
	defer peerSub.Unsubscribe()

	for {
		// Enable sending of the first buffered element if there is one.
		// While nothing is finished, deliverReqCh stays nil, and a send on a
		// nil channel never proceeds, so the delivery case below is disabled.
		var (
			deliverReq   *stateReq
			deliverReqCh chan *stateReq
		)
		if len(finished) > 0 {
			deliverReq = finished[0]
			deliverReqCh = s.deliver
		}

		select {
		// The stateSync lifecycle:
		case next := <-d.stateSyncStart:
			return next

		case <-s.done:
			return nil

		// Send the next finished request to the current sync:
		case deliverReqCh <- deliverReq:
			// Shift out the first request, but also set the emptied slot to nil for GC
			copy(finished, finished[1:])
			finished[len(finished)-1] = nil
			finished = finished[:len(finished)-1]

		// Handle incoming state packs:
		case pack := <-d.stateCh:
			// Discard any data not requested (or previously timed out)
			req := active[pack.PeerId()]
			if req == nil {
				log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items())
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.response = pack.(*statePack).states

			finished = append(finished, req)
			delete(active, pack.PeerId())

		// Handle dropped peer connections:
		case p := <-peerDrop:
			// Skip if no request is currently pending
			req := active[p.id]
			if req == nil {
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.dropped = true

			finished = append(finished, req)
			delete(active, p.id)

		// Handle timed-out requests:
		case req := <-timeout:
			// If the peer is already requesting something else, ignore the stale timeout.
			// This can happen when the timeout and the delivery happens simultaneously,
			// causing both pathways to trigger.
			if active[req.peer.id] != req {
				continue
			}
			// Move the timed out data back into the download queue
			finished = append(finished, req)
			delete(active, req.peer.id)

		// Track outgoing state requests:
		case req := <-d.trackStateReq:
			// If an active request already exists for this peer, we have a problem. In
			// theory the trie node schedule must never assign two requests to the same
			// peer. In practice however, a peer might receive a request, disconnect and
			// immediately reconnect before the previous times out. In this case the first
			// request is never honored, alas we must not silently overwrite it, as that
			// causes valid requests to go missing and sync to get stuck.
			if old := active[req.peer.id]; old != nil {
				log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id)

				// Make sure the previous one doesn't get silently lost
				old.timer.Stop()
				old.dropped = true

				finished = append(finished, old)
			}
			// Start a timer to notify the sync loop if the peer stalled.
			req.timer = time.AfterFunc(req.timeout, func() {
				select {
				case timeout <- req:
				case <-s.done:
					// Prevent leaking of timer goroutines in the unlikely case where a
					// timer is fired just before exiting runStateSync.
				}
			})
			active[req.peer.id] = req
		}
	}
}

// stateSync schedules requests for downloading a particular state trie defined
// by a given state root.
type stateSync struct {
	d *Downloader // Downloader instance to access and manage current peerset

	sched  *trie.TrieSync             // State trie sync scheduler defining the tasks
	keccak hash.Hash                  // Keccak256 hasher to verify deliveries with
	tasks  map[common.Hash]*stateTask // Set of tasks currently queued for retrieval

	numUncommitted   int // Number of processed entries since the last commit
	bytesUncommitted int // Total size of the processed blobs since the last commit

	deliver    chan *stateReq // Delivery channel multiplexing peer responses
	cancel     chan struct{}  // Channel to signal a termination request
	cancelOnce sync.Once      // Ensures cancel only ever gets called once
	done       chan struct{}  // Channel to signal termination completion
	err        error          // Any error hit during sync (set before completion)
}

// stateTask represents a single trie node download task, containing a set of
// peers already attempted retrieval from to detect stalled syncs and abort.
type stateTask struct {
	attempts map[string]struct{} // Set of peer ids this task was already requested from
}

// newStateSync creates a new state trie download scheduler. This method does not
// yet start the sync. The user needs to call run to initiate.
func newStateSync(d *Downloader, root common.Hash) *stateSync {
	return &stateSync{
		d:       d,
		sched:   state.NewStateSync(root, d.stateDB),
		keccak:  sha3.NewKeccak256(),
		tasks:   make(map[common.Hash]*stateTask),
		deliver: make(chan *stateReq),
		cancel:  make(chan struct{}),
		done:    make(chan struct{}),
	}
}

// run starts the task assignment and response processing loop, blocking until
// it finishes, and finally notifying any goroutines waiting for the loop to
// finish.
func (s *stateSync) run() {
	// The error must be stored before done is closed: Wait reads err right
	// after observing the closed channel.
	s.err = s.loop()
	close(s.done)
}

// Wait blocks until the sync is done or canceled, then returns the error the
// sync finished with (nil on success).
func (s *stateSync) Wait() error {
	<-s.done
	return s.err
}

// Cancel cancels the sync and waits until it has shut down.
260 func (s *stateSync) Cancel() error { 261 s.cancelOnce.Do(func() { close(s.cancel) }) 262 return s.Wait() 263 } 264 265 // loop is the main event loop of a state trie sync. It it responsible for the 266 // assignment of new tasks to peers (including sending it to them) as well as 267 // for the processing of inbound data. Note, that the loop does not directly 268 // receive data from peers, rather those are buffered up in the downloader and 269 // pushed here async. The reason is to decouple processing from data receipt 270 // and timeouts. 271 func (s *stateSync) loop() error { 272 // Listen for new peer events to assign tasks to them 273 newPeer := make(chan *peerConnection, 1024) 274 peerSub := s.d.peers.SubscribeNewPeers(newPeer) 275 defer peerSub.Unsubscribe() 276 277 // Keep assigning new tasks until the sync completes or aborts 278 for s.sched.Pending() > 0 { 279 if err := s.commit(false); err != nil { 280 return err 281 } 282 s.assignTasks() 283 // Tasks assigned, wait for something to happen 284 select { 285 case <-newPeer: 286 // New peer arrived, try to assign it download tasks 287 288 case <-s.cancel: 289 return errCancelStateFetch 290 291 case <-s.d.cancelCh: 292 return errCancelStateFetch 293 294 case req := <-s.deliver: 295 // Response, disconnect or timeout triggered, drop the peer if stalling 296 log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut()) 297 if len(req.items) <= 2 && !req.dropped && req.timedOut() { 298 // 2 items are the minimum requested, if even that times out, we've no use of 299 // this peer at the moment. 
300 log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id) 301 s.d.dropPeer(req.peer.id) 302 } 303 // Process all the received blobs and check for stale delivery 304 if err := s.process(req); err != nil { 305 log.Warn("Node data write error", "err", err) 306 return err 307 } 308 req.peer.SetNodeDataIdle(len(req.response)) 309 } 310 } 311 return s.commit(true) 312 } 313 314 func (s *stateSync) commit(force bool) error { 315 if !force && s.bytesUncommitted < ethdb.IdealBatchSize { 316 return nil 317 } 318 start := time.Now() 319 b := s.d.stateDB.NewBatch() 320 s.sched.Commit(b) 321 if err := b.Write(); err != nil { 322 return fmt.Errorf("DB write error: %v", err) 323 } 324 s.updateStats(s.numUncommitted, 0, 0, time.Since(start)) 325 s.numUncommitted = 0 326 s.bytesUncommitted = 0 327 return nil 328 } 329 330 // assignTasks attempts to assing new tasks to all idle peers, either from the 331 // batch currently being retried, or fetching new data from the trie sync itself. 332 func (s *stateSync) assignTasks() { 333 // Iterate over all idle peers and try to assign them state fetches 334 peers, _ := s.d.peers.NodeDataIdlePeers() 335 for _, p := range peers { 336 // Assign a batch of fetches proportional to the estimated latency/bandwidth 337 cap := p.NodeDataCapacity(s.d.requestRTT()) 338 req := &stateReq{peer: p, timeout: s.d.requestTTL()} 339 s.fillTasks(cap, req) 340 341 // If the peer was assigned tasks to fetch, send the network request 342 if len(req.items) > 0 { 343 req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items)) 344 select { 345 case s.d.trackStateReq <- req: 346 req.peer.FetchNodeData(req.items) 347 case <-s.cancel: 348 case <-s.d.cancelCh: 349 } 350 } 351 } 352 } 353 354 // fillTasks fills the given request object with a maximum of n state download 355 // tasks to send to the remote peer. 356 func (s *stateSync) fillTasks(n int, req *stateReq) { 357 // Refill available tasks from the scheduler. 
358 if len(s.tasks) < n { 359 new := s.sched.Missing(n - len(s.tasks)) 360 for _, hash := range new { 361 s.tasks[hash] = &stateTask{make(map[string]struct{})} 362 } 363 } 364 // Find tasks that haven't been tried with the request's peer. 365 req.items = make([]common.Hash, 0, n) 366 req.tasks = make(map[common.Hash]*stateTask, n) 367 for hash, t := range s.tasks { 368 // Stop when we've gathered enough requests 369 if len(req.items) == n { 370 break 371 } 372 // Skip any requests we've already tried from this peer 373 if _, ok := t.attempts[req.peer.id]; ok { 374 continue 375 } 376 // Assign the request to this peer 377 t.attempts[req.peer.id] = struct{}{} 378 req.items = append(req.items, hash) 379 req.tasks[hash] = t 380 delete(s.tasks, hash) 381 } 382 } 383 384 // process iterates over a batch of delivered state data, injecting each item 385 // into a running state sync, re-queuing any items that were requested but not 386 // delivered. 387 func (s *stateSync) process(req *stateReq) error { 388 // Collect processing stats and update progress if valid data was received 389 duplicate, unexpected := 0, 0 390 391 defer func(start time.Time) { 392 if duplicate > 0 || unexpected > 0 { 393 s.updateStats(0, duplicate, unexpected, time.Since(start)) 394 } 395 }(time.Now()) 396 397 // Iterate over all the delivered data and inject one-by-one into the trie 398 progress := false 399 400 for _, blob := range req.response { 401 prog, hash, err := s.processNodeData(blob) 402 switch err { 403 case nil: 404 s.numUncommitted++ 405 s.bytesUncommitted += len(blob) 406 progress = progress || prog 407 case trie.ErrNotRequested: 408 unexpected++ 409 case trie.ErrAlreadyProcessed: 410 duplicate++ 411 default: 412 return fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err) 413 } 414 if _, ok := req.tasks[hash]; ok { 415 delete(req.tasks, hash) 416 } 417 } 418 // Put unfulfilled tasks back into the retry queue 419 npeers := s.d.peers.Len() 420 for hash, task := range 
req.tasks { 421 // If the node did deliver something, missing items may be due to a protocol 422 // limit or a previous timeout + delayed delivery. Both cases should permit 423 // the node to retry the missing items (to avoid single-peer stalls). 424 if len(req.response) > 0 || req.timedOut() { 425 delete(task.attempts, req.peer.id) 426 } 427 // If we've requested the node too many times already, it may be a malicious 428 // sync where nobody has the right data. Abort. 429 if len(task.attempts) >= npeers { 430 return fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers) 431 } 432 // Missing item, place into the retry queue. 433 s.tasks[hash] = task 434 } 435 return nil 436 } 437 438 // processNodeData tries to inject a trie node data blob delivered from a remote 439 // peer into the state trie, returning whether anything useful was written or any 440 // error occurred. 441 func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) { 442 res := trie.SyncResult{Data: blob} 443 s.keccak.Reset() 444 s.keccak.Write(blob) 445 s.keccak.Sum(res.Hash[:0]) 446 committed, _, err := s.sched.Process([]trie.SyncResult{res}) 447 return committed, res.Hash, err 448 } 449 450 // updateStats bumps the various state sync progress counters and displays a log 451 // message for the user to see. 
452 func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) { 453 s.d.syncStatsLock.Lock() 454 defer s.d.syncStatsLock.Unlock() 455 456 s.d.syncStatsState.pending = uint64(s.sched.Pending()) 457 s.d.syncStatsState.processed += uint64(written) 458 s.d.syncStatsState.duplicate += uint64(duplicate) 459 s.d.syncStatsState.unexpected += uint64(unexpected) 460 461 if written > 0 || duplicate > 0 || unexpected > 0 { 462 log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected) 463 } 464 }