github.com/aigarnetwork/aigar@v0.0.0-20191115204914-d59a6eb70f8e/eth/downloader/statesync.go

// Copyright 2018 The go-ethereum Authors
// Copyright 2019 The go-aigar Authors
// This file is part of the go-aigar library.
//
// The go-aigar library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-aigar library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-aigar library. If not, see <http://www.gnu.org/licenses/>.

package downloader

import (
	"fmt"
	"hash"
	"sync"
	"time"

	"github.com/AigarNetwork/aigar/common"
	"github.com/AigarNetwork/aigar/core/rawdb"
	"github.com/AigarNetwork/aigar/core/state"
	"github.com/AigarNetwork/aigar/ethdb"
	"github.com/AigarNetwork/aigar/log"
	"github.com/AigarNetwork/aigar/trie"
	"golang.org/x/crypto/sha3"
)

// stateReq represents a batch of state fetch requests grouped together into
// a single data retrieval network packet.
type stateReq struct {
	items    []common.Hash              // Hashes of the state items to download
	tasks    map[common.Hash]*stateTask // Download tasks to track previous attempts
	timeout  time.Duration              // Maximum round trip time for this to complete
	timer    *time.Timer                // Timer to fire when the RTT timeout expires
	peer     *peerConnection            // Peer that we're requesting from
	response [][]byte                   // Response data of the peer (nil for timeouts)
	dropped  bool                       // Flag whether the peer dropped off early
}

// timedOut returns whether this request timed out.
func (req *stateReq) timedOut() bool {
	return req.response == nil
}

// stateSyncStats is a collection of progress stats to report during a state trie
// sync to RPC requests as well as to display in user logs.
type stateSyncStats struct {
	processed  uint64 // Number of state entries processed
	duplicate  uint64 // Number of state entries downloaded twice
	unexpected uint64 // Number of non-requested state entries received
	pending    uint64 // Number of still pending state entries
}

// syncState starts downloading state with the given root hash.
func (d *Downloader) syncState(root common.Hash) *stateSync {
	// Create the state sync
	s := newStateSync(d, root)
	select {
	case d.stateSyncStart <- s:
	case <-d.quitCh:
		s.err = errCancelStateFetch
		close(s.done)
	}
	return s
}

// stateFetcher manages the active state sync and accepts requests
// on its behalf.
func (d *Downloader) stateFetcher() {
	for {
		select {
		case s := <-d.stateSyncStart:
			for next := s; next != nil; {
				next = d.runStateSync(next)
			}
		case <-d.stateCh:
			// Ignore state responses while no sync is running.
		case <-d.quitCh:
			return
		}
	}
}

// runStateSync runs a state synchronisation until it completes or another root
// hash is requested to be switched over to.
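// It returns the next sync to switch to when a fresh root arrives on
// stateSyncStart, or nil once the current sync has finished.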
func (d *Downloader) runStateSync(s *stateSync) *stateSync {
	var (
		active   = make(map[string]*stateReq) // Currently in-flight requests
		finished []*stateReq                  // Completed or failed requests
		timeout  = make(chan *stateReq)       // Timed out active requests
	)
	defer func() {
		// Cancel active request timers on exit. Also set peers to idle so they're
		// available for the next sync.
		for _, req := range active {
			req.timer.Stop()
			req.peer.SetNodeDataIdle(len(req.items))
		}
	}()
	// Run the state sync.
	go s.run()
	defer s.Cancel()

	// Listen for peer departure events to cancel assigned tasks
	peerDrop := make(chan *peerConnection, 1024)
	peerSub := s.d.peers.SubscribePeerDrops(peerDrop)
	defer peerSub.Unsubscribe()

	for {
		// Enable sending of the first buffered element if there is one.
		var (
			deliverReq   *stateReq
			deliverReqCh chan *stateReq
		)
		if len(finished) > 0 {
			deliverReq = finished[0]
			deliverReqCh = s.deliver
		}

		select {
		// The stateSync lifecycle:
		case next := <-d.stateSyncStart:
			return next

		case <-s.done:
			return nil

		// Send the next finished request to the current sync:
		case deliverReqCh <- deliverReq:
			// Shift out the first request, but also set the emptied slot to nil for GC
			copy(finished, finished[1:])
			finished[len(finished)-1] = nil
			finished = finished[:len(finished)-1]

		// Handle incoming state packs:
		case pack := <-d.stateCh:
			// Discard any data not requested (or previously timed out)
			req := active[pack.PeerId()]
			if req == nil {
				log.Debug("Unrequested node data", "peer", pack.PeerId(), "len", pack.Items())
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.response = pack.(*statePack).states

			finished = append(finished, req)
			delete(active, pack.PeerId())

		// Handle dropped peer connections:
		case p := <-peerDrop:
			// Skip if no request is currently pending
			req := active[p.id]
			if req == nil {
				continue
			}
			// Finalize the request and queue up for processing
			req.timer.Stop()
			req.dropped = true

			finished = append(finished, req)
			delete(active, p.id)

		// Handle timed-out requests:
		case req := <-timeout:
			// If the peer is already requesting something else, ignore the stale timeout.
			// This can happen when the timeout and the delivery happen simultaneously,
			// causing both pathways to trigger.
			if active[req.peer.id] != req {
				continue
			}
			// Move the timed out data back into the download queue
			finished = append(finished, req)
			delete(active, req.peer.id)

		// Track outgoing state requests:
		case req := <-d.trackStateReq:
			// If an active request already exists for this peer, we have a problem. In
			// theory the trie node schedule must never assign two requests to the same
			// peer. In practice however, a peer might receive a request, disconnect and
			// immediately reconnect before the previous times out. In this case the first
			// request is never honored, alas we must not silently overwrite it, as that
			// causes valid requests to go missing and sync to get stuck.
			if old := active[req.peer.id]; old != nil {
				log.Warn("Busy peer assigned new state fetch", "peer", old.peer.id)

				// Make sure the previous one doesn't get silently lost
				old.timer.Stop()
				old.dropped = true

				finished = append(finished, old)
			}
			// Start a timer to notify the sync loop if the peer stalled.
			req.timer = time.AfterFunc(req.timeout, func() {
				select {
				case timeout <- req:
				case <-s.done:
					// Prevent leaking of timer goroutines in the unlikely case where a
					// timer is fired just before exiting runStateSync.
				}
			})
			active[req.peer.id] = req
		}
	}
}

// stateSync schedules requests for downloading a particular state trie defined
// by a given state root.
type stateSync struct {
	d *Downloader // Downloader instance to access and manage current peerset

	sched  *trie.Sync                 // State trie sync scheduler defining the tasks
	keccak hash.Hash                  // Keccak256 hasher to verify deliveries with
	tasks  map[common.Hash]*stateTask // Set of tasks currently queued for retrieval

	numUncommitted   int // Number of trie nodes processed but not yet committed
	bytesUncommitted int // Number of trie node bytes processed but not yet committed

	deliver    chan *stateReq // Delivery channel multiplexing peer responses
	cancel     chan struct{}  // Channel to signal a termination request
	cancelOnce sync.Once      // Ensures cancel only ever gets called once
	done       chan struct{}  // Channel to signal termination completion
	err        error          // Any error hit during sync (set before completion)
}

// stateTask represents a single trie node download task, containing the set of
// peers already tried for retrieval, used to detect stalled syncs and abort.
type stateTask struct {
	attempts map[string]struct{} // Peers that have already been tried for this node
}

// newStateSync creates a new state trie download scheduler. This method does not
// yet start the sync. The user needs to call run to initiate.
func newStateSync(d *Downloader, root common.Hash) *stateSync {
	return &stateSync{
		d:       d,
		sched:   state.NewStateSync(root, d.stateDB, d.stateBloom),
		keccak:  sha3.NewLegacyKeccak256(),
		tasks:   make(map[common.Hash]*stateTask),
		deliver: make(chan *stateReq),
		cancel:  make(chan struct{}),
		done:    make(chan struct{}),
	}
}

// run starts the task assignment and response processing loop, blocking until
// it finishes, and finally notifying any goroutines waiting for the loop to
// finish.
func (s *stateSync) run() {
	s.err = s.loop()
	close(s.done)
}

// Wait blocks until the sync is done or canceled.
func (s *stateSync) Wait() error {
	<-s.done
	return s.err
}

// Cancel cancels the sync and waits until it has shut down.
func (s *stateSync) Cancel() error {
	s.cancelOnce.Do(func() { close(s.cancel) })
	return s.Wait()
}

// loop is the main event loop of a state trie sync. It is responsible for the
// assignment of new tasks to peers (including sending the requests out) as well
// as for the processing of inbound data. Note that the loop does not directly
// receive data from peers, rather those are buffered up in the downloader and
// pushed here asynchronously. The reason is to decouple processing from data
// receipt and timeouts.
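// The loop exits once the trie scheduler reports no more pending tasks, when
// either cancellation channel fires, or when processing a delivery fails.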
func (s *stateSync) loop() (err error) {
	// Listen for new peer events to assign tasks to them
	newPeer := make(chan *peerConnection, 1024)
	peerSub := s.d.peers.SubscribeNewPeers(newPeer)
	defer peerSub.Unsubscribe()
	defer func() {
		cerr := s.commit(true)
		if err == nil {
			err = cerr
		}
	}()

	// Keep assigning new tasks until the sync completes or aborts
	for s.sched.Pending() > 0 {
		if err = s.commit(false); err != nil {
			return err
		}
		s.assignTasks()
		// Tasks assigned, wait for something to happen
		select {
		case <-newPeer:
			// New peer arrived, try to assign it download tasks

		case <-s.cancel:
			return errCancelStateFetch

		case <-s.d.cancelCh:
			return errCanceled

		case req := <-s.deliver:
			// Response, disconnect or timeout triggered, drop the peer if stalling
			log.Trace("Received node data response", "peer", req.peer.id, "count", len(req.response), "dropped", req.dropped, "timeout", !req.dropped && req.timedOut())
			if len(req.items) <= 2 && !req.dropped && req.timedOut() {
				// 2 items are the minimum requested; if even that times out, we've no use
				// for this peer at the moment.
				log.Warn("Stalling state sync, dropping peer", "peer", req.peer.id)
				if s.d.dropPeer == nil {
					// The dropPeer method is nil when `--copydb` is used for a local copy.
					// Timeouts can occur if e.g. compaction hits at the wrong time, and can be ignored
					req.peer.log.Warn("Downloader wants to drop peer, but peerdrop-function is not set", "peer", req.peer.id)
				} else {
					s.d.dropPeer(req.peer.id)

					// If this peer was the master peer, abort sync immediately
					s.d.cancelLock.RLock()
					master := req.peer.id == s.d.cancelPeer
					s.d.cancelLock.RUnlock()

					if master {
						s.d.cancel()
						return errTimeout
					}
				}
			}
			// Process all the received blobs and check for stale delivery
			delivered, err := s.process(req)
			if err != nil {
				log.Warn("Node data write error", "err", err)
				return err
			}
			req.peer.SetNodeDataIdle(delivered)
		}
	}
	return nil
}

// commit flushes the trie data accumulated by the scheduler into the database,
// either once the in-memory batch has grown past the ideal batch size, or
// unconditionally when force is set (used for the final flush on exit).
func (s *stateSync) commit(force bool) error {
	if !force && s.bytesUncommitted < ethdb.IdealBatchSize {
		return nil
	}
	start := time.Now()
	b := s.d.stateDB.NewBatch()
	if err := s.sched.Commit(b); err != nil {
		return err
	}
	if err := b.Write(); err != nil {
		return fmt.Errorf("DB write error: %v", err)
	}
	s.updateStats(s.numUncommitted, 0, 0, time.Since(start))
	s.numUncommitted = 0
	s.bytesUncommitted = 0
	return nil
}

// assignTasks attempts to assign new tasks to all idle peers, either from the
// batch currently being retried, or fetching new data from the trie sync itself.
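// Tasks handed to a peer are recorded on the request itself, so that any items
// left undelivered can be returned to the retry queue by process.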
func (s *stateSync) assignTasks() {
	// Iterate over all idle peers and try to assign them state fetches
	peers, _ := s.d.peers.NodeDataIdlePeers()
	for _, p := range peers {
		// Assign a batch of fetches proportional to the estimated latency/bandwidth
		cap := p.NodeDataCapacity(s.d.requestRTT())
		req := &stateReq{peer: p, timeout: s.d.requestTTL()}
		s.fillTasks(cap, req)

		// If the peer was assigned tasks to fetch, send the network request
		if len(req.items) > 0 {
			req.peer.log.Trace("Requesting new batch of data", "type", "state", "count", len(req.items))
			select {
			case s.d.trackStateReq <- req:
				req.peer.FetchNodeData(req.items)
			case <-s.cancel:
			case <-s.d.cancelCh:
			}
		}
	}
}

// fillTasks fills the given request object with a maximum of n state download
// tasks to send to the remote peer.
func (s *stateSync) fillTasks(n int, req *stateReq) {
	// Refill available tasks from the scheduler.
	if len(s.tasks) < n {
		new := s.sched.Missing(n - len(s.tasks))
		for _, hash := range new {
			s.tasks[hash] = &stateTask{make(map[string]struct{})}
		}
	}
	// Find tasks that haven't been tried with the request's peer.
	req.items = make([]common.Hash, 0, n)
	req.tasks = make(map[common.Hash]*stateTask, n)
	for hash, t := range s.tasks {
		// Stop when we've gathered enough requests
		if len(req.items) == n {
			break
		}
		// Skip any requests we've already tried from this peer
		if _, ok := t.attempts[req.peer.id]; ok {
			continue
		}
		// Assign the request to this peer
		t.attempts[req.peer.id] = struct{}{}
		req.items = append(req.items, hash)
		req.tasks[hash] = t
		delete(s.tasks, hash)
	}
}

// process iterates over a batch of delivered state data, injecting each item
// into a running state sync, re-queuing any items that were requested but not
// delivered. Returns the number of items successfully processed, and any error
// that occurred.
func (s *stateSync) process(req *stateReq) (int, error) {
	// Collect processing stats and update progress if valid data was received
	duplicate, unexpected, successful := 0, 0, 0

	defer func(start time.Time) {
		if duplicate > 0 || unexpected > 0 {
			s.updateStats(0, duplicate, unexpected, time.Since(start))
		}
	}(time.Now())

	// Iterate over all the delivered data and inject one-by-one into the trie
	for _, blob := range req.response {
		_, hash, err := s.processNodeData(blob)
		switch err {
		case nil:
			s.numUncommitted++
			s.bytesUncommitted += len(blob)
			successful++
		case trie.ErrNotRequested:
			unexpected++
		case trie.ErrAlreadyProcessed:
			duplicate++
		default:
			return successful, fmt.Errorf("invalid state node %s: %v", hash.TerminalString(), err)
		}
		delete(req.tasks, hash)
	}
	// Put unfulfilled tasks back into the retry queue
	npeers := s.d.peers.Len()
	for hash, task := range req.tasks {
		// If the node did deliver something, missing items may be due to a protocol
		// limit or a previous timeout + delayed delivery. Both cases should permit
		// the node to retry the missing items (to avoid single-peer stalls).
		if len(req.response) > 0 || req.timedOut() {
			delete(task.attempts, req.peer.id)
		}
		// If we've requested the node too many times already, it may be a malicious
		// sync where nobody has the right data. Abort.
		if len(task.attempts) >= npeers {
			return successful, fmt.Errorf("state node %s failed with all peers (%d tries, %d peers)", hash.TerminalString(), len(task.attempts), npeers)
		}
		// Missing item, place into the retry queue.
		s.tasks[hash] = task
	}
	return successful, nil
}

// processNodeData tries to inject a trie node data blob delivered from a remote
// peer into the state trie, returning whether anything useful was written or any
// error occurred.
func (s *stateSync) processNodeData(blob []byte) (bool, common.Hash, error) {
	res := trie.SyncResult{Data: blob}
	s.keccak.Reset()
	s.keccak.Write(blob)
	s.keccak.Sum(res.Hash[:0])
	committed, _, err := s.sched.Process([]trie.SyncResult{res})
	return committed, res.Hash, err
}

// updateStats bumps the various state sync progress counters and displays a log
// message for the user to see.
func (s *stateSync) updateStats(written, duplicate, unexpected int, duration time.Duration) {
	s.d.syncStatsLock.Lock()
	defer s.d.syncStatsLock.Unlock()

	s.d.syncStatsState.pending = uint64(s.sched.Pending())
	s.d.syncStatsState.processed += uint64(written)
	s.d.syncStatsState.duplicate += uint64(duplicate)
	s.d.syncStatsState.unexpected += uint64(unexpected)

	if written > 0 || duplicate > 0 || unexpected > 0 {
		log.Info("Imported new state entries", "count", written, "elapsed", common.PrettyDuration(duration), "processed", s.d.syncStatsState.processed, "pending", s.d.syncStatsState.pending, "retry", len(s.tasks), "duplicate", s.d.syncStatsState.duplicate, "unexpected", s.d.syncStatsState.unexpected)
	}
	if written > 0 {
		rawdb.WriteFastTrieProgress(s.d.stateDB, s.d.syncStatsState.processed)
	}
}
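
// syncStateExample is a minimal, hypothetical sketch (not part of the original
// file) of how a caller would drive the API defined above: syncState hands the
// sync over to the stateFetcher loop, Wait blocks for the outcome, and the
// deferred Cancel tears the sync down on every exit path.
func syncStateExample(d *Downloader, root common.Hash) error {
	s := d.syncState(root) // schedule the state trie download
	defer s.Cancel()       // ensure the sync shuts down on all paths
	return s.Wait()        // block until completion or cancellation
}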