github.com/CommerciumBlockchain/go-commercium@v0.0.0-20220709212705-b46438a77516/eth/protocols/snap/sync.go

// Copyright 2020 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package snap

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"math/big"
	"math/rand"
	"sync"
	"time"

	"github.com/CommerciumBlockchain/go-commercium/common"
	"github.com/CommerciumBlockchain/go-commercium/core/rawdb"
	"github.com/CommerciumBlockchain/go-commercium/core/state"
	"github.com/CommerciumBlockchain/go-commercium/crypto"
	"github.com/CommerciumBlockchain/go-commercium/ethdb"
	"github.com/CommerciumBlockchain/go-commercium/event"
	"github.com/CommerciumBlockchain/go-commercium/light"
	"github.com/CommerciumBlockchain/go-commercium/log"
	"github.com/CommerciumBlockchain/go-commercium/rlp"
	"github.com/CommerciumBlockchain/go-commercium/trie"
	"golang.org/x/crypto/sha3"
)

var (
	// emptyRoot is the known root hash of an empty trie.
	emptyRoot = common.HexToHash("56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421")

	// emptyCode is the known hash of the empty EVM bytecode.
	emptyCode = crypto.Keccak256Hash(nil)
)

const (
	// maxRequestSize is the maximum number of bytes to request from a remote peer.
	maxRequestSize = 512 * 1024

	// maxStorageSetRequestCount is the maximum number of contracts to request the
	// storage of in a single query. If this number is too low, we're not filling
	// responses fully and waste round trip times. If it's too high, we're capping
	// responses and waste bandwidth.
	maxStorageSetRequestCount = maxRequestSize / 1024

	// maxCodeRequestCount is the maximum number of bytecode blobs to request in a
	// single query. If this number is too low, we're not filling responses fully
	// and waste round trip times. If it's too high, we're capping responses and
	// waste bandwidth.
	//
	// Deployed bytecodes are currently capped at 24KB, so the minimum request
	// size should be maxRequestSize / 24K. Assuming that most contracts do not
	// come close to that, requesting 4x should be a good approximation.
	maxCodeRequestCount = maxRequestSize / (24 * 1024) * 4

	// maxTrieRequestCount is the maximum number of trie node blobs to request in
	// a single query. If this number is too low, we're not filling responses fully
	// and waste round trip times. If it's too high, we're capping responses and
	// waste bandwidth.
	maxTrieRequestCount = 512

	// requestTimeout is the maximum time a peer is allowed to spend on serving
	// a single network request.
	requestTimeout = 10 * time.Second // TODO(karalabe): Make it dynamic ala fast-sync?

	// accountConcurrency is the number of chunks to split the account trie into
	// to allow concurrent retrievals.
	accountConcurrency = 16

	// storageConcurrency is the number of chunks to split a large contract
	// storage trie into to allow concurrent retrievals.
	storageConcurrency = 16
)

// accountRequest tracks a pending account range request to ensure responses are
// to actual requests and to validate any security constraints.
//
// Concurrency note: account requests and responses are handled concurrently from
// the main runloop to allow Merkle proof verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type accountRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	cancel  chan struct{} // Channel to track sync cancellation
	timeout *time.Timer   // Timer to track delivery timeout
	stale   chan struct{} // Channel to signal the request was dropped

	origin common.Hash // First account requested to allow continuation checks
	limit  common.Hash // Last account requested to allow non-overlapping chunking

	task *accountTask // Task which this request is filling (only access fields through the runloop!!)
}

// accountResponse is an already Merkle-verified remote response to an account
// range request. It contains the subtrie for the requested account range and
// the database that's going to be filled with the internal nodes on commit.
type accountResponse struct {
	task *accountTask // Task which this request is filling

	hashes   []common.Hash    // Account hashes in the returned range
	accounts []*state.Account // Expanded accounts in the returned range

	nodes ethdb.KeyValueStore // Database containing the reconstructed trie nodes
	trie  *trie.Trie          // Reconstructed trie to reject incomplete account paths

	bounds   map[common.Hash]struct{} // Boundary nodes to avoid persisting incomplete accounts
	overflow *light.NodeSet           // Overflow nodes to avoid persisting across chunk boundaries

	cont bool // Whether the account range has a continuation
}

// bytecodeRequest tracks a pending bytecode request to ensure responses are to
// actual requests and to validate any security constraints.
//
// Concurrency note: bytecode requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type bytecodeRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	cancel  chan struct{} // Channel to track sync cancellation
	timeout *time.Timer   // Timer to track delivery timeout
	stale   chan struct{} // Channel to signal the request was dropped

	hashes []common.Hash // Bytecode hashes to validate responses
	task   *accountTask  // Task which this request is filling (only access fields through the runloop!!)
}

// bytecodeResponse is an already verified remote response to a bytecode request.
type bytecodeResponse struct {
	task *accountTask // Task which this request is filling

	hashes []common.Hash // Hashes of the bytecode to avoid double hashing
	codes  [][]byte      // Actual bytecodes to store into the database (nil = missing)
}

// storageRequest tracks a pending storage ranges request to ensure responses are
// to actual requests and to validate any security constraints.
//
// Concurrency note: storage requests and responses are handled concurrently from
// the main runloop to allow Merkle proof verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. tasks). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type storageRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	cancel  chan struct{} // Channel to track sync cancellation
	timeout *time.Timer   // Timer to track delivery timeout
	stale   chan struct{} // Channel to signal the request was dropped

	accounts []common.Hash // Account hashes to validate responses
	roots    []common.Hash // Storage roots to validate responses

	origin common.Hash // First storage slot requested to allow continuation checks
	limit  common.Hash // Last storage slot requested to allow non-overlapping chunking

	mainTask *accountTask // Task which this response belongs to (only access fields through the runloop!!)
	subTask  *storageTask // Task which this response is filling (only access fields through the runloop!!)
}

// storageResponse is an already Merkle-verified remote response to a storage
// range request. It contains the subtries for the requested storage ranges and
// the databases that are going to be filled with the internal nodes on commit.
type storageResponse struct {
	mainTask *accountTask // Task which this response belongs to
	subTask  *storageTask // Task which this response is filling

	accounts []common.Hash // Account hashes requested, may be only partially filled
	roots    []common.Hash // Storage roots requested, may be only partially filled

	hashes [][]common.Hash       // Storage slot hashes in the returned range
	slots  [][][]byte            // Storage slot values in the returned range
	nodes  []ethdb.KeyValueStore // Database containing the reconstructed trie nodes
	tries  []*trie.Trie          // Reconstructed tries to reject overflown slots

	// Fields relevant for the last account only
	bounds   map[common.Hash]struct{} // Boundary nodes to avoid persisting (incomplete)
	overflow *light.NodeSet           // Overflow nodes to avoid persisting across chunk boundaries
	cont     bool                     // Whether the last storage range has a continuation
}

// trienodeHealRequest tracks a pending state trie request to ensure responses
// are to actual requests and to validate any security constraints.
//
// Concurrency note: trie node requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type trienodeHealRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	cancel  chan struct{} // Channel to track sync cancellation
	timeout *time.Timer   // Timer to track delivery timeout
	stale   chan struct{} // Channel to signal the request was dropped

	hashes []common.Hash   // Trie node hashes to validate responses
	paths  []trie.SyncPath // Trie node paths requested for rescheduling

	task *healTask // Task which this request is filling (only access fields through the runloop!!)
}

// trienodeHealResponse is an already verified remote response to a trie node request.
type trienodeHealResponse struct {
	task *healTask // Task which this request is filling

	hashes []common.Hash   // Hashes of the trie nodes to avoid double hashing
	paths  []trie.SyncPath // Trie node paths requested for rescheduling missing ones
	nodes  [][]byte        // Actual trie nodes to store into the database (nil = missing)
}

// bytecodeHealRequest tracks a pending bytecode request to ensure responses are to
// actual requests and to validate any security constraints.
//
// Concurrency note: bytecode requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type bytecodeHealRequest struct {
	peer string // Peer to which this request is assigned
	id   uint64 // Request ID of this request

	cancel  chan struct{} // Channel to track sync cancellation
	timeout *time.Timer   // Timer to track delivery timeout
	stale   chan struct{} // Channel to signal the request was dropped

	hashes []common.Hash // Bytecode hashes to validate responses
	task   *healTask     // Task which this request is filling (only access fields through the runloop!!)
}

// bytecodeHealResponse is an already verified remote response to a bytecode request.
type bytecodeHealResponse struct {
	task *healTask // Task which this request is filling

	hashes []common.Hash // Hashes of the bytecode to avoid double hashing
	codes  [][]byte      // Actual bytecodes to store into the database (nil = missing)
}

// accountTask represents the sync task for a chunk of the account snapshot.
type accountTask struct {
	// These fields get serialized to leveldb on shutdown
	Next     common.Hash                    // Next account to sync in this interval
	Last     common.Hash                    // Last account to sync in this interval
	SubTasks map[common.Hash][]*storageTask // Storage intervals needing fetching for large contracts

	// These fields are internals used during runtime
	req  *accountRequest  // Pending request to fill this task
	res  *accountResponse // Validated response filling this task
	pend int              // Number of pending subtasks for this round

	needCode  []bool // Flags whether the filling accounts need code retrieval
	needState []bool // Flags whether the filling accounts need storage retrieval
	needHeal  []bool // Flags whether the filling accounts' state was chunked and needs healing

	codeTasks  map[common.Hash]struct{}    // Code hashes that need retrieval
	stateTasks map[common.Hash]common.Hash // Account hashes->roots that need full state retrieval

	done bool // Flag whether the task can be removed
}

// storageTask represents the sync task for a chunk of the storage snapshot.
type storageTask struct {
	Next common.Hash // Next account to sync in this interval
	Last common.Hash // Last account to sync in this interval

	// These fields are internals used during runtime
	root common.Hash     // Storage root hash for this instance
	req  *storageRequest // Pending request to fill this task
	done bool            // Flag whether the task can be removed
}

// healTask represents the sync task for healing the snap-synced chunk boundaries.
type healTask struct {
	scheduler *trie.Sync // State trie sync scheduler defining the tasks

	trieTasks map[common.Hash]trie.SyncPath // Set of trie node tasks currently queued for retrieval
	codeTasks map[common.Hash]struct{}      // Set of byte code tasks currently queued for retrieval
}

// syncProgress is a database entry to allow suspending and resuming a snapshot state
// sync. As opposed to full and fast sync, there is no way to restart a suspended
// snap sync without prior knowledge of the suspension point.
type syncProgress struct {
	Tasks []*accountTask // The suspended account tasks (contract tasks within)

	// Status report during syncing phase
	AccountSynced  uint64             // Number of accounts downloaded
	AccountBytes   common.StorageSize // Number of account trie bytes persisted to disk
	BytecodeSynced uint64             // Number of bytecodes downloaded
	BytecodeBytes  common.StorageSize // Number of bytecode bytes downloaded
	StorageSynced  uint64             // Number of storage slots downloaded
	StorageBytes   common.StorageSize // Number of storage trie bytes persisted to disk

	// Status report during healing phase
	TrienodeHealSynced uint64             // Number of state trie nodes downloaded
	TrienodeHealBytes  common.StorageSize // Number of state trie bytes persisted to disk
	TrienodeHealDups   uint64             // Number of state trie nodes already processed
	TrienodeHealNops   uint64             // Number of state trie nodes not requested
	BytecodeHealSynced uint64             // Number of bytecodes downloaded
	BytecodeHealBytes  common.StorageSize // Number of bytecodes persisted to disk
	BytecodeHealDups   uint64             // Number of bytecodes already processed
	BytecodeHealNops   uint64             // Number of bytecodes not requested
}

// Syncer is an Ethereum account and storage trie syncer based on snapshots and
// the snap protocol. Its purpose is to download all the accounts and storage
// slots from remote peers and reassemble chunks of the state trie, on top of
// which a state sync can be run to fix any gaps / overlaps.
//
// Every network request has a variety of failure events:
// - The peer disconnects after task assignment, failing to send the request
// - The peer disconnects after sending the request, before delivering on it
// - The peer remains connected, but does not deliver a response in time
// - The peer delivers a stale response after a previous timeout
// - The peer delivers a refusal to serve the requested state
type Syncer struct {
	db    ethdb.KeyValueStore // Database to store the trie nodes into (and dedup)
	bloom *trie.SyncBloom     // Bloom filter to deduplicate nodes for state fixup

	root   common.Hash    // Current state trie root being synced
	tasks  []*accountTask // Current account task set being synced
	healer *healTask      // Current state healing task being executed
	update chan struct{}  // Notification channel for possible sync progression

	peers    map[string]*Peer // Currently active peers to download from
	peerJoin *event.Feed      // Event feed to react to peers joining
	peerDrop *event.Feed      // Event feed to react to peers dropping

	// Request tracking during syncing phase
	statelessPeers map[string]struct{} // Peers that failed to deliver state data
	accountIdlers  map[string]struct{} // Peers that aren't serving account requests
	bytecodeIdlers map[string]struct{} // Peers that aren't serving bytecode requests
	storageIdlers  map[string]struct{} // Peers that aren't serving storage requests

	accountReqs  map[uint64]*accountRequest  // Account requests currently running
	bytecodeReqs map[uint64]*bytecodeRequest // Bytecode requests currently running
	storageReqs  map[uint64]*storageRequest  // Storage requests currently running

	accountReqFails  chan *accountRequest  // Failed account range requests to revert
	bytecodeReqFails chan *bytecodeRequest // Failed bytecode requests to revert
	storageReqFails  chan *storageRequest  // Failed storage requests to revert

	accountResps chan *accountResponse // Account sub-tries to integrate into the database
	bytecodeResps chan *bytecodeResponse // Bytecodes to integrate into the database
	storageResps  chan *storageResponse  // Storage sub-tries to integrate into the database

	accountSynced  uint64             // Number of accounts downloaded
	accountBytes   common.StorageSize // Number of account trie bytes persisted to disk
	bytecodeSynced uint64             // Number of bytecodes downloaded
	bytecodeBytes  common.StorageSize // Number of bytecode bytes downloaded
	storageSynced  uint64             // Number of storage slots downloaded
	storageBytes   common.StorageSize // Number of storage trie bytes persisted to disk

	// Request tracking during healing phase
	trienodeHealIdlers map[string]struct{} // Peers that aren't serving trie node requests
	bytecodeHealIdlers map[string]struct{} // Peers that aren't serving bytecode requests

	trienodeHealReqs map[uint64]*trienodeHealRequest // Trie node requests currently running
	bytecodeHealReqs map[uint64]*bytecodeHealRequest // Bytecode requests currently running

	trienodeHealReqFails chan *trienodeHealRequest // Failed trienode requests to revert
	bytecodeHealReqFails chan *bytecodeHealRequest // Failed bytecode requests to revert

	trienodeHealResps chan *trienodeHealResponse // Trie nodes to integrate into the database
	bytecodeHealResps chan *bytecodeHealResponse // Bytecodes to integrate into the database

	trienodeHealSynced uint64             // Number of state trie nodes downloaded
	trienodeHealBytes  common.StorageSize // Number of state trie bytes persisted to disk
	trienodeHealDups   uint64             // Number of state trie nodes already processed
	trienodeHealNops   uint64             // Number of state trie nodes not requested
	bytecodeHealSynced uint64             // Number of bytecodes downloaded
	bytecodeHealBytes  common.StorageSize // Number of bytecodes persisted to disk
	bytecodeHealDups   uint64             // Number of bytecodes already processed
	bytecodeHealNops   uint64             // Number of bytecodes not requested

	startTime time.Time   // Time instance when snapshot sync started
	startAcc  common.Hash // Account hash where sync started from
	logTime   time.Time   // Time instance when status was last reported

	pend sync.WaitGroup // Tracks network request goroutines for graceful shutdown
	lock sync.RWMutex   // Protects fields that can change outside of sync (peers, reqs, root)
}

// NewSyncer creates a new snapshot syncer that downloads the account and storage
// state over the snap protocol.
func NewSyncer(db ethdb.KeyValueStore, bloom *trie.SyncBloom) *Syncer {
	return &Syncer{
		db:    db,
		bloom: bloom,

		peers:    make(map[string]*Peer),
		peerJoin: new(event.Feed),
		peerDrop: new(event.Feed),
		update:   make(chan struct{}, 1),

		accountIdlers:  make(map[string]struct{}),
		storageIdlers:  make(map[string]struct{}),
		bytecodeIdlers: make(map[string]struct{}),

		accountReqs:      make(map[uint64]*accountRequest),
		storageReqs:      make(map[uint64]*storageRequest),
		bytecodeReqs:     make(map[uint64]*bytecodeRequest),
		accountReqFails:  make(chan *accountRequest),
		storageReqFails:  make(chan *storageRequest),
		bytecodeReqFails: make(chan *bytecodeRequest),
		accountResps:     make(chan *accountResponse),
		storageResps:     make(chan *storageResponse),
		bytecodeResps:    make(chan *bytecodeResponse),

		trienodeHealIdlers: make(map[string]struct{}),
		bytecodeHealIdlers: make(map[string]struct{}),

		trienodeHealReqs: make(map[uint64]*trienodeHealRequest),
		bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest),
		trienodeHealReqFails: make(chan *trienodeHealRequest),
		bytecodeHealReqFails: make(chan *bytecodeHealRequest),
		trienodeHealResps:    make(chan *trienodeHealResponse),
		bytecodeHealResps:    make(chan *bytecodeHealResponse),
	}
}

// Register injects a new data source into the syncer's peerset.
func (s *Syncer) Register(peer *Peer) error {
	// Make sure the peer is not registered yet
	s.lock.Lock()
	if _, ok := s.peers[peer.id]; ok {
		log.Error("Snap peer already registered", "id", peer.id)

		s.lock.Unlock()
		return errors.New("already registered")
	}
	s.peers[peer.id] = peer

	// Mark the peer as idle, even if no sync is running
	s.accountIdlers[peer.id] = struct{}{}
	s.storageIdlers[peer.id] = struct{}{}
	s.bytecodeIdlers[peer.id] = struct{}{}
	s.trienodeHealIdlers[peer.id] = struct{}{}
	s.bytecodeHealIdlers[peer.id] = struct{}{}
	s.lock.Unlock()

	// Notify any active syncs that a new peer can be assigned data
	s.peerJoin.Send(peer.id)
	return nil
}

// Unregister removes a data source from the syncer's peerset.
func (s *Syncer) Unregister(id string) error {
	// Remove all traces of the peer from the registry
	s.lock.Lock()
	if _, ok := s.peers[id]; !ok {
		log.Error("Snap peer not registered", "id", id)

		s.lock.Unlock()
		return errors.New("not registered")
	}
	delete(s.peers, id)

	// Remove status markers, even if no sync is running
	delete(s.statelessPeers, id)

	delete(s.accountIdlers, id)
	delete(s.storageIdlers, id)
	delete(s.bytecodeIdlers, id)
	delete(s.trienodeHealIdlers, id)
	delete(s.bytecodeHealIdlers, id)
	s.lock.Unlock()

	// Notify any active syncs that pending requests need to be reverted
	s.peerDrop.Send(id)
	return nil
}

// Sync starts (or resumes a previous) sync cycle to iterate over a state trie
// with the given root and reconstruct the nodes based on the snapshot leaves.
// Previously downloaded segments will not be redownloaded or fixed, rather any
// errors will be healed after the leaves are fully accumulated.
func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
	// Move the trie root from any previous value, revert stateless markers for
	// any peers and initialize the syncer if it was not yet run
	s.lock.Lock()
	s.root = root
	s.healer = &healTask{
		scheduler: state.NewStateSync(root, s.db, s.bloom),
		trieTasks: make(map[common.Hash]trie.SyncPath),
		codeTasks: make(map[common.Hash]struct{}),
	}
	s.statelessPeers = make(map[string]struct{})
	s.lock.Unlock()

	if s.startTime == (time.Time{}) {
		s.startTime = time.Now()
	}
	// Retrieve the previous sync status from LevelDB and abort if already synced
	s.loadSyncStatus()
	if len(s.tasks) == 0 && s.healer.scheduler.Pending() == 0 {
		log.Debug("Snapshot sync already completed")
		return nil
	}
	defer func() { // Persist any progress, independent of failure
		for _, task := range s.tasks {
			s.forwardAccountTask(task)
		}
		s.cleanAccountTasks()
		s.saveSyncStatus()
	}()

	log.Debug("Starting snapshot sync cycle", "root", root)
	defer s.report(true)

	// Whether sync completed or not, disregard any future packets
	defer func() {
		log.Debug("Terminating snapshot sync cycle", "root", root)
		s.lock.Lock()
		s.accountReqs = make(map[uint64]*accountRequest)
		s.storageReqs = make(map[uint64]*storageRequest)
		s.bytecodeReqs = make(map[uint64]*bytecodeRequest)
		s.trienodeHealReqs = make(map[uint64]*trienodeHealRequest)
		s.bytecodeHealReqs = make(map[uint64]*bytecodeHealRequest)
		s.lock.Unlock()
	}()
	// Keep scheduling sync tasks
	peerJoin := make(chan string, 16)
	peerJoinSub := s.peerJoin.Subscribe(peerJoin)
	defer peerJoinSub.Unsubscribe()

	peerDrop := make(chan string, 16)
	peerDropSub := s.peerDrop.Subscribe(peerDrop)
	defer peerDropSub.Unsubscribe()

	for {
		// Remove all completed tasks and terminate sync if everything's done
		s.cleanStorageTasks()
		s.cleanAccountTasks()
		if len(s.tasks) == 0 && s.healer.scheduler.Pending() == 0 {
			return nil
		}
		// Assign all the data retrieval tasks to any free peers
		s.assignAccountTasks(cancel)
		s.assignBytecodeTasks(cancel)
		s.assignStorageTasks(cancel)
		if len(s.tasks) == 0 {
			// Sync phase done, run heal phase
			s.assignTrienodeHealTasks(cancel)
			s.assignBytecodeHealTasks(cancel)
		}
		// Wait for something to happen
		select {
		case <-s.update:
			// Something happened (new peer, delivery, timeout), recheck tasks
		case <-peerJoin:
			// A new peer joined, try to schedule it new tasks
		case id := <-peerDrop:
			s.revertRequests(id)
		case <-cancel:
			return nil

		case req := <-s.accountReqFails:
			s.revertAccountRequest(req)
		case req := <-s.bytecodeReqFails:
			s.revertBytecodeRequest(req)
		case req := <-s.storageReqFails:
			s.revertStorageRequest(req)
		case req := <-s.trienodeHealReqFails:
			s.revertTrienodeHealRequest(req)
		case req := <-s.bytecodeHealReqFails:
			s.revertBytecodeHealRequest(req)

		case res := <-s.accountResps:
			s.processAccountResponse(res)
		case res := <-s.bytecodeResps:
			s.processBytecodeResponse(res)
		case res := <-s.storageResps:
			s.processStorageResponse(res)
		case res := <-s.trienodeHealResps:
			s.processTrienodeHealResponse(res)
		case res := <-s.bytecodeHealResps:
			s.processBytecodeHealResponse(res)
		}
		// Report stats if something meaningful happened
		s.report(false)
	}
}

// loadSyncStatus retrieves a previously aborted sync status from the database,
// or generates a fresh one if none is available.
func (s *Syncer) loadSyncStatus() {
	var progress syncProgress

	if status := rawdb.ReadSanpshotSyncStatus(s.db); status != nil {
		if err := json.Unmarshal(status, &progress); err != nil {
			log.Error("Failed to decode snap sync status", "err", err)
		} else {
			for _, task := range progress.Tasks {
				log.Debug("Scheduled account sync task", "from", task.Next, "last", task.Last)
			}
			s.tasks = progress.Tasks

			s.accountSynced = progress.AccountSynced
			s.accountBytes = progress.AccountBytes
			s.bytecodeSynced = progress.BytecodeSynced
			s.bytecodeBytes = progress.BytecodeBytes
			s.storageSynced = progress.StorageSynced
			s.storageBytes = progress.StorageBytes

			s.trienodeHealSynced = progress.TrienodeHealSynced
			s.trienodeHealBytes = progress.TrienodeHealBytes
			s.bytecodeHealSynced = progress.BytecodeHealSynced
			s.bytecodeHealBytes = progress.BytecodeHealBytes
			return
		}
	}
	// Either we've failed to decode the previous state, or there was none.
	// Start a fresh sync by chunking up the account range and scheduling
	// them for retrieval.
	s.tasks = nil
	s.accountSynced, s.accountBytes = 0, 0
	s.bytecodeSynced, s.bytecodeBytes = 0, 0
	s.storageSynced, s.storageBytes = 0, 0
	s.trienodeHealSynced, s.trienodeHealBytes = 0, 0
	s.bytecodeHealSynced, s.bytecodeHealBytes = 0, 0

	var next common.Hash
	step := new(big.Int).Sub(
		new(big.Int).Div(
			new(big.Int).Exp(common.Big2, common.Big256, nil),
			big.NewInt(accountConcurrency),
		), common.Big1,
	)
	for i := 0; i < accountConcurrency; i++ {
		last := common.BigToHash(new(big.Int).Add(next.Big(), step))
		if i == accountConcurrency-1 {
			// Make sure we don't overflow if the step is not a proper divisor
			last = common.HexToHash("0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff")
		}
		s.tasks = append(s.tasks, &accountTask{
			Next:     next,
			Last:     last,
			SubTasks: make(map[common.Hash][]*storageTask),
		})
		log.Debug("Created account sync task", "from", next, "last", last)
		next = common.BigToHash(new(big.Int).Add(last.Big(), common.Big1))
	}
}

// saveSyncStatus marshals the remaining sync tasks into leveldb.
func (s *Syncer) saveSyncStatus() {
	progress := &syncProgress{
		Tasks:              s.tasks,
		AccountSynced:      s.accountSynced,
		AccountBytes:       s.accountBytes,
		BytecodeSynced:     s.bytecodeSynced,
		BytecodeBytes:      s.bytecodeBytes,
		StorageSynced:      s.storageSynced,
		StorageBytes:       s.storageBytes,
		TrienodeHealSynced: s.trienodeHealSynced,
		TrienodeHealBytes:  s.trienodeHealBytes,
		BytecodeHealSynced: s.bytecodeHealSynced,
		BytecodeHealBytes:  s.bytecodeHealBytes,
	}
	status, err := json.Marshal(progress)
	if err != nil {
		panic(err) // This can only fail during implementation
	}
	rawdb.WriteSnapshotSyncStatus(s.db, status)
}

// cleanAccountTasks removes account range retrieval tasks that have already been
// completed.
func (s *Syncer) cleanAccountTasks() {
	for i := 0; i < len(s.tasks); i++ {
		if s.tasks[i].done {
			s.tasks = append(s.tasks[:i], s.tasks[i+1:]...)
			i--
		}
	}
}

// cleanStorageTasks iterates over all the account tasks and storage sub-tasks
// within, cleaning any that have been completed.
func (s *Syncer) cleanStorageTasks() {
	for _, task := range s.tasks {
		for account, subtasks := range task.SubTasks {
			// Remove storage range retrieval tasks that completed
			for j := 0; j < len(subtasks); j++ {
				if subtasks[j].done {
					subtasks = append(subtasks[:j], subtasks[j+1:]...)
					j--
				}
			}
			if len(subtasks) > 0 {
				task.SubTasks[account] = subtasks
				continue
			}
			// If all storage chunks are done, mark the account as done too
			for j, hash := range task.res.hashes {
				if hash == account {
					task.needState[j] = false
				}
			}
			delete(task.SubTasks, account)
			task.pend--

			// If this was the last pending task, forward the account task
			if task.pend == 0 {
				s.forwardAccountTask(task)
			}
		}
	}
}

// assignAccountTasks attempts to match idle peers to pending account range
// retrievals.
func (s *Syncer) assignAccountTasks(cancel chan struct{}) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// If there are no idle peers, short circuit assignment
	if len(s.accountIdlers) == 0 {
		return
	}
	// Iterate over all the tasks and try to find a pending one
	for _, task := range s.tasks {
		// Skip any tasks already filling
		if task.req != nil || task.res != nil {
			continue
		}
		// Task pending retrieval, try to find an idle peer. If no such peer
		// exists, we probably assigned tasks for all (or they are stateless).
		// Abort the entire assignment mechanism.
		var idle string
		for id := range s.accountIdlers {
			// If the peer rejected a query in this sync cycle, don't bother asking
			// again for anything, it's either out of sync or already pruned
			if _, ok := s.statelessPeers[id]; ok {
				continue
			}
			idle = id
			break
		}
		if idle == "" {
			return
		}
		// Matched a pending task to an idle peer, allocate a unique request id
		var reqid uint64
		for {
			reqid = uint64(rand.Int63())
			if reqid == 0 {
				continue
			}
			if _, ok := s.accountReqs[reqid]; ok {
				continue
			}
			break
		}
		// Generate the network query and send it to the peer
		req := &accountRequest{
			peer:   idle,
			id:     reqid,
			cancel: cancel,
			stale:  make(chan struct{}),
			origin: task.Next,
			limit:  task.Last,
			task:   task,
		}
		req.timeout = time.AfterFunc(requestTimeout, func() {
			log.Debug("Account range request timed out")
			select {
			case s.accountReqFails <- req:
			default:
			}
		})
		s.accountReqs[reqid] = req
		delete(s.accountIdlers, idle)

		s.pend.Add(1)
		go func(peer *Peer, root common.Hash) {
			defer s.pend.Done()

			// Attempt to send the remote request and revert if it fails
			if err := peer.RequestAccountRange(reqid, root, req.origin, req.limit, maxRequestSize); err != nil {
				peer.Log().Debug("Failed to request account range", "err", err)
				select {
				case s.accountReqFails <- req:
				default:
				}
			}
			// Request successfully sent, start a
		}(s.peers[idle], s.root) // We're in the lock, peers[id] surely exists

		// Inject the request into the task to block further assignments
		task.req = req
	}
}

// assignBytecodeTasks attempts to match idle peers to pending code retrievals.
func (s *Syncer) assignBytecodeTasks(cancel chan struct{}) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// If there are no idle peers, short circuit assignment
	if len(s.bytecodeIdlers) == 0 {
		return
	}
	// Iterate over all the tasks and try to find a pending one
	for _, task := range s.tasks {
		// Skip any tasks not in the bytecode retrieval phase
		if task.res == nil {
			continue
		}
		// Skip tasks that are already retrieving (or done with) all codes
		if len(task.codeTasks) == 0 {
			continue
		}
		// Task pending retrieval, try to find an idle peer. If no such peer
		// exists, we probably assigned tasks for all (or they are stateless).
		// Abort the entire assignment mechanism.
		var idle string
		for id := range s.bytecodeIdlers {
			// If the peer rejected a query in this sync cycle, don't bother asking
			// again for anything, it's either out of sync or already pruned
			if _, ok := s.statelessPeers[id]; ok {
				continue
			}
			idle = id
			break
		}
		if idle == "" {
			return
		}
		// Matched a pending task to an idle peer, allocate a unique request id
		var reqid uint64
		for {
			reqid = uint64(rand.Int63())
			if reqid == 0 {
				continue
			}
			if _, ok := s.bytecodeReqs[reqid]; ok {
				continue
			}
			break
		}
		// Generate the network query and send it to the peer
		hashes := make([]common.Hash, 0, maxCodeRequestCount)
		for hash := range task.codeTasks {
			delete(task.codeTasks, hash)
			hashes = append(hashes, hash)
			if len(hashes) >= maxCodeRequestCount {
				break
			}
		}
		req := &bytecodeRequest{
			peer:   idle,
			id:     reqid,
			cancel: cancel,
			stale:  make(chan struct{}),
			hashes: hashes,
			task:   task,
		}
		req.timeout = time.AfterFunc(requestTimeout, func() {
			log.Debug("Bytecode request timed out")
			select {
			case s.bytecodeReqFails <- req:
			default:
			}
		})
		s.bytecodeReqs[reqid] = req
		delete(s.bytecodeIdlers, idle)

		s.pend.Add(1)
		go func(peer *Peer) {
			defer s.pend.Done()

			// Attempt to send the remote request and revert if it fails
			if err := peer.RequestByteCodes(reqid, hashes, maxRequestSize); err != nil {
				log.Debug("Failed to request bytecodes", "err", err)
				select {
				case s.bytecodeReqFails <- req:
				default:
				}
			}
			// Request successfully sent, start a
		}(s.peers[idle]) // We're in the lock, peers[id] surely exists
	}
}

// assignStorageTasks attempts to match idle peers to pending storage range
// retrievals.
func (s *Syncer) assignStorageTasks(cancel chan struct{}) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// If there are no idle peers, short circuit assignment
	if len(s.storageIdlers) == 0 {
		return
	}
	// Iterate over all the tasks and try to find a pending one
	for _, task := range s.tasks {
		// Skip any tasks not in the storage retrieval phase
		if task.res == nil {
			continue
		}
		// Skip tasks that are already retrieving (or done with) all small states
		if len(task.SubTasks) == 0 && len(task.stateTasks) == 0 {
			continue
		}
		// Task pending retrieval, try to find an idle peer. If no such peer
		// exists, we probably assigned tasks for all (or they are stateless).
		// Abort the entire assignment mechanism.
		var idle string
		for id := range s.storageIdlers {
			// If the peer rejected a query in this sync cycle, don't bother asking
			// again for anything, it's either out of sync or already pruned
			if _, ok := s.statelessPeers[id]; ok {
				continue
			}
			idle = id
			break
		}
		if idle == "" {
			return
		}
		// Matched a pending task to an idle peer, allocate a unique request id
		var reqid uint64
		for {
			reqid = uint64(rand.Int63())
			if reqid == 0 {
				continue
			}
			if _, ok := s.storageReqs[reqid]; ok {
				continue
			}
			break
		}
		// Generate the network query and send it to the peer. If there are
		// large contract tasks pending, complete those before diving into
		// even more new contracts.
		var (
			accounts = make([]common.Hash, 0, maxStorageSetRequestCount)
			roots    = make([]common.Hash, 0, maxStorageSetRequestCount)
			subtask  *storageTask
		)
		for account, subtasks := range task.SubTasks {
			for _, st := range subtasks {
				// Skip any subtasks already filling
				if st.req != nil {
					continue
				}
				// Found an incomplete storage chunk, schedule it
				accounts = append(accounts, account)
				roots = append(roots, st.root)

				subtask = st
				break // Large contract chunks are downloaded individually
			}
			if subtask != nil {
				break // Large contract chunks are downloaded individually
			}
		}
		if subtask == nil {
			// No large contract required retrieval, but small ones available
			for account, root := range task.stateTasks {
				delete(task.stateTasks, account)

				accounts = append(accounts, account)
				roots = append(roots, root)

				if len(accounts) >= maxStorageSetRequestCount {
					break
				}
			}
		}
		// If nothing was found, it means this task is actually already fully
		// retrieving, but large contracts are hard to detect. Skip to the next.
		if len(accounts) == 0 {
			continue
		}
		req := &storageRequest{
			peer:     idle,
			id:       reqid,
			cancel:   cancel,
			stale:    make(chan struct{}),
			accounts: accounts,
			roots:    roots,
			mainTask: task,
			subTask:  subtask,
		}
		if subtask != nil {
			req.origin = subtask.Next
			req.limit = subtask.Last
		}
		req.timeout = time.AfterFunc(requestTimeout, func() {
			log.Debug("Storage request timed out")
			select {
			case s.storageReqFails <- req:
			default:
			}
		})
		s.storageReqs[reqid] = req
		delete(s.storageIdlers, idle)

		s.pend.Add(1)
		go func(peer *Peer, root common.Hash) {
			defer s.pend.Done()

			// Attempt to send the remote request and revert if it fails
			var origin, limit []byte
			if subtask != nil {
				origin, limit = req.origin[:], req.limit[:]
			}
			if err := peer.RequestStorageRanges(reqid, root, accounts, origin, limit, maxRequestSize); err != nil {
				log.Debug("Failed to request storage", "err", err)
				select {
				case s.storageReqFails <- req:
				default:
				}
			}
			// Request successfully sent, start a
		}(s.peers[idle], s.root) // We're in the lock, peers[id] surely exists

		// Inject the request into the subtask to block further assignments
		if subtask != nil {
			subtask.req = req
		}
	}
}

// assignTrienodeHealTasks attempts to match idle peers to trie node requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignTrienodeHealTasks(cancel chan struct{}) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// If there are no idle peers, short circuit assignment
	if len(s.trienodeHealIdlers) == 0 {
		return
	}
	// Iterate over pending tasks and try to find a peer to retrieve with
	for len(s.healer.trieTasks) > 0 || s.healer.scheduler.Pending() > 0 {
		// If there are not enough trie tasks queued to fully assign, fill the
		// queue from the state sync scheduler. The trie syncer schedules these
		// together with bytecodes, so we need to queue them combined.
		var (
			have = len(s.healer.trieTasks) + len(s.healer.codeTasks)
			want = maxTrieRequestCount + maxCodeRequestCount
		)
		if have < want {
			nodes, paths, codes := s.healer.scheduler.Missing(want - have)
			for i, hash := range nodes {
				s.healer.trieTasks[hash] = paths[i]
			}
			for _, hash := range codes {
				s.healer.codeTasks[hash] = struct{}{}
			}
		}
		// If all the heal tasks are bytecodes or already downloading, bail
		if len(s.healer.trieTasks) == 0 {
			return
		}
		// Task pending retrieval, try to find an idle peer. If no such peer
		// exists, we probably assigned tasks for all (or they are stateless).
		// Abort the entire assignment mechanism.
		var idle string
		for id := range s.trienodeHealIdlers {
			// If the peer rejected a query in this sync cycle, don't bother asking
			// again for anything, it's either out of sync or already pruned
			if _, ok := s.statelessPeers[id]; ok {
				continue
			}
			idle = id
			break
		}
		if idle == "" {
			return
		}
		// Matched a pending task to an idle peer, allocate a unique request id
		var reqid uint64
		for {
			reqid = uint64(rand.Int63())
			if reqid == 0 {
				continue
			}
			if _, ok := s.trienodeHealReqs[reqid]; ok {
				continue
			}
			break
		}
		// Generate the network query and send it to the peer
		var (
			hashes   = make([]common.Hash, 0, maxTrieRequestCount)
			paths    = make([]trie.SyncPath, 0, maxTrieRequestCount)
			pathsets = make([]TrieNodePathSet, 0, maxTrieRequestCount)
		)
		for hash, pathset := range s.healer.trieTasks {
			delete(s.healer.trieTasks, hash)

			hashes = append(hashes, hash)
			paths = append(paths, pathset)
			pathsets = append(pathsets, [][]byte(pathset)) // TODO(karalabe): group requests by account hash

			if len(hashes) >= maxTrieRequestCount {
				break
			}
		}
		req := &trienodeHealRequest{
			peer:   idle,
			id:     reqid,
			cancel: cancel,
			stale:  make(chan struct{}),
			hashes: hashes,
			paths:  paths,
			task:   s.healer,
		}
		req.timeout = time.AfterFunc(requestTimeout, func() {
			log.Debug("Trienode heal request timed out")
			select {
			case s.trienodeHealReqFails <- req:
			default:
			}
		})
		s.trienodeHealReqs[reqid] = req
		delete(s.trienodeHealIdlers, idle)

		s.pend.Add(1)
		go func(peer *Peer, root common.Hash) {
			defer s.pend.Done()

			// Attempt to send the remote request and revert if it fails
			if err := peer.RequestTrieNodes(reqid, root, pathsets, maxRequestSize); err != nil {
				log.Debug("Failed to request trienode healers", "err", err)
				select {
				case s.trienodeHealReqFails <- req:
				default:
				}
			}
			// Request successfully sent, start a
		}(s.peers[idle], s.root) // We're in the lock, peers[id] surely exists
	}
}

// assignBytecodeHealTasks attempts to match idle peers to bytecode requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignBytecodeHealTasks(cancel chan struct{}) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// If there are no idle peers, short circuit assignment
	if len(s.bytecodeHealIdlers) == 0 {
		return
	}
	// Iterate over pending tasks and try to find a peer to retrieve with
	for len(s.healer.codeTasks) > 0 || s.healer.scheduler.Pending() > 0 {
		// If there are not enough trie tasks queued to fully assign, fill the
		// queue from the state sync scheduler. The trie syncer schedules these
		// together with trie nodes, so we need to queue them combined.
		var (
			have = len(s.healer.trieTasks) + len(s.healer.codeTasks)
			want = maxTrieRequestCount + maxCodeRequestCount
		)
		if have < want {
			nodes, paths, codes := s.healer.scheduler.Missing(want - have)
			for i, hash := range nodes {
				s.healer.trieTasks[hash] = paths[i]
			}
			for _, hash := range codes {
				s.healer.codeTasks[hash] = struct{}{}
			}
		}
		// If all the heal tasks are trienodes or already downloading, bail
		if len(s.healer.codeTasks) == 0 {
			return
		}
		// Task pending retrieval, try to find an idle peer. If no such peer
		// exists, we probably assigned tasks for all (or they are stateless).
		// Abort the entire assignment mechanism.
		var idle string
		for id := range s.bytecodeHealIdlers {
			// If the peer rejected a query in this sync cycle, don't bother asking
			// again for anything, it's either out of sync or already pruned
			if _, ok := s.statelessPeers[id]; ok {
				continue
			}
			idle = id
			break
		}
		if idle == "" {
			return
		}
		// Matched a pending task to an idle peer, allocate a unique request id
		var reqid uint64
		for {
			reqid = uint64(rand.Int63())
			if reqid == 0 {
				continue
			}
			if _, ok := s.bytecodeHealReqs[reqid]; ok {
				continue
			}
			break
		}
		// Generate the network query and send it to the peer
		hashes := make([]common.Hash, 0, maxCodeRequestCount)
		for hash := range s.healer.codeTasks {
			delete(s.healer.codeTasks, hash)

			hashes = append(hashes, hash)
			if len(hashes) >= maxCodeRequestCount {
				break
			}
		}
		req := &bytecodeHealRequest{
			peer:   idle,
			id:     reqid,
			cancel: cancel,
			stale:  make(chan struct{}),
			hashes: hashes,
			task:   s.healer,
		}
		req.timeout = time.AfterFunc(requestTimeout, func() {
			log.Debug("Bytecode heal request timed out")
			select {
			case s.bytecodeHealReqFails <- req:
			default:
			}
		})
		s.bytecodeHealReqs[reqid] = req
		delete(s.bytecodeHealIdlers, idle)

		s.pend.Add(1)
		go func(peer *Peer) {
			defer s.pend.Done()

			// Attempt to send the remote request and revert if it fails
			if err := peer.RequestByteCodes(reqid, hashes, maxRequestSize); err != nil {
				log.Debug("Failed to request bytecode healers", "err", err)
				select {
				case s.bytecodeHealReqFails <- req:
				default:
				}
			}
			// Request successfully sent, start a
		}(s.peers[idle]) // We're in the lock, peers[id] surely exists
	}
}

// revertRequests locates all the currently pending requests from a particular
// peer and reverts them, rescheduling for others to fulfill.
func (s *Syncer) revertRequests(peer string) {
	// Gather the requests first, reversions need the lock too
	s.lock.Lock()
	var accountReqs []*accountRequest
	for _, req := range s.accountReqs {
		if req.peer == peer {
			accountReqs = append(accountReqs, req)
		}
	}
	var bytecodeReqs []*bytecodeRequest
	for _, req := range s.bytecodeReqs {
		if req.peer == peer {
			bytecodeReqs = append(bytecodeReqs, req)
		}
	}
	var storageReqs []*storageRequest
	for _, req := range s.storageReqs {
		if req.peer == peer {
			storageReqs = append(storageReqs, req)
		}
	}
	var trienodeHealReqs []*trienodeHealRequest
	for _, req := range s.trienodeHealReqs {
		if req.peer == peer {
			trienodeHealReqs = append(trienodeHealReqs, req)
		}
	}
	var bytecodeHealReqs []*bytecodeHealRequest
	for _, req := range s.bytecodeHealReqs {
		if req.peer == peer {
			bytecodeHealReqs = append(bytecodeHealReqs, req)
		}
	}
	s.lock.Unlock()

	// Revert all the requests matching the peer
	for _, req := range accountReqs {
		s.revertAccountRequest(req)
	}
	for _, req := range bytecodeReqs {
		s.revertBytecodeRequest(req)
	}
	for _, req := range storageReqs {
		s.revertStorageRequest(req)
	}
	for _, req := range trienodeHealReqs {
		s.revertTrienodeHealRequest(req)
	}
	for _, req := range bytecodeHealReqs {
		s.revertBytecodeHealRequest(req)
	}
}

// revertAccountRequest cleans up an account range request and returns all failed
// retrieval tasks to the scheduler for reassignment.
func (s *Syncer) revertAccountRequest(req *accountRequest) {
	log.Trace("Reverting account request", "peer", req.peer, "reqid", req.id)
	select {
	case <-req.stale:
		log.Trace("Account request already reverted", "peer", req.peer, "reqid", req.id)
		return
	default:
	}
	close(req.stale)

	// Remove the request from the tracked set
	s.lock.Lock()
	delete(s.accountReqs, req.id)
	s.lock.Unlock()

	// If there's a timeout timer still running, abort it and mark the account
	// task as not-pending, ready for rescheduling
	req.timeout.Stop()
	if req.task.req == req {
		req.task.req = nil
	}
}

// revertBytecodeRequest cleans up a bytecode request and returns all failed
// retrieval tasks to the scheduler for reassignment.
func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
	log.Trace("Reverting bytecode request", "peer", req.peer)
	select {
	case <-req.stale:
		log.Trace("Bytecode request already reverted", "peer", req.peer, "reqid", req.id)
		return
	default:
	}
	close(req.stale)

	// Remove the request from the tracked set
	s.lock.Lock()
	delete(s.bytecodeReqs, req.id)
	s.lock.Unlock()

	// If there's a timeout timer still running, abort it and mark the code
	// retrievals as not-pending, ready for rescheduling
	req.timeout.Stop()
	for _, hash := range req.hashes {
		req.task.codeTasks[hash] = struct{}{}
	}
}

// revertStorageRequest cleans up a storage range request and returns all failed
// retrieval tasks to the scheduler for reassignment.
func (s *Syncer) revertStorageRequest(req *storageRequest) {
	log.Trace("Reverting storage request", "peer", req.peer)
	select {
	case <-req.stale:
		log.Trace("Storage request already reverted", "peer", req.peer, "reqid", req.id)
		return
	default:
	}
	close(req.stale)

	// Remove the request from the tracked set
	s.lock.Lock()
	delete(s.storageReqs, req.id)
	s.lock.Unlock()

	// If there's a timeout timer still running, abort it and mark the storage
	// task as not-pending, ready for rescheduling
	req.timeout.Stop()
	if req.subTask != nil {
		req.subTask.req = nil
	} else {
		for i, account := range req.accounts {
			req.mainTask.stateTasks[account] = req.roots[i]
		}
	}
}

// revertTrienodeHealRequest cleans up a trienode heal request and returns all
// failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
	log.Trace("Reverting trienode heal request", "peer", req.peer)
	select {
	case <-req.stale:
		log.Trace("Trienode heal request already reverted", "peer", req.peer, "reqid", req.id)
		return
	default:
	}
	close(req.stale)

	// Remove the request from the tracked set
	s.lock.Lock()
	delete(s.trienodeHealReqs, req.id)
	s.lock.Unlock()

	// If there's a timeout timer still running, abort it and mark the trie node
	// retrievals as not-pending, ready for rescheduling
	req.timeout.Stop()
	for i, hash := range req.hashes {
		req.task.trieTasks[hash] = [][]byte(req.paths[i])
	}
}

// revertBytecodeHealRequest cleans up a bytecode heal request and returns all failed
// retrieval tasks to the scheduler for reassignment.
func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) {
	log.Trace("Reverting bytecode heal request", "peer", req.peer)
	select {
	case <-req.stale:
		log.Trace("Bytecode heal request already reverted", "peer", req.peer, "reqid", req.id)
		return
	default:
	}
	close(req.stale)

	// Remove the request from the tracked set
	s.lock.Lock()
	delete(s.bytecodeHealReqs, req.id)
	s.lock.Unlock()

	// If there's a timeout timer still running, abort it and mark the code
	// retrievals as not-pending, ready for rescheduling
	req.timeout.Stop()
	for _, hash := range req.hashes {
		req.task.codeTasks[hash] = struct{}{}
	}
}

// processAccountResponse integrates an already validated account range response
// into the account tasks.
func (s *Syncer) processAccountResponse(res *accountResponse) {
	// Switch the task from pending to filling
	res.task.req = nil
	res.task.res = res

	// Ensure that the response doesn't overflow into the subsequent task
	last := res.task.Last.Big()
	for i, hash := range res.hashes {
		if hash.Big().Cmp(last) > 0 {
			// Chunk overflown, cut off excess, but also update the boundary nodes
			for j := i; j < len(res.hashes); j++ {
				if err := res.trie.Prove(res.hashes[j][:], 0, res.overflow); err != nil {
					panic(err) // Account range was already proven, what happened
				}
			}
			res.hashes = res.hashes[:i]
			res.accounts = res.accounts[:i]
			res.cont = false // Mark range completed
			break
		}
	}
	// Iterate over all the accounts and assemble which ones need further sub-
	// filling before the entire account range can be persisted.
	res.task.needCode = make([]bool, len(res.accounts))
	res.task.needState = make([]bool, len(res.accounts))
	res.task.needHeal = make([]bool, len(res.accounts))

	res.task.codeTasks = make(map[common.Hash]struct{})
	res.task.stateTasks = make(map[common.Hash]common.Hash)

	resumed := make(map[common.Hash]struct{})

	res.task.pend = 0
	for i, account := range res.accounts {
		// Check if the account is a contract with an unknown code
		if !bytes.Equal(account.CodeHash, emptyCode[:]) {
			if code := rawdb.ReadCodeWithPrefix(s.db, common.BytesToHash(account.CodeHash)); code == nil {
				res.task.codeTasks[common.BytesToHash(account.CodeHash)] = struct{}{}
				res.task.needCode[i] = true
				res.task.pend++
			}
		}
		// Check if the account is a contract with an unknown storage trie
		if account.Root != emptyRoot {
			if node, err := s.db.Get(account.Root[:]); err != nil || node == nil {
				// If there was a previous large state retrieval in progress,
				// don't restart it from scratch. This happens if a sync cycle
				// is interrupted and resumed later. However, *do* update the
				// previous root hash.
				if subtasks, ok := res.task.SubTasks[res.hashes[i]]; ok {
					log.Error("Resuming large storage retrieval", "account", res.hashes[i], "root", account.Root)
					for _, subtask := range subtasks {
						subtask.root = account.Root
					}
					res.task.needHeal[i] = true
					resumed[res.hashes[i]] = struct{}{}
				} else {
					res.task.stateTasks[res.hashes[i]] = account.Root
				}
				res.task.needState[i] = true
				res.task.pend++
			}
		}
	}
	// Delete any subtasks that have been aborted but not resumed. This may undo
	// some progress if a new peer gives us fewer accounts than an old one, but for
	// now we have to live with that.
	for hash := range res.task.SubTasks {
		if _, ok := resumed[hash]; !ok {
			log.Error("Aborting suspended storage retrieval", "account", hash)
			delete(res.task.SubTasks, hash)
		}
	}
	// If the account range contained no contracts, or all have been fully filled
	// beforehand, short circuit storage filling and forward to the next task
	if res.task.pend == 0 {
		s.forwardAccountTask(res.task)
		return
	}
	// Some accounts are incomplete, leave as is for the storage and contract
	// task assigners to pick up and fill.
}

// processBytecodeResponse integrates an already validated bytecode response
// into the account tasks.
1544 func (s *Syncer) processBytecodeResponse(res *bytecodeResponse) {
1545 batch := s.db.NewBatch()
1546
1547 var (
1548 codes uint64
1549 bytes common.StorageSize
1550 )
1551 for i, hash := range res.hashes {
1552 code := res.codes[i]
1553
1554 // If the bytecode was not delivered, reschedule it
1555 if code == nil {
1556 res.task.codeTasks[hash] = struct{}{}
1557 continue
1558 }
1559 // Code was delivered, mark it not needed any more
1560 for j, account := range res.task.res.accounts {
1561 if res.task.needCode[j] && hash == common.BytesToHash(account.CodeHash) {
1562 res.task.needCode[j] = false
1563 res.task.pend--
1564 }
1565 }
1566 // Push the bytecode into a database batch
1567 s.bytecodeSynced++
1568 s.bytecodeBytes += common.StorageSize(len(code))
1569
1570 codes++
1571 bytes += common.StorageSize(len(code))
1572
1573 rawdb.WriteCode(batch, hash, code)
1574 s.bloom.Add(hash[:])
1575 }
1576 if err := batch.Write(); err != nil {
1577 log.Crit("Failed to persist bytecodes", "err", err)
1578 }
1579 log.Debug("Persisted set of bytecodes", "count", codes, "bytes", bytes)
1580
1581 // If this delivery completed the last pending task, forward the account task
1582 // to the next chunk
1583 if res.task.pend == 0 {
1584 s.forwardAccountTask(res.task)
1585 return
1586 }
1587 // Some accounts are still incomplete, leave as is for the storage and contract
1588 // task assigners to pick up and fill.
1589 }
1590
1591 // processStorageResponse integrates an already validated storage response
1592 // into the account tasks.
1593 func (s *Syncer) processStorageResponse(res *storageResponse) {
1594 // Switch the subtask from pending to idle
1595 if res.subTask != nil {
1596 res.subTask.req = nil
1597 }
1598 batch := s.db.NewBatch()
1599
1600 var (
1601 slots int
1602 nodes int
1603 skipped int
1604 bytes common.StorageSize
1605 )
1606 // Iterate over all the accounts and reconstruct their storage tries from the
1607 // delivered slots
1608 delivered := make(map[common.Hash]bool)
1609 for i := 0; i < len(res.hashes); i++ {
1610 delivered[res.roots[i]] = true
1611 }
1612 for i, account := range res.accounts {
1613 // If the account was not delivered, reschedule it
1614 if i >= len(res.hashes) {
1615 if !delivered[res.roots[i]] {
1616 res.mainTask.stateTasks[account] = res.roots[i]
1617 }
1618 continue
1619 }
1620 // State was delivered, if complete mark as not needed any more, otherwise
1621 // mark the account as needing healing
1622 for j, acc := range res.mainTask.res.accounts {
1623 if res.roots[i] == acc.Root {
1624 // If the packet contains multiple contract storage slots, all
1625 // but the last are surely complete. The last contract may be
1626 // chunked, so check its continuation flag.
1627 if res.subTask == nil && res.mainTask.needState[j] && (i < len(res.hashes)-1 || !res.cont) {
1628 res.mainTask.needState[j] = false
1629 res.mainTask.pend--
1630 }
1631 // If the last contract was chunked, mark it as needing healing
1632 // to avoid writing it out to disk prematurely.
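// A chunked storage trie is retrieved across several responses, so the trie
// nodes sitting on the chunk boundaries are still incomplete at this point;
// forwardAccountTask later skips such accounts when persisting and leaves
// them for the healing phase to patch up.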
1633 if res.subTask == nil && !res.mainTask.needHeal[j] && i == len(res.hashes)-1 && res.cont { 1634 res.mainTask.needHeal[j] = true 1635 } 1636 // If the last contract was chunked, we need to switch to large 1637 // contract handling mode 1638 if res.subTask == nil && i == len(res.hashes)-1 && res.cont { 1639 // If we haven't yet started a large-contract retrieval, create 1640 // the subtasks for it within the main account task 1641 if tasks, ok := res.mainTask.SubTasks[account]; !ok { 1642 var ( 1643 next common.Hash 1644 ) 1645 step := new(big.Int).Sub( 1646 new(big.Int).Div( 1647 new(big.Int).Exp(common.Big2, common.Big256, nil), 1648 big.NewInt(storageConcurrency), 1649 ), common.Big1, 1650 ) 1651 for k := 0; k < storageConcurrency; k++ { 1652 last := common.BigToHash(new(big.Int).Add(next.Big(), step)) 1653 if k == storageConcurrency-1 { 1654 // Make sure we don't overflow if the step is not a proper divisor 1655 last = common.HexToHash("0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff") 1656 } 1657 tasks = append(tasks, &storageTask{ 1658 Next: next, 1659 Last: last, 1660 root: acc.Root, 1661 }) 1662 log.Debug("Created storage sync task", "account", account, "root", acc.Root, "from", next, "last", last) 1663 next = common.BigToHash(new(big.Int).Add(last.Big(), common.Big1)) 1664 } 1665 res.mainTask.SubTasks[account] = tasks 1666 1667 // Since we've just created the sub-tasks, this response 1668 // is surely for the first one (zero origin) 1669 res.subTask = tasks[0] 1670 } 1671 } 1672 // If we're in large contract delivery mode, forward the subtask 1673 if res.subTask != nil { 1674 // Ensure the response doesn't overflow into the subsequent task 1675 last := res.subTask.Last.Big() 1676 for k, hash := range res.hashes[i] { 1677 if hash.Big().Cmp(last) > 0 { 1678 // Chunk overflown, cut off excess, but also update the boundary 1679 for l := k; l < len(res.hashes[i]); l++ { 1680 if err := res.tries[i].Prove(res.hashes[i][l][:], 0, res.overflow); err != nil { 1681 panic(err) // Account range was already proven, what happened 1682 } 1683 } 1684 res.hashes[i] = res.hashes[i][:k] 1685 res.slots[i] = res.slots[i][:k] 1686 res.cont = false // Mark range completed 1687 break 1688 } 1689 } 1690 // Forward the relevant storage chunk (even if created just now) 1691 if res.cont { 1692 res.subTask.Next = common.BigToHash(new(big.Int).Add(res.hashes[i][len(res.hashes[i])-1].Big(), big.NewInt(1))) 1693 } else { 1694 res.subTask.done = true 1695 } 1696 } 1697 } 1698 } 1699 // Iterate over all the reconstructed trie nodes and push them to disk 1700 slots += len(res.hashes[i]) 1701 1702 it := res.nodes[i].NewIterator(nil, nil) 1703 for it.Next() { 1704 // Boundary nodes are not written for the last result, since they are incomplete 1705 if i == len(res.hashes)-1 { 1706 if _, ok := res.bounds[common.BytesToHash(it.Key())]; ok { 1707 skipped++ 1708 continue 1709 } 1710 } 1711 // Node is not a boundary, persist to disk 1712 batch.Put(it.Key(), it.Value()) 1713 s.bloom.Add(it.Key()) 1714 1715 bytes += common.StorageSize(common.HashLength + len(it.Value())) 1716 nodes++ 1717 } 1718 it.Release() 1719 } 1720 if err := batch.Write(); err != nil { 1721 log.Crit("Failed to persist storage slots", "err", err) 1722 } 1723 s.storageSynced += uint64(slots) 1724 s.storageBytes += bytes 1725 1726 log.Debug("Persisted set of storage slots", "accounts", len(res.hashes), "slots", slots, "nodes", nodes, "skipped", skipped, "bytes", bytes) 1727 1728 // If this delivery completed the last pending task, 
forward the account task 1729 // to the next chunk 1730 if res.mainTask.pend == 0 { 1731 s.forwardAccountTask(res.mainTask) 1732 return 1733 } 1734 // Some accounts are still incomplete, leave as is for the storage and contract 1735 // task assigners to pick up and fill. 1736 } 1737 1738 // processTrienodeHealResponse integrates an already validated trienode response 1739 // into the healer tasks. 1740 func (s *Syncer) processTrienodeHealResponse(res *trienodeHealResponse) { 1741 for i, hash := range res.hashes { 1742 node := res.nodes[i] 1743 1744 // If the trie node was not delivered, reschedule it 1745 if node == nil { 1746 res.task.trieTasks[hash] = res.paths[i] 1747 continue 1748 } 1749 // Push the trie node into the state syncer 1750 s.trienodeHealSynced++ 1751 s.trienodeHealBytes += common.StorageSize(len(node)) 1752 1753 err := s.healer.scheduler.Process(trie.SyncResult{Hash: hash, Data: node}) 1754 switch err { 1755 case nil: 1756 case trie.ErrAlreadyProcessed: 1757 s.trienodeHealDups++ 1758 case trie.ErrNotRequested: 1759 s.trienodeHealNops++ 1760 default: 1761 log.Error("Invalid trienode processed", "hash", hash, "err", err) 1762 } 1763 } 1764 batch := s.db.NewBatch() 1765 if err := s.healer.scheduler.Commit(batch); err != nil { 1766 log.Error("Failed to commit healing data", "err", err) 1767 } 1768 if err := batch.Write(); err != nil { 1769 log.Crit("Failed to persist healing data", "err", err) 1770 } 1771 log.Debug("Persisted set of healing data", "bytes", common.StorageSize(batch.ValueSize())) 1772 } 1773 1774 // processBytecodeHealResponse integrates an already validated bytecode response 1775 // into the healer tasks. 1776 func (s *Syncer) processBytecodeHealResponse(res *bytecodeHealResponse) { 1777 for i, hash := range res.hashes { 1778 node := res.codes[i] 1779 1780 // If the trie node was not delivered, reschedule it 1781 if node == nil { 1782 res.task.codeTasks[hash] = struct{}{} 1783 continue 1784 } 1785 // Push the trie node into the state syncer 1786 s.bytecodeHealSynced++ 1787 s.bytecodeHealBytes += common.StorageSize(len(node)) 1788 1789 err := s.healer.scheduler.Process(trie.SyncResult{Hash: hash, Data: node}) 1790 switch err { 1791 case nil: 1792 case trie.ErrAlreadyProcessed: 1793 s.bytecodeHealDups++ 1794 case trie.ErrNotRequested: 1795 s.bytecodeHealNops++ 1796 default: 1797 log.Error("Invalid bytecode processed", "hash", hash, "err", err) 1798 } 1799 } 1800 batch := s.db.NewBatch() 1801 if err := s.healer.scheduler.Commit(batch); err != nil { 1802 log.Error("Failed to commit healing data", "err", err) 1803 } 1804 if err := batch.Write(); err != nil { 1805 log.Crit("Failed to persist healing data", "err", err) 1806 } 1807 log.Debug("Persisted set of healing data", "bytes", common.StorageSize(batch.ValueSize())) 1808 } 1809 1810 // forwardAccountTask takes a filled account task and persists anything available 1811 // into the database, after which it forwards the next account marker so that the 1812 // task's next chunk may be filled. 1813 func (s *Syncer) forwardAccountTask(task *accountTask) { 1814 // Remove any pending delivery 1815 res := task.res 1816 if res == nil { 1817 return // nothing to forward 1818 } 1819 task.res = nil 1820 1821 // Iterate over all the accounts and gather all the incomplete trie nodes. A 1822 // node is incomplete if we haven't yet filled it (sync was interrupted), or 1823 // if we filled it in multiple chunks (storage trie), in which case the few 1824 // nodes on the chunk boundaries are missing. 
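// Such nodes are collected into a light.NodeSet by re-proving the affected
// account hashes against the partial trie; anything present in that set is
// skipped when the batch below is assembled, leaving it for a later sync
// cycle (or the healing phase) to fill in.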
1825 incompletes := light.NewNodeSet()
1826 for i := range res.accounts {
1827 // If the filling was interrupted, mark everything after as incomplete
1828 if task.needCode[i] || task.needState[i] {
1829 for j := i; j < len(res.accounts); j++ {
1830 if err := res.trie.Prove(res.hashes[j][:], 0, incompletes); err != nil {
1831 panic(err) // Account range was already proven, what happened
1832 }
1833 }
1834 break
1835 }
1836 // Filling not interrupted until this point, mark incomplete if needs healing
1837 if task.needHeal[i] {
1838 if err := res.trie.Prove(res.hashes[i][:], 0, incompletes); err != nil {
1839 panic(err) // Account range was already proven, what happened
1840 }
1841 }
1842 }
1843 // Persist every finalized trie node that's not on the boundary
1844 batch := s.db.NewBatch()
1845
1846 var (
1847 nodes int
1848 skipped int
1849 bytes common.StorageSize
1850 )
1851 it := res.nodes.NewIterator(nil, nil)
1852 for it.Next() {
1853 // Boundary nodes are not written, since they are incomplete
1854 if _, ok := res.bounds[common.BytesToHash(it.Key())]; ok {
1855 skipped++
1856 continue
1857 }
1858 // Overflow nodes are not written, since they mess with another task
1859 if _, err := res.overflow.Get(it.Key()); err == nil {
1860 skipped++
1861 continue
1862 }
1863 // Accounts with split storage requests are incomplete
1864 if _, err := incompletes.Get(it.Key()); err == nil {
1865 skipped++
1866 continue
1867 }
1868 // Node is neither a boundary, nor an incomplete account, persist to disk
1869 batch.Put(it.Key(), it.Value())
1870 s.bloom.Add(it.Key())
1871
1872 bytes += common.StorageSize(common.HashLength + len(it.Value()))
1873 nodes++
1874 }
1875 it.Release()
1876
1877 if err := batch.Write(); err != nil {
1878 log.Crit("Failed to persist accounts", "err", err)
1879 }
1880 s.accountBytes += bytes
1881 s.accountSynced += uint64(len(res.accounts))
1882
1883 log.Debug("Persisted range of accounts", "accounts", len(res.accounts), "nodes", nodes, "skipped", skipped, "bytes", bytes)
1884
1885 // Task filling persisted, push the chunk marker forward to the first
1886 // account still missing data.
1887 for i, hash := range res.hashes {
1888 if task.needCode[i] || task.needState[i] {
1889 return
1890 }
1891 task.Next = common.BigToHash(new(big.Int).Add(hash.Big(), big.NewInt(1)))
1892 }
1893 // All accounts marked as complete, track if the entire task is done
1894 task.done = !res.cont
1895 }
1896
1897 // OnAccounts is a callback method to invoke when a range of accounts are
1898 // received from a remote peer.
1899 func (s *Syncer) OnAccounts(peer *Peer, id uint64, hashes []common.Hash, accounts [][]byte, proof [][]byte) error {
1900 size := common.StorageSize(len(hashes) * common.HashLength)
1901 for _, account := range accounts {
1902 size += common.StorageSize(len(account))
1903 }
1904 for _, node := range proof {
1905 size += common.StorageSize(len(node))
1906 }
1907 logger := peer.logger.New("reqid", id)
1908 logger.Trace("Delivering range of accounts", "hashes", len(hashes), "accounts", len(accounts), "proofs", len(proof), "bytes", size)
1909
1910 // Whether or not the response is valid, we can mark the peer as idle and
1911 // notify the scheduler to assign a new task. If the response is invalid,
1912 // we'll drop the peer in a bit.
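// Note that the scheduler notification below is a non-blocking send on
// s.update: if a wakeup signal is already pending, the runloop will fire
// anyway, so dropping the extra notification is harmless.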
1913 s.lock.Lock() 1914 if _, ok := s.peers[peer.id]; ok { 1915 s.accountIdlers[peer.id] = struct{}{} 1916 } 1917 select { 1918 case s.update <- struct{}{}: 1919 default: 1920 } 1921 // Ensure the response is for a valid request 1922 req, ok := s.accountReqs[id] 1923 if !ok { 1924 // Request stale, perhaps the peer timed out but came through in the end 1925 logger.Warn("Unexpected account range packet") 1926 s.lock.Unlock() 1927 return nil 1928 } 1929 delete(s.accountReqs, id) 1930 1931 // Clean up the request timeout timer, we'll see how to proceed further based 1932 // on the actual delivered content 1933 req.timeout.Stop() 1934 1935 // Response is valid, but check if peer is signalling that it does not have 1936 // the requested data. For account range queries that means the state being 1937 // retrieved was either already pruned remotely, or the peer is not yet 1938 // synced to our head. 1939 if len(hashes) == 0 && len(accounts) == 0 && len(proof) == 0 { 1940 logger.Debug("Peer rejected account range request", "root", s.root) 1941 s.statelessPeers[peer.id] = struct{}{} 1942 s.lock.Unlock() 1943 return nil 1944 } 1945 root := s.root 1946 s.lock.Unlock() 1947 1948 // Reconstruct a partial trie from the response and verify it 1949 keys := make([][]byte, len(hashes)) 1950 for i, key := range hashes { 1951 keys[i] = common.CopyBytes(key[:]) 1952 } 1953 nodes := make(light.NodeList, len(proof)) 1954 for i, node := range proof { 1955 nodes[i] = node 1956 } 1957 proofdb := nodes.NodeSet() 1958 1959 var end []byte 1960 if len(keys) > 0 { 1961 end = keys[len(keys)-1] 1962 } 1963 db, tr, notary, cont, err := trie.VerifyRangeProof(root, req.origin[:], end, keys, accounts, proofdb) 1964 if err != nil { 1965 logger.Warn("Account range failed proof", "err", err) 1966 return err 1967 } 1968 // Partial trie reconstructed, send it to the scheduler for storage filling 1969 bounds := make(map[common.Hash]struct{}) 1970 1971 it := notary.Accessed().NewIterator(nil, nil) 1972 for it.Next() { 1973 bounds[common.BytesToHash(it.Key())] = struct{}{} 1974 } 1975 it.Release() 1976 1977 accs := make([]*state.Account, len(accounts)) 1978 for i, account := range accounts { 1979 acc := new(state.Account) 1980 if err := rlp.DecodeBytes(account, acc); err != nil { 1981 panic(err) // We created these blobs, we must be able to decode them 1982 } 1983 accs[i] = acc 1984 } 1985 response := &accountResponse{ 1986 task: req.task, 1987 hashes: hashes, 1988 accounts: accs, 1989 nodes: db, 1990 trie: tr, 1991 bounds: bounds, 1992 overflow: light.NewNodeSet(), 1993 cont: cont, 1994 } 1995 select { 1996 case s.accountResps <- response: 1997 case <-req.cancel: 1998 case <-req.stale: 1999 } 2000 return nil 2001 } 2002 2003 // OnByteCodes is a callback method to invoke when a batch of contract 2004 // bytes codes are received from a remote peer. 2005 func (s *Syncer) OnByteCodes(peer *Peer, id uint64, bytecodes [][]byte) error { 2006 s.lock.RLock() 2007 syncing := len(s.tasks) > 0 2008 s.lock.RUnlock() 2009 2010 if syncing { 2011 return s.onByteCodes(peer, id, bytecodes) 2012 } 2013 return s.onHealByteCodes(peer, id, bytecodes) 2014 } 2015 2016 // onByteCodes is a callback method to invoke when a batch of contract 2017 // bytes codes are received from a remote peer in the syncing phase. 
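// Delivered blobs are matched positionally against the requested hashes: a
// response must be an in-order subsequence of the request, so each code is
// hashed with Keccak256 and the request list is scanned forward until the
// hash is found, leaving nil entries for anything the peer skipped. A minimal
// sketch of that matching (matchCodes is a hypothetical standalone helper,
// not part of this file, and omits the unexpected-data error handling below):
//
//	func matchCodes(want []common.Hash, got [][]byte) [][]byte {
//		out := make([][]byte, len(want))
//		for i, j := 0, 0; i < len(got); i++ {
//			h := crypto.Keccak256Hash(got[i])
//			for j < len(want) && want[j] != h {
//				j++ // skip hashes the peer chose not to serve
//			}
//			if j < len(want) {
//				out[j], j = got[i], j+1
//			}
//		}
//		return out
//	}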
2018 func (s *Syncer) onByteCodes(peer *Peer, id uint64, bytecodes [][]byte) error { 2019 var size common.StorageSize 2020 for _, code := range bytecodes { 2021 size += common.StorageSize(len(code)) 2022 } 2023 logger := peer.logger.New("reqid", id) 2024 logger.Trace("Delivering set of bytecodes", "bytecodes", len(bytecodes), "bytes", size) 2025 2026 // Whether or not the response is valid, we can mark the peer as idle and 2027 // notify the scheduler to assign a new task. If the response is invalid, 2028 // we'll drop the peer in a bit. 2029 s.lock.Lock() 2030 if _, ok := s.peers[peer.id]; ok { 2031 s.bytecodeIdlers[peer.id] = struct{}{} 2032 } 2033 select { 2034 case s.update <- struct{}{}: 2035 default: 2036 } 2037 // Ensure the response is for a valid request 2038 req, ok := s.bytecodeReqs[id] 2039 if !ok { 2040 // Request stale, perhaps the peer timed out but came through in the end 2041 logger.Warn("Unexpected bytecode packet") 2042 s.lock.Unlock() 2043 return nil 2044 } 2045 delete(s.bytecodeReqs, id) 2046 2047 // Clean up the request timeout timer, we'll see how to proceed further based 2048 // on the actual delivered content 2049 req.timeout.Stop() 2050 2051 // Response is valid, but check if peer is signalling that it does not have 2052 // the requested data. For bytecode range queries that means the peer is not 2053 // yet synced. 2054 if len(bytecodes) == 0 { 2055 logger.Debug("Peer rejected bytecode request") 2056 s.statelessPeers[peer.id] = struct{}{} 2057 s.lock.Unlock() 2058 return nil 2059 } 2060 s.lock.Unlock() 2061 2062 // Cross reference the requested bytecodes with the response to find gaps 2063 // that the serving node is missing 2064 hasher := sha3.NewLegacyKeccak256() 2065 2066 codes := make([][]byte, len(req.hashes)) 2067 for i, j := 0, 0; i < len(bytecodes); i++ { 2068 // Find the next hash that we've been served, leaving misses with nils 2069 hasher.Reset() 2070 hasher.Write(bytecodes[i]) 2071 hash := hasher.Sum(nil) 2072 2073 for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) { 2074 j++ 2075 } 2076 if j < len(req.hashes) { 2077 codes[j] = bytecodes[i] 2078 j++ 2079 continue 2080 } 2081 // We've either ran out of hashes, or got unrequested data 2082 logger.Warn("Unexpected bytecodes", "count", len(bytecodes)-i) 2083 return errors.New("unexpected bytecode") 2084 } 2085 // Response validated, send it to the scheduler for filling 2086 response := &bytecodeResponse{ 2087 task: req.task, 2088 hashes: req.hashes, 2089 codes: codes, 2090 } 2091 select { 2092 case s.bytecodeResps <- response: 2093 case <-req.cancel: 2094 case <-req.stale: 2095 } 2096 return nil 2097 } 2098 2099 // OnStorage is a callback method to invoke when ranges of storage slots 2100 // are received from a remote peer. 
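// The hashes and slots arguments are parallel per-account sets: hashes[i] and
// slots[i] belong to the i-th requested account, and only the very last
// account in the response may be cut short, in which case the attached proof
// vouches for the truncated range.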
2101 func (s *Syncer) OnStorage(peer *Peer, id uint64, hashes [][]common.Hash, slots [][][]byte, proof [][]byte) error { 2102 // Gather some trace stats to aid in debugging issues 2103 var ( 2104 hashCount int 2105 slotCount int 2106 size common.StorageSize 2107 ) 2108 for _, hashset := range hashes { 2109 size += common.StorageSize(common.HashLength * len(hashset)) 2110 hashCount += len(hashset) 2111 } 2112 for _, slotset := range slots { 2113 for _, slot := range slotset { 2114 size += common.StorageSize(len(slot)) 2115 } 2116 slotCount += len(slotset) 2117 } 2118 for _, node := range proof { 2119 size += common.StorageSize(len(node)) 2120 } 2121 logger := peer.logger.New("reqid", id) 2122 logger.Trace("Delivering ranges of storage slots", "accounts", len(hashes), "hashes", hashCount, "slots", slotCount, "proofs", len(proof), "size", size) 2123 2124 // Whether or not the response is valid, we can mark the peer as idle and 2125 // notify the scheduler to assign a new task. If the response is invalid, 2126 // we'll drop the peer in a bit. 2127 s.lock.Lock() 2128 if _, ok := s.peers[peer.id]; ok { 2129 s.storageIdlers[peer.id] = struct{}{} 2130 } 2131 select { 2132 case s.update <- struct{}{}: 2133 default: 2134 } 2135 // Ensure the response is for a valid request 2136 req, ok := s.storageReqs[id] 2137 if !ok { 2138 // Request stale, perhaps the peer timed out but came through in the end 2139 logger.Warn("Unexpected storage ranges packet") 2140 s.lock.Unlock() 2141 return nil 2142 } 2143 delete(s.storageReqs, id) 2144 2145 // Clean up the request timeout timer, we'll see how to proceed further based 2146 // on the actual delivered content 2147 req.timeout.Stop() 2148 2149 // Reject the response if the hash sets and slot sets don't match, or if the 2150 // peer sent more data than requested. 2151 if len(hashes) != len(slots) { 2152 s.lock.Unlock() 2153 logger.Warn("Hash and slot set size mismatch", "hashset", len(hashes), "slotset", len(slots)) 2154 return errors.New("hash and slot set size mismatch") 2155 } 2156 if len(hashes) > len(req.accounts) { 2157 s.lock.Unlock() 2158 logger.Warn("Hash set larger than requested", "hashset", len(hashes), "requested", len(req.accounts)) 2159 return errors.New("hash set larger than requested") 2160 } 2161 // Response is valid, but check if peer is signalling that it does not have 2162 // the requested data. For storage range queries that means the state being 2163 // retrieved was either already pruned remotely, or the peer is not yet 2164 // synced to our head. 2165 if len(hashes) == 0 { 2166 logger.Debug("Peer rejected storage request") 2167 s.statelessPeers[peer.id] = struct{}{} 2168 s.lock.Unlock() 2169 return nil 2170 } 2171 s.lock.Unlock() 2172 2173 // Reconstruct the partial tries from the response and verify them 2174 var ( 2175 dbs = make([]ethdb.KeyValueStore, len(hashes)) 2176 tries = make([]*trie.Trie, len(hashes)) 2177 notary *trie.KeyValueNotary 2178 cont bool 2179 ) 2180 for i := 0; i < len(hashes); i++ { 2181 // Convert the keys and proofs into an internal format 2182 keys := make([][]byte, len(hashes[i])) 2183 for j, key := range hashes[i] { 2184 keys[j] = common.CopyBytes(key[:]) 2185 } 2186 nodes := make(light.NodeList, 0, len(proof)) 2187 if i == len(hashes)-1 { 2188 for _, node := range proof { 2189 nodes = append(nodes, node) 2190 } 2191 } 2192 var err error 2193 if len(nodes) == 0 { 2194 // No proof has been attached, the response must cover the entire key 2195 // space and hash to the origin root. 
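// With a nil origin, limit and proof, VerifyRangeProof only accepts the
// response if the delivered keys and values rebuild the complete storage trie
// hashing to req.roots[i].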
2196 dbs[i], tries[i], _, _, err = trie.VerifyRangeProof(req.roots[i], nil, nil, keys, slots[i], nil) 2197 if err != nil { 2198 logger.Warn("Storage slots failed proof", "err", err) 2199 return err 2200 } 2201 } else { 2202 // A proof was attached, the response is only partial, check that the 2203 // returned data is indeed part of the storage trie 2204 proofdb := nodes.NodeSet() 2205 2206 var end []byte 2207 if len(keys) > 0 { 2208 end = keys[len(keys)-1] 2209 } 2210 dbs[i], tries[i], notary, cont, err = trie.VerifyRangeProof(req.roots[i], req.origin[:], end, keys, slots[i], proofdb) 2211 if err != nil { 2212 logger.Warn("Storage range failed proof", "err", err) 2213 return err 2214 } 2215 } 2216 } 2217 // Partial tries reconstructed, send them to the scheduler for storage filling 2218 bounds := make(map[common.Hash]struct{}) 2219 2220 if notary != nil { // if all contract storages are delivered in full, no notary will be created 2221 it := notary.Accessed().NewIterator(nil, nil) 2222 for it.Next() { 2223 bounds[common.BytesToHash(it.Key())] = struct{}{} 2224 } 2225 it.Release() 2226 } 2227 response := &storageResponse{ 2228 mainTask: req.mainTask, 2229 subTask: req.subTask, 2230 accounts: req.accounts, 2231 roots: req.roots, 2232 hashes: hashes, 2233 slots: slots, 2234 nodes: dbs, 2235 tries: tries, 2236 bounds: bounds, 2237 overflow: light.NewNodeSet(), 2238 cont: cont, 2239 } 2240 select { 2241 case s.storageResps <- response: 2242 case <-req.cancel: 2243 case <-req.stale: 2244 } 2245 return nil 2246 } 2247 2248 // OnTrieNodes is a callback method to invoke when a batch of trie nodes 2249 // are received from a remote peer. 2250 func (s *Syncer) OnTrieNodes(peer *Peer, id uint64, trienodes [][]byte) error { 2251 var size common.StorageSize 2252 for _, node := range trienodes { 2253 size += common.StorageSize(len(node)) 2254 } 2255 logger := peer.logger.New("reqid", id) 2256 logger.Trace("Delivering set of healing trienodes", "trienodes", len(trienodes), "bytes", size) 2257 2258 // Whether or not the response is valid, we can mark the peer as idle and 2259 // notify the scheduler to assign a new task. If the response is invalid, 2260 // we'll drop the peer in a bit. 2261 s.lock.Lock() 2262 if _, ok := s.peers[peer.id]; ok { 2263 s.trienodeHealIdlers[peer.id] = struct{}{} 2264 } 2265 select { 2266 case s.update <- struct{}{}: 2267 default: 2268 } 2269 // Ensure the response is for a valid request 2270 req, ok := s.trienodeHealReqs[id] 2271 if !ok { 2272 // Request stale, perhaps the peer timed out but came through in the end 2273 logger.Warn("Unexpected trienode heal packet") 2274 s.lock.Unlock() 2275 return nil 2276 } 2277 delete(s.trienodeHealReqs, id) 2278 2279 // Clean up the request timeout timer, we'll see how to proceed further based 2280 // on the actual delivered content 2281 req.timeout.Stop() 2282 2283 // Response is valid, but check if peer is signalling that it does not have 2284 // the requested data. For bytecode range queries that means the peer is not 2285 // yet synced. 
2286 if len(trienodes) == 0 { 2287 logger.Debug("Peer rejected trienode heal request") 2288 s.statelessPeers[peer.id] = struct{}{} 2289 s.lock.Unlock() 2290 return nil 2291 } 2292 s.lock.Unlock() 2293 2294 // Cross reference the requested trienodes with the response to find gaps 2295 // that the serving node is missing 2296 hasher := sha3.NewLegacyKeccak256() 2297 2298 nodes := make([][]byte, len(req.hashes)) 2299 for i, j := 0, 0; i < len(trienodes); i++ { 2300 // Find the next hash that we've been served, leaving misses with nils 2301 hasher.Reset() 2302 hasher.Write(trienodes[i]) 2303 hash := hasher.Sum(nil) 2304 2305 for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) { 2306 j++ 2307 } 2308 if j < len(req.hashes) { 2309 nodes[j] = trienodes[i] 2310 j++ 2311 continue 2312 } 2313 // We've either ran out of hashes, or got unrequested data 2314 logger.Warn("Unexpected healing trienodes", "count", len(trienodes)-i) 2315 return errors.New("unexpected healing trienode") 2316 } 2317 // Response validated, send it to the scheduler for filling 2318 response := &trienodeHealResponse{ 2319 task: req.task, 2320 hashes: req.hashes, 2321 paths: req.paths, 2322 nodes: nodes, 2323 } 2324 select { 2325 case s.trienodeHealResps <- response: 2326 case <-req.cancel: 2327 case <-req.stale: 2328 } 2329 return nil 2330 } 2331 2332 // onHealByteCodes is a callback method to invoke when a batch of contract 2333 // bytes codes are received from a remote peer in the healing phase. 2334 func (s *Syncer) onHealByteCodes(peer *Peer, id uint64, bytecodes [][]byte) error { 2335 var size common.StorageSize 2336 for _, code := range bytecodes { 2337 size += common.StorageSize(len(code)) 2338 } 2339 logger := peer.logger.New("reqid", id) 2340 logger.Trace("Delivering set of healing bytecodes", "bytecodes", len(bytecodes), "bytes", size) 2341 2342 // Whether or not the response is valid, we can mark the peer as idle and 2343 // notify the scheduler to assign a new task. If the response is invalid, 2344 // we'll drop the peer in a bit. 2345 s.lock.Lock() 2346 if _, ok := s.peers[peer.id]; ok { 2347 s.bytecodeHealIdlers[peer.id] = struct{}{} 2348 } 2349 select { 2350 case s.update <- struct{}{}: 2351 default: 2352 } 2353 // Ensure the response is for a valid request 2354 req, ok := s.bytecodeHealReqs[id] 2355 if !ok { 2356 // Request stale, perhaps the peer timed out but came through in the end 2357 logger.Warn("Unexpected bytecode heal packet") 2358 s.lock.Unlock() 2359 return nil 2360 } 2361 delete(s.bytecodeHealReqs, id) 2362 2363 // Clean up the request timeout timer, we'll see how to proceed further based 2364 // on the actual delivered content 2365 req.timeout.Stop() 2366 2367 // Response is valid, but check if peer is signalling that it does not have 2368 // the requested data. For bytecode range queries that means the peer is not 2369 // yet synced. 
2370 if len(bytecodes) == 0 {
2371 logger.Debug("Peer rejected bytecode heal request")
2372 s.statelessPeers[peer.id] = struct{}{}
2373 s.lock.Unlock()
2374 return nil
2375 }
2376 s.lock.Unlock()
2377
2378 // Cross reference the requested bytecodes with the response to find gaps
2379 // that the serving node is missing
2380 hasher := sha3.NewLegacyKeccak256()
2381
2382 codes := make([][]byte, len(req.hashes))
2383 for i, j := 0, 0; i < len(bytecodes); i++ {
2384 // Find the next hash that we've been served, leaving misses with nils
2385 hasher.Reset()
2386 hasher.Write(bytecodes[i])
2387 hash := hasher.Sum(nil)
2388
2389 for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) {
2390 j++
2391 }
2392 if j < len(req.hashes) {
2393 codes[j] = bytecodes[i]
2394 j++
2395 continue
2396 }
2397 // We've either run out of hashes, or got unrequested data
2398 logger.Warn("Unexpected healing bytecodes", "count", len(bytecodes)-i)
2399 return errors.New("unexpected healing bytecode")
2400 }
2401 // Response validated, send it to the scheduler for filling
2402 response := &bytecodeHealResponse{
2403 task: req.task,
2404 hashes: req.hashes,
2405 codes: codes,
2406 }
2407 select {
2408 case s.bytecodeHealResps <- response:
2409 case <-req.cancel:
2410 case <-req.stale:
2411 }
2412 return nil
2413 }
2414
2415 // hashSpace is the total size of the 256 bit hash space for accounts.
2416 var hashSpace = new(big.Int).Exp(common.Big2, common.Big256, nil)
2417
2418 // report calculates various status reports and provides them to the user.
2419 func (s *Syncer) report(force bool) {
2420 if len(s.tasks) > 0 {
2421 s.reportSyncProgress(force)
2422 return
2423 }
2424 s.reportHealProgress(force)
2425 }
2426
2427 // reportSyncProgress calculates various status reports and provides them to the user.
2428 func (s *Syncer) reportSyncProgress(force bool) {
2429 // Don't report all the events, just occasionally
2430 if !force && time.Since(s.logTime) < 3*time.Second {
2431 return
2432 }
2433 // Don't report anything until we have meaningful progress
2434 synced := s.accountBytes + s.bytecodeBytes + s.storageBytes
2435 if synced == 0 {
2436 return
2437 }
2438 accountGaps := new(big.Int)
2439 for _, task := range s.tasks {
2440 accountGaps.Add(accountGaps, new(big.Int).Sub(task.Last.Big(), task.Next.Big()))
2441 }
2442 accountFills := new(big.Int).Sub(hashSpace, accountGaps)
2443 if accountFills.BitLen() == 0 {
2444 return
2445 }
2446 s.logTime = time.Now()
2447 estBytes := float64(new(big.Int).Div(
2448 new(big.Int).Mul(new(big.Int).SetUint64(uint64(synced)), hashSpace),
2449 accountFills,
2450 ).Uint64())
2451
2452 elapsed := time.Since(s.startTime)
2453 estTime := elapsed / time.Duration(synced) * time.Duration(estBytes)
2454
2455 // Create a mega progress report
2456 var (
2457 progress = fmt.Sprintf("%.2f%%", float64(synced)*100/estBytes)
2458 accounts = fmt.Sprintf("%d@%v", s.accountSynced, s.accountBytes.TerminalString())
2459 storage = fmt.Sprintf("%d@%v", s.storageSynced, s.storageBytes.TerminalString())
2460 bytecode = fmt.Sprintf("%d@%v", s.bytecodeSynced, s.bytecodeBytes.TerminalString())
2461 )
2462 log.Info("State sync in progress", "synced", progress, "state", synced,
2463 "accounts", accounts, "slots", storage, "codes", bytecode, "eta", common.PrettyDuration(estTime-elapsed))
2464 }
2465
2466 // reportHealProgress calculates various status reports and provides them to the user.
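// Unlike the sync report, heal progress has no meaningful total to
// extrapolate against, so only the absolute trienode/bytecode counters and
// the scheduler's pending queue size are printed.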
2467 func (s *Syncer) reportHealProgress(force bool) { 2468 // Don't report all the events, just occasionally 2469 if !force && time.Since(s.logTime) < 3*time.Second { 2470 return 2471 } 2472 s.logTime = time.Now() 2473 2474 // Create a mega progress report 2475 var ( 2476 trienode = fmt.Sprintf("%d@%v", s.trienodeHealSynced, s.trienodeHealBytes.TerminalString()) 2477 bytecode = fmt.Sprintf("%d@%v", s.bytecodeHealSynced, s.bytecodeHealBytes.TerminalString()) 2478 ) 2479 log.Info("State heal in progress", "nodes", trienode, "codes", bytecode, 2480 "pending", s.healer.scheduler.Pending()) 2481 }
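// The progress estimate in reportSyncProgress above is a linear extrapolation
// over the hashed account keyspace: with accountGaps of the 2^256 space still
// unfilled by the tasks,
//
//	estBytes = synced * hashSpace / (hashSpace - accountGaps)
//	estTime  = elapsed * estBytes / synced
//
// For example (hypothetical numbers): with 1 GiB of state synced while 3/4 of
// the keyspace is already filled, the estimated total is roughly 1.33 GiB and
// the remaining ETA about a third of the elapsed time.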