github.com/phillinzzz/newBsc@v1.1.6/eth/protocols/snap/sync.go

// Copyright 2020 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package snap

import (
    "bytes"
    "encoding/json"
    "errors"
    "fmt"
    "math/big"
    "math/rand"
    "sort"
    "sync"
    "time"

    "github.com/phillinzzz/newBsc/common"
    "github.com/phillinzzz/newBsc/common/gopool"
    "github.com/phillinzzz/newBsc/common/math"
    "github.com/phillinzzz/newBsc/core/rawdb"
    "github.com/phillinzzz/newBsc/core/state"
    "github.com/phillinzzz/newBsc/core/state/snapshot"
    "github.com/phillinzzz/newBsc/crypto"
    "github.com/phillinzzz/newBsc/ethdb"
    "github.com/phillinzzz/newBsc/event"
    "github.com/phillinzzz/newBsc/light"
    "github.com/phillinzzz/newBsc/log"
    "github.com/phillinzzz/newBsc/rlp"
    "github.com/phillinzzz/newBsc/trie"
    "golang.org/x/crypto/sha3"
)

var (
    // emptyRoot is the known root hash of an empty trie.
    emptyRoot = common.HexToHash("56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421")

    // emptyCode is the known hash of the empty EVM bytecode.
    emptyCode = crypto.Keccak256Hash(nil)
)

const (
    // maxRequestSize is the maximum number of bytes to request from a remote peer.
    maxRequestSize = 128 * 1024

    // maxStorageSetRequestCount is the maximum number of contracts to request the
    // storage of in a single query. If this number is too low, we're not filling
    // responses fully and waste round trip times. If it's too high, we're capping
    // responses and waste bandwidth.
    maxStorageSetRequestCount = maxRequestSize / 1024

    // maxCodeRequestCount is the maximum number of bytecode blobs to request in a
    // single query. If this number is too low, we're not filling responses fully
    // and waste round trip times. If it's too high, we're capping responses and
    // waste bandwidth.
    //
    // Deployed bytecodes are currently capped at 24KB, so the minimum request
    // size should be maxRequestSize / 24K. Assuming that most contracts do not
    // come close to that, requesting 4x should be a good approximation.
    maxCodeRequestCount = maxRequestSize / (24 * 1024) * 4

    // maxTrieRequestCount is the maximum number of trie node blobs to request in
    // a single query. If this number is too low, we're not filling responses fully
    // and waste round trip times. If it's too high, we're capping responses and
    // waste bandwidth.
    maxTrieRequestCount = 256
)
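// Worked sizing example (editorial note, not part of the upstream file): with
// maxRequestSize = 128 * 1024 = 131072 bytes, Go's integer arithmetic yields
//
//    maxStorageSetRequestCount = 131072 / 1024      = 128 contracts
//    maxCodeRequestCount       = 131072 / 24576 * 4 = 5 * 4 = 20 blobs
//
// i.e. a response packed with maximum-size (24KB) bytecodes would fill only a
// quarter of the 20-blob request, which is the 4x over-request rationale above.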
var (
    // accountConcurrency is the number of chunks to split the account trie into
    // to allow concurrent retrievals.
    accountConcurrency = 16

    // storageConcurrency is the number of chunks to split a large contract
    // storage trie into to allow concurrent retrievals.
    storageConcurrency = 16

    // requestTimeout is the maximum time a peer is allowed to spend on serving
    // a single network request.
    requestTimeout = 15 * time.Second // TODO(karalabe): Make it dynamic ala fast-sync?
)

// ErrCancelled is returned from snap syncing if the operation was prematurely
// terminated.
var ErrCancelled = errors.New("sync cancelled")

// accountRequest tracks a pending account range request to ensure responses are
// to actual requests and to validate any security constraints.
//
// Concurrency note: account requests and responses are handled concurrently from
// the main runloop to allow Merkle proof verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type accountRequest struct {
    peer string // Peer to which this request is assigned
    id   uint64 // Request ID of this request

    deliver chan *accountResponse // Channel to deliver successful response on
    revert  chan *accountRequest  // Channel to deliver request failure on
    cancel  chan struct{}         // Channel to track sync cancellation
    timeout *time.Timer           // Timer to track delivery timeout
    stale   chan struct{}         // Channel to signal the request was dropped

    origin common.Hash // First account requested to allow continuation checks
    limit  common.Hash // Last account requested to allow non-overlapping chunking

    task *accountTask // Task which this request is filling (only access fields through the runloop!!)
}

// accountResponse is an already Merkle-verified remote response to an account
// range request. It contains the subtrie for the requested account range and
// the database that's going to be filled with the internal nodes on commit.
type accountResponse struct {
    task *accountTask // Task which this request is filling

    hashes   []common.Hash    // Account hashes in the returned range
    accounts []*state.Account // Expanded accounts in the returned range

    cont bool // Whether the account range has a continuation
}
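// Editorial sketch (not part of the upstream file): every request type in this
// file shares the same channel protocol. A peer-thread delivery, mirroring the
// scheduleRevert* helpers further down, is assumed to look roughly like
//
//    select {
//    case req.deliver <- res: // hand the verified response to the runloop
//    case <-req.cancel:       // sync cycle was torn down
//    case <-req.stale:        // request already timed out / reverted
//    }
//
// so a response can neither block forever nor land on a dead sync cycle.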
// bytecodeRequest tracks a pending bytecode request to ensure responses are to
// actual requests and to validate any security constraints.
//
// Concurrency note: bytecode requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type bytecodeRequest struct {
    peer string // Peer to which this request is assigned
    id   uint64 // Request ID of this request

    deliver chan *bytecodeResponse // Channel to deliver successful response on
    revert  chan *bytecodeRequest  // Channel to deliver request failure on
    cancel  chan struct{}          // Channel to track sync cancellation
    timeout *time.Timer            // Timer to track delivery timeout
    stale   chan struct{}          // Channel to signal the request was dropped

    hashes []common.Hash // Bytecode hashes to validate responses
    task   *accountTask  // Task which this request is filling (only access fields through the runloop!!)
}

// bytecodeResponse is an already verified remote response to a bytecode request.
type bytecodeResponse struct {
    task *accountTask // Task which this request is filling

    hashes []common.Hash // Hashes of the bytecode to avoid double hashing
    codes  [][]byte      // Actual bytecodes to store into the database (nil = missing)
}

// storageRequest tracks a pending storage ranges request to ensure responses are
// to actual requests and to validate any security constraints.
//
// Concurrency note: storage requests and responses are handled concurrently from
// the main runloop to allow Merkle proof verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. tasks). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type storageRequest struct {
    peer string // Peer to which this request is assigned
    id   uint64 // Request ID of this request

    deliver chan *storageResponse // Channel to deliver successful response on
    revert  chan *storageRequest  // Channel to deliver request failure on
    cancel  chan struct{}         // Channel to track sync cancellation
    timeout *time.Timer           // Timer to track delivery timeout
    stale   chan struct{}         // Channel to signal the request was dropped

    accounts []common.Hash // Account hashes to validate responses
    roots    []common.Hash // Storage roots to validate responses

    origin common.Hash // First storage slot requested to allow continuation checks
    limit  common.Hash // Last storage slot requested to allow non-overlapping chunking

    mainTask *accountTask // Task which this response belongs to (only access fields through the runloop!!)
    subTask  *storageTask // Task which this response is filling (only access fields through the runloop!!)
}

// storageResponse is an already Merkle-verified remote response to a storage
// range request. It contains the subtries for the requested storage ranges and
// the databases that are going to be filled with the internal nodes on commit.
type storageResponse struct {
    mainTask *accountTask // Task which this response belongs to
    subTask  *storageTask // Task which this response is filling

    accounts []common.Hash // Account hashes requested, may be only partially filled
    roots    []common.Hash // Storage roots requested, may be only partially filled

    hashes [][]common.Hash // Storage slot hashes in the returned range
    slots  [][][]byte      // Storage slot values in the returned range

    cont bool // Whether the last storage range has a continuation
}

// trienodeHealRequest tracks a pending state trie request to ensure responses
// are to actual requests and to validate any security constraints.
//
// Concurrency note: trie node requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type trienodeHealRequest struct {
    peer string // Peer to which this request is assigned
    id   uint64 // Request ID of this request

    deliver chan *trienodeHealResponse // Channel to deliver successful response on
    revert  chan *trienodeHealRequest  // Channel to deliver request failure on
    cancel  chan struct{}              // Channel to track sync cancellation
    timeout *time.Timer                // Timer to track delivery timeout
    stale   chan struct{}              // Channel to signal the request was dropped

    hashes []common.Hash   // Trie node hashes to validate responses
    paths  []trie.SyncPath // Trie node paths requested for rescheduling

    task *healTask // Task which this request is filling (only access fields through the runloop!!)
}

// trienodeHealResponse is an already verified remote response to a trie node request.
type trienodeHealResponse struct {
    task *healTask // Task which this request is filling

    hashes []common.Hash   // Hashes of the trie nodes to avoid double hashing
    paths  []trie.SyncPath // Trie node paths requested for rescheduling missing ones
    nodes  [][]byte        // Actual trie nodes to store into the database (nil = missing)
}

// bytecodeHealRequest tracks a pending bytecode request to ensure responses are to
// actual requests and to validate any security constraints.
//
// Concurrency note: bytecode requests and responses are handled concurrently from
// the main runloop to allow Keccak256 hash verifications on the peer's thread and
// to drop on invalid response. The request struct must contain all the data to
// construct the response without accessing runloop internals (i.e. task). That
// is only included to allow the runloop to match a response to the task being
// synced without having yet another set of maps.
type bytecodeHealRequest struct {
    peer string // Peer to which this request is assigned
    id   uint64 // Request ID of this request

    deliver chan *bytecodeHealResponse // Channel to deliver successful response on
    revert  chan *bytecodeHealRequest  // Channel to deliver request failure on
    cancel  chan struct{}              // Channel to track sync cancellation
    timeout *time.Timer                // Timer to track delivery timeout
    stale   chan struct{}              // Channel to signal the request was dropped

    hashes []common.Hash // Bytecode hashes to validate responses
    task   *healTask     // Task which this request is filling (only access fields through the runloop!!)
}

// bytecodeHealResponse is an already verified remote response to a bytecode request.
type bytecodeHealResponse struct {
    task *healTask // Task which this request is filling

    hashes []common.Hash // Hashes of the bytecode to avoid double hashing
    codes  [][]byte      // Actual bytecodes to store into the database (nil = missing)
}

// accountTask represents the sync task for a chunk of the account snapshot.
type accountTask struct {
    // These fields get serialized to leveldb on shutdown
    Next     common.Hash                    // Next account to sync in this interval
    Last     common.Hash                    // Last account to sync in this interval
    SubTasks map[common.Hash][]*storageTask // Storage intervals needing fetching for large contracts

    // These fields are internals used during runtime
    req  *accountRequest  // Pending request to fill this task
    res  *accountResponse // Validated response filling this task
    pend int              // Number of pending subtasks for this round

    needCode  []bool // Flags whether the filling accounts need code retrieval
    needState []bool // Flags whether the filling accounts need storage retrieval
    needHeal  []bool // Flags whether the filling accounts' state was chunked and needs healing

    codeTasks  map[common.Hash]struct{}    // Code hashes that need retrieval
    stateTasks map[common.Hash]common.Hash // Account hashes->roots that need full state retrieval

    genBatch ethdb.Batch     // Batch used by the node generator
    genTrie  *trie.StackTrie // Node generator from storage slots

    done bool // Flag whether the task can be removed
}

// storageTask represents the sync task for a chunk of the storage snapshot.
type storageTask struct {
    Next common.Hash // Next storage slot to sync in this interval
    Last common.Hash // Last storage slot to sync in this interval

    // These fields are internals used during runtime
    root common.Hash     // Storage root hash for this instance
    req  *storageRequest // Pending request to fill this task

    genBatch ethdb.Batch     // Batch used by the node generator
    genTrie  *trie.StackTrie // Node generator from storage slots

    done bool // Flag whether the task can be removed
}

// healTask represents the sync task for healing the snap-synced chunk boundaries.
type healTask struct {
    scheduler *trie.Sync // State trie sync scheduler defining the tasks

    trieTasks map[common.Hash]trie.SyncPath // Set of trie node tasks currently queued for retrieval
    codeTasks map[common.Hash]struct{}      // Set of byte code tasks currently queued for retrieval
}
// syncProgress is a database entry to allow suspending and resuming a snapshot state
// sync. As opposed to full and fast sync, there is no way to restart a suspended
// snap sync without prior knowledge of the suspension point.
type syncProgress struct {
    Tasks []*accountTask // The suspended account tasks (contract tasks within)

    // Status report during syncing phase
    AccountSynced  uint64             // Number of accounts downloaded
    AccountBytes   common.StorageSize // Number of account trie bytes persisted to disk
    BytecodeSynced uint64             // Number of bytecodes downloaded
    BytecodeBytes  common.StorageSize // Number of bytecode bytes downloaded
    StorageSynced  uint64             // Number of storage slots downloaded
    StorageBytes   common.StorageSize // Number of storage trie bytes persisted to disk

    // Status report during healing phase
    TrienodeHealSynced uint64             // Number of state trie nodes downloaded
    TrienodeHealBytes  common.StorageSize // Number of state trie bytes persisted to disk
    TrienodeHealDups   uint64             // Number of state trie nodes already processed
    TrienodeHealNops   uint64             // Number of state trie nodes not requested
    BytecodeHealSynced uint64             // Number of bytecodes downloaded
    BytecodeHealBytes  common.StorageSize // Number of bytecodes persisted to disk
    BytecodeHealDups   uint64             // Number of bytecodes already processed
    BytecodeHealNops   uint64             // Number of bytecodes not requested
}

// SyncPeer abstracts out the methods required for a peer to be synced against
// with the goal of allowing the construction of mock peers without the full
// blown networking.
type SyncPeer interface {
    // ID retrieves the peer's unique identifier.
    ID() string

    // RequestAccountRange fetches a batch of accounts rooted in a specific account
    // trie, starting with the origin.
    RequestAccountRange(id uint64, root, origin, limit common.Hash, bytes uint64) error

    // RequestStorageRanges fetches a batch of storage slots belonging to one or
    // more accounts. If slots from only one account are requested, an origin marker
    // may also be used to retrieve from there.
    RequestStorageRanges(id uint64, root common.Hash, accounts []common.Hash, origin, limit []byte, bytes uint64) error

    // RequestByteCodes fetches a batch of bytecodes by hash.
    RequestByteCodes(id uint64, hashes []common.Hash, bytes uint64) error

    // RequestTrieNodes fetches a batch of account or storage trie nodes rooted in
    // a specific state trie.
    RequestTrieNodes(id uint64, root common.Hash, paths []TrieNodePathSet, bytes uint64) error

    // Log retrieves the peer's own contextual logger.
    Log() log.Logger
}
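// Editorial sketch (not part of the upstream file): a no-op mock satisfying
// SyncPeer, of the kind the interface is meant to enable for tests. The name
// nullPeer is hypothetical; the package's real test mocks may differ.
//
//    type nullPeer struct{ id string }
//
//    func (p nullPeer) ID() string { return p.id }
//    func (p nullPeer) RequestAccountRange(id uint64, root, origin, limit common.Hash, bytes uint64) error { return nil }
//    func (p nullPeer) RequestStorageRanges(id uint64, root common.Hash, accounts []common.Hash, origin, limit []byte, bytes uint64) error { return nil }
//    func (p nullPeer) RequestByteCodes(id uint64, hashes []common.Hash, bytes uint64) error { return nil }
//    func (p nullPeer) RequestTrieNodes(id uint64, root common.Hash, paths []TrieNodePathSet, bytes uint64) error { return nil }
//    func (p nullPeer) Log() log.Logger { return log.New("peer", p.id) }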
// Syncer is an Ethereum account and storage trie syncer based on snapshots and
// the snap protocol. Its purpose is to download all the accounts and storage
// slots from remote peers and reassemble chunks of the state trie, on top of
// which a state sync can be run to fix any gaps / overlaps.
//
// Every network request has a variety of failure events:
// - The peer disconnects after task assignment, failing to send the request
// - The peer disconnects after sending the request, before delivering on it
// - The peer remains connected, but does not deliver a response in time
// - The peer delivers a stale response after a previous timeout
// - The peer delivers a refusal to serve the requested state
type Syncer struct {
    db ethdb.KeyValueStore // Database to store the trie nodes into (and dedup)

    root    common.Hash    // Current state trie root being synced
    tasks   []*accountTask // Current account task set being synced
    snapped bool           // Flag to signal that snap phase is done
    healer  *healTask      // Current state healing task being executed
    update  chan struct{}  // Notification channel for possible sync progression

    peers    map[string]SyncPeer // Currently active peers to download from
    peerJoin *event.Feed         // Event feed to react to peers joining
    peerDrop *event.Feed         // Event feed to react to peers dropping

    // Request tracking during syncing phase
    statelessPeers map[string]struct{} // Peers that failed to deliver state data
    accountIdlers  map[string]struct{} // Peers that aren't serving account requests
    bytecodeIdlers map[string]struct{} // Peers that aren't serving bytecode requests
    storageIdlers  map[string]struct{} // Peers that aren't serving storage requests

    accountReqs  map[uint64]*accountRequest  // Account requests currently running
    bytecodeReqs map[uint64]*bytecodeRequest // Bytecode requests currently running
    storageReqs  map[uint64]*storageRequest  // Storage requests currently running

    accountSynced  uint64             // Number of accounts downloaded
    accountBytes   common.StorageSize // Number of account trie bytes persisted to disk
    bytecodeSynced uint64             // Number of bytecodes downloaded
    bytecodeBytes  common.StorageSize // Number of bytecode bytes downloaded
    storageSynced  uint64             // Number of storage slots downloaded
    storageBytes   common.StorageSize // Number of storage trie bytes persisted to disk

    // Request tracking during healing phase
    trienodeHealIdlers map[string]struct{} // Peers that aren't serving trie node requests
    bytecodeHealIdlers map[string]struct{} // Peers that aren't serving bytecode requests

    trienodeHealReqs map[uint64]*trienodeHealRequest // Trie node requests currently running
    bytecodeHealReqs map[uint64]*bytecodeHealRequest // Bytecode requests currently running

    trienodeHealSynced uint64             // Number of state trie nodes downloaded
    trienodeHealBytes  common.StorageSize // Number of state trie bytes persisted to disk
    trienodeHealDups   uint64             // Number of state trie nodes already processed
    trienodeHealNops   uint64             // Number of state trie nodes not requested
    bytecodeHealSynced uint64             // Number of bytecodes downloaded
    bytecodeHealBytes  common.StorageSize // Number of bytecodes persisted to disk
    bytecodeHealDups   uint64             // Number of bytecodes already processed
    bytecodeHealNops   uint64             // Number of bytecodes not requested

    stateWriter        ethdb.Batch        // Shared batch writer used for persisting raw states
    accountHealed      uint64             // Number of accounts downloaded during the healing stage
    accountHealedBytes common.StorageSize // Number of raw account bytes persisted to disk during the healing stage
    storageHealed      uint64             // Number of storage slots downloaded during the healing stage
    storageHealedBytes common.StorageSize // Number of raw storage bytes persisted to disk during the healing stage

    startTime time.Time // Time instance when snapshot sync started
    logTime   time.Time // Time instance when status was last reported

    pend sync.WaitGroup // Tracks network request goroutines for graceful shutdown
    lock sync.RWMutex   // Protects fields that can change outside of sync (peers, reqs, root)
}

// NewSyncer creates a new snapshot syncer to download the Ethereum state over the
// snap protocol.
func NewSyncer(db ethdb.KeyValueStore) *Syncer {
    return &Syncer{
        db: db,

        peers:    make(map[string]SyncPeer),
        peerJoin: new(event.Feed),
        peerDrop: new(event.Feed),
        update:   make(chan struct{}, 1),

        accountIdlers:  make(map[string]struct{}),
        storageIdlers:  make(map[string]struct{}),
        bytecodeIdlers: make(map[string]struct{}),

        accountReqs:  make(map[uint64]*accountRequest),
        storageReqs:  make(map[uint64]*storageRequest),
        bytecodeReqs: make(map[uint64]*bytecodeRequest),

        trienodeHealIdlers: make(map[string]struct{}),
        bytecodeHealIdlers: make(map[string]struct{}),

        trienodeHealReqs: make(map[uint64]*trienodeHealRequest),
        bytecodeHealReqs: make(map[uint64]*bytecodeHealRequest),
        stateWriter:      db.NewBatch(),
    }
}

// Register injects a new data source into the syncer's peerset.
func (s *Syncer) Register(peer SyncPeer) error {
    // Make sure the peer is not registered yet
    id := peer.ID()

    s.lock.Lock()
    if _, ok := s.peers[id]; ok {
        log.Error("Snap peer already registered", "id", id)

        s.lock.Unlock()
        return errors.New("already registered")
    }
    s.peers[id] = peer

    // Mark the peer as idle, even if no sync is running
    s.accountIdlers[id] = struct{}{}
    s.storageIdlers[id] = struct{}{}
    s.bytecodeIdlers[id] = struct{}{}
    s.trienodeHealIdlers[id] = struct{}{}
    s.bytecodeHealIdlers[id] = struct{}{}
    s.lock.Unlock()

    // Notify any active syncs that a new peer can be assigned data
    s.peerJoin.Send(id)
    return nil
}

// Unregister removes a data source from the syncer's peerset.
func (s *Syncer) Unregister(id string) error {
    // Remove all traces of the peer from the registry
    s.lock.Lock()
    if _, ok := s.peers[id]; !ok {
        log.Error("Snap peer not registered", "id", id)

        s.lock.Unlock()
        return errors.New("not registered")
    }
    delete(s.peers, id)

    // Remove status markers, even if no sync is running
    delete(s.statelessPeers, id)

    delete(s.accountIdlers, id)
    delete(s.storageIdlers, id)
    delete(s.bytecodeIdlers, id)
    delete(s.trienodeHealIdlers, id)
    delete(s.bytecodeHealIdlers, id)
    s.lock.Unlock()

    // Notify any active syncs that pending requests need to be reverted
    s.peerDrop.Send(id)
    return nil
}
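// Editorial sketch (not part of the upstream file): the expected driver-side
// lifecycle, inferred from the signatures above. The real call sites live in
// the eth downloader and differ in detail.
//
//    syncer := NewSyncer(db)
//    _ = syncer.Register(peer)        // on peer connect
//    err := syncer.Sync(root, quitCh) // blocks until synced or cancelled
//    _ = syncer.Unregister(peer.ID()) // on peer drop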
// Sync starts (or resumes a previous) sync cycle to iterate over a state trie
// with the given root and reconstruct the nodes based on the snapshot leaves.
// Previously downloaded segments will not be redownloaded or fixed, rather any
// errors will be healed after the leaves are fully accumulated.
func (s *Syncer) Sync(root common.Hash, cancel chan struct{}) error {
    // Move the trie root from any previous value, revert stateless markers for
    // any peers and initialize the syncer if it was not yet run
    s.lock.Lock()
    s.root = root
    s.healer = &healTask{
        scheduler: state.NewStateSync(root, s.db, nil, s.onHealState),
        trieTasks: make(map[common.Hash]trie.SyncPath),
        codeTasks: make(map[common.Hash]struct{}),
    }
    s.statelessPeers = make(map[string]struct{})
    s.lock.Unlock()

    if s.startTime == (time.Time{}) {
        s.startTime = time.Now()
    }
    // Retrieve the previous sync status from LevelDB and abort if already synced
    s.loadSyncStatus()
    if len(s.tasks) == 0 && s.healer.scheduler.Pending() == 0 {
        log.Debug("Snapshot sync already completed")
        return nil
    }
    defer func() { // Persist any progress, independent of failure
        for _, task := range s.tasks {
            s.forwardAccountTask(task)
        }
        s.cleanAccountTasks()
        s.saveSyncStatus()
    }()

    log.Debug("Starting snapshot sync cycle", "root", root)

    // Flush out the last committed raw states
    defer func() {
        if s.stateWriter.ValueSize() > 0 {
            s.stateWriter.Write()
            s.stateWriter.Reset()
        }
    }()
    defer s.report(true)

    // Whether sync completed or not, disregard any future packets
    defer func() {
        log.Debug("Terminating snapshot sync cycle", "root", root)
        s.lock.Lock()
        s.accountReqs = make(map[uint64]*accountRequest)
        s.storageReqs = make(map[uint64]*storageRequest)
        s.bytecodeReqs = make(map[uint64]*bytecodeRequest)
        s.trienodeHealReqs = make(map[uint64]*trienodeHealRequest)
        s.bytecodeHealReqs = make(map[uint64]*bytecodeHealRequest)
        s.lock.Unlock()
    }()
    // Keep scheduling sync tasks
    peerJoin := make(chan string, 16)
    peerJoinSub := s.peerJoin.Subscribe(peerJoin)
    defer peerJoinSub.Unsubscribe()

    peerDrop := make(chan string, 16)
    peerDropSub := s.peerDrop.Subscribe(peerDrop)
    defer peerDropSub.Unsubscribe()
    // Create a set of unique channels for this sync cycle. We need these to be
    // ephemeral so a data race doesn't accidentally deliver something stale on
    // a persistent channel across syncs (yup, this happened)
    var (
        accountReqFails      = make(chan *accountRequest)
        storageReqFails      = make(chan *storageRequest)
        bytecodeReqFails     = make(chan *bytecodeRequest)
        accountResps         = make(chan *accountResponse)
        storageResps         = make(chan *storageResponse)
        bytecodeResps        = make(chan *bytecodeResponse)
        trienodeHealReqFails = make(chan *trienodeHealRequest)
        bytecodeHealReqFails = make(chan *bytecodeHealRequest)
        trienodeHealResps    = make(chan *trienodeHealResponse)
        bytecodeHealResps    = make(chan *bytecodeHealResponse)
    )
    for {
        // Remove all completed tasks and terminate sync if everything's done
        s.cleanStorageTasks()
        s.cleanAccountTasks()
        if len(s.tasks) == 0 && s.healer.scheduler.Pending() == 0 {
            return nil
        }
        // Assign all the data retrieval tasks to any free peers
        s.assignAccountTasks(accountResps, accountReqFails, cancel)
        s.assignBytecodeTasks(bytecodeResps, bytecodeReqFails, cancel)
        s.assignStorageTasks(storageResps, storageReqFails, cancel)

        if len(s.tasks) == 0 {
            // Sync phase done, run heal phase
            s.assignTrienodeHealTasks(trienodeHealResps, trienodeHealReqFails, cancel)
            s.assignBytecodeHealTasks(bytecodeHealResps, bytecodeHealReqFails, cancel)
        }
        // Wait for something to happen
        select {
        case <-s.update:
            // Something happened (new peer, delivery, timeout), recheck tasks
        case <-peerJoin:
            // A new peer joined, try to schedule it new tasks
        case id := <-peerDrop:
            s.revertRequests(id)
        case <-cancel:
            return ErrCancelled

        case req := <-accountReqFails:
            s.revertAccountRequest(req)
        case req := <-bytecodeReqFails:
            s.revertBytecodeRequest(req)
        case req := <-storageReqFails:
            s.revertStorageRequest(req)
        case req := <-trienodeHealReqFails:
            s.revertTrienodeHealRequest(req)
        case req := <-bytecodeHealReqFails:
            s.revertBytecodeHealRequest(req)

        case res := <-accountResps:
            s.processAccountResponse(res)
        case res := <-bytecodeResps:
            s.processBytecodeResponse(res)
        case res := <-storageResps:
            s.processStorageResponse(res)
        case res := <-trienodeHealResps:
            s.processTrienodeHealResponse(res)
        case res := <-bytecodeHealResps:
            s.processBytecodeHealResponse(res)
        }
        // Report stats if something meaningful happened
        s.report(false)
    }
}
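// Editorial note (not part of the upstream file): the ephemeral channels above
// guard against cross-cycle delivery. Had the channels been fields on Syncer,
// a race like the following would be possible:
//
//    // cycle N:   a peer goroutine blocks on    accountResps <- res
//    // cycle N:   gets cancelled and returns
//    // cycle N+1: starts, reuses the channel and receives res — a response
//    //            verified against cycle N's root, not the current one
//
// With per-cycle channels, the stale send can only pair with that cycle's
// (now dead) runloop or fall through to req.cancel / req.stale.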
// loadSyncStatus retrieves a previously aborted sync status from the database,
// or generates a fresh one if none is available.
func (s *Syncer) loadSyncStatus() {
    var progress syncProgress

    if status := rawdb.ReadSnapshotSyncStatus(s.db); status != nil {
        if err := json.Unmarshal(status, &progress); err != nil {
            log.Error("Failed to decode snap sync status", "err", err)
        } else {
            for _, task := range progress.Tasks {
                log.Debug("Scheduled account sync task", "from", task.Next, "last", task.Last)
            }
            s.tasks = progress.Tasks
            for _, task := range s.tasks {
                task.genBatch = ethdb.HookedBatch{
                    Batch: s.db.NewBatch(),
                    OnPut: func(key []byte, value []byte) {
                        s.accountBytes += common.StorageSize(len(key) + len(value))
                    },
                }
                task.genTrie = trie.NewStackTrie(task.genBatch)

                for _, subtasks := range task.SubTasks {
                    for _, subtask := range subtasks {
                        subtask.genBatch = ethdb.HookedBatch{
                            Batch: s.db.NewBatch(),
                            OnPut: func(key []byte, value []byte) {
                                s.storageBytes += common.StorageSize(len(key) + len(value))
                            },
                        }
                        subtask.genTrie = trie.NewStackTrie(subtask.genBatch)
                    }
                }
            }
            s.snapped = len(s.tasks) == 0

            s.accountSynced = progress.AccountSynced
            s.accountBytes = progress.AccountBytes
            s.bytecodeSynced = progress.BytecodeSynced
            s.bytecodeBytes = progress.BytecodeBytes
            s.storageSynced = progress.StorageSynced
            s.storageBytes = progress.StorageBytes

            s.trienodeHealSynced = progress.TrienodeHealSynced
            s.trienodeHealBytes = progress.TrienodeHealBytes
            s.bytecodeHealSynced = progress.BytecodeHealSynced
            s.bytecodeHealBytes = progress.BytecodeHealBytes
            return
        }
    }
    // Either we've failed to decode the previous state, or there was none.
    // Start a fresh sync by chunking up the account range and scheduling
    // them for retrieval.
    s.tasks = nil
    s.accountSynced, s.accountBytes = 0, 0
    s.bytecodeSynced, s.bytecodeBytes = 0, 0
    s.storageSynced, s.storageBytes = 0, 0
    s.trienodeHealSynced, s.trienodeHealBytes = 0, 0
    s.bytecodeHealSynced, s.bytecodeHealBytes = 0, 0

    var next common.Hash
    step := new(big.Int).Sub(
        new(big.Int).Div(
            new(big.Int).Exp(common.Big2, common.Big256, nil),
            big.NewInt(int64(accountConcurrency)),
        ), common.Big1,
    )
    for i := 0; i < accountConcurrency; i++ {
        last := common.BigToHash(new(big.Int).Add(next.Big(), step))
        if i == accountConcurrency-1 {
            // Make sure we don't overflow if the step is not a proper divisor
            last = common.HexToHash("0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff")
        }
        batch := ethdb.HookedBatch{
            Batch: s.db.NewBatch(),
            OnPut: func(key []byte, value []byte) {
                s.accountBytes += common.StorageSize(len(key) + len(value))
            },
        }
        s.tasks = append(s.tasks, &accountTask{
            Next:     next,
            Last:     last,
            SubTasks: make(map[common.Hash][]*storageTask),
            genBatch: batch,
            genTrie:  trie.NewStackTrie(batch),
        })
        log.Debug("Created account sync task", "from", next, "last", last)
        next = common.BigToHash(new(big.Int).Add(last.Big(), common.Big1))
    }
}
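// Worked example (editorial note, not part of the upstream file): with
// accountConcurrency = 16, step = 2^256/16 - 1, so the fresh-sync loop above
// splits the account hash space into 16 equal chunks:
//
//    task  0: Next 0x0000...0000, Last 0x0fff...ffff
//    task  1: Next 0x1000...0000, Last 0x1fff...ffff
//    ...
//    task 15: Next 0xf000...0000, Last 0xffff...ffff (forced, overflow guard)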
// saveSyncStatus marshals the remaining sync tasks into leveldb.
func (s *Syncer) saveSyncStatus() {
    // Serialize any partial progress to disk before spinning down
    for _, task := range s.tasks {
        if err := task.genBatch.Write(); err != nil {
            log.Error("Failed to persist account slots", "err", err)
        }
        for _, subtasks := range task.SubTasks {
            for _, subtask := range subtasks {
                if err := subtask.genBatch.Write(); err != nil {
                    log.Error("Failed to persist storage slots", "err", err)
                }
            }
        }
    }
    // Store the actual progress markers
    progress := &syncProgress{
        Tasks:              s.tasks,
        AccountSynced:      s.accountSynced,
        AccountBytes:       s.accountBytes,
        BytecodeSynced:     s.bytecodeSynced,
        BytecodeBytes:      s.bytecodeBytes,
        StorageSynced:      s.storageSynced,
        StorageBytes:       s.storageBytes,
        TrienodeHealSynced: s.trienodeHealSynced,
        TrienodeHealBytes:  s.trienodeHealBytes,
        BytecodeHealSynced: s.bytecodeHealSynced,
        BytecodeHealBytes:  s.bytecodeHealBytes,
    }
    status, err := json.Marshal(progress)
    if err != nil {
        panic(err) // This can only fail during implementation
    }
    rawdb.WriteSnapshotSyncStatus(s.db, status)
}
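// Editorial note (not part of the upstream file): only the exported fields of
// accountTask (Next, Last, SubTasks) survive this round trip — encoding/json
// skips the lowercase runtime fields, which is why loadSyncStatus has to
// rebuild genBatch and genTrie for every restored task. Roughly:
//
//    blob, _ := json.Marshal(&accountTask{Next: a, Last: b})
//    // blob == {"Next":"0x...","Last":"0x...","SubTasks":null}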
// cleanAccountTasks removes account range retrieval tasks that have already been
// completed.
func (s *Syncer) cleanAccountTasks() {
    // If the sync was already done before, don't even bother
    if len(s.tasks) == 0 {
        return
    }
    // Sync wasn't finished previously, check for any task that can be finalized
    for i := 0; i < len(s.tasks); i++ {
        if s.tasks[i].done {
            s.tasks = append(s.tasks[:i], s.tasks[i+1:]...)
            i--
        }
    }
    // If everything was just finalized, generate the account trie and start heal
    if len(s.tasks) == 0 {
        s.lock.Lock()
        s.snapped = true
        s.lock.Unlock()

        // Push the final sync report
        s.reportSyncProgress(true)
    }
}

// cleanStorageTasks iterates over all the account tasks and storage sub-tasks
// within, cleaning any that have been completed.
func (s *Syncer) cleanStorageTasks() {
    for _, task := range s.tasks {
        for account, subtasks := range task.SubTasks {
            // Remove storage range retrieval tasks that completed
            for j := 0; j < len(subtasks); j++ {
                if subtasks[j].done {
                    subtasks = append(subtasks[:j], subtasks[j+1:]...)
                    j--
                }
            }
            if len(subtasks) > 0 {
                task.SubTasks[account] = subtasks
                continue
            }
            // If all storage chunks are done, mark the account as done too
            for j, hash := range task.res.hashes {
                if hash == account {
                    task.needState[j] = false
                }
            }
            delete(task.SubTasks, account)
            task.pend--

            // If this was the last pending task, forward the account task
            if task.pend == 0 {
                s.forwardAccountTask(task)
            }
        }
    }
}

// assignAccountTasks attempts to match idle peers to pending account range
// retrievals.
func (s *Syncer) assignAccountTasks(success chan *accountResponse, fail chan *accountRequest, cancel chan struct{}) {
    s.lock.Lock()
    defer s.lock.Unlock()

    // If there are no idle peers, short circuit assignment
    if len(s.accountIdlers) == 0 {
        return
    }
    // Iterate over all the tasks and try to find a pending one
    for _, task := range s.tasks {
        // Skip any tasks already filling
        if task.req != nil || task.res != nil {
            continue
        }
        // Task pending retrieval, try to find an idle peer. If no such peer
        // exists, we probably assigned tasks for all (or they are stateless).
        // Abort the entire assignment mechanism.
        var idle string
        for id := range s.accountIdlers {
            // If the peer rejected a query in this sync cycle, don't bother asking
            // again for anything, it's either out of sync or already pruned
            if _, ok := s.statelessPeers[id]; ok {
                continue
            }
            idle = id
            break
        }
        if idle == "" {
            return
        }
        peer := s.peers[idle]

        // Matched a pending task to an idle peer, allocate a unique request id
        var reqid uint64
        for {
            reqid = uint64(rand.Int63())
            if reqid == 0 {
                continue
            }
            if _, ok := s.accountReqs[reqid]; ok {
                continue
            }
            break
        }
        // Generate the network query and send it to the peer
        req := &accountRequest{
            peer:    idle,
            id:      reqid,
            deliver: success,
            revert:  fail,
            cancel:  cancel,
            stale:   make(chan struct{}),
            origin:  task.Next,
            limit:   task.Last,
            task:    task,
        }
        req.timeout = time.AfterFunc(requestTimeout, func() {
            peer.Log().Debug("Account range request timed out", "reqid", reqid)
            s.scheduleRevertAccountRequest(req)
        })
        s.accountReqs[reqid] = req
        delete(s.accountIdlers, idle)

        s.pend.Add(1)
        root := s.root
        gopool.Submit(func() {
            defer s.pend.Done()

            // Attempt to send the remote request and revert if it fails
            if err := peer.RequestAccountRange(reqid, root, req.origin, req.limit, maxRequestSize); err != nil {
                peer.Log().Debug("Failed to request account range", "err", err)
                s.scheduleRevertAccountRequest(req)
            }
        })

        // Inject the request into the task to block further assignments
        task.req = req
    }
}

// assignBytecodeTasks attempts to match idle peers to pending code retrievals.
func (s *Syncer) assignBytecodeTasks(success chan *bytecodeResponse, fail chan *bytecodeRequest, cancel chan struct{}) {
    s.lock.Lock()
    defer s.lock.Unlock()

    // If there are no idle peers, short circuit assignment
    if len(s.bytecodeIdlers) == 0 {
        return
    }
    // Iterate over all the tasks and try to find a pending one
    for _, task := range s.tasks {
        // Skip any tasks not in the bytecode retrieval phase
        if task.res == nil {
            continue
        }
        // Skip tasks that are already retrieving (or done with) all codes
        if len(task.codeTasks) == 0 {
            continue
        }
        // Task pending retrieval, try to find an idle peer. If no such peer
        // exists, we probably assigned tasks for all (or they are stateless).
        // Abort the entire assignment mechanism.
        var idle string
        for id := range s.bytecodeIdlers {
            // If the peer rejected a query in this sync cycle, don't bother asking
            // again for anything, it's either out of sync or already pruned
            if _, ok := s.statelessPeers[id]; ok {
                continue
            }
            idle = id
            break
        }
        if idle == "" {
            return
        }
        peer := s.peers[idle]

        // Matched a pending task to an idle peer, allocate a unique request id
        var reqid uint64
        for {
            reqid = uint64(rand.Int63())
            if reqid == 0 {
                continue
            }
            if _, ok := s.bytecodeReqs[reqid]; ok {
                continue
            }
            break
        }
        // Generate the network query and send it to the peer
        hashes := make([]common.Hash, 0, maxCodeRequestCount)
        for hash := range task.codeTasks {
            delete(task.codeTasks, hash)
            hashes = append(hashes, hash)
            if len(hashes) >= maxCodeRequestCount {
                break
            }
        }
        req := &bytecodeRequest{
            peer:    idle,
            id:      reqid,
            deliver: success,
            revert:  fail,
            cancel:  cancel,
            stale:   make(chan struct{}),
            hashes:  hashes,
            task:    task,
        }
        req.timeout = time.AfterFunc(requestTimeout, func() {
            peer.Log().Debug("Bytecode request timed out", "reqid", reqid)
            s.scheduleRevertBytecodeRequest(req)
        })
        s.bytecodeReqs[reqid] = req
        delete(s.bytecodeIdlers, idle)

        s.pend.Add(1)
        gopool.Submit(func() {
            defer s.pend.Done()

            // Attempt to send the remote request and revert if it fails
            if err := peer.RequestByteCodes(reqid, hashes, maxRequestSize); err != nil {
                log.Debug("Failed to request bytecodes", "err", err)
                s.scheduleRevertBytecodeRequest(req)
            }
        })
    }
}

// assignStorageTasks attempts to match idle peers to pending storage range
// retrievals.
func (s *Syncer) assignStorageTasks(success chan *storageResponse, fail chan *storageRequest, cancel chan struct{}) {
    s.lock.Lock()
    defer s.lock.Unlock()

    // If there are no idle peers, short circuit assignment
    if len(s.storageIdlers) == 0 {
        return
    }
    // Iterate over all the tasks and try to find a pending one
    for _, task := range s.tasks {
        // Skip any tasks not in the storage retrieval phase
        if task.res == nil {
            continue
        }
        // Skip tasks that are already retrieving (or done with) all small states
        if len(task.SubTasks) == 0 && len(task.stateTasks) == 0 {
            continue
        }
        // Task pending retrieval, try to find an idle peer. If no such peer
        // exists, we probably assigned tasks for all (or they are stateless).
        // Abort the entire assignment mechanism.
        var idle string
        for id := range s.storageIdlers {
            // If the peer rejected a query in this sync cycle, don't bother asking
            // again for anything, it's either out of sync or already pruned
            if _, ok := s.statelessPeers[id]; ok {
                continue
            }
            idle = id
            break
        }
        if idle == "" {
            return
        }
        peer := s.peers[idle]

        // Matched a pending task to an idle peer, allocate a unique request id
        var reqid uint64
        for {
            reqid = uint64(rand.Int63())
            if reqid == 0 {
                continue
            }
            if _, ok := s.storageReqs[reqid]; ok {
                continue
            }
            break
        }
        // Generate the network query and send it to the peer. If there are
        // large contract tasks pending, complete those before diving into
        // even more new contracts.
        var (
            accounts = make([]common.Hash, 0, maxStorageSetRequestCount)
            roots    = make([]common.Hash, 0, maxStorageSetRequestCount)
            subtask  *storageTask
        )
        for account, subtasks := range task.SubTasks {
            for _, st := range subtasks {
                // Skip any subtasks already filling
                if st.req != nil {
                    continue
                }
                // Found an incomplete storage chunk, schedule it
                accounts = append(accounts, account)
                roots = append(roots, st.root)
                subtask = st
                break // Large contract chunks are downloaded individually
            }
            if subtask != nil {
                break // Large contract chunks are downloaded individually
            }
        }
        if subtask == nil {
            // No large contract requires retrieval, but small ones are available
            for account, root := range task.stateTasks {
                delete(task.stateTasks, account)

                accounts = append(accounts, account)
                roots = append(roots, root)

                if len(accounts) >= maxStorageSetRequestCount {
                    break
                }
            }
        }
        // If nothing was found, it means this task is actually already fully
        // retrieving, but large contracts are hard to detect. Skip to the next.
        if len(accounts) == 0 {
            continue
        }
        req := &storageRequest{
            peer:     idle,
            id:       reqid,
            deliver:  success,
            revert:   fail,
            cancel:   cancel,
            stale:    make(chan struct{}),
            accounts: accounts,
            roots:    roots,
            mainTask: task,
            subTask:  subtask,
        }
        if subtask != nil {
            req.origin = subtask.Next
            req.limit = subtask.Last
        }
        req.timeout = time.AfterFunc(requestTimeout, func() {
            peer.Log().Debug("Storage request timed out", "reqid", reqid)
            s.scheduleRevertStorageRequest(req)
        })
        s.storageReqs[reqid] = req
        delete(s.storageIdlers, idle)

        s.pend.Add(1)
        root := s.root
        gopool.Submit(func() {
            defer s.pend.Done()

            // Attempt to send the remote request and revert if it fails
            var origin, limit []byte
            if subtask != nil {
                origin, limit = req.origin[:], req.limit[:]
            }
            if err := peer.RequestStorageRanges(reqid, root, accounts, origin, limit, maxRequestSize); err != nil {
                log.Debug("Failed to request storage", "err", err)
                s.scheduleRevertStorageRequest(req)
            }
        })

        // Inject the request into the subtask to block further assignments
        if subtask != nil {
            subtask.req = req
        }
    }
}

// assignTrienodeHealTasks attempts to match idle peers to trie node requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignTrienodeHealTasks(success chan *trienodeHealResponse, fail chan *trienodeHealRequest, cancel chan struct{}) {
    s.lock.Lock()
    defer s.lock.Unlock()

    // If there are no idle peers, short circuit assignment
    if len(s.trienodeHealIdlers) == 0 {
        return
    }
    // Iterate over pending tasks and try to find a peer to retrieve with
    for len(s.healer.trieTasks) > 0 || s.healer.scheduler.Pending() > 0 {
        // If there are not enough trie tasks queued to fully assign, fill the
        // queue from the state sync scheduler. The trie sync scheduler schedules
        // these together with bytecodes, so we need to queue them combined.
        var (
            have = len(s.healer.trieTasks) + len(s.healer.codeTasks)
            want = maxTrieRequestCount + maxCodeRequestCount
        )
        if have < want {
            nodes, paths, codes := s.healer.scheduler.Missing(want - have)
            for i, hash := range nodes {
                s.healer.trieTasks[hash] = paths[i]
            }
            for _, hash := range codes {
                s.healer.codeTasks[hash] = struct{}{}
            }
        }
        // If all the heal tasks are bytecodes or already downloading, bail
        if len(s.healer.trieTasks) == 0 {
            return
        }
        // Task pending retrieval, try to find an idle peer. If no such peer
        // exists, we probably assigned tasks for all (or they are stateless).
        // Abort the entire assignment mechanism.
        var idle string
        for id := range s.trienodeHealIdlers {
            // If the peer rejected a query in this sync cycle, don't bother asking
            // again for anything, it's either out of sync or already pruned
            if _, ok := s.statelessPeers[id]; ok {
                continue
            }
            idle = id
            break
        }
        if idle == "" {
            return
        }
        peer := s.peers[idle]

        // Matched a pending task to an idle peer, allocate a unique request id
        var reqid uint64
        for {
            reqid = uint64(rand.Int63())
            if reqid == 0 {
                continue
            }
            if _, ok := s.trienodeHealReqs[reqid]; ok {
                continue
            }
            break
        }
        // Generate the network query and send it to the peer
        var (
            hashes   = make([]common.Hash, 0, maxTrieRequestCount)
            paths    = make([]trie.SyncPath, 0, maxTrieRequestCount)
            pathsets = make([]TrieNodePathSet, 0, maxTrieRequestCount)
        )
        for hash, pathset := range s.healer.trieTasks {
            delete(s.healer.trieTasks, hash)

            hashes = append(hashes, hash)
            paths = append(paths, pathset)
            pathsets = append(pathsets, [][]byte(pathset)) // TODO(karalabe): group requests by account hash

            if len(hashes) >= maxTrieRequestCount {
                break
            }
        }
        req := &trienodeHealRequest{
            peer:    idle,
            id:      reqid,
            deliver: success,
            revert:  fail,
            cancel:  cancel,
            stale:   make(chan struct{}),
            hashes:  hashes,
            paths:   paths,
            task:    s.healer,
        }
        req.timeout = time.AfterFunc(requestTimeout, func() {
            peer.Log().Debug("Trienode heal request timed out", "reqid", reqid)
            s.scheduleRevertTrienodeHealRequest(req)
        })
        s.trienodeHealReqs[reqid] = req
        delete(s.trienodeHealIdlers, idle)

        s.pend.Add(1)
        root := s.root
        gopool.Submit(func() {
            defer s.pend.Done()

            // Attempt to send the remote request and revert if it fails
            if err := peer.RequestTrieNodes(reqid, root, pathsets, maxRequestSize); err != nil {
                log.Debug("Failed to request trienode healers", "err", err)
                s.scheduleRevertTrienodeHealRequest(req)
            }
        })
    }
}
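// Editorial note (not part of the upstream file): trie.SyncPath is itself a
// [][]byte, which is why a pathset can be converted directly above. In
// go-ethereum's scheme a one-element path addresses an account trie node,
// while a two-element path is (account hash, storage trie path) — hence the
// TODO about grouping requests by account hash.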
// assignBytecodeHealTasks attempts to match idle peers to bytecode requests to
// heal any trie errors caused by the snap sync's chunked retrieval model.
func (s *Syncer) assignBytecodeHealTasks(success chan *bytecodeHealResponse, fail chan *bytecodeHealRequest, cancel chan struct{}) {
    s.lock.Lock()
    defer s.lock.Unlock()

    // If there are no idle peers, short circuit assignment
    if len(s.bytecodeHealIdlers) == 0 {
        return
    }
    // Iterate over pending tasks and try to find a peer to retrieve with
    for len(s.healer.codeTasks) > 0 || s.healer.scheduler.Pending() > 0 {
        // If there are not enough code tasks queued to fully assign, fill the
        // queue from the state sync scheduler. The trie sync scheduler schedules
        // these together with trie nodes, so we need to queue them combined.
        var (
            have = len(s.healer.trieTasks) + len(s.healer.codeTasks)
            want = maxTrieRequestCount + maxCodeRequestCount
        )
        if have < want {
            nodes, paths, codes := s.healer.scheduler.Missing(want - have)
            for i, hash := range nodes {
                s.healer.trieTasks[hash] = paths[i]
            }
            for _, hash := range codes {
                s.healer.codeTasks[hash] = struct{}{}
            }
        }
        // If all the heal tasks are trienodes or already downloading, bail
        if len(s.healer.codeTasks) == 0 {
            return
        }
        // Task pending retrieval, try to find an idle peer. If no such peer
        // exists, we probably assigned tasks for all (or they are stateless).
        // Abort the entire assignment mechanism.
        var idle string
        for id := range s.bytecodeHealIdlers {
            // If the peer rejected a query in this sync cycle, don't bother asking
            // again for anything, it's either out of sync or already pruned
            if _, ok := s.statelessPeers[id]; ok {
                continue
            }
            idle = id
            break
        }
        if idle == "" {
            return
        }
        peer := s.peers[idle]

        // Matched a pending task to an idle peer, allocate a unique request id
        var reqid uint64
        for {
            reqid = uint64(rand.Int63())
            if reqid == 0 {
                continue
            }
            if _, ok := s.bytecodeHealReqs[reqid]; ok {
                continue
            }
            break
        }
        // Generate the network query and send it to the peer
        hashes := make([]common.Hash, 0, maxCodeRequestCount)
        for hash := range s.healer.codeTasks {
            delete(s.healer.codeTasks, hash)

            hashes = append(hashes, hash)
            if len(hashes) >= maxCodeRequestCount {
                break
            }
        }
        req := &bytecodeHealRequest{
            peer:    idle,
            id:      reqid,
            deliver: success,
            revert:  fail,
            cancel:  cancel,
            stale:   make(chan struct{}),
            hashes:  hashes,
            task:    s.healer,
        }
        req.timeout = time.AfterFunc(requestTimeout, func() {
            peer.Log().Debug("Bytecode heal request timed out", "reqid", reqid)
            s.scheduleRevertBytecodeHealRequest(req)
        })
        s.bytecodeHealReqs[reqid] = req
        delete(s.bytecodeHealIdlers, idle)

        s.pend.Add(1)
        gopool.Submit(func() {
            defer s.pend.Done()

            // Attempt to send the remote request and revert if it fails
            if err := peer.RequestByteCodes(reqid, hashes, maxRequestSize); err != nil {
                log.Debug("Failed to request bytecode healers", "err", err)
                s.scheduleRevertBytecodeHealRequest(req)
            }
        })
    }
}
// revertRequests locates all the currently pending requests from a particular
// peer and reverts them, rescheduling for others to fulfill.
func (s *Syncer) revertRequests(peer string) {
    // Gather the requests first, reversions need the lock too
    s.lock.Lock()
    var accountReqs []*accountRequest
    for _, req := range s.accountReqs {
        if req.peer == peer {
            accountReqs = append(accountReqs, req)
        }
    }
    var bytecodeReqs []*bytecodeRequest
    for _, req := range s.bytecodeReqs {
        if req.peer == peer {
            bytecodeReqs = append(bytecodeReqs, req)
        }
    }
    var storageReqs []*storageRequest
    for _, req := range s.storageReqs {
        if req.peer == peer {
            storageReqs = append(storageReqs, req)
        }
    }
    var trienodeHealReqs []*trienodeHealRequest
    for _, req := range s.trienodeHealReqs {
        if req.peer == peer {
            trienodeHealReqs = append(trienodeHealReqs, req)
        }
    }
    var bytecodeHealReqs []*bytecodeHealRequest
    for _, req := range s.bytecodeHealReqs {
        if req.peer == peer {
            bytecodeHealReqs = append(bytecodeHealReqs, req)
        }
    }
    s.lock.Unlock()

    // Revert all the requests matching the peer
    for _, req := range accountReqs {
        s.revertAccountRequest(req)
    }
    for _, req := range bytecodeReqs {
        s.revertBytecodeRequest(req)
    }
    for _, req := range storageReqs {
        s.revertStorageRequest(req)
    }
    for _, req := range trienodeHealReqs {
        s.revertTrienodeHealRequest(req)
    }
    for _, req := range bytecodeHealReqs {
        s.revertBytecodeHealRequest(req)
    }
}

// scheduleRevertAccountRequest asks the event loop to clean up an account range
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertAccountRequest(req *accountRequest) {
    select {
    case req.revert <- req:
        // Sync event loop notified
    case <-req.cancel:
        // Sync cycle got cancelled
    case <-req.stale:
        // Request already reverted
    }
}

// revertAccountRequest cleans up an account range request and returns all failed
// retrieval tasks to the scheduler for reassignment.
//
// Note, this needs to run on the event runloop thread to reschedule to idle peers.
// On peer threads, use scheduleRevertAccountRequest.
func (s *Syncer) revertAccountRequest(req *accountRequest) {
    log.Debug("Reverting account request", "peer", req.peer, "reqid", req.id)
    select {
    case <-req.stale:
        log.Trace("Account request already reverted", "peer", req.peer, "reqid", req.id)
        return
    default:
    }
    close(req.stale)

    // Remove the request from the tracked set
    s.lock.Lock()
    delete(s.accountReqs, req.id)
    s.lock.Unlock()

    // If there's a timeout timer still running, abort it and mark the account
    // task as not-pending, ready for rescheduling
    req.timeout.Stop()
    if req.task.req == req {
        req.task.req = nil
    }
}
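// Editorial note (not part of the upstream file): the scheduleRevert*/revert*
// split is a thread-affinity contract. Assuming a timeout or send failure on a
// peer goroutine:
//
//    s.scheduleRevertAccountRequest(req) // safe anywhere: hands off via req.revert
//    s.revertAccountRequest(req)         // runloop only: mutates req.task directly
//
// The runloop receives the request from req.revert (see Sync's select) and then
// performs the actual revert itself, so task fields are only ever touched from
// one goroutine.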
// scheduleRevertBytecodeRequest asks the event loop to clean up a bytecode request
// and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertBytecodeRequest(req *bytecodeRequest) {
    select {
    case req.revert <- req:
        // Sync event loop notified
    case <-req.cancel:
        // Sync cycle got cancelled
    case <-req.stale:
        // Request already reverted
    }
}

// revertBytecodeRequest cleans up a bytecode request and returns all failed
// retrieval tasks to the scheduler for reassignment.
//
// Note, this needs to run on the event runloop thread to reschedule to idle peers.
// On peer threads, use scheduleRevertBytecodeRequest.
func (s *Syncer) revertBytecodeRequest(req *bytecodeRequest) {
    log.Debug("Reverting bytecode request", "peer", req.peer)
    select {
    case <-req.stale:
        log.Trace("Bytecode request already reverted", "peer", req.peer, "reqid", req.id)
        return
    default:
    }
    close(req.stale)

    // Remove the request from the tracked set
    s.lock.Lock()
    delete(s.bytecodeReqs, req.id)
    s.lock.Unlock()

    // If there's a timeout timer still running, abort it and mark the code
    // retrievals as not-pending, ready for rescheduling
    req.timeout.Stop()
    for _, hash := range req.hashes {
        req.task.codeTasks[hash] = struct{}{}
    }
}

// scheduleRevertStorageRequest asks the event loop to clean up a storage range
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertStorageRequest(req *storageRequest) {
    select {
    case req.revert <- req:
        // Sync event loop notified
    case <-req.cancel:
        // Sync cycle got cancelled
    case <-req.stale:
        // Request already reverted
    }
}

// revertStorageRequest cleans up a storage range request and returns all failed
// retrieval tasks to the scheduler for reassignment.
//
// Note, this needs to run on the event runloop thread to reschedule to idle peers.
// On peer threads, use scheduleRevertStorageRequest.
func (s *Syncer) revertStorageRequest(req *storageRequest) {
    log.Debug("Reverting storage request", "peer", req.peer)
    select {
    case <-req.stale:
        log.Trace("Storage request already reverted", "peer", req.peer, "reqid", req.id)
        return
    default:
    }
    close(req.stale)

    // Remove the request from the tracked set
    s.lock.Lock()
    delete(s.storageReqs, req.id)
    s.lock.Unlock()

    // If there's a timeout timer still running, abort it and mark the storage
    // task as not-pending, ready for rescheduling
    req.timeout.Stop()
    if req.subTask != nil {
        req.subTask.req = nil
    } else {
        for i, account := range req.accounts {
            req.mainTask.stateTasks[account] = req.roots[i]
        }
    }
}

// scheduleRevertTrienodeHealRequest asks the event loop to clean up a trienode heal
// request and return all failed retrieval tasks to the scheduler for reassignment.
func (s *Syncer) scheduleRevertTrienodeHealRequest(req *trienodeHealRequest) {
    select {
    case req.revert <- req:
        // Sync event loop notified
    case <-req.cancel:
        // Sync cycle got cancelled
    case <-req.stale:
        // Request already reverted
    }
}

// revertTrienodeHealRequest cleans up a trienode heal request and returns all
// failed retrieval tasks to the scheduler for reassignment.
//
// Note, this needs to run on the event runloop thread to reschedule to idle peers.
1569 // On peer threads, use scheduleRevertTrienodeHealRequest.
1570 func (s *Syncer) revertTrienodeHealRequest(req *trienodeHealRequest) {
1571 	log.Debug("Reverting trienode heal request", "peer", req.peer, "reqid", req.id)
1572 	select {
1573 	case <-req.stale:
1574 		log.Trace("Trienode heal request already reverted", "peer", req.peer, "reqid", req.id)
1575 		return
1576 	default:
1577 	}
1578 	close(req.stale)
1579 
1580 	// Remove the request from the tracked set
1581 	s.lock.Lock()
1582 	delete(s.trienodeHealReqs, req.id)
1583 	s.lock.Unlock()
1584 
1585 	// If there's a timeout timer still running, abort it and mark the trie node
1586 	// retrievals as not-pending, ready for rescheduling
1587 	req.timeout.Stop()
1588 	for i, hash := range req.hashes {
1589 		req.task.trieTasks[hash] = req.paths[i]
1590 	}
1591 }
1592 
1593 // scheduleRevertBytecodeHealRequest asks the event loop to clean up a bytecode heal
1594 // request and return all failed retrieval tasks to the scheduler for reassignment.
1595 func (s *Syncer) scheduleRevertBytecodeHealRequest(req *bytecodeHealRequest) {
1596 	select {
1597 	case req.revert <- req:
1598 		// Sync event loop notified
1599 	case <-req.cancel:
1600 		// Sync cycle got cancelled
1601 	case <-req.stale:
1602 		// Request already reverted
1603 	}
1604 }
1605 
1606 // revertBytecodeHealRequest cleans up a bytecode heal request and returns all
1607 // failed retrieval tasks to the scheduler for reassignment.
1608 //
1609 // Note, this needs to run on the event runloop thread to reschedule to idle peers.
1610 // On peer threads, use scheduleRevertBytecodeHealRequest.
1611 func (s *Syncer) revertBytecodeHealRequest(req *bytecodeHealRequest) {
1612 	log.Debug("Reverting bytecode heal request", "peer", req.peer, "reqid", req.id)
1613 	select {
1614 	case <-req.stale:
1615 		log.Trace("Bytecode heal request already reverted", "peer", req.peer, "reqid", req.id)
1616 		return
1617 	default:
1618 	}
1619 	close(req.stale)
1620 
1621 	// Remove the request from the tracked set
1622 	s.lock.Lock()
1623 	delete(s.bytecodeHealReqs, req.id)
1624 	s.lock.Unlock()
1625 
1626 	// If there's a timeout timer still running, abort it and mark the code
1627 	// retrievals as not-pending, ready for rescheduling
1628 	req.timeout.Stop()
1629 	for _, hash := range req.hashes {
1630 		req.task.codeTasks[hash] = struct{}{}
1631 	}
1632 }
1633 
1634 // processAccountResponse integrates an already validated account range response
1635 // into the account tasks.
1636 func (s *Syncer) processAccountResponse(res *accountResponse) {
1637 	// Switch the task from pending to filling
1638 	res.task.req = nil
1639 	res.task.res = res
1640 
1641 	// Ensure that the response doesn't overflow into the subsequent task
1642 	last := res.task.Last.Big()
1643 	for i, hash := range res.hashes {
1644 		// Mark the range complete if the last hash is already included,
1645 		// but keep iterating to cut off any extra states beyond it.
1646 		cmp := hash.Big().Cmp(last)
1647 		if cmp == 0 {
1648 			res.cont = false
1649 			continue
1650 		}
1651 		if cmp > 0 {
1652 			// Chunk overflown, cut off excess
1653 			res.hashes = res.hashes[:i]
1654 			res.accounts = res.accounts[:i]
1655 			res.cont = false // Mark range completed
1656 			break
1657 		}
1658 	}
1659 	// Iterate over all the accounts and assemble which ones need further sub-
1660 	// filling before the entire account range can be persisted.
1661 	res.task.needCode = make([]bool, len(res.accounts))
1662 	res.task.needState = make([]bool, len(res.accounts))
1663 	res.task.needHeal = make([]bool, len(res.accounts))
1664 
1665 	res.task.codeTasks = make(map[common.Hash]struct{})
1666 	res.task.stateTasks = make(map[common.Hash]common.Hash)
1667 
1668 	resumed := make(map[common.Hash]struct{})
1669 
1670 	res.task.pend = 0
1671 	for i, account := range res.accounts {
1672 		// Check if the account is a contract with an unknown code
1673 		if !bytes.Equal(account.CodeHash, emptyCode[:]) {
1674 			if code := rawdb.ReadCodeWithPrefix(s.db, common.BytesToHash(account.CodeHash)); code == nil {
1675 				res.task.codeTasks[common.BytesToHash(account.CodeHash)] = struct{}{}
1676 				res.task.needCode[i] = true
1677 				res.task.pend++
1678 			}
1679 		}
1680 		// Check if the account is a contract with an unknown storage trie
1681 		if account.Root != emptyRoot {
1682 			if node, err := s.db.Get(account.Root[:]); err != nil || node == nil {
1683 				// If there was a previous large state retrieval in progress,
1684 				// don't restart it from scratch. This happens if a sync cycle
1685 				// is interrupted and resumed later. However, *do* update the
1686 				// previous root hash.
1687 				if subtasks, ok := res.task.SubTasks[res.hashes[i]]; ok {
1688 					log.Debug("Resuming large storage retrieval", "account", res.hashes[i], "root", account.Root)
1689 					for _, subtask := range subtasks {
1690 						subtask.root = account.Root
1691 					}
1692 					res.task.needHeal[i] = true
1693 					resumed[res.hashes[i]] = struct{}{}
1694 				} else {
1695 					res.task.stateTasks[res.hashes[i]] = account.Root
1696 				}
1697 				res.task.needState[i] = true
1698 				res.task.pend++
1699 			}
1700 		}
1701 	}
1702 	// Delete any subtasks that have been aborted but not resumed. This may undo
1703 	// some progress if a new peer gives us fewer accounts than an old one, but
1704 	// for now we have to live with that.
1705 	for hash := range res.task.SubTasks {
1706 		if _, ok := resumed[hash]; !ok {
1707 			log.Debug("Aborting suspended storage retrieval", "account", hash)
1708 			delete(res.task.SubTasks, hash)
1709 		}
1710 	}
1711 	// If the account range contained no contracts, or all have been fully filled
1712 	// beforehand, short circuit storage filling and forward to the next task
1713 	if res.task.pend == 0 {
1714 		s.forwardAccountTask(res.task)
1715 		return
1716 	}
1717 	// Some accounts are incomplete, leave as is for the storage and contract
1718 	// task assigners to pick up and fill.
1719 }
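// ----------------------------------------------------------------------------
// [Editor's note] The overflow trimming in processAccountResponse above relies
// on response hashes arriving sorted in ascending order. The same logic as a
// standalone sketch (exampleTrimOverflow is illustrative, not used by the
// syncer):

func exampleTrimOverflow(hashes []common.Hash, last common.Hash) ([]common.Hash, bool) {
	var (
		limit = last.Big()
		cont  = true
	)
	for i, hash := range hashes {
		cmp := hash.Big().Cmp(limit)
		if cmp == 0 {
			cont = false // boundary hash included, range is complete
			continue     // keep scanning to drop anything beyond it
		}
		if cmp > 0 {
			return hashes[:i], false // overflow, cut off the excess
		}
	}
	return hashes, cont
}

// ----------------------------------------------------------------------------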
1720 
1721 // processBytecodeResponse integrates an already validated bytecode response
1722 // into the account tasks.
1723 func (s *Syncer) processBytecodeResponse(res *bytecodeResponse) {
1724 	batch := s.db.NewBatch()
1725 
1726 	var (
1727 		codes uint64
1728 	)
1729 	for i, hash := range res.hashes {
1730 		code := res.codes[i]
1731 
1732 		// If the bytecode was not delivered, reschedule it
1733 		if code == nil {
1734 			res.task.codeTasks[hash] = struct{}{}
1735 			continue
1736 		}
1737 		// Code was delivered, mark it not needed any more
1738 		for j, account := range res.task.res.accounts {
1739 			if res.task.needCode[j] && hash == common.BytesToHash(account.CodeHash) {
1740 				res.task.needCode[j] = false
1741 				res.task.pend--
1742 			}
1743 		}
1744 		// Push the bytecode into a database batch
1745 		codes++
1746 		rawdb.WriteCode(batch, hash, code)
1747 	}
1748 	bytes := common.StorageSize(batch.ValueSize())
1749 	if err := batch.Write(); err != nil {
1750 		log.Crit("Failed to persist bytecodes", "err", err)
1751 	}
1752 	s.bytecodeSynced += codes
1753 	s.bytecodeBytes += bytes
1754 
1755 	log.Debug("Persisted set of bytecodes", "count", codes, "bytes", bytes)
1756 
1757 	// If this delivery completed the last pending task, forward the account task
1758 	// to the next chunk
1759 	if res.task.pend == 0 {
1760 		s.forwardAccountTask(res.task)
1761 		return
1762 	}
1763 	// Some accounts are still incomplete, leave as is for the storage and contract
1764 	// task assigners to pick up and fill.
1765 }
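// ----------------------------------------------------------------------------
// [Editor's note] The bytecode persistence above follows a buffer-then-flush
// shape: undelivered entries are left for rescheduling, delivered ones are
// batched and written once. A condensed sketch of that flow (exampleWriteCodes
// is illustrative, not part of the original file):

func exampleWriteCodes(db ethdb.Database, hashes []common.Hash, codes [][]byte) uint64 {
	var (
		batch   = db.NewBatch()
		written uint64
	)
	for i, code := range codes {
		if code == nil {
			continue // undelivered, the caller reschedules this hash
		}
		rawdb.WriteCode(batch, hashes[i], code)
		written++
	}
	if err := batch.Write(); err != nil {
		log.Crit("Failed to persist bytecodes", "err", err)
	}
	return written
}

// ----------------------------------------------------------------------------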
1766 
1767 // processStorageResponse integrates an already validated storage response
1768 // into the account tasks.
1769 func (s *Syncer) processStorageResponse(res *storageResponse) {
1770 	// Switch the subtask from pending to idle
1771 	if res.subTask != nil {
1772 		res.subTask.req = nil
1773 	}
1774 	batch := ethdb.HookedBatch{
1775 		Batch: s.db.NewBatch(),
1776 		OnPut: func(key []byte, value []byte) {
1777 			s.storageBytes += common.StorageSize(len(key) + len(value))
1778 		},
1779 	}
1780 	var (
1781 		slots           int
1782 		oldStorageBytes = s.storageBytes
1783 	)
1784 	// Iterate over all the accounts and reconstruct their storage tries from the
1785 	// delivered slots
1786 	for i, account := range res.accounts {
1787 		// If the account was not delivered, reschedule it
1788 		if i >= len(res.hashes) {
1789 			res.mainTask.stateTasks[account] = res.roots[i]
1790 			continue
1791 		}
1792 		// State was delivered, if complete mark as not needed any more, otherwise
1793 		// mark the account as needing healing
1794 		for j, hash := range res.mainTask.res.hashes {
1795 			if account != hash {
1796 				continue
1797 			}
1798 			acc := res.mainTask.res.accounts[j]
1799 
1800 			// If the packet contains multiple contract storage slots, all
1801 			// but the last are surely complete. The last contract may be
1802 			// chunked, so check its continuation flag.
1803 			if res.subTask == nil && res.mainTask.needState[j] && (i < len(res.hashes)-1 || !res.cont) {
1804 				res.mainTask.needState[j] = false
1805 				res.mainTask.pend--
1806 			}
1807 			// If the last contract was chunked, mark it as needing healing
1808 			// to avoid writing it out to disk prematurely.
1809 			if res.subTask == nil && !res.mainTask.needHeal[j] && i == len(res.hashes)-1 && res.cont {
1810 				res.mainTask.needHeal[j] = true
1811 			}
1812 			// If the last contract was chunked, we need to switch to large
1813 			// contract handling mode
1814 			if res.subTask == nil && i == len(res.hashes)-1 && res.cont {
1815 				// If we haven't yet started a large-contract retrieval, create
1816 				// the subtasks for it within the main account task
1817 				if tasks, ok := res.mainTask.SubTasks[account]; !ok {
1818 					var (
1819 						keys    = res.hashes[i]
1820 						chunks  = uint64(storageConcurrency)
1821 						lastKey common.Hash
1822 					)
1823 					if len(keys) > 0 {
1824 						lastKey = keys[len(keys)-1]
1825 					}
1826 					// If the number of slots remaining is low, decrease the
1827 					// number of chunks. Somewhere on the order of 2,500-4,000
1828 					// slots fit into a packet of 128KB. A key/slot pair is
1829 					// maximum 64 bytes, so pessimistically maxRequestSize/64 = 2K.
1830 					//
1831 					// Chunk so that at least 2 packets are needed to fill a task.
1832 					if estimate, err := estimateRemainingSlots(len(keys), lastKey); err == nil {
1833 						if n := estimate / (2 * (maxRequestSize / 64)); n+1 < chunks {
1834 							chunks = n + 1
1835 						}
1836 						log.Debug("Chunked large contract", "initiators", len(keys), "tail", lastKey, "remaining", estimate, "chunks", chunks)
1837 					} else {
1838 						log.Debug("Chunked large contract", "initiators", len(keys), "tail", lastKey, "chunks", chunks)
1839 					}
1840 					r := newHashRange(lastKey, chunks)
1841 
1842 					// Our first task is the one that was just filled by this response.
1843 					batch := ethdb.HookedBatch{
1844 						Batch: s.db.NewBatch(),
1845 						OnPut: func(key []byte, value []byte) {
1846 							s.storageBytes += common.StorageSize(len(key) + len(value))
1847 						},
1848 					}
1849 					tasks = append(tasks, &storageTask{
1850 						Next:     common.Hash{},
1851 						Last:     r.End(),
1852 						root:     acc.Root,
1853 						genBatch: batch,
1854 						genTrie:  trie.NewStackTrie(batch),
1855 					})
1856 					for r.Next() {
1857 						batch := ethdb.HookedBatch{
1858 							Batch: s.db.NewBatch(),
1859 							OnPut: func(key []byte, value []byte) {
1860 								s.storageBytes += common.StorageSize(len(key) + len(value))
1861 							},
1862 						}
1863 						tasks = append(tasks, &storageTask{
1864 							Next:     r.Start(),
1865 							Last:     r.End(),
1866 							root:     acc.Root,
1867 							genBatch: batch,
1868 							genTrie:  trie.NewStackTrie(batch),
1869 						})
1870 					}
1871 					for _, task := range tasks {
1872 						log.Debug("Created storage sync task", "account", account, "root", acc.Root, "from", task.Next, "last", task.Last)
1873 					}
1874 					res.mainTask.SubTasks[account] = tasks
1875 
1876 					// Since we've just created the sub-tasks, this response
1877 					// is surely for the first one (zero origin)
1878 					res.subTask = tasks[0]
1879 				}
1880 			}
1881 			// If we're in large contract delivery mode, forward the subtask
1882 			if res.subTask != nil {
1883 				// Ensure the response doesn't overflow into the subsequent task
1884 				last := res.subTask.Last.Big()
1885 				// Find the first overflowing key. While at it, mark res as complete
1886 				// if the range includes or passes 'last'.
1887 				index := sort.Search(len(res.hashes[i]), func(k int) bool {
1888 					cmp := res.hashes[i][k].Big().Cmp(last)
1889 					if cmp >= 0 {
1890 						res.cont = false
1891 					}
1892 					return cmp > 0
1893 				})
1894 				if index >= 0 {
1895 					// cut off excess
1896 					res.hashes[i] = res.hashes[i][:index]
1897 					res.slots[i] = res.slots[i][:index]
1898 				}
1899 				// Forward the relevant storage chunk (even if created just now)
1900 				if res.cont {
1901 					res.subTask.Next = incHash(res.hashes[i][len(res.hashes[i])-1])
1902 				} else {
1903 					res.subTask.done = true
1904 				}
1905 			}
1906 		}
1907 		// Iterate over all the complete contracts, reconstruct the trie nodes and
1908 		// push them to disk. If the contract is chunked, the trie nodes will be
1909 		// reconstructed later.
1910 		slots += len(res.hashes[i])
1911 
1912 		if i < len(res.hashes)-1 || res.subTask == nil {
1913 			tr := trie.NewStackTrie(batch)
1914 			for j := 0; j < len(res.hashes[i]); j++ {
1915 				tr.Update(res.hashes[i][j][:], res.slots[i][j])
1916 			}
1917 			tr.Commit()
1918 		}
1919 		// Persist the received storage segments. These flat states may be
1920 		// outdated during the sync, but they can be fixed later during the
1921 		// snapshot generation.
1922 		for j := 0; j < len(res.hashes[i]); j++ {
1923 			rawdb.WriteStorageSnapshot(batch, account, res.hashes[i][j], res.slots[i][j])
1924 
1925 			// If we're storing large contracts, generate the trie nodes
1926 			// on the fly to not trash the gluing points
1927 			if i == len(res.hashes)-1 && res.subTask != nil {
1928 				res.subTask.genTrie.Update(res.hashes[i][j][:], res.slots[i][j])
1929 			}
1930 		}
1931 	}
1932 	// Large contracts could have generated new trie nodes, flush them to disk
1933 	if res.subTask != nil {
1934 		if res.subTask.done {
1935 			if root, err := res.subTask.genTrie.Commit(); err != nil {
1936 				log.Error("Failed to commit stack slots", "err", err)
1937 			} else if root == res.subTask.root {
1938 				// If the chunk was overflown but otherwise complete, the roots match: clear the heal request
1939 				for i, account := range res.mainTask.res.hashes {
1940 					if account == res.accounts[len(res.accounts)-1] {
1941 						res.mainTask.needHeal[i] = false
1942 					}
1943 				}
1944 			}
1945 		}
1946 		if res.subTask.genBatch.ValueSize() > ethdb.IdealBatchSize || res.subTask.done {
1947 			if err := res.subTask.genBatch.Write(); err != nil {
1948 				log.Error("Failed to persist stack slots", "err", err)
1949 			}
1950 			res.subTask.genBatch.Reset()
1951 		}
1952 	}
1953 	// Flush anything written just now and update the stats
1954 	if err := batch.Write(); err != nil {
1955 		log.Crit("Failed to persist storage slots", "err", err)
1956 	}
1957 	s.storageSynced += uint64(slots)
1958 
1959 	log.Debug("Persisted set of storage slots", "accounts", len(res.hashes), "slots", slots, "bytes", s.storageBytes-oldStorageBytes)
1960 
1961 	// If this delivery completed the last pending task, forward the account task
1962 	// to the next chunk
1963 	if res.mainTask.pend == 0 {
1964 		s.forwardAccountTask(res.mainTask)
1965 		return
1966 	}
1967 	// Some accounts are still incomplete, leave as is for the storage and contract
1968 	// task assigners to pick up and fill.
1969 }
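// ----------------------------------------------------------------------------
// [Editor's note] A worked instance of the chunk sizing used when a large
// contract is first detected above. With this file's maxRequestSize of 128KB,
// a reply pessimistically holds maxRequestSize/64 = 2048 key/slot pairs, and a
// task should need at least two replies to fill (exampleChunkCount is
// illustrative, not part of the original file):

func exampleChunkCount(estimate uint64) uint64 {
	chunks := uint64(storageConcurrency) // default upper bound: 16 chunks
	if n := estimate / (2 * (maxRequestSize / 64)); n+1 < chunks {
		chunks = n + 1 // e.g. estimate=10000 -> n=10000/4096=2 -> 3 chunks
	}
	return chunks
}

// ----------------------------------------------------------------------------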
1970 
1971 // processTrienodeHealResponse integrates an already validated trienode response
1972 // into the healer tasks.
1973 func (s *Syncer) processTrienodeHealResponse(res *trienodeHealResponse) {
1974 	for i, hash := range res.hashes {
1975 		node := res.nodes[i]
1976 
1977 		// If the trie node was not delivered, reschedule it
1978 		if node == nil {
1979 			res.task.trieTasks[hash] = res.paths[i]
1980 			continue
1981 		}
1982 		// Push the trie node into the state syncer
1983 		s.trienodeHealSynced++
1984 		s.trienodeHealBytes += common.StorageSize(len(node))
1985 
1986 		err := s.healer.scheduler.Process(trie.SyncResult{Hash: hash, Data: node})
1987 		switch err {
1988 		case nil:
1989 		case trie.ErrAlreadyProcessed:
1990 			s.trienodeHealDups++
1991 		case trie.ErrNotRequested:
1992 			s.trienodeHealNops++
1993 		default:
1994 			log.Error("Invalid trienode processed", "hash", hash, "err", err)
1995 		}
1996 	}
1997 	batch := s.db.NewBatch()
1998 	if err := s.healer.scheduler.Commit(batch); err != nil {
1999 		log.Error("Failed to commit healing data", "err", err)
2000 	}
2001 	if err := batch.Write(); err != nil {
2002 		log.Crit("Failed to persist healing data", "err", err)
2003 	}
2004 	log.Debug("Persisted set of healing data", "type", "trienodes", "bytes", common.StorageSize(batch.ValueSize()))
2005 }
2006 
2007 // processBytecodeHealResponse integrates an already validated bytecode response
2008 // into the healer tasks.
2009 func (s *Syncer) processBytecodeHealResponse(res *bytecodeHealResponse) {
2010 	for i, hash := range res.hashes {
2011 		code := res.codes[i]
2012 
2013 		// If the bytecode was not delivered, reschedule it
2014 		if code == nil {
2015 			res.task.codeTasks[hash] = struct{}{}
2016 			continue
2017 		}
2018 		// Push the bytecode into the state syncer
2019 		s.bytecodeHealSynced++
2020 		s.bytecodeHealBytes += common.StorageSize(len(code))
2021 
2022 		err := s.healer.scheduler.Process(trie.SyncResult{Hash: hash, Data: code})
2023 		switch err {
2024 		case nil:
2025 		case trie.ErrAlreadyProcessed:
2026 			s.bytecodeHealDups++
2027 		case trie.ErrNotRequested:
2028 			s.bytecodeHealNops++
2029 		default:
2030 			log.Error("Invalid bytecode processed", "hash", hash, "err", err)
2031 		}
2032 	}
2033 	batch := s.db.NewBatch()
2034 	if err := s.healer.scheduler.Commit(batch); err != nil {
2035 		log.Error("Failed to commit healing data", "err", err)
2036 	}
2037 	if err := batch.Write(); err != nil {
2038 		log.Crit("Failed to persist healing data", "err", err)
2039 	}
2040 	log.Debug("Persisted set of healing data", "type", "bytecode", "bytes", common.StorageSize(batch.ValueSize()))
2041 }
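// ----------------------------------------------------------------------------
// [Editor's note] Both heal response handlers above feed delivered items into
// the trie scheduler the same way: Process each element, then Commit whatever
// became persistable into a batch. A condensed sketch of one element's round
// trip (exampleHealProcess is illustrative, not part of the original file):

func exampleHealProcess(sched *trie.Sync, db ethdb.Database, hash common.Hash, blob []byte) {
	switch err := sched.Process(trie.SyncResult{Hash: hash, Data: blob}); err {
	case nil:
	case trie.ErrAlreadyProcessed, trie.ErrNotRequested:
		// Benign: duplicate or unsolicited data, only tracked in the stats
	default:
		log.Error("Invalid state element processed", "hash", hash, "err", err)
	}
	batch := db.NewBatch()
	if err := sched.Commit(batch); err != nil {
		log.Error("Failed to commit healing data", "err", err)
	}
	if err := batch.Write(); err != nil {
		log.Crit("Failed to persist healing data", "err", err)
	}
}

// ----------------------------------------------------------------------------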
2042 
2043 // forwardAccountTask takes a filled account task and persists anything available
2044 // into the database, after which it forwards the next account marker so that the
2045 // task's next chunk may be filled.
2046 func (s *Syncer) forwardAccountTask(task *accountTask) {
2047 	// Remove any pending delivery
2048 	res := task.res
2049 	if res == nil {
2050 		return // nothing to forward
2051 	}
2052 	task.res = nil
2053 
2054 	// Persist the received account segments. These flat states may be
2055 	// outdated during the sync, but they can be fixed later during the
2056 	// snapshot generation.
2057 	oldAccountBytes := s.accountBytes
2058 
2059 	batch := ethdb.HookedBatch{
2060 		Batch: s.db.NewBatch(),
2061 		OnPut: func(key []byte, value []byte) {
2062 			s.accountBytes += common.StorageSize(len(key) + len(value))
2063 		},
2064 	}
2065 	for i, hash := range res.hashes {
2066 		if task.needCode[i] || task.needState[i] {
2067 			break
2068 		}
2069 		slim := snapshot.SlimAccountRLP(res.accounts[i].Nonce, res.accounts[i].Balance, res.accounts[i].Root, res.accounts[i].CodeHash)
2070 		rawdb.WriteAccountSnapshot(batch, hash, slim)
2071 
2072 		// If the task is complete, drop it into the stack trie to generate
2073 		// account trie nodes for it
2074 		if !task.needHeal[i] {
2075 			full, err := snapshot.FullAccountRLP(slim) // TODO(karalabe): Slim parsing can be omitted
2076 			if err != nil {
2077 				panic(err) // Really shouldn't ever happen
2078 			}
2079 			task.genTrie.Update(hash[:], full)
2080 		}
2081 	}
2082 	// Flush anything written just now and update the stats
2083 	if err := batch.Write(); err != nil {
2084 		log.Crit("Failed to persist accounts", "err", err)
2085 	}
2086 	s.accountSynced += uint64(len(res.accounts))
2087 
2088 	// Task filling persisted, push the chunk marker forward to the first
2089 	// account still missing data.
2090 	for i, hash := range res.hashes {
2091 		if task.needCode[i] || task.needState[i] {
2092 			return
2093 		}
2094 		task.Next = incHash(hash)
2095 	}
2096 	// All accounts marked as complete, track if the entire task is done
2097 	task.done = !res.cont
2098 
2099 	// Stack trie could have generated trie nodes, push them to disk (we need to
2100 	// flush after finalizing task.done). It's fine even if we crash and lose this
2101 	// write as it will only cause more data to be downloaded during heal.
2102 	if task.done {
2103 		if _, err := task.genTrie.Commit(); err != nil {
2104 			log.Error("Failed to commit stack account", "err", err)
2105 		}
2106 	}
2107 	if task.genBatch.ValueSize() > ethdb.IdealBatchSize || task.done {
2108 		if err := task.genBatch.Write(); err != nil {
2109 			log.Error("Failed to persist stack account", "err", err)
2110 		}
2111 		task.genBatch.Reset()
2112 	}
2113 	log.Debug("Persisted range of accounts", "accounts", len(res.accounts), "bytes", s.accountBytes-oldAccountBytes)
2114 }
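// ----------------------------------------------------------------------------
// [Editor's note] forwardAccountTask above (and the storage path earlier)
// measures persisted data by wrapping every batch in an ethdb.HookedBatch,
// whose OnPut callback fires for each write. The pattern in isolation
// (exampleCountingBatch is illustrative, not part of the original file):

func exampleCountingBatch(db ethdb.Database) (ethdb.HookedBatch, *common.StorageSize) {
	written := new(common.StorageSize)
	batch := ethdb.HookedBatch{
		Batch: db.NewBatch(),
		OnPut: func(key []byte, value []byte) {
			*written += common.StorageSize(len(key) + len(value))
		},
	}
	return batch, written // *written grows as the batch is filled
}

// ----------------------------------------------------------------------------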
2115 
2116 // OnAccounts is a callback method to invoke when a range of accounts is
2117 // received from a remote peer.
2118 func (s *Syncer) OnAccounts(peer SyncPeer, id uint64, hashes []common.Hash, accounts [][]byte, proof [][]byte) error {
2119 	size := common.StorageSize(len(hashes) * common.HashLength)
2120 	for _, account := range accounts {
2121 		size += common.StorageSize(len(account))
2122 	}
2123 	for _, node := range proof {
2124 		size += common.StorageSize(len(node))
2125 	}
2126 	logger := peer.Log().New("reqid", id)
2127 	logger.Trace("Delivering range of accounts", "hashes", len(hashes), "accounts", len(accounts), "proofs", len(proof), "bytes", size)
2128 
2129 	// Whether or not the response is valid, we can mark the peer as idle and
2130 	// notify the scheduler to assign a new task. If the response is invalid,
2131 	// we'll drop the peer in a bit.
2132 	s.lock.Lock()
2133 	if _, ok := s.peers[peer.ID()]; ok {
2134 		s.accountIdlers[peer.ID()] = struct{}{}
2135 	}
2136 	select {
2137 	case s.update <- struct{}{}:
2138 	default:
2139 	}
2140 	// Ensure the response is for a valid request
2141 	req, ok := s.accountReqs[id]
2142 	if !ok {
2143 		// Request stale, perhaps the peer timed out but came through in the end
2144 		logger.Warn("Unexpected account range packet")
2145 		s.lock.Unlock()
2146 		return nil
2147 	}
2148 	delete(s.accountReqs, id)
2149 
2150 	// Clean up the request timeout timer, we'll see how to proceed further based
2151 	// on the actual delivered content
2152 	if !req.timeout.Stop() {
2153 		// The timeout is already triggered, and this request will be reverted+rescheduled
2154 		s.lock.Unlock()
2155 		return nil
2156 	}
2157 	// Response is valid, but check if peer is signalling that it does not have
2158 	// the requested data. For account range queries that means the state being
2159 	// retrieved was either already pruned remotely, or the peer is not yet
2160 	// synced to our head.
2161 	if len(hashes) == 0 && len(accounts) == 0 && len(proof) == 0 {
2162 		logger.Debug("Peer rejected account range request", "root", s.root)
2163 		s.statelessPeers[peer.ID()] = struct{}{}
2164 		s.lock.Unlock()
2165 
2166 		// Signal this request as failed, and ready for rescheduling
2167 		s.scheduleRevertAccountRequest(req)
2168 		return nil
2169 	}
2170 	root := s.root
2171 	s.lock.Unlock()
2172 
2173 	// Reconstruct a partial trie from the response and verify it
2174 	keys := make([][]byte, len(hashes))
2175 	for i, key := range hashes {
2176 		keys[i] = common.CopyBytes(key[:])
2177 	}
2178 	nodes := make(light.NodeList, len(proof))
2179 	for i, node := range proof {
2180 		nodes[i] = node
2181 	}
2182 	proofdb := nodes.NodeSet()
2183 
2184 	var end []byte
2185 	if len(keys) > 0 {
2186 		end = keys[len(keys)-1]
2187 	}
2188 	cont, err := trie.VerifyRangeProof(root, req.origin[:], end, keys, accounts, proofdb)
2189 	if err != nil {
2190 		logger.Warn("Account range failed proof", "err", err)
2191 		// Signal this request as failed, and ready for rescheduling
2192 		s.scheduleRevertAccountRequest(req)
2193 		return err
2194 	}
2195 	accs := make([]*state.Account, len(accounts))
2196 	for i, account := range accounts {
2197 		acc := new(state.Account)
2198 		if err := rlp.DecodeBytes(account, acc); err != nil {
2199 			panic(err) // We created these blobs, we must be able to decode them
2200 		}
2201 		accs[i] = acc
2202 	}
2203 	response := &accountResponse{
2204 		task:     req.task,
2205 		hashes:   hashes,
2206 		accounts: accs,
2207 		cont:     cont,
2208 	}
2209 	select {
2210 	case req.deliver <- response:
2211 	case <-req.cancel:
2212 	case <-req.stale:
2213 	}
2214 	return nil
2215 }
2216 
2217 // OnByteCodes is a callback method to invoke when a batch of contract
2218 // bytecodes is received from a remote peer.
2219 func (s *Syncer) OnByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) error {
2220 	s.lock.RLock()
2221 	syncing := !s.snapped
2222 	s.lock.RUnlock()
2223 
2224 	if syncing {
2225 		return s.onByteCodes(peer, id, bytecodes)
2226 	}
2227 	return s.onHealByteCodes(peer, id, bytecodes)
2228 }
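// ----------------------------------------------------------------------------
// [Editor's note] OnAccounts above (and OnStorage below) validate responses
// with the same range proof primitive: the proof asserts the returned keys
// are a correct prefix of the trie under 'root' and reports whether more data
// exists beyond the last key. A condensed sketch mirroring the account path,
// where a proof always accompanies the response (exampleVerifyRange is
// illustrative, not part of the original file):

func exampleVerifyRange(root common.Hash, origin common.Hash, keys [][]byte, values [][]byte, proof light.NodeList) (bool, error) {
	var end []byte
	if len(keys) > 0 {
		end = keys[len(keys)-1]
	}
	// The returned flag reports whether entries exist beyond 'end' in the trie
	return trie.VerifyRangeProof(root, origin[:], end, keys, values, proof.NodeSet())
}

// ----------------------------------------------------------------------------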
2229 
2230 // onByteCodes is a callback method to invoke when a batch of contract
2231 // bytecodes is received from a remote peer in the syncing phase.
2232 func (s *Syncer) onByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) error {
2233 	var size common.StorageSize
2234 	for _, code := range bytecodes {
2235 		size += common.StorageSize(len(code))
2236 	}
2237 	logger := peer.Log().New("reqid", id)
2238 	logger.Trace("Delivering set of bytecodes", "bytecodes", len(bytecodes), "bytes", size)
2239 
2240 	// Whether or not the response is valid, we can mark the peer as idle and
2241 	// notify the scheduler to assign a new task. If the response is invalid,
2242 	// we'll drop the peer in a bit.
2243 	s.lock.Lock()
2244 	if _, ok := s.peers[peer.ID()]; ok {
2245 		s.bytecodeIdlers[peer.ID()] = struct{}{}
2246 	}
2247 	select {
2248 	case s.update <- struct{}{}:
2249 	default:
2250 	}
2251 	// Ensure the response is for a valid request
2252 	req, ok := s.bytecodeReqs[id]
2253 	if !ok {
2254 		// Request stale, perhaps the peer timed out but came through in the end
2255 		logger.Warn("Unexpected bytecode packet")
2256 		s.lock.Unlock()
2257 		return nil
2258 	}
2259 	delete(s.bytecodeReqs, id)
2260 
2261 	// Clean up the request timeout timer, we'll see how to proceed further based
2262 	// on the actual delivered content
2263 	if !req.timeout.Stop() {
2264 		// The timeout is already triggered, and this request will be reverted+rescheduled
2265 		s.lock.Unlock()
2266 		return nil
2267 	}
2268 
2269 	// Response is valid, but check if peer is signalling that it does not have
2270 	// the requested data. For bytecode range queries that means the peer is not
2271 	// yet synced.
2272 	if len(bytecodes) == 0 {
2273 		logger.Debug("Peer rejected bytecode request")
2274 		s.statelessPeers[peer.ID()] = struct{}{}
2275 		s.lock.Unlock()
2276 
2277 		// Signal this request as failed, and ready for rescheduling
2278 		s.scheduleRevertBytecodeRequest(req)
2279 		return nil
2280 	}
2281 	s.lock.Unlock()
2282 
2283 	// Cross reference the requested bytecodes with the response to find gaps
2284 	// that the serving node is missing
2285 	hasher := sha3.NewLegacyKeccak256().(crypto.KeccakState)
2286 	hash := make([]byte, 32)
2287 
2288 	codes := make([][]byte, len(req.hashes))
2289 	for i, j := 0, 0; i < len(bytecodes); i++ {
2290 		// Find the next hash that we've been served, leaving misses with nils
2291 		hasher.Reset()
2292 		hasher.Write(bytecodes[i])
2293 		hasher.Read(hash)
2294 
2295 		for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) {
2296 			j++
2297 		}
2298 		if j < len(req.hashes) {
2299 			codes[j] = bytecodes[i]
2300 			j++
2301 			continue
2302 		}
2303 		// We've either run out of hashes, or got unrequested data
2304 		logger.Warn("Unexpected bytecodes", "count", len(bytecodes)-i)
2305 		// Signal this request as failed, and ready for rescheduling
2306 		s.scheduleRevertBytecodeRequest(req)
2307 		return errors.New("unexpected bytecode")
2308 	}
2309 	// Response validated, send it to the scheduler for filling
2310 	response := &bytecodeResponse{
2311 		task:   req.task,
2312 		hashes: req.hashes,
2313 		codes:  codes,
2314 	}
2315 	select {
2316 	case req.deliver <- response:
2317 	case <-req.cancel:
2318 	case <-req.stale:
2319 	}
2320 	return nil
2321 }
2322 
2323 // OnStorage is a callback method to invoke when ranges of storage slots
2324 // are received from a remote peer.
2325 func (s *Syncer) OnStorage(peer SyncPeer, id uint64, hashes [][]common.Hash, slots [][][]byte, proof [][]byte) error { 2326 // Gather some trace stats to aid in debugging issues 2327 var ( 2328 hashCount int 2329 slotCount int 2330 size common.StorageSize 2331 ) 2332 for _, hashset := range hashes { 2333 size += common.StorageSize(common.HashLength * len(hashset)) 2334 hashCount += len(hashset) 2335 } 2336 for _, slotset := range slots { 2337 for _, slot := range slotset { 2338 size += common.StorageSize(len(slot)) 2339 } 2340 slotCount += len(slotset) 2341 } 2342 for _, node := range proof { 2343 size += common.StorageSize(len(node)) 2344 } 2345 logger := peer.Log().New("reqid", id) 2346 logger.Trace("Delivering ranges of storage slots", "accounts", len(hashes), "hashes", hashCount, "slots", slotCount, "proofs", len(proof), "size", size) 2347 2348 // Whether or not the response is valid, we can mark the peer as idle and 2349 // notify the scheduler to assign a new task. If the response is invalid, 2350 // we'll drop the peer in a bit. 2351 s.lock.Lock() 2352 if _, ok := s.peers[peer.ID()]; ok { 2353 s.storageIdlers[peer.ID()] = struct{}{} 2354 } 2355 select { 2356 case s.update <- struct{}{}: 2357 default: 2358 } 2359 // Ensure the response is for a valid request 2360 req, ok := s.storageReqs[id] 2361 if !ok { 2362 // Request stale, perhaps the peer timed out but came through in the end 2363 logger.Warn("Unexpected storage ranges packet") 2364 s.lock.Unlock() 2365 return nil 2366 } 2367 delete(s.storageReqs, id) 2368 2369 // Clean up the request timeout timer, we'll see how to proceed further based 2370 // on the actual delivered content 2371 if !req.timeout.Stop() { 2372 // The timeout is already triggered, and this request will be reverted+rescheduled 2373 s.lock.Unlock() 2374 return nil 2375 } 2376 2377 // Reject the response if the hash sets and slot sets don't match, or if the 2378 // peer sent more data than requested. 2379 if len(hashes) != len(slots) { 2380 s.lock.Unlock() 2381 s.scheduleRevertStorageRequest(req) // reschedule request 2382 logger.Warn("Hash and slot set size mismatch", "hashset", len(hashes), "slotset", len(slots)) 2383 return errors.New("hash and slot set size mismatch") 2384 } 2385 if len(hashes) > len(req.accounts) { 2386 s.lock.Unlock() 2387 s.scheduleRevertStorageRequest(req) // reschedule request 2388 logger.Warn("Hash set larger than requested", "hashset", len(hashes), "requested", len(req.accounts)) 2389 return errors.New("hash set larger than requested") 2390 } 2391 // Response is valid, but check if peer is signalling that it does not have 2392 // the requested data. For storage range queries that means the state being 2393 // retrieved was either already pruned remotely, or the peer is not yet 2394 // synced to our head. 
2395 if len(hashes) == 0 { 2396 logger.Debug("Peer rejected storage request") 2397 s.statelessPeers[peer.ID()] = struct{}{} 2398 s.lock.Unlock() 2399 s.scheduleRevertStorageRequest(req) // reschedule request 2400 return nil 2401 } 2402 s.lock.Unlock() 2403 2404 // Reconstruct the partial tries from the response and verify them 2405 var cont bool 2406 2407 for i := 0; i < len(hashes); i++ { 2408 // Convert the keys and proofs into an internal format 2409 keys := make([][]byte, len(hashes[i])) 2410 for j, key := range hashes[i] { 2411 keys[j] = common.CopyBytes(key[:]) 2412 } 2413 nodes := make(light.NodeList, 0, len(proof)) 2414 if i == len(hashes)-1 { 2415 for _, node := range proof { 2416 nodes = append(nodes, node) 2417 } 2418 } 2419 var err error 2420 if len(nodes) == 0 { 2421 // No proof has been attached, the response must cover the entire key 2422 // space and hash to the origin root. 2423 _, err = trie.VerifyRangeProof(req.roots[i], nil, nil, keys, slots[i], nil) 2424 if err != nil { 2425 s.scheduleRevertStorageRequest(req) // reschedule request 2426 logger.Warn("Storage slots failed proof", "err", err) 2427 return err 2428 } 2429 } else { 2430 // A proof was attached, the response is only partial, check that the 2431 // returned data is indeed part of the storage trie 2432 proofdb := nodes.NodeSet() 2433 2434 var end []byte 2435 if len(keys) > 0 { 2436 end = keys[len(keys)-1] 2437 } 2438 cont, err = trie.VerifyRangeProof(req.roots[i], req.origin[:], end, keys, slots[i], proofdb) 2439 if err != nil { 2440 s.scheduleRevertStorageRequest(req) // reschedule request 2441 logger.Warn("Storage range failed proof", "err", err) 2442 return err 2443 } 2444 } 2445 } 2446 // Partial tries reconstructed, send them to the scheduler for storage filling 2447 response := &storageResponse{ 2448 mainTask: req.mainTask, 2449 subTask: req.subTask, 2450 accounts: req.accounts, 2451 roots: req.roots, 2452 hashes: hashes, 2453 slots: slots, 2454 cont: cont, 2455 } 2456 select { 2457 case req.deliver <- response: 2458 case <-req.cancel: 2459 case <-req.stale: 2460 } 2461 return nil 2462 } 2463 2464 // OnTrieNodes is a callback method to invoke when a batch of trie nodes 2465 // are received from a remote peer. 2466 func (s *Syncer) OnTrieNodes(peer SyncPeer, id uint64, trienodes [][]byte) error { 2467 var size common.StorageSize 2468 for _, node := range trienodes { 2469 size += common.StorageSize(len(node)) 2470 } 2471 logger := peer.Log().New("reqid", id) 2472 logger.Trace("Delivering set of healing trienodes", "trienodes", len(trienodes), "bytes", size) 2473 2474 // Whether or not the response is valid, we can mark the peer as idle and 2475 // notify the scheduler to assign a new task. If the response is invalid, 2476 // we'll drop the peer in a bit. 
2477 	s.lock.Lock()
2478 	if _, ok := s.peers[peer.ID()]; ok {
2479 		s.trienodeHealIdlers[peer.ID()] = struct{}{}
2480 	}
2481 	select {
2482 	case s.update <- struct{}{}:
2483 	default:
2484 	}
2485 	// Ensure the response is for a valid request
2486 	req, ok := s.trienodeHealReqs[id]
2487 	if !ok {
2488 		// Request stale, perhaps the peer timed out but came through in the end
2489 		logger.Warn("Unexpected trienode heal packet")
2490 		s.lock.Unlock()
2491 		return nil
2492 	}
2493 	delete(s.trienodeHealReqs, id)
2494 
2495 	// Clean up the request timeout timer, we'll see how to proceed further based
2496 	// on the actual delivered content
2497 	if !req.timeout.Stop() {
2498 		// The timeout is already triggered, and this request will be reverted+rescheduled
2499 		s.lock.Unlock()
2500 		return nil
2501 	}
2502 
2503 	// Response is valid, but check if peer is signalling that it does not have
2504 	// the requested data. For trienode heal queries that means the peer is not
2505 	// yet synced.
2506 	if len(trienodes) == 0 {
2507 		logger.Debug("Peer rejected trienode heal request")
2508 		s.statelessPeers[peer.ID()] = struct{}{}
2509 		s.lock.Unlock()
2510 
2511 		// Signal this request as failed, and ready for rescheduling
2512 		s.scheduleRevertTrienodeHealRequest(req)
2513 		return nil
2514 	}
2515 	s.lock.Unlock()
2516 
2517 	// Cross reference the requested trienodes with the response to find gaps
2518 	// that the serving node is missing
2519 	hasher := sha3.NewLegacyKeccak256().(crypto.KeccakState)
2520 	hash := make([]byte, 32)
2521 
2522 	nodes := make([][]byte, len(req.hashes))
2523 	for i, j := 0, 0; i < len(trienodes); i++ {
2524 		// Find the next hash that we've been served, leaving misses with nils
2525 		hasher.Reset()
2526 		hasher.Write(trienodes[i])
2527 		hasher.Read(hash)
2528 
2529 		for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) {
2530 			j++
2531 		}
2532 		if j < len(req.hashes) {
2533 			nodes[j] = trienodes[i]
2534 			j++
2535 			continue
2536 		}
2537 		// We've either run out of hashes, or got unrequested data
2538 		logger.Warn("Unexpected healing trienodes", "count", len(trienodes)-i)
2539 		// Signal this request as failed, and ready for rescheduling
2540 		s.scheduleRevertTrienodeHealRequest(req)
2541 		return errors.New("unexpected healing trienode")
2542 	}
2543 	// Response validated, send it to the scheduler for filling
2544 	response := &trienodeHealResponse{
2545 		task:   req.task,
2546 		hashes: req.hashes,
2547 		paths:  req.paths,
2548 		nodes:  nodes,
2549 	}
2550 	select {
2551 	case req.deliver <- response:
2552 	case <-req.cancel:
2553 	case <-req.stale:
2554 	}
2555 	return nil
2556 }
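// ----------------------------------------------------------------------------
// [Editor's note] OnTrieNodes above and the bytecode handlers align a possibly
// partial response against the requested hash list with a two-pointer scan:
// peers must answer in request order but may skip items, so misses are left
// nil for rescheduling. A standalone sketch (exampleMatchDeliveries is
// illustrative, not part of the original file):

func exampleMatchDeliveries(want []common.Hash, blobs [][]byte) ([][]byte, error) {
	hasher := sha3.NewLegacyKeccak256().(crypto.KeccakState)
	hash := make([]byte, 32)

	matched := make([][]byte, len(want)) // misses stay nil for rescheduling
	for i, j := 0, 0; i < len(blobs); i++ {
		hasher.Reset()
		hasher.Write(blobs[i])
		hasher.Read(hash)

		for j < len(want) && !bytes.Equal(hash, want[j][:]) {
			j++ // skip over hashes the peer chose not to serve
		}
		if j >= len(want) {
			return nil, errors.New("unexpected blob in response")
		}
		matched[j] = blobs[i]
		j++
	}
	return matched, nil
}

// ----------------------------------------------------------------------------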
2557 
2558 // onHealByteCodes is a callback method to invoke when a batch of contract
2559 // bytecodes is received from a remote peer in the healing phase.
2560 func (s *Syncer) onHealByteCodes(peer SyncPeer, id uint64, bytecodes [][]byte) error {
2561 	var size common.StorageSize
2562 	for _, code := range bytecodes {
2563 		size += common.StorageSize(len(code))
2564 	}
2565 	logger := peer.Log().New("reqid", id)
2566 	logger.Trace("Delivering set of healing bytecodes", "bytecodes", len(bytecodes), "bytes", size)
2567 
2568 	// Whether or not the response is valid, we can mark the peer as idle and
2569 	// notify the scheduler to assign a new task. If the response is invalid,
2570 	// we'll drop the peer in a bit.
2571 	s.lock.Lock()
2572 	if _, ok := s.peers[peer.ID()]; ok {
2573 		s.bytecodeHealIdlers[peer.ID()] = struct{}{}
2574 	}
2575 	select {
2576 	case s.update <- struct{}{}:
2577 	default:
2578 	}
2579 	// Ensure the response is for a valid request
2580 	req, ok := s.bytecodeHealReqs[id]
2581 	if !ok {
2582 		// Request stale, perhaps the peer timed out but came through in the end
2583 		logger.Warn("Unexpected bytecode heal packet")
2584 		s.lock.Unlock()
2585 		return nil
2586 	}
2587 	delete(s.bytecodeHealReqs, id)
2588 
2589 	// Clean up the request timeout timer, we'll see how to proceed further based
2590 	// on the actual delivered content
2591 	if !req.timeout.Stop() {
2592 		// The timeout is already triggered, and this request will be reverted+rescheduled
2593 		s.lock.Unlock()
2594 		return nil
2595 	}
2596 
2597 	// Response is valid, but check if peer is signalling that it does not have
2598 	// the requested data. For bytecode range queries that means the peer is not
2599 	// yet synced.
2600 	if len(bytecodes) == 0 {
2601 		logger.Debug("Peer rejected bytecode heal request")
2602 		s.statelessPeers[peer.ID()] = struct{}{}
2603 		s.lock.Unlock()
2604 
2605 		// Signal this request as failed, and ready for rescheduling
2606 		s.scheduleRevertBytecodeHealRequest(req)
2607 		return nil
2608 	}
2609 	s.lock.Unlock()
2610 
2611 	// Cross reference the requested bytecodes with the response to find gaps
2612 	// that the serving node is missing
2613 	hasher := sha3.NewLegacyKeccak256().(crypto.KeccakState)
2614 	hash := make([]byte, 32)
2615 
2616 	codes := make([][]byte, len(req.hashes))
2617 	for i, j := 0, 0; i < len(bytecodes); i++ {
2618 		// Find the next hash that we've been served, leaving misses with nils
2619 		hasher.Reset()
2620 		hasher.Write(bytecodes[i])
2621 		hasher.Read(hash)
2622 
2623 		for j < len(req.hashes) && !bytes.Equal(hash, req.hashes[j][:]) {
2624 			j++
2625 		}
2626 		if j < len(req.hashes) {
2627 			codes[j] = bytecodes[i]
2628 			j++
2629 			continue
2630 		}
2631 		// We've either run out of hashes, or got unrequested data
2632 		logger.Warn("Unexpected healing bytecodes", "count", len(bytecodes)-i)
2633 		// Signal this request as failed, and ready for rescheduling
2634 		s.scheduleRevertBytecodeHealRequest(req)
2635 		return errors.New("unexpected healing bytecode")
2636 	}
2637 	// Response validated, send it to the scheduler for filling
2638 	response := &bytecodeHealResponse{
2639 		task:   req.task,
2640 		hashes: req.hashes,
2641 		codes:  codes,
2642 	}
2643 	select {
2644 	case req.deliver <- response:
2645 	case <-req.cancel:
2646 	case <-req.stale:
2647 	}
2648 	return nil
2649 }
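// ----------------------------------------------------------------------------
// [Editor's note] Healed accounts are persisted in "slim" snapshot RLP, the
// same encoding onHealState below produces. A sketch of that conversion
// (exampleWriteHealedAccount is illustrative, not part of the original file):

func exampleWriteHealedAccount(w ethdb.KeyValueWriter, path []byte, blob []byte) error {
	var account state.Account
	if err := rlp.DecodeBytes(blob, &account); err != nil {
		return err // not a full account, the caller skips it
	}
	slim := snapshot.SlimAccountRLP(account.Nonce, account.Balance, account.Root, account.CodeHash)
	rawdb.WriteAccountSnapshot(w, common.BytesToHash(path), slim)
	return nil
}

// ----------------------------------------------------------------------------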
2650 
2651 // onHealState is a callback method to invoke when a flat state (account
2652 // or storage slot) is downloaded during the healing stage. The flat states
2653 // can be persisted blindly and fixed later in the generation stage.
2654 // Note it is not concurrency safe; concurrent access must be handled by the caller.
2655 func (s *Syncer) onHealState(paths [][]byte, value []byte) error {
2656 	if len(paths) == 1 {
2657 		var account state.Account
2658 		if err := rlp.DecodeBytes(value, &account); err != nil {
2659 			return nil
2660 		}
2661 		blob := snapshot.SlimAccountRLP(account.Nonce, account.Balance, account.Root, account.CodeHash)
2662 		rawdb.WriteAccountSnapshot(s.stateWriter, common.BytesToHash(paths[0]), blob)
2663 		s.accountHealed += 1
2664 		s.accountHealedBytes += common.StorageSize(1 + common.HashLength + len(blob))
2665 	}
2666 	if len(paths) == 2 {
2667 		rawdb.WriteStorageSnapshot(s.stateWriter, common.BytesToHash(paths[0]), common.BytesToHash(paths[1]), value)
2668 		s.storageHealed += 1
2669 		s.storageHealedBytes += common.StorageSize(1 + 2*common.HashLength + len(value))
2670 	}
2671 	if s.stateWriter.ValueSize() > ethdb.IdealBatchSize {
2672 		s.stateWriter.Write() // It's fine to ignore the error here
2673 		s.stateWriter.Reset()
2674 	}
2675 	return nil
2676 }
2677 
2678 // hashSpace is the total size of the 256 bit hash space for accounts.
2679 var hashSpace = new(big.Int).Exp(common.Big2, common.Big256, nil)
2680 
2681 // report calculates various status reports and provides them to the user.
2682 func (s *Syncer) report(force bool) {
2683 	if len(s.tasks) > 0 {
2684 		s.reportSyncProgress(force)
2685 		return
2686 	}
2687 	s.reportHealProgress(force)
2688 }
2689 
2690 // reportSyncProgress calculates various status reports and provides them to the user.
2691 func (s *Syncer) reportSyncProgress(force bool) {
2692 	// Don't report all the events, just occasionally
2693 	if !force && time.Since(s.logTime) < 8*time.Second {
2694 		return
2695 	}
2696 	// Don't report anything until we have meaningful progress
2697 	synced := s.accountBytes + s.bytecodeBytes + s.storageBytes
2698 	if synced == 0 {
2699 		return
2700 	}
2701 	accountGaps := new(big.Int)
2702 	for _, task := range s.tasks {
2703 		accountGaps.Add(accountGaps, new(big.Int).Sub(task.Last.Big(), task.Next.Big()))
2704 	}
2705 	accountFills := new(big.Int).Sub(hashSpace, accountGaps)
2706 	if accountFills.BitLen() == 0 {
2707 		return
2708 	}
2709 	s.logTime = time.Now()
2710 	estBytes := float64(new(big.Int).Div(
2711 		new(big.Int).Mul(new(big.Int).SetUint64(uint64(synced)), hashSpace),
2712 		accountFills,
2713 	).Uint64())
2714 
2715 	elapsed := time.Since(s.startTime)
2716 	estTime := elapsed / time.Duration(synced) * time.Duration(estBytes)
2717 
2718 	// Create a mega progress report
2719 	var (
2720 		progress = fmt.Sprintf("%.2f%%", float64(synced)*100/estBytes)
2721 		accounts = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.accountSynced), s.accountBytes.TerminalString())
2722 		storage  = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.storageSynced), s.storageBytes.TerminalString())
2723 		bytecode = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.bytecodeSynced), s.bytecodeBytes.TerminalString())
2724 	)
2725 	log.Info("State sync in progress", "synced", progress, "state", synced,
2726 		"accounts", accounts, "slots", storage, "codes", bytecode, "eta", common.PrettyDuration(estTime-elapsed))
2727 }
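// ----------------------------------------------------------------------------
// [Editor's note] The ETA printed by reportSyncProgress above extrapolates the
// bytes downloaded so far across the whole 2^256 account space: if a fraction
// f of the hash space is filled, the estimated total is synced/f. The same
// estimate in isolation (exampleEstimateTotalBytes is illustrative, not part
// of the original file):

func exampleEstimateTotalBytes(synced common.StorageSize, accountFills *big.Int) float64 {
	// total ~= synced * hashSpace / filled
	return float64(new(big.Int).Div(
		new(big.Int).Mul(new(big.Int).SetUint64(uint64(synced)), hashSpace),
		accountFills,
	).Uint64())
}

// ----------------------------------------------------------------------------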
2728 
2729 // reportHealProgress calculates various status reports and provides them to the user.
2730 func (s *Syncer) reportHealProgress(force bool) {
2731 	// Don't report all the events, just occasionally
2732 	if !force && time.Since(s.logTime) < 8*time.Second {
2733 		return
2734 	}
2735 	s.logTime = time.Now()
2736 
2737 	// Create a mega progress report
2738 	var (
2739 		trienode = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.trienodeHealSynced), s.trienodeHealBytes.TerminalString())
2740 		bytecode = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.bytecodeHealSynced), s.bytecodeHealBytes.TerminalString())
2741 		accounts = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.accountHealed), s.accountHealedBytes.TerminalString())
2742 		storage  = fmt.Sprintf("%v@%v", log.FormatLogfmtUint64(s.storageHealed), s.storageHealedBytes.TerminalString())
2743 	)
2744 	log.Info("State heal in progress", "accounts", accounts, "slots", storage,
2745 		"codes", bytecode, "nodes", trienode, "pending", s.healer.scheduler.Pending())
2746 }
2747 
2748 // estimateRemainingSlots tries to determine roughly how many slots are left in
2749 // a contract storage, based on the number of keys and the last hash. This method
2750 // assumes that the hashes are lexicographically ordered and evenly distributed.
2751 func estimateRemainingSlots(hashes int, last common.Hash) (uint64, error) {
2752 	if last == (common.Hash{}) {
2753 		return 0, errors.New("last hash empty")
2754 	}
2755 	space := new(big.Int).Mul(math.MaxBig256, big.NewInt(int64(hashes)))
2756 	space.Div(space, last.Big())
2757 	if !space.IsUint64() {
2758 		// Gigantic address space probably due to too few or malicious slots
2759 		return 0, errors.New("too few slots for estimation")
2760 	}
2761 	return space.Uint64() - uint64(hashes), nil
2762 }
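// ----------------------------------------------------------------------------
// [Editor's note] A worked instance of estimateRemainingSlots above: if 4096
// sorted slot hashes end at 0x0100...00 (1/256th of the key space), the even
// distribution assumption extrapolates ~4096*256 ≈ 1.05M total slots, so
// roughly 1.04M are still missing (exampleRemainingSlots is illustrative, not
// part of the original file):

func exampleRemainingSlots() uint64 {
	last := common.HexToHash("0x0100000000000000000000000000000000000000000000000000000000000000")
	remaining, err := estimateRemainingSlots(4096, last)
	if err != nil {
		return 0
	}
	return remaining // ~4096*256 - 4096, i.e. roughly 1.04 million
}

// ----------------------------------------------------------------------------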