github.com/klaytn/klaytn@v1.10.2/storage/statedb/database.go (about) 1 // Modifications Copyright 2018 The klaytn Authors 2 // Copyright 2015 The go-ethereum Authors 3 // This file is part of the go-ethereum library. 4 // 5 // The go-ethereum library is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Lesser General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // The go-ethereum library is distributed in the hope that it will be useful, 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Lesser General Public License for more details. 14 // 15 // You should have received a copy of the GNU Lesser General Public License 16 // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>. 17 // 18 // This file is derived from trie/database.go (2018/06/04). 19 // Modified and improved for the klaytn development. 
package statedb

import (
	"errors"
	"fmt"
	"io"
	"math/rand"
	"sync"
	"time"

	"github.com/klaytn/klaytn/common"
	"github.com/klaytn/klaytn/log"
	"github.com/klaytn/klaytn/rlp"
	"github.com/klaytn/klaytn/storage/database"
	"github.com/pbnjay/memory"
	"github.com/rcrowley/go-metrics"
)

var (
	// logger is the package logger, registered under log.StorageStateDB.
	logger = log.NewModuleLogger(log.StorageStateDB)

	// metrics for Cap state (updated at the end of Database.Cap)
	memcacheFlushTimeGauge  = metrics.NewRegisteredGauge("trie/memcache/flush/time", nil)
	memcacheFlushNodesGauge = metrics.NewRegisteredGauge("trie/memcache/flush/nodes", nil)
	memcacheFlushSizeGauge  = metrics.NewRegisteredGauge("trie/memcache/flush/size", nil)

	// metrics for GC (updated by Database.Dereference)
	memcacheGCTimeGauge  = metrics.NewRegisteredGauge("trie/memcache/gc/time", nil)
	memcacheGCNodesMeter = metrics.NewRegisteredMeter("trie/memcache/gc/nodes", nil)
	memcacheGCSizeMeter  = metrics.NewRegisteredMeter("trie/memcache/gc/size", nil)

	// metrics for commit state (updated by Database.Commit)
	memcacheCommitTimeGauge  = metrics.NewRegisteredGauge("trie/memcache/commit/time", nil)
	memcacheCommitNodesMeter = metrics.NewRegisteredMeter("trie/memcache/commit/nodes", nil)
	memcacheCommitSizeMeter  = metrics.NewRegisteredMeter("trie/memcache/commit/size", nil)
	memcacheUncacheTimeGauge = metrics.NewRegisteredGauge("trie/memcache/uncache/time", nil)

	// metrics for state trie cache db (hit/read are marked by getCachedNode,
	// miss/write by setCachedNode)
	memcacheCleanHitMeter          = metrics.NewRegisteredMeter("trie/memcache/clean/hit", nil)
	memcacheCleanMissMeter         = metrics.NewRegisteredMeter("trie/memcache/clean/miss", nil)
	memcacheCleanPrefetchMissMeter = metrics.NewRegisteredMeter("trie/memcache/clean/prefetch/miss", nil)
	memcacheCleanReadMeter         = metrics.NewRegisteredMeter("trie/memcache/clean/read", nil)
	memcacheCleanWriteMeter        = metrics.NewRegisteredMeter("trie/memcache/clean/write", nil)

	// metric of total node number
	memcacheNodesGauge = metrics.NewRegisteredGauge("trie/memcache/nodes", nil)
)
// secureKeyPrefix is the database key prefix used to store trie node preimages.
var secureKeyPrefix = []byte("secure-key-")

// secureKeyPrefixLength is the length of the above prefix
const secureKeyPrefixLength = 11

// secureKeyLength is the length of the above prefix + 32byte hash.
const secureKeyLength = secureKeyPrefixLength + 32

// commitResultChSizeLimit limits the size of channel used for commitResult.
const commitResultChSizeLimit = 100 * 10000

// AutoScaling is for auto-scaling cache size. If cacheSize is set to this value,
// cache size is set scaling to physical memory
const AutoScaling = -1

// DatabaseReader is the read-only key/value view of a database.
type DatabaseReader interface {
	// Get retrieves the value associated with key from the database.
	Get(key []byte) (value []byte, err error)

	// Has retrieves whether a key is present in the database.
	Has(key []byte) (bool, error)
}

// Database is an intermediate write layer between the trie data structures and
// the disk database. The aim is to accumulate trie writes in-memory and only
// periodically flush a couple tries to disk, garbage collecting the remainder.
type Database struct {
	diskDB database.DBManager // Persistent storage for matured trie nodes

	// nodes is keyed by node hash. The constructors seed it with an entry for
	// the zero hash, which Dereference refers to as the "meta root".
	nodes  map[common.Hash]*cachedNode // Data and references relationships of a trie node
	oldest common.Hash                 // Oldest tracked node, flush-list head
	newest common.Hash                 // Newest tracked node, flush-list tail

	preimages map[common.Hash][]byte // Preimages of nodes from the secure trie

	gctime  time.Duration      // Time spent on garbage collection since last commit
	gcnodes uint64             // Nodes garbage collected since last commit
	gcsize  common.StorageSize // Data storage garbage collected since last commit
	gcLock  sync.RWMutex       // Lock for preventing to garbage collect cachedNode without flushing

	flushtime  time.Duration      // Time spent on data flushing since last commit
	flushnodes uint64             // Nodes flushed since last commit
	flushsize  common.StorageSize // Data storage flushed since last commit

	nodesSize     common.StorageSize // Storage size of the nodes cache
	preimagesSize common.StorageSize // Storage size of the preimages cache

	// lock is taken by Reference, Dereference, Cap, Commit and the read
	// accessors around their use of nodes and preimages.
	lock sync.RWMutex

	trieNodeCache                TrieNodeCache        // GC friendly memory cache of trie node RLPs
	trieNodeCacheConfig          *TrieNodeCacheConfig // Configuration of trieNodeCache
	savingTrieNodeCacheTriggered bool                 // Whether saving trie node cache has been triggered or not
}

// rawNode is a simple binary blob used to differentiate between collapsed trie
// nodes and already encoded RLP binary blobs (while at the same time store them
// in the same cache fields).
127 type rawNode []byte 128 129 func (n rawNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } 130 func (n rawNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } 131 func (n rawNode) fstring(ind string) string { panic("this should never end up in a live trie") } 132 func (n rawNode) lenEncoded() uint16 { panic("this should never end up in a live trie") } 133 func (n rawNode) EncodeRLP(w io.Writer) error { 134 _, err := w.Write([]byte(n)) 135 return err 136 } 137 138 // rawFullNode represents only the useful data content of a full node, with the 139 // caches and flags stripped out to minimize its data database. This type honors 140 // the same RLP encoding as the original parent. 141 type rawFullNode [17]node 142 143 func (n rawFullNode) canUnload(uint16, uint16) bool { panic("this should never end up in a live trie") } 144 func (n rawFullNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } 145 func (n rawFullNode) fstring(ind string) string { panic("this should never end up in a live trie") } 146 func (n rawFullNode) lenEncoded() uint16 { panic("this should never end up in a live trie") } 147 148 func (n rawFullNode) EncodeRLP(w io.Writer) error { 149 var nodes [17]node 150 151 for i, child := range n { 152 if child != nil { 153 nodes[i] = child 154 } else { 155 nodes[i] = nilValueNode 156 } 157 } 158 return rlp.Encode(w, nodes) 159 } 160 161 // rawShortNode represents only the useful data content of a short node, with the 162 // caches and flags stripped out to minimize its data database. This type honors 163 // the same RLP encoding as the original parent. 
164 type rawShortNode struct { 165 Key []byte 166 Val node 167 } 168 169 func (n rawShortNode) canUnload(uint16, uint16) bool { 170 panic("this should never end up in a live trie") 171 } 172 func (n rawShortNode) cache() (hashNode, bool) { panic("this should never end up in a live trie") } 173 func (n rawShortNode) fstring(ind string) string { panic("this should never end up in a live trie") } 174 func (n rawShortNode) lenEncoded() uint16 { panic("this should never end up in a live trie") } 175 176 // cachedNode is all the information we know about a single cached trie node 177 // in the memory database write layer. 178 type cachedNode struct { 179 node node // Cached collapsed trie node, or raw rlp data 180 // TODO-Klaytn: need to change data type of this if we increase the code size limit 181 size uint16 // Byte size of the useful cached data 182 183 parents uint64 // Number of live nodes referencing this one 184 children map[common.Hash]uint64 // External children referenced by this node 185 186 flushPrev common.Hash // Previous node in the flush-list 187 flushNext common.Hash // Next node in the flush-list 188 } 189 190 // rlp returns the raw rlp encoded blob of the cached trie node, either directly 191 // from the cache, or by regenerating it from the collapsed node. 192 func (n *cachedNode) rlp() []byte { 193 if node, ok := n.node.(rawNode); ok { 194 return node 195 } 196 blob, err := rlp.EncodeToBytes(n.node) 197 if err != nil { 198 panic(err) 199 } 200 return blob 201 } 202 203 // obj returns the decoded and expanded trie node, either directly from the cache, 204 // or by regenerating it from the rlp encoded blob. 
205 func (n *cachedNode) obj(hash common.Hash) node { 206 if node, ok := n.node.(rawNode); ok { 207 return mustDecodeNode(hash[:], node) 208 } 209 return expandNode(hash[:], n.node) 210 } 211 212 // childs returns all the tracked children of this node, both the implicit ones 213 // from inside the node as well as the explicit ones from outside the node. 214 func (n *cachedNode) childs() []common.Hash { 215 children := make([]common.Hash, 0, 16) 216 for child := range n.children { 217 children = append(children, child) 218 } 219 if _, ok := n.node.(rawNode); !ok { 220 gatherChildren(n.node, &children) 221 } 222 return children 223 } 224 225 // gatherChildren traverses the node hierarchy of a collapsed database node and 226 // retrieves all the hashnode children. 227 func gatherChildren(n node, children *[]common.Hash) { 228 switch n := n.(type) { 229 case *rawShortNode: 230 gatherChildren(n.Val, children) 231 232 case rawFullNode: 233 for i := 0; i < 16; i++ { 234 gatherChildren(n[i], children) 235 } 236 case hashNode: 237 *children = append(*children, common.BytesToHash(n)) 238 239 case valueNode, nil, rawNode: 240 241 default: 242 panic(fmt.Sprintf("unknown node type: %T", n)) 243 } 244 } 245 246 // simplifyNode traverses the hierarchy of an expanded memory node and discards 247 // all the internal caches, returning a node that only contains the raw data. 
248 func simplifyNode(n node) node { 249 switch n := n.(type) { 250 case *shortNode: 251 // Short nodes discard the flags and cascade 252 return &rawShortNode{Key: n.Key, Val: simplifyNode(n.Val)} 253 254 case *fullNode: 255 // Full nodes discard the flags and cascade 256 node := rawFullNode(n.Children) 257 for i := 0; i < len(node); i++ { 258 if node[i] != nil { 259 node[i] = simplifyNode(node[i]) 260 } 261 } 262 return node 263 264 case valueNode, hashNode, rawNode: 265 return n 266 267 default: 268 panic(fmt.Sprintf("unknown node type: %T", n)) 269 } 270 } 271 272 // expandNode traverses the node hierarchy of a collapsed database node and converts 273 // all fields and keys into expanded memory form. 274 func expandNode(hash hashNode, n node) node { 275 switch n := n.(type) { 276 case *rawShortNode: 277 // Short nodes need key and child expansion 278 return &shortNode{ 279 Key: compactToHex(n.Key), 280 Val: expandNode(nil, n.Val), 281 flags: nodeFlag{ 282 hash: hash, 283 }, 284 } 285 286 case rawFullNode: 287 // Full nodes need child expansion 288 node := &fullNode{ 289 flags: nodeFlag{ 290 hash: hash, 291 }, 292 } 293 for i := 0; i < len(node.Children); i++ { 294 if n[i] != nil { 295 node.Children[i] = expandNode(nil, n[i]) 296 } 297 } 298 return node 299 300 case valueNode, hashNode: 301 return n 302 303 default: 304 panic(fmt.Sprintf("unknown node type: %T", n)) 305 } 306 } 307 308 // NewDatabase creates a new trie database to store ephemeral trie content before 309 // its written out to disk or garbage collected. 310 func NewDatabase(diskDB database.DBManager) *Database { 311 return NewDatabaseWithNewCache(diskDB, GetEmptyTrieNodeCacheConfig()) 312 } 313 314 // NewDatabaseWithNewCache creates a new trie database to store ephemeral trie content 315 // before its written out to disk or garbage collected. It also acts as a read cache 316 // for nodes loaded from disk. 
317 func NewDatabaseWithNewCache(diskDB database.DBManager, cacheConfig *TrieNodeCacheConfig) *Database { 318 trieNodeCache, err := NewTrieNodeCache(cacheConfig) 319 if err != nil { 320 logger.Error("Invalid trie node cache config", "err", err, "config", cacheConfig) 321 } 322 323 return &Database{ 324 diskDB: diskDB, 325 nodes: map[common.Hash]*cachedNode{{}: {}}, 326 preimages: make(map[common.Hash][]byte), 327 trieNodeCache: trieNodeCache, 328 trieNodeCacheConfig: cacheConfig, 329 } 330 } 331 332 // NewDatabaseWithExistingCache creates a new trie database to store ephemeral trie content 333 // before its written out to disk or garbage collected. It also acts as a read cache 334 // for nodes loaded from disk. 335 func NewDatabaseWithExistingCache(diskDB database.DBManager, cache TrieNodeCache) *Database { 336 return &Database{ 337 diskDB: diskDB, 338 nodes: map[common.Hash]*cachedNode{{}: {}}, 339 preimages: make(map[common.Hash][]byte), 340 trieNodeCache: cache, 341 } 342 } 343 344 func getTrieNodeCacheSizeMiB() int { 345 totalPhysicalMemMiB := float64(memory.TotalMemory() / 1024 / 1024) 346 347 if totalPhysicalMemMiB < 10*1024 { 348 return 0 349 } else if totalPhysicalMemMiB < 20*1024 { 350 return 1 * 1024 // allocate 1G for small memory 351 } 352 353 memoryScalePercent := 0.3 // allocate 30% for 20 < mem < 100 354 if totalPhysicalMemMiB > 100*1024 { 355 memoryScalePercent = 0.35 // allocate 35% for 100 < mem 356 } 357 358 return int(totalPhysicalMemMiB * memoryScalePercent) 359 } 360 361 // DiskDB retrieves the persistent database backing the trie database. 362 func (db *Database) DiskDB() database.DBManager { 363 return db.diskDB 364 } 365 366 // TrieNodeCache retrieves the trieNodeCache of the trie database. 367 func (db *Database) TrieNodeCache() TrieNodeCache { 368 return db.trieNodeCache 369 } 370 371 // GetTrieNodeCacheConfig returns the configuration of TrieNodeCache. 
372 func (db *Database) GetTrieNodeCacheConfig() *TrieNodeCacheConfig { 373 return db.trieNodeCacheConfig 374 } 375 376 // GetTrieNodeLocalCacheByteLimit returns the byte size of trie node cache. 377 func (db *Database) GetTrieNodeLocalCacheByteLimit() uint64 { 378 return uint64(db.trieNodeCacheConfig.LocalCacheSizeMiB) * 1024 * 1024 379 } 380 381 // RLockGCCachedNode locks the GC lock of CachedNode. 382 func (db *Database) RLockGCCachedNode() { 383 db.gcLock.RLock() 384 } 385 386 // RUnlockGCCachedNode unlocks the GC lock of CachedNode. 387 func (db *Database) RUnlockGCCachedNode() { 388 db.gcLock.RUnlock() 389 } 390 391 // NodeChildren retrieves the children of the given hash trie 392 func (db *Database) NodeChildren(hash common.Hash) ([]common.Hash, error) { 393 childrenHash := make([]common.Hash, 0, 16) 394 395 if (hash == common.Hash{}) { 396 return childrenHash, ErrZeroHashNode 397 } 398 399 n, _ := db.node(hash) 400 if n == nil { 401 return childrenHash, nil 402 } 403 404 children := make([]node, 0, 16) 405 406 switch n := (n).(type) { 407 case *shortNode: 408 children = []node{n.Val} 409 case *fullNode: 410 for i := 0; i < 17; i++ { 411 if n.Children[i] != nil { 412 children = append(children, n.Children[i]) 413 } 414 } 415 } 416 417 for _, child := range children { 418 n, ok := child.(hashNode) 419 if ok { 420 hash := common.BytesToHash(n) 421 childrenHash = append(childrenHash, hash) 422 } 423 } 424 425 return childrenHash, nil 426 } 427 428 // insert inserts a collapsed trie node into the memory database. 429 // The blob size must be specified to allow proper size tracking. 430 // All nodes inserted by this function will be reference tracked 431 // and in theory should only used for **trie nodes** insertion. 
432 func (db *Database) insert(hash common.Hash, lenEncoded uint16, node node) { 433 // If the node's already cached, skip 434 if _, ok := db.nodes[hash]; ok { 435 return 436 } 437 // Create the cached entry for this node 438 entry := &cachedNode{ 439 node: simplifyNode(node), 440 size: lenEncoded, 441 flushPrev: db.newest, 442 } 443 for _, child := range entry.childs() { 444 if c := db.nodes[child]; c != nil { 445 c.parents++ 446 } 447 } 448 db.nodes[hash] = entry 449 450 // Update the flush-list endpoints 451 if db.oldest == (common.Hash{}) { 452 db.oldest, db.newest = hash, hash 453 } else { 454 if _, ok := db.nodes[db.newest]; !ok { 455 missingNewest := db.newest 456 db.newest = db.getLastNodeHashInFlushList() 457 db.nodes[db.newest].flushNext = common.Hash{} 458 logger.Error("Found a newest node for missingNewest", "oldNewest", missingNewest, "newNewest", db.newest) 459 } 460 db.nodes[db.newest].flushNext, db.newest = hash, hash 461 } 462 db.nodesSize += common.StorageSize(common.HashLength + entry.size) 463 } 464 465 // insertPreimage writes a new trie node pre-image to the memory database if it's 466 // yet unknown. The method will make a copy of the slice. 467 // 468 // Note, this method assumes that the database's lock is held! 469 func (db *Database) insertPreimage(hash common.Hash, preimage []byte) { 470 if _, ok := db.preimages[hash]; ok { 471 return 472 } 473 db.preimages[hash] = common.CopyBytes(preimage) 474 db.preimagesSize += common.StorageSize(common.HashLength + len(preimage)) 475 } 476 477 // getCachedNode finds an encoded node in the trie node cache if enabled. 478 func (db *Database) getCachedNode(hash common.Hash) []byte { 479 if db.trieNodeCache != nil { 480 if enc := db.trieNodeCache.Get(hash[:]); enc != nil { 481 memcacheCleanHitMeter.Mark(1) 482 memcacheCleanReadMeter.Mark(int64(len(enc))) 483 return enc 484 } 485 } 486 return nil 487 } 488 489 // setCachedNode stores an encoded node to the trie node cache if enabled. 
490 func (db *Database) setCachedNode(hash, enc []byte) { 491 if db.trieNodeCache != nil { 492 db.trieNodeCache.Set(hash, enc) 493 memcacheCleanMissMeter.Mark(1) 494 memcacheCleanWriteMeter.Mark(int64(len(enc))) 495 } 496 } 497 498 // node retrieves a cached trie node from memory, or returns nil if node can be 499 // found in the memory cache. 500 func (db *Database) node(hash common.Hash) (n node, fromDB bool) { 501 // Retrieve the node from the trie node cache if available 502 if enc := db.getCachedNode(hash); enc != nil { 503 if dec, err := decodeNode(hash[:], enc); err == nil { 504 return dec, false 505 } else { 506 logger.Error("node from cached trie node fails to be decoded!", "err", err) 507 } 508 } 509 510 // Retrieve the node from the state cache if available 511 db.lock.RLock() 512 node := db.nodes[hash] 513 db.lock.RUnlock() 514 if node != nil { 515 return node.obj(hash), false 516 } 517 518 // Content unavailable in memory, attempt to retrieve from disk 519 enc, err := db.diskDB.ReadCachedTrieNode(hash) 520 if err != nil || enc == nil { 521 return nil, true 522 } 523 db.setCachedNode(hash[:], enc) 524 return mustDecodeNode(hash[:], enc), true 525 } 526 527 // Node retrieves an encoded cached trie node from memory. If it cannot be found 528 // cached, the method queries the persistent database for the content. 
529 func (db *Database) Node(hash common.Hash) ([]byte, error) { 530 if (hash == common.Hash{}) { 531 return nil, ErrZeroHashNode 532 } 533 // Retrieve the node from the trie node cache if available 534 if enc := db.getCachedNode(hash); enc != nil { 535 return enc, nil 536 } 537 538 // Retrieve the node from cache if available 539 db.lock.RLock() 540 node := db.nodes[hash] 541 db.lock.RUnlock() 542 543 if node != nil { 544 return node.rlp(), nil 545 } 546 // Content unavailable in memory, attempt to retrieve from disk 547 enc, err := db.diskDB.ReadCachedTrieNode(hash) 548 if err == nil && enc != nil { 549 db.setCachedNode(hash[:], enc) 550 } 551 return enc, err 552 } 553 554 // NodeFromOld retrieves an encoded cached trie node from memory. If it cannot be found 555 // cached, the method queries the old persistent database for the content. 556 func (db *Database) NodeFromOld(hash common.Hash) ([]byte, error) { 557 if (hash == common.Hash{}) { 558 return nil, ErrZeroHashNode 559 } 560 // Retrieve the node from the trie node cache if available 561 if enc := db.getCachedNode(hash); enc != nil { 562 return enc, nil 563 } 564 565 // Retrieve the node from cache if available 566 db.lock.RLock() 567 node := db.nodes[hash] 568 db.lock.RUnlock() 569 570 if node != nil { 571 return node.rlp(), nil 572 } 573 // Content unavailable in memory, attempt to retrieve from disk 574 enc, err := db.diskDB.ReadCachedTrieNodeFromOld(hash) 575 if err == nil && enc != nil { 576 db.setCachedNode(hash[:], enc) 577 } 578 return enc, err 579 } 580 581 // DoesExistCachedNode returns if the node exists on cached trie node in memory. 582 func (db *Database) DoesExistCachedNode(hash common.Hash) bool { 583 // Retrieve the node from cache if available 584 db.lock.RLock() 585 _, ok := db.nodes[hash] 586 db.lock.RUnlock() 587 return ok 588 } 589 590 // DoesExistNodeInPersistent returns if the node exists on the persistent database or its cache. 
591 func (db *Database) DoesExistNodeInPersistent(hash common.Hash) bool { 592 // Retrieve the node from DB cache if available 593 if enc := db.getCachedNode(hash); enc != nil { 594 return true 595 } 596 597 // Content unavailable in DB cache, attempt to retrieve from disk 598 enc, err := db.diskDB.ReadCachedTrieNode(hash) 599 if err == nil && enc != nil { 600 return true 601 } 602 603 return false 604 } 605 606 // preimage retrieves a cached trie node pre-image from memory. If it cannot be 607 // found cached, the method queries the persistent database for the content. 608 func (db *Database) preimage(hash common.Hash) ([]byte, error) { 609 // Retrieve the node from cache if available 610 db.lock.RLock() 611 preimage := db.preimages[hash] 612 db.lock.RUnlock() 613 614 if preimage != nil { 615 return preimage, nil 616 } 617 // Content unavailable in memory, attempt to retrieve from disk 618 return db.diskDB.ReadCachedTrieNodePreimage(secureKey(hash)) 619 } 620 621 // secureKey returns the database key for the preimage of key (as a newly 622 // allocated byte-slice) 623 func secureKey(hash common.Hash) []byte { 624 buf := make([]byte, secureKeyLength) 625 copy(buf, secureKeyPrefix) 626 copy(buf[secureKeyPrefixLength:], hash[:]) 627 return buf 628 } 629 630 // Nodes retrieves the hashes of all the nodes cached within the memory database. 631 // This method is extremely expensive and should only be used to validate internal 632 // states in test code. 633 func (db *Database) Nodes() []common.Hash { 634 db.lock.RLock() 635 defer db.lock.RUnlock() 636 637 hashes := make([]common.Hash, 0, len(db.nodes)) 638 for hash := range db.nodes { 639 if hash != (common.Hash{}) { // Special case for "root" references/nodes 640 hashes = append(hashes, hash) 641 } 642 } 643 return hashes 644 } 645 646 // Reference adds a new reference from a parent node to a child node. 647 // This function is used to add reference between internal trie node 648 // and external node(e.g. 
storage trie root), all internal trie nodes 649 // are referenced together by database itself. 650 func (db *Database) Reference(child common.Hash, parent common.Hash) { 651 db.lock.Lock() 652 defer db.lock.Unlock() 653 654 db.reference(child, parent) 655 } 656 657 // reference is the private locked version of Reference. 658 func (db *Database) reference(child common.Hash, parent common.Hash) { 659 // If the node does not exist, it's a node pulled from disk, skip 660 node, ok := db.nodes[child] 661 if !ok { 662 return 663 } 664 // If the reference already exists, only duplicate for roots 665 if db.nodes[parent].children == nil { 666 db.nodes[parent].children = make(map[common.Hash]uint64) 667 } else if _, ok = db.nodes[parent].children[child]; ok && parent != (common.Hash{}) { 668 return 669 } 670 node.parents++ 671 db.nodes[parent].children[child]++ 672 } 673 674 // Dereference removes an existing reference from a root node. 675 func (db *Database) Dereference(root common.Hash) { 676 // Sanity check to ensure that the meta-root is not removed 677 if common.EmptyHash(root) { 678 logger.Error("Attempted to dereference the trie cache meta root") 679 return 680 } 681 682 db.gcLock.Lock() 683 defer db.gcLock.Unlock() 684 685 db.lock.Lock() 686 defer db.lock.Unlock() 687 688 nodes, storage, start := len(db.nodes), db.nodesSize, time.Now() 689 db.dereference(root, common.Hash{}) 690 691 db.gcnodes += uint64(nodes - len(db.nodes)) 692 db.gcsize += storage - db.nodesSize 693 db.gctime += time.Since(start) 694 695 memcacheGCTimeGauge.Update(int64(time.Since(start))) 696 memcacheGCSizeMeter.Mark(int64(storage - db.nodesSize)) 697 memcacheGCNodesMeter.Mark(int64(nodes - len(db.nodes))) 698 699 logger.Debug("Dereferenced trie from memory database", "nodes", nodes-len(db.nodes), "size", storage-db.nodesSize, "time", time.Since(start), 700 "gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, "livenodes", len(db.nodes), "livesize", db.nodesSize) 701 } 702 703 // 
dereference is the private locked version of Dereference. 704 func (db *Database) dereference(child common.Hash, parent common.Hash) { 705 // Dereference the parent-child 706 node := db.nodes[parent] 707 708 if node.children != nil && node.children[child] > 0 { 709 node.children[child]-- 710 if node.children[child] == 0 { 711 delete(node.children, child) 712 } 713 } 714 // If the node does not exist, it's a previously committed node. 715 node, ok := db.nodes[child] 716 if !ok { 717 return 718 } 719 // If there are no more references to the child, delete it and cascade 720 if node.parents > 0 { 721 // This is a special cornercase where a node loaded from disk (i.e. not in the 722 // memcache any more) gets reinjected as a new node (short node split into full, 723 // then reverted into short), causing a cached node to have no parents. That is 724 // no problem in itself, but don't make maxint parents out of it. 725 node.parents-- 726 } 727 if node.parents == 0 { 728 // Remove the node from the flush-list 729 db.removeNodeInFlushList(child) 730 // Dereference all children and delete the node 731 for _, hash := range node.childs() { 732 db.dereference(hash, child) 733 } 734 delete(db.nodes, child) 735 db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) 736 } 737 } 738 739 // Cap iteratively flushes old but still referenced trie nodes until the total 740 // memory usage goes below the given threshold. 741 func (db *Database) Cap(limit common.StorageSize) error { 742 // Create a database batch to flush persistent data out. It is important that 743 // outside code doesn't see an inconsistent state (referenced data removed from 744 // memory cache during commit but not yet in persistent database). This is ensured 745 // by only uncaching existing data when the database write finalizes. 
746 db.lock.RLock() 747 748 nodes, nodeSize, start := len(db.nodes), db.nodesSize, time.Now() 749 preimagesSize := db.preimagesSize 750 751 // db.nodesSize only contains the useful data in the cache, but when reporting 752 // the total memory consumption, the maintenance metadata is also needed to be 753 // counted. For every useful node, we track 2 extra hashes as the flushlist. 754 size := db.nodesSize + common.StorageSize((len(db.nodes)-1)*2*common.HashLength) 755 756 // If the preimage cache got large enough, push to disk. If it's still small 757 // leave for later to deduplicate writes. 758 flushPreimages := db.preimagesSize > 4*1024*1024 759 if flushPreimages { 760 if err := db.writeBatchPreimages(); err != nil { 761 db.lock.RUnlock() 762 return err 763 } 764 } 765 // Keep committing nodes from the flush-list until we're below allowance 766 oldest := db.oldest 767 batch := db.diskDB.NewBatch(database.StateTrieDB) 768 for size > limit && oldest != (common.Hash{}) { 769 // Fetch the oldest referenced node and push into the batch 770 node := db.nodes[oldest] 771 enc := node.rlp() 772 if err := database.PutAndWriteBatchesOverThreshold(batch, oldest[:], enc); err != nil { 773 db.lock.RUnlock() 774 return err 775 } 776 777 if db.trieNodeCache != nil { 778 db.trieNodeCache.Set(oldest[:], enc) 779 } 780 // Iterate to the next flush item, or abort if the size cap was achieved. Size 781 // is the total size, including both the useful cached data (hash -> blob), as 782 // well as the flushlist metadata (2*hash). When flushing items from the cache, 783 // we need to reduce both. 
784 size -= common.StorageSize(3*common.HashLength + int(node.size)) 785 oldest = node.flushNext 786 } 787 // Flush out any remainder data from the last batch 788 if _, err := database.WriteBatches(batch); err != nil { 789 logger.Error("Failed to write flush list to disk", "err", err) 790 db.lock.RUnlock() 791 return err 792 } 793 794 db.lock.RUnlock() 795 796 // Write successful, clear out the flushed data 797 db.lock.Lock() 798 defer db.lock.Unlock() 799 800 if flushPreimages { 801 db.preimages = make(map[common.Hash][]byte) 802 db.preimagesSize = 0 803 } 804 for db.oldest != oldest { 805 node := db.nodes[db.oldest] 806 delete(db.nodes, db.oldest) 807 db.oldest = node.flushNext 808 809 db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) 810 } 811 if db.oldest != (common.Hash{}) { 812 db.nodes[db.oldest].flushPrev = common.Hash{} 813 } else { 814 db.newest = common.Hash{} 815 } 816 db.flushnodes += uint64(nodes - len(db.nodes)) 817 db.flushsize += nodeSize - db.nodesSize 818 db.flushtime += time.Since(start) 819 820 memcacheFlushTimeGauge.Update(int64(time.Since(start))) 821 memcacheFlushSizeGauge.Update(int64(nodeSize - db.nodesSize)) 822 memcacheFlushNodesGauge.Update(int64(nodes - len(db.nodes))) 823 824 logger.Info("Persisted nodes from memory database by Cap", "nodes", nodes-len(db.nodes), 825 "size", nodeSize-db.nodesSize, "preimagesSize", preimagesSize-db.preimagesSize, "time", time.Since(start), 826 "flushnodes", db.flushnodes, "flushsize", db.flushsize, "flushtime", db.flushtime, "livenodes", len(db.nodes), 827 "livesize", db.nodesSize) 828 return nil 829 } 830 831 func (db *Database) writeBatchPreimages() error { 832 // TODO-Klaytn What kind of batch should be used below? 833 preimagesBatch := db.diskDB.NewBatch(database.StateTrieDB) 834 835 // We reuse an ephemeral buffer for the keys. The batch Put operation 836 // copies it internally, so we can reuse it. 
837 var keyBuf [secureKeyLength]byte 838 copy(keyBuf[:], secureKeyPrefix) 839 840 // Move all of the accumulated preimages into a write batch 841 for hash, preimage := range db.preimages { 842 copy(keyBuf[secureKeyPrefixLength:], hash[:]) 843 if err := preimagesBatch.Put(keyBuf[:], preimage); err != nil { 844 logger.Error("Failed to commit preimages from trie database", "err", err) 845 return err 846 } 847 848 if _, err := database.WriteBatchesOverThreshold(preimagesBatch); err != nil { 849 return err 850 } 851 } 852 853 // Write batch ready, unlock for readers during persistence 854 if _, err := database.WriteBatches(preimagesBatch); err != nil { 855 logger.Error("Failed to write preimages to disk", "err", err) 856 return err 857 } 858 859 return nil 860 } 861 862 // commitResult contains the result from concurrent commit calls. 863 // key and val are nil if the commitResult indicates the end of 864 // concurrentCommit goroutine. 865 type commitResult struct { 866 key []byte 867 val []byte 868 } 869 870 func (db *Database) writeBatchNodes(node common.Hash) error { 871 rootNode, ok := db.nodes[node] 872 if !ok { 873 return nil 874 } 875 876 // To limit the size of commitResult channel, we use commitResultChSizeLimit here. 
877 var resultCh chan commitResult 878 if len(db.nodes) > commitResultChSizeLimit { 879 resultCh = make(chan commitResult, commitResultChSizeLimit) 880 } else { 881 resultCh = make(chan commitResult, len(db.nodes)) 882 } 883 numGoRoutines := len(rootNode.childs()) 884 for i, child := range rootNode.childs() { 885 go db.concurrentCommit(child, resultCh, i) 886 } 887 888 batch := db.diskDB.NewBatch(database.StateTrieDB) 889 for numGoRoutines > 0 { 890 result := <-resultCh 891 if result.key == nil && result.val == nil { 892 numGoRoutines-- 893 continue 894 } 895 896 if err := batch.Put(result.key, result.val); err != nil { 897 return err 898 } 899 if batch.ValueSize() > database.IdealBatchSize { 900 if err := batch.Write(); err != nil { 901 return err 902 } 903 batch.Reset() 904 } 905 } 906 907 enc := rootNode.rlp() 908 if err := batch.Put(node[:], enc); err != nil { 909 return err 910 } 911 if err := batch.Write(); err != nil { 912 logger.Error("Failed to write trie to disk", "err", err) 913 return err 914 } 915 if db.trieNodeCache != nil { 916 db.trieNodeCache.Set(node[:], enc) 917 } 918 919 return nil 920 } 921 922 func (db *Database) concurrentCommit(hash common.Hash, resultCh chan<- commitResult, childIndex int) { 923 logger.Trace("concurrentCommit start", "childIndex", childIndex) 924 defer logger.Trace("concurrentCommit end", "childIndex", childIndex) 925 db.commit(hash, resultCh) 926 resultCh <- commitResult{nil, nil} 927 } 928 929 // Commit iterates over all the children of a particular node, writes them out 930 // to disk, forcefully tearing down all references in both directions. 931 // 932 // As a side effect, all pre-images accumulated up to this point are also written. 933 func (db *Database) Commit(node common.Hash, report bool, blockNum uint64) error { 934 // Create a database batch to flush persistent data out. 
It is important that 935 // outside code doesn't see an inconsistent state (referenced data removed from 936 // memory cache during commit but not yet in persistent database). This is ensured 937 // by only uncaching existing data when the database write finalizes. 938 db.lock.RLock() 939 940 commitStart := time.Now() 941 if err := db.writeBatchPreimages(); err != nil { 942 db.lock.RUnlock() 943 return err 944 } 945 946 // Move the trie itself into the batch, flushing if enough data is accumulated 947 numNodes, nodesSize := len(db.nodes), db.nodesSize 948 if err := db.writeBatchNodes(node); err != nil { 949 db.lock.RUnlock() 950 return err 951 } 952 953 db.lock.RUnlock() 954 955 // Write successful, clear out the flushed data 956 db.lock.Lock() 957 defer db.lock.Unlock() 958 959 db.preimages = make(map[common.Hash][]byte) 960 db.preimagesSize = 0 961 962 uncacheStart := time.Now() 963 db.uncache(node) 964 commitEnd := time.Now() 965 966 memcacheUncacheTimeGauge.Update(int64(commitEnd.Sub(uncacheStart))) 967 memcacheCommitTimeGauge.Update(int64(commitEnd.Sub(commitStart))) 968 memcacheCommitSizeMeter.Mark(int64(nodesSize - db.nodesSize)) 969 memcacheCommitNodesMeter.Mark(int64(numNodes - len(db.nodes))) 970 971 localLogger := logger.Info 972 if !report { 973 localLogger = logger.Debug 974 } 975 localLogger("Persisted trie from memory database", "blockNum", blockNum, 976 "updated nodes", numNodes-len(db.nodes), "updated nodes size", nodesSize-db.nodesSize, 977 "time", commitEnd.Sub(commitStart), "gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, 978 "livenodes", len(db.nodes), "livesize", db.nodesSize) 979 980 // Reset the garbage collection statistics 981 db.gcnodes, db.gcsize, db.gctime = 0, 0, 0 982 db.flushnodes, db.flushsize, db.flushtime = 0, 0, 0 983 return nil 984 } 985 986 // commit iteratively encodes nodes from parents to child nodes. 
987 func (db *Database) commit(hash common.Hash, resultCh chan<- commitResult) { 988 node, ok := db.nodes[hash] 989 if !ok { 990 return 991 } 992 for _, child := range node.childs() { 993 db.commit(child, resultCh) 994 } 995 enc := node.rlp() 996 resultCh <- commitResult{hash[:], enc} 997 998 if db.trieNodeCache != nil { 999 db.trieNodeCache.Set(hash[:], enc) 1000 } 1001 } 1002 1003 // uncache is the post-processing step of a commit operation where the already 1004 // persisted trie is removed from the cache. The reason behind the two-phase 1005 // commit is to ensure consistent data availability while moving from memory 1006 // to disk. 1007 func (db *Database) uncache(hash common.Hash) { 1008 // If the node does not exists, we're done on this path 1009 node, ok := db.nodes[hash] 1010 if !ok { 1011 return 1012 } 1013 // Node still exists, remove it from the flush-list 1014 db.removeNodeInFlushList(hash) 1015 // Uncache the node's subtries and remove the node itself too 1016 for _, child := range node.childs() { 1017 db.uncache(child) 1018 } 1019 delete(db.nodes, hash) 1020 db.nodesSize -= common.StorageSize(common.HashLength + int(node.size)) 1021 } 1022 1023 // Size returns the current database size of the memory cache in front of the 1024 // persistent database layer. 1025 func (db *Database) Size() (common.StorageSize, common.StorageSize, common.StorageSize) { 1026 db.lock.RLock() 1027 defer db.lock.RUnlock() 1028 1029 // db.nodesSize only contains the useful data in the cache, but when reporting 1030 // the total memory consumption, the maintenance metadata is also needed to be 1031 // counted. For every useful node, we track 2 extra hashes as the flushlist. 
1032 flushlistSize := common.StorageSize((len(db.nodes) - 1) * 2 * common.HashLength) 1033 return db.nodesSize + flushlistSize, db.nodesSize, db.preimagesSize 1034 } 1035 1036 // verifyIntegrity is a debug method to iterate over the entire trie stored in 1037 // memory and check whether every node is reachable from the meta root. The goal 1038 // is to find any errors that might cause memory leaks and or trie nodes to go 1039 // missing. 1040 // 1041 // This method is extremely CPU and memory intensive, only use when must. 1042 func (db *Database) verifyIntegrity() { 1043 // Iterate over all the cached nodes and accumulate them into a set 1044 reachable := map[common.Hash]struct{}{{}: {}} 1045 1046 for child := range db.nodes[common.Hash{}].children { 1047 db.accumulate(child, reachable) 1048 } 1049 // Find any unreachable but cached nodes 1050 unreachable := []string{} 1051 for hash, node := range db.nodes { 1052 if _, ok := reachable[hash]; !ok { 1053 unreachable = append(unreachable, fmt.Sprintf("%x: {Node: %v, Parents: %d, Prev: %x, Next: %x}", 1054 hash, node.node, node.parents, node.flushPrev, node.flushNext)) 1055 } 1056 } 1057 if len(unreachable) != 0 { 1058 panic(fmt.Sprintf("trie cache memory leak: %v", unreachable)) 1059 } 1060 } 1061 1062 // accumulate iterates over the trie defined by hash and accumulates all the 1063 // cached children found in memory. 
1064 func (db *Database) accumulate(hash common.Hash, reachable map[common.Hash]struct{}) { 1065 // Mark the node reachable if present in the memory cache 1066 node, ok := db.nodes[hash] 1067 if !ok { 1068 return 1069 } 1070 reachable[hash] = struct{}{} 1071 1072 // Iterate over all the children and accumulate them too 1073 for _, child := range node.childs() { 1074 db.accumulate(child, reachable) 1075 } 1076 } 1077 1078 func (db *Database) removeNodeInFlushList(hash common.Hash) { 1079 node, ok := db.nodes[hash] 1080 if !ok { 1081 return 1082 } 1083 1084 if hash == db.oldest && hash == db.newest { 1085 db.oldest = common.Hash{} 1086 db.newest = common.Hash{} 1087 } else if hash == db.oldest { 1088 db.oldest = node.flushNext 1089 db.nodes[node.flushNext].flushPrev = common.Hash{} 1090 } else if hash == db.newest { 1091 db.newest = node.flushPrev 1092 db.nodes[node.flushPrev].flushNext = common.Hash{} 1093 } else { 1094 db.nodes[node.flushPrev].flushNext = node.flushNext 1095 db.nodes[node.flushNext].flushPrev = node.flushPrev 1096 } 1097 } 1098 1099 func (db *Database) getLastNodeHashInFlushList() common.Hash { 1100 var lastNodeHash common.Hash 1101 nodeHash := db.oldest 1102 for { 1103 if _, ok := db.nodes[nodeHash]; ok { 1104 lastNodeHash = nodeHash 1105 } else { 1106 logger.Debug("not found next noode in map of flush list") 1107 break 1108 } 1109 1110 if db.nodes[nodeHash].flushNext != (common.Hash{}) { 1111 nodeHash = db.nodes[nodeHash].flushNext 1112 } else { 1113 logger.Debug("found last noode in map of flush list") 1114 break 1115 } 1116 } 1117 return lastNodeHash 1118 } 1119 1120 // UpdateMetricNodes updates the size of Database.nodes 1121 func (db *Database) UpdateMetricNodes() { 1122 memcacheNodesGauge.Update(int64(len(db.nodes))) 1123 if db.trieNodeCache != nil { 1124 db.trieNodeCache.UpdateStats() 1125 } 1126 } 1127 1128 var ( 1129 errDisabledTrieNodeCache = errors.New("trie node cache is disabled, nothing to save to file") 1130 
errSavingTrieNodeCacheInProgress = errors.New("saving trie node cache has been triggered already") 1131 ) 1132 1133 func (db *Database) CanSaveTrieNodeCacheToFile() error { 1134 if db.trieNodeCache == nil { 1135 return errDisabledTrieNodeCache 1136 } 1137 if db.savingTrieNodeCacheTriggered { 1138 return errSavingTrieNodeCacheInProgress 1139 } 1140 return nil 1141 } 1142 1143 // SaveTrieNodeCacheToFile saves the current cached trie nodes to file to reuse when the node restarts 1144 func (db *Database) SaveTrieNodeCacheToFile(filePath string, concurrency int) { 1145 db.savingTrieNodeCacheTriggered = true 1146 start := time.Now() 1147 logger.Info("start saving cache to file", 1148 "filePath", filePath, "concurrency", concurrency) 1149 if err := db.trieNodeCache.SaveToFile(filePath, concurrency); err != nil { 1150 logger.Error("failed to save cache to file", 1151 "filePath", filePath, "elapsed", time.Since(start), "err", err) 1152 } else { 1153 logger.Info("successfully saved cache to file", 1154 "filePath", filePath, "elapsed", time.Since(start)) 1155 } 1156 db.savingTrieNodeCacheTriggered = false 1157 } 1158 1159 // DumpPeriodically atomically saves fast cache data to the given dir with the specified interval. 
1160 func (db *Database) SaveCachePeriodically(c *TrieNodeCacheConfig, stopCh <-chan struct{}) { 1161 rand.Seed(time.Now().UnixNano()) 1162 randomVal := 0.5 + rand.Float64()/2.0 // 0.5 <= randomVal < 1.0 1163 startTime := time.Duration(int(randomVal * float64(c.FastCacheSavePeriod))) 1164 logger.Info("first periodic cache saving will be triggered", "after", startTime) 1165 1166 timer := time.NewTimer(startTime) 1167 defer timer.Stop() 1168 1169 for { 1170 select { 1171 case <-timer.C: 1172 if err := db.CanSaveTrieNodeCacheToFile(); err != nil { 1173 logger.Warn("failed to trigger periodic cache saving", "err", err) 1174 continue 1175 } 1176 db.SaveTrieNodeCacheToFile(c.FastCacheFileDir, 1) 1177 timer.Reset(c.FastCacheSavePeriod) 1178 case <-stopCh: 1179 return 1180 } 1181 } 1182 } 1183 1184 // NodeInfo is a struct used for collecting trie statistics 1185 type NodeInfo struct { 1186 Depth int // 0 if not a leaf node 1187 Finished bool // true if the uppermost call is finished 1188 } 1189 1190 // CollectChildrenStats collects the depth of the trie recursively 1191 func (db *Database) CollectChildrenStats(node common.Hash, depth int, resultCh chan<- NodeInfo) { 1192 n, _ := db.node(node) 1193 if n == nil { 1194 return 1195 } 1196 // retrieve the children of the given node 1197 childrenNodes, err := db.NodeChildren(node) 1198 if err != nil { 1199 logger.Error("failed to retrieve the children nodes", 1200 "node", node.String(), "err", err) 1201 return 1202 } 1203 // write the depth of the node only if the node is a leaf node, otherwise set 0 1204 resultDepth := 0 1205 if len(childrenNodes) == 0 { 1206 resultDepth = depth 1207 } 1208 // send the result to the channel and iterate its children 1209 resultCh <- NodeInfo{Depth: resultDepth} 1210 for _, child := range childrenNodes { 1211 db.CollectChildrenStats(child, depth+1, resultCh) 1212 } 1213 }