github.com/annchain/OG@v0.0.9/trie/database.go

// Copyright 2018 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package trie

import (
	"sync"
	"time"

	"github.com/annchain/OG/arefactor/og/types"
	"github.com/annchain/OG/common"
	"github.com/annchain/OG/metrics"
	"github.com/annchain/OG/ogdb"
	log "github.com/sirupsen/logrus"
)

var (
	memcacheFlushTimeTimer  = metrics.NewRegisteredResettingTimer("trie/memcache/flush/time", nil)
	memcacheFlushNodesMeter = metrics.NewRegisteredMeter("trie/memcache/flush/nodes", nil)
	memcacheFlushSizeMeter  = metrics.NewRegisteredMeter("trie/memcache/flush/size", nil)

	memcacheGCTimeTimer  = metrics.NewRegisteredResettingTimer("trie/memcache/gc/time", nil)
	memcacheGCNodesMeter = metrics.NewRegisteredMeter("trie/memcache/gc/nodes", nil)
	memcacheGCSizeMeter  = metrics.NewRegisteredMeter("trie/memcache/gc/size", nil)

	memcacheCommitTimeTimer  = metrics.NewRegisteredResettingTimer("trie/memcache/commit/time", nil)
	memcacheCommitNodesMeter = metrics.NewRegisteredMeter("trie/memcache/commit/nodes", nil)
	memcacheCommitSizeMeter  = metrics.NewRegisteredMeter("trie/memcache/commit/size", nil)
)

// secureKeyPrefix is the database key prefix used to store trie node preimages.
var secureKeyPrefix = []byte("secure-key-")

// secureKeyLength is the length of the above prefix + 32 byte hash.
const secureKeyLength = 11 + 32

// DatabaseReader wraps the Get and Has methods of a backing store for the trie.
type DatabaseReader interface {
	// Get retrieves the value associated with key from the database.
	Get(key []byte) (value []byte, err error)

	// Has retrieves whether a key is present in the database.
	Has(key []byte) (bool, error)
}

// Database is an intermediate write layer between the trie data structures and
// the disk database. The aim is to accumulate trie writes in-memory and only
// periodically flush a couple tries to disk, garbage collecting the remainder.
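//
// A minimal usage sketch, restricted to the API defined in this file (the
// diskdb, hash and blob values are assumed to come from the caller; this is
// illustrative only, not code taken from this package's callers):
//
//	triedb := NewDatabase(diskdb)
//	triedb.Insert(hash, blob)            // cache the freshly hashed node
//	triedb.Reference(hash, types.Hash{}) // pin it under the in-memory metaroot
//	// ... further inserts and references as the trie is built ...
//	if err := triedb.Commit(hash, true); err != nil {
//		// handle the persistence error
//	}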
type Database struct {
	diskdb ogdb.Database // Persistent storage for matured trie nodes

	nodes  map[types.Hash]*cachedNode // Data and references relationships of a node
	oldest types.Hash                 // Oldest tracked node, flush-list head
	newest types.Hash                 // Newest tracked node, flush-list tail

	preimages map[types.Hash][]byte // Preimages of nodes from the secure trie
	seckeybuf [secureKeyLength]byte // Ephemeral buffer for calculating preimage keys

	gctime  time.Duration      // Time spent on garbage collection since last commit
	gcnodes uint64             // Nodes garbage collected since last commit
	gcsize  common.StorageSize // Data storage garbage collected since last commit

	flushtime  time.Duration      // Time spent on data flushing since last commit
	flushnodes uint64             // Nodes flushed since last commit
	flushsize  common.StorageSize // Data storage flushed since last commit

	nodesSize     common.StorageSize // Storage size of the nodes cache (exc. flushlist)
	preimagesSize common.StorageSize // Storage size of the preimages cache

	lock sync.RWMutex
}

// cachedNode is all the information we know about a single cached node in the
// memory database write layer.
type cachedNode struct {
	blob     []byte             // Cached data block of the trie node
	parents  int                // Number of live nodes referencing this one
	children map[types.Hash]int // Children referenced by this node

	flushPrev types.Hash // Previous node in the flush-list
	flushNext types.Hash // Next node in the flush-list
}

// NewDatabase creates a new trie database to store ephemeral trie content before
// it's written out to disk or garbage collected.
func NewDatabase(diskdb ogdb.Database) *Database {
	return &Database{
		diskdb: diskdb,
		nodes: map[types.Hash]*cachedNode{
			{}: {children: make(map[types.Hash]int)},
		},
		preimages: make(map[types.Hash][]byte),
	}
}

// DiskDB retrieves the persistent storage backing the trie database.
func (db *Database) DiskDB() DatabaseReader {
	return db.diskdb
}

// Insert writes a new trie node to the memory database if it's yet unknown. The
// method will make a copy of the slice.
func (db *Database) Insert(hash types.Hash, blob []byte) {
	db.lock.Lock()
	defer db.lock.Unlock()

	db.insert(hash, blob)
}

// insert is the private locked version of Insert.
func (db *Database) insert(hash types.Hash, blob []byte) {
	// If the node's already cached, skip
	if _, ok := db.nodes[hash]; ok {
		return
	}
	db.nodes[hash] = &cachedNode{
		blob:      common.CopyBytes(blob),
		children:  make(map[types.Hash]int),
		flushPrev: db.newest,
	}
	// Update the flush-list endpoints
	//log.Tracef("Panic debug, insert hash: %x, db.oldest: %x", hash.KeyBytes, db.oldest.KeyBytes)
	if db.oldest == (types.Hash{}) {
		db.oldest, db.newest = hash, hash
	} else {
		//log.Tracef("Panic debug, insert hash: %x, get db.newest: %x", hash.KeyBytes, db.newest.KeyBytes)
		db.nodes[db.newest].flushNext, db.newest = hash, hash
	}
	db.nodesSize += common.StorageSize(types.HashLength + len(blob))
}

// insertPreimage writes a new trie node pre-image to the memory database if it's
// yet unknown. The method will make a copy of the slice.
//
// Note, this method assumes that the database's lock is held!
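//
// A caller holding the write lock would look roughly like this (an illustrative
// sketch, not code from this package; hash and preimage are assumed to come from
// the secure trie's hashing step):
//
//	db.lock.Lock()
//	db.insertPreimage(hash, preimage)
//	db.lock.Unlock()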
func (db *Database) insertPreimage(hash types.Hash, preimage []byte) {
	if _, ok := db.preimages[hash]; ok {
		return
	}
	db.preimages[hash] = common.CopyBytes(preimage)
	db.preimagesSize += common.StorageSize(types.HashLength + len(preimage))
}

// Node retrieves a cached trie node from memory. If it cannot be found cached,
// the method queries the persistent database for the content.
func (db *Database) Node(hash types.Hash) ([]byte, error) {
	// Retrieve the node from cache if available
	db.lock.RLock()
	node := db.nodes[hash]
	db.lock.RUnlock()

	if node != nil {
		return node.blob, nil
	}
	// Content unavailable in memory, attempt to retrieve from disk
	return db.diskdb.Get(hash.ToBytes())
}

// preimage retrieves a cached trie node pre-image from memory. If it cannot be
// found cached, the method queries the persistent database for the content.
func (db *Database) preimage(hash types.Hash) ([]byte, error) {
	// Retrieve the node from cache if available
	db.lock.RLock()
	preimage := db.preimages[hash]
	db.lock.RUnlock()

	if preimage != nil {
		return preimage, nil
	}
	// Content unavailable in memory, attempt to retrieve from disk
	return db.diskdb.Get(db.secureKey(hash.ToBytes()))
}

// secureKey returns the database key for the preimage of key, as an ephemeral
// buffer. The caller must not hold onto the return value because it will become
// invalid on the next call.
func (db *Database) secureKey(key []byte) []byte {
	buf := append(db.seckeybuf[:0], secureKeyPrefix...)
	buf = append(buf, key...)
	return buf
}

// Nodes retrieves the hashes of all the nodes cached within the memory database.
// This method is extremely expensive and should only be used to validate internal
// states in test code.
func (db *Database) Nodes() types.Hashes {
	db.lock.RLock()
	defer db.lock.RUnlock()

	var hashes = make(types.Hashes, 0, len(db.nodes))
	for hash := range db.nodes {
		if hash != (types.Hash{}) { // Special case for "root" references/nodes
			hashes = append(hashes, hash)
		}
	}
	return hashes
}

// Reference adds a new reference from a parent node to a child node.
func (db *Database) Reference(child types.Hash, parent types.Hash) {
	db.lock.RLock()
	defer db.lock.RUnlock()

	db.reference(child, parent)
}

// reference is the private locked version of Reference.
func (db *Database) reference(child types.Hash, parent types.Hash) {
	// If the node does not exist, it's a node pulled from disk, skip
	node, ok := db.nodes[child]
	if !ok {
		return
	}
	// If the reference already exists, only duplicate for roots
	if _, ok = db.nodes[parent].children[child]; ok && parent != (types.Hash{}) {
		return
	}
	node.parents++
	db.nodes[parent].children[child]++
}

// Dereference removes an existing reference from a parent node to a child node.
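//
// Together with Reference it implements the usual root-swapping pattern (a hedged
// sketch; oldRoot and newRoot are assumed to be trie roots produced by the caller,
// and the empty hash acts as the in-memory metaroot):
//
//	triedb.Reference(newRoot, types.Hash{})   // keep the freshly built trie alive
//	triedb.Dereference(oldRoot, types.Hash{}) // garbage collect the superseded one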
func (db *Database) Dereference(child types.Hash, parent types.Hash) {
	db.lock.Lock()
	defer db.lock.Unlock()

	nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
	db.dereference(child, parent)

	db.gcnodes += uint64(nodes - len(db.nodes))
	db.gcsize += storage - db.nodesSize
	db.gctime += time.Since(start)

	memcacheGCTimeTimer.Update(time.Since(start))
	memcacheGCSizeMeter.Mark(int64(storage - db.nodesSize))
	memcacheGCNodesMeter.Mark(int64(nodes - len(db.nodes)))

	log.Debug("Dereferenced trie from memory database", "nodes", nodes-len(db.nodes), "size", storage-db.nodesSize, "time", time.Since(start),
		"gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, "livenodes", len(db.nodes), "livesize", db.nodesSize)
}

// dereference is the private locked version of Dereference.
func (db *Database) dereference(child types.Hash, parent types.Hash) {
	// Dereference the parent-child
	node := db.nodes[parent]

	node.children[child]--
	if node.children[child] == 0 {
		delete(node.children, child)
	}
	// If the child does not exist, it's a previously committed node.
	node, ok := db.nodes[child]
	if !ok {
		return
	}
	// If there are no more references to the child, delete it and cascade
	node.parents--
	if node.parents == 0 {
		// Remove the node from the flush-list
		if child == db.oldest {
			db.oldest = node.flushNext
		} else {
			db.nodes[node.flushPrev].flushNext = node.flushNext
			db.nodes[node.flushNext].flushPrev = node.flushPrev
		}
		// Dereference all children and delete the node
		for hash := range node.children {
			db.dereference(hash, child)
		}
		delete(db.nodes, child)
		db.nodesSize -= common.StorageSize(types.HashLength + len(node.blob))
	}
}

// Cap iteratively flushes old but still referenced trie nodes until the total
// memory usage goes below the given threshold.
func (db *Database) Cap(limit common.StorageSize) error {
	// Create a database batch to flush persistent data out. It is important that
	// outside code doesn't see an inconsistent state (referenced data removed from
	// memory cache during commit but not yet in persistent storage). This is ensured
	// by only uncaching existing data when the database write finalizes.
	db.lock.RLock()

	nodes, storage, start := len(db.nodes), db.nodesSize, time.Now()
	batch := db.diskdb.NewBatch()

	// db.nodesSize only contains the useful data in the cache, but when reporting
	// the total memory consumption, the maintenance metadata also needs to be
	// counted. For every useful node, we track 2 extra hashes as the flushlist.
	size := db.nodesSize + common.StorageSize((len(db.nodes)-1)*2*types.HashLength)

	// If the preimage cache got large enough, push to disk. If it's still small
	// leave for later to deduplicate writes.
	flushPreimages := db.preimagesSize > 4*1024*1024
	if flushPreimages {
		for hash, preimage := range db.preimages {
			if err := batch.Put(db.secureKey(hash.ToBytes()), preimage); err != nil {
				log.Error("Failed to commit preimage from trie database", "err", err)
				db.lock.RUnlock()
				return err
			}
			if batch.ValueSize() > ogdb.IdealBatchSize {
				if err := batch.Write(); err != nil {
					db.lock.RUnlock()
					return err
				}
				batch.Reset()
			}
		}
	}
	// Keep committing nodes from the flush-list until we're below allowance
	oldest := db.oldest
	for size > limit && oldest != (types.Hash{}) {
		// Fetch the oldest referenced node and push into the batch
		node := db.nodes[oldest]
		if err := batch.Put(oldest.ToBytes(), node.blob); err != nil {
			db.lock.RUnlock()
			return err
		}
		// If we exceeded the ideal batch size, commit and reset
		if batch.ValueSize() >= ogdb.IdealBatchSize {
			if err := batch.Write(); err != nil {
				log.Error("Failed to write flush list to disk", "err", err)
				db.lock.RUnlock()
				return err
			}
			batch.Reset()
		}
		// Iterate to the next flush item, or abort if the size cap was achieved. Size
		// is the total size, including both the useful cached data (hash -> blob), as
		// well as the flushlist metadata (2*hash). When flushing items from the cache,
		// we need to reduce both.
		size -= common.StorageSize(3*types.HashLength + len(node.blob))
		oldest = node.flushNext
	}
	// Flush out any remaining data from the last batch
	if err := batch.Write(); err != nil {
		log.Error("Failed to write flush list to disk", "err", err)
		db.lock.RUnlock()
		return err
	}
	db.lock.RUnlock()

	// Write successful, clear out the flushed data
	db.lock.Lock()
	defer db.lock.Unlock()

	if flushPreimages {
		db.preimages = make(map[types.Hash][]byte)
		db.preimagesSize = 0
	}
	for db.oldest != oldest {
		node := db.nodes[db.oldest]
		delete(db.nodes, db.oldest)
		db.oldest = node.flushNext

		db.nodesSize -= common.StorageSize(types.HashLength + len(node.blob))
	}
	if db.oldest != (types.Hash{}) {
		db.nodes[db.oldest].flushPrev = types.Hash{}
	}
	db.flushnodes += uint64(nodes - len(db.nodes))
	db.flushsize += storage - db.nodesSize
	db.flushtime += time.Since(start)

	memcacheFlushTimeTimer.Update(time.Since(start))
	memcacheFlushSizeMeter.Mark(int64(storage - db.nodesSize))
	memcacheFlushNodesMeter.Mark(int64(nodes - len(db.nodes)))

	log.Debug("Persisted nodes from memory database", "nodes", nodes-len(db.nodes), "size", storage-db.nodesSize, "time", time.Since(start),
		"flushnodes", db.flushnodes, "flushsize", db.flushsize, "flushtime", db.flushtime, "livenodes", len(db.nodes), "livesize", db.nodesSize)

	return nil
}

// Commit iterates over all the children of a particular node, writes them out
// to disk, forcefully tearing down all references in both directions.
//
// As a side effect, all pre-images accumulated up to this point are also written.
func (db *Database) Commit(node types.Hash, report bool) error {
	// Create a database batch to flush persistent data out. It is important that
	// outside code doesn't see an inconsistent state (referenced data removed from
	// memory cache during commit but not yet in persistent storage). This is ensured
	// by only uncaching existing data when the database write finalizes.
	db.lock.RLock()

	start := time.Now()
	batch := db.diskdb.NewBatch()

	// Move all of the accumulated preimages into a write batch
	for hash, preimage := range db.preimages {
		if err := batch.Put(db.secureKey(hash.ToBytes()), preimage); err != nil {
			log.Error("Failed to commit preimage from trie database", "err", err)
			db.lock.RUnlock()
			return err
		}
		if batch.ValueSize() > ogdb.IdealBatchSize {
			if err := batch.Write(); err != nil {
				db.lock.RUnlock()
				return err
			}
			batch.Reset()
		}
	}
	// Move the trie itself into the batch, flushing if enough data is accumulated
	nodes, storage := len(db.nodes), db.nodesSize
	if err := db.commit(node, batch); err != nil {
		log.Error("Failed to commit trie from trie database", "err", err)
		db.lock.RUnlock()
		return err
	}
	// Write batch ready, unlock for readers during persistence
	if err := batch.Write(); err != nil {
		log.Error("Failed to write trie to disk", "err", err)
		db.lock.RUnlock()
		return err
	}
	db.lock.RUnlock()

	// Write successful, clear out the flushed data
	db.lock.Lock()
	defer db.lock.Unlock()

	db.preimages = make(map[types.Hash][]byte)
	db.preimagesSize = 0

	db.uncache(node)

	memcacheCommitTimeTimer.Update(time.Since(start))
	memcacheCommitSizeMeter.Mark(int64(storage - db.nodesSize))
	memcacheCommitNodesMeter.Mark(int64(nodes - len(db.nodes)))

	logger := log.Info
	if !report {
		logger = log.Debug
	}
	logger("Persisted trie from memory database", "nodes", nodes-len(db.nodes)+int(db.flushnodes), "size", storage-db.nodesSize+db.flushsize, "time", time.Since(start)+db.flushtime,
		"gcnodes", db.gcnodes, "gcsize", db.gcsize, "gctime", db.gctime, "livenodes", len(db.nodes), "livesize", db.nodesSize)

	// Reset the garbage collection statistics
	db.gcnodes, db.gcsize, db.gctime = 0, 0, 0
	db.flushnodes, db.flushsize, db.flushtime = 0, 0, 0

	return nil
}

// commit is the private locked version of Commit.
func (db *Database) commit(hash types.Hash, batch ogdb.Batch) error {
	// If the node does not exist, it's a previously committed node
	node, ok := db.nodes[hash]
	if !ok {
		return nil
	}
	for child := range node.children {
		if err := db.commit(child, batch); err != nil {
			return err
		}
	}
	if err := batch.Put(hash.ToBytes(), node.blob); err != nil {
		return err
	}
	// If we've reached an optimal batch size, commit and start over
	if batch.ValueSize() >= ogdb.IdealBatchSize {
		if err := batch.Write(); err != nil {
			return err
		}
		batch.Reset()
	}
	return nil
}

// uncache is the post-processing step of a commit operation where the already
// persisted trie is removed from the cache. The reason behind the two-phase
// commit is to ensure consistent data availability while moving from memory
// to disk.
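// uncache walks the same child references that commit just persisted, so exactly
// the written-out subtrie is evicted from memory and nothing else.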
func (db *Database) uncache(hash types.Hash) {
	//log.Tracef("Panic debug, uncache the node: %x, cur db.oldest: %x", hash.KeyBytes, db.oldest.KeyBytes)
	// If the node does not exist, we're done on this path
	node, ok := db.nodes[hash]
	if !ok {
		return
	}
	// Node still exists, remove it from the flush-list
	if hash == db.oldest {
		//log.Tracef("Panic debug, uncache the node: %x, set oldest to: %x", hash.KeyBytes, node.flushNext.KeyBytes)
		db.oldest = node.flushNext
	} else {
		//log.Tracef("Panic debug, uncache the node: %x, delete node between next: %x, prev: %x", hash.KeyBytes, node.flushNext.KeyBytes, node.flushPrev.KeyBytes)
		db.nodes[node.flushPrev].flushNext = node.flushNext
		db.nodes[node.flushNext].flushPrev = node.flushPrev
	}
	// Uncache the node's subtries and remove the node itself too
	for child := range node.children {
		db.uncache(child)
	}
	delete(db.nodes, hash)
	db.nodesSize -= common.StorageSize(types.HashLength + len(node.blob))
}

// Size returns the current storage size of the memory cache in front of the
// persistent database layer.
func (db *Database) Size() (common.StorageSize, common.StorageSize) {
	db.lock.RLock()
	defer db.lock.RUnlock()

	// db.nodesSize only contains the useful data in the cache, but when reporting
	// the total memory consumption, the maintenance metadata also needs to be
	// counted. For every useful node, we track 2 extra hashes as the flushlist.
	var flushlistSize = common.StorageSize((len(db.nodes) - 1) * 2 * types.HashLength)
	return db.nodesSize + flushlistSize, db.preimagesSize
}
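// A hedged sketch of how a caller might keep the in-memory cache bounded using
// Size and Cap (the ticker interval and the 256/128 MB limits are illustrative
// assumptions, not values used anywhere in this package):
//
//	go func() {
//		for range time.Tick(time.Minute) {
//			if nodes, _ := triedb.Size(); nodes > 256*1024*1024 {
//				if err := triedb.Cap(128 * 1024 * 1024); err != nil {
//					log.Error("Failed to cap trie cache", "err", err)
//				}
//			}
//		}
//	}()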