github.com/ethereum/go-ethereum@v1.16.1/ethdb/pebble/pebble.go

// Copyright 2023 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

// Package pebble implements the key-value database layer based on pebble.
package pebble

import (
	"fmt"
	"runtime"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/bloom"
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/log"
	"github.com/ethereum/go-ethereum/metrics"
)

const (
	// minCache is the minimum amount of memory in megabytes to allocate to pebble
	// read and write caching, split half and half.
	minCache = 16

	// minHandles is the minimum number of file handles to allocate to the open
	// database files.
	minHandles = 16

	// metricsGatheringInterval specifies the interval to retrieve pebble database
	// compaction, io and pause stats to report to the user.
	metricsGatheringInterval = 3 * time.Second

	// degradationWarnInterval specifies how often a warning should be printed if the
	// pebble database cannot keep up with requested writes.
	degradationWarnInterval = time.Minute
)

// Database is a persistent key-value store based on the pebble storage engine.
// Apart from basic data storage functionality it also supports batch writes and
// iterating over the keyspace in binary-alphabetical order.
type Database struct {
	fn        string     // filename for reporting
	db        *pebble.DB // Underlying pebble storage engine
	namespace string     // Namespace for metrics

	compTimeMeter       *metrics.Meter // Meter for measuring the total time spent in database compaction
	compReadMeter       *metrics.Meter // Meter for measuring the data read during compaction
	compWriteMeter      *metrics.Meter // Meter for measuring the data written during compaction
	writeDelayNMeter    *metrics.Meter // Meter for measuring the write delay number due to database compaction
	writeDelayMeter     *metrics.Meter // Meter for measuring the write delay duration due to database compaction
	diskSizeGauge       *metrics.Gauge // Gauge for tracking the size of all the levels in the database
	diskReadMeter       *metrics.Meter // Meter for measuring the effective amount of data read
	diskWriteMeter      *metrics.Meter // Meter for measuring the effective amount of data written
	memCompGauge        *metrics.Gauge // Gauge for tracking the number of memory compactions
	level0CompGauge     *metrics.Gauge // Gauge for tracking the number of table compactions in level0
	nonlevel0CompGauge  *metrics.Gauge // Gauge for tracking the number of table compactions in non-level0 levels
	seekCompGauge       *metrics.Gauge // Gauge for tracking the number of table compactions caused by read opt
	manualMemAllocGauge *metrics.Gauge // Gauge for tracking the amount of non-managed memory currently allocated

	liveMemTablesGauge     *metrics.Gauge // Gauge for tracking the number of live memory tables
	zombieMemTablesGauge   *metrics.Gauge // Gauge for tracking the number of zombie memory tables
	blockCacheHitGauge     *metrics.Gauge // Gauge for tracking the number of total hits in the block cache
	blockCacheMissGauge    *metrics.Gauge // Gauge for tracking the number of total misses in the block cache
	tableCacheHitGauge     *metrics.Gauge // Gauge for tracking the number of total hits in the table cache
	tableCacheMissGauge    *metrics.Gauge // Gauge for tracking the number of total misses in the table cache
	filterHitGauge         *metrics.Gauge // Gauge for tracking the number of total hits in the bloom filter
	filterMissGauge        *metrics.Gauge // Gauge for tracking the number of total misses in the bloom filter
	estimatedCompDebtGauge *metrics.Gauge // Gauge for tracking the number of bytes that need to be compacted
	liveCompGauge          *metrics.Gauge // Gauge for tracking the number of in-progress compactions
	liveCompSizeGauge      *metrics.Gauge // Gauge for tracking the size of in-progress compactions
	liveIterGauge          *metrics.Gauge // Gauge for tracking the number of live database iterators
	levelsGauge            []*metrics.Gauge // Gauge for tracking the number of tables in levels

	quitLock sync.RWMutex    // Mutex protecting the quit channel and the closed flag
	quitChan chan chan error // Quit channel to stop the metrics collection before closing the database
	closed   bool            // keep track of whether we're Closed

	log log.Logger // Contextual logger tracking the database path

	activeComp    int           // Current number of active compactions
	compStartTime time.Time     // The start time of the earliest currently-active compaction
	compTime      atomic.Int64  // Total time spent in compaction in ns
	level0Comp    atomic.Uint32 // Total number of level-zero compactions
	nonLevel0Comp atomic.Uint32 // Total number of non level-zero compactions

	writeStalled        atomic.Bool // Flag whether the write is stalled
	writeDelayStartTime time.Time   // The start time of the latest write stall
	writeDelayReason    string      // The reason of the latest write stall
	writeDelayCount     atomic.Int64 // Total number of write stall counts
	writeDelayTime      atomic.Int64 // Total time spent in write stalls

	writeOptions *pebble.WriteOptions
}

func (d *Database) onCompactionBegin(info pebble.CompactionInfo) {
	if d.activeComp == 0 {
		d.compStartTime = time.Now()
	}
	l0 := info.Input[0]
	if l0.Level == 0 {
		d.level0Comp.Add(1)
	} else {
		d.nonLevel0Comp.Add(1)
	}
	d.activeComp++
}

func (d *Database) onCompactionEnd(info pebble.CompactionInfo) {
	if d.activeComp == 1 {
		d.compTime.Add(int64(time.Since(d.compStartTime)))
	} else if d.activeComp == 0 {
		panic("should not happen")
	}
	d.activeComp--
}

func (d *Database) onWriteStallBegin(b pebble.WriteStallBeginInfo) {
	d.writeDelayStartTime = time.Now()
	d.writeDelayCount.Add(1)
	d.writeStalled.Store(true)

	// Take just the first word of the reason. These are the two potential
	// reasons for a write stall:
	//   - memtable count limit reached
	//   - L0 file count limit exceeded
	reason := b.Reason
	if i := strings.IndexByte(reason, ' '); i != -1 {
		reason = reason[:i]
	}
	if reason == "L0" || reason == "memtable" {
		d.writeDelayReason = reason
		metrics.GetOrRegisterGauge(d.namespace+"stall/count/"+reason, nil).Inc(1)
	}
}

func (d *Database) onWriteStallEnd() {
	d.writeDelayTime.Add(int64(time.Since(d.writeDelayStartTime)))
	d.writeStalled.Store(false)

	if d.writeDelayReason != "" {
		metrics.GetOrRegisterResettingTimer(d.namespace+"stall/time/"+d.writeDelayReason, nil).UpdateSince(d.writeDelayStartTime)
		d.writeDelayReason = ""
	}
	d.writeDelayStartTime = time.Time{}
}

// panicLogger is just a noop logger to disable Pebble's internal logger.
//
// TODO(karalabe): Remove when Pebble sets this as the default.
type panicLogger struct{}

func (l panicLogger) Infof(format string, args ...interface{}) {
}

func (l panicLogger) Errorf(format string, args ...interface{}) {
}

func (l panicLogger) Fatalf(format string, args ...interface{}) {
	panic(fmt.Errorf("fatal: "+format, args...))
}

// New returns a wrapped pebble DB object. The namespace is the prefix that the
// metrics reporting should use for surfacing internal stats.
func New(file string, cache int, handles int, namespace string, readonly bool) (*Database, error) {
	// Ensure we have some minimal caching and file guarantees
	if cache < minCache {
		cache = minCache
	}
	if handles < minHandles {
		handles = minHandles
	}
	logger := log.New("database", file)
	logger.Info("Allocated cache and file handles", "cache", common.StorageSize(cache*1024*1024), "handles", handles)

	// The max memtable size is limited by the uint32 offsets stored in
	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
	//
	//   - MaxUint32 on 64-bit platforms;
	//   - MaxInt on 32-bit platforms.
	//
	// It is used when slices are limited to Uint32 on 64-bit platforms (the
	// length limit for slices is naturally MaxInt on 32-bit platforms).
	//
	// Taken from https://github.com/cockroachdb/pebble/blob/master/internal/constants/constants.go
	maxMemTableSize := (1<<31)<<(^uint(0)>>63) - 1
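
	// Illustrative note on the arithmetic: on a 64-bit platform ^uint(0)>>63
	// evaluates to 1, so the expression above becomes (1<<31)<<1 - 1, i.e.
	// 1<<32 - 1 = MaxUint32. On a 32-bit platform the shift amount is 0 and
	// the result is 1<<31 - 1 = MaxInt32, which equals MaxInt there.
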
	// Four memory tables are configured, each with a default size of 256 MB.
	// Having multiple smaller memory tables while keeping the total memory
	// limit unchanged allows writes to be flushed more smoothly. This helps
	// avoid compaction spikes and mitigates write stalls caused by heavy
	// compaction workloads.
	memTableLimit := 4
	memTableSize := cache * 1024 * 1024 / 2 / memTableLimit

	// The memory table size is currently capped at maxMemTableSize-1 due to a
	// known bug in pebble where maxMemTableSize is not recognized as a
	// valid size.
	//
	// TODO use maxMemTableSize as the maximum table size once the issue
	// in pebble is fixed.
	if memTableSize >= maxMemTableSize {
		memTableSize = maxMemTableSize - 1
	}
	db := &Database{
		fn:       file,
		log:      logger,
		quitChan: make(chan chan error),

		// Use asynchronous write mode by default. Otherwise, the overhead of frequent fsync
		// operations can be significant, especially on platforms with slow fsync performance
		// (e.g., macOS) or less capable SSDs.
		//
		// Note that enabling async writes means recent data may be lost in the event of an
		// application-level panic (writes will also be lost on a machine-level failure,
		// of course). Geth is expected to handle recovery from an unclean shutdown.
		writeOptions: pebble.NoSync,
	}
	opt := &pebble.Options{
		// Pebble has a single combined cache area and the write
		// buffers are taken from this too. Assign all available
		// memory allowance for cache.
		Cache:        pebble.NewCache(int64(cache * 1024 * 1024)),
		MaxOpenFiles: handles,

		// The size of the memory table (as well as the write buffer).
		// Note, there may be more than two memory tables in the system.
		MemTableSize: uint64(memTableSize),

		// MemTableStopWritesThreshold places a hard limit on the number
		// of existent MemTables (including the frozen ones).
		// Note, this must be the number of tables, not the size of all memtables,
		// according to https://github.com/cockroachdb/pebble/blob/master/options.go#L738-L742
		// and to https://github.com/cockroachdb/pebble/blob/master/db.go#L1892-L1903.
		MemTableStopWritesThreshold: memTableLimit,

		// The default compaction concurrency is 1 thread;
		// here all available CPUs are used for faster compaction.
		MaxConcurrentCompactions: runtime.NumCPU,

		// Per-level options. Options for at least one level must be specified. The
		// options for the last level are used for all subsequent levels.
		Levels: []pebble.LevelOptions{
			{TargetFileSize: 2 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 4 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 8 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 16 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 32 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 64 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
			{TargetFileSize: 128 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
		},
		ReadOnly: readonly,
		EventListener: &pebble.EventListener{
			CompactionBegin: db.onCompactionBegin,
			CompactionEnd:   db.onCompactionEnd,
			WriteStallBegin: db.onWriteStallBegin,
			WriteStallEnd:   db.onWriteStallEnd,
		},
		Logger: panicLogger{}, // TODO(karalabe): Delete when this is upstreamed in Pebble

		// Pebble is configured to use asynchronous write mode, meaning write operations
		// return as soon as the data is cached in memory, without waiting for the WAL
		// to be written. This mode offers better write performance but risks losing
		// recent writes if the application crashes or a power failure/system crash occurs.
		//
		// By setting WALBytesPerSync, the cached WAL writes will be periodically
		// flushed in the background once the accumulated size exceeds this threshold.
		WALBytesPerSync: 5 * ethdb.IdealBatchSize,

		// L0CompactionThreshold specifies the amount of L0 read-amplification
		// necessary to trigger an L0 compaction. It essentially refers to the
		// number of sub-levels at L0. Each sub-level contains several L0 files
		// which are non-overlapping with each other, typically produced by a
		// single memory-table flush.
		//
		// The default value in Pebble is 4, which is a bit too large and leaves
		// the compaction debt at around 10GB. By reducing it to 2, the compaction
		// debt will be less than 1GB, but with more frequent compactions scheduled.
		L0CompactionThreshold: 2,
	}
	// Disable seek compaction explicitly. Check https://github.com/ethereum/go-ethereum/pull/20130
	// for more details.
	opt.Experimental.ReadSamplingMultiplier = -1

	// Open the db and recover any potential corruptions
	innerDB, err := pebble.Open(file, opt)
	if err != nil {
		return nil, err
	}
	db.db = innerDB

	db.compTimeMeter = metrics.GetOrRegisterMeter(namespace+"compact/time", nil)
	db.compReadMeter = metrics.GetOrRegisterMeter(namespace+"compact/input", nil)
	db.compWriteMeter = metrics.GetOrRegisterMeter(namespace+"compact/output", nil)
	db.diskSizeGauge = metrics.GetOrRegisterGauge(namespace+"disk/size", nil)
	db.diskReadMeter = metrics.GetOrRegisterMeter(namespace+"disk/read", nil)
	db.diskWriteMeter = metrics.GetOrRegisterMeter(namespace+"disk/write", nil)
	db.writeDelayMeter = metrics.GetOrRegisterMeter(namespace+"compact/writedelay/duration", nil)
	db.writeDelayNMeter = metrics.GetOrRegisterMeter(namespace+"compact/writedelay/counter", nil)
	db.memCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/memory", nil)
	db.level0CompGauge = metrics.GetOrRegisterGauge(namespace+"compact/level0", nil)
	db.nonlevel0CompGauge = metrics.GetOrRegisterGauge(namespace+"compact/nonlevel0", nil)
	db.seekCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/seek", nil)
	db.manualMemAllocGauge = metrics.GetOrRegisterGauge(namespace+"memory/manualalloc", nil)
	db.liveMemTablesGauge = metrics.GetOrRegisterGauge(namespace+"table/live", nil)
	db.zombieMemTablesGauge = metrics.GetOrRegisterGauge(namespace+"table/zombie", nil)
	db.blockCacheHitGauge = metrics.GetOrRegisterGauge(namespace+"cache/block/hit", nil)
	db.blockCacheMissGauge = metrics.GetOrRegisterGauge(namespace+"cache/block/miss", nil)
	db.tableCacheHitGauge = metrics.GetOrRegisterGauge(namespace+"cache/table/hit", nil)
	db.tableCacheMissGauge = metrics.GetOrRegisterGauge(namespace+"cache/table/miss", nil)
	db.filterHitGauge = metrics.GetOrRegisterGauge(namespace+"filter/hit", nil)
	db.filterMissGauge = metrics.GetOrRegisterGauge(namespace+"filter/miss", nil)
	db.estimatedCompDebtGauge = metrics.GetOrRegisterGauge(namespace+"compact/estimateDebt", nil)
	db.liveCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/live/count", nil)
	db.liveCompSizeGauge = metrics.GetOrRegisterGauge(namespace+"compact/live/size", nil)
	db.liveIterGauge = metrics.GetOrRegisterGauge(namespace+"iter/count", nil)

	// Start up the metrics gathering and return
	go db.meter(metricsGatheringInterval, namespace)
	return db, nil
}

// Close stops the metrics collection, flushes any pending data to disk and closes
// all io accesses to the underlying key-value store.
func (d *Database) Close() error {
	d.quitLock.Lock()
	defer d.quitLock.Unlock()
	// Allow double closing, simplifies things
	if d.closed {
		return nil
	}
	d.closed = true
	if d.quitChan != nil {
		errc := make(chan error)
		d.quitChan <- errc
		if err := <-errc; err != nil {
			d.log.Error("Metrics collection failed", "err", err)
		}
		d.quitChan = nil
	}
	return d.db.Close()
}
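
// exampleOpenAndPut is an illustrative sketch of how a caller typically uses
// this wrapper: open a store with New, write and read through it, then close
// it. The path, cache, handle and namespace values below are arbitrary
// examples, not taken from this repository.
func exampleOpenAndPut() error {
	db, err := New("/tmp/example-chaindata", 512, 128, "eth/db/chaindata/", false)
	if err != nil {
		return err
	}
	defer db.Close()

	if err := db.Put([]byte("key"), []byte("value")); err != nil {
		return err
	}
	val, err := db.Get([]byte("key")) // Get returns a private copy of the stored value
	if err != nil {
		return err
	}
	_ = val
	return nil
}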

// Has retrieves if a key is present in the key-value store.
func (d *Database) Has(key []byte) (bool, error) {
	d.quitLock.RLock()
	defer d.quitLock.RUnlock()
	if d.closed {
		return false, pebble.ErrClosed
	}
	_, closer, err := d.db.Get(key)
	if err == pebble.ErrNotFound {
		return false, nil
	} else if err != nil {
		return false, err
	}
	if err = closer.Close(); err != nil {
		return false, err
	}
	return true, nil
}

// Get retrieves the given key if it's present in the key-value store.
func (d *Database) Get(key []byte) ([]byte, error) {
	d.quitLock.RLock()
	defer d.quitLock.RUnlock()
	if d.closed {
		return nil, pebble.ErrClosed
	}
	dat, closer, err := d.db.Get(key)
	if err != nil {
		return nil, err
	}
	ret := make([]byte, len(dat))
	copy(ret, dat)
	if err = closer.Close(); err != nil {
		return nil, err
	}
	return ret, nil
}

// Put inserts the given value into the key-value store.
func (d *Database) Put(key []byte, value []byte) error {
	d.quitLock.RLock()
	defer d.quitLock.RUnlock()
	if d.closed {
		return pebble.ErrClosed
	}
	return d.db.Set(key, value, d.writeOptions)
}

// Delete removes the key from the key-value store.
func (d *Database) Delete(key []byte) error {
	d.quitLock.RLock()
	defer d.quitLock.RUnlock()
	if d.closed {
		return pebble.ErrClosed
	}
	return d.db.Delete(key, d.writeOptions)
}

// DeleteRange deletes all of the keys (and values) in the range [start,end)
// (inclusive on start, exclusive on end).
func (d *Database) DeleteRange(start, end []byte) error {
	d.quitLock.RLock()
	defer d.quitLock.RUnlock()

	if d.closed {
		return pebble.ErrClosed
	}
	// There is no special flag to represent the end of the key range
	// in pebble (nil in leveldb). Use an ugly hack to construct a
	// large key to represent it.
	if end == nil {
		end = ethdb.MaximumKey
	}
	return d.db.DeleteRange(start, end, d.writeOptions)
}

// NewBatch creates a write-only key-value store that buffers changes to its host
// database until a final write is called.
func (d *Database) NewBatch() ethdb.Batch {
	return &batch{
		b:  d.db.NewBatch(),
		db: d,
	}
}

// NewBatchWithSize creates a write-only database batch with pre-allocated buffer.
func (d *Database) NewBatchWithSize(size int) ethdb.Batch {
	return &batch{
		b:  d.db.NewBatchWithSize(size),
		db: d,
	}
}

// upperBound returns the upper bound for the given prefix
func upperBound(prefix []byte) (limit []byte) {
	for i := len(prefix) - 1; i >= 0; i-- {
		c := prefix[i]
		if c == 0xff {
			continue
		}
		limit = make([]byte, i+1)
		copy(limit, prefix)
		limit[i] = c + 1
		break
	}
	return limit
}
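
// As a concrete illustration of the bound computation: upperBound returns the
// smallest key strictly greater than every key carrying the prefix, so it can
// serve as an exclusive iteration limit. For instance:
//
//	upperBound([]byte{0x01, 0x02}) == []byte{0x01, 0x03}
//	upperBound([]byte{0x01, 0xff}) == []byte{0x02}
//	upperBound([]byte{0xff, 0xff}) == nil // no finite upper bound exists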

// Stat returns the internal metrics of Pebble in a text format. It's a developer
// method to read everything there is to read, independent of the Pebble version.
func (d *Database) Stat() (string, error) {
	return d.db.Metrics().String(), nil
}

// Compact flattens the underlying data store for the given key range. In essence,
// deleted and overwritten versions are discarded, and the data is rearranged to
// reduce the cost of operations needed to access them.
//
// A nil start is treated as a key before all keys in the data store; a nil limit
// is treated as a key after all keys in the data store. If both are nil then it
// will compact the entire data store.
func (d *Database) Compact(start []byte, limit []byte) error {
	// There is no special flag to represent the end of the key range
	// in pebble (nil in leveldb). Use an ugly hack to construct a
	// large key to represent it.
	// Note any prefixed database entry will be smaller than this
	// flag, as for trie nodes we need the 32 byte 0xff because
	// there might be a shared prefix starting with a number of
	// 0xff-s, so 32 ensures that only a hash collision could touch it.
	// https://github.com/cockroachdb/pebble/issues/2359#issuecomment-1443995833
	if limit == nil {
		limit = ethdb.MaximumKey
	}
	return d.db.Compact(start, limit, true) // Parallelization is preferred
}

// Path returns the path to the database directory.
func (d *Database) Path() string {
	return d.fn
}

// SyncKeyValue flushes all pending writes in the write-ahead-log to disk,
// ensuring data durability up to that point.
func (d *Database) SyncKeyValue() error {
	// The entry (value=nil) is not written to the database; it is only
	// added to the WAL. Writing this special log entry in sync mode
	// automatically flushes all previous writes, ensuring database
	// durability up to this point.
	b := d.db.NewBatch()
	b.LogData(nil, nil)
	return d.db.Apply(b, pebble.Sync)
}
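
// exampleDurableWrite is an illustrative sketch (assumed usage, not taken from
// the callers in this repository): since Put and batch commits run with
// pebble.NoSync, a caller that needs a durability barrier follows them with
// SyncKeyValue.
func exampleDurableWrite(db *Database, key, value []byte) error {
	if err := db.Put(key, value); err != nil {
		return err
	}
	// Force the WAL to disk; everything written before this point is durable.
	return db.SyncKeyValue()
}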

// meter periodically retrieves internal pebble counters and reports them to
// the metrics subsystem.
func (d *Database) meter(refresh time.Duration, namespace string) {
	var errc chan error
	timer := time.NewTimer(refresh)
	defer timer.Stop()

	// Create storage and warning log tracer for write delay.
	var (
		compTimes  [2]int64
		compWrites [2]int64
		compReads  [2]int64

		nWrites [2]int64

		writeDelayTimes      [2]int64
		writeDelayCounts     [2]int64
		lastWriteStallReport time.Time
	)

	// Iterate ad infinitum and collect the stats
	for i := 1; errc == nil; i++ {
		var (
			compWrite int64
			compRead  int64
			nWrite    int64

			stats              = d.db.Metrics()
			compTime           = d.compTime.Load()
			writeDelayCount    = d.writeDelayCount.Load()
			writeDelayTime     = d.writeDelayTime.Load()
			nonLevel0CompCount = int64(d.nonLevel0Comp.Load())
			level0CompCount    = int64(d.level0Comp.Load())
		)
		writeDelayTimes[i%2] = writeDelayTime
		writeDelayCounts[i%2] = writeDelayCount
		compTimes[i%2] = compTime

		for _, levelMetrics := range stats.Levels {
			nWrite += int64(levelMetrics.BytesCompacted)
			nWrite += int64(levelMetrics.BytesFlushed)
			compWrite += int64(levelMetrics.BytesCompacted)
			compRead += int64(levelMetrics.BytesRead)
		}

		nWrite += int64(stats.WAL.BytesWritten)

		compWrites[i%2] = compWrite
		compReads[i%2] = compRead
		nWrites[i%2] = nWrite

		d.writeDelayNMeter.Mark(writeDelayCounts[i%2] - writeDelayCounts[(i-1)%2])
		d.writeDelayMeter.Mark(writeDelayTimes[i%2] - writeDelayTimes[(i-1)%2])
		// Print a warning log if writing has been stalled for a while. The log will
		// be printed per minute to avoid overwhelming users.
		if d.writeStalled.Load() && writeDelayCounts[i%2] == writeDelayCounts[(i-1)%2] &&
			time.Now().After(lastWriteStallReport.Add(degradationWarnInterval)) {
			d.log.Warn("Database compacting, degraded performance")
			lastWriteStallReport = time.Now()
		}
		d.compTimeMeter.Mark(compTimes[i%2] - compTimes[(i-1)%2])
		d.compReadMeter.Mark(compReads[i%2] - compReads[(i-1)%2])
		d.compWriteMeter.Mark(compWrites[i%2] - compWrites[(i-1)%2])
		d.diskSizeGauge.Update(int64(stats.DiskSpaceUsage()))
		d.diskReadMeter.Mark(0) // pebble doesn't track non-compaction reads
		d.diskWriteMeter.Mark(nWrites[i%2] - nWrites[(i-1)%2])

		// See https://github.com/cockroachdb/pebble/pull/1628#pullrequestreview-1026664054
		manuallyAllocated := stats.BlockCache.Size + int64(stats.MemTable.Size) + int64(stats.MemTable.ZombieSize)
		d.manualMemAllocGauge.Update(manuallyAllocated)
		d.memCompGauge.Update(stats.Flush.Count)
		d.nonlevel0CompGauge.Update(nonLevel0CompCount)
		d.level0CompGauge.Update(level0CompCount)
		d.seekCompGauge.Update(stats.Compact.ReadCount)
		d.liveCompGauge.Update(stats.Compact.NumInProgress)
		d.liveCompSizeGauge.Update(stats.Compact.InProgressBytes)
		d.liveIterGauge.Update(stats.TableIters)

		d.liveMemTablesGauge.Update(stats.MemTable.Count)
		d.zombieMemTablesGauge.Update(stats.MemTable.ZombieCount)
		d.estimatedCompDebtGauge.Update(int64(stats.Compact.EstimatedDebt))
		d.tableCacheHitGauge.Update(stats.TableCache.Hits)
		d.tableCacheMissGauge.Update(stats.TableCache.Misses)
		d.blockCacheHitGauge.Update(stats.BlockCache.Hits)
		d.blockCacheMissGauge.Update(stats.BlockCache.Misses)
		d.filterHitGauge.Update(stats.Filter.Hits)
		d.filterMissGauge.Update(stats.Filter.Misses)

		for i, level := range stats.Levels {
			// Append metrics for additional layers
			if i >= len(d.levelsGauge) {
				d.levelsGauge = append(d.levelsGauge, metrics.GetOrRegisterGauge(namespace+fmt.Sprintf("tables/level%v", i), nil))
			}
			d.levelsGauge[i].Update(level.NumFiles)
		}

		// Sleep a bit, then repeat the stats collection
		select {
		case errc = <-d.quitChan:
			// Quit requested, stop hammering the database
		case <-timer.C:
			timer.Reset(refresh)
			// Timeout, gather a new set of stats
		}
	}
	errc <- nil
}

// batch is a write-only batch that commits changes to its host database
// when Write is called. A batch cannot be used concurrently.
type batch struct {
	b    *pebble.Batch
	db   *Database
	size int
}

// Put inserts the given value into the batch for later committing.
func (b *batch) Put(key, value []byte) error {
	if err := b.b.Set(key, value, nil); err != nil {
		return err
	}
	b.size += len(key) + len(value)
	return nil
}

// Delete inserts the key removal into the batch for later committing.
func (b *batch) Delete(key []byte) error {
	if err := b.b.Delete(key, nil); err != nil {
		return err
	}
	b.size += len(key)
	return nil
}

// DeleteRange removes all keys in the range [start, end) from the batch for
// later committing, inclusive on start, exclusive on end.
func (b *batch) DeleteRange(start, end []byte) error {
	// There is no special flag to represent the end of the key range
	// in pebble (nil in leveldb). Use an ugly hack to construct a
	// large key to represent it.
	if end == nil {
		end = ethdb.MaximumKey
	}
	if err := b.b.DeleteRange(start, end, nil); err != nil {
		return err
	}
	// Approximate size impact - just the keys
	b.size += len(start) + len(end)
	return nil
}

// ValueSize retrieves the amount of data queued up for writing.
func (b *batch) ValueSize() int {
	return b.size
}

// Write flushes any accumulated data to disk.
func (b *batch) Write() error {
	b.db.quitLock.RLock()
	defer b.db.quitLock.RUnlock()
	if b.db.closed {
		return pebble.ErrClosed
	}
	return b.b.Commit(b.db.writeOptions)
}

// Reset resets the batch for reuse.
func (b *batch) Reset() {
	b.b.Reset()
	b.size = 0
}

// Replay replays the batch contents.
func (b *batch) Replay(w ethdb.KeyValueWriter) error {
	reader := b.b.Reader()
	for {
		kind, k, v, ok, err := reader.Next()
		if !ok || err != nil {
			return err
		}
		// The (k,v) slices might be overwritten if the batch is reset/reused,
		// and the receiver should copy them if they are to be retained long-term.
		if kind == pebble.InternalKeyKindSet {
			if err = w.Put(k, v); err != nil {
				return err
			}
		} else if kind == pebble.InternalKeyKindDelete {
			if err = w.Delete(k); err != nil {
				return err
			}
		} else if kind == pebble.InternalKeyKindRangeDelete {
			// For range deletion, k is the start key and v is the end key
			if rangeDeleter, ok := w.(ethdb.KeyValueRangeDeleter); ok {
				if err = rangeDeleter.DeleteRange(k, v); err != nil {
					return err
				}
			} else {
				return fmt.Errorf("ethdb.KeyValueWriter does not implement DeleteRange")
			}
		} else {
			return fmt.Errorf("unhandled operation, keytype: %v", kind)
		}
	}
}
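
// exampleBatchedWrites is an illustrative sketch (assumed usage, not part of
// this file): changes are staged in a batch and only reach the database when
// Write is called, with large updates typically flushed whenever the batch
// grows past ethdb.IdealBatchSize.
func exampleBatchedWrites(db *Database, keys, vals [][]byte) error {
	b := db.NewBatch()
	for i := range keys {
		if err := b.Put(keys[i], vals[i]); err != nil {
			return err
		}
		if b.ValueSize() >= ethdb.IdealBatchSize {
			if err := b.Write(); err != nil {
				return err
			}
			b.Reset()
		}
	}
	return b.Write()
}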

// pebbleIterator is a wrapper of the underlying iterator in the storage engine.
// The purpose of this structure is to implement the missing APIs.
//
// The pebble iterator is not thread-safe.
type pebbleIterator struct {
	iter     *pebble.Iterator
	moved    bool
	released bool
}

// NewIterator creates a binary-alphabetical iterator over a subset
// of database content with a particular key prefix, starting at a particular
// initial key (or after, if it does not exist).
func (d *Database) NewIterator(prefix []byte, start []byte) ethdb.Iterator {
	iter, _ := d.db.NewIter(&pebble.IterOptions{
		LowerBound: append(prefix, start...),
		UpperBound: upperBound(prefix),
	})
	iter.First()
	return &pebbleIterator{iter: iter, moved: true, released: false}
}

// Next moves the iterator to the next key/value pair. It returns whether the
// iterator is exhausted.
func (iter *pebbleIterator) Next() bool {
	if iter.moved {
		iter.moved = false
		return iter.iter.Valid()
	}
	return iter.iter.Next()
}

// Error returns any accumulated error. Exhausting all the key/value pairs
// is not considered to be an error.
func (iter *pebbleIterator) Error() error {
	return iter.iter.Error()
}

// Key returns the key of the current key/value pair, or nil if done. The caller
// should not modify the contents of the returned slice, and its contents may
// change on the next call to Next.
func (iter *pebbleIterator) Key() []byte {
	return iter.iter.Key()
}

// Value returns the value of the current key/value pair, or nil if done. The
// caller should not modify the contents of the returned slice, and its contents
// may change on the next call to Next.
func (iter *pebbleIterator) Value() []byte {
	return iter.iter.Value()
}

// Release releases associated resources. Release should always succeed and can
// be called multiple times without causing error.
func (iter *pebbleIterator) Release() {
	if !iter.released {
		iter.iter.Close()
		iter.released = true
	}
}
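
// examplePrefixScan is an illustrative sketch (assumed usage, not part of this
// file): iterate all entries under a prefix, copying the key/value slices since
// they are only valid until the next call to Next.
func examplePrefixScan(db *Database, prefix []byte) (map[string][]byte, error) {
	it := db.NewIterator(prefix, nil)
	defer it.Release()

	found := make(map[string][]byte)
	for it.Next() {
		found[string(it.Key())] = common.CopyBytes(it.Value())
	}
	return found, it.Error()
}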