github.com/ethereum/go-ethereum@v1.16.1/ethdb/pebble/pebble.go (about)

     1  // Copyright 2023 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  // Package pebble implements the key-value database layer based on pebble.
    18  package pebble
    19  
    20  import (
    21  	"fmt"
    22  	"runtime"
    23  	"strings"
    24  	"sync"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"github.com/cockroachdb/pebble"
    29  	"github.com/cockroachdb/pebble/bloom"
    30  	"github.com/ethereum/go-ethereum/common"
    31  	"github.com/ethereum/go-ethereum/ethdb"
    32  	"github.com/ethereum/go-ethereum/log"
    33  	"github.com/ethereum/go-ethereum/metrics"
    34  )
    35  
    36  const (
    37  	// minCache is the minimum amount of memory in megabytes to allocate to pebble
    38  	// read and write caching, split half and half.
    39  	minCache = 16
    40  
    41  	// minHandles is the minimum number of file handles to allocate to the open
    42  	// database files.
    43  	minHandles = 16
    44  
    45  	// metricsGatheringInterval specifies the interval at which pebble database
    46  	// compaction, IO and pause stats are retrieved and reported to the user.
    47  	metricsGatheringInterval = 3 * time.Second
    48  
    49  	// degradationWarnInterval specifies how often a warning should be printed if the
    50  	// pebble database cannot keep up with the requested writes.
    51  	degradationWarnInterval = time.Minute
    52  )
    53  
    54  // Database is a persistent key-value store based on the pebble storage engine.
    55  // Apart from basic data storage functionality it also supports batch writes and
    56  // iterating over the keyspace in binary-alphabetical order.
    57  type Database struct {
    58  	fn        string     // filename for reporting
    59  	db        *pebble.DB // Underlying pebble storage engine
    60  	namespace string     // Namespace for metrics
    61  
    62  	compTimeMeter          *metrics.Meter   // Meter for measuring the total time spent in database compaction
    63  	compReadMeter          *metrics.Meter   // Meter for measuring the data read during compaction
    64  	compWriteMeter         *metrics.Meter   // Meter for measuring the data written during compaction
    65  	writeDelayNMeter       *metrics.Meter   // Meter for measuring the write delay number due to database compaction
    66  	writeDelayMeter        *metrics.Meter   // Meter for measuring the write delay duration due to database compaction
    67  	diskSizeGauge          *metrics.Gauge   // Gauge for tracking the size of all the levels in the database
    68  	diskReadMeter          *metrics.Meter   // Meter for measuring the effective amount of data read
    69  	diskWriteMeter         *metrics.Meter   // Meter for measuring the effective amount of data written
    70  	memCompGauge           *metrics.Gauge   // Gauge for tracking the number of memory compaction
    71  	level0CompGauge        *metrics.Gauge   // Gauge for tracking the number of table compaction in level0
    72  	nonlevel0CompGauge     *metrics.Gauge   // Gauge for tracking the number of table compaction in non0 level
    73  	seekCompGauge          *metrics.Gauge   // Gauge for tracking the number of table compaction caused by read opt
    74  	manualMemAllocGauge    *metrics.Gauge   // Gauge for tracking amount of non-managed memory currently allocated
    75  	liveMemTablesGauge     *metrics.Gauge   // Gauge for tracking the number of live memory tables
    76  	zombieMemTablesGauge   *metrics.Gauge   // Gauge for tracking the number of zombie memory tables
    77  	blockCacheHitGauge     *metrics.Gauge   // Gauge for tracking the number of total hit in the block cache
    78  	blockCacheMissGauge    *metrics.Gauge   // Gauge for tracking the number of total miss in the block cache
    79  	tableCacheHitGauge     *metrics.Gauge   // Gauge for tracking the number of total hit in the table cache
    80  	tableCacheMissGauge    *metrics.Gauge   // Gauge for tracking the number of total miss in the table cache
    81  	filterHitGauge         *metrics.Gauge   // Gauge for tracking the number of total hit in bloom filter
    82  	filterMissGauge        *metrics.Gauge   // Gauge for tracking the number of total miss in bloom filter
    83  	estimatedCompDebtGauge *metrics.Gauge   // Gauge for tracking the number of bytes that need to be compacted
    84  	liveCompGauge          *metrics.Gauge   // Gauge for tracking the number of in-progress compactions
    85  	liveCompSizeGauge      *metrics.Gauge   // Gauge for tracking the size of in-progress compactions
    86  	liveIterGauge          *metrics.Gauge   // Gauge for tracking the number of live database iterators
    87  	levelsGauge            []*metrics.Gauge // Gauge for tracking the number of tables in levels
    88  
    89  	quitLock sync.RWMutex    // Mutex protecting the quit channel and the closed flag
    90  	quitChan chan chan error // Quit channel to stop the metrics collection before closing the database
    91  	closed   bool            // keep track of whether we're Closed
    92  
    93  	log log.Logger // Contextual logger tracking the database path
    94  
    95  	activeComp    int           // Current number of active compactions
    96  	compStartTime time.Time     // The start time of the earliest currently-active compaction
    97  	compTime      atomic.Int64  // Total time spent in compaction in ns
    98  	level0Comp    atomic.Uint32 // Total number of level-zero compactions
    99  	nonLevel0Comp atomic.Uint32 // Total number of non level-zero compactions
   100  
   101  	writeStalled        atomic.Bool  // Flag whether the write is stalled
   102  	writeDelayStartTime time.Time    // The start time of the latest write stall
   103  	writeDelayReason    string       // The reason of the latest write stall
   104  	writeDelayCount     atomic.Int64 // Total number of write stall counts
   105  	writeDelayTime      atomic.Int64 // Total time spent in write stalls
   106  
   107  	writeOptions *pebble.WriteOptions
   108  }
   109  
   110  func (d *Database) onCompactionBegin(info pebble.CompactionInfo) {
   111  	if d.activeComp == 0 {
   112  		d.compStartTime = time.Now()
   113  	}
   114  	l0 := info.Input[0]
   115  	if l0.Level == 0 {
   116  		d.level0Comp.Add(1)
   117  	} else {
   118  		d.nonLevel0Comp.Add(1)
   119  	}
   120  	d.activeComp++
   121  }
   122  
   123  func (d *Database) onCompactionEnd(info pebble.CompactionInfo) {
   124  	if d.activeComp == 1 {
   125  		d.compTime.Add(int64(time.Since(d.compStartTime)))
   126  	} else if d.activeComp == 0 {
   127  		panic("should not happen")
   128  	}
   129  	d.activeComp--
   130  }
   131  
   132  func (d *Database) onWriteStallBegin(b pebble.WriteStallBeginInfo) {
   133  	d.writeDelayStartTime = time.Now()
   134  	d.writeDelayCount.Add(1)
   135  	d.writeStalled.Store(true)
   136  
   137  	// Take just the first word of the reason. There are two potential
   138  	// reasons for a write stall:
   139  	// - memtable count limit reached
   140  	// - L0 file count limit exceeded
   141  	reason := b.Reason
   142  	if i := strings.IndexByte(reason, ' '); i != -1 {
   143  		reason = reason[:i]
   144  	}
   145  	if reason == "L0" || reason == "memtable" {
   146  		d.writeDelayReason = reason
   147  		metrics.GetOrRegisterGauge(d.namespace+"stall/count/"+reason, nil).Inc(1)
   148  	}
   149  }
   150  
   151  func (d *Database) onWriteStallEnd() {
   152  	d.writeDelayTime.Add(int64(time.Since(d.writeDelayStartTime)))
   153  	d.writeStalled.Store(false)
   154  
   155  	if d.writeDelayReason != "" {
   156  		metrics.GetOrRegisterResettingTimer(d.namespace+"stall/time/"+d.writeDelayReason, nil).UpdateSince(d.writeDelayStartTime)
   157  		d.writeDelayReason = ""
   158  	}
   159  	d.writeDelayStartTime = time.Time{}
   160  }
   161  
   162  // panicLogger discards Pebble's internal Info and Error logging, but panics on fatal errors.
   163  //
   164  // TODO(karalabe): Remove when Pebble sets this as the default.
   165  type panicLogger struct{}
   166  
   167  func (l panicLogger) Infof(format string, args ...interface{}) {
   168  }
   169  
   170  func (l panicLogger) Errorf(format string, args ...interface{}) {
   171  }
   172  
   173  func (l panicLogger) Fatalf(format string, args ...interface{}) {
   174  	panic(fmt.Errorf("fatal: "+format, args...))
   175  }
   176  
   177  // New returns a wrapped pebble DB object. The namespace is the prefix that the
   178  // metrics reporting should use for surfacing internal stats.
   179  func New(file string, cache int, handles int, namespace string, readonly bool) (*Database, error) {
   180  	// Ensure we have some minimal caching and file guarantees
   181  	if cache < minCache {
   182  		cache = minCache
   183  	}
   184  	if handles < minHandles {
   185  		handles = minHandles
   186  	}
   187  	logger := log.New("database", file)
   188  	logger.Info("Allocated cache and file handles", "cache", common.StorageSize(cache*1024*1024), "handles", handles)
   189  
   190  	// The max memtable size is limited by the uint32 offsets stored in
   191  	// internal/arenaskl.node, DeferredBatchOp, and flushableBatchEntry.
   192  	//
   193  	// - MaxUint32 on 64-bit platforms;
   194  	// - MaxInt on 32-bit platforms.
   195  	//
   196  	// It is used when slices are limited to Uint32 on 64-bit platforms (the
   197  	// length limit for slices is naturally MaxInt on 32-bit platforms).
   198  	//
   199  	// Taken from https://github.com/cockroachdb/pebble/blob/master/internal/constants/constants.go
   200  	maxMemTableSize := (1<<31)<<(^uint(0)>>63) - 1
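        	// On 64-bit platforms ^uint(0)>>63 is 1, so this is (1<<31)<<1 - 1 = MaxUint32;
        	// on 32-bit platforms it is (1<<31) - 1, i.e. MaxInt32.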
   201  
   202  	// Four memory tables are configured, together taking half of the cache allowance (256 MB each with the default settings).
   203  	// Having multiple smaller memory tables while keeping the total memory
   204  	// limit unchanged allows writes to be flushed more smoothly. This helps
   205  	// avoid compaction spikes and mitigates write stalls caused by heavy
   206  	// compaction workloads.
   207  	memTableLimit := 4
   208  	memTableSize := cache * 1024 * 1024 / 2 / memTableLimit
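        	// E.g. with a 2048 MB cache this is 2048 MB / 2 / 4 = 256 MB per table, matching the default noted above.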
   209  
   210  	// The memory table size is currently capped at maxMemTableSize-1 due to a
   211  	// known bug in pebble where maxMemTableSize is not recognized as a
   212  	// valid size.
   213  	//
   214  	// TODO: use maxMemTableSize as the maximum table size once the issue
   215  	// in pebble is fixed.
   216  	if memTableSize >= maxMemTableSize {
   217  		memTableSize = maxMemTableSize - 1
   218  	}
   219  	db := &Database{
   220  		fn:       file,
   221  		log:      logger,
   222  		quitChan: make(chan chan error),
   223  
   224  		// Use asynchronous write mode by default. Otherwise, the overhead of frequent fsync
   225  		// operations can be significant, especially on platforms with slow fsync performance
   226  		// (e.g., macOS) or less capable SSDs.
   227  		//
   228  		// Note that enabling async writes means recent data may be lost in the event of an
   229  		// application-level panic (writes will also be lost on a machine-level failure,
   230  		// of course). Geth is expected to handle recovery from an unclean shutdown.
   231  		writeOptions: pebble.NoSync,
   232  	}
   233  	opt := &pebble.Options{
   234  		// Pebble has a single combined cache area and the write
   235  		// buffers are taken from this too. Assign all available
   236  		// memory allowance for cache.
   237  		Cache:        pebble.NewCache(int64(cache * 1024 * 1024)),
   238  		MaxOpenFiles: handles,
   239  
   240  		// The size of each memory table (as well as the write buffer).
   241  		// Note that there may be more than two memory tables in the system.
   242  		MemTableSize: uint64(memTableSize),
   243  
   244  		// MemTableStopWritesThreshold places a hard limit on the number
   245  		// of queued MemTables (including the frozen ones).
   246  		// Note that this is the number of tables, not the combined size of all memtables,
   247  		// according to https://github.com/cockroachdb/pebble/blob/master/options.go#L738-L742
   248  		// and to https://github.com/cockroachdb/pebble/blob/master/db.go#L1892-L1903.
   249  		MemTableStopWritesThreshold: memTableLimit,
   250  
   251  		// The default compaction concurrency is 1 thread;
   252  		// here all available CPUs are used for faster compaction.
   253  		MaxConcurrentCompactions: runtime.NumCPU,
   254  
   255  		// Per-level options. Options for at least one level must be specified. The
   256  		// options for the last level are used for all subsequent levels.
   257  		Levels: []pebble.LevelOptions{
   258  			{TargetFileSize: 2 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   259  			{TargetFileSize: 4 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   260  			{TargetFileSize: 8 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   261  			{TargetFileSize: 16 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   262  			{TargetFileSize: 32 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   263  			{TargetFileSize: 64 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   264  			{TargetFileSize: 128 * 1024 * 1024, FilterPolicy: bloom.FilterPolicy(10)},
   265  		},
   266  		ReadOnly: readonly,
   267  		EventListener: &pebble.EventListener{
   268  			CompactionBegin: db.onCompactionBegin,
   269  			CompactionEnd:   db.onCompactionEnd,
   270  			WriteStallBegin: db.onWriteStallBegin,
   271  			WriteStallEnd:   db.onWriteStallEnd,
   272  		},
   273  		Logger: panicLogger{}, // TODO(karalabe): Delete when this is upstreamed in Pebble
   274  
   275  		// Pebble is configured to use asynchronous write mode, meaning write operations
   276  		// return as soon as the data is cached in memory, without waiting for the WAL
   277  		// to be written. This mode offers better write performance but risks losing
   278  		// recent writes if the application crashes or a power failure/system crash occurs.
   279  		//
   280  		// By setting the WALBytesPerSync, the cached WAL writes will be periodically
   281  		// flushed at the background if the accumulated size exceeds this threshold.
   282  		WALBytesPerSync: 5 * ethdb.IdealBatchSize,
   283  
   284  		// L0CompactionThreshold specifies the amount of L0 read-amplification
   285  		// necessary to trigger an L0 compaction. It essentially refers to the
   286  		// number of sub-levels in L0. Each sub-level contains several
   287  		// L0 files which do not overlap with each other, typically produced
   288  		// by a single memory-table flush.
   289  		//
   290  		// The default value in Pebble is 4, which is large enough to let the
   291  		// compaction debt grow to around 10GB. By reducing it to 2, the compaction
   292  		// debt stays below 1GB, at the cost of more frequently scheduled compactions.
   293  		L0CompactionThreshold: 2,
   294  	}
   295  	// Disable seek compaction explicitly. Check https://github.com/ethereum/go-ethereum/pull/20130
   296  	// for more details.
   297  	opt.Experimental.ReadSamplingMultiplier = -1
   298  
   299  	// Open the db and recover any potential corruptions
   300  	innerDB, err := pebble.Open(file, opt)
   301  	if err != nil {
   302  		return nil, err
   303  	}
   304  	db.db = innerDB
   305  
   306  	db.compTimeMeter = metrics.GetOrRegisterMeter(namespace+"compact/time", nil)
   307  	db.compReadMeter = metrics.GetOrRegisterMeter(namespace+"compact/input", nil)
   308  	db.compWriteMeter = metrics.GetOrRegisterMeter(namespace+"compact/output", nil)
   309  	db.diskSizeGauge = metrics.GetOrRegisterGauge(namespace+"disk/size", nil)
   310  	db.diskReadMeter = metrics.GetOrRegisterMeter(namespace+"disk/read", nil)
   311  	db.diskWriteMeter = metrics.GetOrRegisterMeter(namespace+"disk/write", nil)
   312  	db.writeDelayMeter = metrics.GetOrRegisterMeter(namespace+"compact/writedelay/duration", nil)
   313  	db.writeDelayNMeter = metrics.GetOrRegisterMeter(namespace+"compact/writedelay/counter", nil)
   314  	db.memCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/memory", nil)
   315  	db.level0CompGauge = metrics.GetOrRegisterGauge(namespace+"compact/level0", nil)
   316  	db.nonlevel0CompGauge = metrics.GetOrRegisterGauge(namespace+"compact/nonlevel0", nil)
   317  	db.seekCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/seek", nil)
   318  	db.manualMemAllocGauge = metrics.GetOrRegisterGauge(namespace+"memory/manualalloc", nil)
   319  	db.liveMemTablesGauge = metrics.GetOrRegisterGauge(namespace+"table/live", nil)
   320  	db.zombieMemTablesGauge = metrics.GetOrRegisterGauge(namespace+"table/zombie", nil)
   321  	db.blockCacheHitGauge = metrics.GetOrRegisterGauge(namespace+"cache/block/hit", nil)
   322  	db.blockCacheMissGauge = metrics.GetOrRegisterGauge(namespace+"cache/block/miss", nil)
   323  	db.tableCacheHitGauge = metrics.GetOrRegisterGauge(namespace+"cache/table/hit", nil)
   324  	db.tableCacheMissGauge = metrics.GetOrRegisterGauge(namespace+"cache/table/miss", nil)
   325  	db.filterHitGauge = metrics.GetOrRegisterGauge(namespace+"filter/hit", nil)
   326  	db.filterMissGauge = metrics.GetOrRegisterGauge(namespace+"filter/miss", nil)
   327  	db.estimatedCompDebtGauge = metrics.GetOrRegisterGauge(namespace+"compact/estimateDebt", nil)
   328  	db.liveCompGauge = metrics.GetOrRegisterGauge(namespace+"compact/live/count", nil)
   329  	db.liveCompSizeGauge = metrics.GetOrRegisterGauge(namespace+"compact/live/size", nil)
   330  	db.liveIterGauge = metrics.GetOrRegisterGauge(namespace+"iter/count", nil)
   331  
   332  	// Start up the metrics gathering and return
   333  	go db.meter(metricsGatheringInterval, namespace)
   334  	return db, nil
   335  }
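
        // exampleOpen is a minimal usage sketch (not part of the upstream file) showing how
        // New is typically combined with Put/Get/Close. The path and metrics namespace below
        // are placeholders, and the 64 MB cache / 64 handle budgets are arbitrary.
        func exampleOpen() error {
        	db, err := New("/tmp/example-pebble", 64, 64, "eth/db/example/", false)
        	if err != nil {
        		return err
        	}
        	defer db.Close()

        	if err := db.Put([]byte("key"), []byte("value")); err != nil {
        		return err
        	}
        	val, err := db.Get([]byte("key"))
        	if err != nil {
        		return err
        	}
        	fmt.Printf("stored value: %s\n", val)
        	return nil
        }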
   336  
   337  // Close stops the metrics collection, flushes any pending data to disk and closes
   338  // all io accesses to the underlying key-value store.
   339  func (d *Database) Close() error {
   340  	d.quitLock.Lock()
   341  	defer d.quitLock.Unlock()
   342  	// Allow double closing, simplifies things
   343  	if d.closed {
   344  		return nil
   345  	}
   346  	d.closed = true
   347  	if d.quitChan != nil {
   348  		errc := make(chan error)
   349  		d.quitChan <- errc
   350  		if err := <-errc; err != nil {
   351  			d.log.Error("Metrics collection failed", "err", err)
   352  		}
   353  		d.quitChan = nil
   354  	}
   355  	return d.db.Close()
   356  }
   357  
   358  // Has retrieves if a key is present in the key-value store.
   359  func (d *Database) Has(key []byte) (bool, error) {
   360  	d.quitLock.RLock()
   361  	defer d.quitLock.RUnlock()
   362  	if d.closed {
   363  		return false, pebble.ErrClosed
   364  	}
   365  	_, closer, err := d.db.Get(key)
   366  	if err == pebble.ErrNotFound {
   367  		return false, nil
   368  	} else if err != nil {
   369  		return false, err
   370  	}
   371  	if err = closer.Close(); err != nil {
   372  		return false, err
   373  	}
   374  	return true, nil
   375  }
   376  
   377  // Get retrieves the given key if it's present in the key-value store.
   378  func (d *Database) Get(key []byte) ([]byte, error) {
   379  	d.quitLock.RLock()
   380  	defer d.quitLock.RUnlock()
   381  	if d.closed {
   382  		return nil, pebble.ErrClosed
   383  	}
   384  	dat, closer, err := d.db.Get(key)
   385  	if err != nil {
   386  		return nil, err
   387  	}
   388  	ret := make([]byte, len(dat))
   389  	copy(ret, dat)
   390  	if err = closer.Close(); err != nil {
   391  		return nil, err
   392  	}
   393  	return ret, nil
   394  }
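
        // exampleGetMissing is a sketch (not part of the upstream file) of how a missing key
        // surfaces from Get: unlike Has, which maps pebble.ErrNotFound to (false, nil), Get
        // returns the error unchanged, so callers must check for it explicitly.
        func exampleGetMissing(db *Database, key []byte) ([]byte, bool, error) {
        	val, err := db.Get(key)
        	if err == pebble.ErrNotFound {
        		return nil, false, nil
        	}
        	if err != nil {
        		return nil, false, err
        	}
        	return val, true, nil
        }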
   395  
   396  // Put inserts the given value into the key-value store.
   397  func (d *Database) Put(key []byte, value []byte) error {
   398  	d.quitLock.RLock()
   399  	defer d.quitLock.RUnlock()
   400  	if d.closed {
   401  		return pebble.ErrClosed
   402  	}
   403  	return d.db.Set(key, value, d.writeOptions)
   404  }
   405  
   406  // Delete removes the key from the key-value store.
   407  func (d *Database) Delete(key []byte) error {
   408  	d.quitLock.RLock()
   409  	defer d.quitLock.RUnlock()
   410  	if d.closed {
   411  		return pebble.ErrClosed
   412  	}
   413  	return d.db.Delete(key, d.writeOptions)
   414  }
   415  
   416  // DeleteRange deletes all of the keys (and values) in the range [start,end)
   417  // (inclusive on start, exclusive on end).
   418  func (d *Database) DeleteRange(start, end []byte) error {
   419  	d.quitLock.RLock()
   420  	defer d.quitLock.RUnlock()
   421  
   422  	if d.closed {
   423  		return pebble.ErrClosed
   424  	}
   425  	// There is no special flag to represent the end of the key range
   426  	// in pebble (nil in leveldb). Use an ugly hack to construct a
   427  	// large key to represent it.
   428  	if end == nil {
   429  		end = ethdb.MaximumKey
   430  	}
   431  	return d.db.DeleteRange(start, end, d.writeOptions)
   432  }
   433  
   434  // NewBatch creates a write-only key-value store that buffers changes to its host
   435  // database until a final write is called.
   436  func (d *Database) NewBatch() ethdb.Batch {
   437  	return &batch{
   438  		b:  d.db.NewBatch(),
   439  		db: d,
   440  	}
   441  }
   442  
   443  // NewBatchWithSize creates a write-only database batch with pre-allocated buffer.
   444  func (d *Database) NewBatchWithSize(size int) ethdb.Batch {
   445  	return &batch{
   446  		b:  d.db.NewBatchWithSize(size),
   447  		db: d,
   448  	}
   449  }
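
        // exampleBatch is a usage sketch (not part of the upstream file) of the batching
        // pattern used throughout go-ethereum: buffer writes in a batch and flush whenever
        // ValueSize crosses ethdb.IdealBatchSize, resetting the batch for reuse.
        func exampleBatch(db *Database, items map[string][]byte) error {
        	batch := db.NewBatch()
        	for k, v := range items {
        		if err := batch.Put([]byte(k), v); err != nil {
        			return err
        		}
        		if batch.ValueSize() >= ethdb.IdealBatchSize {
        			if err := batch.Write(); err != nil {
        				return err
        			}
        			batch.Reset()
        		}
        	}
        	return batch.Write()
        }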
   450  
   451  // upperBound returns the exclusive upper bound for the given prefix, or nil if no such bound exists.
   452  func upperBound(prefix []byte) (limit []byte) {
   453  	for i := len(prefix) - 1; i >= 0; i-- {
   454  		c := prefix[i]
   455  		if c == 0xff {
   456  			continue
   457  		}
   458  		limit = make([]byte, i+1)
   459  		copy(limit, prefix)
   460  		limit[i] = c + 1
   461  		break
   462  	}
   463  	return limit
   464  }
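
        // exampleUpperBound illustrates (not part of the upstream file) how upperBound derives
        // the exclusive end of a prefix range: the last non-0xff byte is incremented and the
        // key truncated there, while an all-0xff prefix yields nil, meaning "no upper bound".
        func exampleUpperBound() {
        	fmt.Printf("%x\n", upperBound([]byte{0x01, 0xff})) // 02
        	fmt.Printf("%x\n", upperBound([]byte{0xaa, 0xbb})) // aabc
        	fmt.Printf("%x\n", upperBound([]byte{0xff, 0xff})) // empty, i.e. nil limit
        }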
   465  
   466  // Stat returns the internal metrics of Pebble in a text format. It's a developer
   467  // method to read everything there is to read, independent of Pebble version.
   468  func (d *Database) Stat() (string, error) {
   469  	return d.db.Metrics().String(), nil
   470  }
   471  
   472  // Compact flattens the underlying data store for the given key range. In essence,
   473  // deleted and overwritten versions are discarded, and the data is rearranged to
   474  // reduce the cost of operations needed to access them.
   475  //
   476  // A nil start is treated as a key before all keys in the data store; a nil limit
   477  // is treated as a key after all keys in the data store. If both are nil, the
   478  // entire data store will be compacted.
   479  func (d *Database) Compact(start []byte, limit []byte) error {
   480  	// There is no special flag to represent the end of the key range
   481  	// in pebble (nil in leveldb). Use an ugly hack to construct a
   482  	// large key to represent it.
   483  	// Note that any prefixed database entry will sort before this
   484  	// key. For trie nodes we need 32 bytes of 0xff because
   485  	// there might be a shared prefix consisting of a run of
   486  	// 0xff-s, so 32 bytes ensures that only a hash collision could touch it.
   487  	// https://github.com/cockroachdb/pebble/issues/2359#issuecomment-1443995833
   488  	if limit == nil {
   489  		limit = ethdb.MaximumKey
   490  	}
   491  	return d.db.Compact(start, limit, true) // Parallelization is preferred
   492  }
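
        // exampleCompactAll is a sketch (not part of the upstream file) of a full-keyspace
        // compaction: with both bounds nil, Compact substitutes ethdb.MaximumKey for the
        // upper bound and flattens the entire data store.
        func exampleCompactAll(db *Database) error {
        	return db.Compact(nil, nil)
        }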
   493  
   494  // Path returns the path to the database directory.
   495  func (d *Database) Path() string {
   496  	return d.fn
   497  }
   498  
   499  // SyncKeyValue flushes all pending writes in the write-ahead-log to disk,
   500  // ensuring data durability up to that point.
   501  func (d *Database) SyncKeyValue() error {
   502  	// The entry (value=nil) is not written to the database; it is only
   503  	// added to the WAL. Writing this special log entry in sync mode
   504  	// automatically flushes all previous writes, ensuring database
   505  	// durability up to this point.
   506  	b := d.db.NewBatch()
   507  	b.LogData(nil, nil)
   508  	return d.db.Apply(b, pebble.Sync)
   509  }
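
        // exampleDurableCheckpoint sketches (not part of the upstream file) how asynchronous
        // writes are made durable: writes issued with the default NoSync options are followed
        // by a single SyncKeyValue call, which forces the WAL to disk up to that point.
        func exampleDurableCheckpoint(db *Database) error {
        	if err := db.Put([]byte("head"), []byte{0x01}); err != nil {
        		return err
        	}
        	// Flush the write-ahead-log so the write above survives a crash.
        	return db.SyncKeyValue()
        }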
   510  
   511  // meter periodically retrieves internal pebble counters and reports them to
   512  // the metrics subsystem.
   513  func (d *Database) meter(refresh time.Duration, namespace string) {
   514  	var errc chan error
   515  	timer := time.NewTimer(refresh)
   516  	defer timer.Stop()
   517  
   518  	// Create storage and warning log tracer for write delay.
   519  	var (
   520  		compTimes  [2]int64
   521  		compWrites [2]int64
   522  		compReads  [2]int64
   523  
   524  		nWrites [2]int64
   525  
   526  		writeDelayTimes      [2]int64
   527  		writeDelayCounts     [2]int64
   528  		lastWriteStallReport time.Time
   529  	)
   530  
   531  	// Iterate ad infinitum and collect the stats
   532  	for i := 1; errc == nil; i++ {
   533  		var (
   534  			compWrite int64
   535  			compRead  int64
   536  			nWrite    int64
   537  
   538  			stats              = d.db.Metrics()
   539  			compTime           = d.compTime.Load()
   540  			writeDelayCount    = d.writeDelayCount.Load()
   541  			writeDelayTime     = d.writeDelayTime.Load()
   542  			nonLevel0CompCount = int64(d.nonLevel0Comp.Load())
   543  			level0CompCount    = int64(d.level0Comp.Load())
   544  		)
   545  		writeDelayTimes[i%2] = writeDelayTime
   546  		writeDelayCounts[i%2] = writeDelayCount
   547  		compTimes[i%2] = compTime
   548  
   549  		for _, levelMetrics := range stats.Levels {
   550  			nWrite += int64(levelMetrics.BytesCompacted)
   551  			nWrite += int64(levelMetrics.BytesFlushed)
   552  			compWrite += int64(levelMetrics.BytesCompacted)
   553  			compRead += int64(levelMetrics.BytesRead)
   554  		}
   555  
   556  		nWrite += int64(stats.WAL.BytesWritten)
   557  
   558  		compWrites[i%2] = compWrite
   559  		compReads[i%2] = compRead
   560  		nWrites[i%2] = nWrite
   561  
   562  		d.writeDelayNMeter.Mark(writeDelayCounts[i%2] - writeDelayCounts[(i-1)%2])
   563  		d.writeDelayMeter.Mark(writeDelayTimes[i%2] - writeDelayTimes[(i-1)%2])
   564  		// Print a warning log if writing has been stalled for a while. The log will
   565  		// be printed per minute to avoid overwhelming users.
   566  		if d.writeStalled.Load() && writeDelayCounts[i%2] == writeDelayCounts[(i-1)%2] &&
   567  			time.Now().After(lastWriteStallReport.Add(degradationWarnInterval)) {
   568  			d.log.Warn("Database compacting, degraded performance")
   569  			lastWriteStallReport = time.Now()
   570  		}
   571  		d.compTimeMeter.Mark(compTimes[i%2] - compTimes[(i-1)%2])
   572  		d.compReadMeter.Mark(compReads[i%2] - compReads[(i-1)%2])
   573  		d.compWriteMeter.Mark(compWrites[i%2] - compWrites[(i-1)%2])
   574  		d.diskSizeGauge.Update(int64(stats.DiskSpaceUsage()))
   575  		d.diskReadMeter.Mark(0) // pebble doesn't track non-compaction reads
   576  		d.diskWriteMeter.Mark(nWrites[i%2] - nWrites[(i-1)%2])
   577  
   578  		// See https://github.com/cockroachdb/pebble/pull/1628#pullrequestreview-1026664054
   579  		manuallyAllocated := stats.BlockCache.Size + int64(stats.MemTable.Size) + int64(stats.MemTable.ZombieSize)
   580  		d.manualMemAllocGauge.Update(manuallyAllocated)
   581  		d.memCompGauge.Update(stats.Flush.Count)
   582  		d.nonlevel0CompGauge.Update(nonLevel0CompCount)
   583  		d.level0CompGauge.Update(level0CompCount)
   584  		d.seekCompGauge.Update(stats.Compact.ReadCount)
   585  		d.liveCompGauge.Update(stats.Compact.NumInProgress)
   586  		d.liveCompSizeGauge.Update(stats.Compact.InProgressBytes)
   587  		d.liveIterGauge.Update(stats.TableIters)
   588  
   589  		d.liveMemTablesGauge.Update(stats.MemTable.Count)
   590  		d.zombieMemTablesGauge.Update(stats.MemTable.ZombieCount)
   591  		d.estimatedCompDebtGauge.Update(int64(stats.Compact.EstimatedDebt))
   592  		d.tableCacheHitGauge.Update(stats.TableCache.Hits)
   593  		d.tableCacheMissGauge.Update(stats.TableCache.Misses)
   594  		d.blockCacheHitGauge.Update(stats.BlockCache.Hits)
   595  		d.blockCacheMissGauge.Update(stats.BlockCache.Misses)
   596  		d.filterHitGauge.Update(stats.Filter.Hits)
   597  		d.filterMissGauge.Update(stats.Filter.Misses)
   598  
   599  		for i, level := range stats.Levels {
   600  			// Append metrics for additional layers
   601  			if i >= len(d.levelsGauge) {
   602  				d.levelsGauge = append(d.levelsGauge, metrics.GetOrRegisterGauge(namespace+fmt.Sprintf("tables/level%v", i), nil))
   603  			}
   604  			d.levelsGauge[i].Update(level.NumFiles)
   605  		}
   606  
   607  		// Sleep a bit, then repeat the stats collection
   608  		select {
   609  		case errc = <-d.quitChan:
   610  			// Quit requested, stop hammering the database
   611  		case <-timer.C:
   612  			timer.Reset(refresh)
   613  			// Timeout, gather a new set of stats
   614  		}
   615  	}
   616  	errc <- nil
   617  }
   618  
   619  // batch is a write-only batch that commits changes to its host database
   620  // when Write is called. A batch cannot be used concurrently.
   621  type batch struct {
   622  	b    *pebble.Batch
   623  	db   *Database
   624  	size int
   625  }
   626  
   627  // Put inserts the given value into the batch for later committing.
   628  func (b *batch) Put(key, value []byte) error {
   629  	if err := b.b.Set(key, value, nil); err != nil {
   630  		return err
   631  	}
   632  	b.size += len(key) + len(value)
   633  	return nil
   634  }
   635  
   636  // Delete inserts the key removal into the batch for later committing.
   637  func (b *batch) Delete(key []byte) error {
   638  	if err := b.b.Delete(key, nil); err != nil {
   639  		return err
   640  	}
   641  	b.size += len(key)
   642  	return nil
   643  }
   644  
   645  // DeleteRange removes all keys in the range [start, end) from the batch for
   646  // later committing, inclusive on start, exclusive on end.
   647  func (b *batch) DeleteRange(start, end []byte) error {
   648  	// There is no special flag to represent the end of the key range
   649  	// in pebble (nil in leveldb). Use an ugly hack to construct a
   650  	// large key to represent it.
   651  	if end == nil {
   652  		end = ethdb.MaximumKey
   653  	}
   654  	if err := b.b.DeleteRange(start, end, nil); err != nil {
   655  		return err
   656  	}
   657  	// Approximate size impact - just the keys
   658  	b.size += len(start) + len(end)
   659  	return nil
   660  }
   661  
   662  // ValueSize retrieves the amount of data queued up for writing.
   663  func (b *batch) ValueSize() int {
   664  	return b.size
   665  }
   666  
   667  // Write flushes any accumulated data to disk.
   668  func (b *batch) Write() error {
   669  	b.db.quitLock.RLock()
   670  	defer b.db.quitLock.RUnlock()
   671  	if b.db.closed {
   672  		return pebble.ErrClosed
   673  	}
   674  	return b.b.Commit(b.db.writeOptions)
   675  }
   676  
   677  // Reset resets the batch for reuse.
   678  func (b *batch) Reset() {
   679  	b.b.Reset()
   680  	b.size = 0
   681  }
   682  
   683  // Replay replays the batch contents.
   684  func (b *batch) Replay(w ethdb.KeyValueWriter) error {
   685  	reader := b.b.Reader()
   686  	for {
   687  		kind, k, v, ok, err := reader.Next()
   688  		if !ok || err != nil {
   689  			return err
   690  		}
   691  		// The (k,v) slices might be overwritten if the batch is reset/reused,
   692  		// and the receiver should copy them if they are to be retained long-term.
   693  		if kind == pebble.InternalKeyKindSet {
   694  			if err = w.Put(k, v); err != nil {
   695  				return err
   696  			}
   697  		} else if kind == pebble.InternalKeyKindDelete {
   698  			if err = w.Delete(k); err != nil {
   699  				return err
   700  			}
   701  		} else if kind == pebble.InternalKeyKindRangeDelete {
   702  			// For range deletion, k is the start key and v is the end key
   703  			if rangeDeleter, ok := w.(ethdb.KeyValueRangeDeleter); ok {
   704  				if err = rangeDeleter.DeleteRange(k, v); err != nil {
   705  					return err
   706  				}
   707  			} else {
   708  				return fmt.Errorf("ethdb.KeyValueWriter does not implement DeleteRange")
   709  			}
   710  		} else {
   711  			return fmt.Errorf("unhandled operation, keytype: %v", kind)
   712  		}
   713  	}
   714  }
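
        // countingWriter is a toy ethdb.KeyValueWriter (not part of the upstream file) used
        // below to show that Replay re-issues every buffered operation against an arbitrary
        // writer; real callers would typically pass another key-value store.
        type countingWriter struct{ puts, deletes int }

        func (c *countingWriter) Put(key, value []byte) error { c.puts++; return nil }
        func (c *countingWriter) Delete(key []byte) error     { c.deletes++; return nil }

        // exampleReplay builds a small batch and replays it into the counting writer.
        func exampleReplay(db *Database) (int, int, error) {
        	b := db.NewBatch()
        	if err := b.Put([]byte("a"), []byte{1}); err != nil {
        		return 0, 0, err
        	}
        	if err := b.Delete([]byte("b")); err != nil {
        		return 0, 0, err
        	}
        	w := new(countingWriter)
        	if err := b.Replay(w); err != nil {
        		return 0, 0, err
        	}
        	return w.puts, w.deletes, nil // 1 put, 1 delete
        }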
   715  
   716  // pebbleIterator is a wrapper around the underlying iterator of the storage engine.
   717  // The purpose of this structure is to implement the missing APIs.
   718  //
   719  // The pebble iterator is not thread-safe.
   720  type pebbleIterator struct {
   721  	iter     *pebble.Iterator
   722  	moved    bool
   723  	released bool
   724  }
   725  
   726  // NewIterator creates a binary-alphabetical iterator over a subset
   727  // of database content with a particular key prefix, starting at a particular
   728  // initial key (or after, if it does not exist).
   729  func (d *Database) NewIterator(prefix []byte, start []byte) ethdb.Iterator {
   730  	iter, _ := d.db.NewIter(&pebble.IterOptions{
   731  		LowerBound: append(prefix, start...),
   732  		UpperBound: upperBound(prefix),
   733  	})
   734  	iter.First()
   735  	return &pebbleIterator{iter: iter, moved: true, released: false}
   736  }
   737  
   738  // Next moves the iterator to the next key/value pair. It returns whether the
   739  // iterator is exhausted.
   740  func (iter *pebbleIterator) Next() bool {
   741  	if iter.moved {
   742  		iter.moved = false
   743  		return iter.iter.Valid()
   744  	}
   745  	return iter.iter.Next()
   746  }
   747  
   748  // Error returns any accumulated error. Exhausting all the key/value pairs
   749  // is not considered to be an error.
   750  func (iter *pebbleIterator) Error() error {
   751  	return iter.iter.Error()
   752  }
   753  
   754  // Key returns the key of the current key/value pair, or nil if done. The caller
   755  // should not modify the contents of the returned slice, and its contents may
   756  // change on the next call to Next.
   757  func (iter *pebbleIterator) Key() []byte {
   758  	return iter.iter.Key()
   759  }
   760  
   761  // Value returns the value of the current key/value pair, or nil if done. The
   762  // caller should not modify the contents of the returned slice, and its contents
   763  // may change on the next call to Next.
   764  func (iter *pebbleIterator) Value() []byte {
   765  	return iter.iter.Value()
   766  }
   767  
   768  // Release releases associated resources. Release should always succeed and can
   769  // be called multiple times without causing error.
   770  func (iter *pebbleIterator) Release() {
   771  	if !iter.released {
   772  		iter.iter.Close()
   773  		iter.released = true
   774  	}
   775  }
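
        // exampleIterate sketches (not part of the upstream file) the standard iteration
        // pattern over this wrapper: walk every key sharing a prefix in binary-alphabetical
        // order and always release the iterator to free the underlying pebble resources.
        func exampleIterate(db *Database, prefix []byte) (int, error) {
        	it := db.NewIterator(prefix, nil)
        	defer it.Release()

        	count := 0
        	for it.Next() {
        		_ = it.Key()   // only valid until the next call to Next
        		_ = it.Value() // ditto; copy if the value must be retained
        		count++
        	}
        	return count, it.Error()
        }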