github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/cache/clockpro.go (about)

     1  // Copyright 2018. All rights reserved. Use of this source code is governed by
     2  // an MIT-style license that can be found in the LICENSE file.
     3  
     4  // Package cache implements the CLOCK-Pro caching algorithm.
     5  //
     6  // CLOCK-Pro is a patent-free alternative to the Adaptive Replacement Cache,
     7  // https://en.wikipedia.org/wiki/Adaptive_replacement_cache.
     8  // It is an approximation of LIRS ( https://en.wikipedia.org/wiki/LIRS_caching_algorithm ),
     9  // much like the CLOCK page replacement algorithm is an approximation of LRU.
    10  //
    11  // This implementation is based on the python code from https://bitbucket.org/SamiLehtinen/pyclockpro .
    12  //
    13  // Slides describing the algorithm: http://fr.slideshare.net/huliang64/clockpro
    14  //
    15  // The original paper: http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html
    16  //
    17  // It is MIT licensed, like the original.
    18  package cache // import "github.com/cockroachdb/pebble/internal/cache"
    19  
    20  import (
    21  	"fmt"
    22  	"os"
    23  	"runtime"
    24  	"runtime/debug"
    25  	"strings"
    26  	"sync"
    27  	"sync/atomic"
    28  
    29  	"github.com/cockroachdb/pebble/internal/base"
    30  	"github.com/cockroachdb/pebble/internal/invariants"
    31  )
    32  
// fileKey identifies a file within the cache: a namespace id paired with a
// file number inside that namespace.
type fileKey struct {
	// id is the namespace for fileNums.
	id      uint64
	fileNum base.DiskFileNum
}
    38  
// key identifies a single cached block: the embedded fileKey names the file
// and offset is the block's offset within that file. With offset set to zero
// (see key.file) the same type doubles as the key of the shard.files map.
type key struct {
	fileKey
	offset uint64
}
    43  
    44  // file returns the "file key" for the receiver. This is the key used for the
    45  // shard.files map.
    46  func (k key) file() key {
    47  	k.offset = 0
    48  	return k
    49  }
    50  
    51  func (k key) String() string {
    52  	return fmt.Sprintf("%d/%d/%d", k.id, k.fileNum, k.offset)
    53  }
    54  
// Handle provides a strong reference to a value in the cache. The reference
// does not pin the value in the cache, but it does prevent the underlying byte
// slice from being reused.
type Handle struct {
	// value is the referenced cache value. It is nil for the zero Handle,
	// which is what shard.Get returns on a cache miss.
	value *Value
}
    61  
    62  // Get returns the value stored in handle.
    63  func (h Handle) Get() []byte {
    64  	if h.value != nil {
    65  		// NB: We don't increment shard.hits in this code path because we only want
    66  		// to record a hit when the handle is retrieved from the cache.
    67  		return h.value.buf
    68  	}
    69  	return nil
    70  }
    71  
    72  // Release releases the reference to the cache entry.
    73  func (h Handle) Release() {
    74  	h.value.release()
    75  }
    76  
// shard is one independently locked partition of the cache. Each shard runs
// the CLOCK-Pro algorithm over its own entries.
type shard struct {
	// hits and misses count the outcomes of shard.Get lookups. They are
	// atomics and are updated outside of mu.
	hits   atomic.Int64
	misses atomic.Int64

	// mu is acquired by the shard's exported methods before they read or
	// modify the fields below.
	mu sync.RWMutex

	// reservedSize is the running total of bytes passed to Reserve and maxSize
	// is the shard's configured capacity; targetSize() is derived from the two.
	// coldTarget is the adaptive target for the cold portion and is kept
	// within [0, targetSize].
	reservedSize int64
	maxSize      int64
	coldTarget   int64
	blocks       robinHoodMap // fileNum+offset -> block
	files        robinHoodMap // fileNum -> list of blocks

	// The blocks and files maps store values in manually managed memory that is
	// invisible to the Go GC. This is fine for Value and entry objects that are
	// stored in manually managed memory, but when the "invariants" build tag is
	// set, all Value and entry objects are Go allocated and the entries map will
	// contain a reference to every entry.
	entries map[*entry]struct{}

	// The three clock hands. They point into a single circular list of
	// entries and are all nil when the shard holds no entries.
	handHot  *entry
	handCold *entry
	handTest *entry

	// Total size in bytes of the hot, cold and test entries respectively.
	// Only hot and cold entries count towards Size().
	sizeHot  int64
	sizeCold int64
	sizeTest int64

	// The count fields are used exclusively for asserting expectations.
	// We've seen infinite looping (cockroachdb/cockroach#70154) that
	// could be explained by a corrupted sizeCold. Through asserting on
	// these fields, we hope to gain more insight from any future
	// reproductions.
	countHot  int64
	countCold int64
	countTest int64
}
   113  
   114  func (c *shard) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle {
   115  	c.mu.RLock()
   116  	var value *Value
   117  	if e := c.blocks.Get(key{fileKey{id, fileNum}, offset}); e != nil {
   118  		value = e.acquireValue()
   119  		if value != nil {
   120  			e.referenced.Store(true)
   121  		}
   122  	}
   123  	c.mu.RUnlock()
   124  	if value == nil {
   125  		c.misses.Add(1)
   126  		return Handle{}
   127  	}
   128  	c.hits.Add(1)
   129  	return Handle{value: value}
   130  }
   131  
// Set stores value under (id, fileNum, offset) and returns a Handle for it.
// It panics unless the value's reference count is exactly 1, i.e. unless the
// value has not been added to the cache before.
//
// Three cases are distinguished by the current state of the key:
//   - no entry exists: a new entry is added as a cold page;
//   - an entry with a value exists (hot or cold page): its value is replaced
//     and the entry is marked referenced;
//   - an entry without a value exists (test page): the entry is re-added as a
//     hot page and coldTarget is grown by the new size.
//
// When metaAdd reports that the entry does not fit, the entry is freed and the
// value is not cached; the returned Handle still references the value.
func (c *shard) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle {
	if n := value.refs(); n != 1 {
		panic(fmt.Sprintf("pebble: Value has already been added to the cache: refs=%d", n))
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	k := key{fileKey{id, fileNum}, offset}
	e := c.blocks.Get(k)

	switch {
	case e == nil:
		// no cache entry? add it
		e = newEntry(c, k, int64(len(value.buf)))
		e.setValue(value)
		if c.metaAdd(k, e) {
			value.ref.trace("add-cold")
			c.sizeCold += e.size
			c.countCold++
		} else {
			value.ref.trace("skip-cold")
			e.free()
			e = nil
		}

	case e.peekValue() != nil:
		// cache entry was a hot or cold page
		e.setValue(value)
		e.referenced.Store(true)
		// Only the size changes here; the hot/cold counts are unaffected
		// because the entry stays in its current state.
		delta := int64(len(value.buf)) - e.size
		e.size = int64(len(value.buf))
		if e.ptype == etHot {
			value.ref.trace("add-hot")
			c.sizeHot += delta
		} else {
			value.ref.trace("add-cold")
			c.sizeCold += delta
		}
		c.evict()

	default:
		// cache entry was a test page
		c.sizeTest -= e.size
		c.countTest--
		c.metaDel(e).release()
		c.metaCheck(e)

		// Grow the cold target by the new size, capped at targetSize.
		e.size = int64(len(value.buf))
		c.coldTarget += e.size
		if c.coldTarget > c.targetSize() {
			c.coldTarget = c.targetSize()
		}

		e.referenced.Store(false)
		e.setValue(value)
		e.ptype = etHot
		if c.metaAdd(k, e) {
			value.ref.trace("add-hot")
			c.sizeHot += e.size
			c.countHot++
		} else {
			value.ref.trace("skip-hot")
			e.free()
			e = nil
		}
	}

	c.checkConsistency()

	// Values are initialized with a reference count of 1. That reference count
	// is being transferred to the returned Handle.
	return Handle{value: value}
}
   206  
   207  func (c *shard) checkConsistency() {
   208  	// See the comment above the count{Hot,Cold,Test} fields.
   209  	switch {
   210  	case c.sizeHot < 0 || c.sizeCold < 0 || c.sizeTest < 0 || c.countHot < 0 || c.countCold < 0 || c.countTest < 0:
   211  		panic(fmt.Sprintf("pebble: unexpected negative: %d (%d bytes) hot, %d (%d bytes) cold, %d (%d bytes) test",
   212  			c.countHot, c.sizeHot, c.countCold, c.sizeCold, c.countTest, c.sizeTest))
   213  	case c.sizeHot > 0 && c.countHot == 0:
   214  		panic(fmt.Sprintf("pebble: mismatch %d hot size, %d hot count", c.sizeHot, c.countHot))
   215  	case c.sizeCold > 0 && c.countCold == 0:
   216  		panic(fmt.Sprintf("pebble: mismatch %d cold size, %d cold count", c.sizeCold, c.countCold))
   217  	case c.sizeTest > 0 && c.countTest == 0:
   218  		panic(fmt.Sprintf("pebble: mismatch %d test size, %d test count", c.sizeTest, c.countTest))
   219  	}
   220  }
   221  
   222  // Delete deletes the cached value for the specified file and offset.
   223  func (c *shard) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) {
   224  	// The common case is there is nothing to delete, so do a quick check with
   225  	// shared lock.
   226  	k := key{fileKey{id, fileNum}, offset}
   227  	c.mu.RLock()
   228  	exists := c.blocks.Get(k) != nil
   229  	c.mu.RUnlock()
   230  	if !exists {
   231  		return
   232  	}
   233  
   234  	var deletedValue *Value
   235  	func() {
   236  		c.mu.Lock()
   237  		defer c.mu.Unlock()
   238  
   239  		e := c.blocks.Get(k)
   240  		if e == nil {
   241  			return
   242  		}
   243  		deletedValue = c.metaEvict(e)
   244  		c.checkConsistency()
   245  	}()
   246  	// Now that the mutex has been dropped, release the reference which will
   247  	// potentially free the memory associated with the previous cached value.
   248  	deletedValue.release()
   249  }
   250  
   251  // EvictFile evicts all of the cache values for the specified file.
   252  func (c *shard) EvictFile(id uint64, fileNum base.DiskFileNum) {
   253  	fkey := key{fileKey{id, fileNum}, 0}
   254  	for c.evictFileRun(fkey) {
   255  		// Sched switch to give another goroutine an opportunity to acquire the
   256  		// shard mutex.
   257  		runtime.Gosched()
   258  	}
   259  }
   260  
   261  func (c *shard) evictFileRun(fkey key) (moreRemaining bool) {
   262  	// If most of the file's blocks are held in the block cache, evicting all
   263  	// the blocks may take a while. We don't want to block the entire cache
   264  	// shard, forcing concurrent readers to wait until we're finished. We drop
   265  	// the mutex every [blocksPerMutexAcquisition] blocks to give other
   266  	// goroutines an opportunity to make progress.
   267  	const blocksPerMutexAcquisition = 5
   268  	c.mu.Lock()
   269  
   270  	// Releasing a value may result in free-ing it back to the memory allocator.
   271  	// This can have a nontrivial cost that we'd prefer to not pay while holding
   272  	// the shard mutex, so we collect the evicted values in a local slice and
   273  	// only release them in a defer after dropping the cache mutex.
   274  	var obsoleteValuesAlloc [blocksPerMutexAcquisition]*Value
   275  	obsoleteValues := obsoleteValuesAlloc[:0]
   276  	defer func() {
   277  		c.mu.Unlock()
   278  		for _, v := range obsoleteValues {
   279  			v.release()
   280  		}
   281  	}()
   282  
   283  	blocks := c.files.Get(fkey)
   284  	if blocks == nil {
   285  		// No blocks for this file.
   286  		return false
   287  	}
   288  
   289  	// b is the current head of the doubly linked list, and n is the entry after b.
   290  	for b, n := blocks, (*entry)(nil); len(obsoleteValues) < cap(obsoleteValues); b = n {
   291  		n = b.fileLink.next
   292  		obsoleteValues = append(obsoleteValues, c.metaEvict(b))
   293  		if b == n {
   294  			// b == n represents the case where b was the last entry remaining
   295  			// in the doubly linked list, which is why it pointed at itself. So
   296  			// no more entries left.
   297  			c.checkConsistency()
   298  			return false
   299  		}
   300  	}
   301  	// Exhausted blocksPerMutexAcquisition.
   302  	return true
   303  }
   304  
   305  func (c *shard) Free() {
   306  	c.mu.Lock()
   307  	defer c.mu.Unlock()
   308  
   309  	// NB: we use metaDel rather than metaEvict in order to avoid the expensive
   310  	// metaCheck call when the "invariants" build tag is specified.
   311  	for c.handHot != nil {
   312  		e := c.handHot
   313  		c.metaDel(c.handHot).release()
   314  		e.free()
   315  	}
   316  
   317  	c.blocks.free()
   318  	c.files.free()
   319  }
   320  
   321  func (c *shard) Reserve(n int) {
   322  	c.mu.Lock()
   323  	defer c.mu.Unlock()
   324  	c.reservedSize += int64(n)
   325  
   326  	// Changing c.reservedSize will either increase or decrease
   327  	// the targetSize. But we want coldTarget to be in the range
   328  	// [0, targetSize]. So, if c.targetSize decreases, make sure
   329  	// that the coldTarget fits within the limits.
   330  	targetSize := c.targetSize()
   331  	if c.coldTarget > targetSize {
   332  		c.coldTarget = targetSize
   333  	}
   334  
   335  	c.evict()
   336  	c.checkConsistency()
   337  }
   338  
   339  // Size returns the current space used by the cache.
   340  func (c *shard) Size() int64 {
   341  	c.mu.RLock()
   342  	size := c.sizeHot + c.sizeCold
   343  	c.mu.RUnlock()
   344  	return size
   345  }
   346  
   347  func (c *shard) targetSize() int64 {
   348  	target := c.maxSize - c.reservedSize
   349  	// Always return a positive integer for targetSize. This is so that we don't
   350  	// end up in an infinite loop in evict(), in cases where reservedSize is
   351  	// greater than or equal to maxSize.
   352  	if target < 1 {
   353  		return 1
   354  	}
   355  	return target
   356  }
   357  
// Add the entry to the cache, returning true if the entry was added and false
// if it would not fit in the cache.
//
// metaAdd first runs evict() and then rejects entries larger than targetSize.
// An accepted entry is inserted into the blocks map, linked into the clock
// list at handHot, and linked into its file's list in the files map. The
// size{Hot,Cold,Test} and count{Hot,Cold,Test} fields are not touched here;
// callers update them.
func (c *shard) metaAdd(key key, e *entry) bool {
	c.evict()
	if e.size > c.targetSize() {
		// The entry is larger than the target cache size.
		return false
	}

	c.blocks.Put(key, e)
	if entriesGoAllocated {
		// Go allocated entries need to be referenced from Go memory. The entries
		// map provides that reference.
		c.entries[e] = struct{}{}
	}

	if c.handHot == nil {
		// first element
		c.handHot = e
		c.handCold = e
		c.handTest = e
	} else {
		c.handHot.link(e)
	}

	// When the cold hand coincides with the hot hand, step it back by one
	// entry.
	if c.handCold == c.handHot {
		c.handCold = c.handCold.prev()
	}

	// Register the entry under its file: either as the file's first entry or
	// linked into the file's existing list.
	fkey := key.file()
	if fileBlocks := c.files.Get(fkey); fileBlocks == nil {
		c.files.Put(fkey, e)
	} else {
		fileBlocks.linkFile(e)
	}
	return true
}
   395  
// Remove the entry from the cache. This removes the entry from the blocks map,
// the files map, and ensures that hand{Hot,Cold,Test} are not pointing at the
// entry. Returns the deleted value that must be released, if any.
//
// The size{Hot,Cold,Test} and count{Hot,Cold,Test} fields are not adjusted
// here and the entry itself is not freed; both are left to the caller.
func (c *shard) metaDel(e *entry) (deletedValue *Value) {
	if value := e.peekValue(); value != nil {
		value.ref.trace("metaDel")
	}
	// Remove the pointer to the value.
	deletedValue = e.val
	e.val = nil

	c.blocks.Delete(e.key)
	if entriesGoAllocated {
		// Go allocated entries need to be referenced from Go memory. The entries
		// map provides that reference.
		delete(c.entries, e)
	}

	// Move any hand that points at e back by one entry before unlinking it.
	if e == c.handHot {
		c.handHot = c.handHot.prev()
	}
	if e == c.handCold {
		c.handCold = c.handCold.prev()
	}
	if e == c.handTest {
		c.handTest = c.handTest.prev()
	}

	if e.unlink() == e {
		// This was the last entry in the cache.
		c.handHot = nil
		c.handCold = nil
		c.handTest = nil
	}

	// Detach e from its file's list. If e was the only entry of the file the
	// file is dropped from the files map; otherwise the map is pointed at the
	// entry returned by unlinkFile.
	fkey := e.key.file()
	if next := e.unlinkFile(); e == next {
		c.files.Delete(fkey)
	} else {
		c.files.Put(fkey, next)
	}
	return deletedValue
}
   439  
   440  // Check that the specified entry is not referenced by the cache.
   441  func (c *shard) metaCheck(e *entry) {
   442  	if invariants.Enabled {
   443  		if _, ok := c.entries[e]; ok {
   444  			fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in entries map\n%s",
   445  				e, e.key, debug.Stack())
   446  			os.Exit(1)
   447  		}
   448  		if c.blocks.findByValue(e) != nil {
   449  			fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks map\n%s\n%s",
   450  				e, e.key, &c.blocks, debug.Stack())
   451  			os.Exit(1)
   452  		}
   453  		if c.files.findByValue(e) != nil {
   454  			fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in files map\n%s\n%s",
   455  				e, e.key, &c.files, debug.Stack())
   456  			os.Exit(1)
   457  		}
   458  		// NB: c.hand{Hot,Cold,Test} are pointers into a single linked list. We
   459  		// only have to traverse one of them to check all of them.
   460  		var countHot, countCold, countTest int64
   461  		var sizeHot, sizeCold, sizeTest int64
   462  		for t := c.handHot.next(); t != nil; t = t.next() {
   463  			// Recompute count{Hot,Cold,Test} and size{Hot,Cold,Test}.
   464  			switch t.ptype {
   465  			case etHot:
   466  				countHot++
   467  				sizeHot += t.size
   468  			case etCold:
   469  				countCold++
   470  				sizeCold += t.size
   471  			case etTest:
   472  				countTest++
   473  				sizeTest += t.size
   474  			}
   475  			if e == t {
   476  				fmt.Fprintf(os.Stderr, "%p: %s unexpectedly found in blocks list\n%s",
   477  					e, e.key, debug.Stack())
   478  				os.Exit(1)
   479  			}
   480  			if t == c.handHot {
   481  				break
   482  			}
   483  		}
   484  		if countHot != c.countHot || countCold != c.countCold || countTest != c.countTest ||
   485  			sizeHot != c.sizeHot || sizeCold != c.sizeCold || sizeTest != c.sizeTest {
   486  			fmt.Fprintf(os.Stderr, `divergence of Hot,Cold,Test statistics
   487  				cache's statistics: hot %d, %d, cold %d, %d, test %d, %d
   488  				recalculated statistics: hot %d, %d, cold %d, %d, test %d, %d\n%s`,
   489  				c.countHot, c.sizeHot, c.countCold, c.sizeCold, c.countTest, c.sizeTest,
   490  				countHot, sizeHot, countCold, sizeCold, countTest, sizeTest,
   491  				debug.Stack())
   492  			os.Exit(1)
   493  		}
   494  	}
   495  }
   496  
   497  func (c *shard) metaEvict(e *entry) (evictedValue *Value) {
   498  	switch e.ptype {
   499  	case etHot:
   500  		c.sizeHot -= e.size
   501  		c.countHot--
   502  	case etCold:
   503  		c.sizeCold -= e.size
   504  		c.countCold--
   505  	case etTest:
   506  		c.sizeTest -= e.size
   507  		c.countTest--
   508  	}
   509  	evictedValue = c.metaDel(e)
   510  	c.metaCheck(e)
   511  	e.free()
   512  	return evictedValue
   513  }
   514  
   515  func (c *shard) evict() {
   516  	for c.targetSize() <= c.sizeHot+c.sizeCold && c.handCold != nil {
   517  		c.runHandCold(c.countCold, c.sizeCold)
   518  	}
   519  }
   520  
// runHandCold processes the entry under the cold hand and advances the hand by
// one. A cold entry that has been referenced is promoted to hot; an
// unreferenced cold entry has its value dropped and becomes a test page, after
// which the test hand is run while sizeTest exceeds targetSize. Finally the
// hot hand is run while sizeHot is at or above targetSize minus coldTarget.
func (c *shard) runHandCold(countColdDebug, sizeColdDebug int64) {
	// countColdDebug and sizeColdDebug should equal c.countCold and
	// c.sizeCold. They're parameters only to aid in debugging of
	// cockroachdb/cockroach#70154. Since they're parameters, their
	// arguments will appear within stack traces should we encounter
	// a reproduction.
	if c.countCold != countColdDebug || c.sizeCold != sizeColdDebug {
		panic(fmt.Sprintf("runHandCold: cold count and size are %d, %d, arguments are %d and %d",
			c.countCold, c.sizeCold, countColdDebug, sizeColdDebug))
	}

	e := c.handCold
	if e.ptype == etCold {
		if e.referenced.Load() {
			// Referenced since it was last examined: promote cold -> hot.
			e.referenced.Store(false)
			e.ptype = etHot
			c.sizeCold -= e.size
			c.countCold--
			c.sizeHot += e.size
			c.countHot++
		} else {
			// Not referenced: drop the value and demote cold -> test.
			e.setValue(nil)
			e.ptype = etTest
			c.sizeCold -= e.size
			c.countCold--
			c.sizeTest += e.size
			c.countTest++
			for c.targetSize() < c.sizeTest && c.handTest != nil {
				c.runHandTest()
			}
		}
	}

	c.handCold = c.handCold.next()

	for c.targetSize()-c.coldTarget <= c.sizeHot && c.handHot != nil {
		c.runHandHot()
	}
}
   560  
// runHandHot processes the entry under the hot hand and advances the hand by
// one. A hot entry that has been referenced merely has its referenced flag
// cleared; an unreferenced hot entry is demoted to cold. Entries of other
// types are skipped over.
func (c *shard) runHandHot() {
	// When the hot hand has caught up with the test hand, run the test hand
	// first. That may remove the last entry, leaving nothing to do.
	if c.handHot == c.handTest && c.handTest != nil {
		c.runHandTest()
		if c.handHot == nil {
			return
		}
	}

	e := c.handHot
	if e.ptype == etHot {
		if e.referenced.Load() {
			e.referenced.Store(false)
		} else {
			e.ptype = etCold
			c.sizeHot -= e.size
			c.countHot--
			c.sizeCold += e.size
			c.countCold++
		}
	}

	c.handHot = c.handHot.next()
}
   584  
// runHandTest processes the entry under the test hand and advances the hand by
// one. A test entry is removed from the shard and freed, and coldTarget is
// reduced by its size (but not below zero). Entries of other types are skipped
// over.
func (c *shard) runHandTest() {
	if c.sizeCold > 0 && c.handTest == c.handCold && c.handCold != nil {
		// sizeCold is > 0, so countCold must be non-zero as well; panic if
		// it is not. See the comment above count{Hot,Cold,Test}.
		if c.countCold == 0 {
			panic(fmt.Sprintf("pebble: mismatch %d cold size, %d cold count", c.sizeCold, c.countCold))
		}

		// The test hand has caught up with the cold hand: run the cold hand
		// first. That may empty the shard, leaving nothing to do.
		c.runHandCold(c.countCold, c.sizeCold)
		if c.handTest == nil {
			return
		}
	}

	e := c.handTest
	if e.ptype == etTest {
		c.sizeTest -= e.size
		c.countTest--
		c.coldTarget -= e.size
		if c.coldTarget < 0 {
			c.coldTarget = 0
		}
		c.metaDel(e).release()
		c.metaCheck(e)
		e.free()
	}

	c.handTest = c.handTest.next()
}
   614  
// Metrics holds metrics for the cache. Cache.Metrics fills it in by summing
// the per-shard figures.
type Metrics struct {
	// The number of bytes inuse by the cache.
	Size int64
	// The count of objects (blocks or tables) in the cache.
	Count int64
	// The number of cache hits.
	Hits int64
	// The number of cache misses.
	Misses int64
}
   626  
// Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is
// used for page replacement
// (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In
// order to provide better concurrency, 4 x NumCPUs shards are created, with
// each shard being given 1/n of the target cache size. The Clock-PRO algorithm
// is run independently on each shard. (New falls back to 4 shards when the
// per-shard size would otherwise be too small.)
//
// Blocks are keyed by an (id, fileNum, offset) triple. The ID is a namespace
// for file numbers and allows a single Cache to be shared between multiple
// Pebble instances. The fileNum and offset refer to an sstable file number and
// the offset of the block within the file. Because sstables are immutable and
// file numbers are never reused, (fileNum,offset) are unique for the lifetime
// of a Pebble instance.
//
// In addition to maintaining a map from (fileNum,offset) to data, each shard
// maintains a map of the cached blocks for a particular fileNum. This allows
// efficient eviction of all of the blocks for a file which is used when an
// sstable is deleted from disk.
//
// # Memory Management
//
// In order to reduce pressure on the Go GC, manual memory management is
// performed for the data stored in the cache. Manual memory management is
// performed by calling into C.{malloc,free} to allocate memory. Cache.Values
// are reference counted and the memory backing a manual value is freed when
// the reference count drops to 0.
//
// Manual memory management brings the possibility of memory leaks. It is
// imperative that every Handle returned by Cache.{Get,Set} is eventually
// released. The "invariants" build tag enables a leak detection facility that
// places a GC finalizer on cache.Value. When the cache.Value finalizer is run,
// if the underlying buffer is still present a leak has occurred. The "tracing"
// build tag enables tracing of cache.Value reference count manipulation and
// eases finding where a leak has occurred. These two facilities are usually
// used in combination by specifying `-tags invariants,tracing`. Note that
// "tracing" produces a significant slowdown, while "invariants" does not.
type Cache struct {
	// refs is the cache's reference count; it starts at 1 and is adjusted by
	// Ref and Unref, with the shards being freed when it reaches zero.
	// maxSize is the size the cache was created with. idAlloc is the counter
	// behind NewID and starts at 1. shards holds the independently locked
	// partitions that getShard selects from.
	refs    atomic.Int64
	maxSize int64
	idAlloc atomic.Uint64
	shards  []shard

	// Traces recorded by Cache.trace. Used for debugging.
	tr struct {
		sync.Mutex
		msgs []string
	}
}
   675  
   676  // New creates a new cache of the specified size. Memory for the cache is
   677  // allocated on demand, not during initialization. The cache is created with a
   678  // reference count of 1. Each DB it is associated with adds a reference, so the
   679  // creator of the cache should usually release their reference after the DB is
   680  // created.
   681  //
   682  //	c := cache.New(...)
   683  //	defer c.Unref()
   684  //	d, err := pebble.Open(pebble.Options{Cache: c})
   685  func New(size int64) *Cache {
   686  	// How many cache shards should we create?
   687  	//
   688  	// Note that the probability two processors will try to access the same
   689  	// shard at the same time increases superlinearly with the number of
   690  	// processors (Eg, consider the brithday problem where each CPU is a person,
   691  	// and each shard is a possible birthday).
   692  	//
   693  	// We could consider growing the number of shards superlinearly, but
   694  	// increasing the shard count may reduce the effectiveness of the caching
   695  	// algorithm if frequently-accessed blocks are insufficiently distributed
   696  	// across shards. If a shard's size is smaller than a single frequently
   697  	// scanned sstable, then the shard will be unable to hold the entire
   698  	// frequently-scanned table in memory despite other shards still holding
   699  	// infrequently accessed blocks.
   700  	//
   701  	// Experimentally, we've observed contention contributing to tail latencies
   702  	// at 2 shards per processor. For now we use 4 shards per processor,
   703  	// recognizing this may not be final word.
   704  	m := 4 * runtime.GOMAXPROCS(0)
   705  
   706  	// In tests we can use large CPU machines with small cache sizes and have
   707  	// many caches in existence at a time. If sharding into m shards would
   708  	// produce too small shards, constrain the number of shards to 4.
   709  	const minimumShardSize = 4 << 20 // 4 MiB
   710  	if m > 4 && int(size)/m < minimumShardSize {
   711  		m = 4
   712  	}
   713  	return newShards(size, m)
   714  }
   715  
   716  func newShards(size int64, shards int) *Cache {
   717  	c := &Cache{
   718  		maxSize: size,
   719  		shards:  make([]shard, shards),
   720  	}
   721  	c.refs.Store(1)
   722  	c.idAlloc.Store(1)
   723  	c.trace("alloc", c.refs.Load())
   724  	for i := range c.shards {
   725  		c.shards[i] = shard{
   726  			maxSize:    size / int64(len(c.shards)),
   727  			coldTarget: size / int64(len(c.shards)),
   728  		}
   729  		if entriesGoAllocated {
   730  			c.shards[i].entries = make(map[*entry]struct{})
   731  		}
   732  		c.shards[i].blocks.init(16)
   733  		c.shards[i].files.init(16)
   734  	}
   735  
   736  	// Note: this is a no-op if invariants are disabled or race is enabled.
   737  	invariants.SetFinalizer(c, func(obj interface{}) {
   738  		c := obj.(*Cache)
   739  		if v := c.refs.Load(); v != 0 {
   740  			c.tr.Lock()
   741  			fmt.Fprintf(os.Stderr,
   742  				"pebble: cache (%p) has non-zero reference count: %d\n", c, v)
   743  			if len(c.tr.msgs) > 0 {
   744  				fmt.Fprintf(os.Stderr, "%s\n", strings.Join(c.tr.msgs, "\n"))
   745  			}
   746  			c.tr.Unlock()
   747  			os.Exit(1)
   748  		}
   749  	})
   750  	return c
   751  }
   752  
   753  func (c *Cache) getShard(id uint64, fileNum base.DiskFileNum, offset uint64) *shard {
   754  	if id == 0 {
   755  		panic("pebble: 0 cache ID is invalid")
   756  	}
   757  
   758  	// Inlined version of fnv.New64 + Write.
   759  	const offset64 = 14695981039346656037
   760  	const prime64 = 1099511628211
   761  
   762  	h := uint64(offset64)
   763  	for i := 0; i < 8; i++ {
   764  		h *= prime64
   765  		h ^= uint64(id & 0xff)
   766  		id >>= 8
   767  	}
   768  	fileNumVal := uint64(fileNum.FileNum())
   769  	for i := 0; i < 8; i++ {
   770  		h *= prime64
   771  		h ^= uint64(fileNumVal) & 0xff
   772  		fileNumVal >>= 8
   773  	}
   774  	for i := 0; i < 8; i++ {
   775  		h *= prime64
   776  		h ^= uint64(offset & 0xff)
   777  		offset >>= 8
   778  	}
   779  
   780  	return &c.shards[h%uint64(len(c.shards))]
   781  }
   782  
   783  // Ref adds a reference to the cache. The cache only remains valid as long a
   784  // reference is maintained to it.
   785  func (c *Cache) Ref() {
   786  	v := c.refs.Add(1)
   787  	if v <= 1 {
   788  		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
   789  	}
   790  	c.trace("ref", v)
   791  }
   792  
   793  // Unref releases a reference on the cache.
   794  func (c *Cache) Unref() {
   795  	v := c.refs.Add(-1)
   796  	c.trace("unref", v)
   797  	switch {
   798  	case v < 0:
   799  		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
   800  	case v == 0:
   801  		for i := range c.shards {
   802  			c.shards[i].Free()
   803  		}
   804  	}
   805  }
   806  
   807  // Get retrieves the cache value for the specified file and offset, returning
   808  // nil if no value is present.
   809  func (c *Cache) Get(id uint64, fileNum base.DiskFileNum, offset uint64) Handle {
   810  	return c.getShard(id, fileNum, offset).Get(id, fileNum, offset)
   811  }
   812  
   813  // Set sets the cache value for the specified file and offset, overwriting an
   814  // existing value if present. A Handle is returned which provides faster
   815  // retrieval of the cached value than Get (lock-free and avoidance of the map
   816  // lookup). The value must have been allocated by Cache.Alloc.
   817  func (c *Cache) Set(id uint64, fileNum base.DiskFileNum, offset uint64, value *Value) Handle {
   818  	return c.getShard(id, fileNum, offset).Set(id, fileNum, offset, value)
   819  }
   820  
   821  // Delete deletes the cached value for the specified file and offset.
   822  func (c *Cache) Delete(id uint64, fileNum base.DiskFileNum, offset uint64) {
   823  	c.getShard(id, fileNum, offset).Delete(id, fileNum, offset)
   824  }
   825  
   826  // EvictFile evicts all of the cache values for the specified file.
   827  func (c *Cache) EvictFile(id uint64, fileNum base.DiskFileNum) {
   828  	if id == 0 {
   829  		panic("pebble: 0 cache ID is invalid")
   830  	}
   831  	for i := range c.shards {
   832  		c.shards[i].EvictFile(id, fileNum)
   833  	}
   834  }
   835  
// MaxSize returns the max size of the cache, i.e. the size the cache was
// created with. Reservations made through Reserve are not reflected in it.
func (c *Cache) MaxSize() int64 {
	return c.maxSize
}
   840  
   841  // Size returns the current space used by the cache.
   842  func (c *Cache) Size() int64 {
   843  	var size int64
   844  	for i := range c.shards {
   845  		size += c.shards[i].Size()
   846  	}
   847  	return size
   848  }
   849  
// Alloc allocates a byte slice of the specified size, possibly reusing
// previously allocated but unused memory. The memory backing the value is
// manually managed. The caller MUST either add the value to the cache (via
// Cache.Set), or release the value (via the package-level Free function).
// Failure to do so will result in a memory leak.
func Alloc(n int) *Value {
	return newValue(n)
}
   858  
   859  // Free frees the specified value. The buffer associated with the value will
   860  // possibly be reused, making it invalid to use the buffer after calling
   861  // Free. Do not call Free on a value that has been added to the cache.
   862  func Free(v *Value) {
   863  	if n := v.refs(); n > 1 {
   864  		panic(fmt.Sprintf("pebble: Value has been added to the cache: refs=%d", n))
   865  	}
   866  	v.release()
   867  }
   868  
   869  // Reserve N bytes in the cache. This effectively shrinks the size of the cache
   870  // by N bytes, without actually consuming any memory. The returned closure
   871  // should be invoked to release the reservation.
   872  func (c *Cache) Reserve(n int) func() {
   873  	// Round-up the per-shard reservation. Most reservations should be large, so
   874  	// this probably doesn't matter in practice.
   875  	shardN := (n + len(c.shards) - 1) / len(c.shards)
   876  	for i := range c.shards {
   877  		c.shards[i].Reserve(shardN)
   878  	}
   879  	return func() {
   880  		if shardN == -1 {
   881  			panic("pebble: cache reservation already released")
   882  		}
   883  		for i := range c.shards {
   884  			c.shards[i].Reserve(-shardN)
   885  		}
   886  		shardN = -1
   887  	}
   888  }
   889  
   890  // Metrics returns the metrics for the cache.
   891  func (c *Cache) Metrics() Metrics {
   892  	var m Metrics
   893  	for i := range c.shards {
   894  		s := &c.shards[i]
   895  		s.mu.RLock()
   896  		m.Count += int64(s.blocks.Count())
   897  		m.Size += s.sizeHot + s.sizeCold
   898  		s.mu.RUnlock()
   899  		m.Hits += s.hits.Load()
   900  		m.Misses += s.misses.Load()
   901  	}
   902  	return m
   903  }
   904  
// NewID returns a new ID to be used as a namespace for cached file
// blocks. IDs come from an atomically incremented counter, so the call is
// safe for concurrent use; 0 is not handed out by the counter as initialized
// in newShards, and 0 is rejected as an invalid ID by getShard and EvictFile.
func (c *Cache) NewID() uint64 {
	return c.idAlloc.Add(1)
}