github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/table_cache.go

     1  // Copyright 2020 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"fmt"
    11  	"io"
    12  	"runtime/debug"
    13  	"runtime/pprof"
    14  	"sync"
    15  	"sync/atomic"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/errors"
    19  	"github.com/zuoyebang/bitalostable/internal/base"
    20  	"github.com/zuoyebang/bitalostable/internal/invariants"
    21  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    22  	"github.com/zuoyebang/bitalostable/internal/manifest"
    23  	"github.com/zuoyebang/bitalostable/internal/private"
    24  	"github.com/zuoyebang/bitalostable/sstable"
    25  	"github.com/zuoyebang/bitalostable/vfs"
    26  )
    27  
    28  var emptyIter = &errorIter{err: nil}
    29  var emptyKeyspanIter = &errorKeyspanIter{err: nil}
    30  
    31  // filteredAll is a singleton internalIterator implementation used when an
    32  // sstable does contain point keys, but all the keys are filtered by the active
    33  // PointKeyFilters set in the iterator's IterOptions.
    34  //
    35  // filteredAll implements filteredIter, ensuring the level iterator recognizes
    36  // when it may need to return file boundaries to keep the rangeDelIter open
    37  // during mergingIter operation.
    38  var filteredAll = &filteredAllKeysIter{errorIter: errorIter{err: nil}}
    39  
    40  var _ filteredIter = filteredAll
    41  
    42  type filteredAllKeysIter struct {
    43  	errorIter
    44  }
    45  
    46  func (s *filteredAllKeysIter) MaybeFilteredKeys() bool {
    47  	return true
    48  }
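
// In practice, when every point key in a file is excluded by the block
// property filters, newIters below still returns filteredAll together with
// the file's (possibly non-empty) rangeDelIter; MaybeFilteredKeys returning
// true tells the level iterator that the emptiness is the result of
// filtering, so it keeps surfacing the file's boundaries while the range
// deletions may still apply.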
    49  
    50  var tableCacheLabels = pprof.Labels("bitalostable", "table-cache")
    51  
    52  // tableCacheOpts contains the db specific fields
    53  // of a table cache. This is stored in the tableCacheContainer
    54  // along with the table cache.
    55  // NB: It is important to make sure that the fields in this
    56  // struct are read-only. Since the fields here are shared
    57  // by every single tableCacheShard, updating a non-read-only field
    58  // could cause unnecessary evictions of that field, and of the
    59  // surrounding fields, from the CPU caches.
    60  type tableCacheOpts struct {
    61  	atomic struct {
    62  		// iterCount tracks the number of iterators currently open for
    63  		// this DB. It's used to detect leaked iterators at a per-DB
    64  		// level.
    65  		iterCount *int32
    66  	}
    67  
    68  	logger        Logger
    69  	cacheID       uint64
    70  	dirname       string
    71  	fs            vfs.FS
    72  	opts          sstable.ReaderOptions
    73  	filterMetrics *FilterMetrics
    74  }
    75  
    76  // tableCacheContainer contains the table cache and
    77  // fields which are unique to the DB.
    78  type tableCacheContainer struct {
    79  	tableCache *TableCache
    80  
    81  	// dbOpts contains fields relevant to the table cache
    82  	// which are unique to each DB.
    83  	dbOpts tableCacheOpts
    84  }
    85  
    86  // newTableCacheContainer will panic if the underlying cache in the table cache
    87  // doesn't match Options.Cache.
    88  func newTableCacheContainer(
    89  	tc *TableCache, cacheID uint64, dirname string, fs vfs.FS, opts *Options, size int,
    90  ) *tableCacheContainer {
    91  	// We will release a ref to the table cache acquired here when tableCacheContainer.close is called.
    92  	if tc != nil {
    93  		if tc.cache != opts.Cache {
    94  			panic("bitalostable: underlying cache for the table cache and db are different")
    95  		}
    96  		tc.Ref()
    97  	} else {
    98  		// NewTableCache should create a ref to tc which the container should
    99  		// drop whenever it is closed.
   100  		tc = NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, size)
   101  	}
   102  
   103  	t := &tableCacheContainer{}
   104  	t.tableCache = tc
   105  	t.dbOpts.logger = opts.Logger
   106  	t.dbOpts.cacheID = cacheID
   107  	t.dbOpts.dirname = dirname
   108  	t.dbOpts.fs = fs
   109  	t.dbOpts.opts = opts.MakeReaderOptions()
   110  	t.dbOpts.filterMetrics = &FilterMetrics{}
   111  	t.dbOpts.atomic.iterCount = new(int32)
   112  	return t
   113  }
   114  
   115  // Before calling close, make sure that there will be no further need
   116  // to access any of the files associated with the store.
   117  func (c *tableCacheContainer) close() error {
   118  	// We want to do some cleanup work here. Check for leaked iterators
   119  	// by the DB using this container. Note that we'll still perform cleanup
   120  	// below in the case that there are leaked iterators.
   121  	var err error
   122  	if v := atomic.LoadInt32(c.dbOpts.atomic.iterCount); v > 0 {
   123  		err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
   124  	}
   125  
   126  	// Release nodes here.
   127  	for _, shard := range c.tableCache.shards {
   128  		if shard != nil {
   129  			shard.removeDB(&c.dbOpts)
   130  		}
   131  	}
   132  	return firstError(err, c.tableCache.Unref())
   133  }
   134  
   135  func (c *tableCacheContainer) newIters(
   136  	file *manifest.FileMetadata, opts *IterOptions, internalOpts internalIterOpts,
   137  ) (internalIterator, keyspan.FragmentIterator, error) {
   138  	return c.tableCache.getShard(file.FileNum).newIters(file, opts, internalOpts, &c.dbOpts)
   139  }
   140  
   141  func (c *tableCacheContainer) newRangeKeyIter(
   142  	file *manifest.FileMetadata, opts *keyspan.SpanIterOptions,
   143  ) (keyspan.FragmentIterator, error) {
   144  	return c.tableCache.getShard(file.FileNum).newRangeKeyIter(file, opts, &c.dbOpts)
   145  }
   146  
   147  func (c *tableCacheContainer) getTableProperties(file *fileMetadata) (*sstable.Properties, error) {
   148  	return c.tableCache.getShard(file.FileNum).getTableProperties(file, &c.dbOpts)
   149  }
   150  
   151  func (c *tableCacheContainer) evict(fileNum FileNum) {
   152  	c.tableCache.getShard(fileNum).evict(fileNum, &c.dbOpts, false)
   153  }
   154  
   155  func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) {
   156  	var m CacheMetrics
   157  	for i := range c.tableCache.shards {
   158  		s := c.tableCache.shards[i]
   159  		s.mu.RLock()
   160  		m.Count += int64(len(s.mu.nodes))
   161  		s.mu.RUnlock()
   162  		m.Hits += atomic.LoadInt64(&s.atomic.hits)
   163  		m.Misses += atomic.LoadInt64(&s.atomic.misses)
   164  	}
   165  	m.Size = m.Count * int64(unsafe.Sizeof(sstable.Reader{}))
   166  	f := FilterMetrics{
   167  		Hits:   atomic.LoadInt64(&c.dbOpts.filterMetrics.Hits),
   168  		Misses: atomic.LoadInt64(&c.dbOpts.filterMetrics.Misses),
   169  	}
   170  	return m, f
   171  }
   172  
   173  func (c *tableCacheContainer) withReader(meta *fileMetadata, fn func(*sstable.Reader) error) error {
   174  	s := c.tableCache.getShard(meta.FileNum)
   175  	v := s.findNode(meta, &c.dbOpts)
   176  	defer s.unrefValue(v)
   177  	if v.err != nil {
   178  		base.MustExist(c.dbOpts.fs, v.filename, c.dbOpts.logger, v.err)
   179  		return v.err
   180  	}
   181  	return fn(v.reader)
   182  }
   183  
   184  func (c *tableCacheContainer) iterCount() int64 {
   185  	return int64(atomic.LoadInt32(c.dbOpts.atomic.iterCount))
   186  }
   187  
   188  // TableCache is a shareable cache for open sstables.
   189  type TableCache struct {
   190  	// atomic contains fields which are accessed atomically. Go allocations
   191  	// are guaranteed to be 64-bit aligned which we take advantage of by
   192  	// placing the 64-bit fields which we access atomically at the beginning
   193  	// of the TableCache struct. For more information, see
   194  	// https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
   195  	atomic struct {
   196  		refs int64
   197  	}
   198  
   199  	cache  *Cache
   200  	shards []*tableCacheShard
   201  }
   202  
   203  // Ref adds a reference to the table cache. Once NewTableCache returns,
   204  // the table cache only remains valid if there is at least one reference
   205  // to it.
   206  func (c *TableCache) Ref() {
   207  	v := atomic.AddInt64(&c.atomic.refs, 1)
   208  	// We don't want the reference count to ever go from 0 -> 1,
   209  	// because a reference count of 0 implies that we've closed the cache.
   210  	if v <= 1 {
   211  		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
   212  	}
   213  }
   214  
   215  // Unref removes a reference to the table cache.
   216  func (c *TableCache) Unref() error {
   217  	v := atomic.AddInt64(&c.atomic.refs, -1)
   218  	switch {
   219  	case v < 0:
   220  		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
   221  	case v == 0:
   222  		var err error
   223  		for i := range c.shards {
   224  			// The cache shard is not allocated yet, nothing to close
   225  			if c.shards[i] == nil {
   226  				continue
   227  			}
   228  			err = firstError(err, c.shards[i].Close())
   229  		}
   230  
   231  		// Unref the cache, which we created a reference to when the TableCache
   232  		// was first instantiated.
   233  		c.cache.Unref()
   234  		return err
   235  	}
   236  	return nil
   237  }
   238  
   239  // NewTableCache will create a reference to the table cache. It is the caller's responsibility
   240  // to call TableCache.Unref once they no longer hold a reference to the table cache.
   241  func NewTableCache(cache *Cache, numShards int, size int) *TableCache {
   242  	if size == 0 {
   243  		panic("bitalostable: cannot create a table cache of size 0")
   244  	} else if numShards == 0 {
   245  		panic("bitalostable: cannot create a table cache with 0 shards")
   246  	}
   247  
   248  	c := &TableCache{}
   249  	c.cache = cache
   250  	c.cache.Ref()
   251  
   252  	c.shards = make([]*tableCacheShard, numShards)
   253  	for i := range c.shards {
   254  		c.shards[i] = &tableCacheShard{}
   255  		c.shards[i].init(size / len(c.shards))
   256  	}
   257  
   258  	// Hold a ref to the cache here.
   259  	c.atomic.refs = 1
   260  
   261  	return c
   262  }
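
// A minimal sketch of the reference-counting contract, assuming a
// pebble-style NewCache constructor for the block cache (sizes and shard
// counts below are illustrative only). Note that newTableCacheContainer
// above takes its own reference when a DB shares the table cache, and drops
// it in close:
//
//	blockCache := NewCache(128 << 20)         // assumed constructor
//	tc := NewTableCache(blockCache, 16, 1000) // tc now holds one reference
//
//	tc.Ref()       // an additional holder (e.g. another DB) takes a reference
//	_ = tc.Unref() // ...and releases it when done
//
//	_ = tc.Unref()     // dropping the creator's reference closes the shards
//	blockCache.Unref() // drop our block-cache ref (assumed taken by NewCache)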
   263  
   264  func (c *TableCache) getShard(fileNum FileNum) *tableCacheShard {
   265  	return c.shards[uint64(fileNum)%uint64(len(c.shards))]
   266  }
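
// For example, with len(c.shards) == 4, file numbers map to shards by
// simple modulo arithmetic: FileNum 8 -> shards[0], 9 -> shards[1],
// 10 -> shards[2], 11 -> shards[3], 12 -> shards[0], and so on.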
   267  
   268  type tableCacheKey struct {
   269  	cacheID uint64
   270  	fileNum FileNum
   271  }
   272  
   273  type tableCacheShard struct {
   274  	// WARNING: The following struct `atomic` contains fields which are accessed atomically.
   275  	//
   276  	// Go allocations are guaranteed to be 64-bit aligned which we take advantage
   277  	// of by placing the 64-bit fields which we access atomically at the beginning
   278  	// of the tableCacheShard struct. For more information, see https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
   279  	atomic struct {
   280  		hits      int64
   281  		misses    int64
   282  		iterCount int32
   283  	}
   284  
   285  	size int
   286  
   287  	mu struct {
   288  		sync.RWMutex
   289  		nodes map[tableCacheKey]*tableCacheNode
   290  		// The iters map is only created and populated in race builds.
   291  		iters map[io.Closer][]byte
   292  
   293  		handHot  *tableCacheNode
   294  		handCold *tableCacheNode
   295  		handTest *tableCacheNode
   296  
   297  		coldTarget int
   298  		sizeHot    int
   299  		sizeCold   int
   300  		sizeTest   int
   301  	}
   302  	releasing       sync.WaitGroup
   303  	releasingCh     chan *tableCacheValue
   304  	releaseLoopExit sync.WaitGroup
   305  }
   306  
   307  func (c *tableCacheShard) init(size int) {
   308  	c.size = size
   309  
   310  	c.mu.nodes = make(map[tableCacheKey]*tableCacheNode)
   311  	c.mu.coldTarget = size
   312  	c.releasingCh = make(chan *tableCacheValue, 100)
   313  	c.releaseLoopExit.Add(1)
   314  	go c.releaseLoop()
   315  
   316  	if invariants.RaceEnabled {
   317  		c.mu.iters = make(map[io.Closer][]byte)
   318  	}
   319  }
   320  
   321  func (c *tableCacheShard) releaseLoop() {
   322  	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
   323  		defer c.releaseLoopExit.Done()
   324  		for v := range c.releasingCh {
   325  			v.release(c)
   326  		}
   327  	})
   328  }
   329  
   330  // checkAndIntersectFilters checks the specific table and block property filters
   331  // for intersection with any available table and block-level properties. Returns
   332  // true for ok if this table should be read by this iterator.
   333  func (c *tableCacheShard) checkAndIntersectFilters(
   334  	v *tableCacheValue,
   335  	tableFilter func(userProps map[string]string) bool,
   336  	blockPropertyFilters []BlockPropertyFilter,
   337  	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter,
   338  ) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) {
   339  	if tableFilter != nil &&
   340  		!tableFilter(v.reader.Properties.UserProperties) {
   341  		return false, nil, nil
   342  	}
   343  
   344  	if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 {
   345  		filterer = sstable.NewBlockPropertiesFilterer(blockPropertyFilters, boundLimitedFilter)
   346  		intersects, err :=
   347  			filterer.IntersectsUserPropsAndFinishInit(v.reader.Properties.UserProperties)
   348  		if err != nil {
   349  			return false, nil, err
   350  		}
   351  		if !intersects {
   352  			return false, nil, nil
   353  		}
   354  	}
   355  	return true, filterer, nil
   356  }
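
// A small sketch of the kind of table filter consulted above; the user
// property name "example.min-ts" is purely illustrative and assumes a
// matching block-property collector recorded it at table-build time:
//
//	opts := &IterOptions{
//		TableFilter: func(userProps map[string]string) bool {
//			// Read the table only if its recorded property doesn't rule it out.
//			v, ok := userProps["example.min-ts"]
//			return !ok || v >= "100"
//		},
//	}
//	_ = opts // handed to newIters below via the iterator options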
   357  
   358  func (c *tableCacheShard) newIters(
   359  	file *manifest.FileMetadata,
   360  	opts *IterOptions,
   361  	internalOpts internalIterOpts,
   362  	dbOpts *tableCacheOpts,
   363  ) (internalIterator, keyspan.FragmentIterator, error) {
   364  	// Calling findNode gives us the responsibility of decrementing v's
   365  	// refCount. If opening the underlying table resulted in error, then we
   366  	// decrement this straight away. Otherwise, we pass that responsibility to
   367  	// the sstable iterator, which decrements when it is closed.
   368  	v := c.findNode(file, dbOpts)
   369  	if v.err != nil {
   370  		defer c.unrefValue(v)
   371  		base.MustExist(dbOpts.fs, v.filename, dbOpts.logger, v.err)
   372  		return nil, nil, v.err
   373  	}
   374  
   375  	ok := true
   376  	var filterer *sstable.BlockPropertiesFilterer
   377  	var err error
   378  	if opts != nil {
   379  		ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter,
   380  			opts.PointKeyFilters, internalOpts.boundLimitedFilter)
   381  	}
   382  	if err != nil {
   383  		c.unrefValue(v)
   384  		return nil, nil, err
   385  	}
   386  
   387  	// NB: range-del iterator does not maintain a reference to the table, nor
   388  	// does it need to read from it after creation.
   389  	rangeDelIter, err := v.reader.NewRawRangeDelIter()
   390  	if err != nil {
   391  		c.unrefValue(v)
   392  		return nil, nil, err
   393  	}
   394  
   395  	if !ok {
   396  		c.unrefValue(v)
   397  		// Return an empty iterator. This iterator has no mutable state, so
   398  		// using a singleton is fine.
   399  		// NB: We still return the potentially non-empty rangeDelIter. This
   400  		// ensures the iterator observes the file's range deletions even if the
   401  		// block property filters exclude all the file's point keys. The range
   402  		// deletions may still delete keys lower in the LSM in files that DO
   403  		// match the active filters.
   404  		//
   405  		// The point iterator returned must implement the filteredIter
   406  		// interface, so that the level iterator surfaces file boundaries when
   407  		// range deletions are present.
   408  		return filteredAll, rangeDelIter, err
   409  	}
   410  
   411  	var iter sstable.Iterator
   412  	useFilter := true
   413  	if opts != nil {
   414  		useFilter = manifest.LevelToInt(opts.level) != 6 || opts.UseL6Filters
   415  	}
   416  	if internalOpts.bytesIterated != nil {
   417  		iter, err = v.reader.NewCompactionIter(internalOpts.bytesIterated)
   418  	} else {
   419  		iter, err = v.reader.NewIterWithBlockPropertyFilters(
   420  			opts.GetLowerBound(), opts.GetUpperBound(), filterer, useFilter, internalOpts.stats)
   421  	}
   422  	if err != nil {
   423  		if rangeDelIter != nil {
   424  			_ = rangeDelIter.Close()
   425  		}
   426  		c.unrefValue(v)
   427  		return nil, nil, err
   428  	}
   429  	// NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take
   430  	// care to avoid introducing an allocation here by adding a closure.
   431  	iter.SetCloseHook(v.closeHook)
   432  
   433  	atomic.AddInt32(&c.atomic.iterCount, 1)
   434  	atomic.AddInt32(dbOpts.atomic.iterCount, 1)
   435  	if invariants.RaceEnabled {
   436  		c.mu.Lock()
   437  		c.mu.iters[iter] = debug.Stack()
   438  		c.mu.Unlock()
   439  	}
   440  	return iter, rangeDelIter, nil
   441  }
   442  
   443  func (c *tableCacheShard) newRangeKeyIter(
   444  	file *manifest.FileMetadata, opts *keyspan.SpanIterOptions, dbOpts *tableCacheOpts,
   445  ) (keyspan.FragmentIterator, error) {
   446  	// Calling findNode gives us the responsibility of decrementing v's
   447  	// refCount. If opening the underlying table resulted in error, then we
   448  	// decrement this straight away. Otherwise, we pass that responsibility to
   449  	// the sstable iterator, which decrements when it is closed.
   450  	v := c.findNode(file, dbOpts)
   451  	if v.err != nil {
   452  		defer c.unrefValue(v)
   453  		base.MustExist(dbOpts.fs, v.filename, dbOpts.logger, v.err)
   454  		return nil, v.err
   455  	}
   456  
   457  	ok := true
   458  	var err error
   459  	// Don't filter a table's range keys if the file contains RANGEKEYDELs.
   460  	// The RANGEKEYDELs may delete range keys in other levels. Skipping the
   461  	// file's range key blocks may surface deleted range keys below. This is
   462  	// done here, rather than deferring to the block-property collector in order
   463  	// to maintain parity with point keys and the treatment of RANGEDELs.
   464  	if opts != nil && v.reader.Properties.NumRangeKeyDels == 0 {
   465  		ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil)
   466  	}
   467  	if err != nil {
   468  		c.unrefValue(v)
   469  		return nil, err
   470  	}
   471  	if !ok {
   472  		c.unrefValue(v)
   473  		// Return the empty iterator. This iterator has no mutable state, so
   474  		// using a singleton is fine.
   475  		return emptyKeyspanIter, err
   476  	}
   477  
   478  	var iter keyspan.FragmentIterator
   479  	iter, err = v.reader.NewRawRangeKeyIter()
   480  	// iter is a block iter that holds the entire value of the block in memory.
   481  	// No need to hold onto a ref of the cache value.
   482  	c.unrefValue(v)
   483  
   484  	if err != nil || iter == nil {
   485  		return nil, err
   486  	}
   487  
   488  	return iter, nil
   489  }
   490  
   491  // getTableProperties returns the sstable properties for the target file.
   492  func (c *tableCacheShard) getTableProperties(
   493  	file *fileMetadata, dbOpts *tableCacheOpts,
   494  ) (*sstable.Properties, error) {
   495  	// Calling findNode gives us the responsibility of decrementing v's refCount here.
   496  	v := c.findNode(file, dbOpts)
   497  	defer c.unrefValue(v)
   498  
   499  	if v.err != nil {
   500  		return nil, v.err
   501  	}
   502  	return &v.reader.Properties, nil
   503  }
   504  
   505  // releaseNode releases a node from the tableCacheShard.
   506  //
   507  // c.mu must be held when calling this.
   508  func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
   509  	c.unlinkNode(n)
   510  	c.clearNode(n)
   511  }
   512  
   513  // unlinkNode removes a node from the tableCacheShard, leaving the shard
   514  // reference in place.
   515  //
   516  // c.mu must be held when calling this.
   517  func (c *tableCacheShard) unlinkNode(n *tableCacheNode) {
   518  	key := tableCacheKey{n.cacheID, n.meta.FileNum}
   519  	delete(c.mu.nodes, key)
   520  
   521  	switch n.ptype {
   522  	case tableCacheNodeHot:
   523  		c.mu.sizeHot--
   524  	case tableCacheNodeCold:
   525  		c.mu.sizeCold--
   526  	case tableCacheNodeTest:
   527  		c.mu.sizeTest--
   528  	}
   529  
   530  	if n == c.mu.handHot {
   531  		c.mu.handHot = c.mu.handHot.prev()
   532  	}
   533  	if n == c.mu.handCold {
   534  		c.mu.handCold = c.mu.handCold.prev()
   535  	}
   536  	if n == c.mu.handTest {
   537  		c.mu.handTest = c.mu.handTest.prev()
   538  	}
   539  
   540  	if n.unlink() == n {
   541  		// This was the last entry in the cache.
   542  		c.mu.handHot = nil
   543  		c.mu.handCold = nil
   544  		c.mu.handTest = nil
   545  	}
   546  
   547  	n.links.prev = nil
   548  	n.links.next = nil
   549  }
   550  
   551  func (c *tableCacheShard) clearNode(n *tableCacheNode) {
   552  	if v := n.value; v != nil {
   553  		n.value = nil
   554  		c.unrefValue(v)
   555  	}
   556  }
   557  
   558  // unrefValue decrements the reference count for the specified value, releasing
   559  // it if the reference count fell to 0. Note that the value has a reference if
   560  // it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
   561  // the node has already been removed from that map.
   562  func (c *tableCacheShard) unrefValue(v *tableCacheValue) {
   563  	if atomic.AddInt32(&v.refCount, -1) == 0 {
   564  		c.releasing.Add(1)
   565  		c.releasingCh <- v
   566  	}
   567  }
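
// Putting findNode and unrefValue together, the reference-count protocol for
// a value looks like this (a freshly loaded value starts with refCount == 2:
// one reference held by the cache node, one by the caller of findNode):
//
//	v := c.findNode(meta, dbOpts) // +1 for the caller
//	if v.err == nil {
//		_ = v.reader // use the open sstable.Reader
//	}
//	c.unrefValue(v) // drop the caller's reference; once the count reaches 0
//	                // the value is sent to releasingCh and its reader is
//	                // closed by releaseLoop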
   568  
   569  // findNode returns the node for the table with the given file number, creating
   570  // that node if it didn't already exist. The caller is responsible for
   571  // decrementing the returned node's refCount.
   572  func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) *tableCacheValue {
   573  	// Fast-path for a hit in the cache.
   574  	c.mu.RLock()
   575  	key := tableCacheKey{dbOpts.cacheID, meta.FileNum}
   576  	if n := c.mu.nodes[key]; n != nil && n.value != nil {
   577  		// Fast-path hit.
   578  		//
   579  		// The caller is responsible for decrementing the refCount.
   580  		v := n.value
   581  		atomic.AddInt32(&v.refCount, 1)
   582  		c.mu.RUnlock()
   583  		atomic.StoreInt32(&n.referenced, 1)
   584  		atomic.AddInt64(&c.atomic.hits, 1)
   585  		<-v.loaded
   586  		return v
   587  	}
   588  	c.mu.RUnlock()
   589  
   590  	c.mu.Lock()
   591  
   592  	n := c.mu.nodes[key]
   593  	switch {
   594  	case n == nil:
   595  		// Slow-path miss of a non-existent node.
   596  		n = &tableCacheNode{
   597  			meta:  meta,
   598  			ptype: tableCacheNodeCold,
   599  		}
   600  		c.addNode(n, dbOpts)
   601  		c.mu.sizeCold++
   602  
   603  	case n.value != nil:
   604  		// Slow-path hit of a hot or cold node.
   605  		//
   606  		// The caller is responsible for decrementing the refCount.
   607  		v := n.value
   608  		atomic.AddInt32(&v.refCount, 1)
   609  		atomic.StoreInt32(&n.referenced, 1)
   610  		atomic.AddInt64(&c.atomic.hits, 1)
   611  		c.mu.Unlock()
   612  		<-v.loaded
   613  		return v
   614  
   615  	default:
   616  		// Slow-path miss of a test node.
   617  		c.unlinkNode(n)
   618  		c.mu.coldTarget++
   619  		if c.mu.coldTarget > c.size {
   620  			c.mu.coldTarget = c.size
   621  		}
   622  
   623  		atomic.StoreInt32(&n.referenced, 0)
   624  		n.ptype = tableCacheNodeHot
   625  		c.addNode(n, dbOpts)
   626  		c.mu.sizeHot++
   627  	}
   628  
   629  	atomic.AddInt64(&c.atomic.misses, 1)
   630  
   631  	v := &tableCacheValue{
   632  		loaded:   make(chan struct{}),
   633  		refCount: 2,
   634  	}
   635  	// Cache the closure invoked when an iterator is closed. This avoids an
   636  	// allocation on every call to newIters.
   637  	v.closeHook = func(i sstable.Iterator) error {
   638  		if invariants.RaceEnabled {
   639  			c.mu.Lock()
   640  			delete(c.mu.iters, i)
   641  			c.mu.Unlock()
   642  		}
   643  		c.unrefValue(v)
   644  		atomic.AddInt32(&c.atomic.iterCount, -1)
   645  		atomic.AddInt32(dbOpts.atomic.iterCount, -1)
   646  		return nil
   647  	}
   648  	n.value = v
   649  
   650  	c.mu.Unlock()
   651  
   652  	// Note: adding to the cache lists must complete before we begin loading the
   653  	// table, as a failure during load will result in the node being unlinked.
   654  	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
   655  		v.load(meta, c, dbOpts)
   656  	})
   657  	return v
   658  }
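
// Note on the locking in findNode: the read-locked fast path can race with
// another goroutine installing the same node, so the key is looked up again
// after acquiring the write lock before choosing between the miss/hit/test
// cases. The <-v.loaded receive makes hits wait until the goroutine that
// created the value has finished (or failed) opening the sstable.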
   659  
   660  func (c *tableCacheShard) addNode(n *tableCacheNode, dbOpts *tableCacheOpts) {
   661  	c.evictNodes()
   662  	n.cacheID = dbOpts.cacheID
   663  	key := tableCacheKey{n.cacheID, n.meta.FileNum}
   664  	c.mu.nodes[key] = n
   665  
   666  	n.links.next = n
   667  	n.links.prev = n
   668  	if c.mu.handHot == nil {
   669  		// First element.
   670  		c.mu.handHot = n
   671  		c.mu.handCold = n
   672  		c.mu.handTest = n
   673  	} else {
   674  		c.mu.handHot.link(n)
   675  	}
   676  
   677  	if c.mu.handCold == c.mu.handHot {
   678  		c.mu.handCold = c.mu.handCold.prev()
   679  	}
   680  }
   681  
   682  func (c *tableCacheShard) evictNodes() {
   683  	for c.size <= c.mu.sizeHot+c.mu.sizeCold && c.mu.handCold != nil {
   684  		c.runHandCold()
   685  	}
   686  }
   687  
   688  func (c *tableCacheShard) runHandCold() {
   689  	n := c.mu.handCold
   690  	if n.ptype == tableCacheNodeCold {
   691  		if atomic.LoadInt32(&n.referenced) == 1 {
   692  			atomic.StoreInt32(&n.referenced, 0)
   693  			n.ptype = tableCacheNodeHot
   694  			c.mu.sizeCold--
   695  			c.mu.sizeHot++
   696  		} else {
   697  			c.clearNode(n)
   698  			n.ptype = tableCacheNodeTest
   699  			c.mu.sizeCold--
   700  			c.mu.sizeTest++
   701  			for c.size < c.mu.sizeTest && c.mu.handTest != nil {
   702  				c.runHandTest()
   703  			}
   704  		}
   705  	}
   706  
   707  	c.mu.handCold = c.mu.handCold.next()
   708  
   709  	for c.size-c.mu.coldTarget <= c.mu.sizeHot && c.mu.handHot != nil {
   710  		c.runHandHot()
   711  	}
   712  }
   713  
   714  func (c *tableCacheShard) runHandHot() {
   715  	if c.mu.handHot == c.mu.handTest && c.mu.handTest != nil {
   716  		c.runHandTest()
   717  		if c.mu.handHot == nil {
   718  			return
   719  		}
   720  	}
   721  
   722  	n := c.mu.handHot
   723  	if n.ptype == tableCacheNodeHot {
   724  		if atomic.LoadInt32(&n.referenced) == 1 {
   725  			atomic.StoreInt32(&n.referenced, 0)
   726  		} else {
   727  			n.ptype = tableCacheNodeCold
   728  			c.mu.sizeHot--
   729  			c.mu.sizeCold++
   730  		}
   731  	}
   732  
   733  	c.mu.handHot = c.mu.handHot.next()
   734  }
   735  
   736  func (c *tableCacheShard) runHandTest() {
   737  	if c.mu.sizeCold > 0 && c.mu.handTest == c.mu.handCold && c.mu.handCold != nil {
   738  		c.runHandCold()
   739  		if c.mu.handTest == nil {
   740  			return
   741  		}
   742  	}
   743  
   744  	n := c.mu.handTest
   745  	if n.ptype == tableCacheNodeTest {
   746  		c.mu.coldTarget--
   747  		if c.mu.coldTarget < 0 {
   748  			c.mu.coldTarget = 0
   749  		}
   750  		c.unlinkNode(n)
   751  		c.clearNode(n)
   752  	}
   753  
   754  	c.mu.handTest = c.mu.handTest.next()
   755  }
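
// Taken together, the three hands above implement a CLOCK-Pro style
// replacement policy over the circular list of nodes:
//
//	handCold: promotes referenced cold nodes to hot; otherwise it closes the
//	          node's reader (clearNode) and demotes the node to a test node.
//	handHot:  clears the referenced bit on referenced hot nodes, and demotes
//	          unreferenced hot nodes back to cold.
//	handTest: evicts test nodes and shrinks coldTarget, while a later miss on
//	          a test node (see findNode) grows coldTarget again, adapting the
//	          hot/cold split to the observed access pattern.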
   756  
   757  func (c *tableCacheShard) evict(fileNum FileNum, dbOpts *tableCacheOpts, allowLeak bool) {
   758  	c.mu.Lock()
   759  	key := tableCacheKey{dbOpts.cacheID, fileNum}
   760  	n := c.mu.nodes[key]
   761  	var v *tableCacheValue
   762  	if n != nil {
   763  		// NB: This is equivalent to tableCacheShard.releaseNode(), but we perform
   764  		// the tableCacheValue.release() call synchronously below to ensure the
   765  		// sstable file descriptor is closed before returning. Note that
   766  		// tableCacheShard.releasing needs to be incremented while holding
   767  		// tableCacheShard.mu in order to avoid a race with Close().
   768  		c.unlinkNode(n)
   769  		v = n.value
   770  		if v != nil {
   771  			if !allowLeak {
   772  				if t := atomic.AddInt32(&v.refCount, -1); t != 0 {
   773  					dbOpts.logger.Fatalf("sstable %s: refcount is not zero: %d\n%s", fileNum, t, debug.Stack())
   774  				}
   775  			}
   776  			c.releasing.Add(1)
   777  		}
   778  	}
   779  
   780  	c.mu.Unlock()
   781  
   782  	if v != nil {
   783  		v.release(c)
   784  	}
   785  
   786  	dbOpts.opts.Cache.EvictFile(dbOpts.cacheID, fileNum)
   787  }
   788  
   789  // removeDB evicts any nodes which have a reference to the DB
   790  // associated with dbOpts.cacheID. Make sure that there will
   791  // be no more accesses to the files associated with the DB.
   792  func (c *tableCacheShard) removeDB(dbOpts *tableCacheOpts) {
   793  	var fileNums []base.FileNum
   794  
   795  	c.mu.RLock()
   796  	// Collect the fileNums which need to be cleaned.
   797  	var firstNode *tableCacheNode
   798  	node := c.mu.handHot
   799  	for node != firstNode {
   800  		if firstNode == nil {
   801  			firstNode = node
   802  		}
   803  
   804  		if node.cacheID == dbOpts.cacheID {
   805  			fileNums = append(fileNums, node.meta.FileNum)
   806  		}
   807  		node = node.next()
   808  	}
   809  	c.mu.RUnlock()
   810  
   811  	// Evict all the nodes associated with the DB.
   812  	// This should synchronously close all the files
   813  	// associated with the DB.
   814  	for _, fNum := range fileNums {
   815  		c.evict(fNum, dbOpts, true)
   816  	}
   817  }
   818  
   819  func (c *tableCacheShard) Close() error {
   820  	c.mu.Lock()
   821  	defer c.mu.Unlock()
   822  
   823  	// Check for leaked iterators. Note that we'll still perform cleanup below in
   824  	// the case that there are leaked iterators.
   825  	var err error
   826  	if v := atomic.LoadInt32(&c.atomic.iterCount); v > 0 {
   827  		if !invariants.RaceEnabled {
   828  			err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
   829  		} else {
   830  			var buf bytes.Buffer
   831  			for _, stack := range c.mu.iters {
   832  				fmt.Fprintf(&buf, "%s\n", stack)
   833  			}
   834  			err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String())
   835  		}
   836  	}
   837  
   838  	for c.mu.handHot != nil {
   839  		n := c.mu.handHot
   840  		if n.value != nil {
   841  			if atomic.AddInt32(&n.value.refCount, -1) == 0 {
   842  				c.releasing.Add(1)
   843  				c.releasingCh <- n.value
   844  			}
   845  		}
   846  		c.unlinkNode(n)
   847  	}
   848  	c.mu.nodes = nil
   849  	c.mu.handHot = nil
   850  	c.mu.handCold = nil
   851  	c.mu.handTest = nil
   852  
   853  	// Only shut down the releasing goroutine if there were no leaked
   854  	// iterators. If there were leaked iterators, we leave the goroutine running
   855  	// and the releasingCh open so that a subsequent iterator close can
   856  	// complete. This behavior is used by iterator leak tests. Leaking the
   857  	// goroutine for these tests is less bad than not closing the iterator, which
   858  	// triggers other warnings about block cache handles not being released.
   859  	if err != nil {
   860  		c.releasing.Wait()
   861  		return err
   862  	}
   863  
   864  	close(c.releasingCh)
   865  	c.releasing.Wait()
   866  	c.releaseLoopExit.Wait()
   867  	return err
   868  }
   869  
   870  type tableCacheValue struct {
   871  	closeHook func(i sstable.Iterator) error
   872  	reader    *sstable.Reader
   873  	filename  string
   874  	err       error
   875  	loaded    chan struct{}
   876  	// Reference count for the value. The reader is closed when the reference
   877  	// count drops to zero.
   878  	refCount int32
   879  }
   880  
   881  func (v *tableCacheValue) load(meta *fileMetadata, c *tableCacheShard, dbOpts *tableCacheOpts) {
   882  	// Try opening the fileTypeTable first.
   883  	var f vfs.File
   884  	v.filename = base.MakeFilepath(dbOpts.fs, dbOpts.dirname, fileTypeTable, meta.FileNum)
   885  	f, v.err = dbOpts.fs.Open(v.filename, vfs.RandomReadsOption)
   886  	if v.err == nil {
   887  		cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, meta.FileNum).(sstable.ReaderOption)
   888  		reopenOpt := sstable.FileReopenOpt{FS: dbOpts.fs, Filename: v.filename}
   889  		v.reader, v.err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics, reopenOpt)
   890  	}
   891  	if v.err == nil {
   892  		if meta.SmallestSeqNum == meta.LargestSeqNum {
   893  			v.reader.Properties.GlobalSeqNum = meta.LargestSeqNum
   894  		}
   895  	}
   896  	if v.err != nil {
   897  		c.mu.Lock()
   898  		defer c.mu.Unlock()
   899  		// Lookup the node in the cache again as it might have already been
   900  		// removed.
   901  		key := tableCacheKey{dbOpts.cacheID, meta.FileNum}
   902  		n := c.mu.nodes[key]
   903  		if n != nil && n.value == v {
   904  			c.releaseNode(n)
   905  		}
   906  	}
   907  	close(v.loaded)
   908  }
   909  
   910  func (v *tableCacheValue) release(c *tableCacheShard) {
   911  	<-v.loaded
   912  	// Nothing to be done about an error at this point. Close the reader if it is
   913  	// open.
   914  	if v.reader != nil {
   915  		_ = v.reader.Close()
   916  	}
   917  	c.releasing.Done()
   918  }
   919  
   920  type tableCacheNodeType int8
   921  
   922  const (
   923  	tableCacheNodeTest tableCacheNodeType = iota
   924  	tableCacheNodeCold
   925  	tableCacheNodeHot
   926  )
   927  
   928  func (p tableCacheNodeType) String() string {
   929  	switch p {
   930  	case tableCacheNodeTest:
   931  		return "test"
   932  	case tableCacheNodeCold:
   933  		return "cold"
   934  	case tableCacheNodeHot:
   935  		return "hot"
   936  	}
   937  	return "unknown"
   938  }
   939  
   940  type tableCacheNode struct {
   941  	meta  *fileMetadata
   942  	value *tableCacheValue
   943  
   944  	links struct {
   945  		next *tableCacheNode
   946  		prev *tableCacheNode
   947  	}
   948  	ptype tableCacheNodeType
   949  	// referenced is atomically set to indicate that this entry has been accessed
   950  	// since the last time one of the clock hands swept it.
   951  	referenced int32
   952  
   953  	// Storing the cache id associated with the DB instance here
   954  	// avoids the need to thread the dbOpts struct through many functions.
   955  	cacheID uint64
   956  }
   957  
   958  func (n *tableCacheNode) next() *tableCacheNode {
   959  	if n == nil {
   960  		return nil
   961  	}
   962  	return n.links.next
   963  }
   964  
   965  func (n *tableCacheNode) prev() *tableCacheNode {
   966  	if n == nil {
   967  		return nil
   968  	}
   969  	return n.links.prev
   970  }
   971  
   972  func (n *tableCacheNode) link(s *tableCacheNode) {
   973  	s.links.prev = n.links.prev
   974  	s.links.prev.links.next = s
   975  	s.links.next = n
   976  	s.links.next.links.prev = s
   977  }
   978  
   979  func (n *tableCacheNode) unlink() *tableCacheNode {
   980  	next := n.links.next
   981  	n.links.prev.links.next = n.links.next
   982  	n.links.next.links.prev = n.links.prev
   983  	n.links.prev = n
   984  	n.links.next = n
   985  	return next
   986  }
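
// A small worked example of the ring helpers above: starting from a single
// node a (so a.links.next == a.links.prev == a),
//
//	a.link(b)       // inserts b immediately before a; the ring is a -> b -> a
//	n := b.unlink() // restores the single-node ring; n == a, b points to itself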