github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/table_cache.go

// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"errors"
	"fmt"
	"runtime"
	"runtime/debug"
	"sync"
	"sync/atomic"

	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/sstable"
	"github.com/petermattis/pebble/vfs"
)

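// defaultTableCacheHitBuffer is the default capacity of the thread-local
// buffers used to batch LRU hit updates before they are applied to a shard
// (see tableCacheHits).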
const defaultTableCacheHitBuffer = 64

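// emptyIter is returned by newIters when a table is excluded by a
// TableFilter. It has no mutable state, so a single instance is shared.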
var emptyIter = &errorIter{err: nil}

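// tableCache caches open sstable readers. It is split into one shard per CPU
// to reduce mutex contention, with each table assigned to a shard by file
// number.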
type tableCache struct {
	shards []tableCacheShard
}

func (c *tableCache) init(
	dbNum uint64, dirname string, fs vfs.FS, opts *Options, size, hitBuffer int,
) {
	c.shards = make([]tableCacheShard, runtime.NumCPU())
	for i := range c.shards {
		c.shards[i].init(dbNum, dirname, fs, opts, size/len(c.shards), hitBuffer)
	}
}

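// getShard returns the shard responsible for the specified file number.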
func (c *tableCache) getShard(fileNum uint64) *tableCacheShard {
	return &c.shards[fileNum%uint64(len(c.shards))]
}

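// newIters returns point and range-del iterators for the specified table,
// delegating to the shard that owns the file.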
func (c *tableCache) newIters(
	meta *fileMetadata, opts *IterOptions, bytesIterated *uint64,
) (internalIterator, internalIterator, error) {
	return c.getShard(meta.FileNum).newIters(meta, opts, bytesIterated)
}

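// evict removes the specified file from its shard and evicts the file's
// blocks from the block cache.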
func (c *tableCache) evict(fileNum uint64) {
	c.getShard(fileNum).evict(fileNum)
}

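// Close closes every shard, returning the first error encountered.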
func (c *tableCache) Close() error {
	for i := range c.shards {
		err := c.shards[i].Close()
		if err != nil {
			return err
		}
	}
	return nil
}

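// tableCacheShard caches open sstable readers for a subset of the tables in
// the DB, bounded by size. Nodes are kept on an LRU list; hits are batched
// through hitsPool to amortize acquisition of the exclusive lock.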
type tableCacheShard struct {
	dbNum   uint64
	dirname string
	fs      vfs.FS
	opts    *Options
	size    int

	mu struct {
		sync.RWMutex
		nodes map[uint64]*tableCacheNode
		// The iters map is only created and populated in race builds.
		iters map[sstable.Iterator][]byte
		lru   tableCacheNode
	}

	iterCount int32
	releasing sync.WaitGroup
	hitsPool  *sync.Pool
}

func (c *tableCacheShard) init(
	dbNum uint64, dirname string, fs vfs.FS, opts *Options, size, hitBuffer int,
) {
	c.dbNum = dbNum
	c.dirname = dirname
	c.fs = fs
	c.opts = opts
	c.size = size
	c.mu.nodes = make(map[uint64]*tableCacheNode)
	c.mu.lru.next = &c.mu.lru
	c.mu.lru.prev = &c.mu.lru
	c.hitsPool = &sync.Pool{
		New: func() interface{} {
			return &tableCacheHits{
				hits:  make([]*tableCacheNode, 0, hitBuffer),
				shard: c,
			}
		},
	}

	if raceEnabled {
		c.mu.iters = make(map[sstable.Iterator][]byte)
	}
}

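// newIters returns a point iterator and a range-del iterator for the
// specified table, loading the table into the cache if it is not already
// resident. The point iterator holds a reference to the cached node which is
// released when the iterator is closed.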
func (c *tableCacheShard) newIters(
	meta *fileMetadata, opts *IterOptions, bytesIterated *uint64,
) (internalIterator, internalIterator, error) {
	// Calling findNode gives us the responsibility of decrementing n's
	// refCount. If opening the underlying table resulted in an error, we
	// decrement the refCount straight away. Otherwise, we pass that
	// responsibility to the sstable iterator, which decrements it when the
	// iterator is closed.
	n := c.findNode(meta)
	<-n.loaded
	if n.err != nil {
		c.unrefNode(n)
		return nil, nil, n.err
	}

	if opts != nil &&
		opts.TableFilter != nil &&
		!opts.TableFilter(n.reader.Properties.UserProperties) {
		// Return the empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		return emptyIter, nil, nil
	}
	var iter sstable.Iterator
	if bytesIterated != nil {
		iter = n.reader.NewCompactionIter(bytesIterated)
	} else {
		iter = n.reader.NewIter(opts.GetLowerBound(), opts.GetUpperBound())
	}
	atomic.AddInt32(&c.iterCount, 1)
	if raceEnabled {
		c.mu.Lock()
		c.mu.iters[iter] = debug.Stack()
		c.mu.Unlock()
	}
	iter.SetCloseHook(n.closeHook)

	// NB: range-del iterator does not maintain a reference to the table, nor
	// does it need to read from it after creation.
	if rangeDelIter := n.reader.NewRangeDelIter(); rangeDelIter != nil {
		return iter, rangeDelIter, nil
	}
	// NB: Translate a nil range-del iterator into a nil interface.
	return iter, nil, nil
}

// releaseNode releases a node from the tableCacheShard.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
	delete(c.mu.nodes, n.meta.FileNum)
	n.next.prev = n.prev
	n.prev.next = n.next
	n.prev = nil
	n.next = nil
	c.unrefNode(n)
}

// unrefNode decrements the reference count for the specified node, releasing
// it if the reference count falls to 0. Note that the node has a reference if
// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
// the node has already been removed from that map.
func (c *tableCacheShard) unrefNode(n *tableCacheNode) {
	if atomic.AddInt32(&n.refCount, -1) == 0 {
		c.releasing.Add(1)
		go n.release(c)
	}
}

// findNode returns the node for the table with the given file number, creating
// that node if it didn't already exist. The caller is responsible for
// decrementing the returned node's refCount.
func (c *tableCacheShard) findNode(meta *fileMetadata) *tableCacheNode {
	// Fast-path for a hit in the cache. We grab the lock in shared mode, and use
	// a batching mechanism to perform updates to the LRU list.
	c.mu.RLock()
	if n := c.mu.nodes[meta.FileNum]; n != nil {
		// The caller is responsible for decrementing the refCount.
		atomic.AddInt32(&n.refCount, 1)
		c.mu.RUnlock()

		// Record a hit for the node. This has to be done with tableCacheShard.mu
		// unlocked as it might result in a call to
		// tableCacheShard.recordHits. Note that the sync.Pool acts as a
		// thread-local cache of the accesses. This is lossy (a GC can result in
		// the sync.Pool being cleared), but that is ok as we don't need perfect
		// accuracy for the LRU list.
		hits := c.hitsPool.Get().(*tableCacheHits)
		hits.recordHit(n)
		c.hitsPool.Put(hits)
		return n
	}
	c.mu.RUnlock()

	c.mu.Lock()
	defer c.mu.Unlock()

	{
		// Flush the thread-local hits buffer as we already have the shard locked
		// exclusively.
		hits := c.hitsPool.Get().(*tableCacheHits)
		hits.flushLocked()
		c.hitsPool.Put(hits)
	}

	n := c.mu.nodes[meta.FileNum]
	if n == nil {
		n = &tableCacheNode{
			// Cache the closure invoked when an iterator is closed. This avoids an
			// allocation on every call to newIters.
			closeHook: func(i sstable.Iterator) error {
				if raceEnabled {
					c.mu.Lock()
					delete(c.mu.iters, i)
					c.mu.Unlock()
				}
				c.unrefNode(n)
				atomic.AddInt32(&c.iterCount, -1)
				return nil
			},
			meta:     meta,
			refCount: 1,
			loaded:   make(chan struct{}),
		}
		c.mu.nodes[meta.FileNum] = n
		if len(c.mu.nodes) > c.size {
			// Release the tail node.
			c.releaseNode(c.mu.lru.prev)
		}
		go n.load(c)
	} else {
		// Remove n from the doubly-linked list.
		n.next.prev = n.prev
		n.prev.next = n.next
	}
	// Insert n at the front of the doubly-linked list.
	n.next = c.mu.lru.next
	n.prev = &c.mu.lru
	n.next.prev = n
	n.prev.next = n
	// The caller is responsible for decrementing the refCount.
	atomic.AddInt32(&n.refCount, 1)
	return n
}

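// evict releases the node for the specified file, if it is resident, and
// evicts the file's blocks from the block cache.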
func (c *tableCacheShard) evict(fileNum uint64) {
	c.mu.Lock()
	if n := c.mu.nodes[fileNum]; n != nil {
		c.releaseNode(n)
	}
	c.mu.Unlock()

	c.opts.Cache.EvictFile(c.dbNum, fileNum)
}

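// recordHits acquires the exclusive lock and moves the specified nodes to the
// front of the LRU list.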
func (c *tableCacheShard) recordHits(hits []*tableCacheNode) {
	c.mu.Lock()
	c.recordHitsLocked(hits)
	c.mu.Unlock()
}

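// recordHitsLocked moves the specified nodes to the front of the LRU list,
// skipping any node that is no longer on the list.
//
// c.mu must be held when calling this.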
func (c *tableCacheShard) recordHitsLocked(hits []*tableCacheNode) {
	for _, n := range hits {
		if n.next == nil || n.prev == nil {
			// The node is no longer on the LRU list.
			continue
		}
		// Remove n from the doubly-linked list.
		n.next.prev = n.prev
		n.prev.next = n.next
		// Insert n at the front of the doubly-linked list.
		n.next = c.mu.lru.next
		n.prev = &c.mu.lru
		n.next.prev = n
		n.prev.next = n
	}
}

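// Close releases all cached tables and waits for any in-flight releases to
// finish. It returns an error if any iterators created by the shard are still
// open.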
func (c *tableCacheShard) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if v := atomic.LoadInt32(&c.iterCount); v > 0 {
		if !raceEnabled {
			return fmt.Errorf("leaked iterators: %d", v)
		}
		var buf bytes.Buffer
		fmt.Fprintf(&buf, "leaked iterators: %d\n", v)
		for _, stack := range c.mu.iters {
			fmt.Fprintf(&buf, "%s\n", stack)
		}
		return errors.New(buf.String())
	}

	for n := c.mu.lru.next; n != &c.mu.lru; n = n.next {
		if atomic.AddInt32(&n.refCount, -1) == 0 {
			c.releasing.Add(1)
			go n.release(c)
		}
	}
	c.mu.nodes = nil
	c.mu.lru.next = nil
	c.mu.lru.prev = nil

	c.releasing.Wait()
	return nil
}

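// tableCacheNode is an entry in a tableCacheShard: an sstable reader together
// with the bookkeeping needed for reference counting and LRU placement. The
// loaded channel is closed once the reader has been opened (or has failed to
// open).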
type tableCacheNode struct {
	closeHook func(i sstable.Iterator) error

	meta   *fileMetadata
	reader *sstable.Reader
	err    error
	loaded chan struct{}

	// next and prev are protected by the tableCacheShard mutex; refCount is
	// manipulated atomically.

	next, prev *tableCacheNode
	refCount   int32
}

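// load opens the table file and constructs an sstable reader for it, closing
// the loaded channel once the result (reader or error) is available.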
func (n *tableCacheNode) load(c *tableCacheShard) {
	// Open the table file (fileTypeTable) for random reads.
	f, err := c.fs.Open(base.MakeFilename(c.dirname, fileTypeTable, n.meta.FileNum),
		vfs.RandomReadsOption)
	if err != nil {
		n.err = err
		close(n.loaded)
		return
	}
	n.reader, n.err = sstable.NewReader(f, c.dbNum, n.meta.FileNum, c.opts)
	if n.err == nil && n.meta.SmallestSeqNum == n.meta.LargestSeqNum {
		n.reader.Properties.GlobalSeqNum = n.meta.LargestSeqNum
	}
	close(n.loaded)
}

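// release closes the node's reader, if one was opened, once loading has
// finished, and signals the shard's releasing wait group.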
func (n *tableCacheNode) release(c *tableCacheShard) {
	<-n.loaded
	// Nothing to be done about an error at this point. Close the reader if it is
	// open.
	if n.reader != nil {
		_ = n.reader.Close()
	}
	c.releasing.Done()
}

// tableCacheHits batches a set of node accesses in order to amortize exclusive
// lock acquisition.
type tableCacheHits struct {
	hits  []*tableCacheNode
	shard *tableCacheShard
}

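// recordHit appends a node to the local hit buffer, flushing the buffer to
// the shard once it is full.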
func (f *tableCacheHits) recordHit(n *tableCacheNode) {
	f.hits = append(f.hits, n)
	if len(f.hits) == cap(f.hits) {
		f.shard.recordHits(f.hits)
		f.hits = f.hits[:0]
	}
}

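// flushLocked applies any buffered hits while the shard's mutex is already
// held exclusively.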
func (f *tableCacheHits) flushLocked() {
	f.shard.recordHitsLocked(f.hits)
	f.hits = f.hits[:0]
}