github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/mem_table.go

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"os"
    11  	"sync"
    12  	"sync/atomic"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/pebble/internal/arenaskl"
    16  	"github.com/cockroachdb/pebble/internal/base"
    17  	"github.com/cockroachdb/pebble/internal/keyspan"
    18  	"github.com/cockroachdb/pebble/internal/manual"
    19  	"github.com/cockroachdb/pebble/internal/rangedel"
    20  	"github.com/cockroachdb/pebble/internal/rangekey"
    21  )
    22  
    23  func memTableEntrySize(keyBytes, valueBytes int) uint64 {
    24  	return arenaskl.MaxNodeSize(uint32(keyBytes)+8, uint32(valueBytes))
    25  }
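
// exampleBatchReservation is an editorial sketch, not part of the upstream
// file. It shows how a batch's memtable footprint could be bounded by summing
// memTableEntrySize over hypothetical parallel key/value slices;
// Batch.memTableSize performs the equivalent accounting as records are added
// to a batch.
func exampleBatchReservation(keys, values [][]byte) uint64 {
	var reserve uint64
	for i := range keys {
		// Each record is charged the worst-case skiplist node footprint (the
		// extra 8 bytes cover the internal key trailer), so the total
		// over-estimates the space the records will actually consume.
		reserve += memTableEntrySize(len(keys[i]), len(values[i]))
	}
	return reserve
}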
    26  
    27  // memTableEmptySize is the amount of allocated space in the arena when the
    28  // memtable is empty.
    29  var memTableEmptySize = func() uint32 {
    30  	var pointSkl arenaskl.Skiplist
    31  	var rangeDelSkl arenaskl.Skiplist
    32  	var rangeKeySkl arenaskl.Skiplist
    33  	arena := arenaskl.NewArena(make([]byte, 16<<10 /* 16 KB */))
    34  	pointSkl.Reset(arena, bytes.Compare)
    35  	rangeDelSkl.Reset(arena, bytes.Compare)
    36  	rangeKeySkl.Reset(arena, bytes.Compare)
    37  	return arena.Size()
    38  }()
    39  
    40  // A memTable implements an in-memory layer of the LSM. A memTable is mutable,
    41  // but append-only. Records are added, but never removed. Deletion is supported
     42  // via tombstones, but it is up to higher-level code (see Iterator) to support
    43  // processing those tombstones.
    44  //
    45  // A memTable is implemented on top of a lock-free arena-backed skiplist. An
     46  // arena is a fixed-size contiguous chunk of memory (see
    47  // Options.MemTableSize). A memTable's memory consumption is thus fixed at the
    48  // time of creation (with the exception of the cached fragmented range
    49  // tombstones). The arena-backed skiplist provides both forward and reverse
     50  // links, which makes forward and reverse iteration equally fast.
    51  //
     52  // A batch is "applied" to a memTable in a two-step process: prepare(batch) ->
    53  // apply(batch). memTable.prepare() is not thread-safe and must be called with
    54  // external synchronization. Preparation reserves space in the memTable for the
    55  // batch. Note that we pessimistically compute how much space a batch will
    56  // consume in the memTable (see memTableEntrySize and
    57  // Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
    58  // the memTable can be performed concurrently with other apply
     59  // operations. Applying a batch is an O(N log M) operation where N is the number
    60  // of records in the batch and M is the number of records in the memtable. The
    61  // commitPipeline serializes batch preparation, and allows batch application to
    62  // proceed concurrently.
    63  //
    64  // It is safe to call get, apply, newIter, and newRangeDelIter concurrently.
    65  type memTable struct {
    66  	cmp         Compare
    67  	formatKey   base.FormatKey
    68  	equal       Equal
    69  	arenaBuf    []byte
    70  	skl         arenaskl.Skiplist
    71  	rangeDelSkl arenaskl.Skiplist
    72  	rangeKeySkl arenaskl.Skiplist
    73  	// reserved tracks the amount of space used by the memtable, both by actual
     74  	// data stored in the memtable and by inflight batch commit
    75  	// operations. This value is incremented pessimistically by prepare() in
    76  	// order to account for the space needed by a batch.
    77  	reserved uint32
    78  	// writerRefs tracks the write references on the memtable. The two sources of
     79  	// writer references are the memtable being on DB.mu.mem.queue and
     80  	// inflight mutations that have reserved space in the memtable but not yet
     81  	// applied. The memtable cannot be flushed to disk until the writer refs
     82  	// drop to zero.
    83  	writerRefs atomic.Int32
    84  	tombstones keySpanCache
    85  	rangeKeys  keySpanCache
    86  	// The current logSeqNum at the time the memtable was created. This is
    87  	// guaranteed to be less than or equal to any seqnum stored in the memtable.
    88  	logSeqNum                    uint64
    89  	releaseAccountingReservation func()
    90  }
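
// exampleCommitBatch is an editorial sketch, not part of the upstream file. It
// walks through the two-step protocol described above, assuming the caller
// provides the serialization around prepare that the commitPipeline normally
// supplies and that seqNum has already been allocated for the batch.
func exampleCommitBatch(m *memTable, b *Batch, seqNum uint64) error {
	// Reserve space pessimistically and acquire a writer ref. prepare must
	// not run concurrently with other prepare calls.
	if err := m.prepare(b); err != nil {
		// Typically arenaskl.ErrArenaFull: the caller would rotate to a new
		// mutable memtable and retry the batch there.
		return err
	}
	// apply may run concurrently with other apply operations.
	err := m.apply(b, seqNum)
	// Drop the ref taken by prepare. Once every other writer ref has also
	// been dropped, readyForFlush reports true.
	m.writerUnref()
	return err
}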
    91  
    92  func (m *memTable) free() {
    93  	if m != nil {
    94  		m.releaseAccountingReservation()
    95  		manual.Free(m.arenaBuf)
    96  		m.arenaBuf = nil
    97  	}
    98  }
    99  
    100  // memTableOptions holds configuration used when creating a memTable. All of
    101  // the fields are optional and will be filled with defaults if not specified,
    102  // which is convenient for tests.
   103  type memTableOptions struct {
   104  	*Options
   105  	arenaBuf                     []byte
   106  	size                         int
   107  	logSeqNum                    uint64
   108  	releaseAccountingReservation func()
   109  }
   110  
   111  func checkMemTable(obj interface{}) {
   112  	m := obj.(*memTable)
   113  	if m.arenaBuf != nil {
   114  		fmt.Fprintf(os.Stderr, "%p: memTable buffer was not freed\n", m.arenaBuf)
   115  		os.Exit(1)
   116  	}
   117  }
   118  
    119  // newMemTable returns a new memTable of the specified size. If size is zero,
   120  // Options.MemTableSize is used instead.
   121  func newMemTable(opts memTableOptions) *memTable {
   122  	opts.Options = opts.Options.EnsureDefaults()
   123  	m := new(memTable)
   124  	m.init(opts)
   125  	return m
   126  }
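
// exampleNewMemTable is an editorial sketch, not part of the upstream file. It
// constructs a small memTable the way a test might, leaning on the defaults
// filled in by EnsureDefaults and init; the size and seqnum are arbitrary.
func exampleNewMemTable() *memTable {
	return newMemTable(memTableOptions{
		size:                         64 << 10, // 64 KB arena instead of Options.MemTableSize
		logSeqNum:                    1,
		releaseAccountingReservation: func() {}, // no-op; free() calls this unconditionally
	})
}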
   127  
   128  func (m *memTable) init(opts memTableOptions) {
   129  	if opts.size == 0 {
   130  		opts.size = int(opts.MemTableSize)
   131  	}
   132  	*m = memTable{
   133  		cmp:                          opts.Comparer.Compare,
   134  		formatKey:                    opts.Comparer.FormatKey,
   135  		equal:                        opts.Comparer.Equal,
   136  		arenaBuf:                     opts.arenaBuf,
   137  		logSeqNum:                    opts.logSeqNum,
   138  		releaseAccountingReservation: opts.releaseAccountingReservation,
   139  	}
   140  	m.writerRefs.Store(1)
   141  	m.tombstones = keySpanCache{
   142  		cmp:           m.cmp,
   143  		formatKey:     m.formatKey,
   144  		skl:           &m.rangeDelSkl,
   145  		constructSpan: rangeDelConstructSpan,
   146  	}
   147  	m.rangeKeys = keySpanCache{
   148  		cmp:           m.cmp,
   149  		formatKey:     m.formatKey,
   150  		skl:           &m.rangeKeySkl,
   151  		constructSpan: rangekey.Decode,
   152  	}
   153  
   154  	if m.arenaBuf == nil {
   155  		m.arenaBuf = make([]byte, opts.size)
   156  	}
   157  
   158  	arena := arenaskl.NewArena(m.arenaBuf)
   159  	m.skl.Reset(arena, m.cmp)
   160  	m.rangeDelSkl.Reset(arena, m.cmp)
   161  	m.rangeKeySkl.Reset(arena, m.cmp)
   162  	m.reserved = arena.Size()
   163  }
   164  
   165  func (m *memTable) writerRef() {
   166  	switch v := m.writerRefs.Add(1); {
   167  	case v <= 1:
   168  		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
   169  	}
   170  }
   171  
   172  // writerUnref drops a ref on the memtable. Returns true if this was the last ref.
   173  func (m *memTable) writerUnref() (wasLastRef bool) {
   174  	switch v := m.writerRefs.Add(-1); {
   175  	case v < 0:
   176  		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
   177  	case v == 0:
   178  		return true
   179  	default:
   180  		return false
   181  	}
   182  }
   183  
   184  // readyForFlush is part of the flushable interface.
   185  func (m *memTable) readyForFlush() bool {
   186  	return m.writerRefs.Load() == 0
   187  }
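
// exampleFlushReadiness is an editorial sketch, not part of the upstream file.
// writerRefs starts at 1 in init and prepare adds a ref per in-flight batch;
// readyForFlush reports true only once every reference has been dropped via
// writerUnref.
func exampleFlushReadiness(m *memTable) bool {
	if wasLast := m.writerUnref(); wasLast {
		// No writer references remain: the contents are effectively immutable
		// and the memtable can be flushed.
		return m.readyForFlush() // true
	}
	return false
}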
   188  
    189  // prepare reserves space for the batch in the memtable and references the
    190  // memtable, preventing it from being flushed until the batch is applied. Note
   191  // that prepare is not thread-safe, while apply is. The caller must call
   192  // writerUnref() after the batch has been applied.
   193  func (m *memTable) prepare(batch *Batch) error {
   194  	avail := m.availBytes()
   195  	if batch.memTableSize > uint64(avail) {
   196  		return arenaskl.ErrArenaFull
   197  	}
   198  	m.reserved += uint32(batch.memTableSize)
   199  
   200  	m.writerRef()
   201  	return nil
   202  }
   203  
   204  func (m *memTable) apply(batch *Batch, seqNum uint64) error {
   205  	if seqNum < m.logSeqNum {
   206  		return base.CorruptionErrorf("pebble: batch seqnum %d is less than memtable creation seqnum %d",
   207  			errors.Safe(seqNum), errors.Safe(m.logSeqNum))
   208  	}
   209  
   210  	var ins arenaskl.Inserter
   211  	var tombstoneCount, rangeKeyCount uint32
   212  	startSeqNum := seqNum
   213  	for r := batch.Reader(); ; seqNum++ {
   214  		kind, ukey, value, ok, err := r.Next()
   215  		if !ok {
   216  			if err != nil {
   217  				return err
   218  			}
   219  			break
   220  		}
   221  		ikey := base.MakeInternalKey(ukey, seqNum, kind)
   222  		switch kind {
   223  		case InternalKeyKindRangeDelete:
   224  			err = m.rangeDelSkl.Add(ikey, value)
   225  			tombstoneCount++
   226  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   227  			err = m.rangeKeySkl.Add(ikey, value)
   228  			rangeKeyCount++
   229  		case InternalKeyKindLogData:
   230  			// Don't increment seqNum for LogData, since these are not applied
   231  			// to the memtable.
   232  			seqNum--
   233  		case InternalKeyKindIngestSST:
   234  			panic("pebble: cannot apply ingested sstable key kind to memtable")
   235  		default:
   236  			err = ins.Add(&m.skl, ikey, value)
   237  		}
   238  		if err != nil {
   239  			return err
   240  		}
   241  	}
   242  	if seqNum != startSeqNum+uint64(batch.Count()) {
   243  		return base.CorruptionErrorf("pebble: inconsistent batch count: %d vs %d",
   244  			errors.Safe(seqNum), errors.Safe(startSeqNum+uint64(batch.Count())))
   245  	}
   246  	if tombstoneCount != 0 {
   247  		m.tombstones.invalidate(tombstoneCount)
   248  	}
   249  	if rangeKeyCount != 0 {
   250  		m.rangeKeys.invalidate(rangeKeyCount)
   251  	}
   252  	return nil
   253  }
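
// exampleApplyKinds is an editorial sketch, not part of the upstream file. It
// illustrates how apply routes records by kind and assigns sequence numbers:
// point writes go to the main skiplist, range deletions go to rangeDelSkl and
// invalidate the tombstone cache, and LogData records are skipped without
// consuming a seqnum. It assumes seqNum is at or above the memtable's
// logSeqNum and that building records on a zero-value Batch is acceptable for
// the caller.
func exampleApplyKinds(m *memTable, seqNum uint64) error {
	var b Batch
	_ = b.Set([]byte("a"), []byte("v"), nil)         // applied at seqNum
	_ = b.LogData([]byte("wal-only payload"), nil)   // WAL-only; no seqnum consumed
	_ = b.DeleteRange([]byte("b"), []byte("d"), nil) // applied at seqNum+1
	if err := m.prepare(&b); err != nil {
		return err
	}
	defer m.writerUnref()
	return m.apply(&b, seqNum)
}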
   254  
   255  // newIter is part of the flushable interface. It returns an iterator that is
   256  // unpositioned (Iterator.Valid() will return false). The iterator can be
   257  // positioned via a call to SeekGE, SeekLT, First or Last.
   258  func (m *memTable) newIter(o *IterOptions) internalIterator {
   259  	return m.skl.NewIter(o.GetLowerBound(), o.GetUpperBound())
   260  }
   261  
   262  // newFlushIter is part of the flushable interface.
   263  func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
   264  	return m.skl.NewFlushIter(bytesFlushed)
   265  }
   266  
   267  // newRangeDelIter is part of the flushable interface.
   268  func (m *memTable) newRangeDelIter(*IterOptions) keyspan.FragmentIterator {
   269  	tombstones := m.tombstones.get()
   270  	if tombstones == nil {
   271  		return nil
   272  	}
   273  	return keyspan.NewIter(m.cmp, tombstones)
   274  }
   275  
   276  // newRangeKeyIter is part of the flushable interface.
   277  func (m *memTable) newRangeKeyIter(*IterOptions) keyspan.FragmentIterator {
   278  	rangeKeys := m.rangeKeys.get()
   279  	if rangeKeys == nil {
   280  		return nil
   281  	}
   282  	return keyspan.NewIter(m.cmp, rangeKeys)
   283  }
   284  
   285  // containsRangeKeys is part of the flushable interface.
   286  func (m *memTable) containsRangeKeys() bool {
   287  	return m.rangeKeys.count.Load() > 0
   288  }
   289  
   290  func (m *memTable) availBytes() uint32 {
   291  	a := m.skl.Arena()
   292  	if m.writerRefs.Load() == 1 {
   293  		// If there are no other concurrent apply operations, we can update the
    294  		// reserved bytes setting to accurately reflect how many bytes have been
   295  		// allocated vs the over-estimation present in memTableEntrySize.
   296  		m.reserved = a.Size()
   297  	}
   298  	return a.Capacity() - m.reserved
   299  }
   300  
   301  // inuseBytes is part of the flushable interface.
   302  func (m *memTable) inuseBytes() uint64 {
   303  	return uint64(m.skl.Size() - memTableEmptySize)
   304  }
   305  
   306  // totalBytes is part of the flushable interface.
   307  func (m *memTable) totalBytes() uint64 {
   308  	return uint64(m.skl.Arena().Capacity())
   309  }
   310  
    311  // empty returns whether the memTable has no key/value pairs.
   312  func (m *memTable) empty() bool {
   313  	return m.skl.Size() == memTableEmptySize
   314  }
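
// exampleMemTableUtilization is an editorial sketch, not part of the upstream
// file, relating the accounting helpers above: totalBytes is the fixed arena
// capacity, inuseBytes is the skiplist allocation beyond the empty-memtable
// baseline, and availBytes (used by prepare) is the capacity minus the
// pessimistically reserved bytes.
func exampleMemTableUtilization(m *memTable) float64 {
	if m.empty() {
		return 0
	}
	return float64(m.inuseBytes()) / float64(m.totalBytes())
}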
   315  
   316  // A keySpanFrags holds a set of fragmented keyspan.Spans with a particular key
   317  // kind at a particular moment for a memtable.
   318  //
   319  // When a new span of a particular kind is added to the memtable, it may overlap
   320  // with other spans of the same kind. Instead of performing the fragmentation
   321  // whenever an iterator requires it, fragments are cached within a keySpanCache
   322  // type. The keySpanCache uses keySpanFrags to hold the cached fragmented spans.
   323  //
   324  // The count of keys (and keys of any given kind) in a memtable only
   325  // monotonically increases. The count of key spans of a particular kind is used
   326  // as a stand-in for a 'sequence number'. A keySpanFrags represents the
    327  // fragmented state of the memtable's keys of a given kind at the moment when
    328  // there were `count` keys of that kind in the memtable.
    329  //
    330  // It is used to hold fragmented range deletion tombstones and range keys.
   331  type keySpanFrags struct {
   332  	count uint32
   333  	once  sync.Once
   334  	spans []keyspan.Span
   335  }
   336  
   337  type constructSpan func(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error)
   338  
   339  func rangeDelConstructSpan(
   340  	ik base.InternalKey, v []byte, keysDst []keyspan.Key,
   341  ) (keyspan.Span, error) {
   342  	return rangedel.Decode(ik, v, keysDst), nil
   343  }
   344  
   345  // get retrieves the fragmented spans, populating them if necessary. Note that
   346  // the populated span fragments may be built from more than f.count memTable
   347  // spans, but that is ok for correctness. All we're requiring is that the
   348  // memTable contains at least f.count keys of the configured kind. This
   349  // situation can occur if there are multiple concurrent additions of the key
   350  // kind and a concurrent reader. The reader can load a keySpanFrags and populate
    351  // it even though it has been invalidated (i.e. replaced with a newer
   352  // keySpanFrags).
   353  func (f *keySpanFrags) get(
   354  	skl *arenaskl.Skiplist, cmp Compare, formatKey base.FormatKey, constructSpan constructSpan,
   355  ) []keyspan.Span {
   356  	f.once.Do(func() {
   357  		frag := &keyspan.Fragmenter{
   358  			Cmp:    cmp,
   359  			Format: formatKey,
   360  			Emit: func(fragmented keyspan.Span) {
   361  				f.spans = append(f.spans, fragmented)
   362  			},
   363  		}
   364  		it := skl.NewIter(nil, nil)
   365  		var keysDst []keyspan.Key
   366  		for key, val := it.First(); key != nil; key, val = it.Next() {
   367  			s, err := constructSpan(*key, val.InPlaceValue(), keysDst)
   368  			if err != nil {
   369  				panic(err)
   370  			}
   371  			frag.Add(s)
   372  			keysDst = s.Keys[len(s.Keys):]
   373  		}
   374  		frag.Finish()
   375  	})
   376  	return f.spans
   377  }
   378  
   379  // A keySpanCache is used to cache a set of fragmented spans. The cache is
   380  // invalidated whenever a key of the same kind is added to a memTable, and
    381  // populated, if empty, when a span iterator of that key kind is created.
   382  type keySpanCache struct {
   383  	count         atomic.Uint32
   384  	frags         atomic.Pointer[keySpanFrags]
   385  	cmp           Compare
   386  	formatKey     base.FormatKey
   387  	constructSpan constructSpan
   388  	skl           *arenaskl.Skiplist
   389  }
   390  
    391  // invalidate the current set of cached spans, indicating the number of
   392  // spans that were added.
   393  func (c *keySpanCache) invalidate(count uint32) {
   394  	newCount := c.count.Add(count)
   395  	var frags *keySpanFrags
   396  
   397  	for {
   398  		oldFrags := c.frags.Load()
   399  		if oldFrags != nil && oldFrags.count >= newCount {
   400  			// Someone else invalidated the cache before us and their invalidation
   401  			// subsumes ours.
   402  			break
   403  		}
   404  		if frags == nil {
   405  			frags = &keySpanFrags{count: newCount}
   406  		}
   407  		if c.frags.CompareAndSwap(oldFrags, frags) {
   408  			// We successfully invalidated the cache.
   409  			break
   410  		}
   411  		// Someone else invalidated the cache. Loop and try again.
   412  	}
   413  }
   414  
   415  func (c *keySpanCache) get() []keyspan.Span {
   416  	frags := c.frags.Load()
   417  	if frags == nil {
   418  		return nil
   419  	}
   420  	return frags.get(c.skl, c.cmp, c.formatKey, c.constructSpan)
   421  }
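
// exampleCachedRangeDels is an editorial sketch, not part of the upstream
// file. Writers invalidate the cache via keySpanCache.invalidate whenever
// apply adds range tombstones; the first subsequent reader pays to fragment
// them in keySpanFrags.get (guarded by sync.Once), and later readers reuse
// the cached result until the next invalidation.
func exampleCachedRangeDels(m *memTable) []keyspan.Span {
	// get returns nil until at least one range deletion has been applied.
	// Overlapping tombstones over [a,e) and [c,g), for example, come back as
	// the non-overlapping fragments [a,c), [c,e), and [e,g).
	return m.tombstones.get()
}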