github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/mem_table.go

     1  // Copyright 2011 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"os"
    11  	"sync"
    12  	"sync/atomic"
    13  	"unsafe"
    14  
    15  	"github.com/cockroachdb/errors"
    16  	"github.com/zuoyebang/bitalostable/internal/arenaskl"
    17  	"github.com/zuoyebang/bitalostable/internal/base"
    18  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    19  	"github.com/zuoyebang/bitalostable/internal/rangedel"
    20  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    21  )
    22  
    23  func memTableEntrySize(keyBytes, valueBytes int) uint64 {
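	// The +8 accounts for the 8-byte internal key trailer (seqnum and kind)
	// appended to every user key stored in the skiplist.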
    24  	return arenaskl.MaxNodeSize(uint32(keyBytes)+8, uint32(valueBytes))
    25  }
    26  
    27  // memTableEmptySize is the amount of allocated space in the arena when the
    28  // memtable is empty.
    29  var memTableEmptySize = func() uint32 {
    30  	var pointSkl arenaskl.Skiplist
    31  	var rangeDelSkl arenaskl.Skiplist
    32  	var rangeKeySkl arenaskl.Skiplist
    33  	arena := arenaskl.NewArena(make([]byte, 16<<10 /* 16 KB */))
    34  	pointSkl.Reset(arena, bytes.Compare)
    35  	rangeDelSkl.Reset(arena, bytes.Compare)
    36  	rangeKeySkl.Reset(arena, bytes.Compare)
    37  	return arena.Size()
    38  }()
    39  
    40  // A memTable implements an in-memory layer of the LSM. A memTable is mutable,
    41  // but append-only. Records are added, but never removed. Deletion is supported
    42  // via tombstones, but it is up to higher level code (see Iterator) to support
    43  // processing those tombstones.
    44  //
    45  // A memTable is implemented on top of a lock-free arena-backed skiplist. An
    46  // arena is a fixed size contiguous chunk of memory (see
    47  // Options.MemTableSize). A memTable's memory consumption is thus fixed at the
    48  // time of creation (with the exception of the cached fragmented range
    49  // tombstones). The arena-backed skiplist provides both forward and reverse
    50  // links which makes forward and reverse iteration the same speed.
    51  //
    52  // A batch is "applied" to a memTable in a two step process: prepare(batch) ->
    53  // apply(batch). memTable.prepare() is not thread-safe and must be called with
    54  // external synchronization. Preparation reserves space in the memTable for the
    55  // batch. Note that we pessimistically compute how much space a batch will
    56  // consume in the memTable (see memTableEntrySize and
    57  // Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
    58  // the memTable can be performed concurrently with other apply
    59  // operations. Applying a batch is an O(N log M) operation where N is the number
    60  // of records in the batch and M is the number of records in the memtable. The
    61  // commitPipeline serializes batch preparation, and allows batch application to
    62  // proceed concurrently.
    63  //
    64  // It is safe to call apply, newIter, newRangeDelIter, and newRangeKeyIter concurrently.
    65  type memTable struct {
    66  	cmp         Compare
    67  	formatKey   base.FormatKey
    68  	equal       Equal
    69  	arenaBuf    []byte
    70  	skl         arenaskl.Skiplist
    71  	rangeDelSkl arenaskl.Skiplist
    72  	rangeKeySkl arenaskl.Skiplist
    73  	// reserved tracks the amount of space used by the memtable, both by actual
    74  	// data stored in the memtable as well as inflight batch commit
    75  	// operations. This value is incremented pessimistically by prepare() in
    76  	// order to account for the space needed by a batch.
    77  	reserved uint32
    78  	// writerRefs tracks the write references on the memtable. The two sources of
    79  // writer references are the memtable being on DB.mu.mem.queue and
    80  	// inflight mutations that have reserved space in the memtable but not yet
    81  	// applied. The memtable cannot be flushed to disk until the writer refs
    82  	// drops to zero.
    83  	writerRefs int32
    84  	tombstones keySpanCache
    85  	rangeKeys  keySpanCache
    86  	// The current logSeqNum at the time the memtable was created. This is
    87  	// guaranteed to be less than or equal to any seqnum stored in the memtable.
    88  	logSeqNum uint64
    89  }
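
// The function below is a hedged, illustrative sketch (not part of this
// package's API) of the two-step commit flow described above, assuming the
// caller provides the synchronization that the commitPipeline normally
// supplies. The batch and seqNum arguments are placeholders.
func exampleCommitFlow(m *memTable, b *Batch, seqNum uint64) error {
	// Step 1: reserve space. prepare is O(1), is not thread-safe, and fails
	// with arenaskl.ErrArenaFull when the pessimistic estimate of the batch's
	// size does not fit, signalling that the memtable should be rotated.
	if err := m.prepare(b); err != nil {
		return err
	}
	// Step 2: apply may run concurrently with other apply operations.
	if err := m.apply(b, seqNum); err != nil {
		return err
	}
	// Release the writer reference taken by prepare. Once all writer refs are
	// dropped (including the one held while the memtable sits on
	// DB.mu.mem.queue), readyForFlush reports true.
	_ = m.writerUnref()
	return nil
}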
    90  
    91  // memTableOptions holds configuration used when creating a memTable. All of
    92  // the fields are optional and, if left unspecified, are filled in with
    93  // defaults; this is primarily a convenience for tests.
    94  type memTableOptions struct {
    95  	*Options
    96  	arenaBuf  []byte
    97  	size      int
    98  	logSeqNum uint64
    99  }
   100  
   101  func checkMemTable(obj interface{}) {
   102  	m := obj.(*memTable)
   103  	if m.arenaBuf != nil {
   104  		fmt.Fprintf(os.Stderr, "%p: memTable buffer was not freed\n", m.arenaBuf)
   105  		os.Exit(1)
   106  	}
   107  }
   108  
   109  // newMemTable returns a new MemTable of the specified size. If size is zero,
   110  // Options.MemTableSize is used instead.
   111  func newMemTable(opts memTableOptions) *memTable {
   112  	opts.Options = opts.Options.EnsureDefaults()
   113  	if opts.size == 0 {
   114  		opts.size = opts.MemTableSize
   115  	}
   116  
   117  	m := &memTable{
   118  		cmp:        opts.Comparer.Compare,
   119  		formatKey:  opts.Comparer.FormatKey,
   120  		equal:      opts.Comparer.Equal,
   121  		arenaBuf:   opts.arenaBuf,
   122  		writerRefs: 1,
   123  		logSeqNum:  opts.logSeqNum,
   124  	}
   125  	m.tombstones = keySpanCache{
   126  		cmp:           m.cmp,
   127  		formatKey:     m.formatKey,
   128  		skl:           &m.rangeDelSkl,
   129  		constructSpan: rangeDelConstructSpan,
   130  	}
   131  	m.rangeKeys = keySpanCache{
   132  		cmp:           m.cmp,
   133  		formatKey:     m.formatKey,
   134  		skl:           &m.rangeKeySkl,
   135  		constructSpan: rangekey.Decode,
   136  	}
   137  
   138  	if m.arenaBuf == nil {
   139  		m.arenaBuf = make([]byte, opts.size)
   140  	}
   141  
   142  	arena := arenaskl.NewArena(m.arenaBuf)
   143  	m.skl.Reset(arena, m.cmp)
   144  	m.rangeDelSkl.Reset(arena, m.cmp)
   145  	m.rangeKeySkl.Reset(arena, m.cmp)
   146  	return m
   147  }
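
// A minimal construction sketch of the kind a test might use: every field of
// memTableOptions is optional, so EnsureDefaults fills in the Options and a
// zero size falls back to Options.MemTableSize. The 64 KB arena here is
// purely illustrative.
func exampleNewMemTable() *memTable {
	return newMemTable(memTableOptions{
		Options: &Options{}, // defaults are filled in by EnsureDefaults
		size:    64 << 10,   // explicit arena size; 0 would use MemTableSize
	})
}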
   148  
   149  func (m *memTable) release() {
   150  }
   151  
   152  func (m *memTable) writerRef() {
   153  	switch v := atomic.AddInt32(&m.writerRefs, 1); {
   154  	case v <= 1:
   155  		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
   156  	}
   157  }
   158  
   159  func (m *memTable) writerUnref() bool {
   160  	switch v := atomic.AddInt32(&m.writerRefs, -1); {
   161  	case v < 0:
   162  		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
   163  	case v == 0:
   164  		return true
   165  	default:
   166  		return false
   167  	}
   168  }
   169  
   170  func (m *memTable) readyForFlush() bool {
   171  	return atomic.LoadInt32(&m.writerRefs) == 0
   172  }
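
// Hedged sketch of the writer-reference lifecycle documented on the struct:
// the memtable is created with one reference (held while it is on
// DB.mu.mem.queue), each prepared-but-unapplied batch holds another, and the
// memtable is only eligible for flushing once every reference is released.
func exampleFlushGate(m *memTable) bool {
	m.writerRef()       // e.g. taken by prepare for an in-flight batch
	_ = m.writerUnref() // released once that batch has been applied
	// Still false here if the queue reference has not been dropped.
	return m.readyForFlush()
}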
   173  
   174  // Prepare reserves space for the batch in the memtable and references the
   175  // memtable preventing it from being flushed until the batch is applied. Note
   176  // that prepare is not thread-safe, while apply is. The caller must call
   177  // writerUnref() after the batch has been applied.
   178  func (m *memTable) prepare(batch *Batch) error {
   179  	avail := m.availBytes()
   180  	if batch.memTableSize > uint64(avail) {
   181  		return arenaskl.ErrArenaFull
   182  	}
   183  	m.reserved += uint32(batch.memTableSize)
   184  
   185  	m.writerRef()
   186  	return nil
   187  }
   188  
   189  func (m *memTable) apply(batch *Batch, seqNum uint64) error {
   190  	if seqNum < m.logSeqNum {
   191  		return base.CorruptionErrorf("bitalostable: batch seqnum %d is less than memtable creation seqnum %d",
   192  			errors.Safe(seqNum), errors.Safe(m.logSeqNum))
   193  	}
   194  
   195  	var ins arenaskl.Inserter
   196  	var tombstoneCount, rangeKeyCount uint32
   197  	startSeqNum := seqNum
   198  	for r := batch.Reader(); ; seqNum++ {
   199  		kind, ukey, value, ok := r.Next()
   200  		if !ok {
   201  			break
   202  		}
   203  		var err error
   204  		ikey := base.MakeInternalKey(ukey, seqNum, kind)
   205  		switch kind {
   206  		case InternalKeyKindRangeDelete:
   207  			err = m.rangeDelSkl.Add(ikey, value)
   208  			tombstoneCount++
   209  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   210  			err = m.rangeKeySkl.Add(ikey, value)
   211  			rangeKeyCount++
   212  		case InternalKeyKindLogData:
   213  			// Don't increment seqNum for LogData, since these are not applied
   214  			// to the memtable.
   215  			seqNum--
   216  		default:
   217  			err = ins.Add(&m.skl, ikey, value)
   218  		}
   219  		if err != nil {
   220  			return err
   221  		}
   222  	}
   223  	if seqNum != startSeqNum+uint64(batch.Count()) {
   224  		return base.CorruptionErrorf("bitalostable: inconsistent batch count: %d vs %d",
   225  			errors.Safe(seqNum), errors.Safe(startSeqNum+uint64(batch.Count())))
   226  	}
   227  	if tombstoneCount != 0 {
   228  		m.tombstones.invalidate(tombstoneCount)
   229  	}
   230  	if rangeKeyCount != 0 {
   231  		m.rangeKeys.invalidate(rangeKeyCount)
   232  	}
   233  	return nil
   234  }
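
// Illustrative sketch of the per-record sequence number assignment performed
// by apply: point, range-deletion, and range-key records each consume one
// sequence number starting at the batch's seqNum, while LogData records are
// WAL-only and consume none (mirroring the seqNum-- compensation above). The
// helper name is hypothetical.
func exampleSeqNumsForBatch(start uint64, kinds []base.InternalKeyKind) []uint64 {
	var assigned []uint64
	next := start
	for _, kind := range kinds {
		if kind == base.InternalKeyKindLogData {
			continue // never written to the memtable, no seqnum consumed
		}
		assigned = append(assigned, next)
		next++
	}
	return assigned
}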
   235  
   236  // newIter returns an iterator that is unpositioned (Iterator.Valid() will
   237  // return false). The iterator can be positioned via a call to SeekGE,
   238  // SeekLT, First or Last.
   239  func (m *memTable) newIter(o *IterOptions) internalIterator {
   240  	return m.skl.NewIter(o.GetLowerBound(), o.GetUpperBound())
   241  }
   242  
   243  func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
   244  	return m.skl.NewFlushIter(bytesFlushed)
   245  }
   246  
   247  func (m *memTable) newRangeDelIter(*IterOptions) keyspan.FragmentIterator {
   248  	tombstones := m.tombstones.get()
   249  	if tombstones == nil {
   250  		return nil
   251  	}
   252  	return keyspan.NewIter(m.cmp, tombstones)
   253  }
   254  
   255  func (m *memTable) newRangeKeyIter(*IterOptions) keyspan.FragmentIterator {
   256  	rangeKeys := m.rangeKeys.get()
   257  	if rangeKeys == nil {
   258  		return nil
   259  	}
   260  	return keyspan.NewIter(m.cmp, rangeKeys)
   261  }
   262  
   263  func (m *memTable) containsRangeKeys() bool {
   264  	return atomic.LoadUint32(&m.rangeKeys.atomicCount) > 0
   265  }
   266  
   267  func (m *memTable) availBytes() uint32 {
   268  	a := m.skl.Arena()
   269  	if atomic.LoadInt32(&m.writerRefs) == 1 {
   270  		// If there are no other concurrent apply operations, we can update the
   271  		// reserved bytes setting to accurately reflect how many bytes have been
   272  		// allocated vs the over-estimation present in memTableEntrySize.
   273  		m.reserved = a.Size()
   274  	}
   275  	return a.Capacity() - m.reserved
   276  }
   277  
   278  func (m *memTable) inuseBytes() uint64 {
   279  	return uint64(m.skl.Size() - memTableEmptySize)
   280  }
   281  
   282  func (m *memTable) totalBytes() uint64 {
   283  	return uint64(m.skl.Arena().Capacity())
   284  }
   285  
   286  // empty returns whether the MemTable has no key/value pairs.
   287  func (m *memTable) empty() bool {
   288  	return m.skl.Size() == memTableEmptySize
   289  }
   290  
   291  // A keySpanFrags holds a set of fragmented keyspan.Spans with a particular key
   292  // kind at a particular moment for a memtable.
   293  //
   294  // When a new span of a particular kind is added to the memtable, it may overlap
   295  // with other spans of the same kind. Instead of performing the fragmentation
   296  // whenever an iterator requires it, fragments are cached within a keySpanCache
   297  // type. The keySpanCache uses keySpanFrags to hold the cached fragmented spans.
   298  //
   299  // The count of keys (and keys of any given kind) in a memtable only
   300  // monotonically increases. The count of key spans of a particular kind is used
   301  // as a stand-in for a 'sequence number'. A keySpanFrags represents the
   302  // fragmented state of the memtable's keys of a given kind at the moment while
   303  // there existed `count` keys of that kind in the memtable.
   304  //
   305  // It is used to hold both fragmented range deletion tombstones and range keys.
   306  type keySpanFrags struct {
   307  	count uint32
   308  	once  sync.Once
   309  	spans []keyspan.Span
   310  }
   311  
   312  type constructSpan func(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error)
   313  
   314  func rangeDelConstructSpan(
   315  	ik base.InternalKey, v []byte, keysDst []keyspan.Key,
   316  ) (keyspan.Span, error) {
   317  	return rangedel.Decode(ik, v, keysDst), nil
   318  }
   319  
   320  // get retrieves the fragmented spans, populating them if necessary. Note that
   321  // the populated span fragments may be built from more than f.count memTable
   322  // spans, but that is ok for correctness. All we're requiring is that the
   323  // memTable contains at least f.count keys of the configured kind. This
   324  // situation can occur if there are multiple concurrent additions of the key
   325  // kind and a concurrent reader. The reader can load a keySpanFrags and populate
   326  // it even though it has been invalidated (i.e. replaced with a newer
   327  // keySpanFrags).
   328  func (f *keySpanFrags) get(
   329  	skl *arenaskl.Skiplist, cmp Compare, formatKey base.FormatKey, constructSpan constructSpan,
   330  ) []keyspan.Span {
   331  	f.once.Do(func() {
   332  		frag := &keyspan.Fragmenter{
   333  			Cmp:    cmp,
   334  			Format: formatKey,
   335  			Emit: func(fragmented keyspan.Span) {
   336  				f.spans = append(f.spans, fragmented)
   337  			},
   338  		}
   339  		it := skl.NewIter(nil, nil)
   340  		var keysDst []keyspan.Key
   341  		for key, val := it.First(); key != nil; key, val = it.Next() {
   342  			s, err := constructSpan(*key, val, keysDst)
   343  			if err != nil {
   344  				panic(err)
   345  			}
   346  			frag.Add(s)
   347  			keysDst = s.Keys[len(s.Keys):]
   348  		}
   349  		frag.Finish()
   350  	})
   351  	return f.spans
   352  }
   353  
   354  // A keySpanCache is used to cache a set of fragmented spans. The cache is
   355  // invalidated whenever a key of the same kind is added to a memTable, and
   356  // populated, if empty, when a span iterator of that key kind is created.
   357  type keySpanCache struct {
   358  	atomicCount   uint32
   359  	frags         unsafe.Pointer
   360  	cmp           Compare
   361  	formatKey     base.FormatKey
   362  	constructSpan constructSpan
   363  	skl           *arenaskl.Skiplist
   364  }
   365  
   366  // Invalidate the current set of cached spans, indicating the number of
   367  // spans that were added.
   368  func (c *keySpanCache) invalidate(count uint32) {
   369  	newCount := atomic.AddUint32(&c.atomicCount, count)
   370  	var frags *keySpanFrags
   371  
   372  	for {
   373  		oldPtr := atomic.LoadPointer(&c.frags)
   374  		if oldPtr != nil {
   375  			oldFrags := (*keySpanFrags)(oldPtr)
   376  			if oldFrags.count >= newCount {
   377  				// Someone else invalidated the cache before us and their invalidation
   378  				// subsumes ours.
   379  				break
   380  			}
   381  		}
   382  		if frags == nil {
   383  			frags = &keySpanFrags{count: newCount}
   384  		}
   385  		if atomic.CompareAndSwapPointer(&c.frags, oldPtr, unsafe.Pointer(frags)) {
   386  			// We successfully invalidated the cache.
   387  			break
   388  		}
   389  		// Someone else invalidated the cache. Loop and try again.
   390  	}
   391  }
   392  
   393  func (c *keySpanCache) get() []keyspan.Span {
   394  	frags := (*keySpanFrags)(atomic.LoadPointer(&c.frags))
   395  	if frags == nil {
   396  		return nil
   397  	}
   398  	return frags.get(c.skl, c.cmp, c.formatKey, c.constructSpan)
   399  }
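
// Minimal usage sketch mirroring how apply and newRangeDelIter use
// m.tombstones above: a writer invalidates the cache after adding spans of
// the corresponding kind, and a reader lazily materializes the fragmented
// spans, at most once per cache generation. The function is illustrative
// only, not part of this package's API.
func exampleKeySpanCacheUsage(m *memTable, added uint32) []keyspan.Span {
	m.tombstones.invalidate(added) // writer side, after rangeDelSkl.Add calls
	return m.tombstones.get()      // reader side, fragments lazily via keySpanFrags
}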