github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/mem_table.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/petermattis/pebble/internal/arenaskl"
	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/rangedel"
)

// memTableEntrySize returns an upper bound on the number of arena bytes needed
// to store an entry with the given key and value sizes. The extra 8 bytes
// account for the internal key trailer (sequence number and kind) stored
// alongside the user key.
func memTableEntrySize(keyBytes, valueBytes int) uint32 {
	return arenaskl.MaxNodeSize(uint32(keyBytes)+8, uint32(valueBytes))
}

// A memTable implements an in-memory layer of the LSM. A memTable is mutable,
// but append-only. Records are added, but never removed. Deletion is supported
// via tombstones, but it is up to higher-level code (see Iterator) to support
// processing those tombstones.
//
// A memTable is implemented on top of a lock-free arena-backed skiplist. An
// arena is a fixed-size contiguous chunk of memory (see
// Options.MemTableSize). A memTable's memory consumption is thus fixed at
// the time of creation (with the exception of the cached fragmented range
// tombstones). The arena-backed skiplist provides both forward and reverse
// links which makes forward and reverse iteration the same speed.
//
// A batch is "applied" to a memTable in a two-step process: prepare(batch) ->
// apply(batch). memTable.prepare() is not thread-safe and must be called with
// external synchronization. Preparation reserves space in the memTable for the
// batch. Note that we pessimistically compute how much space a batch will
// consume in the memTable (see memTableEntrySize and
// Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
// the memTable can be performed concurrently with other apply
// operations. Applying a batch is an O(N log M) operation where N is the number
// of records in the batch and M is the number of records in the memTable. The
// commitPipeline serializes batch preparation, and allows batch application to
// proceed concurrently. (A sketch of this flow follows the struct definition
// below.)
//
// It is safe to call get, apply, newIter, and newRangeDelIter concurrently.
type memTable struct {
	cmp         Compare
	equal       Equal
	skl         arenaskl.Skiplist // point keys (sets, merges, deletes)
	rangeDelSkl arenaskl.Skiplist // range deletion tombstones
	emptySize   uint32            // arena size when both skiplists are empty
	reserved    uint32            // arena bytes reserved for batches via prepare
	refs        int32             // reference count; flushing is allowed once it reaches zero
	flushedCh   chan struct{}     // closed once the memTable has been flushed
	tombstones  rangeTombstoneCache
	logNum      uint64 // WAL number reported by logInfo
	logSize     uint64 // WAL size reported by logInfo
}
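
// The two-step batch flow described in the memTable comment above, as a
// minimal sketch. The function name and the seqNum parameter are illustrative
// only; in Pebble the commitPipeline performs this sequencing, serializing
// prepare while allowing apply to run concurrently, and reacts to a full
// arena by rotating to a new memTable.
func applyBatchSketch(m *memTable, b *Batch, seqNum uint64) error {
	// Reserve space for the batch and take a reference so the memTable cannot
	// be flushed before the batch has been applied.
	if err := m.prepare(b); err != nil {
		return err // typically arenaskl.ErrArenaFull: the memTable is full
	}
	defer m.unref()
	// Insert the batch's records; this may run concurrently with other applies.
	return m.apply(b, seqNum)
}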

// newMemTable returns a new memTable.
func newMemTable(o *Options) *memTable {
	o = o.EnsureDefaults()
	m := &memTable{
		cmp:       o.Comparer.Compare,
		equal:     o.Comparer.Equal,
		refs:      1,
		flushedCh: make(chan struct{}),
	}
	// Both skiplists share a single arena so that total memory usage remains
	// bounded by Options.MemTableSize.
	arena := arenaskl.NewArena(uint32(o.MemTableSize), 0)
	m.skl.Reset(arena, m.cmp)
	m.rangeDelSkl.Reset(arena, m.cmp)
	m.emptySize = arena.Size()
	return m
}

// ref adds a reference to the memTable.
func (m *memTable) ref() {
	atomic.AddInt32(&m.refs, 1)
}

// unref drops a reference, returning true when the last reference has been
// released and the memTable can be flushed.
func (m *memTable) unref() bool {
	switch v := atomic.AddInt32(&m.refs, -1); {
	case v < 0:
		panic("pebble: inconsistent reference count")
	case v == 0:
		return true
	default:
		return false
	}
}

// flushed returns a channel that is closed once the memTable has been flushed.
func (m *memTable) flushed() chan struct{} {
	return m.flushedCh
}

// readyForFlush reports whether there are no outstanding references to the
// memTable and it can therefore be flushed.
func (m *memTable) readyForFlush() bool {
	return atomic.LoadInt32(&m.refs) == 0
}

// logInfo returns the WAL number and size recorded for the memTable.
func (m *memTable) logInfo() (uint64, uint64) {
	return m.logNum, m.logSize
}

// get gets the value for the given key. It returns ErrNotFound if the DB does
// not contain the key.
func (m *memTable) get(key []byte) (value []byte, err error) {
	it := m.skl.NewIter(nil, nil)
	ikey, val := it.SeekGE(key)
	if ikey == nil {
		return nil, ErrNotFound
	}
	if !m.equal(key, ikey.UserKey) {
		return nil, ErrNotFound
	}
	if ikey.Kind() == InternalKeyKindDelete {
		return nil, ErrNotFound
	}
	return val, nil
}
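
// lookupSketch is a minimal, hypothetical helper illustrating the get contract
// above: a key that was never written and a key whose most recent entry is a
// deletion tombstone both surface as ErrNotFound.
func lookupSketch(m *memTable, key []byte) ([]byte, bool) {
	v, err := m.get(key)
	if err != nil {
		// get only returns ErrNotFound: the key is absent or deleted.
		return nil, false
	}
	return v, true
}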

// prepare reserves space for the batch in the memtable and references the
// memtable, preventing it from being flushed until the batch is applied. Note
// that prepare is not thread-safe, while apply is. The caller must call
// unref() after the batch has been applied.
func (m *memTable) prepare(batch *Batch) error {
	a := m.skl.Arena()
	if atomic.LoadInt32(&m.refs) == 1 {
		// If there are no other concurrent apply operations, we can update the
		// reserved bytes setting to accurately reflect how many bytes have been
		// allocated vs the over-estimation present in memTableEntrySize.
		m.reserved = a.Size()
	}

	avail := a.Capacity() - m.reserved
	if batch.memTableSize > avail {
		return arenaskl.ErrArenaFull
	}
	m.reserved += batch.memTableSize

	m.ref()
	return nil
}
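
// retryPrepareSketch is a minimal sketch of how a caller might react to
// arenaskl.ErrArenaFull: the current memTable is full, so a fresh one has to
// be swapped in before retrying. The rotate callback is hypothetical; in
// Pebble the DB performs this rotation (and queues the full memTable for
// flushing) under its own synchronization.
func retryPrepareSketch(m *memTable, rotate func() *memTable, b *Batch) (*memTable, error) {
	for {
		err := m.prepare(b)
		if err != arenaskl.ErrArenaFull {
			// Success (err == nil) or a non-recoverable error: stop retrying.
			return m, err
		}
		m = rotate()
	}
}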

// apply inserts the batch's records into the memTable, assigning them sequence
// numbers starting at seqNum. Range deletion tombstones go to the range-del
// skiplist and invalidate the cached fragmented tombstones; all other records
// (other than LogData) go to the point-key skiplist. apply may be called
// concurrently with other calls to apply.
func (m *memTable) apply(batch *Batch, seqNum uint64) error {
	var ins arenaskl.Inserter
	var tombstoneCount uint32
	startSeqNum := seqNum
	for r := batch.Reader(); ; seqNum++ {
		kind, ukey, value, ok := r.Next()
		if !ok {
			break
		}
		var err error
		ikey := base.MakeInternalKey(ukey, seqNum, kind)
		switch kind {
		case InternalKeyKindRangeDelete:
			err = m.rangeDelSkl.Add(ikey, value)
			tombstoneCount++
		case InternalKeyKindLogData:
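			// LogData records carry opaque data for the WAL only; nothing is
			// added to either skiplist.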
		default:
			err = ins.Add(&m.skl, ikey, value)
		}
		if err != nil {
			return err
		}
	}
	if seqNum != startSeqNum+uint64(batch.Count()) {
		panic(fmt.Sprintf("pebble: inconsistent batch count: %d vs %d",
			seqNum, startSeqNum+uint64(batch.Count())))
	}
	if tombstoneCount != 0 {
		m.tombstones.invalidate(tombstoneCount)
	}
	return nil
}

// newIter returns an iterator that is unpositioned (Iterator.Valid() will
// return false). The iterator can be positioned via a call to SeekGE,
// SeekLT, First or Last.
func (m *memTable) newIter(o *IterOptions) internalIterator {
	return m.skl.NewIter(o.GetLowerBound(), o.GetUpperBound())
}
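
// scanSketch is a minimal, hypothetical example of draining the iterator
// returned by newIter: First positions the unpositioned iterator, and a nil
// key signals exhaustion. This mirrors the loop over the range-del skiplist
// in rangeTombstoneFrags.get below.
func scanSketch(m *memTable, o *IterOptions, visit func(base.InternalKey, []byte)) {
	it := m.newIter(o)
	for key, val := it.First(); key != nil; key, val = it.Next() {
		visit(*key, val)
	}
}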

// newFlushIter returns an iterator over the memTable's entries that is used
// when flushing the memTable; flush progress is reported through bytesFlushed.
func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
	return m.skl.NewFlushIter(bytesFlushed)
}

// newRangeDelIter returns an iterator over the memTable's fragmented range
// tombstones, or nil if the memTable contains no range tombstones.
func (m *memTable) newRangeDelIter(*IterOptions) internalIterator {
	tombstones := m.tombstones.get(m)
	if tombstones == nil {
		return nil
	}
	return rangedel.NewIter(m.cmp, tombstones)
}

// totalBytes returns the number of arena bytes used by entries added to the
// memTable (the arena is shared by the point and range-del skiplists).
func (m *memTable) totalBytes() uint64 {
	return uint64(m.skl.Size() - m.emptySize)
}

func (m *memTable) close() error {
	return nil
}

// empty returns whether the memTable has no key/value pairs.
func (m *memTable) empty() bool {
	return m.skl.Size() == m.emptySize
}

// A rangeTombstoneFrags holds a set of fragmented range tombstones generated
// at a particular "sequence number" for a memtable. Rather than use actual
// sequence numbers, this cache uses a count of the number of range tombstones
// in the memTable. Note that the count of range tombstones in a memTable only
// ever increases, which provides a monotonically increasing sequence.
type rangeTombstoneFrags struct {
	count      uint32
	once       sync.Once
	tombstones []rangedel.Tombstone
}

// get retrieves the fragmented tombstones, populating them if necessary. Note
// that the populated tombstone fragments may be built from more than f.count
// memTable range tombstones, but that is ok for correctness. All we're
// requiring is that the memTable contains at least f.count range
// tombstones. This situation can occur if there are multiple concurrent
// additions of range tombstones and a concurrent reader. The reader can load a
// rangeTombstoneFrags and populate it even though it has been invalidated
// (i.e. replaced with a newer rangeTombstoneFrags).
func (f *rangeTombstoneFrags) get(m *memTable) []rangedel.Tombstone {
	f.once.Do(func() {
		frag := &rangedel.Fragmenter{
			Cmp: m.cmp,
			Emit: func(fragmented []rangedel.Tombstone) {
				f.tombstones = append(f.tombstones, fragmented...)
			},
		}
		it := m.rangeDelSkl.NewIter(nil, nil)
		for key, val := it.First(); key != nil; key, val = it.Next() {
			frag.Add(*key, val)
		}
		frag.Finish()
	})
	return f.tombstones
}

// A rangeTombstoneCache is used to cache a set of fragmented tombstones. The
// cache is invalidated whenever a tombstone is added to a memTable, and
// lazily repopulated when a range-del iterator is created.
type rangeTombstoneCache struct {
	count uint32         // total number of range tombstones added to the memTable
	frags unsafe.Pointer // *rangeTombstoneFrags, accessed atomically
}
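
// For example: if the memTable already holds 3 range tombstones and a batch
// adds 2 more, apply calls invalidate(2), which bumps count to 5 and swaps in
// an empty rangeTombstoneFrags{count: 5}, discarding any fragments built when
// only 3 tombstones existed. The next newRangeDelIter call populates the new
// entry via rangeTombstoneFrags.get.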

// Invalidate the current set of cached tombstones, indicating the number of
// tombstones that were added.
func (c *rangeTombstoneCache) invalidate(count uint32) {
	newCount := atomic.AddUint32(&c.count, count)
	var frags *rangeTombstoneFrags

	for {
		oldPtr := atomic.LoadPointer(&c.frags)
		if oldPtr != nil {
			oldFrags := (*rangeTombstoneFrags)(oldPtr)
			if oldFrags.count >= newCount {
				// Someone else invalidated the cache before us and their invalidation
				// subsumes ours.
				break
			}
		}
		if frags == nil {
			frags = &rangeTombstoneFrags{count: newCount}
		}
		if atomic.CompareAndSwapPointer(&c.frags, oldPtr, unsafe.Pointer(frags)) {
			// We successfully invalidated the cache.
			break
		}
		// Someone else invalidated the cache. Loop and try again.
	}
}

// get returns the cached fragmented tombstones, populating them if necessary.
// It returns nil if no range tombstones have been added to the memTable.
func (c *rangeTombstoneCache) get(m *memTable) []rangedel.Tombstone {
	frags := (*rangeTombstoneFrags)(atomic.LoadPointer(&c.frags))
	if frags == nil {
		return nil
	}
	return frags.get(m)
}