github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/mem_table.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/petermattis/pebble/internal/arenaskl"
	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/rangedel"
)

func memTableEntrySize(keyBytes, valueBytes int) uint32 {
	return arenaskl.MaxNodeSize(uint32(keyBytes)+8, uint32(valueBytes))
}

// A memTable implements an in-memory layer of the LSM. A memTable is mutable,
// but append-only. Records are added, but never removed. Deletion is supported
// via tombstones, but it is up to higher level code (see Iterator) to support
// processing those tombstones.
//
// A memTable is implemented on top of a lock-free arena-backed skiplist. An
// arena is a fixed size contiguous chunk of memory (see
// Options.MemTableSize). A memTable's memory consumption is thus fixed at
// the time of creation (with the exception of the cached fragmented range
// tombstones). The arena-backed skiplist provides both forward and reverse
// links which makes forward and reverse iteration the same speed.
//
// A batch is "applied" to a memTable in a two-step process: prepare(batch) ->
// apply(batch). memTable.prepare() is not thread-safe and must be called with
// external synchronization. Preparation reserves space in the memTable for the
// batch. Note that we pessimistically compute how much space a batch will
// consume in the memTable (see memTableEntrySize and
// Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
// the memTable can be performed concurrently with other apply
// operations. Applying a batch is an O(N log M) operation where N is the number
// of records in the batch and M is the number of records in the memtable. The
// commitPipeline serializes batch preparation, and allows batch application to
// proceed concurrently.
//
// It is safe to call get, apply, newIter, and newRangeDelIter concurrently.
type memTable struct {
	cmp         Compare
	equal       Equal
	skl         arenaskl.Skiplist
	rangeDelSkl arenaskl.Skiplist
	emptySize   uint32
	reserved    uint32
	refs        int32
	flushedCh   chan struct{}
	tombstones  rangeTombstoneCache
	logNum      uint64
	logSize     uint64
}
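
// batchMemTableSizeSketch is a hypothetical helper (not part of the package)
// that makes the pessimistic accounting mentioned in the comment above
// concrete: each record is charged the worst-case skiplist node size for its
// key (plus the 8-byte internal key trailer folded in by memTableEntrySize)
// and its value. Batch.memTableSize, which prepare() later checks against the
// remaining arena capacity, is assumed to be accumulated in this manner.
func batchMemTableSizeSketch(keys, values [][]byte) uint32 {
	var size uint32
	for i := range keys {
		size += memTableEntrySize(len(keys[i]), len(values[i]))
	}
	return size
}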

// newMemTable returns a new MemTable.
func newMemTable(o *Options) *memTable {
	o = o.EnsureDefaults()
	m := &memTable{
		cmp:       o.Comparer.Compare,
		equal:     o.Comparer.Equal,
		refs:      1,
		flushedCh: make(chan struct{}),
	}
	arena := arenaskl.NewArena(uint32(o.MemTableSize), 0)
	m.skl.Reset(arena, m.cmp)
	m.rangeDelSkl.Reset(arena, m.cmp)
	m.emptySize = arena.Size()
	return m
}

func (m *memTable) ref() {
	atomic.AddInt32(&m.refs, 1)
}

func (m *memTable) unref() bool {
	switch v := atomic.AddInt32(&m.refs, -1); {
	case v < 0:
		panic("pebble: inconsistent reference count")
	case v == 0:
		return true
	default:
		return false
	}
}

func (m *memTable) flushed() chan struct{} {
	return m.flushedCh
}

func (m *memTable) readyForFlush() bool {
	return atomic.LoadInt32(&m.refs) == 0
}

func (m *memTable) logInfo() (uint64, uint64) {
	return m.logNum, m.logSize
}

// Get gets the value for the given key. It returns ErrNotFound if the DB does
// not contain the key.
func (m *memTable) get(key []byte) (value []byte, err error) {
	it := m.skl.NewIter(nil, nil)
	ikey, val := it.SeekGE(key)
	if ikey == nil {
		return nil, ErrNotFound
	}
	if !m.equal(key, ikey.UserKey) {
		return nil, ErrNotFound
	}
	if ikey.Kind() == InternalKeyKindDelete {
		return nil, ErrNotFound
	}
	return val, nil
}

// Prepare reserves space for the batch in the memtable and references the
// memtable, preventing it from being flushed until the batch is applied. Note
// that prepare is not thread-safe, while apply is. The caller must call
// unref() after the batch has been applied.
func (m *memTable) prepare(batch *Batch) error {
	a := m.skl.Arena()
	if atomic.LoadInt32(&m.refs) == 1 {
		// If there are no other concurrent apply operations, we can update the
		// reserved bytes setting to accurately reflect how many bytes have been
		// allocated vs the over-estimation present in memTableEntrySize.
		m.reserved = a.Size()
	}

	avail := a.Capacity() - m.reserved
	if batch.memTableSize > avail {
		return arenaskl.ErrArenaFull
	}
	m.reserved += batch.memTableSize

	m.ref()
	return nil
}

func (m *memTable) apply(batch *Batch, seqNum uint64) error {
	var ins arenaskl.Inserter
	var tombstoneCount uint32
	startSeqNum := seqNum
	for r := batch.Reader(); ; seqNum++ {
		kind, ukey, value, ok := r.Next()
		if !ok {
			break
		}
		var err error
		ikey := base.MakeInternalKey(ukey, seqNum, kind)
		switch kind {
		case InternalKeyKindRangeDelete:
			err = m.rangeDelSkl.Add(ikey, value)
			tombstoneCount++
		case InternalKeyKindLogData:
		default:
			err = ins.Add(&m.skl, ikey, value)
		}
		if err != nil {
			return err
		}
	}
	if seqNum != startSeqNum+uint64(batch.Count()) {
		panic(fmt.Sprintf("pebble: inconsistent batch count: %d vs %d",
			seqNum, startSeqNum+uint64(batch.Count())))
	}
	if tombstoneCount != 0 {
		m.tombstones.invalidate(tombstoneCount)
	}
	return nil
}
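
// prepareApplySketch is a hypothetical helper, not part of the package and not
// the real commit path (that sequencing lives in the commitPipeline): it
// simply illustrates the prepare -> apply -> unref sequence described in the
// memTable comment. Sequence number assignment and rotation to a new memtable
// on ErrArenaFull are elided.
func prepareApplySketch(m *memTable, b *Batch, seqNum uint64) error {
	// prepare requires external synchronization. It pessimistically reserves
	// arena space for the batch and refs the memtable so that it cannot be
	// flushed while the batch is being applied.
	if err := m.prepare(b); err != nil {
		// Typically arenaskl.ErrArenaFull: the caller would switch to a new
		// memtable and retry.
		return err
	}
	// apply may run concurrently with other apply calls. Each record in the
	// batch is assigned a consecutive sequence number starting at seqNum.
	err := m.apply(b, seqNum)
	// Drop the reference taken by prepare. Once all references are released,
	// readyForFlush reports true.
	m.unref()
	return err
}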

// newIter returns an iterator that is unpositioned (Iterator.Valid() will
// return false). The iterator can be positioned via a call to SeekGE,
// SeekLT, First or Last.
func (m *memTable) newIter(o *IterOptions) internalIterator {
	return m.skl.NewIter(o.GetLowerBound(), o.GetUpperBound())
}

func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
	return m.skl.NewFlushIter(bytesFlushed)
}

func (m *memTable) newRangeDelIter(*IterOptions) internalIterator {
	tombstones := m.tombstones.get(m)
	if tombstones == nil {
		return nil
	}
	return rangedel.NewIter(m.cmp, tombstones)
}

func (m *memTable) totalBytes() uint64 {
	return uint64(m.skl.Size() - m.emptySize)
}

func (m *memTable) close() error {
	return nil
}

// empty returns whether the MemTable has no key/value pairs.
func (m *memTable) empty() bool {
	return m.skl.Size() == m.emptySize
}

// A rangeTombstoneFrags holds a set of fragmented range tombstones generated
// at a particular "sequence number" for a memtable. Rather than use actual
// sequence numbers, this cache uses a count of the number of range tombstones
// in the memTable. Note that the count of range tombstones in a memTable only
// ever increases, which provides a monotonically increasing sequence.
type rangeTombstoneFrags struct {
	count      uint32
	once       sync.Once
	tombstones []rangedel.Tombstone
}

// get retrieves the fragmented tombstones, populating them if necessary. Note
// that the populated tombstone fragments may be built from more than f.count
// memTable range tombstones, but that is ok for correctness. All we're
// requiring is that the memTable contains at least f.count range
// tombstones. This situation can occur if there are multiple concurrent
// additions of range tombstones and a concurrent reader. The reader can load a
// tombstoneFrags and populate it even though it has been invalidated
// (i.e. replaced with a newer tombstoneFrags).
func (f *rangeTombstoneFrags) get(m *memTable) []rangedel.Tombstone {
	f.once.Do(func() {
		frag := &rangedel.Fragmenter{
			Cmp: m.cmp,
			Emit: func(fragmented []rangedel.Tombstone) {
				f.tombstones = append(f.tombstones, fragmented...)
			},
		}
		it := m.rangeDelSkl.NewIter(nil, nil)
		for key, val := it.First(); key != nil; key, val = it.Next() {
			frag.Add(*key, val)
		}
		frag.Finish()
	})
	return f.tombstones
}

// A rangeTombstoneCache is used to cache a set of fragmented tombstones. The
// cache is invalidated whenever a tombstone is added to a memTable, and
// populated (when empty) when a range-del iterator is created.
type rangeTombstoneCache struct {
	count uint32
	frags unsafe.Pointer
}
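
// fragsCurrentSketch is a hypothetical helper (not part of the package) that
// makes the count-as-sequence-number idea concrete: a published
// rangeTombstoneFrags still covers every range tombstone counted by the cache
// as long as its count is at least the cache's running count. The invalidate
// method below performs essentially this comparison before deciding whether
// to publish a replacement.
func fragsCurrentSketch(c *rangeTombstoneCache, f *rangeTombstoneFrags) bool {
	return f.count >= atomic.LoadUint32(&c.count)
}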

// Invalidate the current set of cached tombstones, indicating the number of
// tombstones that were added.
func (c *rangeTombstoneCache) invalidate(count uint32) {
	newCount := atomic.AddUint32(&c.count, count)
	var frags *rangeTombstoneFrags

	for {
		oldPtr := atomic.LoadPointer(&c.frags)
		if oldPtr != nil {
			oldFrags := (*rangeTombstoneFrags)(oldPtr)
			if oldFrags.count >= newCount {
				// Someone else invalidated the cache before us and their invalidation
				// subsumes ours.
				break
			}
		}
		if frags == nil {
			frags = &rangeTombstoneFrags{count: newCount}
		}
		if atomic.CompareAndSwapPointer(&c.frags, oldPtr, unsafe.Pointer(frags)) {
			// We successfully invalidated the cache.
			break
		}
		// Someone else invalidated the cache. Loop and try again.
	}
}

func (c *rangeTombstoneCache) get(m *memTable) []rangedel.Tombstone {
	frags := (*rangeTombstoneFrags)(atomic.LoadPointer(&c.frags))
	if frags == nil {
		return nil
	}
	return frags.get(m)
}
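
// pointGetIgnoresRangeDelSketch is a hypothetical helper (not part of the
// package) illustrating the caveat from the memTable comment at the top of
// this file: get consults only the point-key skiplist, so a key covered by a
// range tombstone in rangeDelSkl is still returned here. Merging point keys
// with the fragmented tombstones exposed by newRangeDelIter is the job of
// higher level iterator code.
func pointGetIgnoresRangeDelSketch(m *memTable, key []byte) ([]byte, bool) {
	v, err := m.get(key)
	if err != nil {
		// ErrNotFound: either no entry with this user key exists, or the
		// newest entry for it is a point deletion (InternalKeyKindDelete).
		return nil, false
	}
	// v may still be shadowed by a range tombstone; callers must consult
	// newRangeDelIter (or rely on higher-level iterators) to account for that.
	return v, true
}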