github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/mem_table.go

// Copyright 2011 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"os"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/arenaskl"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/rangedel"
	"github.com/zuoyebang/bitalostable/internal/rangekey"
)

func memTableEntrySize(keyBytes, valueBytes int) uint64 {
	return arenaskl.MaxNodeSize(uint32(keyBytes)+8, uint32(valueBytes))
}

// memTableEmptySize is the amount of allocated space in the arena when the
// memtable is empty.
var memTableEmptySize = func() uint32 {
	var pointSkl arenaskl.Skiplist
	var rangeDelSkl arenaskl.Skiplist
	var rangeKeySkl arenaskl.Skiplist
	arena := arenaskl.NewArena(make([]byte, 16<<10 /* 16 KB */))
	pointSkl.Reset(arena, bytes.Compare)
	rangeDelSkl.Reset(arena, bytes.Compare)
	rangeKeySkl.Reset(arena, bytes.Compare)
	return arena.Size()
}()
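
// The helper below is illustrative only and is not part of the package API
// (the name estimateArenaUsage is hypothetical). It shows how the pessimistic
// per-entry accounting provided by memTableEntrySize above can be used to
// estimate the arena space a set of key/value pairs would reserve via
// prepare(); the production code tracks this in Batch.memTableSize (see
// prepare below).
func estimateArenaUsage(kvLens [][2]int) uint64 {
	var total uint64
	for _, kv := range kvLens {
		// memTableEntrySize over-estimates: it charges the maximum possible
		// arena-backed skiplist node size for each key/value pair.
		total += memTableEntrySize(kv[0], kv[1])
	}
	return total
}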

// A memTable implements an in-memory layer of the LSM. A memTable is mutable,
// but append-only. Records are added, but never removed. Deletion is supported
// via tombstones, but it is up to higher level code (see Iterator) to support
// processing those tombstones.
//
// A memTable is implemented on top of a lock-free arena-backed skiplist. An
// arena is a fixed size contiguous chunk of memory (see
// Options.MemTableSize). A memTable's memory consumption is thus fixed at the
// time of creation (with the exception of the cached fragmented range
// tombstones). The arena-backed skiplist provides both forward and reverse
// links which makes forward and reverse iteration the same speed.
//
// A batch is "applied" to a memTable in a two step process: prepare(batch) ->
// apply(batch). memTable.prepare() is not thread-safe and must be called with
// external synchronization. Preparation reserves space in the memTable for the
// batch. Note that we pessimistically compute how much space a batch will
// consume in the memTable (see memTableEntrySize and
// Batch.memTableSize). Preparation is an O(1) operation. Applying a batch to
// the memTable can be performed concurrently with other apply
// operations. Applying a batch is an O(n log m) operation where n is the number
// of records in the batch and m is the number of records in the memtable. The
// commitPipeline serializes batch preparation, and allows batch application to
// proceed concurrently.
//
// It is safe to call get, apply, newIter, and newRangeDelIter concurrently.
type memTable struct {
	cmp         Compare
	formatKey   base.FormatKey
	equal       Equal
	arenaBuf    []byte
	skl         arenaskl.Skiplist
	rangeDelSkl arenaskl.Skiplist
	rangeKeySkl arenaskl.Skiplist
	// reserved tracks the amount of space used by the memtable, both by actual
	// data stored in the memtable as well as inflight batch commit
	// operations. This value is incremented pessimistically by prepare() in
	// order to account for the space needed by a batch.
	reserved uint32
	// writerRefs tracks the write references on the memtable. The two sources of
	// writer references are the memtable being on DB.mu.mem.queue and from
	// inflight mutations that have reserved space in the memtable but not yet
	// applied. The memtable cannot be flushed to disk until the writer refs
	// drops to zero.
	writerRefs int32
	tombstones keySpanCache
	rangeKeys  keySpanCache
	// The current logSeqNum at the time the memtable was created. This is
	// guaranteed to be less than or equal to any seqnum stored in the memtable.
	logSeqNum uint64
}

// memTableOptions holds configuration used when creating a memTable. All of
// the fields are optional and will be filled with defaults if not specified,
// which is used by tests.
type memTableOptions struct {
	*Options
	arenaBuf  []byte
	size      int
	logSeqNum uint64
}

func checkMemTable(obj interface{}) {
	m := obj.(*memTable)
	if m.arenaBuf != nil {
		fmt.Fprintf(os.Stderr, "%p: memTable buffer was not freed\n", m.arenaBuf)
		os.Exit(1)
	}
}

// newMemTable returns a new MemTable of the specified size. If size is zero,
// Options.MemTableSize is used instead.
func newMemTable(opts memTableOptions) *memTable {
	opts.Options = opts.Options.EnsureDefaults()
	if opts.size == 0 {
		opts.size = opts.MemTableSize
	}

	m := &memTable{
		cmp:        opts.Comparer.Compare,
		formatKey:  opts.Comparer.FormatKey,
		equal:      opts.Comparer.Equal,
		arenaBuf:   opts.arenaBuf,
		writerRefs: 1,
		logSeqNum:  opts.logSeqNum,
	}
	m.tombstones = keySpanCache{
		cmp:           m.cmp,
		formatKey:     m.formatKey,
		skl:           &m.rangeDelSkl,
		constructSpan: rangeDelConstructSpan,
	}
	m.rangeKeys = keySpanCache{
		cmp:           m.cmp,
		formatKey:     m.formatKey,
		skl:           &m.rangeKeySkl,
		constructSpan: rangekey.Decode,
	}

	if m.arenaBuf == nil {
		m.arenaBuf = make([]byte, opts.size)
	}

	arena := arenaskl.NewArena(m.arenaBuf)
	m.skl.Reset(arena, m.cmp)
	m.rangeDelSkl.Reset(arena, m.cmp)
	m.rangeKeySkl.Reset(arena, m.cmp)
	return m
}

func (m *memTable) release() {
}

func (m *memTable) writerRef() {
	switch v := atomic.AddInt32(&m.writerRefs, 1); {
	case v <= 1:
		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
	}
}

func (m *memTable) writerUnref() bool {
	switch v := atomic.AddInt32(&m.writerRefs, -1); {
	case v < 0:
		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
	case v == 0:
		return true
	default:
		return false
	}
}

func (m *memTable) readyForFlush() bool {
	return atomic.LoadInt32(&m.writerRefs) == 0
}
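
// The sketch below is illustrative only and is not part of the package API
// (the name exampleCommitToMemTable is hypothetical). It walks through the
// two step prepare -> apply process described in the memTable comment above:
// prepare() is called under external synchronization (the commitPipeline),
// apply() may then run concurrently with other applies, and the writer
// reference taken by prepare() is dropped once the batch has been applied.
func exampleCommitToMemTable(m *memTable, b *Batch, seqNum uint64) error {
	// Reserve space pessimistically. Not thread-safe; the caller serializes
	// all prepare() calls.
	if err := m.prepare(b); err != nil {
		// Typically arenaskl.ErrArenaFull, in which case the caller would
		// rotate to a new memtable and retry.
		return err
	}
	// Insertion into the skiplists can proceed concurrently with other apply
	// operations.
	if err := m.apply(b, seqNum); err != nil {
		return err
	}
	// Release the writer reference taken by prepare(); the memtable cannot be
	// flushed until all writer refs are released.
	m.writerUnref()
	return nil
}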

// Prepare reserves space for the batch in the memtable and references the
// memtable, preventing it from being flushed until the batch is applied. Note
// that prepare is not thread-safe, while apply is. The caller must call
// writerUnref() after the batch has been applied.
func (m *memTable) prepare(batch *Batch) error {
	avail := m.availBytes()
	if batch.memTableSize > uint64(avail) {
		return arenaskl.ErrArenaFull
	}
	m.reserved += uint32(batch.memTableSize)

	m.writerRef()
	return nil
}

func (m *memTable) apply(batch *Batch, seqNum uint64) error {
	if seqNum < m.logSeqNum {
		return base.CorruptionErrorf("bitalostable: batch seqnum %d is less than memtable creation seqnum %d",
			errors.Safe(seqNum), errors.Safe(m.logSeqNum))
	}

	var ins arenaskl.Inserter
	var tombstoneCount, rangeKeyCount uint32
	startSeqNum := seqNum
	for r := batch.Reader(); ; seqNum++ {
		kind, ukey, value, ok := r.Next()
		if !ok {
			break
		}
		var err error
		ikey := base.MakeInternalKey(ukey, seqNum, kind)
		switch kind {
		case InternalKeyKindRangeDelete:
			err = m.rangeDelSkl.Add(ikey, value)
			tombstoneCount++
		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
			err = m.rangeKeySkl.Add(ikey, value)
			rangeKeyCount++
		case InternalKeyKindLogData:
			// Don't increment seqNum for LogData, since these are not applied
			// to the memtable.
			seqNum--
		default:
			err = ins.Add(&m.skl, ikey, value)
		}
		if err != nil {
			return err
		}
	}
	if seqNum != startSeqNum+uint64(batch.Count()) {
		return base.CorruptionErrorf("bitalostable: inconsistent batch count: %d vs %d",
			errors.Safe(seqNum), errors.Safe(startSeqNum+uint64(batch.Count())))
	}
	if tombstoneCount != 0 {
		m.tombstones.invalidate(tombstoneCount)
	}
	if rangeKeyCount != 0 {
		m.rangeKeys.invalidate(rangeKeyCount)
	}
	return nil
}

// newIter returns an iterator that is unpositioned (Iterator.Valid() will
// return false). The iterator can be positioned via a call to SeekGE,
// SeekLT, First or Last.
func (m *memTable) newIter(o *IterOptions) internalIterator {
	return m.skl.NewIter(o.GetLowerBound(), o.GetUpperBound())
}

func (m *memTable) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
	return m.skl.NewFlushIter(bytesFlushed)
}

func (m *memTable) newRangeDelIter(*IterOptions) keyspan.FragmentIterator {
	tombstones := m.tombstones.get()
	if tombstones == nil {
		return nil
	}
	return keyspan.NewIter(m.cmp, tombstones)
}

func (m *memTable) newRangeKeyIter(*IterOptions) keyspan.FragmentIterator {
	rangeKeys := m.rangeKeys.get()
	if rangeKeys == nil {
		return nil
	}
	return keyspan.NewIter(m.cmp, rangeKeys)
}

func (m *memTable) containsRangeKeys() bool {
	return atomic.LoadUint32(&m.rangeKeys.atomicCount) > 0
}

func (m *memTable) availBytes() uint32 {
	a := m.skl.Arena()
	if atomic.LoadInt32(&m.writerRefs) == 1 {
		// If there are no other concurrent apply operations, we can update the
		// reserved bytes setting to accurately reflect how many bytes have been
		// allocated vs the over-estimation present in memTableEntrySize.
		m.reserved = a.Size()
	}
	return a.Capacity() - m.reserved
}

func (m *memTable) inuseBytes() uint64 {
	return uint64(m.skl.Size() - memTableEmptySize)
}

func (m *memTable) totalBytes() uint64 {
	return uint64(m.skl.Arena().Capacity())
}
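
// The helper below is illustrative only and is not part of the package API
// (the name nextSeqNumAfterApply is hypothetical). It captures the sequence
// number arithmetic that apply() verifies above: a batch consumes exactly
// Batch.Count() sequence numbers. LogData records do not consume one (apply()
// decrements seqNum for them before the loop increments it again), so the
// invariant implies they are not included in Count().
func nextSeqNumAfterApply(startSeqNum uint64, b *Batch) uint64 {
	return startSeqNum + uint64(b.Count())
}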

// empty returns whether the MemTable has no key/value pairs.
func (m *memTable) empty() bool {
	return m.skl.Size() == memTableEmptySize
}

// A keySpanFrags holds a set of fragmented keyspan.Spans with a particular key
// kind at a particular moment for a memtable.
//
// When a new span of a particular kind is added to the memtable, it may overlap
// with other spans of the same kind. Instead of performing the fragmentation
// whenever an iterator requires it, fragments are cached within a keySpanCache
// type. The keySpanCache uses keySpanFrags to hold the cached fragmented spans.
//
// The count of keys (and keys of any given kind) in a memtable only
// monotonically increases. The count of key spans of a particular kind is used
// as a stand-in for a 'sequence number'. A keySpanFrags represents the
// fragmented state of the memtable's keys of a given kind at the moment while
// there existed `count` keys of that kind in the memtable.
//
// It's used to contain fragmented range deletion tombstones and range keys.
type keySpanFrags struct {
	count uint32
	once  sync.Once
	spans []keyspan.Span
}

type constructSpan func(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error)

func rangeDelConstructSpan(
	ik base.InternalKey, v []byte, keysDst []keyspan.Key,
) (keyspan.Span, error) {
	return rangedel.Decode(ik, v, keysDst), nil
}

// get retrieves the fragmented spans, populating them if necessary. Note that
// the populated span fragments may be built from more than f.count memTable
// spans, but that is ok for correctness. All we're requiring is that the
// memTable contains at least f.count keys of the configured kind. This
// situation can occur if there are multiple concurrent additions of the key
// kind and a concurrent reader. The reader can load a keySpanFrags and populate
// it even though it has been invalidated (i.e. replaced with a newer
// keySpanFrags).
func (f *keySpanFrags) get(
	skl *arenaskl.Skiplist, cmp Compare, formatKey base.FormatKey, constructSpan constructSpan,
) []keyspan.Span {
	f.once.Do(func() {
		frag := &keyspan.Fragmenter{
			Cmp:    cmp,
			Format: formatKey,
			Emit: func(fragmented keyspan.Span) {
				f.spans = append(f.spans, fragmented)
			},
		}
		it := skl.NewIter(nil, nil)
		var keysDst []keyspan.Key
		for key, val := it.First(); key != nil; key, val = it.Next() {
			s, err := constructSpan(*key, val, keysDst)
			if err != nil {
				panic(err)
			}
			frag.Add(s)
			keysDst = s.Keys[len(s.Keys):]
		}
		frag.Finish()
	})
	return f.spans
}

// A keySpanCache is used to cache a set of fragmented spans. The cache is
// invalidated whenever a key of the same kind is added to a memTable, and
// populated, when empty, when a span iterator of that key kind is created.
type keySpanCache struct {
	atomicCount   uint32
	frags         unsafe.Pointer
	cmp           Compare
	formatKey     base.FormatKey
	constructSpan constructSpan
	skl           *arenaskl.Skiplist
}
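
// The helper below is illustrative only and is not part of the package API
// (the name keySpanFragsIsStale is hypothetical). It demonstrates the 'count
// as a sequence number' idea described above: a cached keySpanFrags is stale
// once the memtable has accumulated more spans of that kind than the frags
// were built against. invalidate() below only replaces the cached frags when
// the new count is higher, and get() tolerates frags built from more spans
// than f.count.
func keySpanFragsIsStale(f *keySpanFrags, c *keySpanCache) bool {
	return f.count < atomic.LoadUint32(&c.atomicCount)
}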

// Invalidate the current set of cached spans, indicating the number of
// spans that were added.
func (c *keySpanCache) invalidate(count uint32) {
	newCount := atomic.AddUint32(&c.atomicCount, count)
	var frags *keySpanFrags

	for {
		oldPtr := atomic.LoadPointer(&c.frags)
		if oldPtr != nil {
			oldFrags := (*keySpanFrags)(oldPtr)
			if oldFrags.count >= newCount {
				// Someone else invalidated the cache before us and their invalidation
				// subsumes ours.
				break
			}
		}
		if frags == nil {
			frags = &keySpanFrags{count: newCount}
		}
		if atomic.CompareAndSwapPointer(&c.frags, oldPtr, unsafe.Pointer(frags)) {
			// We successfully invalidated the cache.
			break
		}
		// Someone else invalidated the cache. Loop and try again.
	}
}

func (c *keySpanCache) get() []keyspan.Span {
	frags := (*keySpanFrags)(atomic.LoadPointer(&c.frags))
	if frags == nil {
		return nil
	}
	return frags.get(c.skl, c.cmp, c.formatKey, c.constructSpan)
}
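
// The sketch below is illustrative only and is not part of the package API
// (the name exampleMaybeFlush is hypothetical). It shows how a memtable
// becomes flushable: the memtable is created with writerRefs == 1 for its
// slot on DB.mu.mem.queue, and each prepare() adds a ref that the caller
// releases after apply(). Once the final ref is dropped, readyForFlush()
// reports true and the flush can drain the skiplist via newFlushIter.
func exampleMaybeFlush(m *memTable) {
	// Drop the queue's reference; writerUnref reports whether the count
	// reached zero, i.e. whether the memtable is now flushable.
	if m.writerUnref() {
		var bytesFlushed uint64
		it := m.newFlushIter(nil, &bytesFlushed)
		// A real flush would drain it into an sstable; elided here.
		_ = it
	}
}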