github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/batch.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "encoding/binary" 10 "fmt" 11 "io" 12 "math" 13 "sort" 14 "sync" 15 "sync/atomic" 16 "time" 17 "unsafe" 18 19 "github.com/cockroachdb/errors" 20 "github.com/cockroachdb/pebble/internal/base" 21 "github.com/cockroachdb/pebble/internal/batchskl" 22 "github.com/cockroachdb/pebble/internal/humanize" 23 "github.com/cockroachdb/pebble/internal/invariants" 24 "github.com/cockroachdb/pebble/internal/keyspan" 25 "github.com/cockroachdb/pebble/internal/private" 26 "github.com/cockroachdb/pebble/internal/rangedel" 27 "github.com/cockroachdb/pebble/internal/rangekey" 28 "github.com/cockroachdb/pebble/internal/rawalloc" 29 ) 30 31 const ( 32 batchCountOffset = 8 33 batchHeaderLen = 12 34 batchInitialSize = 1 << 10 // 1 KB 35 batchMaxRetainedSize = 1 << 20 // 1 MB 36 invalidBatchCount = 1<<32 - 1 37 maxVarintLen32 = 5 38 ) 39 40 // ErrNotIndexed means that a read operation on a batch failed because the 41 // batch is not indexed and thus doesn't support reads. 42 var ErrNotIndexed = errors.New("pebble: batch not indexed") 43 44 // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. 45 var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch")) 46 47 // ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted. 48 var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize))) 49 50 // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is 51 // being inserted into the batch. Indexing is not performed on the specified key 52 // until Finish is called, hence the name deferred. This struct lets the caller 53 // copy or encode keys/values directly into the batch representation instead of 54 // copying into an intermediary buffer then having pebble.Batch copy off of it. 55 type DeferredBatchOp struct { 56 index *batchskl.Skiplist 57 58 // Key and Value point to parts of the binary batch representation where 59 // keys and values should be encoded/copied into. len(Key) and len(Value) 60 // bytes must be copied into these slices respectively before calling 61 // Finish(). Changing where these slices point to is not allowed. 62 Key, Value []byte 63 offset uint32 64 } 65 66 // Finish completes the addition of this batch operation, and adds it to the 67 // index if necessary. Must be called once (and exactly once) keys/values 68 // have been filled into Key and Value. Not calling Finish or not 69 // copying/encoding keys will result in an incomplete index, and calling Finish 70 // twice may result in a panic. 71 func (d DeferredBatchOp) Finish() error { 72 if d.index != nil { 73 if err := d.index.Add(d.offset); err != nil { 74 return err 75 } 76 } 77 return nil 78 } 79 80 // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, 81 // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch 82 // implements the Reader interface, but only an indexed batch supports reading 83 // (without error) via Get or NewIter. A non-indexed batch will return 84 // ErrNotIndexed when read from. A batch is not safe for concurrent use, and 85 // consumers should use a batch per goroutine or provide their own 86 // synchronization. 
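//
// As an illustrative sketch (not part of this file), a typical non-indexed
// batch write, assuming a *DB named db, looks like:
//
//	b := db.NewBatch()
//	if err := b.Set([]byte("k"), []byte("v"), nil); err != nil {
//		// handle err
//	}
//	if err := b.Commit(Sync); err != nil {
//		// handle err
//	}
//	_ = b.Close()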
//
// # Indexing
//
// Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
// allows iteration via an Iterator (see Batch.NewIter). The iterator provides
// a merged view of the operations in the batch and the underlying
// database. This is implemented by treating the batch as an additional layer
// in the LSM where every entry in the batch is considered newer than any entry
// in the underlying database (batch entries have the InternalKeySeqNumBatch
// bit set). By treating the batch as an additional layer in the LSM, iteration
// supports all batch operations (i.e. Set, Merge, Delete, DeleteRange,
// RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
//
// The same key can be operated on multiple times in a batch, though only the
// latest operation will be visible. For example, Put("a", "b"), Delete("a")
// will cause the key "a" to not be visible in the batch. Put("a", "b"),
// Put("a", "c") will cause a read of "a" to return the value "c".
//
// The batch index is implemented via a skiplist (internal/batchskl). While
// the skiplist implementation is very fast, inserting into an indexed batch is
// significantly slower than inserting into a non-indexed batch. Only use an
// indexed batch if you require reading from it.
//
// # Atomic commit
//
// The operations in a batch are persisted by calling Batch.Commit which is
// equivalent to calling DB.Apply(batch). A batch is committed atomically by
// writing the internal batch representation to the WAL, adding all of the
// batch operations to the memtable associated with the WAL, and then
// incrementing the visible sequence number so that subsequent reads can see
// the effects of the batch operations. If WriteOptions.Sync is true, a call to
// Batch.Commit will guarantee that the batch is persisted to disk before
// returning. See commitPipeline for more on the implementation details.
//
// # Large batches
//
// The size of a batch is limited only by available memory (be aware that
// indexed batches require considerable additional memory for the skiplist
// structure). A given WAL file has a single memtable associated with it (this
// restriction could be removed, but doing so is onerous and complex). And a
// memtable has a fixed size due to the underlying fixed size arena. Note that
// this differs from RocksDB where a memtable can grow arbitrarily large using
// a list of arena chunks. In RocksDB this is accomplished by storing pointers
// in the arena memory, but that isn't possible in Go.
//
// During Batch.Commit, a batch which is larger than a threshold (>
// MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
// of memtables. A flushableBatch forces the WAL to be rotated, but that
// happens anyway when the memtable becomes full so this does not cause
// significant WAL churn. Because the flushableBatch is readable as another
// layer in the LSM, Batch.Commit returns as soon as the flushableBatch has
// been added to the queue of memtables.
//
// Internally, a flushableBatch provides Iterator support by sorting the batch
// contents (the batch is sorted once, when it is added to the memtable
// queue). Sorting the batch contents and insertion of the contents into a
// memtable have the same big-O time, but the constant factor dominates
// here. Sorting is significantly faster and uses significantly less memory.
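//
// To make the # Indexing section above concrete, the following sketch (an
// illustration, not part of this file, assuming a *DB named db) reads back an
// indexed batch's own writes merged with the underlying database:
//
//	b := db.NewIndexedBatch()
//	_ = b.Set([]byte("a"), []byte("1"), nil)
//	iter, err := b.NewIter(nil)
//	if err != nil {
//		// handle err
//	}
//	for valid := iter.First(); valid; valid = iter.Next() {
//		// iter.Key() and iter.Value() observe both batch and DB entries.
//	}
//	_ = iter.Close()
//	_ = b.Close()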
//
// # Internal representation
//
// The internal batch representation is a contiguous byte buffer with a fixed
// 12-byte header, followed by a series of records.
//
//	+-------------+------------+--- ... ---+
//	| SeqNum (8B) | Count (4B) | Entries   |
//	+-------------+------------+--- ... ---+
//
// Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
// strings (varstring):
//
//	+-----------+-----------------+-------------------+
//	| Kind (1B) | Key (varstring) | Value (varstring) |
//	+-----------+-----------------+-------------------+
//
// A varstring is a varint32 followed by N bytes of data. The Kind tags are
// exactly those specified by InternalKeyKind. The following table shows the
// format for records of each kind:
//
//	InternalKeyKindDelete         varstring
//	InternalKeyKindLogData        varstring
//	InternalKeyKindIngestSST      varstring
//	InternalKeyKindSet            varstring varstring
//	InternalKeyKindMerge          varstring varstring
//	InternalKeyKindRangeDelete    varstring varstring
//	InternalKeyKindRangeKeySet    varstring varstring
//	InternalKeyKindRangeKeyUnset  varstring varstring
//	InternalKeyKindRangeKeyDelete varstring varstring
//
// The intuitive understanding here is that the arguments to Delete, Set,
// Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The
// RangeKeySet and RangeKeyUnset operations are slightly more complicated,
// encoding their end key, suffix and value [in the case of RangeKeySet] within
// the Value varstring. For more information on the value encoding for
// RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
//
// The internal batch representation is the on disk format for a batch in the
// WAL, and thus stable. New record kinds may be added, but the existing ones
// will not be modified.
type Batch struct {
	batchInternal
	applied atomic.Bool
}

// batchInternal contains the set of fields within Batch that are non-atomic and
// capable of being reset using a *b = batchInternal{} struct copy.
type batchInternal struct {
	// Data is the wire format of a batch's log entry:
	//   - 8 bytes for a sequence number of the first batch element,
	//     or zeroes if the batch has not yet been applied,
	//   - 4 bytes for the count: the number of elements in the batch,
	//     or "\xff\xff\xff\xff" if the batch is invalid,
	//   - count elements, being:
	//     - one byte for the kind
	//     - the varint-string user key,
	//     - the varint-string value (if kind != delete).
	// The sequence number and count are stored in little-endian order.
	//
	// The data field can be (but is not guaranteed to be) nil for new
	// batches. Large batches will set the data field to nil when committed as
	// the data has been moved to a flushableBatch and inserted into the queue of
	// memtables.
	data           []byte
	cmp            Compare
	formatKey      base.FormatKey
	abbreviatedKey AbbreviatedKey

	// An upper bound on required space to add this batch to a memtable.
	// Note that although batches are limited to 4 GiB in size, that limit
	// applies to len(data), not the memtable size.
The upper bound on the 217 // size of a memtable node is larger than the overhead of the batch's log 218 // encoding, so memTableSize is larger than len(data) and may overflow a 219 // uint32. 220 memTableSize uint64 221 222 // The db to which the batch will be committed. Do not change this field 223 // after the batch has been created as it might invalidate internal state. 224 // Batch.memTableSize is only refreshed if Batch.db is set. Setting db to 225 // nil once it has been set implies that the Batch has encountered an error. 226 db *DB 227 228 // The count of records in the batch. This count will be stored in the batch 229 // data whenever Repr() is called. 230 count uint64 231 232 // The count of range deletions in the batch. Updated every time a range 233 // deletion is added. 234 countRangeDels uint64 235 236 // The count of range key sets, unsets and deletes in the batch. Updated 237 // every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added. 238 countRangeKeys uint64 239 240 // A deferredOp struct, stored in the Batch so that a pointer can be returned 241 // from the *Deferred() methods rather than a value. 242 deferredOp DeferredBatchOp 243 244 // An optional skiplist keyed by offset into data of the entry. 245 index *batchskl.Skiplist 246 rangeDelIndex *batchskl.Skiplist 247 rangeKeyIndex *batchskl.Skiplist 248 249 // Fragmented range deletion tombstones. Cached the first time a range 250 // deletion iterator is requested. The cache is invalidated whenever a new 251 // range deletion is added to the batch. This cache can only be used when 252 // opening an iterator to read at a batch sequence number >= 253 // tombstonesSeqNum. This is the case for all new iterators created over a 254 // batch but it's not the case for all cloned iterators. 255 tombstones []keyspan.Span 256 tombstonesSeqNum uint64 257 258 // Fragmented range key spans. Cached the first time a range key iterator is 259 // requested. The cache is invalidated whenever a new range key 260 // (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be 261 // used when opening an iterator to read at a batch sequence number >= 262 // tombstonesSeqNum. This is the case for all new iterators created over a 263 // batch but it's not the case for all cloned iterators. 264 rangeKeys []keyspan.Span 265 rangeKeysSeqNum uint64 266 267 // The flushableBatch wrapper if the batch is too large to fit in the 268 // memtable. 269 flushable *flushableBatch 270 271 // minimumFormatMajorVersion indicates the format major version required in 272 // order to commit this batch. If an operation requires a particular format 273 // major version, it ratchets the batch's minimumFormatMajorVersion. When 274 // the batch is committed, this is validated against the database's current 275 // format major version. 276 minimumFormatMajorVersion FormatMajorVersion 277 278 // Synchronous Apply uses the commit WaitGroup for both publishing the 279 // seqnum and waiting for the WAL fsync (if needed). Asynchronous 280 // ApplyNoSyncWait, which implies WriteOptions.Sync is true, uses the commit 281 // WaitGroup for publishing the seqnum and the fsyncWait WaitGroup for 282 // waiting for the WAL fsync. 
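	//
	// As an illustrative sketch (not part of this file), the asynchronous
	// pattern, assuming a *DB named db, looks like:
	//
	//	b := db.NewBatch()
	//	_ = b.Set([]byte("k"), []byte("v"), nil)
	//	if err := db.ApplyNoSyncWait(b, Sync); err != nil {
	//		// handle err
	//	}
	//	// ... other work overlaps with the WAL fsync ...
	//	if err := b.SyncWait(); err != nil {
	//		// handle err
	//	}
	//	_ = b.Close()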
	//
	// TODO(sumeer): if we find that ApplyNoSyncWait in conjunction with
	// SyncWait is causing higher memory usage because of the time duration
	// between when the sync is already done, and a goroutine calls SyncWait
	// (followed by Batch.Close), we could separate out {fsyncWait, commitErr}
	// into a separate struct that is allocated separately (using another
	// sync.Pool), and only that struct needs to outlive Batch.Close (which
	// could then be called immediately after ApplyNoSyncWait). commitStats
	// will also need to be in this separate struct.
	commit    sync.WaitGroup
	fsyncWait sync.WaitGroup

	commitStats BatchCommitStats

	commitErr error

	// Position bools together to reduce the size of the struct.

	// ingestedSSTBatch indicates that the batch contains one or more key kinds
	// of InternalKeyKindIngestSST. If the batch contains key kinds of IngestSST
	// then it will only contain key kinds of IngestSST.
	ingestedSSTBatch bool

	// committing is set to true when a batch begins to commit. It's used to
	// ensure the batch is not mutated concurrently. It is deliberately not an
	// atomic, so as to avoid the overhead on batch mutations. This is okay,
	// because under correct usage this field will never be accessed
	// concurrently. It's only under incorrect usage that the memory accesses
	// of this variable may violate memory safety. Since we don't use atomics
	// here, false negatives are possible.
	committing bool
}

// BatchCommitStats exposes stats related to committing a batch.
//
// NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow
// batch commits. The caller can use these stats to do their own tracing as
// needed.
type BatchCommitStats struct {
	// TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or
	// Batch.Commit, plus the time waiting in Batch.SyncWait. If there is a gap
	// between calling ApplyNoSyncWait and calling SyncWait, that gap could
	// include some duration in which real work was being done for the commit
	// and will not be included here. This missing time is considered acceptable
	// since the goal of these stats is to understand user-facing latency.
	//
	// TotalDuration includes time spent in various queues both inside Pebble
	// and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait
	// etc.). For some of these queues (which we consider important) the wait
	// times are included below -- these expose low-level implementation detail
	// and are meant for expert diagnosis and subject to change. There may be
	// unaccounted time after subtracting those values from TotalDuration.
	TotalDuration time.Duration
	// SemaphoreWaitDuration is the wait time for semaphores in
	// commitPipeline.Commit.
	SemaphoreWaitDuration time.Duration
	// WALQueueWaitDuration is the wait time for allocating memory blocks in the
	// LogWriter (due to the LogWriter not writing fast enough). At the moment
	// this duration is always zero because a single WAL will allow
	// allocating memory blocks up to the entire memtable size. In the future,
	// we may pipeline WALs and bound the WAL queued blocks separately, so this
	// field is preserved for that possibility.
	WALQueueWaitDuration time.Duration
	// MemTableWriteStallDuration is the wait caused by a write stall due to too
	// many memtables (due to not flushing fast enough).
348 MemTableWriteStallDuration time.Duration 349 // L0ReadAmpWriteStallDuration is the wait caused by a write stall due to 350 // high read amplification in L0 (due to not compacting fast enough out of 351 // L0). 352 L0ReadAmpWriteStallDuration time.Duration 353 // WALRotationDuration is the wait time for WAL rotation, which includes 354 // syncing and closing the old WAL and creating (or reusing) a new one. 355 WALRotationDuration time.Duration 356 // CommitWaitDuration is the wait for publishing the seqnum plus the 357 // duration for the WAL sync (if requested). The former should be tiny and 358 // one can assume that this is all due to the WAL sync. 359 CommitWaitDuration time.Duration 360 } 361 362 var _ Reader = (*Batch)(nil) 363 var _ Writer = (*Batch)(nil) 364 365 var batchPool = sync.Pool{ 366 New: func() interface{} { 367 return &Batch{} 368 }, 369 } 370 371 type indexedBatch struct { 372 batch Batch 373 index batchskl.Skiplist 374 } 375 376 var indexedBatchPool = sync.Pool{ 377 New: func() interface{} { 378 return &indexedBatch{} 379 }, 380 } 381 382 func newBatch(db *DB) *Batch { 383 b := batchPool.Get().(*Batch) 384 b.db = db 385 return b 386 } 387 388 func newBatchWithSize(db *DB, size int) *Batch { 389 b := newBatch(db) 390 if cap(b.data) < size { 391 b.data = rawalloc.New(0, size) 392 } 393 return b 394 } 395 396 func newIndexedBatch(db *DB, comparer *Comparer) *Batch { 397 i := indexedBatchPool.Get().(*indexedBatch) 398 i.batch.cmp = comparer.Compare 399 i.batch.formatKey = comparer.FormatKey 400 i.batch.abbreviatedKey = comparer.AbbreviatedKey 401 i.batch.db = db 402 i.batch.index = &i.index 403 i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey) 404 return &i.batch 405 } 406 407 func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch { 408 b := newIndexedBatch(db, comparer) 409 if cap(b.data) < size { 410 b.data = rawalloc.New(0, size) 411 } 412 return b 413 } 414 415 // nextSeqNum returns the batch "sequence number" that will be given to the next 416 // key written to the batch. During iteration keys within an indexed batch are 417 // given a sequence number consisting of their offset within the batch combined 418 // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only 419 // used during iteration, and the keys are assigned ordinary sequence numbers 420 // when the batch is committed. 421 func (b *Batch) nextSeqNum() uint64 { 422 return uint64(len(b.data)) | base.InternalKeySeqNumBatch 423 } 424 425 func (b *Batch) release() { 426 if b.db == nil { 427 // The batch was not created using newBatch or newIndexedBatch, or an error 428 // was encountered. We don't try to reuse batches that encountered an error 429 // because they might be stuck somewhere in the system and attempting to 430 // reuse such batches is a recipe for onerous debugging sessions. Instead, 431 // let the GC do its job. 432 return 433 } 434 b.db = nil 435 436 // NB: This is ugly (it would be cleaner if we could just assign a Batch{}), 437 // but necessary so that we can use atomic.StoreUint32 for the Batch.applied 438 // field. Without using an atomic to clear that field the Go race detector 439 // complains. 
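	// NB: Reset retains b.data (when small enough), the comparator fields and
	// the index for reuse; the comparator fields are cleared explicitly below
	// so a pooled batch does not hold on to the previous DB's Comparer.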
440 b.Reset() 441 b.cmp = nil 442 b.formatKey = nil 443 b.abbreviatedKey = nil 444 445 if b.index == nil { 446 batchPool.Put(b) 447 } else { 448 b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil 449 indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b))) 450 } 451 } 452 453 func (b *Batch) refreshMemTableSize() error { 454 b.memTableSize = 0 455 if len(b.data) < batchHeaderLen { 456 return nil 457 } 458 459 b.countRangeDels = 0 460 b.countRangeKeys = 0 461 b.minimumFormatMajorVersion = 0 462 for r := b.Reader(); ; { 463 kind, key, value, ok, err := r.Next() 464 if !ok { 465 if err != nil { 466 return err 467 } 468 break 469 } 470 switch kind { 471 case InternalKeyKindRangeDelete: 472 b.countRangeDels++ 473 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 474 b.countRangeKeys++ 475 case InternalKeyKindDeleteSized: 476 if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { 477 b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete 478 } 479 case InternalKeyKindIngestSST: 480 if b.minimumFormatMajorVersion < FormatFlushableIngest { 481 b.minimumFormatMajorVersion = FormatFlushableIngest 482 } 483 // This key kind doesn't contribute to the memtable size. 484 continue 485 } 486 b.memTableSize += memTableEntrySize(len(key), len(value)) 487 } 488 if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys { 489 b.minimumFormatMajorVersion = FormatRangeKeys 490 } 491 return nil 492 } 493 494 // Apply the operations contained in the batch to the receiver batch. 495 // 496 // It is safe to modify the contents of the arguments after Apply returns. 497 func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { 498 if b.ingestedSSTBatch { 499 panic("pebble: invalid batch application") 500 } 501 if len(batch.data) == 0 { 502 return nil 503 } 504 if len(batch.data) < batchHeaderLen { 505 return ErrInvalidBatch 506 } 507 508 offset := len(b.data) 509 if offset == 0 { 510 b.init(offset) 511 offset = batchHeaderLen 512 } 513 b.data = append(b.data, batch.data[batchHeaderLen:]...) 514 515 b.setCount(b.Count() + batch.Count()) 516 517 if b.db != nil || b.index != nil { 518 // Only iterate over the new entries if we need to track memTableSize or in 519 // order to update the index. 
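		// NB: iter is a sub-slice of b.data, so the byte offset of the entry
		// that iter.Next decodes can be recovered with pointer arithmetic;
		// that offset is what the skiplist indexes store.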
520 for iter := BatchReader(b.data[offset:]); len(iter) > 0; { 521 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 522 kind, key, value, ok, err := iter.Next() 523 if !ok { 524 if err != nil { 525 return err 526 } 527 break 528 } 529 switch kind { 530 case InternalKeyKindRangeDelete: 531 b.countRangeDels++ 532 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 533 b.countRangeKeys++ 534 case InternalKeyKindIngestSST: 535 panic("pebble: invalid key kind for batch") 536 } 537 if b.index != nil { 538 var err error 539 switch kind { 540 case InternalKeyKindRangeDelete: 541 b.tombstones = nil 542 b.tombstonesSeqNum = 0 543 if b.rangeDelIndex == nil { 544 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 545 } 546 err = b.rangeDelIndex.Add(uint32(offset)) 547 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 548 b.rangeKeys = nil 549 b.rangeKeysSeqNum = 0 550 if b.rangeKeyIndex == nil { 551 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 552 } 553 err = b.rangeKeyIndex.Add(uint32(offset)) 554 default: 555 err = b.index.Add(uint32(offset)) 556 } 557 if err != nil { 558 return err 559 } 560 } 561 b.memTableSize += memTableEntrySize(len(key), len(value)) 562 } 563 } 564 return nil 565 } 566 567 // Get gets the value for the given key. It returns ErrNotFound if the Batch 568 // does not contain the key. 569 // 570 // The caller should not modify the contents of the returned slice, but it is 571 // safe to modify the contents of the argument after Get returns. The returned 572 // slice will remain valid until the returned Closer is closed. On success, the 573 // caller MUST call closer.Close() or a memory leak will occur. 574 func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) { 575 if b.index == nil { 576 return nil, nil, ErrNotIndexed 577 } 578 return b.db.getInternal(key, b, nil /* snapshot */) 579 } 580 581 func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) { 582 if b.committing { 583 panic("pebble: batch already committing") 584 } 585 if len(b.data) == 0 { 586 b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen) 587 } 588 b.count++ 589 b.memTableSize += memTableEntrySize(keyLen, valueLen) 590 591 pos := len(b.data) 592 b.deferredOp.offset = uint32(pos) 593 b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen) 594 b.data[pos] = byte(kind) 595 pos++ 596 597 { 598 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 599 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 600 // versions show this to not be a performance win. 601 x := uint32(keyLen) 602 for x >= 0x80 { 603 b.data[pos] = byte(x) | 0x80 604 x >>= 7 605 pos++ 606 } 607 b.data[pos] = byte(x) 608 pos++ 609 } 610 611 b.deferredOp.Key = b.data[pos : pos+keyLen] 612 pos += keyLen 613 614 { 615 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 616 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 617 // versions show this to not be a performance win. 618 x := uint32(valueLen) 619 for x >= 0x80 { 620 b.data[pos] = byte(x) | 0x80 621 x >>= 7 622 pos++ 623 } 624 b.data[pos] = byte(x) 625 pos++ 626 } 627 628 b.deferredOp.Value = b.data[pos : pos+valueLen] 629 // Shrink data since varints may be shorter than the upper bound. 
630 b.data = b.data[:pos+valueLen] 631 } 632 633 func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { 634 if b.committing { 635 panic("pebble: batch already committing") 636 } 637 if len(b.data) == 0 { 638 b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen) 639 } 640 b.count++ 641 b.memTableSize += memTableEntrySize(keyLen, 0) 642 643 pos := len(b.data) 644 b.deferredOp.offset = uint32(pos) 645 b.grow(1 + maxVarintLen32 + keyLen) 646 b.data[pos] = byte(kind) 647 pos++ 648 649 { 650 // TODO(peter): Manually inlined version binary.PutUvarint(). Remove if 651 // go1.13 or future versions show this to not be a performance win. See 652 // BenchmarkBatchSet. 653 x := uint32(keyLen) 654 for x >= 0x80 { 655 b.data[pos] = byte(x) | 0x80 656 x >>= 7 657 pos++ 658 } 659 b.data[pos] = byte(x) 660 pos++ 661 } 662 663 b.deferredOp.Key = b.data[pos : pos+keyLen] 664 b.deferredOp.Value = nil 665 666 // Shrink data since varint may be shorter than the upper bound. 667 b.data = b.data[:pos+keyLen] 668 } 669 670 // AddInternalKey allows the caller to add an internal key of point key or range 671 // key kinds (but not RangeDelete) to a batch. Passing in an internal key of 672 // kind RangeDelete will result in a panic. Note that the seqnum in the internal 673 // key is effectively ignored, even though the Kind is preserved. This is 674 // because the batch format does not allow for a per-key seqnum to be specified, 675 // only a batch-wide one. 676 // 677 // Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not 678 // supported with this method as they require specialized logic. 679 func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error { 680 keyLen := len(key.UserKey) 681 hasValue := false 682 switch kind := key.Kind(); kind { 683 case InternalKeyKindRangeDelete: 684 panic("unexpected range delete in AddInternalKey") 685 case InternalKeyKindSingleDelete, InternalKeyKindDelete: 686 b.prepareDeferredKeyRecord(keyLen, kind) 687 b.deferredOp.index = b.index 688 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 689 b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) 690 hasValue = true 691 b.incrementRangeKeysCount() 692 default: 693 b.prepareDeferredKeyValueRecord(keyLen, len(value), kind) 694 hasValue = true 695 b.deferredOp.index = b.index 696 } 697 copy(b.deferredOp.Key, key.UserKey) 698 if hasValue { 699 copy(b.deferredOp.Value, value) 700 } 701 702 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 703 // in go1.13 will remove the need for this. 704 if b.index != nil { 705 if err := b.index.Add(b.deferredOp.offset); err != nil { 706 return err 707 } 708 } 709 return nil 710 } 711 712 // Set adds an action to the batch that sets the key to map to the value. 713 // 714 // It is safe to modify the contents of the arguments after Set returns. 715 func (b *Batch) Set(key, value []byte, _ *WriteOptions) error { 716 deferredOp := b.SetDeferred(len(key), len(value)) 717 copy(deferredOp.Key, key) 718 copy(deferredOp.Value, value) 719 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 720 // in go1.13 will remove the need for this. 
721 if b.index != nil { 722 if err := b.index.Add(deferredOp.offset); err != nil { 723 return err 724 } 725 } 726 return nil 727 } 728 729 // SetDeferred is similar to Set in that it adds a set operation to the batch, 730 // except it only takes in key/value lengths instead of complete slices, 731 // letting the caller encode into those objects and then call Finish() on the 732 // returned object. 733 func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp { 734 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet) 735 b.deferredOp.index = b.index 736 return &b.deferredOp 737 } 738 739 // Merge adds an action to the batch that merges the value at key with the new 740 // value. The details of the merge are dependent upon the configured merge 741 // operator. 742 // 743 // It is safe to modify the contents of the arguments after Merge returns. 744 func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error { 745 deferredOp := b.MergeDeferred(len(key), len(value)) 746 copy(deferredOp.Key, key) 747 copy(deferredOp.Value, value) 748 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 749 // in go1.13 will remove the need for this. 750 if b.index != nil { 751 if err := b.index.Add(deferredOp.offset); err != nil { 752 return err 753 } 754 } 755 return nil 756 } 757 758 // MergeDeferred is similar to Merge in that it adds a merge operation to the 759 // batch, except it only takes in key/value lengths instead of complete slices, 760 // letting the caller encode into those objects and then call Finish() on the 761 // returned object. 762 func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp { 763 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge) 764 b.deferredOp.index = b.index 765 return &b.deferredOp 766 } 767 768 // Delete adds an action to the batch that deletes the entry for key. 769 // 770 // It is safe to modify the contents of the arguments after Delete returns. 771 func (b *Batch) Delete(key []byte, _ *WriteOptions) error { 772 deferredOp := b.DeleteDeferred(len(key)) 773 copy(deferredOp.Key, key) 774 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 775 // in go1.13 will remove the need for this. 776 if b.index != nil { 777 if err := b.index.Add(deferredOp.offset); err != nil { 778 return err 779 } 780 } 781 return nil 782 } 783 784 // DeleteDeferred is similar to Delete in that it adds a delete operation to 785 // the batch, except it only takes in key/value lengths instead of complete 786 // slices, letting the caller encode into those objects and then call Finish() 787 // on the returned object. 788 func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp { 789 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete) 790 b.deferredOp.index = b.index 791 return &b.deferredOp 792 } 793 794 // DeleteSized behaves identically to Delete, but takes an additional 795 // argument indicating the size of the value being deleted. DeleteSized 796 // should be preferred when the caller has the expectation that there exists 797 // a single internal KV pair for the key (eg, the key has not been 798 // overwritten recently), and the caller knows the size of its value. 799 // 800 // DeleteSized will record the value size within the tombstone and use it to 801 // inform compaction-picking heuristics which strive to reduce space 802 // amplification in the LSM. 
This "calling your shot" mechanic allows the 803 // storage engine to more accurately estimate and reduce space amplification. 804 // 805 // It is safe to modify the contents of the arguments after DeleteSized 806 // returns. 807 func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error { 808 deferredOp := b.DeleteSizedDeferred(len(key), deletedValueSize) 809 copy(b.deferredOp.Key, key) 810 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Check if in a 811 // later Go release this is unnecessary. 812 if b.index != nil { 813 if err := b.index.Add(deferredOp.offset); err != nil { 814 return err 815 } 816 } 817 return nil 818 } 819 820 // DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete 821 // operation to the batch, except it only takes in key length instead of a 822 // complete key slice, letting the caller encode into the DeferredBatchOp.Key 823 // slice and then call Finish() on the returned object. 824 func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp { 825 if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { 826 b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete 827 } 828 829 // Encode the sum of the key length and the value in the value. 830 v := uint64(deletedValueSize) + uint64(keyLen) 831 832 // Encode `v` as a varint. 833 var buf [binary.MaxVarintLen64]byte 834 n := 0 835 { 836 x := v 837 for x >= 0x80 { 838 buf[n] = byte(x) | 0x80 839 x >>= 7 840 n++ 841 } 842 buf[n] = byte(x) 843 n++ 844 } 845 846 // NB: In batch entries and sstable entries, values are stored as 847 // varstrings. Here, the value is itself a simple varint. This results in an 848 // unnecessary double layer of encoding: 849 // varint(n) varint(deletedValueSize) 850 // The first varint will always be 1-byte, since a varint-encoded uint64 851 // will never exceed 128 bytes. This unnecessary extra byte and wrapping is 852 // preserved to avoid special casing across the database, and in particular 853 // in sstable block decoding which is performance sensitive. 854 b.prepareDeferredKeyValueRecord(keyLen, n, InternalKeyKindDeleteSized) 855 b.deferredOp.index = b.index 856 copy(b.deferredOp.Value, buf[:n]) 857 return &b.deferredOp 858 } 859 860 // SingleDelete adds an action to the batch that single deletes the entry for key. 861 // See Writer.SingleDelete for more details on the semantics of SingleDelete. 862 // 863 // It is safe to modify the contents of the arguments after SingleDelete returns. 864 func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { 865 deferredOp := b.SingleDeleteDeferred(len(key)) 866 copy(deferredOp.Key, key) 867 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 868 // in go1.13 will remove the need for this. 869 if b.index != nil { 870 if err := b.index.Add(deferredOp.offset); err != nil { 871 return err 872 } 873 } 874 return nil 875 } 876 877 // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete 878 // operation to the batch, except it only takes in key/value lengths instead of 879 // complete slices, letting the caller encode into those objects and then call 880 // Finish() on the returned object. 
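//
// An illustrative sketch of the deferred pattern (assuming a []byte named
// key):
//
//	op := b.SingleDeleteDeferred(len(key))
//	copy(op.Key, key)
//	if err := op.Finish(); err != nil {
//		// handle err
//	}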
881 func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp { 882 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete) 883 b.deferredOp.index = b.index 884 return &b.deferredOp 885 } 886 887 // DeleteRange deletes all of the point keys (and values) in the range 888 // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT 889 // delete overlapping range keys (eg, keys set via RangeKeySet). 890 // 891 // It is safe to modify the contents of the arguments after DeleteRange 892 // returns. 893 func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error { 894 deferredOp := b.DeleteRangeDeferred(len(start), len(end)) 895 copy(deferredOp.Key, start) 896 copy(deferredOp.Value, end) 897 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 898 // in go1.13 will remove the need for this. 899 if deferredOp.index != nil { 900 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 901 return err 902 } 903 } 904 return nil 905 } 906 907 // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range 908 // operation to the batch, except it only takes in key lengths instead of 909 // complete slices, letting the caller encode into those objects and then call 910 // Finish() on the returned object. Note that DeferredBatchOp.Key should be 911 // populated with the start key, and DeferredBatchOp.Value should be populated 912 // with the end key. 913 func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { 914 b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete) 915 b.countRangeDels++ 916 if b.index != nil { 917 b.tombstones = nil 918 b.tombstonesSeqNum = 0 919 // Range deletions are rare, so we lazily allocate the index for them. 920 if b.rangeDelIndex == nil { 921 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 922 } 923 b.deferredOp.index = b.rangeDelIndex 924 } 925 return &b.deferredOp 926 } 927 928 // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC 929 // timestamp suffix to value. The suffix is optional. If any portion of the key 930 // range [start, end) is already set by a range key with the same suffix value, 931 // RangeKeySet overrides it. 932 // 933 // It is safe to modify the contents of the arguments after RangeKeySet returns. 934 func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error { 935 if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil { 936 // RangeKeySet is only supported on prefix keys. 937 if b.db.opts.Comparer.Split(start) != len(start) { 938 panic("RangeKeySet called with suffixed start key") 939 } 940 if b.db.opts.Comparer.Split(end) != len(end) { 941 panic("RangeKeySet called with suffixed end key") 942 } 943 } 944 suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}} 945 internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:]) 946 947 deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen) 948 copy(deferredOp.Key, start) 949 n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:]) 950 if n != internalValueLen { 951 panic("unexpected internal value length mismatch") 952 } 953 954 // Manually inline DeferredBatchOp.Finish(). 
955 if deferredOp.index != nil { 956 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 957 return err 958 } 959 } 960 return nil 961 } 962 963 func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 964 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet) 965 b.incrementRangeKeysCount() 966 return &b.deferredOp 967 } 968 969 func (b *Batch) incrementRangeKeysCount() { 970 b.countRangeKeys++ 971 if b.minimumFormatMajorVersion < FormatRangeKeys { 972 b.minimumFormatMajorVersion = FormatRangeKeys 973 } 974 if b.index != nil { 975 b.rangeKeys = nil 976 b.rangeKeysSeqNum = 0 977 // Range keys are rare, so we lazily allocate the index for them. 978 if b.rangeKeyIndex == nil { 979 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 980 } 981 b.deferredOp.index = b.rangeKeyIndex 982 } 983 } 984 985 // RangeKeyUnset removes a range key mapping the key range [start, end) at the 986 // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed 987 // range key. RangeKeyUnset only removes portions of range keys that fall within 988 // the [start, end) key span, and only range keys with suffixes that exactly 989 // match the unset suffix. 990 // 991 // It is safe to modify the contents of the arguments after RangeKeyUnset 992 // returns. 993 func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error { 994 if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil { 995 // RangeKeyUnset is only supported on prefix keys. 996 if b.db.opts.Comparer.Split(start) != len(start) { 997 panic("RangeKeyUnset called with suffixed start key") 998 } 999 if b.db.opts.Comparer.Split(end) != len(end) { 1000 panic("RangeKeyUnset called with suffixed end key") 1001 } 1002 } 1003 suffixes := [1][]byte{suffix} 1004 internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:]) 1005 1006 deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen) 1007 copy(deferredOp.Key, start) 1008 n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:]) 1009 if n != internalValueLen { 1010 panic("unexpected internal value length mismatch") 1011 } 1012 1013 // Manually inline DeferredBatchOp.Finish() 1014 if deferredOp.index != nil { 1015 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 1016 return err 1017 } 1018 } 1019 return nil 1020 } 1021 1022 func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 1023 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset) 1024 b.incrementRangeKeysCount() 1025 return &b.deferredOp 1026 } 1027 1028 // RangeKeyDelete deletes all of the range keys in the range [start,end) 1029 // (inclusive on start, exclusive on end). It does not delete point keys (for 1030 // that use DeleteRange). RangeKeyDelete removes all range keys within the 1031 // bounds, including those with or without suffixes. 1032 // 1033 // It is safe to modify the contents of the arguments after RangeKeyDelete 1034 // returns. 1035 func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error { 1036 if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil { 1037 // RangeKeyDelete is only supported on prefix keys. 
		if b.db.opts.Comparer.Split(start) != len(start) {
			panic("RangeKeyDelete called with suffixed start key")
		}
		if b.db.opts.Comparer.Split(end) != len(end) {
			panic("RangeKeyDelete called with suffixed end key")
		}
	}
	deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end))
	copy(deferredOp.Key, start)
	copy(deferredOp.Value, end)
	// Manually inline DeferredBatchOp.Finish().
	if deferredOp.index != nil {
		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
			return err
		}
	}
	return nil
}

// RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an
// operation to delete range keys to the batch, except it only takes in key
// lengths instead of complete slices, letting the caller encode into those
// objects and then call Finish() on the returned object. Note that
// DeferredBatchOp.Key should be populated with the start key, and
// DeferredBatchOp.Value should be populated with the end key.
func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp {
	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete)
	b.incrementRangeKeysCount()
	return &b.deferredOp
}

// LogData adds the specified data to the batch. The data will be written to
// the WAL, but not added to memtables or sstables. Log data is never indexed,
// which makes it useful for testing WAL performance.
//
// It is safe to modify the contents of the argument after LogData returns.
func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
	origCount, origMemTableSize := b.count, b.memTableSize
	b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData)
	copy(b.deferredOp.Key, data)
	// Since LogData only writes to the WAL and does not affect the memtable, we
	// restore b.count and b.memTableSize to their original values. Note that
	// Batch.count only refers to records that are added to the memtable.
	b.count, b.memTableSize = origCount, origMemTableSize
	return nil
}

// IngestSST adds the FileNum for an sstable to the batch. The data will only be
// written to the WAL (not added to memtables or sstables).
func (b *Batch) ingestSST(fileNum base.FileNum) {
	if b.Empty() {
		b.ingestedSSTBatch = true
	} else if !b.ingestedSSTBatch {
		// Batch contains other key kinds.
		panic("pebble: invalid call to ingestSST")
	}

	origMemTableSize := b.memTableSize
	var buf [binary.MaxVarintLen64]byte
	length := binary.PutUvarint(buf[:], uint64(fileNum))
	b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST)
	copy(b.deferredOp.Key, buf[:length])
	// Since IngestSST writes only to the WAL and does not affect the memtable,
	// we restore b.memTableSize to its original value. Note that Batch.count
	// is not reset because for the InternalKeyKindIngestSST the count is the
	// number of sstable paths which have been added to the batch.
	b.memTableSize = origMemTableSize
	b.minimumFormatMajorVersion = FormatFlushableIngest
}

// Empty returns true if the batch is empty, and false otherwise.
func (b *Batch) Empty() bool {
	return len(b.data) <= batchHeaderLen
}

// Len returns the current size of the batch in bytes.
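//
// Len never reports less than batchHeaderLen (12 bytes): even an empty batch
// accounts for the fixed header. For example (an illustrative sketch):
//
//	var b Batch
//	_ = b.Len() // 12: the header, even though nothing has been written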
1114 func (b *Batch) Len() int { 1115 if len(b.data) <= batchHeaderLen { 1116 return batchHeaderLen 1117 } 1118 return len(b.data) 1119 } 1120 1121 // Repr returns the underlying batch representation. It is not safe to modify 1122 // the contents. Reset() will not change the contents of the returned value, 1123 // though any other mutation operation may do so. 1124 func (b *Batch) Repr() []byte { 1125 if len(b.data) == 0 { 1126 b.init(batchHeaderLen) 1127 } 1128 binary.LittleEndian.PutUint32(b.countData(), b.Count()) 1129 return b.data 1130 } 1131 1132 // SetRepr sets the underlying batch representation. The batch takes ownership 1133 // of the supplied slice. It is not safe to modify it afterwards until the 1134 // Batch is no longer in use. 1135 func (b *Batch) SetRepr(data []byte) error { 1136 if len(data) < batchHeaderLen { 1137 return base.CorruptionErrorf("invalid batch") 1138 } 1139 b.data = data 1140 b.count = uint64(binary.LittleEndian.Uint32(b.countData())) 1141 var err error 1142 if b.db != nil { 1143 // Only track memTableSize for batches that will be committed to the DB. 1144 err = b.refreshMemTableSize() 1145 } 1146 return err 1147 } 1148 1149 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 1150 // return false). The iterator can be positioned via a call to SeekGE, 1151 // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators. 1152 // 1153 // The returned Iterator observes all of the Batch's existing mutations, but no 1154 // later mutations. Its view can be refreshed via RefreshBatchSnapshot or 1155 // SetOptions(). 1156 func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) { 1157 return b.NewIterWithContext(context.Background(), o) 1158 } 1159 1160 // NewIterWithContext is like NewIter, and additionally accepts a context for 1161 // tracing. 1162 func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { 1163 if b.index == nil { 1164 return nil, ErrNotIndexed 1165 } 1166 return b.db.newIter(ctx, b, newIterOpts{}, o), nil 1167 } 1168 1169 // NewBatchOnlyIter constructs an iterator that only reads the contents of the 1170 // batch, and does not overlay the batch mutations on top of the DB state. 1171 // 1172 // The returned Iterator observes all of the Batch's existing mutations, but 1173 // no later mutations. Its view can be refreshed via RefreshBatchSnapshot or 1174 // SetOptions(). 1175 func (b *Batch) NewBatchOnlyIter(ctx context.Context, o *IterOptions) (*Iterator, error) { 1176 if b.index == nil { 1177 return nil, ErrNotIndexed 1178 } 1179 return b.db.newIter(ctx, b, newIterOpts{batch: batchIterOpts{batchOnly: true}}, o), nil 1180 } 1181 1182 // newInternalIter creates a new internalIterator that iterates over the 1183 // contents of the batch. 1184 func (b *Batch) newInternalIter(o *IterOptions) *batchIter { 1185 iter := &batchIter{} 1186 b.initInternalIter(o, iter) 1187 return iter 1188 } 1189 1190 func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) { 1191 *iter = batchIter{ 1192 cmp: b.cmp, 1193 batch: b, 1194 iter: b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()), 1195 // NB: We explicitly do not propagate the batch snapshot to the point 1196 // key iterator. Filtering point keys within the batch iterator can 1197 // cause pathological behavior where a batch iterator advances 1198 // significantly farther than necessary filtering many batch keys that 1199 // are not visible at the batch sequence number. Instead, the merging 1200 // iterator enforces bounds. 
		//
		// For example, consider an engine that contains the committed keys
		// 'bar' and 'bax', with no keys between them. Consider a batch
		// containing 1,000 keys within the range [a,z]. All of the
		// batch keys were added to the batch after the iterator was
		// constructed, so they are not visible to the iterator. A call to
		// SeekGE('bax') would seek the LSM iterators and discover the key
		// 'bax'. It would also seek the batch iterator, landing on the key
		// 'baz' but discovering that it's not visible. The batch iterator would
		// next through the rest of the batch's keys, only to discover there are
		// no visible keys greater than or equal to 'bax'.
		//
		// Filtering these batch points within the merging iterator ensures that
		// the batch iterator never needs to iterate beyond 'baz', because it
		// already found a smaller, visible key 'bax'.
		snapshot: base.InternalKeySeqNumMax,
	}
}

func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
	// Construct an iterator even if rangeDelIndex is nil, because it is allowed
	// to refresh later, so we need the container to exist.
	iter := new(keyspan.Iter)
	b.initRangeDelIter(o, iter, batchSnapshot)
	return iter
}

func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
	if b.rangeDelIndex == nil {
		iter.Init(b.cmp, nil)
		return
	}

	// Fragment the range tombstones the first time a range deletion iterator is
	// requested. The cached tombstones are invalidated if another range
	// deletion tombstone is added to the batch. This cache is only guaranteed
	// to be correct if we're opening an iterator to read at a batch sequence
	// number at least as high as tombstonesSeqNum. The cache is guaranteed to
	// include all tombstones up to tombstonesSeqNum, and if any additional
	// tombstones were added after that sequence number the cache would've been
	// cleared.
	nextSeqNum := b.nextSeqNum()
	if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot {
		iter.Init(b.cmp, b.tombstones)
		return
	}

	tombstones := make([]keyspan.Span, 0, b.countRangeDels)
	frag := &keyspan.Fragmenter{
		Cmp:    b.cmp,
		Format: b.formatKey,
		Emit: func(s keyspan.Span) {
			tombstones = append(tombstones, s)
		},
	}
	it := &batchIter{
		cmp:      b.cmp,
		batch:    b,
		iter:     b.rangeDelIndex.NewIter(nil, nil),
		snapshot: batchSnapshot,
	}
	fragmentRangeDels(frag, it, int(b.countRangeDels))
	iter.Init(b.cmp, tombstones)

	// If we just read all the tombstones in the batch (eg, batchSnapshot was
	// set to b.nextSeqNum()), then cache the tombstones so that a subsequent
	// call to initRangeDelIter may use them without refragmenting.
	if nextSeqNum == batchSnapshot {
		b.tombstones = tombstones
		b.tombstonesSeqNum = nextSeqNum
	}
}

func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) {
	// The memory management here is a bit subtle. The keys and values returned
	// by the iterator are slices in Batch.data. Thus the fragmented tombstones
	// are slices within Batch.data. If additional entries are added to the
	// Batch, Batch.data may be reallocated. The references in the fragmented
	// tombstones will remain valid, pointing into the old Batch.data.
GC for 1280 // the win. 1281 1282 // Use a single []keyspan.Key buffer to avoid allocating many 1283 // individual []keyspan.Key slices with a single element each. 1284 keyBuf := make([]keyspan.Key, 0, count) 1285 for key, val := it.First(); key != nil; key, val = it.Next() { 1286 s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf) 1287 keyBuf = s.Keys[len(s.Keys):] 1288 1289 // Set a fixed capacity to avoid accidental overwriting. 1290 s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] 1291 frag.Add(s) 1292 } 1293 frag.Finish() 1294 } 1295 1296 func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { 1297 // Construct an iterator even if rangeKeyIndex is nil, because it is allowed 1298 // to refresh later, so we need the container to exist. 1299 iter := new(keyspan.Iter) 1300 b.initRangeKeyIter(o, iter, batchSnapshot) 1301 return iter 1302 } 1303 1304 func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { 1305 if b.rangeKeyIndex == nil { 1306 iter.Init(b.cmp, nil) 1307 return 1308 } 1309 1310 // Fragment the range keys the first time a range key iterator is requested. 1311 // The cached spans are invalidated if another range key is added to the 1312 // batch. This cache is only guaranteed to be correct if we're opening an 1313 // iterator to read at a batch sequence number at least as high as 1314 // rangeKeysSeqNum. The cache is guaranteed to include all range keys up to 1315 // rangeKeysSeqNum, and if any additional range keys were added after that 1316 // sequence number the cache would've been cleared. 1317 nextSeqNum := b.nextSeqNum() 1318 if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot { 1319 iter.Init(b.cmp, b.rangeKeys) 1320 return 1321 } 1322 1323 rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys) 1324 frag := &keyspan.Fragmenter{ 1325 Cmp: b.cmp, 1326 Format: b.formatKey, 1327 Emit: func(s keyspan.Span) { 1328 rangeKeys = append(rangeKeys, s) 1329 }, 1330 } 1331 it := &batchIter{ 1332 cmp: b.cmp, 1333 batch: b, 1334 iter: b.rangeKeyIndex.NewIter(nil, nil), 1335 snapshot: batchSnapshot, 1336 } 1337 fragmentRangeKeys(frag, it, int(b.countRangeKeys)) 1338 iter.Init(b.cmp, rangeKeys) 1339 1340 // If we just read all the range keys in the batch (eg, batchSnapshot was 1341 // set to b.nextSeqNum()), then cache the range keys so that a subsequent 1342 // call to initRangeKeyIter may use them without refragmenting. 1343 if nextSeqNum == batchSnapshot { 1344 b.rangeKeys = rangeKeys 1345 b.rangeKeysSeqNum = nextSeqNum 1346 } 1347 } 1348 1349 func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error { 1350 // The memory management here is a bit subtle. The keys and values 1351 // returned by the iterator are slices in Batch.data. Thus the 1352 // fragmented key spans are slices within Batch.data. If additional 1353 // entries are added to the Batch, Batch.data may be reallocated. The 1354 // references in the fragmented keys will remain valid, pointing into 1355 // the old Batch.data. GC for the win. 1356 1357 // Use a single []keyspan.Key buffer to avoid allocating many 1358 // individual []keyspan.Key slices with a single element each. 1359 keyBuf := make([]keyspan.Key, 0, count) 1360 for ik, val := it.First(); ik != nil; ik, val = it.Next() { 1361 s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf) 1362 if err != nil { 1363 return err 1364 } 1365 keyBuf = s.Keys[len(s.Keys):] 1366 1367 // Set a fixed capacity to avoid accidental overwriting. 
		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
		frag.Add(s)
	}
	frag.Finish()
	return nil
}

// Commit applies the batch to its parent writer.
func (b *Batch) Commit(o *WriteOptions) error {
	return b.db.Apply(b, o)
}

// Close closes the batch without committing it.
func (b *Batch) Close() error {
	b.release()
	return nil
}

// Indexed returns true if the batch is indexed (i.e. supports read
// operations).
func (b *Batch) Indexed() bool {
	return b.index != nil
}

// init ensures that the batch data slice is initialized to meet the
// minimum required size and allocates space for the batch header.
func (b *Batch) init(size int) {
	n := batchInitialSize
	for n < size {
		n *= 2
	}
	if cap(b.data) < n {
		b.data = rawalloc.New(batchHeaderLen, n)
	}
	b.data = b.data[:batchHeaderLen]
	clear(b.data) // Zero the sequence number in the header
}

// Reset resets the batch for reuse. The underlying byte slice (that is
// returned by Repr()) may not be modified. It is only necessary to call this
// method if a batch is explicitly being reused. Close automatically takes care
// of releasing resources when appropriate for batches that are internally
// being reused.
func (b *Batch) Reset() {
	// Zero out the struct, retaining only the fields necessary for manual
	// reuse.
	b.batchInternal = batchInternal{
		data:           b.data,
		cmp:            b.cmp,
		formatKey:      b.formatKey,
		abbreviatedKey: b.abbreviatedKey,
		index:          b.index,
		db:             b.db,
	}
	b.applied.Store(false)
	if b.data != nil {
		if cap(b.data) > batchMaxRetainedSize {
			// If the capacity of the buffer is larger than our maximum
			// retention size, don't re-use it. Let it be GC-ed instead.
			// This prevents the memory from an unusually large batch from
			// being held on to indefinitely.
			b.data = nil
		} else {
			// Otherwise, reset the buffer for re-use.
			b.data = b.data[:batchHeaderLen]
			clear(b.data)
		}
	}
	if b.index != nil {
		b.index.Init(&b.data, b.cmp, b.abbreviatedKey)
	}
}

// seqNumData returns the 8 byte little-endian sequence number. Zero means that
// the batch has not yet been applied.
func (b *Batch) seqNumData() []byte {
	return b.data[:8]
}

// countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
// means that the batch is invalid.
func (b *Batch) countData() []byte {
	return b.data[8:12]
}

func (b *Batch) grow(n int) {
	newSize := len(b.data) + n
	if uint64(newSize) >= maxBatchSize {
		panic(ErrBatchTooLarge)
	}
	if newSize > cap(b.data) {
		newCap := 2 * cap(b.data)
		for newCap < newSize {
			newCap *= 2
		}
		newData := rawalloc.New(len(b.data), newCap)
		copy(newData, b.data)
		b.data = newData
	}
	b.data = b.data[:newSize]
}

func (b *Batch) setSeqNum(seqNum uint64) {
	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
}

// SeqNum returns the batch sequence number which is applied to the first
// record in the batch. The sequence number is incremented for each subsequent
// record. It returns zero if the batch is empty.
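//
// For illustration only, the batch header described in the type's doc comment
// can be decoded directly from the representation returned by Repr; this
// mirrors what the seqNumData and countData helpers above do:
//
//	repr := b.Repr()
//	seqNum := binary.LittleEndian.Uint64(repr[:8])
//	count := binary.LittleEndian.Uint32(repr[8:12])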
1477 func (b *Batch) SeqNum() uint64 { 1478 if len(b.data) == 0 { 1479 b.init(batchHeaderLen) 1480 } 1481 return binary.LittleEndian.Uint64(b.seqNumData()) 1482 } 1483 1484 func (b *Batch) setCount(v uint32) { 1485 b.count = uint64(v) 1486 } 1487 1488 // Count returns the count of memtable-modifying operations in this batch. All 1489 // operations with the except of LogData increment this count. For IngestSSTs, 1490 // count is only used to indicate the number of SSTs ingested in the record, the 1491 // batch isn't applied to the memtable. 1492 func (b *Batch) Count() uint32 { 1493 if b.count > math.MaxUint32 { 1494 panic(ErrInvalidBatch) 1495 } 1496 return uint32(b.count) 1497 } 1498 1499 // Reader returns a BatchReader for the current batch contents. If the batch is 1500 // mutated, the new entries will not be visible to the reader. 1501 func (b *Batch) Reader() BatchReader { 1502 if len(b.data) == 0 { 1503 b.init(batchHeaderLen) 1504 } 1505 return b.data[batchHeaderLen:] 1506 } 1507 1508 func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) { 1509 // TODO(jackson): This will index out of bounds if there's no varint or an 1510 // invalid varint (eg, a single 0xff byte). Correcting will add a bit of 1511 // overhead. We could avoid that overhead whenever len(data) >= 1512 // binary.MaxVarint32? 1513 1514 var v uint32 1515 var n int 1516 ptr := unsafe.Pointer(&data[0]) 1517 if a := *((*uint8)(ptr)); a < 128 { 1518 v = uint32(a) 1519 n = 1 1520 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1521 v = uint32(b)<<7 | uint32(a) 1522 n = 2 1523 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1524 v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1525 n = 3 1526 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1527 v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1528 n = 4 1529 } else { 1530 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1531 v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1532 n = 5 1533 } 1534 1535 data = data[n:] 1536 if v > uint32(len(data)) { 1537 return nil, nil, false 1538 } 1539 return data[v:], data[:v], true 1540 } 1541 1542 // SyncWait is to be used in conjunction with DB.ApplyNoSyncWait. 1543 func (b *Batch) SyncWait() error { 1544 now := time.Now() 1545 b.fsyncWait.Wait() 1546 if b.commitErr != nil { 1547 b.db = nil // prevent batch reuse on error 1548 } 1549 waitDuration := time.Since(now) 1550 b.commitStats.CommitWaitDuration += waitDuration 1551 b.commitStats.TotalDuration += waitDuration 1552 return b.commitErr 1553 } 1554 1555 // CommitStats returns stats related to committing the batch. Should be called 1556 // after Batch.Commit, DB.Apply. If DB.ApplyNoSyncWait is used, should be 1557 // called after Batch.SyncWait. 1558 func (b *Batch) CommitStats() BatchCommitStats { 1559 return b.commitStats 1560 } 1561 1562 // BatchReader iterates over the entries contained in a batch. 1563 type BatchReader []byte 1564 1565 // ReadBatch constructs a BatchReader from a batch representation. The 1566 // header is not validated. ReadBatch returns a new batch reader and the 1567 // count of entries contained within the batch. 
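//
// A minimal usage sketch (editor's addition; repr is assumed to be a valid
// batch representation, e.g. the result of Batch.Repr):
//
//	r, count := ReadBatch(repr)
//	_ = count // number of memtable-modifying entries, per the header
//	for {
//		kind, key, value, ok, err := r.Next()
//		if err != nil {
//			break // corrupt entry
//		}
//		if !ok {
//			break // end of batch
//		}
//		_, _, _ = kind, key, value
//	}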
1568 func ReadBatch(repr []byte) (r BatchReader, count uint32) { 1569 if len(repr) <= batchHeaderLen { 1570 return nil, count 1571 } 1572 count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen]) 1573 return repr[batchHeaderLen:], count 1574 } 1575 1576 // Next returns the next entry in this batch, if there is one. If the reader has 1577 // reached the end of the batch, Next returns ok=false and a nil error. If the 1578 // batch is corrupt and the next entry is illegible, Next returns ok=false and a 1579 // non-nil error. 1580 func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) { 1581 if len(*r) == 0 { 1582 return 0, nil, nil, false, nil 1583 } 1584 kind = InternalKeyKind((*r)[0]) 1585 if kind > InternalKeyKindMax { 1586 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) 1587 } 1588 *r, ukey, ok = batchDecodeStr((*r)[1:]) 1589 if !ok { 1590 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key") 1591 } 1592 switch kind { 1593 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1594 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 1595 InternalKeyKindDeleteSized: 1596 *r, value, ok = batchDecodeStr(*r) 1597 if !ok { 1598 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind) 1599 } 1600 } 1601 return kind, ukey, value, true, nil 1602 } 1603 1604 // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the 1605 // two in sync. 1606 type batchIter struct { 1607 cmp Compare 1608 batch *Batch 1609 iter batchskl.Iterator 1610 err error 1611 // snapshot holds a batch "sequence number" at which the batch is being 1612 // read. This sequence number has the InternalKeySeqNumBatch bit set, so it 1613 // encodes an offset within the batch. Only batch entries earlier than the 1614 // offset are visible during iteration. 1615 snapshot uint64 1616 } 1617 1618 // batchIter implements the base.InternalIterator interface. 1619 var _ base.InternalIterator = (*batchIter)(nil) 1620 1621 func (i *batchIter) String() string { 1622 return "batch" 1623 } 1624 1625 func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 1626 // Ignore TrySeekUsingNext if the view of the batch changed. 
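	// Editor's note: BatchJustRefreshed indicates an indexed batch was mutated
	// and this iterator's view was refreshed. A newly added entry can sort
	// before the iterator's current position while still being >= the seek
	// key, so a forward-only scan (TrySeekUsingNext) could skip it; the hint
	// is dropped so that a full seek is performed instead.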
1627 if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() { 1628 flags = flags.DisableTrySeekUsingNext() 1629 } 1630 1631 i.err = nil // clear cached iteration error 1632 ikey := i.iter.SeekGE(key, flags) 1633 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1634 ikey = i.iter.Next() 1635 } 1636 if ikey == nil { 1637 return nil, base.LazyValue{} 1638 } 1639 return ikey, base.MakeInPlaceValue(i.value()) 1640 } 1641 1642 func (i *batchIter) SeekPrefixGE( 1643 prefix, key []byte, flags base.SeekGEFlags, 1644 ) (*base.InternalKey, base.LazyValue) { 1645 i.err = nil // clear cached iteration error 1646 return i.SeekGE(key, flags) 1647 } 1648 1649 func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 1650 i.err = nil // clear cached iteration error 1651 ikey := i.iter.SeekLT(key) 1652 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1653 ikey = i.iter.Prev() 1654 } 1655 if ikey == nil { 1656 return nil, base.LazyValue{} 1657 } 1658 return ikey, base.MakeInPlaceValue(i.value()) 1659 } 1660 1661 func (i *batchIter) First() (*InternalKey, base.LazyValue) { 1662 i.err = nil // clear cached iteration error 1663 ikey := i.iter.First() 1664 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1665 ikey = i.iter.Next() 1666 } 1667 if ikey == nil { 1668 return nil, base.LazyValue{} 1669 } 1670 return ikey, base.MakeInPlaceValue(i.value()) 1671 } 1672 1673 func (i *batchIter) Last() (*InternalKey, base.LazyValue) { 1674 i.err = nil // clear cached iteration error 1675 ikey := i.iter.Last() 1676 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1677 ikey = i.iter.Prev() 1678 } 1679 if ikey == nil { 1680 return nil, base.LazyValue{} 1681 } 1682 return ikey, base.MakeInPlaceValue(i.value()) 1683 } 1684 1685 func (i *batchIter) Next() (*InternalKey, base.LazyValue) { 1686 ikey := i.iter.Next() 1687 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1688 ikey = i.iter.Next() 1689 } 1690 if ikey == nil { 1691 return nil, base.LazyValue{} 1692 } 1693 return ikey, base.MakeInPlaceValue(i.value()) 1694 } 1695 1696 func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 1697 // Because NextPrefix was invoked `succKey` must be ≥ the key at i's current 1698 // position. Seek the arena iterator using TrySeekUsingNext. 
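	// Editor's sketch (not part of the original source): with the
	// TrySeekUsingNext hint the skiplist iterator may scan forward from its
	// current node instead of re-descending from the head, i.e. roughly:
	//
	//	for k := cur; k != nil && cmp(userKey(k), succKey) < 0; k = next(k) {
	//	}
	//
	// where cur, next, and userKey are hypothetical stand-ins for the skiplist
	// iterator's position, successor step, and key accessor.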
1699 ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) 1700 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1701 ikey = i.iter.Next() 1702 } 1703 if ikey == nil { 1704 return nil, base.LazyValue{} 1705 } 1706 return ikey, base.MakeInPlaceValue(i.value()) 1707 } 1708 1709 func (i *batchIter) Prev() (*InternalKey, base.LazyValue) { 1710 ikey := i.iter.Prev() 1711 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1712 ikey = i.iter.Prev() 1713 } 1714 if ikey == nil { 1715 return nil, base.LazyValue{} 1716 } 1717 return ikey, base.MakeInPlaceValue(i.value()) 1718 } 1719 1720 func (i *batchIter) value() []byte { 1721 offset, _, keyEnd := i.iter.KeyInfo() 1722 data := i.batch.data 1723 if len(data[offset:]) == 0 { 1724 i.err = base.CorruptionErrorf("corrupted batch") 1725 return nil 1726 } 1727 1728 switch InternalKeyKind(data[offset]) { 1729 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1730 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 1731 InternalKeyKindDeleteSized: 1732 _, value, ok := batchDecodeStr(data[keyEnd:]) 1733 if !ok { 1734 return nil 1735 } 1736 return value 1737 default: 1738 return nil 1739 } 1740 } 1741 1742 func (i *batchIter) Error() error { 1743 return i.err 1744 } 1745 1746 func (i *batchIter) Close() error { 1747 _ = i.iter.Close() 1748 return i.err 1749 } 1750 1751 func (i *batchIter) SetBounds(lower, upper []byte) { 1752 i.iter.SetBounds(lower, upper) 1753 } 1754 1755 func (i *batchIter) SetContext(_ context.Context) {} 1756 1757 type flushableBatchEntry struct { 1758 // offset is the byte offset of the record within the batch repr. 1759 offset uint32 1760 // index is the 0-based ordinal number of the record within the batch. Used 1761 // to compute the seqnum for the record. 1762 index uint32 1763 // key{Start,End} are the start and end byte offsets of the key within the 1764 // batch repr. Cached to avoid decoding the key length on every 1765 // comparison. The value is stored starting at keyEnd. 1766 keyStart uint32 1767 keyEnd uint32 1768 } 1769 1770 // flushableBatch wraps an existing batch and provides the interfaces needed 1771 // for making the batch flushable (i.e. able to mimic a memtable). 1772 type flushableBatch struct { 1773 cmp Compare 1774 formatKey base.FormatKey 1775 data []byte 1776 1777 // The base sequence number for the entries in the batch. This is the same 1778 // value as Batch.seqNum() and is cached here for performance. 1779 seqNum uint64 1780 1781 // A slice of offsets and indices for the entries in the batch. Used to 1782 // implement flushableBatchIter. Unlike the indexing on a normal batch, a 1783 // flushable batch is indexed such that batch entry i will be given the 1784 // sequence number flushableBatch.seqNum+i. 1785 // 1786 // Sorted in increasing order of key and decreasing order of offset (since 1787 // higher offsets correspond to higher sequence numbers). 1788 // 1789 // Does not include range deletion entries or range key entries. 1790 offsets []flushableBatchEntry 1791 1792 // Fragmented range deletion tombstones. 1793 tombstones []keyspan.Span 1794 1795 // Fragmented range keys. 1796 rangeKeys []keyspan.Span 1797 } 1798 1799 var _ flushable = (*flushableBatch)(nil) 1800 1801 // newFlushableBatch creates a new batch that implements the flushable 1802 // interface. This allows the batch to act like a memtable and be placed in the 1803 // queue of flushable memtables. 
Note that the flushable batch takes ownership 1804 // of the batch data. 1805 func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) { 1806 b := &flushableBatch{ 1807 data: batch.data, 1808 cmp: comparer.Compare, 1809 formatKey: comparer.FormatKey, 1810 offsets: make([]flushableBatchEntry, 0, batch.Count()), 1811 } 1812 if b.data != nil { 1813 // Note that this sequence number is not correct when this batch has not 1814 // been applied since the sequence number has not been assigned yet. The 1815 // correct sequence number will be set later. But it is correct when the 1816 // batch is being replayed from the WAL. 1817 b.seqNum = batch.SeqNum() 1818 } 1819 var rangeDelOffsets []flushableBatchEntry 1820 var rangeKeyOffsets []flushableBatchEntry 1821 if len(b.data) > batchHeaderLen { 1822 // Non-empty batch. 1823 var index uint32 1824 for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ { 1825 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 1826 kind, key, _, ok, err := iter.Next() 1827 if !ok { 1828 if err != nil { 1829 return nil, err 1830 } 1831 break 1832 } 1833 entry := flushableBatchEntry{ 1834 offset: uint32(offset), 1835 index: uint32(index), 1836 } 1837 if keySize := uint32(len(key)); keySize == 0 { 1838 // Must add 2 to the offset. One byte encodes `kind` and the next 1839 // byte encodes `0`, which is the length of the key. 1840 entry.keyStart = uint32(offset) + 2 1841 entry.keyEnd = entry.keyStart 1842 } else { 1843 entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) - 1844 uintptr(unsafe.Pointer(&b.data[0]))) 1845 entry.keyEnd = entry.keyStart + keySize 1846 } 1847 switch kind { 1848 case InternalKeyKindRangeDelete: 1849 rangeDelOffsets = append(rangeDelOffsets, entry) 1850 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1851 rangeKeyOffsets = append(rangeKeyOffsets, entry) 1852 default: 1853 b.offsets = append(b.offsets, entry) 1854 } 1855 } 1856 } 1857 1858 // Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using *batch's 1859 // sort.Interface implementation. 
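	// Editor's note (illustrative, not part of the original source): the
	// assignments below temporarily point b.offsets at each slice so that
	// sort.Sort(b) can reuse flushableBatch's Len/Less/Swap. The resulting
	// order is ascending by user key and, for equal keys, descending by
	// offset (higher offsets were written later and carry higher sequence
	// numbers). With sort.Slice, one of the three sorts would read roughly:
	//
	//	sort.Slice(rangeDelOffsets, func(i, j int) bool {
	//		ki := b.data[rangeDelOffsets[i].keyStart:rangeDelOffsets[i].keyEnd]
	//		kj := b.data[rangeDelOffsets[j].keyStart:rangeDelOffsets[j].keyEnd]
	//		if c := b.cmp(ki, kj); c != 0 {
	//			return c < 0
	//		}
	//		return rangeDelOffsets[i].offset > rangeDelOffsets[j].offset
	//	})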
1860 pointOffsets := b.offsets 1861 sort.Sort(b) 1862 b.offsets = rangeDelOffsets 1863 sort.Sort(b) 1864 b.offsets = rangeKeyOffsets 1865 sort.Sort(b) 1866 b.offsets = pointOffsets 1867 1868 if len(rangeDelOffsets) > 0 { 1869 frag := &keyspan.Fragmenter{ 1870 Cmp: b.cmp, 1871 Format: b.formatKey, 1872 Emit: func(s keyspan.Span) { 1873 b.tombstones = append(b.tombstones, s) 1874 }, 1875 } 1876 it := &flushableBatchIter{ 1877 batch: b, 1878 data: b.data, 1879 offsets: rangeDelOffsets, 1880 cmp: b.cmp, 1881 index: -1, 1882 } 1883 fragmentRangeDels(frag, it, len(rangeDelOffsets)) 1884 } 1885 if len(rangeKeyOffsets) > 0 { 1886 frag := &keyspan.Fragmenter{ 1887 Cmp: b.cmp, 1888 Format: b.formatKey, 1889 Emit: func(s keyspan.Span) { 1890 b.rangeKeys = append(b.rangeKeys, s) 1891 }, 1892 } 1893 it := &flushableBatchIter{ 1894 batch: b, 1895 data: b.data, 1896 offsets: rangeKeyOffsets, 1897 cmp: b.cmp, 1898 index: -1, 1899 } 1900 fragmentRangeKeys(frag, it, len(rangeKeyOffsets)) 1901 } 1902 return b, nil 1903 } 1904 1905 func (b *flushableBatch) setSeqNum(seqNum uint64) { 1906 if b.seqNum != 0 { 1907 panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum)) 1908 } 1909 b.seqNum = seqNum 1910 for i := range b.tombstones { 1911 for j := range b.tombstones[i].Keys { 1912 b.tombstones[i].Keys[j].Trailer = base.MakeTrailer( 1913 b.tombstones[i].Keys[j].SeqNum()+seqNum, 1914 b.tombstones[i].Keys[j].Kind(), 1915 ) 1916 } 1917 } 1918 for i := range b.rangeKeys { 1919 for j := range b.rangeKeys[i].Keys { 1920 b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer( 1921 b.rangeKeys[i].Keys[j].SeqNum()+seqNum, 1922 b.rangeKeys[i].Keys[j].Kind(), 1923 ) 1924 } 1925 } 1926 } 1927 1928 func (b *flushableBatch) Len() int { 1929 return len(b.offsets) 1930 } 1931 1932 func (b *flushableBatch) Less(i, j int) bool { 1933 ei := &b.offsets[i] 1934 ej := &b.offsets[j] 1935 ki := b.data[ei.keyStart:ei.keyEnd] 1936 kj := b.data[ej.keyStart:ej.keyEnd] 1937 switch c := b.cmp(ki, kj); { 1938 case c < 0: 1939 return true 1940 case c > 0: 1941 return false 1942 default: 1943 return ei.offset > ej.offset 1944 } 1945 } 1946 1947 func (b *flushableBatch) Swap(i, j int) { 1948 b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i] 1949 } 1950 1951 // newIter is part of the flushable interface. 1952 func (b *flushableBatch) newIter(o *IterOptions) internalIterator { 1953 return &flushableBatchIter{ 1954 batch: b, 1955 data: b.data, 1956 offsets: b.offsets, 1957 cmp: b.cmp, 1958 index: -1, 1959 lower: o.GetLowerBound(), 1960 upper: o.GetUpperBound(), 1961 } 1962 } 1963 1964 // newFlushIter is part of the flushable interface. 1965 func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { 1966 return &flushFlushableBatchIter{ 1967 flushableBatchIter: flushableBatchIter{ 1968 batch: b, 1969 data: b.data, 1970 offsets: b.offsets, 1971 cmp: b.cmp, 1972 index: -1, 1973 }, 1974 bytesIterated: bytesFlushed, 1975 } 1976 } 1977 1978 // newRangeDelIter is part of the flushable interface. 1979 func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator { 1980 if len(b.tombstones) == 0 { 1981 return nil 1982 } 1983 return keyspan.NewIter(b.cmp, b.tombstones) 1984 } 1985 1986 // newRangeKeyIter is part of the flushable interface. 
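//
// Editor's note: like newRangeDelIter above, a nil return signals that the
// batch contributes no range keys, so callers can skip it entirely. The spans
// themselves were fragmented and sorted once, in newFlushableBatch.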
1987 func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { 1988 if len(b.rangeKeys) == 0 { 1989 return nil 1990 } 1991 return keyspan.NewIter(b.cmp, b.rangeKeys) 1992 } 1993 1994 // containsRangeKeys is part of the flushable interface. 1995 func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 } 1996 1997 // inuseBytes is part of the flushable interface. 1998 func (b *flushableBatch) inuseBytes() uint64 { 1999 return uint64(len(b.data) - batchHeaderLen) 2000 } 2001 2002 // totalBytes is part of the flushable interface. 2003 func (b *flushableBatch) totalBytes() uint64 { 2004 return uint64(cap(b.data)) 2005 } 2006 2007 // readyForFlush is part of the flushable interface. 2008 func (b *flushableBatch) readyForFlush() bool { 2009 // A flushable batch is always ready for flush; it must be flushed together 2010 // with the previous memtable. 2011 return true 2012 } 2013 2014 // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the 2015 // two in sync. 2016 type flushableBatchIter struct { 2017 // Members to be initialized by creator. 2018 batch *flushableBatch 2019 // The bytes backing the batch. Always the same as batch.data? 2020 data []byte 2021 // The sorted entries. This is not always equal to batch.offsets. 2022 offsets []flushableBatchEntry 2023 cmp Compare 2024 // Must be initialized to -1. It is the index into offsets that represents 2025 // the current iterator position. 2026 index int 2027 2028 // For internal use by the implementation. 2029 key InternalKey 2030 err error 2031 2032 // Optionally initialize to bounds of iteration, if any. 2033 lower []byte 2034 upper []byte 2035 } 2036 2037 // flushableBatchIter implements the base.InternalIterator interface. 2038 var _ base.InternalIterator = (*flushableBatchIter)(nil) 2039 2040 func (i *flushableBatchIter) String() string { 2041 return "flushable-batch" 2042 } 2043 2044 // SeekGE implements internalIterator.SeekGE, as documented in the pebble 2045 // package. Ignore flags.TrySeekUsingNext() since we don't expect this 2046 // optimization to provide much benefit here at the moment. 2047 func (i *flushableBatchIter) SeekGE( 2048 key []byte, flags base.SeekGEFlags, 2049 ) (*InternalKey, base.LazyValue) { 2050 i.err = nil // clear cached iteration error 2051 ikey := base.MakeSearchKey(key) 2052 i.index = sort.Search(len(i.offsets), func(j int) bool { 2053 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 2054 }) 2055 if i.index >= len(i.offsets) { 2056 return nil, base.LazyValue{} 2057 } 2058 i.key = i.getKey(i.index) 2059 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2060 i.index = len(i.offsets) 2061 return nil, base.LazyValue{} 2062 } 2063 return &i.key, i.value() 2064 } 2065 2066 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 2067 // pebble package. 2068 func (i *flushableBatchIter) SeekPrefixGE( 2069 prefix, key []byte, flags base.SeekGEFlags, 2070 ) (*base.InternalKey, base.LazyValue) { 2071 return i.SeekGE(key, flags) 2072 } 2073 2074 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 2075 // package. 
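//
// Editor's sketch (not part of the original source): because offsets is
// sorted, SeekLT binary-searches for the first entry >= key and steps back one
// slot, leaving the iterator on the largest entry < key, roughly:
//
//	idx := sort.Search(len(offsets), func(j int) bool { return !entryLess(j, key) }) - 1
//
// where entryLess is a hypothetical helper reporting whether the j-th entry's
// internal key is less than the search key.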
2076 func (i *flushableBatchIter) SeekLT( 2077 key []byte, flags base.SeekLTFlags, 2078 ) (*InternalKey, base.LazyValue) { 2079 i.err = nil // clear cached iteration error 2080 ikey := base.MakeSearchKey(key) 2081 i.index = sort.Search(len(i.offsets), func(j int) bool { 2082 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 2083 }) 2084 i.index-- 2085 if i.index < 0 { 2086 return nil, base.LazyValue{} 2087 } 2088 i.key = i.getKey(i.index) 2089 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2090 i.index = -1 2091 return nil, base.LazyValue{} 2092 } 2093 return &i.key, i.value() 2094 } 2095 2096 // First implements internalIterator.First, as documented in the pebble 2097 // package. 2098 func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) { 2099 i.err = nil // clear cached iteration error 2100 if len(i.offsets) == 0 { 2101 return nil, base.LazyValue{} 2102 } 2103 i.index = 0 2104 i.key = i.getKey(i.index) 2105 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2106 i.index = len(i.offsets) 2107 return nil, base.LazyValue{} 2108 } 2109 return &i.key, i.value() 2110 } 2111 2112 // Last implements internalIterator.Last, as documented in the pebble 2113 // package. 2114 func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) { 2115 i.err = nil // clear cached iteration error 2116 if len(i.offsets) == 0 { 2117 return nil, base.LazyValue{} 2118 } 2119 i.index = len(i.offsets) - 1 2120 i.key = i.getKey(i.index) 2121 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2122 i.index = -1 2123 return nil, base.LazyValue{} 2124 } 2125 return &i.key, i.value() 2126 } 2127 2128 // Note: flushFlushableBatchIter.Next mirrors the implementation of 2129 // flushableBatchIter.Next due to performance. Keep the two in sync. 2130 func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) { 2131 if i.index == len(i.offsets) { 2132 return nil, base.LazyValue{} 2133 } 2134 i.index++ 2135 if i.index == len(i.offsets) { 2136 return nil, base.LazyValue{} 2137 } 2138 i.key = i.getKey(i.index) 2139 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2140 i.index = len(i.offsets) 2141 return nil, base.LazyValue{} 2142 } 2143 return &i.key, i.value() 2144 } 2145 2146 func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) { 2147 if i.index < 0 { 2148 return nil, base.LazyValue{} 2149 } 2150 i.index-- 2151 if i.index < 0 { 2152 return nil, base.LazyValue{} 2153 } 2154 i.key = i.getKey(i.index) 2155 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2156 i.index = -1 2157 return nil, base.LazyValue{} 2158 } 2159 return &i.key, i.value() 2160 } 2161 2162 // Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of 2163 // flushableBatchIter.NextPrefix due to performance. Keep the two in sync. 
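//
// Editor's note: NextPrefix can delegate to SeekGE because, per the contract
// described for batchIter.NextPrefix above, succKey is at least as large as
// the key at the current position. The TrySeekUsingNext hint is passed along
// even though this iterator's SeekGE currently ignores it (see its comment).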
2164 func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 2165 return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) 2166 } 2167 2168 func (i *flushableBatchIter) getKey(index int) InternalKey { 2169 e := &i.offsets[index] 2170 kind := InternalKeyKind(i.data[e.offset]) 2171 key := i.data[e.keyStart:e.keyEnd] 2172 return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind) 2173 } 2174 2175 func (i *flushableBatchIter) value() base.LazyValue { 2176 p := i.data[i.offsets[i.index].offset:] 2177 if len(p) == 0 { 2178 i.err = base.CorruptionErrorf("corrupted batch") 2179 return base.LazyValue{} 2180 } 2181 kind := InternalKeyKind(p[0]) 2182 if kind > InternalKeyKindMax { 2183 i.err = base.CorruptionErrorf("corrupted batch") 2184 return base.LazyValue{} 2185 } 2186 var value []byte 2187 var ok bool 2188 switch kind { 2189 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 2190 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 2191 InternalKeyKindDeleteSized: 2192 keyEnd := i.offsets[i.index].keyEnd 2193 _, value, ok = batchDecodeStr(i.data[keyEnd:]) 2194 if !ok { 2195 i.err = base.CorruptionErrorf("corrupted batch") 2196 return base.LazyValue{} 2197 } 2198 } 2199 return base.MakeInPlaceValue(value) 2200 } 2201 2202 func (i *flushableBatchIter) Valid() bool { 2203 return i.index >= 0 && i.index < len(i.offsets) 2204 } 2205 2206 func (i *flushableBatchIter) Error() error { 2207 return i.err 2208 } 2209 2210 func (i *flushableBatchIter) Close() error { 2211 return i.err 2212 } 2213 2214 func (i *flushableBatchIter) SetBounds(lower, upper []byte) { 2215 i.lower = lower 2216 i.upper = upper 2217 } 2218 2219 func (i *flushableBatchIter) SetContext(_ context.Context) {} 2220 2221 // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track 2222 // of number of bytes iterated. 2223 type flushFlushableBatchIter struct { 2224 flushableBatchIter 2225 bytesIterated *uint64 2226 } 2227 2228 // flushFlushableBatchIter implements the base.InternalIterator interface. 2229 var _ base.InternalIterator = (*flushFlushableBatchIter)(nil) 2230 2231 func (i *flushFlushableBatchIter) String() string { 2232 return "flushable-batch" 2233 } 2234 2235 func (i *flushFlushableBatchIter) SeekGE( 2236 key []byte, flags base.SeekGEFlags, 2237 ) (*InternalKey, base.LazyValue) { 2238 panic("pebble: SeekGE unimplemented") 2239 } 2240 2241 func (i *flushFlushableBatchIter) SeekPrefixGE( 2242 prefix, key []byte, flags base.SeekGEFlags, 2243 ) (*base.InternalKey, base.LazyValue) { 2244 panic("pebble: SeekPrefixGE unimplemented") 2245 } 2246 2247 func (i *flushFlushableBatchIter) SeekLT( 2248 key []byte, flags base.SeekLTFlags, 2249 ) (*InternalKey, base.LazyValue) { 2250 panic("pebble: SeekLT unimplemented") 2251 } 2252 2253 func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) { 2254 i.err = nil // clear cached iteration error 2255 key, val := i.flushableBatchIter.First() 2256 if key == nil { 2257 return nil, base.LazyValue{} 2258 } 2259 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 2260 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 2261 return key, val 2262 } 2263 2264 func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 2265 panic("pebble: Prev unimplemented") 2266 } 2267 2268 // Note: flushFlushableBatchIter.Next mirrors the implementation of 2269 // flushableBatchIter.Next due to performance. 
Keep the two in sync. 2270 func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) { 2271 if i.index == len(i.offsets) { 2272 return nil, base.LazyValue{} 2273 } 2274 i.index++ 2275 if i.index == len(i.offsets) { 2276 return nil, base.LazyValue{} 2277 } 2278 i.key = i.getKey(i.index) 2279 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 2280 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 2281 return &i.key, i.value() 2282 } 2283 2284 func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) { 2285 panic("pebble: Prev unimplemented") 2286 } 2287 2288 func (i flushFlushableBatchIter) valueSize() uint64 { 2289 p := i.data[i.offsets[i.index].offset:] 2290 if len(p) == 0 { 2291 i.err = base.CorruptionErrorf("corrupted batch") 2292 return 0 2293 } 2294 kind := InternalKeyKind(p[0]) 2295 if kind > InternalKeyKindMax { 2296 i.err = base.CorruptionErrorf("corrupted batch") 2297 return 0 2298 } 2299 var length uint64 2300 switch kind { 2301 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete: 2302 keyEnd := i.offsets[i.index].keyEnd 2303 v, n := binary.Uvarint(i.data[keyEnd:]) 2304 if n <= 0 { 2305 i.err = base.CorruptionErrorf("corrupted batch") 2306 return 0 2307 } 2308 length = v + uint64(n) 2309 } 2310 return length 2311 } 2312 2313 // batchSort returns iterators for the sorted contents of the batch. It is 2314 // intended for testing use only. The batch.Sort dance is done to prevent 2315 // exposing this method in the public pebble interface. 2316 func batchSort( 2317 i interface{}, 2318 ) ( 2319 points internalIterator, 2320 rangeDels keyspan.FragmentIterator, 2321 rangeKeys keyspan.FragmentIterator, 2322 ) { 2323 b := i.(*Batch) 2324 if b.Indexed() { 2325 pointIter := b.newInternalIter(nil) 2326 rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64) 2327 rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64) 2328 return pointIter, rangeDelIter, rangeKeyIter 2329 } 2330 f, err := newFlushableBatch(b, b.db.opts.Comparer) 2331 if err != nil { 2332 panic(err) 2333 } 2334 return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil) 2335 } 2336 2337 func init() { 2338 private.BatchSort = batchSort 2339 }
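
// exampleAsyncCommit is an illustrative sketch added by the editor; it is not
// part of the original file and the function name is hypothetical. It shows
// the intended pairing of DB.ApplyNoSyncWait with Batch.SyncWait and
// Batch.CommitStats referenced above: the batch is applied and visible to
// readers when ApplyNoSyncWait returns, but durability is only guaranteed
// once SyncWait returns.
func exampleAsyncCommit(d *DB, b *Batch) (BatchCommitStats, error) {
	// Queue the batch for commit without waiting for the WAL fsync.
	// ApplyNoSyncWait is only meaningful with sync-enabled write options.
	if err := d.ApplyNoSyncWait(b, Sync); err != nil {
		return BatchCommitStats{}, err
	}

	// Other work may overlap with the WAL sync here.

	// Block until the WAL write for this batch has been synced, then report
	// the commit statistics accumulated for the batch.
	if err := b.SyncWait(); err != nil {
		return BatchCommitStats{}, err
	}
	return b.CommitStats(), nil
}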