github.com/cockroachdb/pebble@v1.1.5/batch.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "encoding/binary" 10 "fmt" 11 "io" 12 "math" 13 "sort" 14 "sync" 15 "sync/atomic" 16 "time" 17 "unsafe" 18 19 "github.com/cockroachdb/errors" 20 "github.com/cockroachdb/pebble/internal/base" 21 "github.com/cockroachdb/pebble/internal/batchskl" 22 "github.com/cockroachdb/pebble/internal/humanize" 23 "github.com/cockroachdb/pebble/internal/keyspan" 24 "github.com/cockroachdb/pebble/internal/private" 25 "github.com/cockroachdb/pebble/internal/rangedel" 26 "github.com/cockroachdb/pebble/internal/rangekey" 27 "github.com/cockroachdb/pebble/internal/rawalloc" 28 ) 29 30 const ( 31 batchCountOffset = 8 32 batchHeaderLen = 12 33 batchInitialSize = 1 << 10 // 1 KB 34 batchMaxRetainedSize = 1 << 20 // 1 MB 35 invalidBatchCount = 1<<32 - 1 36 maxVarintLen32 = 5 37 ) 38 39 // ErrNotIndexed means that a read operation on a batch failed because the 40 // batch is not indexed and thus doesn't support reads. 41 var ErrNotIndexed = errors.New("pebble: batch not indexed") 42 43 // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. 44 var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch")) 45 46 // ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted. 47 var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize))) 48 49 // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is 50 // being inserted into the batch. Indexing is not performed on the specified key 51 // until Finish is called, hence the name deferred. This struct lets the caller 52 // copy or encode keys/values directly into the batch representation instead of 53 // copying into an intermediary buffer then having pebble.Batch copy off of it. 54 type DeferredBatchOp struct { 55 index *batchskl.Skiplist 56 57 // Key and Value point to parts of the binary batch representation where 58 // keys and values should be encoded/copied into. len(Key) and len(Value) 59 // bytes must be copied into these slices respectively before calling 60 // Finish(). Changing where these slices point to is not allowed. 61 Key, Value []byte 62 offset uint32 63 } 64 65 // Finish completes the addition of this batch operation, and adds it to the 66 // index if necessary. Must be called once (and exactly once) keys/values 67 // have been filled into Key and Value. Not calling Finish or not 68 // copying/encoding keys will result in an incomplete index, and calling Finish 69 // twice may result in a panic. 70 func (d DeferredBatchOp) Finish() error { 71 if d.index != nil { 72 if err := d.index.Add(d.offset); err != nil { 73 return err 74 } 75 } 76 return nil 77 } 78 79 // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, 80 // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch 81 // implements the Reader interface, but only an indexed batch supports reading 82 // (without error) via Get or NewIter. A non-indexed batch will return 83 // ErrNotIndexed when read from. A batch is not safe for concurrent use, and 84 // consumers should use a batch per goroutine or provide their own 85 // synchronization. 86 // 87 // # Indexing 88 // 89 // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch 90 // allows iteration via an Iterator (see Batch.NewIter). The iterator provides 91 // a merged view of the operations in the batch and the underlying 92 // database. This is implemented by treating the batch as an additional layer 93 // in the LSM where every entry in the batch is considered newer than any entry 94 // in the underlying database (batch entries have the InternalKeySeqNumBatch 95 // bit set). By treating the batch as an additional layer in the LSM, iteration 96 // supports all batch operations (i.e. Set, Merge, Delete, DeleteRange, 97 // RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort. 98 // 99 // The same key can be operated on multiple times in a batch, though only the 100 // latest operation will be visible. For example, Put("a", "b"), Delete("a") 101 // will cause the key "a" to not be visible in the batch. Put("a", "b"), 102 // Put("a", "c") will cause a read of "a" to return the value "c". 103 // 104 // The batch index is implemented via an skiplist (internal/batchskl). While 105 // the skiplist implementation is very fast, inserting into an indexed batch is 106 // significantly slower than inserting into a non-indexed batch. Only use an 107 // indexed batch if you require reading from it. 108 // 109 // # Atomic commit 110 // 111 // The operations in a batch are persisted by calling Batch.Commit which is 112 // equivalent to calling DB.Apply(batch). A batch is committed atomically by 113 // writing the internal batch representation to the WAL, adding all of the 114 // batch operations to the memtable associated with the WAL, and then 115 // incrementing the visible sequence number so that subsequent reads can see 116 // the effects of the batch operations. If WriteOptions.Sync is true, a call to 117 // Batch.Commit will guarantee that the batch is persisted to disk before 118 // returning. See commitPipeline for more on the implementation details. 119 // 120 // # Large batches 121 // 122 // The size of a batch is limited only by available memory (be aware that 123 // indexed batches require considerably additional memory for the skiplist 124 // structure). A given WAL file has a single memtable associated with it (this 125 // restriction could be removed, but doing so is onerous and complex). And a 126 // memtable has a fixed size due to the underlying fixed size arena. Note that 127 // this differs from RocksDB where a memtable can grow arbitrarily large using 128 // a list of arena chunks. In RocksDB this is accomplished by storing pointers 129 // in the arena memory, but that isn't possible in Go. 130 // 131 // During Batch.Commit, a batch which is larger than a threshold (> 132 // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue 133 // of memtables. A flushableBatch forces WAL to be rotated, but that happens 134 // anyways when the memtable becomes full so this does not cause significant 135 // WAL churn. Because the flushableBatch is readable as another layer in the 136 // LSM, Batch.Commit returns as soon as the flushableBatch has been added to 137 // the queue of memtables. 138 // 139 // Internally, a flushableBatch provides Iterator support by sorting the batch 140 // contents (the batch is sorted once, when it is added to the memtable 141 // queue). Sorting the batch contents and insertion of the contents into a 142 // memtable have the same big-O time, but the constant factor dominates 143 // here. Sorting is significantly faster and uses significantly less memory. 144 // 145 // # Internal representation 146 // 147 // The internal batch representation is a contiguous byte buffer with a fixed 148 // 12-byte header, followed by a series of records. 149 // 150 // +-------------+------------+--- ... ---+ 151 // | SeqNum (8B) | Count (4B) | Entries | 152 // +-------------+------------+--- ... ---+ 153 // 154 // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed 155 // strings (varstring): 156 // 157 // +-----------+-----------------+-------------------+ 158 // | Kind (1B) | Key (varstring) | Value (varstring) | 159 // +-----------+-----------------+-------------------+ 160 // 161 // A varstring is a varint32 followed by N bytes of data. The Kind tags are 162 // exactly those specified by InternalKeyKind. The following table shows the 163 // format for records of each kind: 164 // 165 // InternalKeyKindDelete varstring 166 // InternalKeyKindLogData varstring 167 // InternalKeyKindIngestSST varstring 168 // InternalKeyKindSet varstring varstring 169 // InternalKeyKindMerge varstring varstring 170 // InternalKeyKindRangeDelete varstring varstring 171 // InternalKeyKindRangeKeySet varstring varstring 172 // InternalKeyKindRangeKeyUnset varstring varstring 173 // InternalKeyKindRangeKeyDelete varstring varstring 174 // 175 // The intuitive understanding here are that the arguments to Delete, Set, 176 // Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The 177 // RangeKeySet and RangeKeyUnset operations are slightly more complicated, 178 // encoding their end key, suffix and value [in the case of RangeKeySet] within 179 // the Value varstring. For more information on the value encoding for 180 // RangeKeySet and RangeKeyUnset, see the internal/rangekey package. 181 // 182 // The internal batch representation is the on disk format for a batch in the 183 // WAL, and thus stable. New record kinds may be added, but the existing ones 184 // will not be modified. 185 type Batch struct { 186 batchInternal 187 applied atomic.Bool 188 } 189 190 // batchInternal contains the set of fields within Batch that are non-atomic and 191 // capable of being reset using a *b = batchInternal{} struct copy. 192 type batchInternal struct { 193 // Data is the wire format of a batch's log entry: 194 // - 8 bytes for a sequence number of the first batch element, 195 // or zeroes if the batch has not yet been applied, 196 // - 4 bytes for the count: the number of elements in the batch, 197 // or "\xff\xff\xff\xff" if the batch is invalid, 198 // - count elements, being: 199 // - one byte for the kind 200 // - the varint-string user key, 201 // - the varint-string value (if kind != delete). 202 // The sequence number and count are stored in little-endian order. 203 // 204 // The data field can be (but is not guaranteed to be) nil for new 205 // batches. Large batches will set the data field to nil when committed as 206 // the data has been moved to a flushableBatch and inserted into the queue of 207 // memtables. 208 data []byte 209 cmp Compare 210 formatKey base.FormatKey 211 abbreviatedKey AbbreviatedKey 212 213 // An upper bound on required space to add this batch to a memtable. 214 // Note that although batches are limited to 4 GiB in size, that limit 215 // applies to len(data), not the memtable size. The upper bound on the 216 // size of a memtable node is larger than the overhead of the batch's log 217 // encoding, so memTableSize is larger than len(data) and may overflow a 218 // uint32. 219 memTableSize uint64 220 221 // The db to which the batch will be committed. Do not change this field 222 // after the batch has been created as it might invalidate internal state. 223 // Batch.memTableSize is only refreshed if Batch.db is set. Setting db to 224 // nil once it has been set implies that the Batch has encountered an error. 225 db *DB 226 227 // The count of records in the batch. This count will be stored in the batch 228 // data whenever Repr() is called. 229 count uint64 230 231 // The count of range deletions in the batch. Updated every time a range 232 // deletion is added. 233 countRangeDels uint64 234 235 // The count of range key sets, unsets and deletes in the batch. Updated 236 // every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added. 237 countRangeKeys uint64 238 239 // A deferredOp struct, stored in the Batch so that a pointer can be returned 240 // from the *Deferred() methods rather than a value. 241 deferredOp DeferredBatchOp 242 243 // An optional skiplist keyed by offset into data of the entry. 244 index *batchskl.Skiplist 245 rangeDelIndex *batchskl.Skiplist 246 rangeKeyIndex *batchskl.Skiplist 247 248 // Fragmented range deletion tombstones. Cached the first time a range 249 // deletion iterator is requested. The cache is invalidated whenever a new 250 // range deletion is added to the batch. This cache can only be used when 251 // opening an iterator to read at a batch sequence number >= 252 // tombstonesSeqNum. This is the case for all new iterators created over a 253 // batch but it's not the case for all cloned iterators. 254 tombstones []keyspan.Span 255 tombstonesSeqNum uint64 256 257 // Fragmented range key spans. Cached the first time a range key iterator is 258 // requested. The cache is invalidated whenever a new range key 259 // (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be 260 // used when opening an iterator to read at a batch sequence number >= 261 // tombstonesSeqNum. This is the case for all new iterators created over a 262 // batch but it's not the case for all cloned iterators. 263 rangeKeys []keyspan.Span 264 rangeKeysSeqNum uint64 265 266 // The flushableBatch wrapper if the batch is too large to fit in the 267 // memtable. 268 flushable *flushableBatch 269 270 // minimumFormatMajorVersion indicates the format major version required in 271 // order to commit this batch. If an operation requires a particular format 272 // major version, it ratchets the batch's minimumFormatMajorVersion. When 273 // the batch is committed, this is validated against the database's current 274 // format major version. 275 minimumFormatMajorVersion FormatMajorVersion 276 277 // Synchronous Apply uses the commit WaitGroup for both publishing the 278 // seqnum and waiting for the WAL fsync (if needed). Asynchronous 279 // ApplyNoSyncWait, which implies WriteOptions.Sync is true, uses the commit 280 // WaitGroup for publishing the seqnum and the fsyncWait WaitGroup for 281 // waiting for the WAL fsync. 282 // 283 // TODO(sumeer): if we find that ApplyNoSyncWait in conjunction with 284 // SyncWait is causing higher memory usage because of the time duration 285 // between when the sync is already done, and a goroutine calls SyncWait 286 // (followed by Batch.Close), we could separate out {fsyncWait, commitErr} 287 // into a separate struct that is allocated separately (using another 288 // sync.Pool), and only that struct needs to outlive Batch.Close (which 289 // could then be called immediately after ApplyNoSyncWait). commitStats 290 // will also need to be in this separate struct. 291 commit sync.WaitGroup 292 fsyncWait sync.WaitGroup 293 294 commitStats BatchCommitStats 295 296 commitErr error 297 298 // Position bools together to reduce the sizeof the struct. 299 300 // ingestedSSTBatch indicates that the batch contains one or more key kinds 301 // of InternalKeyKindIngestSST. If the batch contains key kinds of IngestSST 302 // then it will only contain key kinds of IngestSST. 303 ingestedSSTBatch bool 304 305 // committing is set to true when a batch begins to commit. It's used to 306 // ensure the batch is not mutated concurrently. It is not an atomic 307 // deliberately, so as to avoid the overhead on batch mutations. This is 308 // okay, because under correct usage this field will never be accessed 309 // concurrently. It's only under incorrect usage the memory accesses of this 310 // variable may violate memory safety. Since we don't use atomics here, 311 // false negatives are possible. 312 committing bool 313 } 314 315 // BatchCommitStats exposes stats related to committing a batch. 316 // 317 // NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow 318 // batch commits. The caller can use these stats to do their own tracing as 319 // needed. 320 type BatchCommitStats struct { 321 // TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or 322 // Batch.Commit, plus the time waiting in Batch.SyncWait. If there is a gap 323 // between calling ApplyNoSyncWait and calling SyncWait, that gap could 324 // include some duration in which real work was being done for the commit 325 // and will not be included here. This missing time is considered acceptable 326 // since the goal of these stats is to understand user-facing latency. 327 // 328 // TotalDuration includes time spent in various queues both inside Pebble 329 // and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait 330 // etc.). For some of these queues (which we consider important) the wait 331 // times are included below -- these expose low-level implementation detail 332 // and are meant for expert diagnosis and subject to change. There may be 333 // unaccounted time after subtracting those values from TotalDuration. 334 TotalDuration time.Duration 335 // SemaphoreWaitDuration is the wait time for semaphores in 336 // commitPipeline.Commit. 337 SemaphoreWaitDuration time.Duration 338 // WALQueueWaitDuration is the wait time for allocating memory blocks in the 339 // LogWriter (due to the LogWriter not writing fast enough). At the moment 340 // this is duration is always zero because a single WAL will allow 341 // allocating memory blocks up to the entire memtable size. In the future, 342 // we may pipeline WALs and bound the WAL queued blocks separately, so this 343 // field is preserved for that possibility. 344 WALQueueWaitDuration time.Duration 345 // MemTableWriteStallDuration is the wait caused by a write stall due to too 346 // many memtables (due to not flushing fast enough). 347 MemTableWriteStallDuration time.Duration 348 // L0ReadAmpWriteStallDuration is the wait caused by a write stall due to 349 // high read amplification in L0 (due to not compacting fast enough out of 350 // L0). 351 L0ReadAmpWriteStallDuration time.Duration 352 // WALRotationDuration is the wait time for WAL rotation, which includes 353 // syncing and closing the old WAL and creating (or reusing) a new one. 354 WALRotationDuration time.Duration 355 // CommitWaitDuration is the wait for publishing the seqnum plus the 356 // duration for the WAL sync (if requested). The former should be tiny and 357 // one can assume that this is all due to the WAL sync. 358 CommitWaitDuration time.Duration 359 } 360 361 var _ Reader = (*Batch)(nil) 362 var _ Writer = (*Batch)(nil) 363 364 var batchPool = sync.Pool{ 365 New: func() interface{} { 366 return &Batch{} 367 }, 368 } 369 370 type indexedBatch struct { 371 batch Batch 372 index batchskl.Skiplist 373 } 374 375 var indexedBatchPool = sync.Pool{ 376 New: func() interface{} { 377 return &indexedBatch{} 378 }, 379 } 380 381 func newBatch(db *DB) *Batch { 382 b := batchPool.Get().(*Batch) 383 b.db = db 384 return b 385 } 386 387 func newBatchWithSize(db *DB, size int) *Batch { 388 b := newBatch(db) 389 if cap(b.data) < size { 390 b.data = rawalloc.New(0, size) 391 } 392 return b 393 } 394 395 func newIndexedBatch(db *DB, comparer *Comparer) *Batch { 396 i := indexedBatchPool.Get().(*indexedBatch) 397 i.batch.cmp = comparer.Compare 398 i.batch.formatKey = comparer.FormatKey 399 i.batch.abbreviatedKey = comparer.AbbreviatedKey 400 i.batch.db = db 401 i.batch.index = &i.index 402 i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey) 403 return &i.batch 404 } 405 406 func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch { 407 b := newIndexedBatch(db, comparer) 408 if cap(b.data) < size { 409 b.data = rawalloc.New(0, size) 410 } 411 return b 412 } 413 414 // nextSeqNum returns the batch "sequence number" that will be given to the next 415 // key written to the batch. During iteration keys within an indexed batch are 416 // given a sequence number consisting of their offset within the batch combined 417 // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only 418 // used during iteration, and the keys are assigned ordinary sequence numbers 419 // when the batch is committed. 420 func (b *Batch) nextSeqNum() uint64 { 421 return uint64(len(b.data)) | base.InternalKeySeqNumBatch 422 } 423 424 func (b *Batch) release() { 425 if b.db == nil { 426 // The batch was not created using newBatch or newIndexedBatch, or an error 427 // was encountered. We don't try to reuse batches that encountered an error 428 // because they might be stuck somewhere in the system and attempting to 429 // reuse such batches is a recipe for onerous debugging sessions. Instead, 430 // let the GC do its job. 431 return 432 } 433 b.db = nil 434 435 // NB: This is ugly (it would be cleaner if we could just assign a Batch{}), 436 // but necessary so that we can use atomic.StoreUint32 for the Batch.applied 437 // field. Without using an atomic to clear that field the Go race detector 438 // complains. 439 b.Reset() 440 b.cmp = nil 441 b.formatKey = nil 442 b.abbreviatedKey = nil 443 444 if b.index == nil { 445 batchPool.Put(b) 446 } else { 447 b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil 448 indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b))) 449 } 450 } 451 452 func (b *Batch) refreshMemTableSize() error { 453 b.memTableSize = 0 454 if len(b.data) < batchHeaderLen { 455 return nil 456 } 457 458 b.countRangeDels = 0 459 b.countRangeKeys = 0 460 b.minimumFormatMajorVersion = 0 461 for r := b.Reader(); ; { 462 kind, key, value, ok, err := r.Next() 463 if !ok { 464 if err != nil { 465 return err 466 } 467 break 468 } 469 switch kind { 470 case InternalKeyKindRangeDelete: 471 b.countRangeDels++ 472 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 473 b.countRangeKeys++ 474 case InternalKeyKindDeleteSized: 475 if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { 476 b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete 477 } 478 case InternalKeyKindIngestSST: 479 if b.minimumFormatMajorVersion < FormatFlushableIngest { 480 b.minimumFormatMajorVersion = FormatFlushableIngest 481 } 482 // This key kind doesn't contribute to the memtable size. 483 continue 484 } 485 b.memTableSize += memTableEntrySize(len(key), len(value)) 486 } 487 if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys { 488 b.minimumFormatMajorVersion = FormatRangeKeys 489 } 490 return nil 491 } 492 493 // Apply the operations contained in the batch to the receiver batch. 494 // 495 // It is safe to modify the contents of the arguments after Apply returns. 496 func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { 497 if b.ingestedSSTBatch { 498 panic("pebble: invalid batch application") 499 } 500 if len(batch.data) == 0 { 501 return nil 502 } 503 if len(batch.data) < batchHeaderLen { 504 return ErrInvalidBatch 505 } 506 507 offset := len(b.data) 508 if offset == 0 { 509 b.init(offset) 510 offset = batchHeaderLen 511 } 512 b.data = append(b.data, batch.data[batchHeaderLen:]...) 513 514 b.setCount(b.Count() + batch.Count()) 515 516 if b.db != nil || b.index != nil { 517 // Only iterate over the new entries if we need to track memTableSize or in 518 // order to update the index. 519 for iter := BatchReader(b.data[offset:]); len(iter) > 0; { 520 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 521 kind, key, value, ok, err := iter.Next() 522 if !ok { 523 if err != nil { 524 return err 525 } 526 break 527 } 528 switch kind { 529 case InternalKeyKindRangeDelete: 530 b.countRangeDels++ 531 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 532 b.countRangeKeys++ 533 case InternalKeyKindIngestSST: 534 panic("pebble: invalid key kind for batch") 535 } 536 if b.index != nil { 537 var err error 538 switch kind { 539 case InternalKeyKindRangeDelete: 540 b.tombstones = nil 541 b.tombstonesSeqNum = 0 542 if b.rangeDelIndex == nil { 543 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 544 } 545 err = b.rangeDelIndex.Add(uint32(offset)) 546 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 547 b.rangeKeys = nil 548 b.rangeKeysSeqNum = 0 549 if b.rangeKeyIndex == nil { 550 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 551 } 552 err = b.rangeKeyIndex.Add(uint32(offset)) 553 default: 554 err = b.index.Add(uint32(offset)) 555 } 556 if err != nil { 557 return err 558 } 559 } 560 b.memTableSize += memTableEntrySize(len(key), len(value)) 561 } 562 } 563 return nil 564 } 565 566 // Get gets the value for the given key. It returns ErrNotFound if the Batch 567 // does not contain the key. 568 // 569 // The caller should not modify the contents of the returned slice, but it is 570 // safe to modify the contents of the argument after Get returns. The returned 571 // slice will remain valid until the returned Closer is closed. On success, the 572 // caller MUST call closer.Close() or a memory leak will occur. 573 func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) { 574 if b.index == nil { 575 return nil, nil, ErrNotIndexed 576 } 577 return b.db.getInternal(key, b, nil /* snapshot */) 578 } 579 580 func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) { 581 if b.committing { 582 panic("pebble: batch already committing") 583 } 584 if len(b.data) == 0 { 585 b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen) 586 } 587 b.count++ 588 b.memTableSize += memTableEntrySize(keyLen, valueLen) 589 590 pos := len(b.data) 591 b.deferredOp.offset = uint32(pos) 592 b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen) 593 b.data[pos] = byte(kind) 594 pos++ 595 596 { 597 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 598 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 599 // versions show this to not be a performance win. 600 x := uint32(keyLen) 601 for x >= 0x80 { 602 b.data[pos] = byte(x) | 0x80 603 x >>= 7 604 pos++ 605 } 606 b.data[pos] = byte(x) 607 pos++ 608 } 609 610 b.deferredOp.Key = b.data[pos : pos+keyLen] 611 pos += keyLen 612 613 { 614 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 615 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 616 // versions show this to not be a performance win. 617 x := uint32(valueLen) 618 for x >= 0x80 { 619 b.data[pos] = byte(x) | 0x80 620 x >>= 7 621 pos++ 622 } 623 b.data[pos] = byte(x) 624 pos++ 625 } 626 627 b.deferredOp.Value = b.data[pos : pos+valueLen] 628 // Shrink data since varints may be shorter than the upper bound. 629 b.data = b.data[:pos+valueLen] 630 } 631 632 func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { 633 if b.committing { 634 panic("pebble: batch already committing") 635 } 636 if len(b.data) == 0 { 637 b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen) 638 } 639 b.count++ 640 b.memTableSize += memTableEntrySize(keyLen, 0) 641 642 pos := len(b.data) 643 b.deferredOp.offset = uint32(pos) 644 b.grow(1 + maxVarintLen32 + keyLen) 645 b.data[pos] = byte(kind) 646 pos++ 647 648 { 649 // TODO(peter): Manually inlined version binary.PutUvarint(). Remove if 650 // go1.13 or future versions show this to not be a performance win. See 651 // BenchmarkBatchSet. 652 x := uint32(keyLen) 653 for x >= 0x80 { 654 b.data[pos] = byte(x) | 0x80 655 x >>= 7 656 pos++ 657 } 658 b.data[pos] = byte(x) 659 pos++ 660 } 661 662 b.deferredOp.Key = b.data[pos : pos+keyLen] 663 b.deferredOp.Value = nil 664 665 // Shrink data since varint may be shorter than the upper bound. 666 b.data = b.data[:pos+keyLen] 667 } 668 669 // AddInternalKey allows the caller to add an internal key of point key kinds to 670 // a batch. Passing in an internal key of kind RangeKey* or RangeDelete will 671 // result in a panic. Note that the seqnum in the internal key is effectively 672 // ignored, even though the Kind is preserved. This is because the batch format 673 // does not allow for a per-key seqnum to be specified, only a batch-wide one. 674 // 675 // Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not 676 // supported with this method as they require specialized logic. 677 func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error { 678 keyLen := len(key.UserKey) 679 hasValue := false 680 switch key.Kind() { 681 case InternalKeyKindRangeDelete, InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 682 panic("unexpected range delete or range key kind in AddInternalKey") 683 case InternalKeyKindSingleDelete, InternalKeyKindDelete: 684 b.prepareDeferredKeyRecord(len(key.UserKey), key.Kind()) 685 default: 686 b.prepareDeferredKeyValueRecord(keyLen, len(value), key.Kind()) 687 hasValue = true 688 } 689 b.deferredOp.index = b.index 690 copy(b.deferredOp.Key, key.UserKey) 691 if hasValue { 692 copy(b.deferredOp.Value, value) 693 } 694 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 695 // in go1.13 will remove the need for this. 696 if b.index != nil { 697 if err := b.index.Add(b.deferredOp.offset); err != nil { 698 return err 699 } 700 } 701 return nil 702 } 703 704 // Set adds an action to the batch that sets the key to map to the value. 705 // 706 // It is safe to modify the contents of the arguments after Set returns. 707 func (b *Batch) Set(key, value []byte, _ *WriteOptions) error { 708 deferredOp := b.SetDeferred(len(key), len(value)) 709 copy(deferredOp.Key, key) 710 copy(deferredOp.Value, value) 711 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 712 // in go1.13 will remove the need for this. 713 if b.index != nil { 714 if err := b.index.Add(deferredOp.offset); err != nil { 715 return err 716 } 717 } 718 return nil 719 } 720 721 // SetDeferred is similar to Set in that it adds a set operation to the batch, 722 // except it only takes in key/value lengths instead of complete slices, 723 // letting the caller encode into those objects and then call Finish() on the 724 // returned object. 725 func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp { 726 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet) 727 b.deferredOp.index = b.index 728 return &b.deferredOp 729 } 730 731 // Merge adds an action to the batch that merges the value at key with the new 732 // value. The details of the merge are dependent upon the configured merge 733 // operator. 734 // 735 // It is safe to modify the contents of the arguments after Merge returns. 736 func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error { 737 deferredOp := b.MergeDeferred(len(key), len(value)) 738 copy(deferredOp.Key, key) 739 copy(deferredOp.Value, value) 740 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 741 // in go1.13 will remove the need for this. 742 if b.index != nil { 743 if err := b.index.Add(deferredOp.offset); err != nil { 744 return err 745 } 746 } 747 return nil 748 } 749 750 // MergeDeferred is similar to Merge in that it adds a merge operation to the 751 // batch, except it only takes in key/value lengths instead of complete slices, 752 // letting the caller encode into those objects and then call Finish() on the 753 // returned object. 754 func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp { 755 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge) 756 b.deferredOp.index = b.index 757 return &b.deferredOp 758 } 759 760 // Delete adds an action to the batch that deletes the entry for key. 761 // 762 // It is safe to modify the contents of the arguments after Delete returns. 763 func (b *Batch) Delete(key []byte, _ *WriteOptions) error { 764 deferredOp := b.DeleteDeferred(len(key)) 765 copy(deferredOp.Key, key) 766 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 767 // in go1.13 will remove the need for this. 768 if b.index != nil { 769 if err := b.index.Add(deferredOp.offset); err != nil { 770 return err 771 } 772 } 773 return nil 774 } 775 776 // DeleteDeferred is similar to Delete in that it adds a delete operation to 777 // the batch, except it only takes in key/value lengths instead of complete 778 // slices, letting the caller encode into those objects and then call Finish() 779 // on the returned object. 780 func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp { 781 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete) 782 b.deferredOp.index = b.index 783 return &b.deferredOp 784 } 785 786 // DeleteSized behaves identically to Delete, but takes an additional 787 // argument indicating the size of the value being deleted. DeleteSized 788 // should be preferred when the caller has the expectation that there exists 789 // a single internal KV pair for the key (eg, the key has not been 790 // overwritten recently), and the caller knows the size of its value. 791 // 792 // DeleteSized will record the value size within the tombstone and use it to 793 // inform compaction-picking heuristics which strive to reduce space 794 // amplification in the LSM. This "calling your shot" mechanic allows the 795 // storage engine to more accurately estimate and reduce space amplification. 796 // 797 // It is safe to modify the contents of the arguments after DeleteSized 798 // returns. 799 func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error { 800 deferredOp := b.DeleteSizedDeferred(len(key), deletedValueSize) 801 copy(b.deferredOp.Key, key) 802 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Check if in a 803 // later Go release this is unnecessary. 804 if b.index != nil { 805 if err := b.index.Add(deferredOp.offset); err != nil { 806 return err 807 } 808 } 809 return nil 810 } 811 812 // DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete 813 // operation to the batch, except it only takes in key length instead of a 814 // complete key slice, letting the caller encode into the DeferredBatchOp.Key 815 // slice and then call Finish() on the returned object. 816 func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp { 817 if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete { 818 b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete 819 } 820 821 // Encode the sum of the key length and the value in the value. 822 v := uint64(deletedValueSize) + uint64(keyLen) 823 824 // Encode `v` as a varint. 825 var buf [binary.MaxVarintLen64]byte 826 n := 0 827 { 828 x := v 829 for x >= 0x80 { 830 buf[n] = byte(x) | 0x80 831 x >>= 7 832 n++ 833 } 834 buf[n] = byte(x) 835 n++ 836 } 837 838 // NB: In batch entries and sstable entries, values are stored as 839 // varstrings. Here, the value is itself a simple varint. This results in an 840 // unnecessary double layer of encoding: 841 // varint(n) varint(deletedValueSize) 842 // The first varint will always be 1-byte, since a varint-encoded uint64 843 // will never exceed 128 bytes. This unnecessary extra byte and wrapping is 844 // preserved to avoid special casing across the database, and in particular 845 // in sstable block decoding which is performance sensitive. 846 b.prepareDeferredKeyValueRecord(keyLen, n, InternalKeyKindDeleteSized) 847 b.deferredOp.index = b.index 848 copy(b.deferredOp.Value, buf[:n]) 849 return &b.deferredOp 850 } 851 852 // SingleDelete adds an action to the batch that single deletes the entry for key. 853 // See Writer.SingleDelete for more details on the semantics of SingleDelete. 854 // 855 // It is safe to modify the contents of the arguments after SingleDelete returns. 856 func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { 857 deferredOp := b.SingleDeleteDeferred(len(key)) 858 copy(deferredOp.Key, key) 859 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 860 // in go1.13 will remove the need for this. 861 if b.index != nil { 862 if err := b.index.Add(deferredOp.offset); err != nil { 863 return err 864 } 865 } 866 return nil 867 } 868 869 // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete 870 // operation to the batch, except it only takes in key/value lengths instead of 871 // complete slices, letting the caller encode into those objects and then call 872 // Finish() on the returned object. 873 func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp { 874 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete) 875 b.deferredOp.index = b.index 876 return &b.deferredOp 877 } 878 879 // DeleteRange deletes all of the point keys (and values) in the range 880 // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT 881 // delete overlapping range keys (eg, keys set via RangeKeySet). 882 // 883 // It is safe to modify the contents of the arguments after DeleteRange 884 // returns. 885 func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error { 886 deferredOp := b.DeleteRangeDeferred(len(start), len(end)) 887 copy(deferredOp.Key, start) 888 copy(deferredOp.Value, end) 889 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 890 // in go1.13 will remove the need for this. 891 if deferredOp.index != nil { 892 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 893 return err 894 } 895 } 896 return nil 897 } 898 899 // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range 900 // operation to the batch, except it only takes in key lengths instead of 901 // complete slices, letting the caller encode into those objects and then call 902 // Finish() on the returned object. Note that DeferredBatchOp.Key should be 903 // populated with the start key, and DeferredBatchOp.Value should be populated 904 // with the end key. 905 func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { 906 b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete) 907 b.countRangeDels++ 908 if b.index != nil { 909 b.tombstones = nil 910 b.tombstonesSeqNum = 0 911 // Range deletions are rare, so we lazily allocate the index for them. 912 if b.rangeDelIndex == nil { 913 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 914 } 915 b.deferredOp.index = b.rangeDelIndex 916 } 917 return &b.deferredOp 918 } 919 920 // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC 921 // timestamp suffix to value. The suffix is optional. If any portion of the key 922 // range [start, end) is already set by a range key with the same suffix value, 923 // RangeKeySet overrides it. 924 // 925 // It is safe to modify the contents of the arguments after RangeKeySet returns. 926 func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error { 927 suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}} 928 internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:]) 929 930 deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen) 931 copy(deferredOp.Key, start) 932 n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:]) 933 if n != internalValueLen { 934 panic("unexpected internal value length mismatch") 935 } 936 937 // Manually inline DeferredBatchOp.Finish(). 938 if deferredOp.index != nil { 939 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 940 return err 941 } 942 } 943 return nil 944 } 945 946 func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 947 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet) 948 b.incrementRangeKeysCount() 949 return &b.deferredOp 950 } 951 952 func (b *Batch) incrementRangeKeysCount() { 953 b.countRangeKeys++ 954 if b.minimumFormatMajorVersion < FormatRangeKeys { 955 b.minimumFormatMajorVersion = FormatRangeKeys 956 } 957 if b.index != nil { 958 b.rangeKeys = nil 959 b.rangeKeysSeqNum = 0 960 // Range keys are rare, so we lazily allocate the index for them. 961 if b.rangeKeyIndex == nil { 962 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 963 } 964 b.deferredOp.index = b.rangeKeyIndex 965 } 966 } 967 968 // RangeKeyUnset removes a range key mapping the key range [start, end) at the 969 // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed 970 // range key. RangeKeyUnset only removes portions of range keys that fall within 971 // the [start, end) key span, and only range keys with suffixes that exactly 972 // match the unset suffix. 973 // 974 // It is safe to modify the contents of the arguments after RangeKeyUnset 975 // returns. 976 func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error { 977 suffixes := [1][]byte{suffix} 978 internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:]) 979 980 deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen) 981 copy(deferredOp.Key, start) 982 n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:]) 983 if n != internalValueLen { 984 panic("unexpected internal value length mismatch") 985 } 986 987 // Manually inline DeferredBatchOp.Finish() 988 if deferredOp.index != nil { 989 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 990 return err 991 } 992 } 993 return nil 994 } 995 996 func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 997 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset) 998 b.incrementRangeKeysCount() 999 return &b.deferredOp 1000 } 1001 1002 // RangeKeyDelete deletes all of the range keys in the range [start,end) 1003 // (inclusive on start, exclusive on end). It does not delete point keys (for 1004 // that use DeleteRange). RangeKeyDelete removes all range keys within the 1005 // bounds, including those with or without suffixes. 1006 // 1007 // It is safe to modify the contents of the arguments after RangeKeyDelete 1008 // returns. 1009 func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error { 1010 deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end)) 1011 copy(deferredOp.Key, start) 1012 copy(deferredOp.Value, end) 1013 // Manually inline DeferredBatchOp.Finish(). 1014 if deferredOp.index != nil { 1015 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 1016 return err 1017 } 1018 } 1019 return nil 1020 } 1021 1022 // RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an 1023 // operation to delete range keys to the batch, except it only takes in key 1024 // lengths instead of complete slices, letting the caller encode into those 1025 // objects and then call Finish() on the returned object. Note that 1026 // DeferredBatchOp.Key should be populated with the start key, and 1027 // DeferredBatchOp.Value should be populated with the end key. 1028 func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp { 1029 b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete) 1030 b.incrementRangeKeysCount() 1031 return &b.deferredOp 1032 } 1033 1034 // LogData adds the specified to the batch. The data will be written to the 1035 // WAL, but not added to memtables or sstables. Log data is never indexed, 1036 // which makes it useful for testing WAL performance. 1037 // 1038 // It is safe to modify the contents of the argument after LogData returns. 1039 func (b *Batch) LogData(data []byte, _ *WriteOptions) error { 1040 origCount, origMemTableSize := b.count, b.memTableSize 1041 b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData) 1042 copy(b.deferredOp.Key, data) 1043 // Since LogData only writes to the WAL and does not affect the memtable, we 1044 // restore b.count and b.memTableSize to their origin values. Note that 1045 // Batch.count only refers to records that are added to the memtable. 1046 b.count, b.memTableSize = origCount, origMemTableSize 1047 return nil 1048 } 1049 1050 // IngestSST adds the FileNum for an sstable to the batch. The data will only be 1051 // written to the WAL (not added to memtables or sstables). 1052 func (b *Batch) ingestSST(fileNum base.FileNum) { 1053 if b.Empty() { 1054 b.ingestedSSTBatch = true 1055 } else if !b.ingestedSSTBatch { 1056 // Batch contains other key kinds. 1057 panic("pebble: invalid call to ingestSST") 1058 } 1059 1060 origMemTableSize := b.memTableSize 1061 var buf [binary.MaxVarintLen64]byte 1062 length := binary.PutUvarint(buf[:], uint64(fileNum)) 1063 b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST) 1064 copy(b.deferredOp.Key, buf[:length]) 1065 // Since IngestSST writes only to the WAL and does not affect the memtable, 1066 // we restore b.memTableSize to its original value. Note that Batch.count 1067 // is not reset because for the InternalKeyKindIngestSST the count is the 1068 // number of sstable paths which have been added to the batch. 1069 b.memTableSize = origMemTableSize 1070 b.minimumFormatMajorVersion = FormatFlushableIngest 1071 } 1072 1073 // Empty returns true if the batch is empty, and false otherwise. 1074 func (b *Batch) Empty() bool { 1075 return len(b.data) <= batchHeaderLen 1076 } 1077 1078 // Len returns the current size of the batch in bytes. 1079 func (b *Batch) Len() int { 1080 if len(b.data) <= batchHeaderLen { 1081 return batchHeaderLen 1082 } 1083 return len(b.data) 1084 } 1085 1086 // Repr returns the underlying batch representation. It is not safe to modify 1087 // the contents. Reset() will not change the contents of the returned value, 1088 // though any other mutation operation may do so. 1089 func (b *Batch) Repr() []byte { 1090 if len(b.data) == 0 { 1091 b.init(batchHeaderLen) 1092 } 1093 binary.LittleEndian.PutUint32(b.countData(), b.Count()) 1094 return b.data 1095 } 1096 1097 // SetRepr sets the underlying batch representation. The batch takes ownership 1098 // of the supplied slice. It is not safe to modify it afterwards until the 1099 // Batch is no longer in use. 1100 func (b *Batch) SetRepr(data []byte) error { 1101 if len(data) < batchHeaderLen { 1102 return base.CorruptionErrorf("invalid batch") 1103 } 1104 b.data = data 1105 b.count = uint64(binary.LittleEndian.Uint32(b.countData())) 1106 var err error 1107 if b.db != nil { 1108 // Only track memTableSize for batches that will be committed to the DB. 1109 err = b.refreshMemTableSize() 1110 } 1111 return err 1112 } 1113 1114 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 1115 // return false). The iterator can be positioned via a call to SeekGE, 1116 // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators. 1117 // 1118 // The returned Iterator observes all of the Batch's existing mutations, but no 1119 // later mutations. Its view can be refreshed via RefreshBatchSnapshot or 1120 // SetOptions(). 1121 func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) { 1122 return b.NewIterWithContext(context.Background(), o), nil 1123 } 1124 1125 // NewIterWithContext is like NewIter, and additionally accepts a context for 1126 // tracing. 1127 func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator { 1128 if b.index == nil { 1129 return &Iterator{err: ErrNotIndexed} 1130 } 1131 return b.db.newIter(ctx, b, snapshotIterOpts{}, o) 1132 } 1133 1134 // newInternalIter creates a new internalIterator that iterates over the 1135 // contents of the batch. 1136 func (b *Batch) newInternalIter(o *IterOptions) *batchIter { 1137 iter := &batchIter{} 1138 b.initInternalIter(o, iter) 1139 return iter 1140 } 1141 1142 func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) { 1143 *iter = batchIter{ 1144 cmp: b.cmp, 1145 batch: b, 1146 iter: b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()), 1147 // NB: We explicitly do not propagate the batch snapshot to the point 1148 // key iterator. Filtering point keys within the batch iterator can 1149 // cause pathological behavior where a batch iterator advances 1150 // significantly farther than necessary filtering many batch keys that 1151 // are not visible at the batch sequence number. Instead, the merging 1152 // iterator enforces bounds. 1153 // 1154 // For example, consider an engine that contains the committed keys 1155 // 'bar' and 'bax', with no keys between them. Consider a batch 1156 // containing keys 1,000 keys within the range [a,z]. All of the 1157 // batch keys were added to the batch after the iterator was 1158 // constructed, so they are not visible to the iterator. A call to 1159 // SeekGE('bax') would seek the LSM iterators and discover the key 1160 // 'bax'. It would also seek the batch iterator, landing on the key 1161 // 'baz' but discover it that it's not visible. The batch iterator would 1162 // next through the rest of the batch's keys, only to discover there are 1163 // no visible keys greater than or equal to 'bax'. 1164 // 1165 // Filtering these batch points within the merging iterator ensures that 1166 // the batch iterator never needs to iterate beyond 'baz', because it 1167 // already found a smaller, visible key 'bax'. 1168 snapshot: base.InternalKeySeqNumMax, 1169 } 1170 } 1171 1172 func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { 1173 // Construct an iterator even if rangeDelIndex is nil, because it is allowed 1174 // to refresh later, so we need the container to exist. 1175 iter := new(keyspan.Iter) 1176 b.initRangeDelIter(o, iter, batchSnapshot) 1177 return iter 1178 } 1179 1180 func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { 1181 if b.rangeDelIndex == nil { 1182 iter.Init(b.cmp, nil) 1183 return 1184 } 1185 1186 // Fragment the range tombstones the first time a range deletion iterator is 1187 // requested. The cached tombstones are invalidated if another range 1188 // deletion tombstone is added to the batch. This cache is only guaranteed 1189 // to be correct if we're opening an iterator to read at a batch sequence 1190 // number at least as high as tombstonesSeqNum. The cache is guaranteed to 1191 // include all tombstones up to tombstonesSeqNum, and if any additional 1192 // tombstones were added after that sequence number the cache would've been 1193 // cleared. 1194 nextSeqNum := b.nextSeqNum() 1195 if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot { 1196 iter.Init(b.cmp, b.tombstones) 1197 return 1198 } 1199 1200 tombstones := make([]keyspan.Span, 0, b.countRangeDels) 1201 frag := &keyspan.Fragmenter{ 1202 Cmp: b.cmp, 1203 Format: b.formatKey, 1204 Emit: func(s keyspan.Span) { 1205 tombstones = append(tombstones, s) 1206 }, 1207 } 1208 it := &batchIter{ 1209 cmp: b.cmp, 1210 batch: b, 1211 iter: b.rangeDelIndex.NewIter(nil, nil), 1212 snapshot: batchSnapshot, 1213 } 1214 fragmentRangeDels(frag, it, int(b.countRangeDels)) 1215 iter.Init(b.cmp, tombstones) 1216 1217 // If we just read all the tombstones in the batch (eg, batchSnapshot was 1218 // set to b.nextSeqNum()), then cache the tombstones so that a subsequent 1219 // call to initRangeDelIter may use them without refragmenting. 1220 if nextSeqNum == batchSnapshot { 1221 b.tombstones = tombstones 1222 b.tombstonesSeqNum = nextSeqNum 1223 } 1224 } 1225 1226 func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) { 1227 // The memory management here is a bit subtle. The keys and values returned 1228 // by the iterator are slices in Batch.data. Thus the fragmented tombstones 1229 // are slices within Batch.data. If additional entries are added to the 1230 // Batch, Batch.data may be reallocated. The references in the fragmented 1231 // tombstones will remain valid, pointing into the old Batch.data. GC for 1232 // the win. 1233 1234 // Use a single []keyspan.Key buffer to avoid allocating many 1235 // individual []keyspan.Key slices with a single element each. 1236 keyBuf := make([]keyspan.Key, 0, count) 1237 for key, val := it.First(); key != nil; key, val = it.Next() { 1238 s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf) 1239 keyBuf = s.Keys[len(s.Keys):] 1240 1241 // Set a fixed capacity to avoid accidental overwriting. 1242 s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] 1243 frag.Add(s) 1244 } 1245 frag.Finish() 1246 } 1247 1248 func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { 1249 // Construct an iterator even if rangeKeyIndex is nil, because it is allowed 1250 // to refresh later, so we need the container to exist. 1251 iter := new(keyspan.Iter) 1252 b.initRangeKeyIter(o, iter, batchSnapshot) 1253 return iter 1254 } 1255 1256 func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { 1257 if b.rangeKeyIndex == nil { 1258 iter.Init(b.cmp, nil) 1259 return 1260 } 1261 1262 // Fragment the range keys the first time a range key iterator is requested. 1263 // The cached spans are invalidated if another range key is added to the 1264 // batch. This cache is only guaranteed to be correct if we're opening an 1265 // iterator to read at a batch sequence number at least as high as 1266 // rangeKeysSeqNum. The cache is guaranteed to include all range keys up to 1267 // rangeKeysSeqNum, and if any additional range keys were added after that 1268 // sequence number the cache would've been cleared. 1269 nextSeqNum := b.nextSeqNum() 1270 if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot { 1271 iter.Init(b.cmp, b.rangeKeys) 1272 return 1273 } 1274 1275 rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys) 1276 frag := &keyspan.Fragmenter{ 1277 Cmp: b.cmp, 1278 Format: b.formatKey, 1279 Emit: func(s keyspan.Span) { 1280 rangeKeys = append(rangeKeys, s) 1281 }, 1282 } 1283 it := &batchIter{ 1284 cmp: b.cmp, 1285 batch: b, 1286 iter: b.rangeKeyIndex.NewIter(nil, nil), 1287 snapshot: batchSnapshot, 1288 } 1289 fragmentRangeKeys(frag, it, int(b.countRangeKeys)) 1290 iter.Init(b.cmp, rangeKeys) 1291 1292 // If we just read all the range keys in the batch (eg, batchSnapshot was 1293 // set to b.nextSeqNum()), then cache the range keys so that a subsequent 1294 // call to initRangeKeyIter may use them without refragmenting. 1295 if nextSeqNum == batchSnapshot { 1296 b.rangeKeys = rangeKeys 1297 b.rangeKeysSeqNum = nextSeqNum 1298 } 1299 } 1300 1301 func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error { 1302 // The memory management here is a bit subtle. The keys and values 1303 // returned by the iterator are slices in Batch.data. Thus the 1304 // fragmented key spans are slices within Batch.data. If additional 1305 // entries are added to the Batch, Batch.data may be reallocated. The 1306 // references in the fragmented keys will remain valid, pointing into 1307 // the old Batch.data. GC for the win. 1308 1309 // Use a single []keyspan.Key buffer to avoid allocating many 1310 // individual []keyspan.Key slices with a single element each. 1311 keyBuf := make([]keyspan.Key, 0, count) 1312 for ik, val := it.First(); ik != nil; ik, val = it.Next() { 1313 s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf) 1314 if err != nil { 1315 return err 1316 } 1317 keyBuf = s.Keys[len(s.Keys):] 1318 1319 // Set a fixed capacity to avoid accidental overwriting. 1320 s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] 1321 frag.Add(s) 1322 } 1323 frag.Finish() 1324 return nil 1325 } 1326 1327 // Commit applies the batch to its parent writer. 1328 func (b *Batch) Commit(o *WriteOptions) error { 1329 return b.db.Apply(b, o) 1330 } 1331 1332 // Close closes the batch without committing it. 1333 func (b *Batch) Close() error { 1334 b.release() 1335 return nil 1336 } 1337 1338 // Indexed returns true if the batch is indexed (i.e. supports read 1339 // operations). 1340 func (b *Batch) Indexed() bool { 1341 return b.index != nil 1342 } 1343 1344 // init ensures that the batch data slice is initialized to meet the 1345 // minimum required size and allocates space for the batch header. 1346 func (b *Batch) init(size int) { 1347 n := batchInitialSize 1348 for n < size { 1349 n *= 2 1350 } 1351 if cap(b.data) < n { 1352 b.data = rawalloc.New(batchHeaderLen, n) 1353 } 1354 b.data = b.data[:batchHeaderLen] 1355 // Zero the sequence number in the header. 1356 for i := 0; i < len(b.data); i++ { 1357 b.data[i] = 0 1358 } 1359 } 1360 1361 // Reset resets the batch for reuse. The underlying byte slice (that is 1362 // returned by Repr()) may not be modified. It is only necessary to call this 1363 // method if a batch is explicitly being reused. Close automatically takes are 1364 // of releasing resources when appropriate for batches that are internally 1365 // being reused. 1366 func (b *Batch) Reset() { 1367 // Zero out the struct, retaining only the fields necessary for manual 1368 // reuse. 1369 b.batchInternal = batchInternal{ 1370 data: b.data, 1371 cmp: b.cmp, 1372 formatKey: b.formatKey, 1373 abbreviatedKey: b.abbreviatedKey, 1374 index: b.index, 1375 db: b.db, 1376 } 1377 b.applied.Store(false) 1378 if b.data != nil { 1379 if cap(b.data) > batchMaxRetainedSize { 1380 // If the capacity of the buffer is larger than our maximum 1381 // retention size, don't re-use it. Let it be GC-ed instead. 1382 // This prevents the memory from an unusually large batch from 1383 // being held on to indefinitely. 1384 b.data = nil 1385 } else { 1386 // Otherwise, reset the buffer for re-use. 1387 b.data = b.data[:batchHeaderLen] 1388 // Zero the sequence number in the header. 1389 for i := 0; i < len(b.data); i++ { 1390 b.data[i] = 0 1391 } 1392 } 1393 } 1394 if b.index != nil { 1395 b.index.Init(&b.data, b.cmp, b.abbreviatedKey) 1396 } 1397 } 1398 1399 // seqNumData returns the 8 byte little-endian sequence number. Zero means that 1400 // the batch has not yet been applied. 1401 func (b *Batch) seqNumData() []byte { 1402 return b.data[:8] 1403 } 1404 1405 // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff" 1406 // means that the batch is invalid. 1407 func (b *Batch) countData() []byte { 1408 return b.data[8:12] 1409 } 1410 1411 func (b *Batch) grow(n int) { 1412 newSize := len(b.data) + n 1413 if uint64(newSize) >= maxBatchSize { 1414 panic(ErrBatchTooLarge) 1415 } 1416 if newSize > cap(b.data) { 1417 newCap := 2 * cap(b.data) 1418 for newCap < newSize { 1419 newCap *= 2 1420 } 1421 newData := rawalloc.New(len(b.data), newCap) 1422 copy(newData, b.data) 1423 b.data = newData 1424 } 1425 b.data = b.data[:newSize] 1426 } 1427 1428 func (b *Batch) setSeqNum(seqNum uint64) { 1429 binary.LittleEndian.PutUint64(b.seqNumData(), seqNum) 1430 } 1431 1432 // SeqNum returns the batch sequence number which is applied to the first 1433 // record in the batch. The sequence number is incremented for each subsequent 1434 // record. It returns zero if the batch is empty. 1435 func (b *Batch) SeqNum() uint64 { 1436 if len(b.data) == 0 { 1437 b.init(batchHeaderLen) 1438 } 1439 return binary.LittleEndian.Uint64(b.seqNumData()) 1440 } 1441 1442 func (b *Batch) setCount(v uint32) { 1443 b.count = uint64(v) 1444 } 1445 1446 // Count returns the count of memtable-modifying operations in this batch. All 1447 // operations with the except of LogData increment this count. For IngestSSTs, 1448 // count is only used to indicate the number of SSTs ingested in the record, the 1449 // batch isn't applied to the memtable. 1450 func (b *Batch) Count() uint32 { 1451 if b.count > math.MaxUint32 { 1452 panic(ErrInvalidBatch) 1453 } 1454 return uint32(b.count) 1455 } 1456 1457 // Reader returns a BatchReader for the current batch contents. If the batch is 1458 // mutated, the new entries will not be visible to the reader. 1459 func (b *Batch) Reader() BatchReader { 1460 if len(b.data) == 0 { 1461 b.init(batchHeaderLen) 1462 } 1463 return b.data[batchHeaderLen:] 1464 } 1465 1466 func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) { 1467 // TODO(jackson): This will index out of bounds if there's no varint or an 1468 // invalid varint (eg, a single 0xff byte). Correcting will add a bit of 1469 // overhead. We could avoid that overhead whenever len(data) >= 1470 // binary.MaxVarint32? 1471 1472 var v uint32 1473 var n int 1474 ptr := unsafe.Pointer(&data[0]) 1475 if a := *((*uint8)(ptr)); a < 128 { 1476 v = uint32(a) 1477 n = 1 1478 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1479 v = uint32(b)<<7 | uint32(a) 1480 n = 2 1481 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1482 v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1483 n = 3 1484 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1485 v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1486 n = 4 1487 } else { 1488 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1489 v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1490 n = 5 1491 } 1492 1493 data = data[n:] 1494 if v > uint32(len(data)) { 1495 return nil, nil, false 1496 } 1497 return data[v:], data[:v], true 1498 } 1499 1500 // SyncWait is to be used in conjunction with DB.ApplyNoSyncWait. 1501 func (b *Batch) SyncWait() error { 1502 now := time.Now() 1503 b.fsyncWait.Wait() 1504 if b.commitErr != nil { 1505 b.db = nil // prevent batch reuse on error 1506 } 1507 waitDuration := time.Since(now) 1508 b.commitStats.CommitWaitDuration += waitDuration 1509 b.commitStats.TotalDuration += waitDuration 1510 return b.commitErr 1511 } 1512 1513 // CommitStats returns stats related to committing the batch. Should be called 1514 // after Batch.Commit, DB.Apply. If DB.ApplyNoSyncWait is used, should be 1515 // called after Batch.SyncWait. 1516 func (b *Batch) CommitStats() BatchCommitStats { 1517 return b.commitStats 1518 } 1519 1520 // BatchReader iterates over the entries contained in a batch. 1521 type BatchReader []byte 1522 1523 // ReadBatch constructs a BatchReader from a batch representation. The 1524 // header is not validated. ReadBatch returns a new batch reader and the 1525 // count of entries contained within the batch. 1526 func ReadBatch(repr []byte) (r BatchReader, count uint32) { 1527 if len(repr) <= batchHeaderLen { 1528 return nil, count 1529 } 1530 count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen]) 1531 return repr[batchHeaderLen:], count 1532 } 1533 1534 // Next returns the next entry in this batch, if there is one. If the reader has 1535 // reached the end of the batch, Next returns ok=false and a nil error. If the 1536 // batch is corrupt and the next entry is illegible, Next returns ok=false and a 1537 // non-nil error. 1538 func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) { 1539 if len(*r) == 0 { 1540 return 0, nil, nil, false, nil 1541 } 1542 kind = InternalKeyKind((*r)[0]) 1543 if kind > InternalKeyKindMax { 1544 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0]) 1545 } 1546 *r, ukey, ok = batchDecodeStr((*r)[1:]) 1547 if !ok { 1548 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key") 1549 } 1550 switch kind { 1551 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1552 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 1553 InternalKeyKindDeleteSized: 1554 *r, value, ok = batchDecodeStr(*r) 1555 if !ok { 1556 return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind) 1557 } 1558 } 1559 return kind, ukey, value, true, nil 1560 } 1561 1562 // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the 1563 // two in sync. 1564 type batchIter struct { 1565 cmp Compare 1566 batch *Batch 1567 iter batchskl.Iterator 1568 err error 1569 // snapshot holds a batch "sequence number" at which the batch is being 1570 // read. This sequence number has the InternalKeySeqNumBatch bit set, so it 1571 // encodes an offset within the batch. Only batch entries earlier than the 1572 // offset are visible during iteration. 1573 snapshot uint64 1574 } 1575 1576 // batchIter implements the base.InternalIterator interface. 1577 var _ base.InternalIterator = (*batchIter)(nil) 1578 1579 func (i *batchIter) String() string { 1580 return "batch" 1581 } 1582 1583 func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 1584 // Ignore TrySeekUsingNext if the view of the batch changed. 1585 if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() { 1586 flags = flags.DisableTrySeekUsingNext() 1587 } 1588 1589 i.err = nil // clear cached iteration error 1590 ikey := i.iter.SeekGE(key, flags) 1591 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1592 ikey = i.iter.Next() 1593 } 1594 if ikey == nil { 1595 return nil, base.LazyValue{} 1596 } 1597 return ikey, base.MakeInPlaceValue(i.value()) 1598 } 1599 1600 func (i *batchIter) SeekPrefixGE( 1601 prefix, key []byte, flags base.SeekGEFlags, 1602 ) (*base.InternalKey, base.LazyValue) { 1603 i.err = nil // clear cached iteration error 1604 return i.SeekGE(key, flags) 1605 } 1606 1607 func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 1608 i.err = nil // clear cached iteration error 1609 ikey := i.iter.SeekLT(key) 1610 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1611 ikey = i.iter.Prev() 1612 } 1613 if ikey == nil { 1614 return nil, base.LazyValue{} 1615 } 1616 return ikey, base.MakeInPlaceValue(i.value()) 1617 } 1618 1619 func (i *batchIter) First() (*InternalKey, base.LazyValue) { 1620 i.err = nil // clear cached iteration error 1621 ikey := i.iter.First() 1622 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1623 ikey = i.iter.Next() 1624 } 1625 if ikey == nil { 1626 return nil, base.LazyValue{} 1627 } 1628 return ikey, base.MakeInPlaceValue(i.value()) 1629 } 1630 1631 func (i *batchIter) Last() (*InternalKey, base.LazyValue) { 1632 i.err = nil // clear cached iteration error 1633 ikey := i.iter.Last() 1634 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1635 ikey = i.iter.Prev() 1636 } 1637 if ikey == nil { 1638 return nil, base.LazyValue{} 1639 } 1640 return ikey, base.MakeInPlaceValue(i.value()) 1641 } 1642 1643 func (i *batchIter) Next() (*InternalKey, base.LazyValue) { 1644 ikey := i.iter.Next() 1645 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1646 ikey = i.iter.Next() 1647 } 1648 if ikey == nil { 1649 return nil, base.LazyValue{} 1650 } 1651 return ikey, base.MakeInPlaceValue(i.value()) 1652 } 1653 1654 func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 1655 // Because NextPrefix was invoked `succKey` must be ≥ the key at i's current 1656 // position. Seek the arena iterator using TrySeekUsingNext. 1657 ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) 1658 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1659 ikey = i.iter.Next() 1660 } 1661 if ikey == nil { 1662 return nil, base.LazyValue{} 1663 } 1664 return ikey, base.MakeInPlaceValue(i.value()) 1665 } 1666 1667 func (i *batchIter) Prev() (*InternalKey, base.LazyValue) { 1668 ikey := i.iter.Prev() 1669 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1670 ikey = i.iter.Prev() 1671 } 1672 if ikey == nil { 1673 return nil, base.LazyValue{} 1674 } 1675 return ikey, base.MakeInPlaceValue(i.value()) 1676 } 1677 1678 func (i *batchIter) value() []byte { 1679 offset, _, keyEnd := i.iter.KeyInfo() 1680 data := i.batch.data 1681 if len(data[offset:]) == 0 { 1682 i.err = base.CorruptionErrorf("corrupted batch") 1683 return nil 1684 } 1685 1686 switch InternalKeyKind(data[offset]) { 1687 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1688 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 1689 InternalKeyKindDeleteSized: 1690 _, value, ok := batchDecodeStr(data[keyEnd:]) 1691 if !ok { 1692 return nil 1693 } 1694 return value 1695 default: 1696 return nil 1697 } 1698 } 1699 1700 func (i *batchIter) Error() error { 1701 return i.err 1702 } 1703 1704 func (i *batchIter) Close() error { 1705 _ = i.iter.Close() 1706 return i.err 1707 } 1708 1709 func (i *batchIter) SetBounds(lower, upper []byte) { 1710 i.iter.SetBounds(lower, upper) 1711 } 1712 1713 type flushableBatchEntry struct { 1714 // offset is the byte offset of the record within the batch repr. 1715 offset uint32 1716 // index is the 0-based ordinal number of the record within the batch. Used 1717 // to compute the seqnum for the record. 1718 index uint32 1719 // key{Start,End} are the start and end byte offsets of the key within the 1720 // batch repr. Cached to avoid decoding the key length on every 1721 // comparison. The value is stored starting at keyEnd. 1722 keyStart uint32 1723 keyEnd uint32 1724 } 1725 1726 // flushableBatch wraps an existing batch and provides the interfaces needed 1727 // for making the batch flushable (i.e. able to mimic a memtable). 1728 type flushableBatch struct { 1729 cmp Compare 1730 formatKey base.FormatKey 1731 data []byte 1732 1733 // The base sequence number for the entries in the batch. This is the same 1734 // value as Batch.seqNum() and is cached here for performance. 1735 seqNum uint64 1736 1737 // A slice of offsets and indices for the entries in the batch. Used to 1738 // implement flushableBatchIter. Unlike the indexing on a normal batch, a 1739 // flushable batch is indexed such that batch entry i will be given the 1740 // sequence number flushableBatch.seqNum+i. 1741 // 1742 // Sorted in increasing order of key and decreasing order of offset (since 1743 // higher offsets correspond to higher sequence numbers). 1744 // 1745 // Does not include range deletion entries or range key entries. 1746 offsets []flushableBatchEntry 1747 1748 // Fragmented range deletion tombstones. 1749 tombstones []keyspan.Span 1750 1751 // Fragmented range keys. 1752 rangeKeys []keyspan.Span 1753 } 1754 1755 var _ flushable = (*flushableBatch)(nil) 1756 1757 // newFlushableBatch creates a new batch that implements the flushable 1758 // interface. This allows the batch to act like a memtable and be placed in the 1759 // queue of flushable memtables. Note that the flushable batch takes ownership 1760 // of the batch data. 1761 func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) { 1762 b := &flushableBatch{ 1763 data: batch.data, 1764 cmp: comparer.Compare, 1765 formatKey: comparer.FormatKey, 1766 offsets: make([]flushableBatchEntry, 0, batch.Count()), 1767 } 1768 if b.data != nil { 1769 // Note that this sequence number is not correct when this batch has not 1770 // been applied since the sequence number has not been assigned yet. The 1771 // correct sequence number will be set later. But it is correct when the 1772 // batch is being replayed from the WAL. 1773 b.seqNum = batch.SeqNum() 1774 } 1775 var rangeDelOffsets []flushableBatchEntry 1776 var rangeKeyOffsets []flushableBatchEntry 1777 if len(b.data) > batchHeaderLen { 1778 // Non-empty batch. 1779 var index uint32 1780 for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ { 1781 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 1782 kind, key, _, ok, err := iter.Next() 1783 if !ok { 1784 if err != nil { 1785 return nil, err 1786 } 1787 break 1788 } 1789 entry := flushableBatchEntry{ 1790 offset: uint32(offset), 1791 index: uint32(index), 1792 } 1793 if keySize := uint32(len(key)); keySize == 0 { 1794 // Must add 2 to the offset. One byte encodes `kind` and the next 1795 // byte encodes `0`, which is the length of the key. 1796 entry.keyStart = uint32(offset) + 2 1797 entry.keyEnd = entry.keyStart 1798 } else { 1799 entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) - 1800 uintptr(unsafe.Pointer(&b.data[0]))) 1801 entry.keyEnd = entry.keyStart + keySize 1802 } 1803 switch kind { 1804 case InternalKeyKindRangeDelete: 1805 rangeDelOffsets = append(rangeDelOffsets, entry) 1806 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1807 rangeKeyOffsets = append(rangeKeyOffsets, entry) 1808 default: 1809 b.offsets = append(b.offsets, entry) 1810 } 1811 } 1812 } 1813 1814 // Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using *batch's 1815 // sort.Interface implementation. 1816 pointOffsets := b.offsets 1817 sort.Sort(b) 1818 b.offsets = rangeDelOffsets 1819 sort.Sort(b) 1820 b.offsets = rangeKeyOffsets 1821 sort.Sort(b) 1822 b.offsets = pointOffsets 1823 1824 if len(rangeDelOffsets) > 0 { 1825 frag := &keyspan.Fragmenter{ 1826 Cmp: b.cmp, 1827 Format: b.formatKey, 1828 Emit: func(s keyspan.Span) { 1829 b.tombstones = append(b.tombstones, s) 1830 }, 1831 } 1832 it := &flushableBatchIter{ 1833 batch: b, 1834 data: b.data, 1835 offsets: rangeDelOffsets, 1836 cmp: b.cmp, 1837 index: -1, 1838 } 1839 fragmentRangeDels(frag, it, len(rangeDelOffsets)) 1840 } 1841 if len(rangeKeyOffsets) > 0 { 1842 frag := &keyspan.Fragmenter{ 1843 Cmp: b.cmp, 1844 Format: b.formatKey, 1845 Emit: func(s keyspan.Span) { 1846 b.rangeKeys = append(b.rangeKeys, s) 1847 }, 1848 } 1849 it := &flushableBatchIter{ 1850 batch: b, 1851 data: b.data, 1852 offsets: rangeKeyOffsets, 1853 cmp: b.cmp, 1854 index: -1, 1855 } 1856 fragmentRangeKeys(frag, it, len(rangeKeyOffsets)) 1857 } 1858 return b, nil 1859 } 1860 1861 func (b *flushableBatch) setSeqNum(seqNum uint64) { 1862 if b.seqNum != 0 { 1863 panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum)) 1864 } 1865 b.seqNum = seqNum 1866 for i := range b.tombstones { 1867 for j := range b.tombstones[i].Keys { 1868 b.tombstones[i].Keys[j].Trailer = base.MakeTrailer( 1869 b.tombstones[i].Keys[j].SeqNum()+seqNum, 1870 b.tombstones[i].Keys[j].Kind(), 1871 ) 1872 } 1873 } 1874 for i := range b.rangeKeys { 1875 for j := range b.rangeKeys[i].Keys { 1876 b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer( 1877 b.rangeKeys[i].Keys[j].SeqNum()+seqNum, 1878 b.rangeKeys[i].Keys[j].Kind(), 1879 ) 1880 } 1881 } 1882 } 1883 1884 func (b *flushableBatch) Len() int { 1885 return len(b.offsets) 1886 } 1887 1888 func (b *flushableBatch) Less(i, j int) bool { 1889 ei := &b.offsets[i] 1890 ej := &b.offsets[j] 1891 ki := b.data[ei.keyStart:ei.keyEnd] 1892 kj := b.data[ej.keyStart:ej.keyEnd] 1893 switch c := b.cmp(ki, kj); { 1894 case c < 0: 1895 return true 1896 case c > 0: 1897 return false 1898 default: 1899 return ei.offset > ej.offset 1900 } 1901 } 1902 1903 func (b *flushableBatch) Swap(i, j int) { 1904 b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i] 1905 } 1906 1907 // newIter is part of the flushable interface. 1908 func (b *flushableBatch) newIter(o *IterOptions) internalIterator { 1909 return &flushableBatchIter{ 1910 batch: b, 1911 data: b.data, 1912 offsets: b.offsets, 1913 cmp: b.cmp, 1914 index: -1, 1915 lower: o.GetLowerBound(), 1916 upper: o.GetUpperBound(), 1917 } 1918 } 1919 1920 // newFlushIter is part of the flushable interface. 1921 func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { 1922 return &flushFlushableBatchIter{ 1923 flushableBatchIter: flushableBatchIter{ 1924 batch: b, 1925 data: b.data, 1926 offsets: b.offsets, 1927 cmp: b.cmp, 1928 index: -1, 1929 }, 1930 bytesIterated: bytesFlushed, 1931 } 1932 } 1933 1934 // newRangeDelIter is part of the flushable interface. 1935 func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator { 1936 if len(b.tombstones) == 0 { 1937 return nil 1938 } 1939 return keyspan.NewIter(b.cmp, b.tombstones) 1940 } 1941 1942 // newRangeKeyIter is part of the flushable interface. 1943 func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { 1944 if len(b.rangeKeys) == 0 { 1945 return nil 1946 } 1947 return keyspan.NewIter(b.cmp, b.rangeKeys) 1948 } 1949 1950 // containsRangeKeys is part of the flushable interface. 1951 func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 } 1952 1953 // inuseBytes is part of the flushable interface. 1954 func (b *flushableBatch) inuseBytes() uint64 { 1955 return uint64(len(b.data) - batchHeaderLen) 1956 } 1957 1958 // totalBytes is part of the flushable interface. 1959 func (b *flushableBatch) totalBytes() uint64 { 1960 return uint64(cap(b.data)) 1961 } 1962 1963 // readyForFlush is part of the flushable interface. 1964 func (b *flushableBatch) readyForFlush() bool { 1965 // A flushable batch is always ready for flush; it must be flushed together 1966 // with the previous memtable. 1967 return true 1968 } 1969 1970 // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the 1971 // two in sync. 1972 type flushableBatchIter struct { 1973 // Members to be initialized by creator. 1974 batch *flushableBatch 1975 // The bytes backing the batch. Always the same as batch.data? 1976 data []byte 1977 // The sorted entries. This is not always equal to batch.offsets. 1978 offsets []flushableBatchEntry 1979 cmp Compare 1980 // Must be initialized to -1. It is the index into offsets that represents 1981 // the current iterator position. 1982 index int 1983 1984 // For internal use by the implementation. 1985 key InternalKey 1986 err error 1987 1988 // Optionally initialize to bounds of iteration, if any. 1989 lower []byte 1990 upper []byte 1991 } 1992 1993 // flushableBatchIter implements the base.InternalIterator interface. 1994 var _ base.InternalIterator = (*flushableBatchIter)(nil) 1995 1996 func (i *flushableBatchIter) String() string { 1997 return "flushable-batch" 1998 } 1999 2000 // SeekGE implements internalIterator.SeekGE, as documented in the pebble 2001 // package. Ignore flags.TrySeekUsingNext() since we don't expect this 2002 // optimization to provide much benefit here at the moment. 2003 func (i *flushableBatchIter) SeekGE( 2004 key []byte, flags base.SeekGEFlags, 2005 ) (*InternalKey, base.LazyValue) { 2006 i.err = nil // clear cached iteration error 2007 ikey := base.MakeSearchKey(key) 2008 i.index = sort.Search(len(i.offsets), func(j int) bool { 2009 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 2010 }) 2011 if i.index >= len(i.offsets) { 2012 return nil, base.LazyValue{} 2013 } 2014 i.key = i.getKey(i.index) 2015 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2016 i.index = len(i.offsets) 2017 return nil, base.LazyValue{} 2018 } 2019 return &i.key, i.value() 2020 } 2021 2022 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 2023 // pebble package. 2024 func (i *flushableBatchIter) SeekPrefixGE( 2025 prefix, key []byte, flags base.SeekGEFlags, 2026 ) (*base.InternalKey, base.LazyValue) { 2027 return i.SeekGE(key, flags) 2028 } 2029 2030 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 2031 // package. 2032 func (i *flushableBatchIter) SeekLT( 2033 key []byte, flags base.SeekLTFlags, 2034 ) (*InternalKey, base.LazyValue) { 2035 i.err = nil // clear cached iteration error 2036 ikey := base.MakeSearchKey(key) 2037 i.index = sort.Search(len(i.offsets), func(j int) bool { 2038 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 2039 }) 2040 i.index-- 2041 if i.index < 0 { 2042 return nil, base.LazyValue{} 2043 } 2044 i.key = i.getKey(i.index) 2045 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2046 i.index = -1 2047 return nil, base.LazyValue{} 2048 } 2049 return &i.key, i.value() 2050 } 2051 2052 // First implements internalIterator.First, as documented in the pebble 2053 // package. 2054 func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) { 2055 i.err = nil // clear cached iteration error 2056 if len(i.offsets) == 0 { 2057 return nil, base.LazyValue{} 2058 } 2059 i.index = 0 2060 i.key = i.getKey(i.index) 2061 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2062 i.index = len(i.offsets) 2063 return nil, base.LazyValue{} 2064 } 2065 return &i.key, i.value() 2066 } 2067 2068 // Last implements internalIterator.Last, as documented in the pebble 2069 // package. 2070 func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) { 2071 i.err = nil // clear cached iteration error 2072 if len(i.offsets) == 0 { 2073 return nil, base.LazyValue{} 2074 } 2075 i.index = len(i.offsets) - 1 2076 i.key = i.getKey(i.index) 2077 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2078 i.index = -1 2079 return nil, base.LazyValue{} 2080 } 2081 return &i.key, i.value() 2082 } 2083 2084 // Note: flushFlushableBatchIter.Next mirrors the implementation of 2085 // flushableBatchIter.Next due to performance. Keep the two in sync. 2086 func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) { 2087 if i.index == len(i.offsets) { 2088 return nil, base.LazyValue{} 2089 } 2090 i.index++ 2091 if i.index == len(i.offsets) { 2092 return nil, base.LazyValue{} 2093 } 2094 i.key = i.getKey(i.index) 2095 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 2096 i.index = len(i.offsets) 2097 return nil, base.LazyValue{} 2098 } 2099 return &i.key, i.value() 2100 } 2101 2102 func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) { 2103 if i.index < 0 { 2104 return nil, base.LazyValue{} 2105 } 2106 i.index-- 2107 if i.index < 0 { 2108 return nil, base.LazyValue{} 2109 } 2110 i.key = i.getKey(i.index) 2111 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 2112 i.index = -1 2113 return nil, base.LazyValue{} 2114 } 2115 return &i.key, i.value() 2116 } 2117 2118 // Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of 2119 // flushableBatchIter.NextPrefix due to performance. Keep the two in sync. 2120 func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 2121 return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext()) 2122 } 2123 2124 func (i *flushableBatchIter) getKey(index int) InternalKey { 2125 e := &i.offsets[index] 2126 kind := InternalKeyKind(i.data[e.offset]) 2127 key := i.data[e.keyStart:e.keyEnd] 2128 return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind) 2129 } 2130 2131 func (i *flushableBatchIter) value() base.LazyValue { 2132 p := i.data[i.offsets[i.index].offset:] 2133 if len(p) == 0 { 2134 i.err = base.CorruptionErrorf("corrupted batch") 2135 return base.LazyValue{} 2136 } 2137 kind := InternalKeyKind(p[0]) 2138 if kind > InternalKeyKindMax { 2139 i.err = base.CorruptionErrorf("corrupted batch") 2140 return base.LazyValue{} 2141 } 2142 var value []byte 2143 var ok bool 2144 switch kind { 2145 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 2146 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete, 2147 InternalKeyKindDeleteSized: 2148 keyEnd := i.offsets[i.index].keyEnd 2149 _, value, ok = batchDecodeStr(i.data[keyEnd:]) 2150 if !ok { 2151 i.err = base.CorruptionErrorf("corrupted batch") 2152 return base.LazyValue{} 2153 } 2154 } 2155 return base.MakeInPlaceValue(value) 2156 } 2157 2158 func (i *flushableBatchIter) Valid() bool { 2159 return i.index >= 0 && i.index < len(i.offsets) 2160 } 2161 2162 func (i *flushableBatchIter) Error() error { 2163 return i.err 2164 } 2165 2166 func (i *flushableBatchIter) Close() error { 2167 return i.err 2168 } 2169 2170 func (i *flushableBatchIter) SetBounds(lower, upper []byte) { 2171 i.lower = lower 2172 i.upper = upper 2173 } 2174 2175 // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track 2176 // of number of bytes iterated. 2177 type flushFlushableBatchIter struct { 2178 flushableBatchIter 2179 bytesIterated *uint64 2180 } 2181 2182 // flushFlushableBatchIter implements the base.InternalIterator interface. 2183 var _ base.InternalIterator = (*flushFlushableBatchIter)(nil) 2184 2185 func (i *flushFlushableBatchIter) String() string { 2186 return "flushable-batch" 2187 } 2188 2189 func (i *flushFlushableBatchIter) SeekGE( 2190 key []byte, flags base.SeekGEFlags, 2191 ) (*InternalKey, base.LazyValue) { 2192 panic("pebble: SeekGE unimplemented") 2193 } 2194 2195 func (i *flushFlushableBatchIter) SeekPrefixGE( 2196 prefix, key []byte, flags base.SeekGEFlags, 2197 ) (*base.InternalKey, base.LazyValue) { 2198 panic("pebble: SeekPrefixGE unimplemented") 2199 } 2200 2201 func (i *flushFlushableBatchIter) SeekLT( 2202 key []byte, flags base.SeekLTFlags, 2203 ) (*InternalKey, base.LazyValue) { 2204 panic("pebble: SeekLT unimplemented") 2205 } 2206 2207 func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) { 2208 i.err = nil // clear cached iteration error 2209 key, val := i.flushableBatchIter.First() 2210 if key == nil { 2211 return nil, base.LazyValue{} 2212 } 2213 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 2214 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 2215 return key, val 2216 } 2217 2218 func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 2219 panic("pebble: Prev unimplemented") 2220 } 2221 2222 // Note: flushFlushableBatchIter.Next mirrors the implementation of 2223 // flushableBatchIter.Next due to performance. Keep the two in sync. 2224 func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) { 2225 if i.index == len(i.offsets) { 2226 return nil, base.LazyValue{} 2227 } 2228 i.index++ 2229 if i.index == len(i.offsets) { 2230 return nil, base.LazyValue{} 2231 } 2232 i.key = i.getKey(i.index) 2233 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 2234 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 2235 return &i.key, i.value() 2236 } 2237 2238 func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) { 2239 panic("pebble: Prev unimplemented") 2240 } 2241 2242 func (i flushFlushableBatchIter) valueSize() uint64 { 2243 p := i.data[i.offsets[i.index].offset:] 2244 if len(p) == 0 { 2245 i.err = base.CorruptionErrorf("corrupted batch") 2246 return 0 2247 } 2248 kind := InternalKeyKind(p[0]) 2249 if kind > InternalKeyKindMax { 2250 i.err = base.CorruptionErrorf("corrupted batch") 2251 return 0 2252 } 2253 var length uint64 2254 switch kind { 2255 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete: 2256 keyEnd := i.offsets[i.index].keyEnd 2257 v, n := binary.Uvarint(i.data[keyEnd:]) 2258 if n <= 0 { 2259 i.err = base.CorruptionErrorf("corrupted batch") 2260 return 0 2261 } 2262 length = v + uint64(n) 2263 } 2264 return length 2265 } 2266 2267 // batchSort returns iterators for the sorted contents of the batch. It is 2268 // intended for testing use only. The batch.Sort dance is done to prevent 2269 // exposing this method in the public pebble interface. 2270 func batchSort( 2271 i interface{}, 2272 ) ( 2273 points internalIterator, 2274 rangeDels keyspan.FragmentIterator, 2275 rangeKeys keyspan.FragmentIterator, 2276 ) { 2277 b := i.(*Batch) 2278 if b.Indexed() { 2279 pointIter := b.newInternalIter(nil) 2280 rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64) 2281 rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64) 2282 return pointIter, rangeDelIter, rangeKeyIter 2283 } 2284 f, err := newFlushableBatch(b, b.db.opts.Comparer) 2285 if err != nil { 2286 panic(err) 2287 } 2288 return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil) 2289 } 2290 2291 func init() { 2292 private.BatchSort = batchSort 2293 }