github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/batch.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "arena" 9 "encoding/binary" 10 "fmt" 11 "io" 12 "math" 13 "sort" 14 "sync" 15 "sync/atomic" 16 "unsafe" 17 18 "github.com/cockroachdb/errors" 19 "github.com/zuoyebang/bitalostable/internal/base" 20 "github.com/zuoyebang/bitalostable/internal/batchskl" 21 "github.com/zuoyebang/bitalostable/internal/humanize" 22 "github.com/zuoyebang/bitalostable/internal/keyspan" 23 "github.com/zuoyebang/bitalostable/internal/manual" 24 "github.com/zuoyebang/bitalostable/internal/private" 25 "github.com/zuoyebang/bitalostable/internal/rangedel" 26 "github.com/zuoyebang/bitalostable/internal/rangekey" 27 "github.com/zuoyebang/bitalostable/internal/rawalloc" 28 ) 29 30 const ( 31 batchCountOffset = 8 32 batchHeaderLen = 12 33 batchInitialSize = 1 << 10 // 1 KB 34 batchMaxRetainedSize = 1 << 20 // 1 MB 35 invalidBatchCount = 1<<32 - 1 36 maxVarintLen32 = 5 37 ) 38 39 // ErrNotIndexed means that a read operation on a batch failed because the 40 // batch is not indexed and thus doesn't support reads. 41 var ErrNotIndexed = errors.New("bitalostable: batch not indexed") 42 43 // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted. 44 var ErrInvalidBatch = errors.New("bitalostable: invalid batch") 45 46 // ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted. 47 var ErrBatchTooLarge = errors.Newf("bitalostable: batch too large: >= %s", humanize.Uint64(maxBatchSize)) 48 49 // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is 50 // being inserted into the batch. Indexing is not performed on the specified key 51 // until Finish is called, hence the name deferred. This struct lets the caller 52 // copy or encode keys/values directly into the batch representation instead of 53 // copying into an intermediary buffer then having bitalostable.Batch copy off of it. 54 type DeferredBatchOp struct { 55 index *batchskl.Skiplist 56 57 // Key and Value point to parts of the binary batch representation where 58 // keys and values should be encoded/copied into. len(Key) and len(Value) 59 // bytes must be copied into these slices respectively before calling 60 // Finish(). Changing where these slices point to is not allowed. 61 Key, Value []byte 62 offset uint32 63 } 64 65 // Finish completes the addition of this batch operation, and adds it to the 66 // index if necessary. Must be called once (and exactly once) keys/values 67 // have been filled into Key and Value. Not calling Finish or not 68 // copying/encoding keys will result in an incomplete index, and calling Finish 69 // twice may result in a panic. 70 func (d DeferredBatchOp) Finish() error { 71 if d.index != nil { 72 if err := d.index.Add(d.offset); err != nil { 73 return err 74 } 75 } 76 return nil 77 } 78 79 // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets, 80 // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch 81 // implements the Reader interface, but only an indexed batch supports reading 82 // (without error) via Get or NewIter. A non-indexed batch will return 83 // ErrNotIndexed when read from. A batch is not safe for concurrent use, and 84 // consumers should use a batch per goroutine or provide their own 85 // synchronization. 86 // 87 // # Indexing 88 // 89 // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch 90 // allows iteration via an Iterator (see Batch.NewIter). The iterator provides 91 // a merged view of the operations in the batch and the underlying 92 // database. This is implemented by treating the batch as an additional layer 93 // in the LSM where every entry in the batch is considered newer than any entry 94 // in the underlying database (batch entries have the InternalKeySeqNumBatch 95 // bit set). By treating the batch as an additional layer in the LSM, iteration 96 // supports all batch operations (i.e. Set, Merge, Delete, DeleteRange, 97 // RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort. 98 // 99 // The same key can be operated on multiple times in a batch, though only the 100 // latest operation will be visible. For example, Put("a", "b"), Delete("a") 101 // will cause the key "a" to not be visible in the batch. Put("a", "b"), 102 // Put("a", "c") will cause a read of "a" to return the value "c". 103 // 104 // The batch index is implemented via an skiplist (internal/batchskl). While 105 // the skiplist implementation is very fast, inserting into an indexed batch is 106 // significantly slower than inserting into a non-indexed batch. Only use an 107 // indexed batch if you require reading from it. 108 // 109 // # Atomic commit 110 // 111 // The operations in a batch are persisted by calling Batch.Commit which is 112 // equivalent to calling DB.Apply(batch). A batch is committed atomically by 113 // writing the internal batch representation to the WAL, adding all of the 114 // batch operations to the memtable associated with the WAL, and then 115 // incrementing the visible sequence number so that subsequent reads can see 116 // the effects of the batch operations. If WriteOptions.Sync is true, a call to 117 // Batch.Commit will guarantee that the batch is persisted to disk before 118 // returning. See commitPipeline for more on the implementation details. 119 // 120 // # Large batches 121 // 122 // The size of a batch is limited only by available memory (be aware that 123 // indexed batches require considerably additional memory for the skiplist 124 // structure). A given WAL file has a single memtable associated with it (this 125 // restriction could be removed, but doing so is onerous and complex). And a 126 // memtable has a fixed size due to the underlying fixed size arena. Note that 127 // this differs from RocksDB where a memtable can grow arbitrarily large using 128 // a list of arena chunks. In RocksDB this is accomplished by storing pointers 129 // in the arena memory, but that isn't possible in Go. 130 // 131 // During Batch.Commit, a batch which is larger than a threshold (> 132 // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue 133 // of memtables. A flushableBatch forces WAL to be rotated, but that happens 134 // anyways when the memtable becomes full so this does not cause significant 135 // WAL churn. Because the flushableBatch is readable as another layer in the 136 // LSM, Batch.Commit returns as soon as the flushableBatch has been added to 137 // the queue of memtables. 138 // 139 // Internally, a flushableBatch provides Iterator support by sorting the batch 140 // contents (the batch is sorted once, when it is added to the memtable 141 // queue). Sorting the batch contents and insertion of the contents into a 142 // memtable have the same big-O time, but the constant factor dominates 143 // here. Sorting is significantly faster and uses significantly less memory. 144 // 145 // # Internal representation 146 // 147 // The internal batch representation is a contiguous byte buffer with a fixed 148 // 12-byte header, followed by a series of records. 149 // 150 // +-------------+------------+--- ... ---+ 151 // | SeqNum (8B) | Count (4B) | Entries | 152 // +-------------+------------+--- ... ---+ 153 // 154 // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed 155 // strings (varstring): 156 // 157 // +-----------+-----------------+-------------------+ 158 // | Kind (1B) | Key (varstring) | Value (varstring) | 159 // +-----------+-----------------+-------------------+ 160 // 161 // A varstring is a varint32 followed by N bytes of data. The Kind tags are 162 // exactly those specified by InternalKeyKind. The following table shows the 163 // format for records of each kind: 164 // 165 // InternalKeyKindDelete varstring 166 // InternalKeyKindLogData varstring 167 // InternalKeyKindSet varstring varstring 168 // InternalKeyKindMerge varstring varstring 169 // InternalKeyKindRangeDelete varstring varstring 170 // InternalKeyKindRangeKeySet varstring varstring 171 // InternalKeyKindRangeKeyUnset varstring varstring 172 // InternalKeyKindRangeKeyDelete varstring varstring 173 // 174 // The intuitive understanding here are that the arguments to Delete, Set, 175 // Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The 176 // RangeKeySet and RangeKeyUnset operations are slightly more complicated, 177 // encoding their end key, suffix and value [in the case of RangeKeySet] within 178 // the Value varstring. For more information on the value encoding for 179 // RangeKeySet and RangeKeyUnset, see the internal/rangekey package. 180 // 181 // The internal batch representation is the on disk format for a batch in the 182 // WAL, and thus stable. New record kinds may be added, but the existing ones 183 // will not be modified. 184 type Batch struct { 185 // Data is the wire format of a batch's log entry: 186 // - 8 bytes for a sequence number of the first batch element, 187 // or zeroes if the batch has not yet been applied, 188 // - 4 bytes for the count: the number of elements in the batch, 189 // or "\xff\xff\xff\xff" if the batch is invalid, 190 // - count elements, being: 191 // - one byte for the kind 192 // - the varint-string user key, 193 // - the varint-string value (if kind != delete). 194 // The sequence number and count are stored in little-endian order. 195 // 196 // The data field can be (but is not guaranteed to be) nil for new 197 // batches. Large batches will set the data field to nil when committed as 198 // the data has been moved to a flushableBatch and inserted into the queue of 199 // memtables. 200 data []byte 201 alloc []byte 202 isFlush bool 203 204 cmp Compare 205 formatKey base.FormatKey 206 abbreviatedKey AbbreviatedKey 207 208 // An upper bound on required space to add this batch to a memtable. 209 // Note that although batches are limited to 4 GiB in size, that limit 210 // applies to len(data), not the memtable size. The upper bound on the 211 // size of a memtable node is larger than the overhead of the batch's log 212 // encoding, so memTableSize is larger than len(data) and may overflow a 213 // uint32. 214 memTableSize uint64 215 216 // The db to which the batch will be committed. Do not change this field 217 // after the batch has been created as it might invalidate internal state. 218 db *DB 219 220 // The count of records in the batch. This count will be stored in the batch 221 // data whenever Repr() is called. 222 count uint64 223 224 // The count of range deletions in the batch. Updated every time a range 225 // deletion is added. 226 countRangeDels uint64 227 228 // The count of range key sets, unsets and deletes in the batch. Updated 229 // every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added. 230 countRangeKeys uint64 231 232 // A deferredOp struct, stored in the Batch so that a pointer can be returned 233 // from the *Deferred() methods rather than a value. 234 deferredOp DeferredBatchOp 235 236 // An optional skiplist keyed by offset into data of the entry. 237 index *batchskl.Skiplist 238 rangeDelIndex *batchskl.Skiplist 239 rangeKeyIndex *batchskl.Skiplist 240 241 // Fragmented range deletion tombstones. Cached the first time a range 242 // deletion iterator is requested. The cache is invalidated whenever a new 243 // range deletion is added to the batch. This cache can only be used when 244 // opening an iterator to read at a batch sequence number >= 245 // tombstonesSeqNum. This is the case for all new iterators created over a 246 // batch but it's not the case for all cloned iterators. 247 tombstones []keyspan.Span 248 tombstonesSeqNum uint64 249 250 // Fragmented range key spans. Cached the first time a range key iterator is 251 // requested. The cache is invalidated whenever a new range key 252 // (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be 253 // used when opening an iterator to read at a batch sequence number >= 254 // tombstonesSeqNum. This is the case for all new iterators created over a 255 // batch but it's not the case for all cloned iterators. 256 rangeKeys []keyspan.Span 257 rangeKeysSeqNum uint64 258 259 // The flushableBatch wrapper if the batch is too large to fit in the 260 // memtable. 261 flushable *flushableBatch 262 263 commit sync.WaitGroup 264 commitErr error 265 applied uint32 // updated atomically 266 } 267 268 var _ Reader = (*Batch)(nil) 269 var _ Writer = (*Batch)(nil) 270 271 var batchPool = sync.Pool{ 272 New: func() interface{} { 273 return &Batch{} 274 }, 275 } 276 277 type indexedBatch struct { 278 batch Batch 279 index batchskl.Skiplist 280 } 281 282 var indexedBatchPool = sync.Pool{ 283 New: func() interface{} { 284 return &indexedBatch{} 285 }, 286 } 287 288 func newBatch(db *DB) *Batch { 289 b := batchPool.Get().(*Batch) 290 b.db = db 291 b.isFlush = false 292 return b 293 } 294 295 func newFlushBatch(db *DB, n int) *Batch { 296 b := batchPool.Get().(*Batch) 297 b.db = db 298 b.isFlush = true 299 b.alloc = manual.New(n) 300 b.data = b.alloc[:batchHeaderLen] 301 b.setCount(0) 302 b.setSeqNum(0) 303 return b 304 } 305 306 func newIndexedBatch(db *DB, comparer *Comparer) *Batch { 307 i := indexedBatchPool.Get().(*indexedBatch) 308 i.batch.cmp = comparer.Compare 309 i.batch.formatKey = comparer.FormatKey 310 i.batch.abbreviatedKey = comparer.AbbreviatedKey 311 i.batch.db = db 312 i.batch.index = &i.index 313 i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey) 314 i.batch.isFlush = false 315 return &i.batch 316 } 317 318 // nextSeqNum returns the batch "sequence number" that will be given to the next 319 // key written to the batch. During iteration keys within an indexed batch are 320 // given a sequence number consisting of their offset within the batch combined 321 // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only 322 // used during iteration, and the keys are assigned ordinary sequence numbers 323 // when the batch is committed. 324 func (b *Batch) nextSeqNum() uint64 { 325 return uint64(len(b.data)) | base.InternalKeySeqNumBatch 326 } 327 328 func (b *Batch) release() { 329 if b.db == nil { 330 // The batch was not created using newBatch or newIndexedBatch, or an error 331 // was encountered. We don't try to reuse batches that encountered an error 332 // because they might be stuck somewhere in the system and attempting to 333 // reuse such batches is a recipe for onerous debugging sessions. Instead, 334 // let the GC do its job. 335 return 336 } 337 b.db = nil 338 339 // NB: This is ugly (it would be cleaner if we could just assign a Batch{}), 340 // but necessary so that we can use atomic.StoreUint32 for the Batch.applied 341 // field. Without using an atomic to clear that field the Go race detector 342 // complains. 343 b.Reset() 344 b.cmp = nil 345 b.formatKey = nil 346 b.abbreviatedKey = nil 347 b.isFlush = false 348 b.alloc = nil 349 350 if b.index == nil { 351 batchPool.Put(b) 352 } else { 353 b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil 354 indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b))) 355 } 356 } 357 358 func (b *Batch) refreshMemTableSize() { 359 b.memTableSize = 0 360 if len(b.data) < batchHeaderLen { 361 return 362 } 363 364 b.countRangeDels = 0 365 b.countRangeKeys = 0 366 for r := b.Reader(); ; { 367 kind, key, value, ok := r.Next() 368 if !ok { 369 break 370 } 371 b.memTableSize += memTableEntrySize(len(key), len(value)) 372 switch kind { 373 case InternalKeyKindRangeDelete: 374 b.countRangeDels++ 375 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 376 b.countRangeKeys++ 377 } 378 } 379 } 380 381 // Apply the operations contained in the batch to the receiver batch. 382 // 383 // It is safe to modify the contents of the arguments after Apply returns. 384 func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error { 385 if len(batch.data) == 0 { 386 return nil 387 } 388 if len(batch.data) < batchHeaderLen { 389 return base.CorruptionErrorf("bitalostable: invalid batch") 390 } 391 392 offset := len(b.data) 393 if offset == 0 { 394 b.init(offset) 395 offset = batchHeaderLen 396 } 397 b.data = append(b.data, batch.data[batchHeaderLen:]...) 398 399 b.setCount(b.Count() + batch.Count()) 400 401 if b.db != nil || b.index != nil { 402 // Only iterate over the new entries if we need to track memTableSize or in 403 // order to update the index. 404 for iter := BatchReader(b.data[offset:]); len(iter) > 0; { 405 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 406 kind, key, value, ok := iter.Next() 407 if !ok { 408 break 409 } 410 switch kind { 411 case InternalKeyKindRangeDelete: 412 b.countRangeDels++ 413 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 414 b.countRangeKeys++ 415 } 416 if b.index != nil { 417 var err error 418 switch kind { 419 case InternalKeyKindRangeDelete: 420 b.tombstones = nil 421 b.tombstonesSeqNum = 0 422 if b.rangeDelIndex == nil { 423 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 424 } 425 err = b.rangeDelIndex.Add(uint32(offset)) 426 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 427 b.rangeKeys = nil 428 b.rangeKeysSeqNum = 0 429 if b.rangeKeyIndex == nil { 430 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 431 } 432 err = b.rangeKeyIndex.Add(uint32(offset)) 433 default: 434 err = b.index.Add(uint32(offset)) 435 } 436 if err != nil { 437 return err 438 } 439 } 440 b.memTableSize += memTableEntrySize(len(key), len(value)) 441 } 442 } 443 return nil 444 } 445 446 // Get gets the value for the given key. It returns ErrNotFound if the Batch 447 // does not contain the key. 448 // 449 // The caller should not modify the contents of the returned slice, but it is 450 // safe to modify the contents of the argument after Get returns. The returned 451 // slice will remain valid until the returned Closer is closed. On success, the 452 // caller MUST call closer.Close() or a memory leak will occur. 453 func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) { 454 if b.index == nil { 455 return nil, nil, ErrNotIndexed 456 } 457 return b.db.getInternal(key, b, nil /* snapshot */) 458 } 459 460 func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) { 461 if len(b.data) == 0 { 462 b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen) 463 } 464 b.count++ 465 b.memTableSize += memTableEntrySize(keyLen, valueLen) 466 467 pos := len(b.data) 468 b.deferredOp.offset = uint32(pos) 469 b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen) 470 b.data[pos] = byte(kind) 471 pos++ 472 473 { 474 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 475 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 476 // versions show this to not be a performance win. 477 x := uint32(keyLen) 478 for x >= 0x80 { 479 b.data[pos] = byte(x) | 0x80 480 x >>= 7 481 pos++ 482 } 483 b.data[pos] = byte(x) 484 pos++ 485 } 486 487 b.deferredOp.Key = b.data[pos : pos+keyLen] 488 pos += keyLen 489 490 { 491 // TODO(peter): Manually inlined version binary.PutUvarint(). This is 20% 492 // faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future 493 // versions show this to not be a performance win. 494 x := uint32(valueLen) 495 for x >= 0x80 { 496 b.data[pos] = byte(x) | 0x80 497 x >>= 7 498 pos++ 499 } 500 b.data[pos] = byte(x) 501 pos++ 502 } 503 504 b.deferredOp.Value = b.data[pos : pos+valueLen] 505 // Shrink data since varints may be shorter than the upper bound. 506 b.data = b.data[:pos+valueLen] 507 } 508 509 func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) { 510 if len(b.data) == 0 { 511 b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen) 512 } 513 b.count++ 514 b.memTableSize += memTableEntrySize(keyLen, 0) 515 516 pos := len(b.data) 517 b.deferredOp.offset = uint32(pos) 518 b.grow(1 + maxVarintLen32 + keyLen) 519 b.data[pos] = byte(kind) 520 pos++ 521 522 { 523 // TODO(peter): Manually inlined version binary.PutUvarint(). Remove if 524 // go1.13 or future versions show this to not be a performance win. See 525 // BenchmarkBatchSet. 526 x := uint32(keyLen) 527 for x >= 0x80 { 528 b.data[pos] = byte(x) | 0x80 529 x >>= 7 530 pos++ 531 } 532 b.data[pos] = byte(x) 533 pos++ 534 } 535 536 b.deferredOp.Key = b.data[pos : pos+keyLen] 537 b.deferredOp.Value = nil 538 539 // Shrink data since varint may be shorter than the upper bound. 540 b.data = b.data[:pos+keyLen] 541 } 542 543 // Set adds an action to the batch that sets the key to map to the value. 544 // 545 // It is safe to modify the contents of the arguments after Set returns. 546 func (b *Batch) Set(key, value []byte, _ *WriteOptions) error { 547 deferredOp := b.SetDeferred(len(key), len(value)) 548 copy(deferredOp.Key, key) 549 copy(deferredOp.Value, value) 550 return deferredOp.Finish() 551 } 552 553 func (b *Batch) SetMultiValue(key []byte, values ...[]byte) error { 554 var valueLen int 555 for i := range values { 556 valueLen += len(values[i]) 557 } 558 deferredOp := b.SetDeferred(len(key), valueLen) 559 copy(deferredOp.Key, key) 560 pos := 0 561 for j := range values { 562 pos += copy(deferredOp.Value[pos:], values[j]) 563 } 564 return deferredOp.Finish() 565 } 566 567 // SetDeferred is similar to Set in that it adds a set operation to the batch, 568 // except it only takes in key/value lengths instead of complete slices, 569 // letting the caller encode into those objects and then call Finish() on the 570 // returned object. 571 func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp { 572 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet) 573 b.deferredOp.index = b.index 574 return &b.deferredOp 575 } 576 577 // Merge adds an action to the batch that merges the value at key with the new 578 // value. The details of the merge are dependent upon the configured merge 579 // operator. 580 // 581 // It is safe to modify the contents of the arguments after Merge returns. 582 func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error { 583 deferredOp := b.MergeDeferred(len(key), len(value)) 584 copy(deferredOp.Key, key) 585 copy(deferredOp.Value, value) 586 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 587 // in go1.13 will remove the need for this. 588 if b.index != nil { 589 if err := b.index.Add(deferredOp.offset); err != nil { 590 return err 591 } 592 } 593 return nil 594 } 595 596 // MergeDeferred is similar to Merge in that it adds a merge operation to the 597 // batch, except it only takes in key/value lengths instead of complete slices, 598 // letting the caller encode into those objects and then call Finish() on the 599 // returned object. 600 func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp { 601 b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge) 602 b.deferredOp.index = b.index 603 return &b.deferredOp 604 } 605 606 // Delete adds an action to the batch that deletes the entry for key. 607 // 608 // It is safe to modify the contents of the arguments after Delete returns. 609 func (b *Batch) Delete(key []byte, _ *WriteOptions) error { 610 deferredOp := b.DeleteDeferred(len(key)) 611 copy(deferredOp.Key, key) 612 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 613 // in go1.13 will remove the need for this. 614 if b.index != nil { 615 if err := b.index.Add(deferredOp.offset); err != nil { 616 return err 617 } 618 } 619 return nil 620 } 621 622 // DeleteDeferred is similar to Delete in that it adds a delete operation to 623 // the batch, except it only takes in key/value lengths instead of complete 624 // slices, letting the caller encode into those objects and then call Finish() 625 // on the returned object. 626 func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp { 627 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete) 628 b.deferredOp.index = b.index 629 return &b.deferredOp 630 } 631 632 // SingleDelete adds an action to the batch that single deletes the entry for key. 633 // See Writer.SingleDelete for more details on the semantics of SingleDelete. 634 // 635 // It is safe to modify the contents of the arguments after SingleDelete returns. 636 func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error { 637 deferredOp := b.SingleDeleteDeferred(len(key)) 638 copy(deferredOp.Key, key) 639 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 640 // in go1.13 will remove the need for this. 641 if b.index != nil { 642 if err := b.index.Add(deferredOp.offset); err != nil { 643 return err 644 } 645 } 646 return nil 647 } 648 649 // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete 650 // operation to the batch, except it only takes in key/value lengths instead of 651 // complete slices, letting the caller encode into those objects and then call 652 // Finish() on the returned object. 653 func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp { 654 b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete) 655 b.deferredOp.index = b.index 656 return &b.deferredOp 657 } 658 659 // DeleteRange deletes all of the point keys (and values) in the range 660 // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT 661 // delete overlapping range keys (eg, keys set via RangeKeySet). 662 // 663 // It is safe to modify the contents of the arguments after DeleteRange 664 // returns. 665 func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error { 666 deferredOp := b.DeleteRangeDeferred(len(start), len(end)) 667 copy(deferredOp.Key, start) 668 copy(deferredOp.Value, end) 669 // TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining 670 // in go1.13 will remove the need for this. 671 if deferredOp.index != nil { 672 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 673 return err 674 } 675 } 676 return nil 677 } 678 679 // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range 680 // operation to the batch, except it only takes in key lengths instead of 681 // complete slices, letting the caller encode into those objects and then call 682 // Finish() on the returned object. Note that DeferredBatchOp.Key should be 683 // populated with the start key, and DeferredBatchOp.Value should be populated 684 // with the end key. 685 func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp { 686 b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete) 687 b.countRangeDels++ 688 if b.index != nil { 689 b.tombstones = nil 690 b.tombstonesSeqNum = 0 691 // Range deletions are rare, so we lazily allocate the index for them. 692 if b.rangeDelIndex == nil { 693 b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 694 } 695 b.deferredOp.index = b.rangeDelIndex 696 } 697 return &b.deferredOp 698 } 699 700 // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC 701 // timestamp suffix to value. The suffix is optional. If any portion of the key 702 // range [start, end) is already set by a range key with the same suffix value, 703 // RangeKeySet overrides it. 704 // 705 // It is safe to modify the contents of the arguments after RangeKeySet returns. 706 func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error { 707 suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}} 708 internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:]) 709 710 deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen) 711 copy(deferredOp.Key, start) 712 n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:]) 713 if n != internalValueLen { 714 panic("unexpected internal value length mismatch") 715 } 716 717 // Manually inline DeferredBatchOp.Finish(). 718 if deferredOp.index != nil { 719 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 720 return err 721 } 722 } 723 return nil 724 } 725 726 func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 727 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet) 728 b.incrementRangeKeysCount() 729 return &b.deferredOp 730 } 731 732 func (b *Batch) incrementRangeKeysCount() { 733 b.countRangeKeys++ 734 if b.index != nil { 735 b.rangeKeys = nil 736 b.rangeKeysSeqNum = 0 737 // Range keys are rare, so we lazily allocate the index for them. 738 if b.rangeKeyIndex == nil { 739 b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey) 740 } 741 b.deferredOp.index = b.rangeKeyIndex 742 } 743 } 744 745 // RangeKeyUnset removes a range key mapping the key range [start, end) at the 746 // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed 747 // range key. RangeKeyUnset only removes portions of range keys that fall within 748 // the [start, end) key span, and only range keys with suffixes that exactly 749 // match the unset suffix. 750 // 751 // It is safe to modify the contents of the arguments after RangeKeyUnset 752 // returns. 753 func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error { 754 suffixes := [1][]byte{suffix} 755 internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:]) 756 757 deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen) 758 copy(deferredOp.Key, start) 759 n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:]) 760 if n != internalValueLen { 761 panic("unexpected internal value length mismatch") 762 } 763 764 // Manually inline DeferredBatchOp.Finish() 765 if deferredOp.index != nil { 766 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 767 return err 768 } 769 } 770 return nil 771 } 772 773 func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp { 774 b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset) 775 b.incrementRangeKeysCount() 776 return &b.deferredOp 777 } 778 779 // RangeKeyDelete deletes all of the range keys in the range [start,end) 780 // (inclusive on start, exclusive on end). It does not delete point keys (for 781 // that use DeleteRange). RangeKeyDelete removes all range keys within the 782 // bounds, including those with or without suffixes. 783 // 784 // It is safe to modify the contents of the arguments after RangeKeyDelete 785 // returns. 786 func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error { 787 deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end)) 788 copy(deferredOp.Key, start) 789 copy(deferredOp.Value, end) 790 // Manually inline DeferredBatchOp.Finish(). 791 if deferredOp.index != nil { 792 if err := deferredOp.index.Add(deferredOp.offset); err != nil { 793 return err 794 } 795 } 796 return nil 797 } 798 799 // RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an 800 // operation to delete range keys to the batch, except it only takes in key 801 // lengths instead of complete slices, letting the caller encode into those 802 // objects and then call Finish() on the returned object. Note that 803 // DeferredBatchOp.Key should be populated with the start key, and 804 // DeferredBatchOp.Value should be populated with the end key. 805 func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp { 806 b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete) 807 b.incrementRangeKeysCount() 808 return &b.deferredOp 809 } 810 811 // LogData adds the specified to the batch. The data will be written to the 812 // WAL, but not added to memtables or sstables. Log data is never indexed, 813 // which makes it useful for testing WAL performance. 814 // 815 // It is safe to modify the contents of the argument after LogData returns. 816 func (b *Batch) LogData(data []byte, _ *WriteOptions) error { 817 origCount, origMemTableSize := b.count, b.memTableSize 818 b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData) 819 copy(b.deferredOp.Key, data) 820 // Since LogData only writes to the WAL and does not affect the memtable, we 821 // restore b.count and b.memTableSize to their origin values. Note that 822 // Batch.count only refers to records that are added to the memtable. 823 b.count, b.memTableSize = origCount, origMemTableSize 824 return nil 825 } 826 827 // Empty returns true if the batch is empty, and false otherwise. 828 func (b *Batch) Empty() bool { 829 return len(b.data) <= batchHeaderLen 830 } 831 832 // Len returns the current size of the batch in bytes. 833 func (b *Batch) Len() int { 834 if len(b.data) <= batchHeaderLen { 835 return batchHeaderLen 836 } 837 return len(b.data) 838 } 839 840 // Repr returns the underlying batch representation. It is not safe to modify 841 // the contents. Reset() will not change the contents of the returned value, 842 // though any other mutation operation may do so. 843 func (b *Batch) Repr() []byte { 844 if len(b.data) == 0 { 845 b.init(batchHeaderLen) 846 } 847 binary.LittleEndian.PutUint32(b.countData(), b.Count()) 848 return b.data 849 } 850 851 // SetRepr sets the underlying batch representation. The batch takes ownership 852 // of the supplied slice. It is not safe to modify it afterwards until the 853 // Batch is no longer in use. 854 func (b *Batch) SetRepr(data []byte) error { 855 if len(data) < batchHeaderLen { 856 return base.CorruptionErrorf("invalid batch") 857 } 858 b.data = data 859 b.count = uint64(binary.LittleEndian.Uint32(b.countData())) 860 if b.db != nil { 861 // Only track memTableSize for batches that will be committed to the DB. 862 b.refreshMemTableSize() 863 } 864 return nil 865 } 866 867 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 868 // return false). The iterator can be positioned via a call to SeekGE, 869 // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators. 870 // 871 // The returned Iterator observes all of the Batch's existing mutations, but no 872 // later mutations. Its view can be refreshed via RefreshBatchSnapshot or 873 // SetOptions(). 874 func (b *Batch) NewIter(o *IterOptions) *Iterator { 875 if b.index == nil { 876 return &Iterator{err: ErrNotIndexed} 877 } 878 return b.db.newIterInternal(b, nil /* snapshot */, o) 879 } 880 881 // newInternalIter creates a new internalIterator that iterates over the 882 // contents of the batch. 883 func (b *Batch) newInternalIter(o *IterOptions) *batchIter { 884 iter := &batchIter{} 885 b.initInternalIter(o, iter, b.nextSeqNum()) 886 return iter 887 } 888 889 func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter, batchSnapshot uint64) { 890 *iter = batchIter{ 891 cmp: b.cmp, 892 batch: b, 893 iter: b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()), 894 snapshot: batchSnapshot, 895 } 896 } 897 898 func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { 899 // Construct an iterator even if rangeDelIndex is nil, because it is allowed 900 // to refresh later, so we need the container to exist. 901 iter := new(keyspan.Iter) 902 b.initRangeDelIter(o, iter, batchSnapshot) 903 return iter 904 } 905 906 func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { 907 if b.rangeDelIndex == nil { 908 iter.Init(b.cmp, nil) 909 return 910 } 911 912 // Fragment the range tombstones the first time a range deletion iterator is 913 // requested. The cached tombstones are invalidated if another range 914 // deletion tombstone is added to the batch. This cache is only guaranteed 915 // to be correct if we're opening an iterator to read at a batch sequence 916 // number at least as high as tombstonesSeqNum. The cache is guaranteed to 917 // include all tombstones up to tombstonesSeqNum, and if any additional 918 // tombstones were added after that sequence number the cache would've been 919 // cleared. 920 nextSeqNum := b.nextSeqNum() 921 if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot { 922 iter.Init(b.cmp, b.tombstones) 923 return 924 } 925 926 tombstones := make([]keyspan.Span, 0, b.countRangeDels) 927 frag := &keyspan.Fragmenter{ 928 Cmp: b.cmp, 929 Format: b.formatKey, 930 Emit: func(s keyspan.Span) { 931 tombstones = append(tombstones, s) 932 }, 933 } 934 it := &batchIter{ 935 cmp: b.cmp, 936 batch: b, 937 iter: b.rangeDelIndex.NewIter(nil, nil), 938 snapshot: batchSnapshot, 939 } 940 fragmentRangeDels(frag, it, int(b.countRangeDels)) 941 iter.Init(b.cmp, tombstones) 942 943 // If we just read all the tombstones in the batch (eg, batchSnapshot was 944 // set to b.nextSeqNum()), then cache the tombstones so that a subsequent 945 // call to initRangeDelIter may use them without refragmenting. 946 if nextSeqNum == batchSnapshot { 947 b.tombstones = tombstones 948 b.tombstonesSeqNum = nextSeqNum 949 } 950 } 951 952 func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) { 953 // The memory management here is a bit subtle. The keys and values returned 954 // by the iterator are slices in Batch.data. Thus the fragmented tombstones 955 // are slices within Batch.data. If additional entries are added to the 956 // Batch, Batch.data may be reallocated. The references in the fragmented 957 // tombstones will remain valid, pointing into the old Batch.data. GC for 958 // the win. 959 960 // Use a single []keyspan.Key buffer to avoid allocating many 961 // individual []keyspan.Key slices with a single element each. 962 keyBuf := make([]keyspan.Key, 0, count) 963 for key, val := it.First(); key != nil; key, val = it.Next() { 964 s := rangedel.Decode(*key, val, keyBuf) 965 keyBuf = s.Keys[len(s.Keys):] 966 967 // Set a fixed capacity to avoid accidental overwriting. 968 s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] 969 frag.Add(s) 970 } 971 frag.Finish() 972 } 973 974 func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter { 975 // Construct an iterator even if rangeKeyIndex is nil, because it is allowed 976 // to refresh later, so we need the container to exist. 977 iter := new(keyspan.Iter) 978 b.initRangeKeyIter(o, iter, batchSnapshot) 979 return iter 980 } 981 982 func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) { 983 if b.rangeKeyIndex == nil { 984 iter.Init(b.cmp, nil) 985 return 986 } 987 988 // Fragment the range keys the first time a range key iterator is requested. 989 // The cached spans are invalidated if another range key is added to the 990 // batch. This cache is only guaranteed to be correct if we're opening an 991 // iterator to read at a batch sequence number at least as high as 992 // rangeKeysSeqNum. The cache is guaranteed to include all range keys up to 993 // rangeKeysSeqNum, and if any additional range keys were added after that 994 // sequence number the cache would've been cleared. 995 nextSeqNum := b.nextSeqNum() 996 if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot { 997 iter.Init(b.cmp, b.rangeKeys) 998 return 999 } 1000 1001 rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys) 1002 frag := &keyspan.Fragmenter{ 1003 Cmp: b.cmp, 1004 Format: b.formatKey, 1005 Emit: func(s keyspan.Span) { 1006 rangeKeys = append(rangeKeys, s) 1007 }, 1008 } 1009 it := &batchIter{ 1010 cmp: b.cmp, 1011 batch: b, 1012 iter: b.rangeKeyIndex.NewIter(nil, nil), 1013 snapshot: batchSnapshot, 1014 } 1015 fragmentRangeKeys(frag, it, int(b.countRangeKeys)) 1016 iter.Init(b.cmp, rangeKeys) 1017 1018 // If we just read all the range keys in the batch (eg, batchSnapshot was 1019 // set to b.nextSeqNum()), then cache the range keys so that a subsequent 1020 // call to initRangeKeyIter may use them without refragmenting. 1021 if nextSeqNum == batchSnapshot { 1022 b.rangeKeys = rangeKeys 1023 b.rangeKeysSeqNum = nextSeqNum 1024 } 1025 } 1026 1027 func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error { 1028 // The memory management here is a bit subtle. The keys and values 1029 // returned by the iterator are slices in Batch.data. Thus the 1030 // fragmented key spans are slices within Batch.data. If additional 1031 // entries are added to the Batch, Batch.data may be reallocated. The 1032 // references in the fragmented keys will remain valid, pointing into 1033 // the old Batch.data. GC for the win. 1034 1035 // Use a single []keyspan.Key buffer to avoid allocating many 1036 // individual []keyspan.Key slices with a single element each. 1037 keyBuf := make([]keyspan.Key, 0, count) 1038 for ik, val := it.First(); ik != nil; ik, val = it.Next() { 1039 s, err := rangekey.Decode(*ik, val, keyBuf) 1040 if err != nil { 1041 return err 1042 } 1043 keyBuf = s.Keys[len(s.Keys):] 1044 1045 // Set a fixed capacity to avoid accidental overwriting. 1046 s.Keys = s.Keys[:len(s.Keys):len(s.Keys)] 1047 frag.Add(s) 1048 } 1049 frag.Finish() 1050 return nil 1051 } 1052 1053 // Commit applies the batch to its parent writer. 1054 func (b *Batch) Commit(o *WriteOptions) error { 1055 return b.db.Apply(b, o) 1056 } 1057 1058 // Close closes the batch without committing it. 1059 func (b *Batch) Close() error { 1060 b.release() 1061 return nil 1062 } 1063 1064 func (b *Batch) AllocFree() { 1065 if b.alloc != nil { 1066 manual.Free(b.alloc) 1067 } 1068 b.alloc = nil 1069 } 1070 1071 // Indexed returns true if the batch is indexed (i.e. supports read 1072 // operations). 1073 func (b *Batch) Indexed() bool { 1074 return b.index != nil 1075 } 1076 1077 func (b *Batch) init(cap int) { 1078 n := batchInitialSize 1079 for n < cap { 1080 n *= 2 1081 } 1082 b.data = rawalloc.New(batchHeaderLen, n) 1083 b.setCount(0) 1084 b.setSeqNum(0) 1085 b.data = b.data[:batchHeaderLen] 1086 } 1087 1088 // Reset resets the batch for reuse. The underlying byte slice (that is 1089 // returned by Repr()) is not modified. It is only necessary to call this 1090 // method if a batch is explicitly being reused. Close automatically takes are 1091 // of releasing resources when appropriate for batches that are internally 1092 // being reused. 1093 func (b *Batch) Reset() { 1094 b.count = 0 1095 b.countRangeDels = 0 1096 b.countRangeKeys = 0 1097 b.memTableSize = 0 1098 b.deferredOp = DeferredBatchOp{} 1099 b.tombstones = nil 1100 b.tombstonesSeqNum = 0 1101 b.rangeKeys = nil 1102 b.rangeKeysSeqNum = 0 1103 b.flushable = nil 1104 b.commit = sync.WaitGroup{} 1105 b.commitErr = nil 1106 atomic.StoreUint32(&b.applied, 0) 1107 if b.data != nil { 1108 if cap(b.data) > batchMaxRetainedSize || b.isFlush { 1109 // If the capacity of the buffer is larger than our maximum 1110 // retention size, don't re-use it. Let it be GC-ed instead. 1111 // This prevents the memory from an unusually large batch from 1112 // being held on to indefinitely. 1113 b.data = nil 1114 } else { 1115 // Otherwise, reset the buffer for re-use. 1116 b.data = b.data[:batchHeaderLen] 1117 b.setSeqNum(0) 1118 } 1119 } 1120 if b.index != nil { 1121 b.index.Init(&b.data, b.cmp, b.abbreviatedKey) 1122 b.rangeDelIndex = nil 1123 b.rangeKeyIndex = nil 1124 } 1125 } 1126 1127 // seqNumData returns the 8 byte little-endian sequence number. Zero means that 1128 // the batch has not yet been applied. 1129 func (b *Batch) seqNumData() []byte { 1130 return b.data[:8] 1131 } 1132 1133 // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff" 1134 // means that the batch is invalid. 1135 func (b *Batch) countData() []byte { 1136 return b.data[8:12] 1137 } 1138 1139 func (b *Batch) grow(n int) { 1140 newSize := len(b.data) + n 1141 if uint64(newSize) >= maxBatchSize { 1142 panic(ErrBatchTooLarge) 1143 } 1144 if newSize > cap(b.data) { 1145 newCap := 2 * cap(b.data) 1146 for newCap < newSize { 1147 newCap *= 2 1148 } 1149 newData := rawalloc.New(len(b.data), newCap) 1150 copy(newData, b.data) 1151 b.data = newData 1152 } 1153 b.data = b.data[:newSize] 1154 } 1155 1156 func (b *Batch) setSeqNum(seqNum uint64) { 1157 binary.LittleEndian.PutUint64(b.seqNumData(), seqNum) 1158 } 1159 1160 // SeqNum returns the batch sequence number which is applied to the first 1161 // record in the batch. The sequence number is incremented for each subsequent 1162 // record. It returns zero if the batch is empty. 1163 func (b *Batch) SeqNum() uint64 { 1164 if len(b.data) == 0 { 1165 b.init(batchHeaderLen) 1166 } 1167 return binary.LittleEndian.Uint64(b.seqNumData()) 1168 } 1169 1170 func (b *Batch) setCount(v uint32) { 1171 b.count = uint64(v) 1172 } 1173 1174 // Count returns the count of memtable-modifying operations in this batch. All 1175 // operations with the except of LogData increment this count. 1176 func (b *Batch) Count() uint32 { 1177 if b.count > math.MaxUint32 { 1178 panic(ErrInvalidBatch) 1179 } 1180 return uint32(b.count) 1181 } 1182 1183 // Reader returns a BatchReader for the current batch contents. If the batch is 1184 // mutated, the new entries will not be visible to the reader. 1185 func (b *Batch) Reader() BatchReader { 1186 if len(b.data) == 0 { 1187 b.init(batchHeaderLen) 1188 } 1189 return b.data[batchHeaderLen:] 1190 } 1191 1192 func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) { 1193 var v uint32 1194 var n int 1195 ptr := unsafe.Pointer(&data[0]) 1196 if a := *((*uint8)(ptr)); a < 128 { 1197 v = uint32(a) 1198 n = 1 1199 } else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 { 1200 v = uint32(b)<<7 | uint32(a) 1201 n = 2 1202 } else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 { 1203 v = uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1204 n = 3 1205 } else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 { 1206 v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1207 n = 4 1208 } else { 1209 d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4))) 1210 v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a) 1211 n = 5 1212 } 1213 1214 data = data[n:] 1215 if v > uint32(len(data)) { 1216 return nil, nil, false 1217 } 1218 return data[v:], data[:v], true 1219 } 1220 1221 // BatchReader iterates over the entries contained in a batch. 1222 type BatchReader []byte 1223 1224 // ReadBatch constructs a BatchReader from a batch representation. The 1225 // header is not validated. ReadBatch returns a new batch reader and the 1226 // count of entries contained within the batch. 1227 func ReadBatch(repr []byte) (r BatchReader, count uint32) { 1228 if len(repr) <= batchHeaderLen { 1229 return nil, count 1230 } 1231 count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen]) 1232 return repr[batchHeaderLen:], count 1233 } 1234 1235 // Next returns the next entry in this batch. The final return value is false 1236 // if the batch is corrupt. The end of batch is reached when len(r)==0. 1237 func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool) { 1238 if len(*r) == 0 { 1239 return 0, nil, nil, false 1240 } 1241 kind = InternalKeyKind((*r)[0]) 1242 if kind > InternalKeyKindMax { 1243 return 0, nil, nil, false 1244 } 1245 *r, ukey, ok = batchDecodeStr((*r)[1:]) 1246 if !ok { 1247 return 0, nil, nil, false 1248 } 1249 switch kind { 1250 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1251 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1252 *r, value, ok = batchDecodeStr(*r) 1253 if !ok { 1254 return 0, nil, nil, false 1255 } 1256 } 1257 return kind, ukey, value, true 1258 } 1259 1260 // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the 1261 // two in sync. 1262 type batchIter struct { 1263 cmp Compare 1264 batch *Batch 1265 iter batchskl.Iterator 1266 err error 1267 // snapshot holds a batch "sequence number" at which the batch is being 1268 // read. This sequence number has the InternalKeySeqNumBatch bit set, so it 1269 // encodes an offset within the batch. Only batch entries earlier than the 1270 // offset are visible during iteration. 1271 snapshot uint64 1272 } 1273 1274 // batchIter implements the base.InternalIterator interface. 1275 var _ base.InternalIterator = (*batchIter)(nil) 1276 1277 func (i *batchIter) String() string { 1278 return "batch" 1279 } 1280 1281 func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 1282 // Ignore trySeekUsingNext since the batch may have changed, so using Next 1283 // would be incorrect. 1284 i.err = nil 1285 ikey := i.iter.SeekGE(key) 1286 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1287 ikey = i.iter.Next() 1288 } 1289 if ikey == nil { 1290 return nil, nil 1291 } 1292 return ikey, i.Value() 1293 } 1294 1295 func (i *batchIter) SeekPrefixGE( 1296 prefix, key []byte, flags base.SeekGEFlags, 1297 ) (*base.InternalKey, []byte) { 1298 i.err = nil 1299 return i.SeekGE(key, flags) 1300 } 1301 1302 func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 1303 i.err = nil 1304 ikey := i.iter.SeekLT(key) 1305 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1306 ikey = i.iter.Prev() 1307 } 1308 if ikey == nil { 1309 return nil, nil 1310 } 1311 return ikey, i.Value() 1312 } 1313 1314 func (i *batchIter) First() (*InternalKey, []byte) { 1315 i.err = nil 1316 ikey := i.iter.First() 1317 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1318 ikey = i.iter.Next() 1319 } 1320 if ikey == nil { 1321 return nil, nil 1322 } 1323 return ikey, i.Value() 1324 } 1325 1326 func (i *batchIter) Last() (*InternalKey, []byte) { 1327 i.err = nil 1328 ikey := i.iter.Last() 1329 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1330 ikey = i.iter.Prev() 1331 } 1332 if ikey == nil { 1333 return nil, nil 1334 } 1335 return ikey, i.Value() 1336 } 1337 1338 func (i *batchIter) Next() (*InternalKey, []byte) { 1339 ikey := i.iter.Next() 1340 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1341 ikey = i.iter.Next() 1342 } 1343 if ikey == nil { 1344 return nil, nil 1345 } 1346 return ikey, i.Value() 1347 } 1348 1349 func (i *batchIter) Prev() (*InternalKey, []byte) { 1350 ikey := i.iter.Prev() 1351 for ikey != nil && ikey.SeqNum() >= i.snapshot { 1352 ikey = i.iter.Prev() 1353 } 1354 if ikey == nil { 1355 return nil, nil 1356 } 1357 return ikey, i.Value() 1358 } 1359 1360 func (i *batchIter) Key() *InternalKey { 1361 return i.iter.Key() 1362 } 1363 1364 func (i *batchIter) Value() []byte { 1365 offset, _, keyEnd := i.iter.KeyInfo() 1366 data := i.batch.data 1367 if len(data[offset:]) == 0 { 1368 i.err = base.CorruptionErrorf("corrupted batch") 1369 return nil 1370 } 1371 1372 switch InternalKeyKind(data[offset]) { 1373 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1374 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1375 _, value, ok := batchDecodeStr(data[keyEnd:]) 1376 if !ok { 1377 return nil 1378 } 1379 return value 1380 default: 1381 return nil 1382 } 1383 } 1384 1385 func (i *batchIter) Valid() bool { 1386 return i.iter.Valid() 1387 } 1388 1389 func (i *batchIter) Error() error { 1390 return i.err 1391 } 1392 1393 func (i *batchIter) Close() error { 1394 _ = i.iter.Close() 1395 return i.err 1396 } 1397 1398 func (i *batchIter) SetBounds(lower, upper []byte) { 1399 i.iter.SetBounds(lower, upper) 1400 } 1401 1402 type flushableBatchEntry struct { 1403 // offset is the byte offset of the record within the batch repr. 1404 offset uint32 1405 // index is the 0-based ordinal number of the record within the batch. Used 1406 // to compute the seqnum for the record. 1407 index uint32 1408 // key{Start,End} are the start and end byte offsets of the key within the 1409 // batch repr. Cached to avoid decoding the key length on every 1410 // comparison. The value is stored starting at keyEnd. 1411 keyStart uint32 1412 keyEnd uint32 1413 } 1414 1415 // flushableBatch wraps an existing batch and provides the interfaces needed 1416 // for making the batch flushable (i.e. able to mimic a memtable). 1417 type flushableBatch struct { 1418 cmp Compare 1419 formatKey base.FormatKey 1420 data []byte 1421 1422 // The base sequence number for the entries in the batch. This is the same 1423 // value as Batch.seqNum() and is cached here for performance. 1424 seqNum uint64 1425 1426 // A slice of offsets and indices for the entries in the batch. Used to 1427 // implement flushableBatchIter. Unlike the indexing on a normal batch, a 1428 // flushable batch is indexed such that batch entry i will be given the 1429 // sequence number flushableBatch.seqNum+i. 1430 // 1431 // Sorted in increasing order of key and decreasing order of offset (since 1432 // higher offsets correspond to higher sequence numbers). 1433 // 1434 // Does not include range deletion entries or range key entries. 1435 offsets []flushableBatchEntry 1436 offsetsArena *arena.Arena 1437 1438 // Fragmented range deletion tombstones. 1439 tombstones []keyspan.Span 1440 1441 // Fragmented range keys. 1442 rangeKeys []keyspan.Span 1443 } 1444 1445 var _ flushable = (*flushableBatch)(nil) 1446 1447 // newFlushableBatch creates a new batch that implements the flushable 1448 // interface. This allows the batch to act like a memtable and be placed in the 1449 // queue of flushable memtables. Note that the flushable batch takes ownership 1450 // of the batch data. 1451 func newFlushableBatch(batch *Batch, comparer *Comparer) *flushableBatch { 1452 b := &flushableBatch{ 1453 data: batch.data, 1454 cmp: comparer.Compare, 1455 formatKey: comparer.FormatKey, 1456 offsetsArena: arena.NewArena(), 1457 } 1458 b.offsets = arena.MakeSlice[flushableBatchEntry](b.offsetsArena, 0, int(batch.Count())) 1459 if b.data != nil { 1460 // Note that this sequence number is not correct when this batch has not 1461 // been applied since the sequence number has not been assigned yet. The 1462 // correct sequence number will be set later. But it is correct when the 1463 // batch is being replayed from the WAL. 1464 b.seqNum = batch.SeqNum() 1465 } 1466 var rangeDelOffsets []flushableBatchEntry 1467 var rangeKeyOffsets []flushableBatchEntry 1468 if len(b.data) > batchHeaderLen { 1469 // Non-empty batch. 1470 var index uint32 1471 for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ { 1472 offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0])) 1473 kind, key, _, ok := iter.Next() 1474 if !ok { 1475 break 1476 } 1477 entry := flushableBatchEntry{ 1478 offset: uint32(offset), 1479 index: uint32(index), 1480 } 1481 if keySize := uint32(len(key)); keySize == 0 { 1482 // Must add 2 to the offset. One byte encodes `kind` and the next 1483 // byte encodes `0`, which is the length of the key. 1484 entry.keyStart = uint32(offset) + 2 1485 entry.keyEnd = entry.keyStart 1486 } else { 1487 entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) - 1488 uintptr(unsafe.Pointer(&b.data[0]))) 1489 entry.keyEnd = entry.keyStart + keySize 1490 } 1491 switch kind { 1492 case InternalKeyKindRangeDelete: 1493 rangeDelOffsets = append(rangeDelOffsets, entry) 1494 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1495 rangeKeyOffsets = append(rangeKeyOffsets, entry) 1496 default: 1497 b.offsets = append(b.offsets, entry) 1498 } 1499 } 1500 } 1501 1502 // Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using *batch's 1503 // sort.Interface implementation. 1504 pointOffsets := b.offsets 1505 sort.Sort(b) 1506 b.offsets = rangeDelOffsets 1507 sort.Sort(b) 1508 b.offsets = rangeKeyOffsets 1509 sort.Sort(b) 1510 b.offsets = pointOffsets 1511 1512 if len(rangeDelOffsets) > 0 { 1513 frag := &keyspan.Fragmenter{ 1514 Cmp: b.cmp, 1515 Format: b.formatKey, 1516 Emit: func(s keyspan.Span) { 1517 b.tombstones = append(b.tombstones, s) 1518 }, 1519 } 1520 it := &flushableBatchIter{ 1521 batch: b, 1522 data: b.data, 1523 offsets: rangeDelOffsets, 1524 cmp: b.cmp, 1525 index: -1, 1526 } 1527 fragmentRangeDels(frag, it, len(rangeDelOffsets)) 1528 } 1529 if len(rangeKeyOffsets) > 0 { 1530 frag := &keyspan.Fragmenter{ 1531 Cmp: b.cmp, 1532 Format: b.formatKey, 1533 Emit: func(s keyspan.Span) { 1534 b.rangeKeys = append(b.rangeKeys, s) 1535 }, 1536 } 1537 it := &flushableBatchIter{ 1538 batch: b, 1539 data: b.data, 1540 offsets: rangeKeyOffsets, 1541 cmp: b.cmp, 1542 index: -1, 1543 } 1544 fragmentRangeKeys(frag, it, len(rangeKeyOffsets)) 1545 } 1546 return b 1547 } 1548 1549 func (b *flushableBatch) release() { 1550 b.offsetsArena.Free() 1551 b.data = nil 1552 b.offsetsArena = nil 1553 b.offsets = nil 1554 b.tombstones = nil 1555 b.rangeKeys = nil 1556 } 1557 1558 func (b *flushableBatch) setSeqNum(seqNum uint64) { 1559 if b.seqNum != 0 { 1560 panic(fmt.Sprintf("bitalostable: flushableBatch.seqNum already set: %d", b.seqNum)) 1561 } 1562 b.seqNum = seqNum 1563 for i := range b.tombstones { 1564 for j := range b.tombstones[i].Keys { 1565 b.tombstones[i].Keys[j].Trailer = base.MakeTrailer( 1566 b.tombstones[i].Keys[j].SeqNum()+seqNum, 1567 b.tombstones[i].Keys[j].Kind(), 1568 ) 1569 } 1570 } 1571 for i := range b.rangeKeys { 1572 for j := range b.rangeKeys[i].Keys { 1573 b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer( 1574 b.rangeKeys[i].Keys[j].SeqNum()+seqNum, 1575 b.rangeKeys[i].Keys[j].Kind(), 1576 ) 1577 } 1578 } 1579 } 1580 1581 func (b *flushableBatch) Len() int { 1582 return len(b.offsets) 1583 } 1584 1585 func (b *flushableBatch) Less(i, j int) bool { 1586 ei := &b.offsets[i] 1587 ej := &b.offsets[j] 1588 ki := b.data[ei.keyStart:ei.keyEnd] 1589 kj := b.data[ej.keyStart:ej.keyEnd] 1590 switch c := b.cmp(ki, kj); { 1591 case c < 0: 1592 return true 1593 case c > 0: 1594 return false 1595 default: 1596 return ei.offset > ej.offset 1597 } 1598 } 1599 1600 func (b *flushableBatch) Swap(i, j int) { 1601 b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i] 1602 } 1603 1604 func (b *flushableBatch) newIter(o *IterOptions) internalIterator { 1605 return &flushableBatchIter{ 1606 batch: b, 1607 data: b.data, 1608 offsets: b.offsets, 1609 cmp: b.cmp, 1610 index: -1, 1611 lower: o.GetLowerBound(), 1612 upper: o.GetUpperBound(), 1613 } 1614 } 1615 1616 func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator { 1617 return &flushFlushableBatchIter{ 1618 flushableBatchIter: flushableBatchIter{ 1619 batch: b, 1620 data: b.data, 1621 offsets: b.offsets, 1622 cmp: b.cmp, 1623 index: -1, 1624 }, 1625 bytesIterated: bytesFlushed, 1626 } 1627 } 1628 1629 func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator { 1630 if len(b.tombstones) == 0 { 1631 return nil 1632 } 1633 return keyspan.NewIter(b.cmp, b.tombstones) 1634 } 1635 1636 func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator { 1637 if len(b.rangeKeys) == 0 { 1638 return nil 1639 } 1640 return keyspan.NewIter(b.cmp, b.rangeKeys) 1641 } 1642 1643 func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 } 1644 1645 func (b *flushableBatch) inuseBytes() uint64 { 1646 return uint64(len(b.data) - batchHeaderLen) 1647 } 1648 1649 func (b *flushableBatch) totalBytes() uint64 { 1650 return uint64(cap(b.data)) 1651 } 1652 1653 func (b *flushableBatch) readyForFlush() bool { 1654 return true 1655 } 1656 1657 // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the 1658 // two in sync. 1659 type flushableBatchIter struct { 1660 // Members to be initialized by creator. 1661 batch *flushableBatch 1662 // The bytes backing the batch. Always the same as batch.data? 1663 data []byte 1664 // The sorted entries. This is not always equal to batch.offsets. 1665 offsets []flushableBatchEntry 1666 cmp Compare 1667 // Must be initialized to -1. It is the index into offsets that represents 1668 // the current iterator position. 1669 index int 1670 1671 // For internal use by the implementation. 1672 key InternalKey 1673 err error 1674 1675 // Optionally initialize to bounds of iteration, if any. 1676 lower []byte 1677 upper []byte 1678 } 1679 1680 // flushableBatchIter implements the base.InternalIterator interface. 1681 var _ base.InternalIterator = (*flushableBatchIter)(nil) 1682 1683 func (i *flushableBatchIter) String() string { 1684 return "flushable-batch" 1685 } 1686 1687 // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable 1688 // package. Ignore flags.TrySeekUsingNext() since we don't expect this 1689 // optimization to provide much benefit here at the moment. 1690 func (i *flushableBatchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 1691 i.err = nil 1692 ikey := base.MakeSearchKey(key) 1693 i.index = sort.Search(len(i.offsets), func(j int) bool { 1694 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 1695 }) 1696 if i.index >= len(i.offsets) { 1697 return nil, nil 1698 } 1699 i.key = i.getKey(i.index) 1700 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 1701 i.index = len(i.offsets) 1702 return nil, nil 1703 } 1704 return &i.key, i.Value() 1705 } 1706 1707 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 1708 // bitalostable package. 1709 func (i *flushableBatchIter) SeekPrefixGE( 1710 prefix, key []byte, flags base.SeekGEFlags, 1711 ) (*base.InternalKey, []byte) { 1712 return i.SeekGE(key, flags) 1713 } 1714 1715 // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable 1716 // package. 1717 func (i *flushableBatchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 1718 i.err = nil 1719 ikey := base.MakeSearchKey(key) 1720 i.index = sort.Search(len(i.offsets), func(j int) bool { 1721 return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0 1722 }) 1723 i.index-- 1724 if i.index < 0 { 1725 return nil, nil 1726 } 1727 i.key = i.getKey(i.index) 1728 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 1729 i.index = -1 1730 return nil, nil 1731 } 1732 return &i.key, i.Value() 1733 } 1734 1735 // First implements internalIterator.First, as documented in the bitalostable 1736 // package. 1737 func (i *flushableBatchIter) First() (*InternalKey, []byte) { 1738 i.err = nil 1739 if len(i.offsets) == 0 { 1740 return nil, nil 1741 } 1742 i.index = 0 1743 i.key = i.getKey(i.index) 1744 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 1745 i.index = len(i.offsets) 1746 return nil, nil 1747 } 1748 return &i.key, i.Value() 1749 } 1750 1751 // Last implements internalIterator.Last, as documented in the bitalostable 1752 // package. 1753 func (i *flushableBatchIter) Last() (*InternalKey, []byte) { 1754 i.err = nil 1755 if len(i.offsets) == 0 { 1756 return nil, nil 1757 } 1758 i.index = len(i.offsets) - 1 1759 i.key = i.getKey(i.index) 1760 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 1761 i.index = -1 1762 return nil, nil 1763 } 1764 return &i.key, i.Value() 1765 } 1766 1767 // Note: flushFlushableBatchIter.Next mirrors the implementation of 1768 // flushableBatchIter.Next due to performance. Keep the two in sync. 1769 func (i *flushableBatchIter) Next() (*InternalKey, []byte) { 1770 if i.index == len(i.offsets) { 1771 return nil, nil 1772 } 1773 i.index++ 1774 if i.index == len(i.offsets) { 1775 return nil, nil 1776 } 1777 i.key = i.getKey(i.index) 1778 if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 { 1779 i.index = len(i.offsets) 1780 return nil, nil 1781 } 1782 return &i.key, i.Value() 1783 } 1784 1785 func (i *flushableBatchIter) Prev() (*InternalKey, []byte) { 1786 if i.index < 0 { 1787 return nil, nil 1788 } 1789 i.index-- 1790 if i.index < 0 { 1791 return nil, nil 1792 } 1793 i.key = i.getKey(i.index) 1794 if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 { 1795 i.index = -1 1796 return nil, nil 1797 } 1798 return &i.key, i.Value() 1799 } 1800 1801 func (i *flushableBatchIter) getKey(index int) InternalKey { 1802 e := &i.offsets[index] 1803 kind := InternalKeyKind(i.data[e.offset]) 1804 key := i.data[e.keyStart:e.keyEnd] 1805 return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind) 1806 } 1807 1808 func (i *flushableBatchIter) Key() *InternalKey { 1809 return &i.key 1810 } 1811 1812 func (i *flushableBatchIter) Value() []byte { 1813 p := i.data[i.offsets[i.index].offset:] 1814 if len(p) == 0 { 1815 i.err = base.CorruptionErrorf("corrupted batch") 1816 return nil 1817 } 1818 kind := InternalKeyKind(p[0]) 1819 if kind > InternalKeyKindMax { 1820 i.err = base.CorruptionErrorf("corrupted batch") 1821 return nil 1822 } 1823 var value []byte 1824 var ok bool 1825 switch kind { 1826 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete, 1827 InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 1828 keyEnd := i.offsets[i.index].keyEnd 1829 _, value, ok = batchDecodeStr(i.data[keyEnd:]) 1830 if !ok { 1831 i.err = base.CorruptionErrorf("corrupted batch") 1832 return nil 1833 } 1834 } 1835 return value 1836 } 1837 1838 func (i *flushableBatchIter) Valid() bool { 1839 return i.index >= 0 && i.index < len(i.offsets) 1840 } 1841 1842 func (i *flushableBatchIter) Error() error { 1843 return i.err 1844 } 1845 1846 func (i *flushableBatchIter) Close() error { 1847 return i.err 1848 } 1849 1850 func (i *flushableBatchIter) SetBounds(lower, upper []byte) { 1851 i.lower = lower 1852 i.upper = upper 1853 } 1854 1855 // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track 1856 // of number of bytes iterated. 1857 type flushFlushableBatchIter struct { 1858 flushableBatchIter 1859 bytesIterated *uint64 1860 } 1861 1862 // flushFlushableBatchIter implements the base.InternalIterator interface. 1863 var _ base.InternalIterator = (*flushFlushableBatchIter)(nil) 1864 1865 func (i *flushFlushableBatchIter) String() string { 1866 return "flushable-batch" 1867 } 1868 1869 func (i *flushFlushableBatchIter) SeekGE( 1870 key []byte, flags base.SeekGEFlags, 1871 ) (*InternalKey, []byte) { 1872 panic("bitalostable: SeekGE unimplemented") 1873 } 1874 1875 func (i *flushFlushableBatchIter) SeekPrefixGE( 1876 prefix, key []byte, flags base.SeekGEFlags, 1877 ) (*base.InternalKey, []byte) { 1878 panic("bitalostable: SeekPrefixGE unimplemented") 1879 } 1880 1881 func (i *flushFlushableBatchIter) SeekLT( 1882 key []byte, flags base.SeekLTFlags, 1883 ) (*InternalKey, []byte) { 1884 panic("bitalostable: SeekLT unimplemented") 1885 } 1886 1887 func (i *flushFlushableBatchIter) First() (*InternalKey, []byte) { 1888 i.err = nil 1889 key, val := i.flushableBatchIter.First() 1890 if key == nil { 1891 return nil, nil 1892 } 1893 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 1894 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 1895 return key, val 1896 } 1897 1898 // Note: flushFlushableBatchIter.Next mirrors the implementation of 1899 // flushableBatchIter.Next due to performance. Keep the two in sync. 1900 func (i *flushFlushableBatchIter) Next() (*InternalKey, []byte) { 1901 if i.index == len(i.offsets) { 1902 return nil, nil 1903 } 1904 i.index++ 1905 if i.index == len(i.offsets) { 1906 return nil, nil 1907 } 1908 i.key = i.getKey(i.index) 1909 entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset 1910 *i.bytesIterated += uint64(entryBytes) + i.valueSize() 1911 return &i.key, i.Value() 1912 } 1913 1914 func (i flushFlushableBatchIter) Prev() (*InternalKey, []byte) { 1915 panic("bitalostable: Prev unimplemented") 1916 } 1917 1918 func (i flushFlushableBatchIter) valueSize() uint64 { 1919 p := i.data[i.offsets[i.index].offset:] 1920 if len(p) == 0 { 1921 i.err = base.CorruptionErrorf("corrupted batch") 1922 return 0 1923 } 1924 kind := InternalKeyKind(p[0]) 1925 if kind > InternalKeyKindMax { 1926 i.err = base.CorruptionErrorf("corrupted batch") 1927 return 0 1928 } 1929 var length uint64 1930 switch kind { 1931 case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete: 1932 keyEnd := i.offsets[i.index].keyEnd 1933 v, n := binary.Uvarint(i.data[keyEnd:]) 1934 if n <= 0 { 1935 i.err = base.CorruptionErrorf("corrupted batch") 1936 return 0 1937 } 1938 length = v + uint64(n) 1939 } 1940 return length 1941 } 1942 1943 // batchSort returns iterators for the sorted contents of the batch. It is 1944 // intended for testing use only. The batch.Sort dance is done to prevent 1945 // exposing this method in the public bitalostable interface. 1946 func batchSort( 1947 i interface{}, 1948 ) ( 1949 points internalIterator, 1950 rangeDels keyspan.FragmentIterator, 1951 rangeKeys keyspan.FragmentIterator, 1952 ) { 1953 b := i.(*Batch) 1954 if b.Indexed() { 1955 pointIter := b.newInternalIter(nil) 1956 rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64) 1957 rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64) 1958 return pointIter, rangeDelIter, rangeKeyIter 1959 } 1960 f := newFlushableBatch(b, b.db.opts.Comparer) 1961 return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil) 1962 } 1963 1964 func init() { 1965 private.BatchSort = batchSort 1966 }