github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/batch.go

// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"encoding/binary"
	"errors"
	"fmt"
	"math"
	"sort"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/batchskl"
	"github.com/petermattis/pebble/internal/rangedel"
	"github.com/petermattis/pebble/internal/rawalloc"
)

const (
	batchHeaderLen       = 12
	batchInitialSize     = 1 << 10 // 1 KB
	batchMaxRetainedSize = 1 << 20 // 1 MB
	invalidBatchCount    = 1<<32 - 1
	maxVarintLen32       = 5
)

// ErrNotIndexed means that a read operation on a batch failed because the
// batch is not indexed and thus doesn't support reads.
var ErrNotIndexed = errors.New("pebble: batch not indexed")

// ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
var ErrInvalidBatch = errors.New("pebble: invalid batch")

type batchStorage struct {
	// Data is the wire format of a batch's log entry:
	//   - 8 bytes for a sequence number of the first batch element,
	//     or zeroes if the batch has not yet been applied,
	//   - 4 bytes for the count: the number of elements in the batch,
	//     or "\xff\xff\xff\xff" if the batch is invalid,
	//   - count elements, being:
	//       - one byte for the kind
	//       - the varint-string user key,
	//       - the varint-string value (if kind != delete).
	// The sequence number and count are stored in little-endian order.
	data           []byte
	cmp            Compare
	abbreviatedKey AbbreviatedKey
}

// Get implements Storage.Get, as documented in the pebble/batchskl package.
func (s *batchStorage) Get(offset uint32) InternalKey {
	kind := InternalKeyKind(s.data[offset])
	_, key, ok := batchDecodeStr(s.data[offset+1:])
	if !ok {
		panic(fmt.Sprintf("corrupted batch entry: %d", offset))
	}
	return base.MakeInternalKey(key, uint64(offset)|InternalKeySeqNumBatch, kind)
}

// AbbreviatedKey implements Storage.AbbreviatedKey, as documented in the
// pebble/batchskl package.
func (s *batchStorage) AbbreviatedKey(key []byte) uint64 {
	return s.abbreviatedKey(key)
}

// Compare implements Storage.Compare, as documented in the pebble/batchskl
// package.
func (s *batchStorage) Compare(a []byte, b uint32) int {
	// The key "a" is always the search key or the newer key being inserted. If
	// it is equal to the existing key consider it smaller so that it sorts
	// first.
	if s.cmp(a, s.Get(b).UserKey) <= 0 {
		return -1
	}
	return 1
}

// DeferredBatchOp represents a batch operation (e.g. set, merge, delete) that
// is being inserted into the batch. Indexing is not performed on the specified
// key until Finish is called, hence the name deferred. This struct lets the
// caller copy or encode keys/values directly into the batch representation
// instead of copying into an intermediary buffer then having pebble.Batch copy
// off of it.
type DeferredBatchOp struct {
	index *batchskl.Skiplist

	// Key and Value point to parts of the binary batch representation where
	// keys and values should be encoded/copied into. len(Key) and len(Value)
	// bytes must be copied into these slices respectively before calling
	// Finish(). Changing where these slices point to is not allowed.
	Key, Value []byte
	offset     uint32
}

// Finish completes the addition of this batch operation, and adds it to the
// index if necessary. Must be called once (and exactly once) after the
// keys/values have been filled into Key and Value. Not calling Finish or not
// copying/encoding keys will result in an incomplete index, and calling Finish
// twice may result in a panic.
func (d DeferredBatchOp) Finish() {
	if d.index != nil {
		if err := d.index.Add(d.offset); err != nil {
			// We never add duplicate entries, so an error should never occur.
			panic(err)
		}
	}
}
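
// The function below is an illustrative sketch, not part of the original
// file. It shows how a caller might use the deferred API: SetDeferred
// reserves space for the record in the batch buffer, the caller encodes the
// key and value directly into the returned slices, and Finish indexes the
// entry if the batch is indexed. The function name and byte contents are
// hypothetical.
func exampleDeferredSet(b *Batch) error {
	key := []byte("deferred-key")
	value := []byte("deferred-value")
	op, err := b.SetDeferred(len(key), len(value), nil)
	if err != nil {
		return err
	}
	// Exactly len(op.Key) and len(op.Value) bytes must be written into the
	// returned slices before Finish is called.
	copy(op.Key, key)
	copy(op.Value, value)
	op.Finish()
	return nil
}
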
// A Batch is a sequence of Sets, Merges, Deletes, and/or DeleteRanges that are
// applied atomically. Batch implements the Reader interface, but only an
// indexed batch supports reading (without error) via Get or NewIter. A
// non-indexed batch will return ErrNotIndexed when read from.
//
// Indexing
//
// Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
// allows iteration via an Iterator (see Batch.NewIter). The iterator provides
// a merged view of the operations in the batch and the underlying
// database. This is implemented by treating the batch as an additional layer
// in the LSM where every entry in the batch is considered newer than any entry
// in the underlying database (batch entries have the InternalKeySeqNumBatch
// bit set). By treating the batch as an additional layer in the LSM, iteration
// supports all batch operations (i.e. Set, Merge, Delete, and DeleteRange)
// with minimal effort.
//
// The same key can be operated on multiple times in a batch, though only the
// latest operation will be visible. For example, Set("a", "b"), Delete("a")
// will cause the key "a" to not be visible in the batch. Set("a", "b"),
// Set("a", "c") will cause a read of "a" to return the value "c".
//
// The batch index is implemented via a skiplist (internal/batchskl). While
// the skiplist implementation is very fast, inserting into an indexed batch is
// significantly slower than inserting into a non-indexed batch. Only use an
// indexed batch if you require reading from it.
//
// Atomic commit
//
// The operations in a batch are persisted by calling Batch.Commit which is
// equivalent to calling DB.Apply(batch). A batch is committed atomically by
// writing the internal batch representation to the WAL, adding all of the
// batch operations to the memtable associated with the WAL, and then
// incrementing the visible sequence number so that subsequent reads can see
// the effects of the batch operations. If WriteOptions.Sync is true, a call to
// Batch.Commit will guarantee that the batch is persisted to disk before
// returning. See commitPipeline for more on the implementation details.
//
// Large batches
//
// The size of a batch is limited only by available memory (be aware that
// indexed batches require considerable additional memory for the skiplist
// structure). A given WAL file has a single memtable associated with it (this
// restriction could be removed, but doing so is onerous and complex). And a
// memtable has a fixed size due to the underlying fixed size arena. Note that
// this differs from RocksDB where a memtable can grow arbitrarily large using
// a list of arena chunks.
// In RocksDB this is accomplished by storing pointers in the arena memory,
// but that isn't possible in Go.
//
// During Batch.Commit, a batch which is larger than a threshold (>
// MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
// of memtables. A flushableBatch forces the WAL to be rotated, but that
// happens anyways when the memtable becomes full so this does not cause
// significant WAL churn. Because the flushableBatch is readable as another
// layer in the LSM, Batch.Commit returns as soon as the flushableBatch has
// been added to the queue of memtables.
//
// Internally, a flushableBatch provides Iterator support by sorting the batch
// contents (the batch is sorted once, when it is added to the memtable
// queue). Sorting the batch contents and insertion of the contents into a
// memtable have the same big-O time, but the constant factor dominates
// here. Sorting is significantly faster and uses significantly less memory.
//
// Internal representation
//
// The internal batch representation is a contiguous byte buffer with a fixed
// 12-byte header, followed by a series of records.
//
//   +-------------+------------+--- ... ---+
//   | SeqNum (8B) | Count (4B) |  Entries  |
//   +-------------+------------+--- ... ---+
//
// Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
// strings (varstring):
//
//   +-----------+-----------------+-------------------+
//   | Kind (1B) | Key (varstring) | Value (varstring) |
//   +-----------+-----------------+-------------------+
//
// A varstring is a varint32 followed by N bytes of data. The Kind tags are
// exactly those specified by InternalKeyKind. The following table shows the
// format for records of each kind:
//
//   InternalKeyKindDelete       varstring
//   InternalKeyKindLogData      varstring
//   InternalKeyKindSet          varstring varstring
//   InternalKeyKindMerge        varstring varstring
//   InternalKeyKindRangeDelete  varstring varstring
//
// The intuitive understanding here is that the arguments to Delete(), Set(),
// Merge(), and DeleteRange() are encoded into the batch.
//
// The internal batch representation is the on disk format for a batch in the
// WAL, and thus stable. New record kinds may be added, but the existing ones
// will not be modified.
type Batch struct {
	storage batchStorage

	memTableSize uint32

	// The db to which the batch will be committed.
	db *DB

	// The count of records in the batch. This count will be stored in the
	// batch data whenever Repr() is called.
	count uint32

	// A deferredOp struct, stored in the Batch so that a pointer can be
	// returned from the *Deferred() methods rather than a value.
	deferredOp DeferredBatchOp

	// An optional skiplist keyed by offset into data of the entry.
	index         *batchskl.Skiplist
	rangeDelIndex *batchskl.Skiplist

	// Fragmented range deletion tombstones. Cached the first time a range
	// deletion iterator is requested. The cache is invalidated whenever a new
	// range deletion is added to the batch.
	tombstones []rangedel.Tombstone

	// The flushableBatch wrapper if the batch is too large to fit in the
	// memtable.
	flushable *flushableBatch

	commit  sync.WaitGroup
	applied uint32 // updated atomically
}
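
// The function below is an illustrative sketch, not part of the original
// file. It demonstrates the record encoding described above by building a
// small batch and then walking its representation with a BatchReader: each
// entry yields the kind tag and the varstring-encoded key (and value, for
// kinds that carry one). The function name and the printed format are
// hypothetical.
func exampleWalkBatchRepr() {
	b := &Batch{}
	_ = b.Set([]byte("a"), []byte("1"), nil)
	_ = b.Delete([]byte("b"), nil)

	// Reader skips the 12-byte header (seqnum + count) and iterates the
	// records in insertion order.
	for r := b.Reader(); ; {
		kind, key, value, ok := r.Next()
		if !ok {
			break
		}
		fmt.Printf("kind=%d key=%q value=%q\n", kind, key, value)
	}
}
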
var _ Reader = (*Batch)(nil)
var _ Writer = (*Batch)(nil)

var batchPool = sync.Pool{
	New: func() interface{} {
		return &Batch{}
	},
}

type indexedBatch struct {
	batch Batch
	index batchskl.Skiplist
}

var indexedBatchPool = sync.Pool{
	New: func() interface{} {
		return &indexedBatch{}
	},
}

func newBatch(db *DB) *Batch {
	b := batchPool.Get().(*Batch)
	b.db = db
	return b
}

func newIndexedBatch(db *DB, comparer *Comparer) *Batch {
	i := indexedBatchPool.Get().(*indexedBatch)
	i.batch.storage.cmp = comparer.Compare
	i.batch.storage.abbreviatedKey = comparer.AbbreviatedKey
	i.batch.db = db
	i.batch.index = &i.index
	i.batch.index.Reset(&i.batch.storage, 0)
	return &i.batch
}

func (b *Batch) release() {
	// NB: This is ugly, but necessary so that we can use atomic.StoreUint32 for
	// the Batch.applied field. Without using an atomic to clear that field the
	// Go race detector complains.
	b.Reset()
	b.storage.cmp = nil
	b.storage.abbreviatedKey = nil
	b.memTableSize = 0

	b.flushable = nil
	b.commit = sync.WaitGroup{}
	atomic.StoreUint32(&b.applied, 0)

	if b.db == nil {
		// Batch not created using newBatch or newIndexedBatch, so don't put it
		// back in the pool.
		return
	}
	b.db = nil

	if b.index == nil {
		batchPool.Put(b)
	} else {
		*b.index = batchskl.Skiplist{}
		b.index, b.rangeDelIndex = nil, nil
		indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b)))
	}
}

func (b *Batch) refreshMemTableSize() {
	b.memTableSize = 0
	for r := b.Reader(); ; {
		_, key, value, ok := r.Next()
		if !ok {
			break
		}
		b.memTableSize += memTableEntrySize(len(key), len(value))
	}
}

// Apply the operations contained in the batch to the receiver batch.
//
// It is safe to modify the contents of the arguments after Apply returns.
func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error {
	if len(batch.storage.data) == 0 {
		return nil
	}
	if len(batch.storage.data) < batchHeaderLen {
		return errors.New("pebble: invalid batch")
	}

	offset := len(b.storage.data)
	if offset == 0 {
		b.init(offset)
		offset = batchHeaderLen
	}
	b.storage.data = append(b.storage.data, batch.storage.data[batchHeaderLen:]...)

	b.setCount(b.Count() + batch.Count())

	for iter := BatchReader(b.storage.data[offset:]); len(iter) > 0; {
		offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.storage.data[0]))
		kind, key, value, ok := iter.Next()
		if !ok {
			break
		}
		if b.index != nil {
			var err error
			if kind == InternalKeyKindRangeDelete {
				if b.rangeDelIndex == nil {
					b.rangeDelIndex = batchskl.NewSkiplist(&b.storage, 0)
				}
				err = b.rangeDelIndex.Add(uint32(offset))
			} else {
				err = b.index.Add(uint32(offset))
			}
			if err != nil {
				// We never add duplicate entries, so an error should never occur.
				panic(err)
			}
		}
		b.memTableSize += memTableEntrySize(len(key), len(value))
	}
	return nil
}
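
// The function below is an illustrative sketch, not part of the original
// file. It shows Batch.Apply concatenating the records of one batch onto
// another: the receiver's count becomes the sum of both counts, and the
// appended entries are indexed if the receiver is indexed. The function name
// is hypothetical.
func exampleApplyBatch() error {
	var a, c Batch
	_ = a.Set([]byte("k1"), []byte("v1"), nil)
	_ = c.Set([]byte("k2"), []byte("v2"), nil)
	_ = c.Delete([]byte("k3"), nil)

	if err := a.Apply(&c, nil); err != nil {
		return err
	}
	// a now holds all three operations: its own Set plus the two records
	// copied from c's representation.
	fmt.Printf("count=%d\n", a.Count()) // count=3
	return nil
}
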
// Get gets the value for the given key. It returns ErrNotFound if the DB
// does not contain the key.
//
// The caller should not modify the contents of the returned slice, but
// it is safe to modify the contents of the argument after Get returns.
func (b *Batch) Get(key []byte) (value []byte, err error) {
	if b.index == nil {
		return nil, ErrNotIndexed
	}
	return b.db.getInternal(key, b, nil /* snapshot */)
}
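
// The function below is an illustrative sketch, not part of the original
// file. It shows the read-your-writes behavior of an indexed batch: Get
// consults the batch's skiplist index before the underlying database, and the
// latest write to a key within the batch wins. The db parameter is assumed to
// be an open *DB obtained elsewhere; the function name is hypothetical.
func exampleIndexedRead(db *DB) error {
	b := db.NewIndexedBatch()
	if err := b.Set([]byte("a"), []byte("old"), nil); err != nil {
		return err
	}
	if err := b.Set([]byte("a"), []byte("new"), nil); err != nil {
		return err
	}
	// Both operations are recorded in the batch, but a read sees only the
	// latest one.
	v, err := b.Get([]byte("a"))
	if err != nil {
		return err
	}
	fmt.Printf("a=%q\n", v) // a="new"
	return b.Close()
}
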
func (b *Batch) prepareRecord(keyLen, valueLen int, kind InternalKeyKind) {
	pos := len(b.storage.data)
	b.deferredOp.offset = uint32(pos)
	b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen)
	b.storage.data[pos] = byte(kind)
	pos++

	varlen1 := putUvarint32(b.storage.data[pos:], uint32(keyLen))
	pos += varlen1
	b.deferredOp.Key = b.storage.data[pos : pos+keyLen]
	pos += keyLen

	varlen2 := putUvarint32(b.storage.data[pos:], uint32(valueLen))
	pos += varlen2
	b.deferredOp.Value = b.storage.data[pos : pos+valueLen]
	pos += valueLen
	b.storage.data = b.storage.data[:len(b.storage.data)-(2*maxVarintLen32-varlen1-varlen2)]
}

// Set adds an action to the batch that sets the key to map to the value.
//
// It is safe to modify the contents of the arguments after Set returns.
func (b *Batch) Set(key, value []byte, _ *WriteOptions) error {
	deferredOp, err := b.SetDeferred(len(key), len(value), nil)
	if err != nil {
		return err
	}
	copy(deferredOp.Key, key)
	copy(deferredOp.Value, value)
	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
	// in go1.13 will remove the need for this.
	if b.index != nil {
		if err := b.index.Add(deferredOp.offset); err != nil {
			// We never add duplicate entries, so an error should never occur.
			panic(err)
		}
	}
	return nil
}

// SetDeferred is similar to Set in that it adds a set operation to the batch,
// except it only takes in key/value lengths instead of complete slices,
// letting the caller encode into those objects and then call Finish() on the
// returned object.
func (b *Batch) SetDeferred(keyLen, valueLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
	// Code duplication between Set and SetDeferred lets us preserve the fast
	// path where the entire byte slices are available (in the Set case).
	if len(b.storage.data) == 0 {
		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
	}
	if !b.increment() {
		return nil, ErrInvalidBatch
	}

	b.memTableSize += memTableEntrySize(keyLen, valueLen)
	b.prepareRecord(keyLen, valueLen, InternalKeyKindSet)
	b.deferredOp.index = b.index
	return &b.deferredOp, nil
}

// Merge adds an action to the batch that merges the value at key with the new
// value. The details of the merge are dependent upon the configured merge
// operator.
//
// It is safe to modify the contents of the arguments after Merge returns.
func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error {
	deferredOp, err := b.MergeDeferred(len(key), len(value), nil)
	if err != nil {
		return err
	}
	copy(deferredOp.Key, key)
	copy(deferredOp.Value, value)
	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
	// in go1.13 will remove the need for this.
	if b.index != nil {
		if err := b.index.Add(deferredOp.offset); err != nil {
			// We never add duplicate entries, so an error should never occur.
			panic(err)
		}
	}
	return nil
}

// MergeDeferred is similar to Merge in that it adds a merge operation to the
// batch, except it only takes in key/value lengths instead of complete slices,
// letting the caller encode into those objects and then call Finish() on the
// returned object.
func (b *Batch) MergeDeferred(keyLen, valueLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
	// Code duplication with Merge is so that the Merge case (where byte slices
	// are provided) can preserve the fast path.
	if len(b.storage.data) == 0 {
		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
	}
	if !b.increment() {
		return nil, ErrInvalidBatch
	}

	b.memTableSize += memTableEntrySize(keyLen, valueLen)
	b.prepareRecord(keyLen, valueLen, InternalKeyKindMerge)
	b.deferredOp.index = b.index
	return &b.deferredOp, nil
}

// Delete adds an action to the batch that deletes the entry for key.
//
// It is safe to modify the contents of the arguments after Delete returns.
func (b *Batch) Delete(key []byte, _ *WriteOptions) error {
	deferredOp, err := b.DeleteDeferred(len(key), nil)
	if err != nil {
		return err
	}
	copy(deferredOp.Key, key)
	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
	// in go1.13 will remove the need for this.
	if b.index != nil {
		if err := b.index.Add(deferredOp.offset); err != nil {
			// We never add duplicate entries, so an error should never occur.
			panic(err)
		}
	}
	return nil
}

// DeleteDeferred is similar to Delete in that it adds a delete operation to
// the batch, except it only takes in a key length instead of a complete
// slice, letting the caller encode into DeferredBatchOp.Key and then call
// Finish() on the returned object.
func (b *Batch) DeleteDeferred(keyLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
	// Code duplication with Delete is so that the Delete case (where byte
	// slices are provided) can preserve the fast path.
	if len(b.storage.data) == 0 {
		b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen)
	}
	if !b.increment() {
		return nil, ErrInvalidBatch
	}

	b.memTableSize += memTableEntrySize(keyLen, 0)

	pos := len(b.storage.data)
	b.deferredOp.offset = uint32(pos)
	b.grow(1 + maxVarintLen32 + keyLen)
	b.storage.data[pos] = byte(InternalKeyKindDelete)
	pos++
	varlen1 := putUvarint32(b.storage.data[pos:], uint32(keyLen))
	pos += varlen1
	b.deferredOp.Key = b.storage.data[pos : pos+keyLen]
	b.deferredOp.Value = nil

	b.storage.data = b.storage.data[:len(b.storage.data)-(maxVarintLen32-varlen1)]

	b.deferredOp.index = b.index
	return &b.deferredOp, nil
}

// DeleteRange deletes all of the keys (and values) in the range [start,end)
// (inclusive on start, exclusive on end).
//
// It is safe to modify the contents of the arguments after DeleteRange
// returns.
func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error {
	deferredOp, err := b.DeleteRangeDeferred(len(start), len(end), nil)
	if err != nil {
		return err
	}
	copy(deferredOp.Key, start)
	copy(deferredOp.Value, end)
	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
	// in go1.13 will remove the need for this.
	if deferredOp.index != nil {
		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
			// We never add duplicate entries, so an error should never occur.
			panic(err)
		}
	}
	return nil
}

// DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range
// operation to the batch, except it only takes in key lengths instead of
// complete slices, letting the caller encode into those objects and then call
// Finish() on the returned object. Note that DeferredBatchOp.Key should be
// populated with the start key, and DeferredBatchOp.Value should be populated
// with the end key.
func (b *Batch) DeleteRangeDeferred(startLen, endLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
	if len(b.storage.data) == 0 {
		b.init(startLen + endLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
	}
	if !b.increment() {
		return nil, ErrInvalidBatch
	}

	b.memTableSize += memTableEntrySize(startLen, endLen)
	b.prepareRecord(startLen, endLen, InternalKeyKindRangeDelete)

	if b.index != nil {
		b.tombstones = nil
		// Range deletions are rare, so we lazily allocate the index for them.
		if b.rangeDelIndex == nil {
			b.rangeDelIndex = batchskl.NewSkiplist(&b.storage, 0)
		}
		b.deferredOp.index = b.rangeDelIndex
	}
	return &b.deferredOp, nil
}
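
// The function below is an illustrative sketch, not part of the original
// file. It records a range deletion in an indexed batch and then iterates the
// batch's merged view, in which keys covered by the tombstone are hidden, as
// described in the Indexing section of the Batch documentation. The db
// parameter is assumed to be an open *DB, and the Iterator methods used
// (First, Next, Key, Close) are assumed to follow the package's Iterator API.
func exampleRangeDelete(db *DB) error {
	b := db.NewIndexedBatch()
	_ = b.Set([]byte("a"), []byte("1"), nil)
	_ = b.Set([]byte("b"), []byte("2"), nil)
	_ = b.Set([]byte("c"), []byte("3"), nil)
	// Delete every key in ["a", "c"): the batch's "a" and "b" entries become
	// invisible, while "c" remains.
	if err := b.DeleteRange([]byte("a"), []byte("c"), nil); err != nil {
		return err
	}

	it := b.NewIter(nil)
	for valid := it.First(); valid; valid = it.Next() {
		// Prints "c" from this batch, plus whatever the underlying database
		// holds outside the deleted range.
		fmt.Printf("visible key: %q\n", it.Key())
	}
	return it.Close()
}
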
// LogData adds the specified data to the batch. The data will be written to
// the WAL, but not added to memtables or sstables. Log data is never indexed,
// which makes it useful for testing WAL performance.
//
// It is safe to modify the contents of the argument after LogData returns.
func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
	if len(b.storage.data) == 0 {
		b.init(len(data) + binary.MaxVarintLen64 + batchHeaderLen)
	}
	// Since LogData only writes to the WAL and does not affect the memtable,
	// we don't increment b.count here. b.count only tracks operations that
	// are applied to the memtable.

	pos := len(b.storage.data)
	b.grow(1 + maxVarintLen32 + len(data))
	b.storage.data[pos] = byte(InternalKeyKindLogData)
	_, varlen1 := b.copyStr(pos+1, data)
	b.storage.data = b.storage.data[:len(b.storage.data)-(maxVarintLen32-varlen1)]
	return nil
}

// Empty returns true if the batch is empty, and false otherwise.
func (b *Batch) Empty() bool {
	return len(b.storage.data) <= batchHeaderLen
}

// Repr returns the underlying batch representation. It is not safe to modify
// the contents. Reset() will not change the contents of the returned value,
// though any other mutation operation may do so.
func (b *Batch) Repr() []byte {
	if len(b.storage.data) == 0 {
		b.init(batchHeaderLen)
	}
	binary.LittleEndian.PutUint32(b.countData(), b.count)
	return b.storage.data
}

// SetRepr sets the underlying batch representation. The batch takes ownership
// of the supplied slice. It is not safe to modify it afterwards until the
// Batch is no longer in use.
func (b *Batch) SetRepr(data []byte) error {
	if len(data) < batchHeaderLen {
		return fmt.Errorf("invalid batch")
	}
	b.storage.data = data
	b.count = binary.LittleEndian.Uint32(b.countData())
	b.refreshMemTableSize()
	return nil
}
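
// The function below is an illustrative sketch, not part of the original
// file. It round-trips a batch through its wire representation: Repr
// serializes the 12-byte header plus records, and SetRepr rebuilds a batch
// (count and memtable size included) from such a slice. The function name is
// hypothetical.
func exampleReprRoundTrip() error {
	var src Batch
	_ = src.Set([]byte("k"), []byte("v"), nil)
	_ = src.LogData([]byte("trace"), nil)

	// Copy the representation, since SetRepr takes ownership of its argument.
	repr := append([]byte(nil), src.Repr()...)

	var dst Batch
	if err := dst.SetRepr(repr); err != nil {
		return err
	}
	// The rebuilt batch reports the same count; the LogData record is carried
	// along in the representation but is not counted.
	fmt.Printf("count=%d empty=%t\n", dst.Count(), dst.Empty()) // count=1 empty=false
	return nil
}
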
// NewIter returns an iterator that is unpositioned (Iterator.Valid() will
// return false). The iterator can be positioned via a call to SeekGE,
// SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
func (b *Batch) NewIter(o *IterOptions) *Iterator {
	if b.index == nil {
		return &Iterator{err: ErrNotIndexed}
	}
	return b.db.newIterInternal(b.newInternalIter(o),
		b.newRangeDelIter(o), nil /* snapshot */, o)
}

// newInternalIter creates a new internalIterator that iterates over the
// contents of the batch.
func (b *Batch) newInternalIter(o *IterOptions) internalIterator {
	if b.index == nil {
		return newErrorIter(ErrNotIndexed)
	}
	return &batchIter{
		cmp:   b.storage.cmp,
		batch: b,
		iter:  b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()),
	}
}

func (b *Batch) newRangeDelIter(o *IterOptions) internalIterator {
	if b.index == nil {
		return newErrorIter(ErrNotIndexed)
	}
	if b.rangeDelIndex == nil {
		return nil
	}

	// Fragment the range tombstones the first time a range deletion iterator is
	// requested. The cached tombstones are invalidated if another range deletion
	// tombstone is added to the batch.
	if b.tombstones == nil {
		frag := &rangedel.Fragmenter{
			Cmp: b.storage.cmp,
			Emit: func(fragmented []rangedel.Tombstone) {
				b.tombstones = append(b.tombstones, fragmented...)
			},
		}
		it := &batchIter{
			cmp:   b.storage.cmp,
			batch: b,
			iter:  b.rangeDelIndex.NewIter(nil, nil),
		}
		for {
			key, val := it.Next()
			if key == nil {
				break
			}
			frag.Add(*key, val)
		}
		frag.Finish()
	}

	return rangedel.NewIter(b.storage.cmp, b.tombstones)
}

// Commit applies the batch to its parent writer.
func (b *Batch) Commit(o *WriteOptions) error {
	return b.db.Apply(b, o)
}

// Close closes the batch without committing it.
func (b *Batch) Close() error {
	b.release()
	return nil
}

// Indexed returns true if the batch is indexed (i.e. supports read
// operations).
func (b *Batch) Indexed() bool {
	return b.index != nil
}

func (b *Batch) init(cap int) {
	n := batchInitialSize
	for n < cap {
		n *= 2
	}
	b.storage.data = rawalloc.New(batchHeaderLen, n)
	b.setCount(0)
	b.setSeqNum(0)
	b.storage.data = b.storage.data[:batchHeaderLen]
}

// Reset clears the underlying byte slice and effectively empties the batch for
// reuse. Used in cases where Batch is only being used to build a batch, and
// where the end result is a Repr() call, not a Commit call or a Close call.
// Commits and Closes take care of releasing resources when appropriate.
func (b *Batch) Reset() {
	if b.storage.data != nil {
		if cap(b.storage.data) > batchMaxRetainedSize {
			// If the capacity of the buffer is larger than our maximum
			// retention size, don't re-use it. Let it be GC-ed instead.
			// This prevents the memory from an unusually large batch from
			// being held on to indefinitely.
			b.storage.data = nil
		} else {
			// Otherwise, reset the buffer for re-use.
			b.storage.data = b.storage.data[:batchHeaderLen]
		}
		b.count = 0
	}
}

// seqNumData returns the 8 byte little-endian sequence number. Zero means that
// the batch has not yet been applied.
func (b *Batch) seqNumData() []byte {
	return b.storage.data[:8]
}

// countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
// means that the batch is invalid.
func (b *Batch) countData() []byte {
	return b.storage.data[8:12]
}

func (b *Batch) increment() (ok bool) {
	if b.count == math.MaxUint32 {
		return false
	}
	b.count++
	return true
}

func (b *Batch) grow(n int) {
	newSize := len(b.storage.data) + n
	if newSize > cap(b.storage.data) {
		newCap := 2 * cap(b.storage.data)
		for newCap < newSize {
			newCap *= 2
		}
		newData := rawalloc.New(len(b.storage.data), newCap)
		copy(newData, b.storage.data)
		b.storage.data = newData
	}
	b.storage.data = b.storage.data[:newSize]
}

func putUvarint32(buf []byte, x uint32) int {
	i := 0
	for x >= 0x80 {
		buf[i] = byte(x) | 0x80
		x >>= 7
		i++
	}
	buf[i] = byte(x)
	return i + 1
}

func (b *Batch) copyStr(pos int, s []byte) (int, int) {
	n := putUvarint32(b.storage.data[pos:], uint32(len(s)))
	return pos + n + copy(b.storage.data[pos+n:], s), n
}

func (b *Batch) setSeqNum(seqNum uint64) {
	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
}

// SeqNum returns the batch sequence number which is applied to the first
// record in the batch. The sequence number is incremented for each subsequent
// record.
func (b *Batch) SeqNum() uint64 {
	return binary.LittleEndian.Uint64(b.seqNumData())
}

func (b *Batch) setCount(v uint32) {
	b.count = v
}

// Count returns the count of memtable-modifying operations in this batch. All
// operations with the exception of LogData increment this count.
func (b *Batch) Count() uint32 {
	return b.count
}

// Reader returns a BatchReader for the current batch contents. If the batch is
// mutated, the new entries will not be visible to the reader.
func (b *Batch) Reader() BatchReader {
	return b.storage.data[batchHeaderLen:]
}

func batchDecode(data []byte, offset uint32) (kind InternalKeyKind, ukey []byte, value []byte, ok bool) {
	p := data[offset:]
	if len(p) == 0 {
		return 0, nil, nil, false
	}
	kind, p = InternalKeyKind(p[0]), p[1:]
	if kind > InternalKeyKindMax {
		return 0, nil, nil, false
	}
	p, ukey, ok = batchDecodeStr(p)
	if !ok {
		return 0, nil, nil, false
	}
	switch kind {
	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
		_, value, ok = batchDecodeStr(p)
		if !ok {
			return 0, nil, nil, false
		}
	}
	return kind, ukey, value, true
}

func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) {
	v, n := binary.Uvarint(data)
	if n <= 0 {
		return nil, nil, false
	}
	data = data[n:]
	if v > uint64(len(data)) {
		return nil, nil, false
	}
	return data[v:], data[:v], true
}

// BatchReader iterates over the entries contained in a batch.
type BatchReader []byte

// MakeBatchReader constructs a BatchReader from a batch representation. The
// header (containing the batch count and seqnum) is ignored.
func MakeBatchReader(repr []byte) BatchReader {
	return repr[batchHeaderLen:]
}

// Next returns the next entry in this batch. The final return value is false
// if the batch is corrupt. The end of batch is reached when len(r)==0.
func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool) {
	p := *r
	if len(p) == 0 {
		return 0, nil, nil, false
	}
	kind, *r = InternalKeyKind(p[0]), p[1:]
	if kind > InternalKeyKindMax {
		return 0, nil, nil, false
	}
	ukey, ok = r.nextStr()
	if !ok {
		return 0, nil, nil, false
	}
	switch kind {
	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
		value, ok = r.nextStr()
		if !ok {
			return 0, nil, nil, false
		}
	}
	return kind, ukey, value, true
}

func (r *BatchReader) nextStr() (s []byte, ok bool) {
	p := *r
	u, numBytes := binary.Uvarint(p)
	if numBytes <= 0 {
		return nil, false
	}
	p = p[numBytes:]
	if u > uint64(len(p)) {
		return nil, false
	}
	s, *r = p[:u], p[u:]
	return s, true
}

// Note: batchIter mirrors the implementation of flushableBatchIter. Keep the
// two in sync.
type batchIter struct {
	cmp   Compare
	batch *Batch
	iter  batchskl.Iterator
	err   error
}

// batchIter implements the internalIterator interface.
var _ internalIterator = (*batchIter)(nil)

func (i *batchIter) SeekGE(key []byte) (*InternalKey, []byte) {
	ikey := i.iter.SeekGE(key)
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
	return i.SeekGE(key)
}

func (i *batchIter) SeekLT(key []byte) (*InternalKey, []byte) {
	ikey := i.iter.SeekLT(key)
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) First() (*InternalKey, []byte) {
	ikey := i.iter.First()
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) Last() (*InternalKey, []byte) {
	ikey := i.iter.Last()
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) Next() (*InternalKey, []byte) {
	ikey := i.iter.Next()
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) Prev() (*InternalKey, []byte) {
	ikey := i.iter.Prev()
	if ikey == nil {
		return nil, nil
	}
	return ikey, i.Value()
}

func (i *batchIter) Key() *InternalKey {
	return i.iter.Key()
}

func (i *batchIter) Value() []byte {
	_, _, value, ok := batchDecode(i.batch.storage.data, i.iter.KeyOffset())
	if !ok {
		i.err = fmt.Errorf("corrupted batch")
	}
	return value
}

func (i *batchIter) Valid() bool {
	return i.iter.Valid()
}

func (i *batchIter) Error() error {
	return i.err
}

func (i *batchIter) Close() error {
	_ = i.iter.Close()
	return i.err
}

func (i *batchIter) SetBounds(lower, upper []byte) {
	i.iter.SetBounds(lower, upper)
}

type flushableBatchEntry struct {
	offset   uint32
	index    uint32
	keyStart uint32
	keyEnd   uint32
}

// flushableBatch wraps an existing batch and provides the interfaces needed
// for making the batch flushable (i.e. able to mimic a memtable).
type flushableBatch struct {
	cmp  Compare
	data []byte

	// The base sequence number for the entries in the batch. This is the same
	// value as Batch.SeqNum() and is cached here for performance.
	seqNum uint64

	// A slice of offsets and indices for the entries in the batch.
	// Used to implement flushableBatchIter. Unlike the indexing on a normal
	// batch, a flushable batch is indexed such that batch entry i will be
	// given the sequence number flushableBatch.seqNum+i.
	offsets []flushableBatchEntry

	// Fragmented range deletion tombstones.
	tombstones []rangedel.Tombstone

	flushedCh chan struct{}

	logNum uint64
}

var _ flushable = (*flushableBatch)(nil)

// newFlushableBatch creates a new batch that implements the flushable
// interface. This allows the batch to act like a memtable and be placed in the
// queue of flushable memtables. Note that the flushable batch takes ownership
// of the batch data.
func newFlushableBatch(batch *Batch, comparer *Comparer) *flushableBatch {
	b := &flushableBatch{
		data:      batch.storage.data,
		cmp:       comparer.Compare,
		offsets:   make([]flushableBatchEntry, 0, batch.Count()),
		flushedCh: make(chan struct{}),
	}

	var index uint32
	var rangeDelOffsets []flushableBatchEntry
	for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
		offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
		kind, key, _, ok := iter.Next()
		if !ok {
			break
		}
		entry := flushableBatchEntry{
			offset: uint32(offset),
			index:  uint32(index),
		}
		if keySize := uint32(len(key)); keySize == 0 {
			// Must add 2 to the offset. One byte encodes `kind` and the next
			// byte encodes `0`, which is the length of the key.
			entry.keyStart = uint32(offset) + 2
			entry.keyEnd = entry.keyStart
		} else {
			entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
				uintptr(unsafe.Pointer(&b.data[0])))
			entry.keyEnd = entry.keyStart + keySize
		}
		if kind == InternalKeyKindRangeDelete {
			rangeDelOffsets = append(rangeDelOffsets, entry)
		} else {
			b.offsets = append(b.offsets, entry)
		}
	}

	// Sort both offsets and rangeDelOffsets. flushableBatch implements
	// sort.Interface over b.offsets, so the range deletion offsets are
	// temporarily swapped in to sort them with the same machinery.
	sort.Sort(b)
	rangeDelOffsets, b.offsets = b.offsets, rangeDelOffsets
	sort.Sort(b)
	rangeDelOffsets, b.offsets = b.offsets, rangeDelOffsets

	if len(rangeDelOffsets) > 0 {
		frag := &rangedel.Fragmenter{
			Cmp: b.cmp,
			Emit: func(fragmented []rangedel.Tombstone) {
				b.tombstones = append(b.tombstones, fragmented...)
			},
		}
		it := &flushableBatchIter{
			batch:   b,
			data:    b.data,
			offsets: rangeDelOffsets,
			cmp:     b.cmp,
			index:   -1,
		}
		for {
			key, val := it.Next()
			if key == nil {
				break
			}
			frag.Add(*key, val)
		}
		frag.Finish()
	}
	return b
}
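
// The function below is an illustrative sketch, not part of the original
// file. It wraps a batch in a flushableBatch, as Batch.Commit does for
// batches larger than MemTableSize/2, and walks the sorted entries through
// the internal iterator. DefaultComparer is assumed to be the package's
// default comparer; the function name is hypothetical.
func exampleFlushableBatch() {
	var b Batch
	_ = b.Set([]byte("b"), []byte("2"), nil)
	_ = b.Set([]byte("a"), []byte("1"), nil)

	// The entries are sorted once, here, rather than being inserted into a
	// memtable arena one at a time.
	fb := newFlushableBatch(&b, DefaultComparer)
	it := fb.newIter(&IterOptions{})
	for key, val := it.First(); key != nil; key, val = it.Next() {
		fmt.Printf("%s => %s\n", key.UserKey, val) // "a" then "b"
	}
	_ = it.Close()
}
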
func (b *flushableBatch) Len() int {
	return len(b.offsets)
}

func (b *flushableBatch) Less(i, j int) bool {
	ei := &b.offsets[i]
	ej := &b.offsets[j]
	ki := b.data[ei.keyStart:ei.keyEnd]
	kj := b.data[ej.keyStart:ej.keyEnd]
	switch c := b.cmp(ki, kj); {
	case c < 0:
		return true
	case c > 0:
		return false
	default:
		return ei.offset > ej.offset
	}
}

func (b *flushableBatch) Swap(i, j int) {
	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
}

func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
	return &flushableBatchIter{
		batch:   b,
		data:    b.data,
		offsets: b.offsets,
		cmp:     b.cmp,
		index:   -1,
		lower:   o.GetLowerBound(),
		upper:   o.GetUpperBound(),
	}
}

func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
	return &flushFlushableBatchIter{
		flushableBatchIter: flushableBatchIter{
			batch:   b,
			data:    b.data,
			offsets: b.offsets,
			cmp:     b.cmp,
			index:   -1,
		},
		bytesIterated: bytesFlushed,
	}
}

func (b *flushableBatch) newRangeDelIter(o *IterOptions) internalIterator {
	if len(b.tombstones) == 0 {
		return nil
	}
	return rangedel.NewIter(b.cmp, b.tombstones)
}

func (b *flushableBatch) totalBytes() uint64 {
	return uint64(len(b.data) - batchHeaderLen)
}

func (b *flushableBatch) flushed() chan struct{} {
	return b.flushedCh
}

func (b *flushableBatch) readyForFlush() bool {
	return true
}

func (b *flushableBatch) logInfo() (uint64, uint64) {
	return b.logNum, 0 /* logSize */
}

// Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
// two in sync.
type flushableBatchIter struct {
	batch   *flushableBatch
	data    []byte
	offsets []flushableBatchEntry
	cmp     Compare
	index   int
	key     InternalKey
	err     error
	lower   []byte
	upper   []byte
}

// flushableBatchIter implements the internalIterator interface.
var _ internalIterator = (*flushableBatchIter)(nil)

func (i *flushableBatchIter) SeekGE(key []byte) (*InternalKey, []byte) {
	ikey := base.MakeSearchKey(key)
	i.index = sort.Search(len(i.offsets), func(j int) bool {
		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) < 0
	})
	if i.index >= len(i.offsets) {
		return nil, nil
	}
	i.key = i.getKey(i.index)
	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
		i.index = len(i.offsets)
		return nil, nil
	}
	return &i.key, i.Value()
}

func (i *flushableBatchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
	return i.SeekGE(key)
}

func (i *flushableBatchIter) SeekLT(key []byte) (*InternalKey, []byte) {
	ikey := base.MakeSearchKey(key)
	i.index = sort.Search(len(i.offsets), func(j int) bool {
		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
	})
	i.index--
	if i.index < 0 {
		return nil, nil
	}
	i.key = i.getKey(i.index)
	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
		i.index = -1
		return nil, nil
	}
	return &i.key, i.Value()
}

func (i *flushableBatchIter) First() (*InternalKey, []byte) {
	if len(i.offsets) == 0 {
		return nil, nil
	}
	i.index = 0
	i.key = i.getKey(i.index)
	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
		i.index = len(i.offsets)
		return nil, nil
	}
	return &i.key, i.Value()
}

func (i *flushableBatchIter) Last() (*InternalKey, []byte) {
	if len(i.offsets) == 0 {
		return nil, nil
	}
	i.index = len(i.offsets) - 1
	i.key = i.getKey(i.index)
	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
		i.index = -1
		return nil, nil
	}
	return &i.key, i.Value()
}

// Note: flushFlushableBatchIter.Next mirrors the implementation of
// flushableBatchIter.Next due to performance. Keep the two in sync.
func (i *flushableBatchIter) Next() (*InternalKey, []byte) {
	if i.index == len(i.offsets) {
		return nil, nil
	}
	i.index++
	if i.index == len(i.offsets) {
		return nil, nil
	}
	i.key = i.getKey(i.index)
	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
		i.index = len(i.offsets)
		return nil, nil
	}
	return &i.key, i.Value()
}

func (i *flushableBatchIter) Prev() (*InternalKey, []byte) {
	if i.index < 0 {
		return nil, nil
	}
	i.index--
	if i.index < 0 {
		return nil, nil
	}
	i.key = i.getKey(i.index)
	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
		i.index = -1
		return nil, nil
	}
	return &i.key, i.Value()
}

func (i *flushableBatchIter) getKey(index int) InternalKey {
	e := &i.offsets[index]
	kind := InternalKeyKind(i.data[e.offset])
	key := i.data[e.keyStart:e.keyEnd]
	return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind)
}

func (i *flushableBatchIter) Key() *InternalKey {
	return &i.key
}

func (i *flushableBatchIter) Value() []byte {
	p := i.data[i.offsets[i.index].offset:]
	if len(p) == 0 {
		i.err = fmt.Errorf("corrupted batch")
		return nil
	}
	kind := InternalKeyKind(p[0])
	if kind > InternalKeyKindMax {
		i.err = fmt.Errorf("corrupted batch")
		return nil
	}
	var value []byte
	var ok bool
	switch kind {
	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
		keyEnd := i.offsets[i.index].keyEnd
		_, value, ok = batchDecodeStr(i.data[keyEnd:])
		if !ok {
			i.err = fmt.Errorf("corrupted batch")
			return nil
		}
	}
	return value
}

func (i *flushableBatchIter) Valid() bool {
	return i.index >= 0 && i.index < len(i.offsets)
}

func (i *flushableBatchIter) Error() error {
	return i.err
}

func (i *flushableBatchIter) Close() error {
	return i.err
}

func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
	i.lower = lower
	i.upper = upper
}

// flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
// of the number of bytes iterated.
type flushFlushableBatchIter struct {
	flushableBatchIter
	bytesIterated *uint64
}

// flushFlushableBatchIter implements the internalIterator interface.
var _ internalIterator = (*flushFlushableBatchIter)(nil)

func (i *flushFlushableBatchIter) SeekGE(key []byte) (*InternalKey, []byte) {
	panic("pebble: SeekGE unimplemented")
}

func (i *flushFlushableBatchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
	panic("pebble: SeekPrefixGE unimplemented")
}

func (i *flushFlushableBatchIter) SeekLT(key []byte) (*InternalKey, []byte) {
	panic("pebble: SeekLT unimplemented")
}

func (i *flushFlushableBatchIter) First() (*InternalKey, []byte) {
	key, val := i.flushableBatchIter.First()
	if key == nil {
		return nil, nil
	}
	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
	return key, val
}

// Note: flushFlushableBatchIter.Next mirrors the implementation of
// flushableBatchIter.Next due to performance. Keep the two in sync.
func (i *flushFlushableBatchIter) Next() (*InternalKey, []byte) {
	if i.index == len(i.offsets) {
		return nil, nil
	}
	i.index++
	if i.index == len(i.offsets) {
		return nil, nil
	}
	i.key = i.getKey(i.index)
	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
	return &i.key, i.Value()
}

func (i flushFlushableBatchIter) Prev() (*InternalKey, []byte) {
	panic("pebble: Prev unimplemented")
}

func (i flushFlushableBatchIter) valueSize() uint64 {
	p := i.data[i.offsets[i.index].offset:]
	if len(p) == 0 {
		i.err = fmt.Errorf("corrupted batch")
		return 0
	}
	kind := InternalKeyKind(p[0])
	if kind > InternalKeyKindMax {
		i.err = fmt.Errorf("corrupted batch")
		return 0
	}
	var length uint64
	switch kind {
	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
		keyEnd := i.offsets[i.index].keyEnd
		v, n := binary.Uvarint(i.data[keyEnd:])
		if n <= 0 {
			i.err = fmt.Errorf("corrupted batch")
			return 0
		}
		length = v + uint64(n)
	}
	return length
}