github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/batch.go

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  	"math"
    12  	"sort"
    13  	"sync"
    14  	"sync/atomic"
    15  	"unsafe"
    16  
    17  	"github.com/petermattis/pebble/internal/base"
    18  	"github.com/petermattis/pebble/internal/batchskl"
    19  	"github.com/petermattis/pebble/internal/rangedel"
    20  	"github.com/petermattis/pebble/internal/rawalloc"
    21  )
    22  
    23  const (
    24  	batchHeaderLen       = 12
    25  	batchInitialSize     = 1 << 10 // 1 KB
    26  	batchMaxRetainedSize = 1 << 20 // 1 MB
    27  	invalidBatchCount    = 1<<32 - 1
    28  	maxVarintLen32       = 5
    29  )
    30  
    31  // ErrNotIndexed means that a read operation on a batch failed because the
    32  // batch is not indexed and thus doesn't support reads.
    33  var ErrNotIndexed = errors.New("pebble: batch not indexed")
    34  
    35  // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
    36  var ErrInvalidBatch = errors.New("pebble: invalid batch")
    37  
    38  type batchStorage struct {
    39  	// Data is the wire format of a batch's log entry:
    40  	//   - 8 bytes for a sequence number of the first batch element,
    41  	//     or zeroes if the batch has not yet been applied,
    42  	//   - 4 bytes for the count: the number of elements in the batch,
    43  	//     or "\xff\xff\xff\xff" if the batch is invalid,
    44  	//   - count elements, being:
    45  	//     - one byte for the kind
    46  	//     - the varint-string user key,
    47  	//     - the varint-string value (if kind != delete).
    48  	// The sequence number and count are stored in little-endian order.
    49  	data           []byte
    50  	cmp            Compare
    51  	abbreviatedKey AbbreviatedKey
    52  }
    53  
    54  // Get implements Storage.Get, as documented in the pebble/batchskl package.
    55  func (s *batchStorage) Get(offset uint32) InternalKey {
    56  	kind := InternalKeyKind(s.data[offset])
    57  	_, key, ok := batchDecodeStr(s.data[offset+1:])
    58  	if !ok {
    59  		panic(fmt.Sprintf("corrupted batch entry: %d", offset))
    60  	}
    61  	return base.MakeInternalKey(key, uint64(offset)|InternalKeySeqNumBatch, kind)
    62  }
    63  
    64  // AbbreviatedKey implements Storage.AbbreviatedKey, as documented in the
    65  // pebble/batchskl package.
    66  func (s *batchStorage) AbbreviatedKey(key []byte) uint64 {
    67  	return s.abbreviatedKey(key)
    68  }
    69  
    70  // Compare implements Storage.Compare, as documented in the pebble/batchskl
    71  // package.
    72  func (s *batchStorage) Compare(a []byte, b uint32) int {
    73  	// The key "a" is always the search key or the newer key being inserted. If
    74  	// it is equal to the existing key consider it smaller so that it sorts
    75  	// first.
    76  	if s.cmp(a, s.Get(b).UserKey) <= 0 {
    77  		return -1
    78  	}
    79  	return 1
    80  }
    81  
    82  // DeferredBatchOp represents a batch operation (e.g. Set, Merge, Delete) that is
    83  // being inserted into the batch. Indexing is not performed on the specified key
    84  // until Finish is called, hence the name deferred. This struct lets the caller
    85  // copy or encode keys/values directly into the batch representation instead of
    86  // copying into an intermediary buffer and then having pebble.Batch copy from it.
    87  type DeferredBatchOp struct {
    88  	index *batchskl.Skiplist
    89  
    90  	// Key and Value point to parts of the binary batch representation where
    91  	// keys and values should be encoded/copied into. len(Key) and len(Value)
    92  	// bytes must be copied into these slices respectively before calling
    93  	// Finish(). Changing where these slices point to is not allowed.
    94  	Key, Value []byte
    95  	offset     uint32
    96  }
    97  
    98  // Finish completes the addition of this batch operation, and adds it to the
    99  // index if necessary. It must be called exactly once, after the key/value
   100  // data has been copied or encoded into Key and Value. Not calling Finish or
   101  // not copying/encoding keys will result in an incomplete index, and calling
   102  // Finish twice may result in a panic.
   103  func (d DeferredBatchOp) Finish() {
   104  	if d.index != nil {
   105  		if err := d.index.Add(d.offset); err != nil {
   106  			// We never add duplicate entries, so an error should never occur.
   107  			panic(err)
   108  		}
   109  	}
   110  }
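// setDeferredExample is an illustrative sketch (not part of the upstream
// file) of the deferred-op pattern described above: reserve space with
// SetDeferred, copy or encode the key/value directly into the returned
// slices, and then call Finish. Batch.Set does effectively the same thing
// internally.
func setDeferredExample(b *Batch, key, value []byte) error {
	op, err := b.SetDeferred(len(key), len(value), nil)
	if err != nil {
		return err
	}
	// Fill the reserved slices in place rather than building the key and
	// value in an intermediary buffer.
	copy(op.Key, key)
	copy(op.Value, value)
	op.Finish()
	return nil
}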
   111  
   112  // A Batch is a sequence of Sets, Merges, Deletes, and/or DeleteRanges that are
   113  // applied atomically. Batch implements the Reader interface, but only an
   114  // indexed batch supports reading (without error) via Get or NewIter. A
   115  // non-indexed batch will return ErrNotIndexed when read from.
   116  //
   117  // Indexing
   118  //
   119  // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
   120  // allows iteration via an Iterator (see Batch.NewIter). The iterator provides
   121  // a merged view of the operations in the batch and the underlying
   122  // database. This is implemented by treating the batch as an additional layer
   123  // in the LSM where every entry in the batch is considered newer than any entry
   124  // in the underlying database (batch entries have the InternalKeySeqNumBatch
   125  // bit set). By treating the batch as an additional layer in the LSM, iteration
   126  // supports all batch operations (i.e. Set, Merge, Delete, and DeleteRange)
   127  // with minimal effort.
   128  //
   129  // The same key can be operated on multiple times in a batch, though only the
   130  // latest operation will be visible. For example, Set("a", "b"), Delete("a")
   131  // will cause the key "a" to not be visible in the batch. Set("a", "b"),
   132  // Set("a", "c") will cause a read of "a" to return the value "c".
   133  //
   134  // The batch index is implemented via a skiplist (internal/batchskl). While
   135  // the skiplist implementation is very fast, inserting into an indexed batch is
   136  // significantly slower than inserting into a non-indexed batch. Only use an
   137  // indexed batch if you require reading from it.
   138  //
   139  // Atomic commit
   140  //
   141  // The operations in a batch are persisted by calling Batch.Commit which is
   142  // equivalent to calling DB.Apply(batch). A batch is committed atomically by
   143  // writing the internal batch representation to the WAL, adding all of the
   144  // batch operations to the memtable associated with the WAL, and then
   145  // incrementing the visible sequence number so that subsequent reads can see
   146  // the effects of the batch operations. If WriteOptions.Sync is true, a call to
   147  // Batch.Commit will guarantee that the batch is persisted to disk before
   148  // returning. See commitPipeline for more on the implementation details.
   149  //
   150  // Large batches
   151  //
   152  // The size of a batch is limited only by available memory (be aware that
   153  // indexed batches require considerable additional memory for the skiplist
   154  // structure). A given WAL file has a single memtable associated with it (this
   155  // restriction could be removed, but doing so is onerous and complex). And a
   156  // memtable has a fixed size due to the underlying fixed size arena. Note that
   157  // this differs from RocksDB where a memtable can grow arbitrarily large using
   158  // a list of arena chunks. In RocksDB this is accomplished by storing pointers
   159  // in the arena memory, but that isn't possible in Go.
   160  //
   161  // During Batch.Commit, a batch which is larger than a threshold (>
   162  // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
   163  // of memtables. A flushableBatch forces the WAL to be rotated, but that happens
   164  // anyway when the memtable becomes full, so this does not cause significant
   165  // WAL churn. Because the flushableBatch is readable as another layer in the
   166  // LSM, Batch.Commit returns as soon as the flushableBatch has been added to
   167  // the queue of memtables.
   168  //
   169  // Internally, a flushableBatch provides Iterator support by sorting the batch
   170  // contents (the batch is sorted once, when it is added to the memtable
   171  // queue). Sorting the batch contents and insertion of the contents into a
   172  // memtable have the same big-O time, but the constant factor dominates
   173  // here. Sorting is significantly faster and uses significantly less memory.
   174  //
   175  // Internal representation
   176  //
   177  // The internal batch representation is a contiguous byte buffer with a fixed
   178  // 12-byte header, followed by a series of records.
   179  //
   180  //   +-------------+------------+--- ... ---+
   181  //   | SeqNum (8B) | Count (4B) |  Entries  |
   182  //   +-------------+------------+--- ... ---+
   183  //
   184  // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
   185  // strings (varstring):
   186  //
   187  //   +-----------+-----------------+-------------------+
   188  //   | Kind (1B) | Key (varstring) | Value (varstring) |
   189  //   +-----------+-----------------+-------------------+
   190  //
   191  // A varstring is a varint32 followed by N bytes of data. The Kind tags are
   192  // exactly those specified by InternalKeyKind. The following table shows the
   193  // format for records of each kind:
   194  //
   195  //   InternalKeyKindDelete       varstring
   196  //   InternalKeyKindLogData      varstring
   197  //   InternalKeyKindSet          varstring varstring
   198  //   InternalKeyKindMerge        varstring varstring
   199  //   InternalKeyKindRangeDelete  varstring varstring
   200  //
   201  // The intuitive understanding here is that the arguments to Delete(), Set(),
   202  // Merge(), and DeleteRange() are encoded into the batch.
   203  //
   204  // The internal batch representation is the on-disk format for a batch in the
   205  // WAL, and thus stable. New record kinds may be added, but the existing ones
   206  // will not be modified.
   207  type Batch struct {
   208  	storage batchStorage
   209  
   210  	memTableSize uint32
   211  
   212  	// The db to which the batch will be committed.
   213  	db *DB
   214  
   215  	// The count of records in the batch. This count will be stored in the batch
   216  	// data whenever Repr() is called.
   217  	count uint32
   218  
   219  	// A deferredOp struct, stored in the Batch so that a pointer can be returned
   220  	// from the *Deferred() methods rather than a value.
   221  	deferredOp DeferredBatchOp
   222  
   223  	// An optional skiplist keyed by offset into data of the entry.
   224  	index         *batchskl.Skiplist
   225  	rangeDelIndex *batchskl.Skiplist
   226  
   227  	// Fragmented range deletion tombstones. Cached the first time a range
   228  	// deletion iterator is requested. The cache is invalidated whenever a new
   229  	// range deletion is added to the batch.
   230  	tombstones []rangedel.Tombstone
   231  
   232  	// The flushableBatch wrapper if the batch is too large to fit in the
   233  	// memtable.
   234  	flushable *flushableBatch
   235  
   236  	commit  sync.WaitGroup
   237  	applied uint32 // updated atomically
   238  }
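// batchCommitExample is an illustrative sketch (not part of the upstream
// file) of the write path described in the comment above: accumulate
// operations in a batch and commit them atomically. It uses the
// package-internal newBatch constructor; external callers would obtain the
// batch from DB.NewBatch.
func batchCommitExample(db *DB) error {
	b := newBatch(db)
	defer b.Close()
	if err := b.Set([]byte("a"), []byte("1"), nil); err != nil {
		return err
	}
	if err := b.Delete([]byte("b"), nil); err != nil {
		return err
	}
	// Commit writes the batch to the WAL and memtable. Sync: true waits for
	// the WAL write to be durable before returning.
	return b.Commit(&WriteOptions{Sync: true})
}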
   239  
   240  var _ Reader = (*Batch)(nil)
   241  var _ Writer = (*Batch)(nil)
   242  
   243  var batchPool = sync.Pool{
   244  	New: func() interface{} {
   245  		return &Batch{}
   246  	},
   247  }
   248  
   249  type indexedBatch struct {
   250  	batch Batch
   251  	index batchskl.Skiplist
   252  }
   253  
   254  var indexedBatchPool = sync.Pool{
   255  	New: func() interface{} {
   256  		return &indexedBatch{}
   257  	},
   258  }
   259  
   260  func newBatch(db *DB) *Batch {
   261  	b := batchPool.Get().(*Batch)
   262  	b.db = db
   263  	return b
   264  }
   265  
   266  func newIndexedBatch(db *DB, comparer *Comparer) *Batch {
   267  	i := indexedBatchPool.Get().(*indexedBatch)
   268  	i.batch.storage.cmp = comparer.Compare
   269  	i.batch.storage.abbreviatedKey = comparer.AbbreviatedKey
   270  	i.batch.db = db
   271  	i.batch.index = &i.index
   272  	i.batch.index.Reset(&i.batch.storage, 0)
   273  	return &i.batch
   274  }
   275  
   276  func (b *Batch) release() {
   277  	// NB: This is ugly, but necessary so that we can use atomic.StoreUint32 for
   278  	// the Batch.applied field. Without using an atomic to clear that field the
   279  	// Go race detector complains.
   280  	b.Reset()
   281  	b.storage.cmp = nil
   282  	b.storage.abbreviatedKey = nil
   283  	b.memTableSize = 0
   284  
   285  	b.flushable = nil
   286  	b.commit = sync.WaitGroup{}
   287  	atomic.StoreUint32(&b.applied, 0)
   288  
   289  	if b.db == nil {
   290  		// Batch not created using newBatch or newIndexedBatch, so don't put it
   291  		// back in the pool.
   292  		return
   293  	}
   294  	b.db = nil
   295  
   296  	if b.index == nil {
   297  		batchPool.Put(b)
   298  	} else {
   299  		*b.index = batchskl.Skiplist{}
   300  		b.index, b.rangeDelIndex = nil, nil
   301  		indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b)))
   302  	}
   303  }
   304  
   305  func (b *Batch) refreshMemTableSize() {
   306  	b.memTableSize = 0
   307  	for r := b.Reader(); ; {
   308  		_, key, value, ok := r.Next()
   309  		if !ok {
   310  			break
   311  		}
   312  		b.memTableSize += memTableEntrySize(len(key), len(value))
   313  	}
   314  }
   315  
   316  // Apply the operations contained in the batch to the receiver batch.
   317  //
   318  // It is safe to modify the contents of the arguments after Apply returns.
   319  func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error {
   320  	if len(batch.storage.data) == 0 {
   321  		return nil
   322  	}
   323  	if len(batch.storage.data) < batchHeaderLen {
   324  		return errors.New("pebble: invalid batch")
   325  	}
   326  
   327  	offset := len(b.storage.data)
   328  	if offset == 0 {
   329  		b.init(offset)
   330  		offset = batchHeaderLen
   331  	}
   332  	b.storage.data = append(b.storage.data, batch.storage.data[batchHeaderLen:]...)
   333  
   334  	b.setCount(b.Count() + batch.Count())
   335  
   336  	for iter := BatchReader(b.storage.data[offset:]); len(iter) > 0; {
   337  		offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.storage.data[0]))
   338  		kind, key, value, ok := iter.Next()
   339  		if !ok {
   340  			break
   341  		}
   342  		if b.index != nil {
   343  			var err error
   344  			if kind == InternalKeyKindRangeDelete {
   345  				if b.rangeDelIndex == nil {
   346  					b.rangeDelIndex = batchskl.NewSkiplist(&b.storage, 0)
   347  				}
   348  				err = b.rangeDelIndex.Add(uint32(offset))
   349  			} else {
   350  				err = b.index.Add(uint32(offset))
   351  			}
   352  			if err != nil {
   353  				// We never add duplicate entries, so an error should never occur.
   354  				panic(err)
   355  			}
   356  		}
   357  		b.memTableSize += memTableEntrySize(len(key), len(value))
   358  	}
   359  	return nil
   360  }
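// applyExample is an illustrative sketch (not part of the upstream file)
// showing Batch.Apply: everything after src's 12-byte header is appended to
// dst and, if dst is indexed, indexed as well. src itself is left unchanged.
func applyExample(dst, src *Batch) error {
	if err := src.Set([]byte("k"), []byte("v"), nil); err != nil {
		return err
	}
	return dst.Apply(src, nil)
}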
   361  
   362  // Get gets the value for the given key. It returns ErrNotFound if the DB
   363  // does not contain the key.
   364  //
   365  // The caller should not modify the contents of the returned slice, but
   366  // it is safe to modify the contents of the argument after Get returns.
   367  func (b *Batch) Get(key []byte) (value []byte, err error) {
   368  	if b.index == nil {
   369  		return nil, ErrNotIndexed
   370  	}
   371  	return b.db.getInternal(key, b, nil /* snapshot */)
   372  }
   373  
   374  func (b *Batch) prepareRecord(keyLen, valueLen int, kind InternalKeyKind) {
   375  	pos := len(b.storage.data)
   376  	b.deferredOp.offset = uint32(pos)
   377  	b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen)
   378  	b.storage.data[pos] = byte(kind)
   379  	pos++
   380  
   381  	varlen1 := putUvarint32(b.storage.data[pos:], uint32(keyLen))
   382  	pos += varlen1
   383  	b.deferredOp.Key = b.storage.data[pos : pos+keyLen]
   384  	pos += keyLen
   385  
   386  	varlen2 := putUvarint32(b.storage.data[pos:], uint32(valueLen))
   387  	pos += varlen2
   388  	b.deferredOp.Value = b.storage.data[pos : pos+valueLen]
   389  	pos += valueLen
   390  	b.storage.data = b.storage.data[:len(b.storage.data)-(2*maxVarintLen32-varlen1-varlen2)]
   391  }
   392  
   393  // Set adds an action to the batch that sets the key to map to the value.
   394  //
   395  // It is safe to modify the contents of the arguments after Set returns.
   396  func (b *Batch) Set(key, value []byte, _ *WriteOptions) error {
   397  	deferredOp, err := b.SetDeferred(len(key), len(value), nil)
   398  	if err != nil {
   399  		return err
   400  	}
   401  	copy(deferredOp.Key, key)
   402  	copy(deferredOp.Value, value)
   403  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   404  	// in go1.13 will remove the need for this.
   405  	if b.index != nil {
   406  		if err := b.index.Add(deferredOp.offset); err != nil {
   407  			// We never add duplicate entries, so an error should never occur.
   408  			panic(err)
   409  		}
   410  	}
   411  	return nil
   412  }
   413  
   414  // SetDeferred is similar to Set in that it adds a set operation to the batch,
   415  // except it only takes in key/value lengths instead of complete slices,
   416  // letting the caller encode into those objects and then call Finish() on the
   417  // returned object.
   418  func (b *Batch) SetDeferred(keyLen, valueLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
   419  	// Code duplication between Set and SetDeferred lets us preserve the fast
   420  	// path where the entire byte slices are available (in the Set case).
   421  	if len(b.storage.data) == 0 {
   422  		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   423  	}
   424  	if !b.increment() {
   425  		return nil, ErrInvalidBatch
   426  	}
   427  
   428  	b.memTableSize += memTableEntrySize(keyLen, valueLen)
   429  	b.prepareRecord(keyLen, valueLen, InternalKeyKindSet)
   430  	b.deferredOp.index = b.index
   431  	return &b.deferredOp, nil
   432  }
   433  
   434  // Merge adds an action to the batch that merges the value at key with the new
   435  // value. The details of the merge are dependent upon the configured merge
   436  // operator.
   437  //
   438  // It is safe to modify the contents of the arguments after Merge returns.
   439  func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error {
   440  	deferredOp, err := b.MergeDeferred(len(key), len(value), nil)
   441  	if err != nil {
   442  		return err
   443  	}
   444  	copy(deferredOp.Key, key)
   445  	copy(deferredOp.Value, value)
   446  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   447  	// in go1.13 will remove the need for this.
   448  	if b.index != nil {
   449  		if err := b.index.Add(deferredOp.offset); err != nil {
   450  			// We never add duplicate entries, so an error should never occur.
   451  			panic(err)
   452  		}
   453  	}
   454  	return nil
   455  }
   456  
   457  // MergeDeferred is similar to Merge in that it adds a merge operation to the
   458  // batch, except it only takes in key/value lengths instead of complete slices,
   459  // letting the caller encode into those objects and then call Finish() on the
   460  // returned object.
   461  func (b *Batch) MergeDeferred(keyLen, valueLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
   462  	// Code duplication with Merge is so that the Merge case (where byte slices
   463  	// are provided) can preserve the fast path.
   464  	if len(b.storage.data) == 0 {
   465  		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   466  	}
   467  	if !b.increment() {
   468  		return nil, ErrInvalidBatch
   469  	}
   470  
   471  	b.memTableSize += memTableEntrySize(keyLen, valueLen)
   472  	b.prepareRecord(keyLen, valueLen, InternalKeyKindMerge)
   473  	b.deferredOp.index = b.index
   474  	return &b.deferredOp, nil
   475  }
   476  
   477  // Delete adds an action to the batch that deletes the entry for key.
   478  //
   479  // It is safe to modify the contents of the arguments after Delete returns.
   480  func (b *Batch) Delete(key []byte, _ *WriteOptions) error {
   481  	deferredOp, err := b.DeleteDeferred(len(key), nil)
   482  	if err != nil {
   483  		return err
   484  	}
   485  	copy(deferredOp.Key, key)
   486  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   487  	// in go1.13 will remove the need for this.
   488  	if b.index != nil {
   489  		if err := b.index.Add(deferredOp.offset); err != nil {
   490  			// We never add duplicate entries, so an error should never occur.
   491  			panic(err)
   492  		}
   493  	}
   494  	return nil
   495  }
   496  
   497  // DeleteDeferred is similar to Delete in that it adds a delete operation to
   498  // the batch, except it only takes in a key length instead of a complete
   499  // slice, letting the caller encode the key into the returned object and then
   500  // call Finish() on it.
   501  func (b *Batch) DeleteDeferred(keyLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
   502  	// Code duplication with Delete is so that the Delete case (where byte
   503  	// slices are provided) can preserve the fast path.
   504  	if len(b.storage.data) == 0 {
   505  		b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen)
   506  	}
   507  	if !b.increment() {
   508  		return nil, ErrInvalidBatch
   509  	}
   510  
   511  	b.memTableSize += memTableEntrySize(keyLen, 0)
   512  
   513  	pos := len(b.storage.data)
   514  	b.deferredOp.offset = uint32(pos)
   515  	b.grow(1 + maxVarintLen32 + keyLen)
   516  	b.storage.data[pos] = byte(InternalKeyKindDelete)
   517  	pos++
   518  	varlen1 := putUvarint32(b.storage.data[pos:], uint32(keyLen))
   519  	pos += varlen1
   520  	b.deferredOp.Key = b.storage.data[pos : pos+keyLen]
   521  	b.deferredOp.Value = nil
   522  
   523  	b.storage.data = b.storage.data[:len(b.storage.data)-(maxVarintLen32-varlen1)]
   524  
   525  	b.deferredOp.index = b.index
   526  	return &b.deferredOp, nil
   527  }
   528  
   529  // DeleteRange deletes all of the keys (and values) in the range [start,end)
   530  // (inclusive on start, exclusive on end).
   531  //
   532  // It is safe to modify the contents of the arguments after DeleteRange
   533  // returns.
   534  func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error {
   535  	deferredOp, err := b.DeleteRangeDeferred(len(start), len(end), nil)
   536  	if err != nil {
   537  		return err
   538  	}
   539  	copy(deferredOp.Key, start)
   540  	copy(deferredOp.Value, end)
   541  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   542  	// in go1.13 will remove the need for this.
   543  	if deferredOp.index != nil {
   544  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   545  			// We never add duplicate entries, so an error should never occur.
   546  			panic(err)
   547  		}
   548  	}
   549  	return nil
   550  }
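// deleteRangeExample is an illustrative sketch (not part of the upstream
// file): a single DeleteRange tombstone covering ["b", "d") hides "b" and
// "c", but leaves "d" visible because the end key is exclusive.
func deleteRangeExample(b *Batch) error {
	for _, k := range []string{"a", "b", "c", "d"} {
		if err := b.Set([]byte(k), []byte("v"), nil); err != nil {
			return err
		}
	}
	return b.DeleteRange([]byte("b"), []byte("d"), nil)
}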
   551  
   552  // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range
   553  // operation to the batch, except it only takes in key lengths instead of
   554  // complete slices, letting the caller encode into those objects and then call
   555  // Finish() on the returned object. Note that DeferredBatchOp.Key should be
   556  // populated with the start key, and DeferredBatchOp.Value should be populated
   557  // with the end key.
   558  func (b *Batch) DeleteRangeDeferred(startLen, endLen int, _ *WriteOptions) (*DeferredBatchOp, error) {
   559  	if len(b.storage.data) == 0 {
   560  		b.init(startLen + endLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   561  	}
   562  	if !b.increment() {
   563  		return nil, ErrInvalidBatch
   564  	}
   565  
   566  	b.memTableSize += memTableEntrySize(startLen, endLen)
   567  	b.prepareRecord(startLen, endLen, InternalKeyKindRangeDelete)
   568  
   569  	if b.index != nil {
   570  		b.tombstones = nil
   571  		// Range deletions are rare, so we lazily allocate the index for them.
   572  		if b.rangeDelIndex == nil {
   573  			b.rangeDelIndex = batchskl.NewSkiplist(&b.storage, 0)
   574  		}
   575  		b.deferredOp.index = b.rangeDelIndex
   576  	}
   577  	return &b.deferredOp, nil
   578  }
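// deleteRangeDeferredExample is an illustrative sketch (not part of the
// upstream file): with DeleteRangeDeferred the start key is copied into Key
// and the end key into Value, as noted in the comment above.
func deleteRangeDeferredExample(b *Batch, start, end []byte) error {
	op, err := b.DeleteRangeDeferred(len(start), len(end), nil)
	if err != nil {
		return err
	}
	copy(op.Key, start) // start key
	copy(op.Value, end) // end key
	op.Finish()
	return nil
}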
   579  
   580  // LogData adds the specified data to the batch. The data will be written to the
   581  // WAL, but not added to memtables or sstables. Log data is never indexed,
   582  // which makes it useful for testing WAL performance.
   583  //
   584  // It is safe to modify the contents of the argument after LogData returns.
   585  func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
   586  	if len(b.storage.data) == 0 {
   587  		b.init(len(data) + binary.MaxVarintLen64 + batchHeaderLen)
   588  	}
   589  	// Since LogData only writes to the WAL and does not affect the memtable,
   590  	// we don't increment b.count here. b.count only tracks operations that
   591  	// are applied to the memtable.
   592  
   593  	pos := len(b.storage.data)
   594  	b.grow(1 + maxVarintLen32 + len(data))
   595  	b.storage.data[pos] = byte(InternalKeyKindLogData)
   596  	_, varlen1 := b.copyStr(pos+1, data)
   597  	b.storage.data = b.storage.data[:len(b.storage.data)-(maxVarintLen32-varlen1)]
   598  	return nil
   599  }
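// logDataExample is an illustrative sketch (not part of the upstream file):
// a LogData entry rides along in the WAL with the rest of the batch but is
// not added to the memtable and does not affect Count.
func logDataExample(b *Batch) error {
	if err := b.Set([]byte("k"), []byte("v"), nil); err != nil {
		return err
	}
	if err := b.LogData([]byte("trace-id=1234"), nil); err != nil {
		return err
	}
	// Count reflects only the Set; the LogData entry is not counted.
	_ = b.Count()
	return nil
}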
   600  
   601  // Empty returns true if the batch is empty, and false otherwise.
   602  func (b *Batch) Empty() bool {
   603  	return len(b.storage.data) <= batchHeaderLen
   604  }
   605  
   606  // Repr returns the underlying batch representation. It is not safe to modify
   607  // the contents. Reset() will not change the contents of the returned value,
   608  // though any other mutation operation may do so.
   609  func (b *Batch) Repr() []byte {
   610  	if len(b.storage.data) == 0 {
   611  		b.init(batchHeaderLen)
   612  	}
   613  	binary.LittleEndian.PutUint32(b.countData(), b.count)
   614  	return b.storage.data
   615  }
   616  
   617  // SetRepr sets the underlying batch representation. The batch takes ownership
   618  // of the supplied slice. It is not safe to modify it afterwards until the
   619  // Batch is no longer in use.
   620  func (b *Batch) SetRepr(data []byte) error {
   621  	if len(data) < batchHeaderLen {
   622  		return fmt.Errorf("invalid batch")
   623  	}
   624  	b.storage.data = data
   625  	b.count = binary.LittleEndian.Uint32(b.countData())
   626  	b.refreshMemTableSize()
   627  	return nil
   628  }
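// reprRoundTripExample is an illustrative sketch (not part of the upstream
// file) of moving a batch's encoded representation between Batch values, for
// example when shipping a batch over the network or replaying it from a log.
// SetRepr takes ownership of the supplied slice, so the representation is
// copied first.
func reprRoundTripExample(src *Batch) (*Batch, error) {
	repr := src.Repr() // 12-byte header (seqnum, count) followed by the entries
	data := append([]byte(nil), repr...)

	dst := new(Batch)
	if err := dst.SetRepr(data); err != nil {
		return nil, err
	}
	// dst now reports the same Count as src and can be passed to DB.Apply.
	return dst, nil
}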
   629  
   630  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
   631  // return false). The iterator can be positioned via a call to SeekGE,
   632  // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
   633  func (b *Batch) NewIter(o *IterOptions) *Iterator {
   634  	if b.index == nil {
   635  		return &Iterator{err: ErrNotIndexed}
   636  	}
   637  	return b.db.newIterInternal(b.newInternalIter(o),
   638  		b.newRangeDelIter(o), nil /* snapshot */, o)
   639  }
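// indexedReadExample is an illustrative sketch (not part of the upstream
// file) of reading through an indexed batch, which merges the batch contents
// with the underlying database as described in the Batch comment. It assumes
// the Iterator's First/Valid/Next/Key/Value/Close methods defined elsewhere
// in this package.
func indexedReadExample(db *DB) error {
	b := db.NewIndexedBatch()
	defer b.Close()
	if err := b.Set([]byte("a"), []byte("1"), nil); err != nil {
		return err
	}

	iter := b.NewIter(nil)
	for valid := iter.First(); valid; valid = iter.Next() {
		_, _ = iter.Key(), iter.Value() // batch entries shadow DB entries
	}
	return iter.Close()
}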
   640  
   641  // newInternalIter creates a new internalIterator that iterates over the
   642  // contents of the batch.
   643  func (b *Batch) newInternalIter(o *IterOptions) internalIterator {
   644  	if b.index == nil {
   645  		return newErrorIter(ErrNotIndexed)
   646  	}
   647  	return &batchIter{
   648  		cmp:   b.storage.cmp,
   649  		batch: b,
   650  		iter:  b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()),
   651  	}
   652  }
   653  
   654  func (b *Batch) newRangeDelIter(o *IterOptions) internalIterator {
   655  	if b.index == nil {
   656  		return newErrorIter(ErrNotIndexed)
   657  	}
   658  	if b.rangeDelIndex == nil {
   659  		return nil
   660  	}
   661  
   662  	// Fragment the range tombstones the first time a range deletion iterator is
   663  	// requested. The cached tombstones are invalidated if another range deletion
   664  	// tombstone is added to the batch.
   665  	if b.tombstones == nil {
   666  		frag := &rangedel.Fragmenter{
   667  			Cmp: b.storage.cmp,
   668  			Emit: func(fragmented []rangedel.Tombstone) {
   669  				b.tombstones = append(b.tombstones, fragmented...)
   670  			},
   671  		}
   672  		it := &batchIter{
   673  			cmp:   b.storage.cmp,
   674  			batch: b,
   675  			iter:  b.rangeDelIndex.NewIter(nil, nil),
   676  		}
   677  		for {
   678  			key, val := it.Next()
   679  			if key == nil {
   680  				break
   681  			}
   682  			frag.Add(*key, val)
   683  		}
   684  		frag.Finish()
   685  	}
   686  
   687  	return rangedel.NewIter(b.storage.cmp, b.tombstones)
   688  }
   689  
   690  // Commit applies the batch to its parent writer.
   691  func (b *Batch) Commit(o *WriteOptions) error {
   692  	return b.db.Apply(b, o)
   693  }
   694  
   695  // Close closes the batch without committing it.
   696  func (b *Batch) Close() error {
   697  	b.release()
   698  	return nil
   699  }
   700  
   701  // Indexed returns true if the batch is indexed (i.e. supports read
   702  // operations).
   703  func (b *Batch) Indexed() bool {
   704  	return b.index != nil
   705  }
   706  
   707  func (b *Batch) init(cap int) {
   708  	n := batchInitialSize
   709  	for n < cap {
   710  		n *= 2
   711  	}
   712  	b.storage.data = rawalloc.New(batchHeaderLen, n)
   713  	b.setCount(0)
   714  	b.setSeqNum(0)
   715  	b.storage.data = b.storage.data[:batchHeaderLen]
   716  }
   717  
   718  // Reset clears the underlying byte slice and effectively empties the batch for
   719  // reuse. Used in cases where Batch is only being used to build a batch, and
   720  // where the end result is a Repr() call, not a Commit call or a Close call.
   721  // Commits and Closes take care of releasing resources when appropriate.
   722  func (b *Batch) Reset() {
   723  	if b.storage.data != nil {
   724  		if cap(b.storage.data) > batchMaxRetainedSize {
   725  			// If the capacity of the buffer is larger than our maximum
   726  			// retention size, don't re-use it. Let it be GC-ed instead.
   727  			// This prevents the memory from an unusually large batch from
   728  			// being held on to indefinitely.
   729  			b.storage.data = nil
   730  		} else {
   731  			// Otherwise, reset the buffer for re-use.
   732  			b.storage.data = b.storage.data[:batchHeaderLen]
   733  		}
   734  		b.count = 0
   735  	}
   736  }
   737  
   738  // seqNumData returns the 8 byte little-endian sequence number. Zero means that
   739  // the batch has not yet been applied.
   740  func (b *Batch) seqNumData() []byte {
   741  	return b.storage.data[:8]
   742  }
   743  
   744  // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
   745  // means that the batch is invalid.
   746  func (b *Batch) countData() []byte {
   747  	return b.storage.data[8:12]
   748  }
   749  
   750  func (b *Batch) increment() (ok bool) {
   751  	if b.count == math.MaxUint32 {
   752  		return false
   753  	}
   754  	b.count++
   755  	return true
   756  }
   757  
   758  func (b *Batch) grow(n int) {
   759  	newSize := len(b.storage.data) + n
   760  	if newSize > cap(b.storage.data) {
   761  		newCap := 2 * cap(b.storage.data)
   762  		for newCap < newSize {
   763  			newCap *= 2
   764  		}
   765  		newData := rawalloc.New(len(b.storage.data), newCap)
   766  		copy(newData, b.storage.data)
   767  		b.storage.data = newData
   768  	}
   769  	b.storage.data = b.storage.data[:newSize]
   770  }
   771  
   772  func putUvarint32(buf []byte, x uint32) int {
   773  	i := 0
   774  	for x >= 0x80 {
   775  		buf[i] = byte(x) | 0x80
   776  		x >>= 7
   777  		i++
   778  	}
   779  	buf[i] = byte(x)
   780  	return i + 1
   781  }
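// putUvarint32 emits the standard little-endian base-128 varint encoding. For
// example, the length 300 (binary 1_0010_1100) encodes to the two bytes
// 0xac 0x02. A small illustrative check (not part of the upstream file):
func putUvarint32Example() {
	var buf [maxVarintLen32]byte
	n := putUvarint32(buf[:], 300)
	_ = buf[:n] // n == 2, buf[:n] == []byte{0xac, 0x02}
}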
   782  
   783  func (b *Batch) copyStr(pos int, s []byte) (int, int) {
   784  	n := putUvarint32(b.storage.data[pos:], uint32(len(s)))
   785  	return pos + n + copy(b.storage.data[pos+n:], s), n
   786  }
   787  
   788  func (b *Batch) setSeqNum(seqNum uint64) {
   789  	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
   790  }
   791  
   792  // SeqNum returns the batch sequence number which is applied to the first
   793  // record in the batch. The sequence number is incremented for each subsequent
   794  // record.
   795  func (b *Batch) SeqNum() uint64 {
   796  	return binary.LittleEndian.Uint64(b.seqNumData())
   797  }
   798  
   799  func (b *Batch) setCount(v uint32) {
   800  	b.count = v
   801  }
   802  
   803  // Count returns the count of memtable-modifying operations in this batch. All
   804  // operations with the exception of LogData increment this count.
   805  func (b *Batch) Count() uint32 {
   806  	return b.count
   807  }
   808  
   809  // Reader returns a BatchReader for the current batch contents. If the batch is
   810  // mutated, the new entries will not be visible to the reader.
   811  func (b *Batch) Reader() BatchReader {
   812  	return b.storage.data[batchHeaderLen:]
   813  }
   814  
   815  func batchDecode(data []byte, offset uint32) (kind InternalKeyKind, ukey []byte, value []byte, ok bool) {
   816  	p := data[offset:]
   817  	if len(p) == 0 {
   818  		return 0, nil, nil, false
   819  	}
   820  	kind, p = InternalKeyKind(p[0]), p[1:]
   821  	if kind > InternalKeyKindMax {
   822  		return 0, nil, nil, false
   823  	}
   824  	p, ukey, ok = batchDecodeStr(p)
   825  	if !ok {
   826  		return 0, nil, nil, false
   827  	}
   828  	switch kind {
   829  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
   830  		_, value, ok = batchDecodeStr(p)
   831  		if !ok {
   832  			return 0, nil, nil, false
   833  		}
   834  	}
   835  	return kind, ukey, value, true
   836  }
   837  
   838  func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) {
   839  	v, n := binary.Uvarint(data)
   840  	if n <= 0 {
   841  		return nil, nil, false
   842  	}
   843  	data = data[n:]
   844  	if v > uint64(len(data)) {
   845  		return nil, nil, false
   846  	}
   847  	return data[v:], data[:v], true
   848  }
   849  
   850  // BatchReader iterates over the entries contained in a batch.
   851  type BatchReader []byte
   852  
   853  // MakeBatchReader constructs a BatchReader from a batch representation. The
   854  // header (containing the batch count and seqnum) is ignored.
   855  func MakeBatchReader(repr []byte) BatchReader {
   856  	return repr[batchHeaderLen:]
   857  }
   858  
   859  // Next returns the next entry in this batch. The final return value is false
   860  // if the batch is corrupt or the end of the batch (len(*r) == 0) has been reached.
   861  func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool) {
   862  	p := *r
   863  	if len(p) == 0 {
   864  		return 0, nil, nil, false
   865  	}
   866  	kind, *r = InternalKeyKind(p[0]), p[1:]
   867  	if kind > InternalKeyKindMax {
   868  		return 0, nil, nil, false
   869  	}
   870  	ukey, ok = r.nextStr()
   871  	if !ok {
   872  		return 0, nil, nil, false
   873  	}
   874  	switch kind {
   875  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
   876  		value, ok = r.nextStr()
   877  		if !ok {
   878  			return 0, nil, nil, false
   879  		}
   880  	}
   881  	return kind, ukey, value, true
   882  }
   883  
   884  func (r *BatchReader) nextStr() (s []byte, ok bool) {
   885  	p := *r
   886  	u, numBytes := binary.Uvarint(p)
   887  	if numBytes <= 0 {
   888  		return nil, false
   889  	}
   890  	p = p[numBytes:]
   891  	if u > uint64(len(p)) {
   892  		return nil, false
   893  	}
   894  	s, *r = p[:u], p[u:]
   895  	return s, true
   896  }
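// batchReaderExample is an illustrative sketch (not part of the upstream
// file) of walking a batch's entries with BatchReader, mirroring the loops
// used by Batch.Apply and newFlushableBatch.
func batchReaderExample(b *Batch) error {
	for r := b.Reader(); len(r) > 0; {
		kind, key, value, ok := r.Next()
		if !ok {
			return ErrInvalidBatch
		}
		_, _, _ = kind, key, value // e.g. InternalKeyKindSet, "a", "1"
	}
	return nil
}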
   897  
   898  // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the
   899  // two in sync.
   900  type batchIter struct {
   901  	cmp   Compare
   902  	batch *Batch
   903  	iter  batchskl.Iterator
   904  	err   error
   905  }
   906  
   907  // batchIter implements the internalIterator interface.
   908  var _ internalIterator = (*batchIter)(nil)
   909  
   910  func (i *batchIter) SeekGE(key []byte) (*InternalKey, []byte) {
   911  	ikey := i.iter.SeekGE(key)
   912  	if ikey == nil {
   913  		return nil, nil
   914  	}
   915  	return ikey, i.Value()
   916  }
   917  
   918  func (i *batchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
   919  	return i.SeekGE(key)
   920  }
   921  
   922  func (i *batchIter) SeekLT(key []byte) (*InternalKey, []byte) {
   923  	ikey := i.iter.SeekLT(key)
   924  	if ikey == nil {
   925  		return nil, nil
   926  	}
   927  	return ikey, i.Value()
   928  }
   929  
   930  func (i *batchIter) First() (*InternalKey, []byte) {
   931  	ikey := i.iter.First()
   932  	if ikey == nil {
   933  		return nil, nil
   934  	}
   935  	return ikey, i.Value()
   936  }
   937  
   938  func (i *batchIter) Last() (*InternalKey, []byte) {
   939  	ikey := i.iter.Last()
   940  	if ikey == nil {
   941  		return nil, nil
   942  	}
   943  	return ikey, i.Value()
   944  }
   945  
   946  func (i *batchIter) Next() (*InternalKey, []byte) {
   947  	ikey := i.iter.Next()
   948  	if ikey == nil {
   949  		return nil, nil
   950  	}
   951  	return ikey, i.Value()
   952  }
   953  
   954  func (i *batchIter) Prev() (*InternalKey, []byte) {
   955  	ikey := i.iter.Prev()
   956  	if ikey == nil {
   957  		return nil, nil
   958  	}
   959  	return ikey, i.Value()
   960  }
   961  
   962  func (i *batchIter) Key() *InternalKey {
   963  	return i.iter.Key()
   964  }
   965  
   966  func (i *batchIter) Value() []byte {
   967  	_, _, value, ok := batchDecode(i.batch.storage.data, i.iter.KeyOffset())
   968  	if !ok {
   969  		i.err = fmt.Errorf("corrupted batch")
   970  	}
   971  	return value
   972  }
   973  
   974  func (i *batchIter) Valid() bool {
   975  	return i.iter.Valid()
   976  }
   977  
   978  func (i *batchIter) Error() error {
   979  	return i.err
   980  }
   981  
   982  func (i *batchIter) Close() error {
   983  	_ = i.iter.Close()
   984  	return i.err
   985  }
   986  
   987  func (i *batchIter) SetBounds(lower, upper []byte) {
   988  	i.iter.SetBounds(lower, upper)
   989  }
   990  
   991  type flushableBatchEntry struct {
   992  	offset   uint32
   993  	index    uint32
   994  	keyStart uint32
   995  	keyEnd   uint32
   996  }
   997  
   998  // flushableBatch wraps an existing batch and provides the interfaces needed
   999  // for making the batch flushable (i.e. able to mimic a memtable).
  1000  type flushableBatch struct {
  1001  	cmp  Compare
  1002  	data []byte
  1003  
  1004  	// The base sequence number for the entries in the batch. This is the same
  1005  	// value as Batch.SeqNum() and is cached here for performance.
  1006  	seqNum uint64
  1007  
  1008  	// A slice of offsets and indices for the entries in the batch. Used to
  1009  	// implement flushableBatchIter. Unlike the indexing on a normal batch, a
  1010  	// flushable batch is indexed such that batch entry i will be given the
  1011  	// sequence number flushableBatch.seqNum+i.
  1012  	offsets []flushableBatchEntry
  1013  
  1014  	// Fragmented range deletion tombstones.
  1015  	tombstones []rangedel.Tombstone
  1016  
  1017  	flushedCh chan struct{}
  1018  
  1019  	logNum uint64
  1020  }
  1021  
  1022  var _ flushable = (*flushableBatch)(nil)
  1023  
  1024  // newFlushableBatch creates a new batch that implements the flushable
  1025  // interface. This allows the batch to act like a memtable and be placed in the
  1026  // queue of flushable memtables. Note that the flushable batch takes ownership
  1027  // of the batch data.
  1028  func newFlushableBatch(batch *Batch, comparer *Comparer) *flushableBatch {
  1029  	b := &flushableBatch{
  1030  		data:      batch.storage.data,
  1031  		cmp:       comparer.Compare,
  1032  		offsets:   make([]flushableBatchEntry, 0, batch.Count()),
  1033  		flushedCh: make(chan struct{}),
  1034  	}
  1035  
  1036  	var index uint32
  1037  	var rangeDelOffsets []flushableBatchEntry
  1038  	for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
  1039  		offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
  1040  		kind, key, _, ok := iter.Next()
  1041  		if !ok {
  1042  			break
  1043  		}
  1044  		entry := flushableBatchEntry{
  1045  			offset: uint32(offset),
  1046  			index:  uint32(index),
  1047  		}
  1048  		if keySize := uint32(len(key)); keySize == 0 {
  1049  			// Must add 2 to the offset. One byte encodes `kind` and the next
  1050  			// byte encodes `0`, which is the length of the key.
  1051  			entry.keyStart = uint32(offset) + 2
  1052  			entry.keyEnd = entry.keyStart
  1053  		} else {
  1054  			entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
  1055  				uintptr(unsafe.Pointer(&b.data[0])))
  1056  			entry.keyEnd = entry.keyStart + keySize
  1057  		}
  1058  		if kind == InternalKeyKindRangeDelete {
  1059  			rangeDelOffsets = append(rangeDelOffsets, entry)
  1060  		} else {
  1061  			b.offsets = append(b.offsets, entry)
  1062  		}
  1063  	}
  1064  
  1065  	// Sort both offsets and rangeDelOffsets; the swaps reuse sort.Sort (which sorts b.offsets) for both slices.
  1066  	sort.Sort(b)
  1067  	rangeDelOffsets, b.offsets = b.offsets, rangeDelOffsets
  1068  	sort.Sort(b)
  1069  	rangeDelOffsets, b.offsets = b.offsets, rangeDelOffsets
  1070  
  1071  	if len(rangeDelOffsets) > 0 {
  1072  		frag := &rangedel.Fragmenter{
  1073  			Cmp: b.cmp,
  1074  			Emit: func(fragmented []rangedel.Tombstone) {
  1075  				b.tombstones = append(b.tombstones, fragmented...)
  1076  			},
  1077  		}
  1078  		it := &flushableBatchIter{
  1079  			batch:   b,
  1080  			data:    b.data,
  1081  			offsets: rangeDelOffsets,
  1082  			cmp:     b.cmp,
  1083  			index:   -1,
  1084  		}
  1085  		for {
  1086  			key, val := it.Next()
  1087  			if key == nil {
  1088  				break
  1089  			}
  1090  			frag.Add(*key, val)
  1091  		}
  1092  		frag.Finish()
  1093  	}
  1094  	return b
  1095  }
  1096  
  1097  func (b *flushableBatch) Len() int {
  1098  	return len(b.offsets)
  1099  }
  1100  
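// Less implements sort.Interface. Entries are ordered by user key; for equal
// user keys the entry with the larger offset (the one added later, and hence
// the one assigned the larger seqNum+index sequence number) sorts first,
// matching the internal key ordering in which newer entries precede older
// ones.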
  1101  func (b *flushableBatch) Less(i, j int) bool {
  1102  	ei := &b.offsets[i]
  1103  	ej := &b.offsets[j]
  1104  	ki := b.data[ei.keyStart:ei.keyEnd]
  1105  	kj := b.data[ej.keyStart:ej.keyEnd]
  1106  	switch c := b.cmp(ki, kj); {
  1107  	case c < 0:
  1108  		return true
  1109  	case c > 0:
  1110  		return false
  1111  	default:
  1112  		return ei.offset > ej.offset
  1113  	}
  1114  }
  1115  
  1116  func (b *flushableBatch) Swap(i, j int) {
  1117  	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
  1118  }
  1119  
  1120  func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
  1121  	return &flushableBatchIter{
  1122  		batch:   b,
  1123  		data:    b.data,
  1124  		offsets: b.offsets,
  1125  		cmp:     b.cmp,
  1126  		index:   -1,
  1127  		lower:   o.GetLowerBound(),
  1128  		upper:   o.GetUpperBound(),
  1129  	}
  1130  }
  1131  
  1132  func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
  1133  	return &flushFlushableBatchIter{
  1134  		flushableBatchIter: flushableBatchIter{
  1135  			batch:   b,
  1136  			data:    b.data,
  1137  			offsets: b.offsets,
  1138  			cmp:     b.cmp,
  1139  			index:   -1,
  1140  		},
  1141  		bytesIterated: bytesFlushed,
  1142  	}
  1143  }
  1144  
  1145  func (b *flushableBatch) newRangeDelIter(o *IterOptions) internalIterator {
  1146  	if len(b.tombstones) == 0 {
  1147  		return nil
  1148  	}
  1149  	return rangedel.NewIter(b.cmp, b.tombstones)
  1150  }
  1151  
  1152  func (b *flushableBatch) totalBytes() uint64 {
  1153  	return uint64(len(b.data) - batchHeaderLen)
  1154  }
  1155  
  1156  func (b *flushableBatch) flushed() chan struct{} {
  1157  	return b.flushedCh
  1158  }
  1159  
  1160  func (b *flushableBatch) readyForFlush() bool {
  1161  	return true
  1162  }
  1163  
  1164  func (b *flushableBatch) logInfo() (uint64, uint64) {
  1165  	return b.logNum, 0 /* logSize */
  1166  }
  1167  
  1168  // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
  1169  // two in sync.
  1170  type flushableBatchIter struct {
  1171  	batch   *flushableBatch
  1172  	data    []byte
  1173  	offsets []flushableBatchEntry
  1174  	cmp     Compare
  1175  	index   int
  1176  	key     InternalKey
  1177  	err     error
  1178  	lower   []byte
  1179  	upper   []byte
  1180  }
  1181  
  1182  // flushableBatchIter implements the internalIterator interface.
  1183  var _ internalIterator = (*flushableBatchIter)(nil)
  1184  
  1185  func (i *flushableBatchIter) SeekGE(key []byte) (*InternalKey, []byte) {
  1186  	ikey := base.MakeSearchKey(key)
  1187  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  1188  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) < 0
  1189  	})
  1190  	if i.index >= len(i.offsets) {
  1191  		return nil, nil
  1192  	}
  1193  	i.key = i.getKey(i.index)
  1194  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1195  		i.index = len(i.offsets)
  1196  		return nil, nil
  1197  	}
  1198  	return &i.key, i.Value()
  1199  }
  1200  
  1201  func (i *flushableBatchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
  1202  	return i.SeekGE(key)
  1203  }
  1204  
  1205  func (i *flushableBatchIter) SeekLT(key []byte) (*InternalKey, []byte) {
  1206  	ikey := base.MakeSearchKey(key)
  1207  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  1208  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  1209  	})
  1210  	i.index--
  1211  	if i.index < 0 {
  1212  		return nil, nil
  1213  	}
  1214  	i.key = i.getKey(i.index)
  1215  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1216  		i.index = -1
  1217  		return nil, nil
  1218  	}
  1219  	return &i.key, i.Value()
  1220  }
  1221  
  1222  func (i *flushableBatchIter) First() (*InternalKey, []byte) {
  1223  	if len(i.offsets) == 0 {
  1224  		return nil, nil
  1225  	}
  1226  	i.index = 0
  1227  	i.key = i.getKey(i.index)
  1228  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1229  		i.index = len(i.offsets)
  1230  		return nil, nil
  1231  	}
  1232  	return &i.key, i.Value()
  1233  }
  1234  
  1235  func (i *flushableBatchIter) Last() (*InternalKey, []byte) {
  1236  	if len(i.offsets) == 0 {
  1237  		return nil, nil
  1238  	}
  1239  	i.index = len(i.offsets) - 1
  1240  	i.key = i.getKey(i.index)
  1241  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1242  		i.index = -1
  1243  		return nil, nil
  1244  	}
  1245  	return &i.key, i.Value()
  1246  }
  1247  
  1248  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  1249  // flushableBatchIter.Next due to performance. Keep the two in sync.
  1250  func (i *flushableBatchIter) Next() (*InternalKey, []byte) {
  1251  	if i.index == len(i.offsets) {
  1252  		return nil, nil
  1253  	}
  1254  	i.index++
  1255  	if i.index == len(i.offsets) {
  1256  		return nil, nil
  1257  	}
  1258  	i.key = i.getKey(i.index)
  1259  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1260  		i.index = len(i.offsets)
  1261  		return nil, nil
  1262  	}
  1263  	return &i.key, i.Value()
  1264  }
  1265  
  1266  func (i *flushableBatchIter) Prev() (*InternalKey, []byte) {
  1267  	if i.index < 0 {
  1268  		return nil, nil
  1269  	}
  1270  	i.index--
  1271  	if i.index < 0 {
  1272  		return nil, nil
  1273  	}
  1274  	i.key = i.getKey(i.index)
  1275  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1276  		i.index = -1
  1277  		return nil, nil
  1278  	}
  1279  	return &i.key, i.Value()
  1280  }
  1281  
  1282  func (i *flushableBatchIter) getKey(index int) InternalKey {
  1283  	e := &i.offsets[index]
  1284  	kind := InternalKeyKind(i.data[e.offset])
  1285  	key := i.data[e.keyStart:e.keyEnd]
  1286  	return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind)
  1287  }
  1288  
  1289  func (i *flushableBatchIter) Key() *InternalKey {
  1290  	return &i.key
  1291  }
  1292  
  1293  func (i *flushableBatchIter) Value() []byte {
  1294  	p := i.data[i.offsets[i.index].offset:]
  1295  	if len(p) == 0 {
  1296  		i.err = fmt.Errorf("corrupted batch")
  1297  		return nil
  1298  	}
  1299  	kind := InternalKeyKind(p[0])
  1300  	if kind > InternalKeyKindMax {
  1301  		i.err = fmt.Errorf("corrupted batch")
  1302  		return nil
  1303  	}
  1304  	var value []byte
  1305  	var ok bool
  1306  	switch kind {
  1307  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
  1308  		keyEnd := i.offsets[i.index].keyEnd
  1309  		_, value, ok = batchDecodeStr(i.data[keyEnd:])
  1310  		if !ok {
  1311  			i.err = fmt.Errorf("corrupted batch")
  1312  			return nil
  1313  		}
  1314  	}
  1315  	return value
  1316  }
  1317  
  1318  func (i *flushableBatchIter) Valid() bool {
  1319  	return i.index >= 0 && i.index < len(i.offsets)
  1320  }
  1321  
  1322  func (i *flushableBatchIter) Error() error {
  1323  	return i.err
  1324  }
  1325  
  1326  func (i *flushableBatchIter) Close() error {
  1327  	return i.err
  1328  }
  1329  
  1330  func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
  1331  	i.lower = lower
  1332  	i.upper = upper
  1333  }
  1334  
  1335  // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
  1336  // of number of bytes iterated.
  1337  type flushFlushableBatchIter struct {
  1338  	flushableBatchIter
  1339  	bytesIterated *uint64
  1340  }
  1341  
  1342  // flushFlushableBatchIter implements the internalIterator interface.
  1343  var _ internalIterator = (*flushFlushableBatchIter)(nil)
  1344  
  1345  func (i *flushFlushableBatchIter) SeekGE(key []byte) (*InternalKey, []byte) {
  1346  	panic("pebble: SeekGE unimplemented")
  1347  }
  1348  
  1349  func (i *flushFlushableBatchIter) SeekPrefixGE(prefix, key []byte) (*InternalKey, []byte) {
  1350  	panic("pebble: SeekPrefixGE unimplemented")
  1351  }
  1352  
  1353  func (i *flushFlushableBatchIter) SeekLT(key []byte) (*InternalKey, []byte) {
  1354  	panic("pebble: SeekLT unimplemented")
  1355  }
  1356  
  1357  func (i *flushFlushableBatchIter) First() (*InternalKey, []byte) {
  1358  	key, val := i.flushableBatchIter.First()
  1359  	if key == nil {
  1360  		return nil, nil
  1361  	}
  1362  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  1363  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  1364  	return key, val
  1365  }
  1366  
  1367  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  1368  // flushableBatchIter.Next due to performance. Keep the two in sync.
  1369  func (i *flushFlushableBatchIter) Next() (*InternalKey, []byte) {
  1370  	if i.index == len(i.offsets) {
  1371  		return nil, nil
  1372  	}
  1373  	i.index++
  1374  	if i.index == len(i.offsets) {
  1375  		return nil, nil
  1376  	}
  1377  	i.key = i.getKey(i.index)
  1378  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  1379  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  1380  	return &i.key, i.Value()
  1381  }
  1382  
  1383  func (i *flushFlushableBatchIter) Prev() (*InternalKey, []byte) {
  1384  	panic("pebble: Prev unimplemented")
  1385  }
  1386  
  1387  func (i *flushFlushableBatchIter) valueSize() uint64 {
  1388  	p := i.data[i.offsets[i.index].offset:]
  1389  	if len(p) == 0 {
  1390  		i.err = fmt.Errorf("corrupted batch")
  1391  		return 0
  1392  	}
  1393  	kind := InternalKeyKind(p[0])
  1394  	if kind > InternalKeyKindMax {
  1395  		i.err = fmt.Errorf("corrupted batch")
  1396  		return 0
  1397  	}
  1398  	var length uint64
  1399  	switch kind {
  1400  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
  1401  		keyEnd := i.offsets[i.index].keyEnd
  1402  		v, n := binary.Uvarint(i.data[keyEnd:])
  1403  		if n <= 0 {
  1404  			i.err = fmt.Errorf("corrupted batch")
  1405  			return 0
  1406  		}
  1407  		length = v + uint64(n)
  1408  	}
  1409  	return length
  1410  }