github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/batch.go

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"sort"
    14  	"sync"
    15  	"sync/atomic"
    16  	"time"
    17  	"unsafe"
    18  
    19  	"github.com/cockroachdb/errors"
    20  	"github.com/cockroachdb/pebble/internal/base"
    21  	"github.com/cockroachdb/pebble/internal/batchskl"
    22  	"github.com/cockroachdb/pebble/internal/humanize"
    23  	"github.com/cockroachdb/pebble/internal/keyspan"
    24  	"github.com/cockroachdb/pebble/internal/private"
    25  	"github.com/cockroachdb/pebble/internal/rangedel"
    26  	"github.com/cockroachdb/pebble/internal/rangekey"
    27  	"github.com/cockroachdb/pebble/internal/rawalloc"
    28  )
    29  
    30  const (
    31  	batchCountOffset     = 8
    32  	batchHeaderLen       = 12
    33  	batchInitialSize     = 1 << 10 // 1 KB
    34  	batchMaxRetainedSize = 1 << 20 // 1 MB
    35  	invalidBatchCount    = 1<<32 - 1
    36  	maxVarintLen32       = 5
    37  )
    38  
    39  // ErrNotIndexed means that a read operation on a batch failed because the
    40  // batch is not indexed and thus doesn't support reads.
    41  var ErrNotIndexed = errors.New("pebble: batch not indexed")
    42  
    43  // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
    44  var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch"))
    45  
     46  // ErrBatchTooLarge indicates that a batch is too large to be committed.
    47  var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize)))
    48  
     49  // DeferredBatchOp represents a batch operation (e.g. set, merge, delete) that is
     50  // being inserted into the batch. Indexing is not performed on the specified key
     51  // until Finish is called, hence the name deferred. This struct lets the caller
     52  // copy or encode keys/values directly into the batch representation instead of
     53  // copying into an intermediate buffer and then having pebble.Batch copy from it.
    54  type DeferredBatchOp struct {
    55  	index *batchskl.Skiplist
    56  
    57  	// Key and Value point to parts of the binary batch representation where
    58  	// keys and values should be encoded/copied into. len(Key) and len(Value)
    59  	// bytes must be copied into these slices respectively before calling
    60  	// Finish(). Changing where these slices point to is not allowed.
    61  	Key, Value []byte
    62  	offset     uint32
    63  }
    64  
    65  // Finish completes the addition of this batch operation, and adds it to the
     66  // index if necessary. It must be called exactly once, after the keys/values
     67  // have been copied into Key and Value. Not calling Finish or not
    68  // copying/encoding keys will result in an incomplete index, and calling Finish
    69  // twice may result in a panic.
    70  func (d DeferredBatchOp) Finish() error {
    71  	if d.index != nil {
    72  		if err := d.index.Add(d.offset); err != nil {
    73  			return err
    74  		}
    75  	}
    76  	return nil
    77  }
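// As an illustrative sketch (assuming an existing *Batch b and byte slices k
// and v), the deferred workflow looks roughly like:
//
//	op := b.SetDeferred(len(k), len(v))
//	copy(op.Key, k)   // copy/encode the key directly into the batch repr
//	copy(op.Value, v) // copy/encode the value directly into the batch repr
//	if err := op.Finish(); err != nil {
//		return err
//	}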
    78  
    79  // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets,
    80  // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch
    81  // implements the Reader interface, but only an indexed batch supports reading
    82  // (without error) via Get or NewIter. A non-indexed batch will return
    83  // ErrNotIndexed when read from. A batch is not safe for concurrent use, and
    84  // consumers should use a batch per goroutine or provide their own
    85  // synchronization.
    86  //
    87  // # Indexing
    88  //
    89  // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
    90  // allows iteration via an Iterator (see Batch.NewIter). The iterator provides
    91  // a merged view of the operations in the batch and the underlying
    92  // database. This is implemented by treating the batch as an additional layer
    93  // in the LSM where every entry in the batch is considered newer than any entry
    94  // in the underlying database (batch entries have the InternalKeySeqNumBatch
    95  // bit set). By treating the batch as an additional layer in the LSM, iteration
    96  // supports all batch operations (i.e. Set, Merge, Delete, DeleteRange,
    97  // RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
    98  //
    99  // The same key can be operated on multiple times in a batch, though only the
   100  // latest operation will be visible. For example, Put("a", "b"), Delete("a")
   101  // will cause the key "a" to not be visible in the batch. Put("a", "b"),
   102  // Put("a", "c") will cause a read of "a" to return the value "c".
   103  //
    104  // The batch index is implemented via a skiplist (internal/batchskl). While
   105  // the skiplist implementation is very fast, inserting into an indexed batch is
   106  // significantly slower than inserting into a non-indexed batch. Only use an
   107  // indexed batch if you require reading from it.
   108  //
   109  // # Atomic commit
   110  //
   111  // The operations in a batch are persisted by calling Batch.Commit which is
   112  // equivalent to calling DB.Apply(batch). A batch is committed atomically by
   113  // writing the internal batch representation to the WAL, adding all of the
   114  // batch operations to the memtable associated with the WAL, and then
   115  // incrementing the visible sequence number so that subsequent reads can see
   116  // the effects of the batch operations. If WriteOptions.Sync is true, a call to
   117  // Batch.Commit will guarantee that the batch is persisted to disk before
   118  // returning. See commitPipeline for more on the implementation details.
   119  //
   120  // # Large batches
   121  //
   122  // The size of a batch is limited only by available memory (be aware that
    123  // indexed batches require considerable additional memory for the skiplist
   124  // structure). A given WAL file has a single memtable associated with it (this
   125  // restriction could be removed, but doing so is onerous and complex). And a
   126  // memtable has a fixed size due to the underlying fixed size arena. Note that
   127  // this differs from RocksDB where a memtable can grow arbitrarily large using
   128  // a list of arena chunks. In RocksDB this is accomplished by storing pointers
   129  // in the arena memory, but that isn't possible in Go.
   130  //
   131  // During Batch.Commit, a batch which is larger than a threshold (>
   132  // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
    133  // of memtables. A flushableBatch forces the WAL to be rotated, but that happens
    134  // anyway when the memtable becomes full, so this does not cause significant
   135  // WAL churn. Because the flushableBatch is readable as another layer in the
   136  // LSM, Batch.Commit returns as soon as the flushableBatch has been added to
   137  // the queue of memtables.
   138  //
   139  // Internally, a flushableBatch provides Iterator support by sorting the batch
   140  // contents (the batch is sorted once, when it is added to the memtable
   141  // queue). Sorting the batch contents and insertion of the contents into a
   142  // memtable have the same big-O time, but the constant factor dominates
   143  // here. Sorting is significantly faster and uses significantly less memory.
   144  //
   145  // # Internal representation
   146  //
   147  // The internal batch representation is a contiguous byte buffer with a fixed
   148  // 12-byte header, followed by a series of records.
   149  //
   150  //	+-------------+------------+--- ... ---+
   151  //	| SeqNum (8B) | Count (4B) |  Entries  |
   152  //	+-------------+------------+--- ... ---+
   153  //
   154  // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
   155  // strings (varstring):
   156  //
   157  //	+-----------+-----------------+-------------------+
   158  //	| Kind (1B) | Key (varstring) | Value (varstring) |
   159  //	+-----------+-----------------+-------------------+
   160  //
   161  // A varstring is a varint32 followed by N bytes of data. The Kind tags are
   162  // exactly those specified by InternalKeyKind. The following table shows the
   163  // format for records of each kind:
   164  //
   165  //	InternalKeyKindDelete         varstring
   166  //	InternalKeyKindLogData        varstring
   167  //	InternalKeyKindIngestSST      varstring
   168  //	InternalKeyKindSet            varstring varstring
   169  //	InternalKeyKindMerge          varstring varstring
   170  //	InternalKeyKindRangeDelete    varstring varstring
   171  //	InternalKeyKindRangeKeySet    varstring varstring
   172  //	InternalKeyKindRangeKeyUnset  varstring varstring
   173  //	InternalKeyKindRangeKeyDelete varstring varstring
   174  //
    175  // The intuitive understanding here is that the arguments to Delete, Set,
   176  // Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The
   177  // RangeKeySet and RangeKeyUnset operations are slightly more complicated,
   178  // encoding their end key, suffix and value [in the case of RangeKeySet] within
   179  // the Value varstring. For more information on the value encoding for
   180  // RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
   181  //
   182  // The internal batch representation is the on disk format for a batch in the
   183  // WAL, and thus stable. New record kinds may be added, but the existing ones
   184  // will not be modified.
   185  type Batch struct {
   186  	batchInternal
   187  	applied atomic.Bool
   188  }
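// As a rough usage sketch from a client package (assuming an already-open
// *pebble.DB named db; error handling mostly elided):
//
//	b := db.NewIndexedBatch() // indexed, so Get and NewIter work
//	_ = b.Set([]byte("a"), []byte("1"), nil)
//	_ = b.Delete([]byte("b"), nil)
//
//	// Reads see the batch's mutations merged over the underlying database.
//	if v, closer, err := b.Get([]byte("a")); err == nil {
//		_ = v // "1"
//		_ = closer.Close()
//	}
//
//	// Commit atomically writes the batch to the WAL and memtable.
//	if err := b.Commit(pebble.Sync); err != nil {
//		// handle the error
//	}
//	_ = b.Close()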
   189  
   190  // batchInternal contains the set of fields within Batch that are non-atomic and
   191  // capable of being reset using a *b = batchInternal{} struct copy.
   192  type batchInternal struct {
   193  	// Data is the wire format of a batch's log entry:
   194  	//   - 8 bytes for a sequence number of the first batch element,
   195  	//     or zeroes if the batch has not yet been applied,
   196  	//   - 4 bytes for the count: the number of elements in the batch,
   197  	//     or "\xff\xff\xff\xff" if the batch is invalid,
   198  	//   - count elements, being:
   199  	//     - one byte for the kind
   200  	//     - the varint-string user key,
   201  	//     - the varint-string value (if kind != delete).
   202  	// The sequence number and count are stored in little-endian order.
   203  	//
   204  	// The data field can be (but is not guaranteed to be) nil for new
   205  	// batches. Large batches will set the data field to nil when committed as
   206  	// the data has been moved to a flushableBatch and inserted into the queue of
   207  	// memtables.
   208  	data           []byte
   209  	cmp            Compare
   210  	formatKey      base.FormatKey
   211  	abbreviatedKey AbbreviatedKey
   212  
   213  	// An upper bound on required space to add this batch to a memtable.
   214  	// Note that although batches are limited to 4 GiB in size, that limit
   215  	// applies to len(data), not the memtable size. The upper bound on the
   216  	// size of a memtable node is larger than the overhead of the batch's log
   217  	// encoding, so memTableSize is larger than len(data) and may overflow a
   218  	// uint32.
   219  	memTableSize uint64
   220  
   221  	// The db to which the batch will be committed. Do not change this field
   222  	// after the batch has been created as it might invalidate internal state.
   223  	// Batch.memTableSize is only refreshed if Batch.db is set. Setting db to
   224  	// nil once it has been set implies that the Batch has encountered an error.
   225  	db *DB
   226  
   227  	// The count of records in the batch. This count will be stored in the batch
   228  	// data whenever Repr() is called.
   229  	count uint64
   230  
   231  	// The count of range deletions in the batch. Updated every time a range
   232  	// deletion is added.
   233  	countRangeDels uint64
   234  
   235  	// The count of range key sets, unsets and deletes in the batch. Updated
   236  	// every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added.
   237  	countRangeKeys uint64
   238  
   239  	// A deferredOp struct, stored in the Batch so that a pointer can be returned
   240  	// from the *Deferred() methods rather than a value.
   241  	deferredOp DeferredBatchOp
   242  
   243  	// An optional skiplist keyed by offset into data of the entry.
   244  	index         *batchskl.Skiplist
   245  	rangeDelIndex *batchskl.Skiplist
   246  	rangeKeyIndex *batchskl.Skiplist
   247  
   248  	// Fragmented range deletion tombstones. Cached the first time a range
   249  	// deletion iterator is requested. The cache is invalidated whenever a new
   250  	// range deletion is added to the batch. This cache can only be used when
   251  	// opening an iterator to read at a batch sequence number >=
   252  	// tombstonesSeqNum. This is the case for all new iterators created over a
   253  	// batch but it's not the case for all cloned iterators.
   254  	tombstones       []keyspan.Span
   255  	tombstonesSeqNum uint64
   256  
   257  	// Fragmented range key spans. Cached the first time a range key iterator is
   258  	// requested. The cache is invalidated whenever a new range key
   259  	// (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be
   260  	// used when opening an iterator to read at a batch sequence number >=
    261  	// rangeKeysSeqNum. This is the case for all new iterators created over a
   262  	// batch but it's not the case for all cloned iterators.
   263  	rangeKeys       []keyspan.Span
   264  	rangeKeysSeqNum uint64
   265  
   266  	// The flushableBatch wrapper if the batch is too large to fit in the
   267  	// memtable.
   268  	flushable *flushableBatch
   269  
   270  	// minimumFormatMajorVersion indicates the format major version required in
   271  	// order to commit this batch. If an operation requires a particular format
   272  	// major version, it ratchets the batch's minimumFormatMajorVersion. When
   273  	// the batch is committed, this is validated against the database's current
   274  	// format major version.
   275  	minimumFormatMajorVersion FormatMajorVersion
   276  
   277  	// Synchronous Apply uses the commit WaitGroup for both publishing the
   278  	// seqnum and waiting for the WAL fsync (if needed). Asynchronous
   279  	// ApplyNoSyncWait, which implies WriteOptions.Sync is true, uses the commit
   280  	// WaitGroup for publishing the seqnum and the fsyncWait WaitGroup for
   281  	// waiting for the WAL fsync.
   282  	//
   283  	// TODO(sumeer): if we find that ApplyNoSyncWait in conjunction with
   284  	// SyncWait is causing higher memory usage because of the time duration
   285  	// between when the sync is already done, and a goroutine calls SyncWait
   286  	// (followed by Batch.Close), we could separate out {fsyncWait, commitErr}
   287  	// into a separate struct that is allocated separately (using another
   288  	// sync.Pool), and only that struct needs to outlive Batch.Close (which
   289  	// could then be called immediately after ApplyNoSyncWait). commitStats
   290  	// will also need to be in this separate struct.
   291  	commit    sync.WaitGroup
   292  	fsyncWait sync.WaitGroup
   293  
   294  	commitStats BatchCommitStats
   295  
   296  	commitErr error
   297  
    298  	// Position bools together to reduce the size of the struct.
   299  
   300  	// ingestedSSTBatch indicates that the batch contains one or more key kinds
   301  	// of InternalKeyKindIngestSST. If the batch contains key kinds of IngestSST
   302  	// then it will only contain key kinds of IngestSST.
   303  	ingestedSSTBatch bool
   304  
   305  	// committing is set to true when a batch begins to commit. It's used to
    306  	// ensure the batch is not mutated concurrently. It is deliberately not an
    307  	// atomic, so as to avoid the overhead on batch mutations. This is
    308  	// okay, because under correct usage this field will never be accessed
    309  	// concurrently. It's only under incorrect usage that memory accesses of this
   310  	// variable may violate memory safety. Since we don't use atomics here,
   311  	// false negatives are possible.
   312  	committing bool
   313  }
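// As a small sketch of the header layout described for the data field above
// (assuming repr := b.Repr() for some batch):
//
//	seqNum := binary.LittleEndian.Uint64(repr[:batchCountOffset])
//	count := binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen])
//	// repr[batchHeaderLen:] holds `count` records, each a 1-byte kind
//	// followed by one or two varstrings.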
   314  
   315  // BatchCommitStats exposes stats related to committing a batch.
   316  //
   317  // NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow
   318  // batch commits. The caller can use these stats to do their own tracing as
   319  // needed.
   320  type BatchCommitStats struct {
   321  	// TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or
   322  	// Batch.Commit, plus the time waiting in Batch.SyncWait. If there is a gap
   323  	// between calling ApplyNoSyncWait and calling SyncWait, that gap could
   324  	// include some duration in which real work was being done for the commit
   325  	// and will not be included here. This missing time is considered acceptable
   326  	// since the goal of these stats is to understand user-facing latency.
   327  	//
   328  	// TotalDuration includes time spent in various queues both inside Pebble
   329  	// and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait
   330  	// etc.). For some of these queues (which we consider important) the wait
   331  	// times are included below -- these expose low-level implementation detail
   332  	// and are meant for expert diagnosis and subject to change. There may be
   333  	// unaccounted time after subtracting those values from TotalDuration.
   334  	TotalDuration time.Duration
   335  	// SemaphoreWaitDuration is the wait time for semaphores in
   336  	// commitPipeline.Commit.
   337  	SemaphoreWaitDuration time.Duration
   338  	// WALQueueWaitDuration is the wait time for allocating memory blocks in the
   339  	// LogWriter (due to the LogWriter not writing fast enough). At the moment
    340  	// this duration is always zero because a single WAL will allow
   341  	// allocating memory blocks up to the entire memtable size. In the future,
   342  	// we may pipeline WALs and bound the WAL queued blocks separately, so this
   343  	// field is preserved for that possibility.
   344  	WALQueueWaitDuration time.Duration
   345  	// MemTableWriteStallDuration is the wait caused by a write stall due to too
   346  	// many memtables (due to not flushing fast enough).
   347  	MemTableWriteStallDuration time.Duration
   348  	// L0ReadAmpWriteStallDuration is the wait caused by a write stall due to
   349  	// high read amplification in L0 (due to not compacting fast enough out of
   350  	// L0).
   351  	L0ReadAmpWriteStallDuration time.Duration
   352  	// WALRotationDuration is the wait time for WAL rotation, which includes
   353  	// syncing and closing the old WAL and creating (or reusing) a new one.
   354  	WALRotationDuration time.Duration
   355  	// CommitWaitDuration is the wait for publishing the seqnum plus the
   356  	// duration for the WAL sync (if requested). The former should be tiny and
   357  	// one can assume that this is all due to the WAL sync.
   358  	CommitWaitDuration time.Duration
   359  }
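// As a rough sketch of how a caller might surface these stats (assuming the
// Batch.CommitStats accessor defined elsewhere in this file, an already-open
// *pebble.DB named db, and log/time imports on the caller's side):
//
//	b := db.NewBatch()
//	_ = b.Set([]byte("k"), []byte("v"), nil)
//	if err := b.Commit(pebble.Sync); err != nil {
//		// handle the error
//	}
//	if stats := b.CommitStats(); stats.TotalDuration > 10*time.Millisecond {
//		log.Printf("slow batch commit: %+v", stats)
//	}
//	_ = b.Close()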
   360  
   361  var _ Reader = (*Batch)(nil)
   362  var _ Writer = (*Batch)(nil)
   363  
   364  var batchPool = sync.Pool{
   365  	New: func() interface{} {
   366  		return &Batch{}
   367  	},
   368  }
   369  
   370  type indexedBatch struct {
   371  	batch Batch
   372  	index batchskl.Skiplist
   373  }
   374  
   375  var indexedBatchPool = sync.Pool{
   376  	New: func() interface{} {
   377  		return &indexedBatch{}
   378  	},
   379  }
   380  
   381  func newBatch(db *DB) *Batch {
   382  	b := batchPool.Get().(*Batch)
   383  	b.db = db
   384  	return b
   385  }
   386  
   387  func newBatchWithSize(db *DB, size int) *Batch {
   388  	b := newBatch(db)
   389  	if cap(b.data) < size {
   390  		b.data = rawalloc.New(0, size)
   391  	}
   392  	return b
   393  }
   394  
   395  func newIndexedBatch(db *DB, comparer *Comparer) *Batch {
   396  	i := indexedBatchPool.Get().(*indexedBatch)
   397  	i.batch.cmp = comparer.Compare
   398  	i.batch.formatKey = comparer.FormatKey
   399  	i.batch.abbreviatedKey = comparer.AbbreviatedKey
   400  	i.batch.db = db
   401  	i.batch.index = &i.index
   402  	i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey)
   403  	return &i.batch
   404  }
   405  
   406  func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch {
   407  	b := newIndexedBatch(db, comparer)
   408  	if cap(b.data) < size {
   409  		b.data = rawalloc.New(0, size)
   410  	}
   411  	return b
   412  }
   413  
   414  // nextSeqNum returns the batch "sequence number" that will be given to the next
   415  // key written to the batch. During iteration keys within an indexed batch are
   416  // given a sequence number consisting of their offset within the batch combined
   417  // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only
   418  // used during iteration, and the keys are assigned ordinary sequence numbers
   419  // when the batch is committed.
   420  func (b *Batch) nextSeqNum() uint64 {
   421  	return uint64(len(b.data)) | base.InternalKeySeqNumBatch
   422  }
   423  
   424  func (b *Batch) release() {
   425  	if b.db == nil {
   426  		// The batch was not created using newBatch or newIndexedBatch, or an error
   427  		// was encountered. We don't try to reuse batches that encountered an error
   428  		// because they might be stuck somewhere in the system and attempting to
   429  		// reuse such batches is a recipe for onerous debugging sessions. Instead,
   430  		// let the GC do its job.
   431  		return
   432  	}
   433  	b.db = nil
   434  
   435  	// NB: This is ugly (it would be cleaner if we could just assign a Batch{}),
    436  	// but necessary so that we can atomically clear the Batch.applied
   437  	// field. Without using an atomic to clear that field the Go race detector
   438  	// complains.
   439  	b.Reset()
   440  	b.cmp = nil
   441  	b.formatKey = nil
   442  	b.abbreviatedKey = nil
   443  
   444  	if b.index == nil {
   445  		batchPool.Put(b)
   446  	} else {
   447  		b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil
   448  		indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b)))
   449  	}
   450  }
   451  
   452  func (b *Batch) refreshMemTableSize() error {
   453  	b.memTableSize = 0
   454  	if len(b.data) < batchHeaderLen {
   455  		return nil
   456  	}
   457  
   458  	b.countRangeDels = 0
   459  	b.countRangeKeys = 0
   460  	b.minimumFormatMajorVersion = 0
   461  	for r := b.Reader(); ; {
   462  		kind, key, value, ok, err := r.Next()
   463  		if !ok {
   464  			if err != nil {
   465  				return err
   466  			}
   467  			break
   468  		}
   469  		switch kind {
   470  		case InternalKeyKindRangeDelete:
   471  			b.countRangeDels++
   472  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   473  			b.countRangeKeys++
   474  		case InternalKeyKindDeleteSized:
   475  			if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete {
   476  				b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete
   477  			}
   478  		case InternalKeyKindIngestSST:
   479  			if b.minimumFormatMajorVersion < FormatFlushableIngest {
   480  				b.minimumFormatMajorVersion = FormatFlushableIngest
   481  			}
   482  			// This key kind doesn't contribute to the memtable size.
   483  			continue
   484  		}
   485  		b.memTableSize += memTableEntrySize(len(key), len(value))
   486  	}
   487  	if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys {
   488  		b.minimumFormatMajorVersion = FormatRangeKeys
   489  	}
   490  	return nil
   491  }
   492  
   493  // Apply the operations contained in the batch to the receiver batch.
   494  //
   495  // It is safe to modify the contents of the arguments after Apply returns.
   496  func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error {
   497  	if b.ingestedSSTBatch {
   498  		panic("pebble: invalid batch application")
   499  	}
   500  	if len(batch.data) == 0 {
   501  		return nil
   502  	}
   503  	if len(batch.data) < batchHeaderLen {
   504  		return ErrInvalidBatch
   505  	}
   506  
   507  	offset := len(b.data)
   508  	if offset == 0 {
   509  		b.init(offset)
   510  		offset = batchHeaderLen
   511  	}
   512  	b.data = append(b.data, batch.data[batchHeaderLen:]...)
   513  
   514  	b.setCount(b.Count() + batch.Count())
   515  
   516  	if b.db != nil || b.index != nil {
   517  		// Only iterate over the new entries if we need to track memTableSize or in
   518  		// order to update the index.
   519  		for iter := BatchReader(b.data[offset:]); len(iter) > 0; {
   520  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
   521  			kind, key, value, ok, err := iter.Next()
   522  			if !ok {
   523  				if err != nil {
   524  					return err
   525  				}
   526  				break
   527  			}
   528  			switch kind {
   529  			case InternalKeyKindRangeDelete:
   530  				b.countRangeDels++
   531  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   532  				b.countRangeKeys++
   533  			case InternalKeyKindIngestSST:
   534  				panic("pebble: invalid key kind for batch")
   535  			}
   536  			if b.index != nil {
   537  				var err error
   538  				switch kind {
   539  				case InternalKeyKindRangeDelete:
   540  					b.tombstones = nil
   541  					b.tombstonesSeqNum = 0
   542  					if b.rangeDelIndex == nil {
   543  						b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   544  					}
   545  					err = b.rangeDelIndex.Add(uint32(offset))
   546  				case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   547  					b.rangeKeys = nil
   548  					b.rangeKeysSeqNum = 0
   549  					if b.rangeKeyIndex == nil {
   550  						b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   551  					}
   552  					err = b.rangeKeyIndex.Add(uint32(offset))
   553  				default:
   554  					err = b.index.Add(uint32(offset))
   555  				}
   556  				if err != nil {
   557  					return err
   558  				}
   559  			}
   560  			b.memTableSize += memTableEntrySize(len(key), len(value))
   561  		}
   562  	}
   563  	return nil
   564  }
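// As a minimal sketch of batch-to-batch application (assuming an already-open
// *pebble.DB named db):
//
//	dst := db.NewBatch()
//	src := db.NewBatch()
//	_ = src.Set([]byte("k"), []byte("v"), nil)
//
//	// dst now contains a copy of src's records; src may be reused or closed.
//	if err := dst.Apply(src, nil); err != nil {
//		// handle the error
//	}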
   565  
   566  // Get gets the value for the given key. It returns ErrNotFound if the Batch
   567  // does not contain the key.
   568  //
   569  // The caller should not modify the contents of the returned slice, but it is
   570  // safe to modify the contents of the argument after Get returns. The returned
   571  // slice will remain valid until the returned Closer is closed. On success, the
   572  // caller MUST call closer.Close() or a memory leak will occur.
   573  func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) {
   574  	if b.index == nil {
   575  		return nil, nil, ErrNotIndexed
   576  	}
   577  	return b.db.getInternal(key, b, nil /* snapshot */)
   578  }
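// As a usage sketch of the Closer contract above (assuming an indexed batch b,
// written from a client package's perspective):
//
//	v, closer, err := b.Get([]byte("a"))
//	switch {
//	case err == pebble.ErrNotFound:
//		// key not present
//	case err != nil:
//		// handle the error
//	default:
//		val := append([]byte(nil), v...) // copy if needed after Close
//		_ = closer.Close()               // required to avoid a memory leak
//		_ = val
//	}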
   579  
   580  func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) {
   581  	if b.committing {
   582  		panic("pebble: batch already committing")
   583  	}
   584  	if len(b.data) == 0 {
   585  		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   586  	}
   587  	b.count++
   588  	b.memTableSize += memTableEntrySize(keyLen, valueLen)
   589  
   590  	pos := len(b.data)
   591  	b.deferredOp.offset = uint32(pos)
   592  	b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen)
   593  	b.data[pos] = byte(kind)
   594  	pos++
   595  
   596  	{
    597  		// TODO(peter): Manually inlined version of binary.PutUvarint(). This is 20%
   598  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   599  		// versions show this to not be a performance win.
   600  		x := uint32(keyLen)
   601  		for x >= 0x80 {
   602  			b.data[pos] = byte(x) | 0x80
   603  			x >>= 7
   604  			pos++
   605  		}
   606  		b.data[pos] = byte(x)
   607  		pos++
   608  	}
   609  
   610  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   611  	pos += keyLen
   612  
   613  	{
    614  		// TODO(peter): Manually inlined version of binary.PutUvarint(). This is 20%
   615  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   616  		// versions show this to not be a performance win.
   617  		x := uint32(valueLen)
   618  		for x >= 0x80 {
   619  			b.data[pos] = byte(x) | 0x80
   620  			x >>= 7
   621  			pos++
   622  		}
   623  		b.data[pos] = byte(x)
   624  		pos++
   625  	}
   626  
   627  	b.deferredOp.Value = b.data[pos : pos+valueLen]
   628  	// Shrink data since varints may be shorter than the upper bound.
   629  	b.data = b.data[:pos+valueLen]
   630  }
   631  
   632  func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) {
   633  	if b.committing {
   634  		panic("pebble: batch already committing")
   635  	}
   636  	if len(b.data) == 0 {
   637  		b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen)
   638  	}
   639  	b.count++
   640  	b.memTableSize += memTableEntrySize(keyLen, 0)
   641  
   642  	pos := len(b.data)
   643  	b.deferredOp.offset = uint32(pos)
   644  	b.grow(1 + maxVarintLen32 + keyLen)
   645  	b.data[pos] = byte(kind)
   646  	pos++
   647  
   648  	{
    649  		// TODO(peter): Manually inlined version of binary.PutUvarint(). Remove if
   650  		// go1.13 or future versions show this to not be a performance win. See
   651  		// BenchmarkBatchSet.
   652  		x := uint32(keyLen)
   653  		for x >= 0x80 {
   654  			b.data[pos] = byte(x) | 0x80
   655  			x >>= 7
   656  			pos++
   657  		}
   658  		b.data[pos] = byte(x)
   659  		pos++
   660  	}
   661  
   662  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   663  	b.deferredOp.Value = nil
   664  
   665  	// Shrink data since varint may be shorter than the upper bound.
   666  	b.data = b.data[:pos+keyLen]
   667  }
   668  
   669  // AddInternalKey allows the caller to add an internal key of point key kinds to
   670  // a batch. Passing in an internal key of kind RangeKey* or RangeDelete will
   671  // result in a panic. Note that the seqnum in the internal key is effectively
   672  // ignored, even though the Kind is preserved. This is because the batch format
   673  // does not allow for a per-key seqnum to be specified, only a batch-wide one.
   674  //
   675  // Note that non-indexed keys (IngestKeyKind{LogData,IngestSST}) are not
   676  // supported with this method as they require specialized logic.
   677  func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error {
   678  	keyLen := len(key.UserKey)
   679  	hasValue := false
   680  	switch key.Kind() {
   681  	case InternalKeyKindRangeDelete, InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   682  		panic("unexpected range delete or range key kind in AddInternalKey")
   683  	case InternalKeyKindSingleDelete, InternalKeyKindDelete:
   684  		b.prepareDeferredKeyRecord(len(key.UserKey), key.Kind())
   685  	default:
   686  		b.prepareDeferredKeyValueRecord(keyLen, len(value), key.Kind())
   687  		hasValue = true
   688  	}
   689  	b.deferredOp.index = b.index
   690  	copy(b.deferredOp.Key, key.UserKey)
   691  	if hasValue {
   692  		copy(b.deferredOp.Value, value)
   693  	}
   694  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   695  	// in go1.13 will remove the need for this.
   696  	if b.index != nil {
   697  		if err := b.index.Add(b.deferredOp.offset); err != nil {
   698  			return err
   699  		}
   700  	}
   701  	return nil
   702  }
   703  
   704  // Set adds an action to the batch that sets the key to map to the value.
   705  //
   706  // It is safe to modify the contents of the arguments after Set returns.
   707  func (b *Batch) Set(key, value []byte, _ *WriteOptions) error {
   708  	deferredOp := b.SetDeferred(len(key), len(value))
   709  	copy(deferredOp.Key, key)
   710  	copy(deferredOp.Value, value)
   711  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   712  	// in go1.13 will remove the need for this.
   713  	if b.index != nil {
   714  		if err := b.index.Add(deferredOp.offset); err != nil {
   715  			return err
   716  		}
   717  	}
   718  	return nil
   719  }
   720  
   721  // SetDeferred is similar to Set in that it adds a set operation to the batch,
   722  // except it only takes in key/value lengths instead of complete slices,
   723  // letting the caller encode into those objects and then call Finish() on the
   724  // returned object.
   725  func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp {
   726  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet)
   727  	b.deferredOp.index = b.index
   728  	return &b.deferredOp
   729  }
   730  
   731  // Merge adds an action to the batch that merges the value at key with the new
   732  // value. The details of the merge are dependent upon the configured merge
   733  // operator.
   734  //
   735  // It is safe to modify the contents of the arguments after Merge returns.
   736  func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error {
   737  	deferredOp := b.MergeDeferred(len(key), len(value))
   738  	copy(deferredOp.Key, key)
   739  	copy(deferredOp.Value, value)
   740  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   741  	// in go1.13 will remove the need for this.
   742  	if b.index != nil {
   743  		if err := b.index.Add(deferredOp.offset); err != nil {
   744  			return err
   745  		}
   746  	}
   747  	return nil
   748  }
   749  
   750  // MergeDeferred is similar to Merge in that it adds a merge operation to the
   751  // batch, except it only takes in key/value lengths instead of complete slices,
   752  // letting the caller encode into those objects and then call Finish() on the
   753  // returned object.
   754  func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp {
   755  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge)
   756  	b.deferredOp.index = b.index
   757  	return &b.deferredOp
   758  }
   759  
   760  // Delete adds an action to the batch that deletes the entry for key.
   761  //
   762  // It is safe to modify the contents of the arguments after Delete returns.
   763  func (b *Batch) Delete(key []byte, _ *WriteOptions) error {
   764  	deferredOp := b.DeleteDeferred(len(key))
   765  	copy(deferredOp.Key, key)
   766  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   767  	// in go1.13 will remove the need for this.
   768  	if b.index != nil {
   769  		if err := b.index.Add(deferredOp.offset); err != nil {
   770  			return err
   771  		}
   772  	}
   773  	return nil
   774  }
   775  
   776  // DeleteDeferred is similar to Delete in that it adds a delete operation to
   777  // the batch, except it only takes in key/value lengths instead of complete
   778  // slices, letting the caller encode into those objects and then call Finish()
   779  // on the returned object.
   780  func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp {
   781  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete)
   782  	b.deferredOp.index = b.index
   783  	return &b.deferredOp
   784  }
   785  
   786  // DeleteSized behaves identically to Delete, but takes an additional
   787  // argument indicating the size of the value being deleted. DeleteSized
   788  // should be preferred when the caller has the expectation that there exists
   789  // a single internal KV pair for the key (eg, the key has not been
   790  // overwritten recently), and the caller knows the size of its value.
   791  //
   792  // DeleteSized will record the value size within the tombstone and use it to
   793  // inform compaction-picking heuristics which strive to reduce space
   794  // amplification in the LSM. This "calling your shot" mechanic allows the
   795  // storage engine to more accurately estimate and reduce space amplification.
   796  //
   797  // It is safe to modify the contents of the arguments after DeleteSized
   798  // returns.
   799  func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error {
   800  	deferredOp := b.DeleteSizedDeferred(len(key), deletedValueSize)
   801  	copy(b.deferredOp.Key, key)
   802  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Check if in a
   803  	// later Go release this is unnecessary.
   804  	if b.index != nil {
   805  		if err := b.index.Add(deferredOp.offset); err != nil {
   806  			return err
   807  		}
   808  	}
   809  	return nil
   810  }
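// As a brief sketch of the "calling your shot" usage (assuming the caller
// recently wrote a roughly 100-byte value under the byte-slice key k and has
// not overwritten it since):
//
//	if err := b.DeleteSized(k, 100, nil); err != nil {
//		// handle the error
//	}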
   811  
   812  // DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete
   813  // operation to the batch, except it only takes in key length instead of a
   814  // complete key slice, letting the caller encode into the DeferredBatchOp.Key
   815  // slice and then call Finish() on the returned object.
   816  func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp {
   817  	if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete {
   818  		b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete
   819  	}
   820  
   821  	// Encode the sum of the key length and the value in the value.
   822  	v := uint64(deletedValueSize) + uint64(keyLen)
   823  
   824  	// Encode `v` as a varint.
   825  	var buf [binary.MaxVarintLen64]byte
   826  	n := 0
   827  	{
   828  		x := v
   829  		for x >= 0x80 {
   830  			buf[n] = byte(x) | 0x80
   831  			x >>= 7
   832  			n++
   833  		}
   834  		buf[n] = byte(x)
   835  		n++
   836  	}
   837  
   838  	// NB: In batch entries and sstable entries, values are stored as
   839  	// varstrings. Here, the value is itself a simple varint. This results in an
   840  	// unnecessary double layer of encoding:
   841  	//     varint(n) varint(deletedValueSize)
    842  	// The first varint will always be 1-byte, since a varint-encoded uint64
    843  	// never exceeds 10 bytes (so n < 128). This unnecessary extra byte and wrapping is
   844  	// preserved to avoid special casing across the database, and in particular
   845  	// in sstable block decoding which is performance sensitive.
   846  	b.prepareDeferredKeyValueRecord(keyLen, n, InternalKeyKindDeleteSized)
   847  	b.deferredOp.index = b.index
   848  	copy(b.deferredOp.Value, buf[:n])
   849  	return &b.deferredOp
   850  }
   851  
   852  // SingleDelete adds an action to the batch that single deletes the entry for key.
   853  // See Writer.SingleDelete for more details on the semantics of SingleDelete.
   854  //
   855  // It is safe to modify the contents of the arguments after SingleDelete returns.
   856  func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error {
   857  	deferredOp := b.SingleDeleteDeferred(len(key))
   858  	copy(deferredOp.Key, key)
   859  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   860  	// in go1.13 will remove the need for this.
   861  	if b.index != nil {
   862  		if err := b.index.Add(deferredOp.offset); err != nil {
   863  			return err
   864  		}
   865  	}
   866  	return nil
   867  }
   868  
   869  // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete
   870  // operation to the batch, except it only takes in key/value lengths instead of
   871  // complete slices, letting the caller encode into those objects and then call
   872  // Finish() on the returned object.
   873  func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp {
   874  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete)
   875  	b.deferredOp.index = b.index
   876  	return &b.deferredOp
   877  }
   878  
   879  // DeleteRange deletes all of the point keys (and values) in the range
   880  // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT
   881  // delete overlapping range keys (eg, keys set via RangeKeySet).
   882  //
   883  // It is safe to modify the contents of the arguments after DeleteRange
   884  // returns.
   885  func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error {
   886  	deferredOp := b.DeleteRangeDeferred(len(start), len(end))
   887  	copy(deferredOp.Key, start)
   888  	copy(deferredOp.Value, end)
   889  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   890  	// in go1.13 will remove the need for this.
   891  	if deferredOp.index != nil {
   892  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   893  			return err
   894  		}
   895  	}
   896  	return nil
   897  }
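// As a minimal sketch illustrating that DeleteRange affects point keys only:
//
//	// Removes the point keys in ["a", "c"); any range keys overlapping
//	// ["a", "c") are left untouched (use RangeKeyDelete/RangeKeyUnset).
//	_ = b.DeleteRange([]byte("a"), []byte("c"), nil)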
   898  
   899  // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range
   900  // operation to the batch, except it only takes in key lengths instead of
   901  // complete slices, letting the caller encode into those objects and then call
   902  // Finish() on the returned object. Note that DeferredBatchOp.Key should be
   903  // populated with the start key, and DeferredBatchOp.Value should be populated
   904  // with the end key.
   905  func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp {
   906  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete)
   907  	b.countRangeDels++
   908  	if b.index != nil {
   909  		b.tombstones = nil
   910  		b.tombstonesSeqNum = 0
   911  		// Range deletions are rare, so we lazily allocate the index for them.
   912  		if b.rangeDelIndex == nil {
   913  			b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   914  		}
   915  		b.deferredOp.index = b.rangeDelIndex
   916  	}
   917  	return &b.deferredOp
   918  }
   919  
   920  // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC
   921  // timestamp suffix to value. The suffix is optional. If any portion of the key
   922  // range [start, end) is already set by a range key with the same suffix value,
   923  // RangeKeySet overrides it.
   924  //
   925  // It is safe to modify the contents of the arguments after RangeKeySet returns.
   926  func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error {
   927  	suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}}
   928  	internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:])
   929  
   930  	deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen)
   931  	copy(deferredOp.Key, start)
   932  	n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:])
   933  	if n != internalValueLen {
   934  		panic("unexpected internal value length mismatch")
   935  	}
   936  
   937  	// Manually inline DeferredBatchOp.Finish().
   938  	if deferredOp.index != nil {
   939  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   940  			return err
   941  		}
   942  	}
   943  	return nil
   944  }
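// As a brief sketch of RangeKeySet with an MVCC-style suffix (assuming the
// DB's Comparer knows how to split and compare such suffixes):
//
//	// Set a range key over ["a", "z") at suffix "@5" with value "v".
//	err := b.RangeKeySet([]byte("a"), []byte("z"), []byte("@5"), []byte("v"), nil)
//	if err != nil {
//		// handle the error
//	}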
   945  
   946  func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
   947  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet)
   948  	b.incrementRangeKeysCount()
   949  	return &b.deferredOp
   950  }
   951  
   952  func (b *Batch) incrementRangeKeysCount() {
   953  	b.countRangeKeys++
   954  	if b.minimumFormatMajorVersion < FormatRangeKeys {
   955  		b.minimumFormatMajorVersion = FormatRangeKeys
   956  	}
   957  	if b.index != nil {
   958  		b.rangeKeys = nil
   959  		b.rangeKeysSeqNum = 0
   960  		// Range keys are rare, so we lazily allocate the index for them.
   961  		if b.rangeKeyIndex == nil {
   962  			b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   963  		}
   964  		b.deferredOp.index = b.rangeKeyIndex
   965  	}
   966  }
   967  
   968  // RangeKeyUnset removes a range key mapping the key range [start, end) at the
   969  // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed
   970  // range key. RangeKeyUnset only removes portions of range keys that fall within
   971  // the [start, end) key span, and only range keys with suffixes that exactly
   972  // match the unset suffix.
   973  //
   974  // It is safe to modify the contents of the arguments after RangeKeyUnset
   975  // returns.
   976  func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error {
   977  	suffixes := [1][]byte{suffix}
   978  	internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:])
   979  
   980  	deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen)
   981  	copy(deferredOp.Key, start)
   982  	n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:])
   983  	if n != internalValueLen {
   984  		panic("unexpected internal value length mismatch")
   985  	}
   986  
   987  	// Manually inline DeferredBatchOp.Finish()
   988  	if deferredOp.index != nil {
   989  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   990  			return err
   991  		}
   992  	}
   993  	return nil
   994  }
   995  
   996  func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
   997  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset)
   998  	b.incrementRangeKeysCount()
   999  	return &b.deferredOp
  1000  }
  1001  
  1002  // RangeKeyDelete deletes all of the range keys in the range [start,end)
  1003  // (inclusive on start, exclusive on end). It does not delete point keys (for
  1004  // that use DeleteRange). RangeKeyDelete removes all range keys within the
  1005  // bounds, including those with or without suffixes.
  1006  //
  1007  // It is safe to modify the contents of the arguments after RangeKeyDelete
  1008  // returns.
  1009  func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error {
  1010  	deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end))
  1011  	copy(deferredOp.Key, start)
  1012  	copy(deferredOp.Value, end)
  1013  	// Manually inline DeferredBatchOp.Finish().
  1014  	if deferredOp.index != nil {
  1015  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
  1016  			return err
  1017  		}
  1018  	}
  1019  	return nil
  1020  }
  1021  
  1022  // RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an
  1023  // operation to delete range keys to the batch, except it only takes in key
  1024  // lengths instead of complete slices, letting the caller encode into those
  1025  // objects and then call Finish() on the returned object. Note that
  1026  // DeferredBatchOp.Key should be populated with the start key, and
  1027  // DeferredBatchOp.Value should be populated with the end key.
  1028  func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp {
  1029  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete)
  1030  	b.incrementRangeKeysCount()
  1031  	return &b.deferredOp
  1032  }
  1033  
   1034  // LogData adds the specified data to the batch. The data will be written to the
  1035  // WAL, but not added to memtables or sstables. Log data is never indexed,
  1036  // which makes it useful for testing WAL performance.
  1037  //
  1038  // It is safe to modify the contents of the argument after LogData returns.
  1039  func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
  1040  	origCount, origMemTableSize := b.count, b.memTableSize
  1041  	b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData)
  1042  	copy(b.deferredOp.Key, data)
  1043  	// Since LogData only writes to the WAL and does not affect the memtable, we
   1044  	// restore b.count and b.memTableSize to their original values. Note that
  1045  	// Batch.count only refers to records that are added to the memtable.
  1046  	b.count, b.memTableSize = origCount, origMemTableSize
  1047  	return nil
  1048  }
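// As a minimal sketch of LogData (the payload reaches the WAL but is never
// visible to reads):
//
//	_ = b.LogData([]byte("app-specific marker"), nil)
//	// b.Count() is unchanged and nothing is added to the memtable.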
  1049  
   1050  // ingestSST adds the FileNum for an sstable to the batch. The data will only be
  1051  // written to the WAL (not added to memtables or sstables).
  1052  func (b *Batch) ingestSST(fileNum base.FileNum) {
  1053  	if b.Empty() {
  1054  		b.ingestedSSTBatch = true
  1055  	} else if !b.ingestedSSTBatch {
  1056  		// Batch contains other key kinds.
  1057  		panic("pebble: invalid call to ingestSST")
  1058  	}
  1059  
  1060  	origMemTableSize := b.memTableSize
  1061  	var buf [binary.MaxVarintLen64]byte
  1062  	length := binary.PutUvarint(buf[:], uint64(fileNum))
  1063  	b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST)
  1064  	copy(b.deferredOp.Key, buf[:length])
  1065  	// Since IngestSST writes only to the WAL and does not affect the memtable,
  1066  	// we restore b.memTableSize to its original value. Note that Batch.count
  1067  	// is not reset because for the InternalKeyKindIngestSST the count is the
  1068  	// number of sstable paths which have been added to the batch.
  1069  	b.memTableSize = origMemTableSize
  1070  	b.minimumFormatMajorVersion = FormatFlushableIngest
  1071  }
  1072  
  1073  // Empty returns true if the batch is empty, and false otherwise.
  1074  func (b *Batch) Empty() bool {
  1075  	return len(b.data) <= batchHeaderLen
  1076  }
  1077  
  1078  // Len returns the current size of the batch in bytes.
  1079  func (b *Batch) Len() int {
  1080  	if len(b.data) <= batchHeaderLen {
  1081  		return batchHeaderLen
  1082  	}
  1083  	return len(b.data)
  1084  }
  1085  
  1086  // Repr returns the underlying batch representation. It is not safe to modify
  1087  // the contents. Reset() will not change the contents of the returned value,
  1088  // though any other mutation operation may do so.
  1089  func (b *Batch) Repr() []byte {
  1090  	if len(b.data) == 0 {
  1091  		b.init(batchHeaderLen)
  1092  	}
  1093  	binary.LittleEndian.PutUint32(b.countData(), b.Count())
  1094  	return b.data
  1095  }
  1096  
  1097  // SetRepr sets the underlying batch representation. The batch takes ownership
  1098  // of the supplied slice. It is not safe to modify it afterwards until the
  1099  // Batch is no longer in use.
  1100  func (b *Batch) SetRepr(data []byte) error {
  1101  	if len(data) < batchHeaderLen {
  1102  		return base.CorruptionErrorf("invalid batch")
  1103  	}
  1104  	b.data = data
  1105  	b.count = uint64(binary.LittleEndian.Uint32(b.countData()))
  1106  	var err error
  1107  	if b.db != nil {
  1108  		// Only track memTableSize for batches that will be committed to the DB.
  1109  		err = b.refreshMemTableSize()
  1110  	}
  1111  	return err
  1112  }
  1113  
  1114  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
  1115  // return false). The iterator can be positioned via a call to SeekGE,
  1116  // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
  1117  //
  1118  // The returned Iterator observes all of the Batch's existing mutations, but no
  1119  // later mutations. Its view can be refreshed via RefreshBatchSnapshot or
  1120  // SetOptions().
  1121  func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) {
  1122  	return b.NewIterWithContext(context.Background(), o), nil
  1123  }
  1124  
  1125  // NewIterWithContext is like NewIter, and additionally accepts a context for
  1126  // tracing.
  1127  func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) *Iterator {
  1128  	if b.index == nil {
  1129  		return &Iterator{err: ErrNotIndexed}
  1130  	}
  1131  	return b.db.newIter(ctx, b, snapshotIterOpts{}, o)
  1132  }
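// As an iteration sketch over an indexed batch b (error handling elided;
// process is a hypothetical callback, not part of this package):
//
//	iter, _ := b.NewIter(nil)
//	for valid := iter.First(); valid; valid = iter.Next() {
//		// Keys/values reflect the batch merged over the underlying DB.
//		process(iter.Key(), iter.Value())
//	}
//	_ = iter.Close()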
  1133  
  1134  // newInternalIter creates a new internalIterator that iterates over the
  1135  // contents of the batch.
  1136  func (b *Batch) newInternalIter(o *IterOptions) *batchIter {
  1137  	iter := &batchIter{}
  1138  	b.initInternalIter(o, iter)
  1139  	return iter
  1140  }
  1141  
  1142  func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) {
  1143  	*iter = batchIter{
  1144  		cmp:   b.cmp,
  1145  		batch: b,
  1146  		iter:  b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()),
  1147  		// NB: We explicitly do not propagate the batch snapshot to the point
  1148  		// key iterator. Filtering point keys within the batch iterator can
  1149  		// cause pathological behavior where a batch iterator advances
  1150  		// significantly farther than necessary filtering many batch keys that
  1151  		// are not visible at the batch sequence number. Instead, the merging
  1152  		// iterator enforces bounds.
  1153  		//
  1154  		// For example, consider an engine that contains the committed keys
  1155  		// 'bar' and 'bax', with no keys between them. Consider a batch
   1156  		// containing 1,000 keys within the range [a,z]. All of the
  1157  		// batch keys were added to the batch after the iterator was
  1158  		// constructed, so they are not visible to the iterator. A call to
  1159  		// SeekGE('bax') would seek the LSM iterators and discover the key
  1160  		// 'bax'. It would also seek the batch iterator, landing on the key
   1161  		// 'baz' but discover that it's not visible. The batch iterator would
  1162  		// next through the rest of the batch's keys, only to discover there are
  1163  		// no visible keys greater than or equal to 'bax'.
  1164  		//
  1165  		// Filtering these batch points within the merging iterator ensures that
  1166  		// the batch iterator never needs to iterate beyond 'baz', because it
  1167  		// already found a smaller, visible key 'bax'.
  1168  		snapshot: base.InternalKeySeqNumMax,
  1169  	}
  1170  }
  1171  
  1172  func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
  1173  	// Construct an iterator even if rangeDelIndex is nil, because it is allowed
  1174  	// to refresh later, so we need the container to exist.
  1175  	iter := new(keyspan.Iter)
  1176  	b.initRangeDelIter(o, iter, batchSnapshot)
  1177  	return iter
  1178  }
  1179  
  1180  func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
  1181  	if b.rangeDelIndex == nil {
  1182  		iter.Init(b.cmp, nil)
  1183  		return
  1184  	}
  1185  
  1186  	// Fragment the range tombstones the first time a range deletion iterator is
  1187  	// requested. The cached tombstones are invalidated if another range
  1188  	// deletion tombstone is added to the batch. This cache is only guaranteed
  1189  	// to be correct if we're opening an iterator to read at a batch sequence
  1190  	// number at least as high as tombstonesSeqNum. The cache is guaranteed to
  1191  	// include all tombstones up to tombstonesSeqNum, and if any additional
  1192  	// tombstones were added after that sequence number the cache would've been
  1193  	// cleared.
  1194  	nextSeqNum := b.nextSeqNum()
  1195  	if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot {
  1196  		iter.Init(b.cmp, b.tombstones)
  1197  		return
  1198  	}
  1199  
  1200  	tombstones := make([]keyspan.Span, 0, b.countRangeDels)
  1201  	frag := &keyspan.Fragmenter{
  1202  		Cmp:    b.cmp,
  1203  		Format: b.formatKey,
  1204  		Emit: func(s keyspan.Span) {
  1205  			tombstones = append(tombstones, s)
  1206  		},
  1207  	}
  1208  	it := &batchIter{
  1209  		cmp:      b.cmp,
  1210  		batch:    b,
  1211  		iter:     b.rangeDelIndex.NewIter(nil, nil),
  1212  		snapshot: batchSnapshot,
  1213  	}
  1214  	fragmentRangeDels(frag, it, int(b.countRangeDels))
  1215  	iter.Init(b.cmp, tombstones)
  1216  
  1217  	// If we just read all the tombstones in the batch (eg, batchSnapshot was
  1218  	// set to b.nextSeqNum()), then cache the tombstones so that a subsequent
  1219  	// call to initRangeDelIter may use them without refragmenting.
  1220  	if nextSeqNum == batchSnapshot {
  1221  		b.tombstones = tombstones
  1222  		b.tombstonesSeqNum = nextSeqNum
  1223  	}
  1224  }
  1225  
  1226  func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) {
  1227  	// The memory management here is a bit subtle. The keys and values returned
  1228  	// by the iterator are slices in Batch.data. Thus the fragmented tombstones
  1229  	// are slices within Batch.data. If additional entries are added to the
  1230  	// Batch, Batch.data may be reallocated. The references in the fragmented
  1231  	// tombstones will remain valid, pointing into the old Batch.data. GC for
  1232  	// the win.
  1233  
  1234  	// Use a single []keyspan.Key buffer to avoid allocating many
  1235  	// individual []keyspan.Key slices with a single element each.
  1236  	keyBuf := make([]keyspan.Key, 0, count)
  1237  	for key, val := it.First(); key != nil; key, val = it.Next() {
  1238  		s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf)
  1239  		keyBuf = s.Keys[len(s.Keys):]
  1240  
  1241  		// Set a fixed capacity to avoid accidental overwriting.
  1242  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
  1243  		frag.Add(s)
  1244  	}
  1245  	frag.Finish()
  1246  }
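
// The fragmented tombstones produced here back the range-deletion iterators of
// indexed batches and flushable batches. As a rough, caller-side sketch of
// exercising that path through the exported API (not part of this file; it
// assumes an already-open *pebble.DB named db):
//
//	b := db.NewIndexedBatch()
//	_ = b.Set([]byte("a"), []byte("1"), nil)
//	_ = b.Set([]byte("m"), []byte("2"), nil)
//	// The tombstone below is fragmented lazily, the first time a range-del
//	// iterator over the batch is requested.
//	_ = b.DeleteRange([]byte("a"), []byte("z"), nil)
//	iter, _ := b.NewIter(nil)
//	for valid := iter.First(); valid; valid = iter.Next() {
//		// No point keys are visible: the batch-local tombstone shadows them.
//	}
//	_ = iter.Close()
//	_ = b.Close()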
  1247  
  1248  func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
  1249  	// Construct an iterator even if rangeKeyIndex is nil, because the iterator
  1250  	// can be refreshed later, so we need the container to exist.
  1251  	iter := new(keyspan.Iter)
  1252  	b.initRangeKeyIter(o, iter, batchSnapshot)
  1253  	return iter
  1254  }
  1255  
  1256  func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
  1257  	if b.rangeKeyIndex == nil {
  1258  		iter.Init(b.cmp, nil)
  1259  		return
  1260  	}
  1261  
  1262  	// Fragment the range keys the first time a range key iterator is requested.
  1263  	// The cached spans are invalidated if another range key is added to the
  1264  	// batch. This cache is only guaranteed to be correct if we're opening an
  1265  	// iterator to read at a batch sequence number at least as high as
  1266  	// rangeKeysSeqNum. The cache is guaranteed to include all range keys up to
  1267  	// rangeKeysSeqNum, and if any additional range keys were added after that
  1268  	// sequence number the cache would've been cleared.
  1269  	nextSeqNum := b.nextSeqNum()
  1270  	if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot {
  1271  		iter.Init(b.cmp, b.rangeKeys)
  1272  		return
  1273  	}
  1274  
  1275  	rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys)
  1276  	frag := &keyspan.Fragmenter{
  1277  		Cmp:    b.cmp,
  1278  		Format: b.formatKey,
  1279  		Emit: func(s keyspan.Span) {
  1280  			rangeKeys = append(rangeKeys, s)
  1281  		},
  1282  	}
  1283  	it := &batchIter{
  1284  		cmp:      b.cmp,
  1285  		batch:    b,
  1286  		iter:     b.rangeKeyIndex.NewIter(nil, nil),
  1287  		snapshot: batchSnapshot,
  1288  	}
  1289  	fragmentRangeKeys(frag, it, int(b.countRangeKeys))
  1290  	iter.Init(b.cmp, rangeKeys)
  1291  
  1292  	// If we just read all the range keys in the batch (eg, batchSnapshot was
  1293  	// set to b.nextSeqNum()), then cache the range keys so that a subsequent
  1294  	// call to initRangeKeyIter may use them without refragmenting.
  1295  	if nextSeqNum == batchSnapshot {
  1296  		b.rangeKeys = rangeKeys
  1297  		b.rangeKeysSeqNum = nextSeqNum
  1298  	}
  1299  }
  1300  
  1301  func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error {
  1302  	// The memory management here is a bit subtle. The keys and values
  1303  	// returned by the iterator are slices in Batch.data. Thus the
  1304  	// fragmented key spans are slices within Batch.data. If additional
  1305  	// entries are added to the Batch, Batch.data may be reallocated. The
  1306  	// references in the fragmented keys will remain valid, pointing into
  1307  	// the old Batch.data. GC for the win.
  1308  
  1309  	// Use a single []keyspan.Key buffer to avoid allocating many
  1310  	// individual []keyspan.Key slices with a single element each.
  1311  	keyBuf := make([]keyspan.Key, 0, count)
  1312  	for ik, val := it.First(); ik != nil; ik, val = it.Next() {
  1313  		s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf)
  1314  		if err != nil {
  1315  			return err
  1316  		}
  1317  		keyBuf = s.Keys[len(s.Keys):]
  1318  
  1319  		// Set a fixed capacity to avoid accidental overwriting.
  1320  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
  1321  		frag.Add(s)
  1322  	}
  1323  	frag.Finish()
  1324  	return nil
  1325  }
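
// Range keys written to an indexed batch follow the same pattern: they are
// fragmented lazily the first time a range-key iterator over the batch is
// requested. A hedged, caller-side sketch (not part of this file; it assumes
// an already-open *pebble.DB named db):
//
//	b := db.NewIndexedBatch()
//	_ = b.RangeKeySet([]byte("a"), []byte("m"), nil /* suffix */, []byte("v"), nil)
//	iter, _ := b.NewIter(&pebble.IterOptions{
//		KeyTypes: pebble.IterKeyTypePointsAndRanges,
//	})
//	for valid := iter.First(); valid; valid = iter.Next() {
//		if _, hasRange := iter.HasPointAndRange(); hasRange {
//			start, end := iter.RangeBounds()
//			_, _ = start, end
//		}
//	}
//	_ = iter.Close()
//	_ = b.Close()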
  1326  
  1327  // Commit applies the batch to its parent writer.
  1328  func (b *Batch) Commit(o *WriteOptions) error {
  1329  	return b.db.Apply(b, o)
  1330  }
  1331  
  1332  // Close closes the batch without committing it.
  1333  func (b *Batch) Close() error {
  1334  	b.release()
  1335  	return nil
  1336  }
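
// A minimal usage sketch of Commit and Close (not part of this file), written
// from a caller's perspective with an already-open *pebble.DB named db:
//
//	b := db.NewBatch()
//	if err := b.Set([]byte("k"), []byte("v"), nil); err != nil {
//		// handle err
//	}
//	if err := b.Commit(pebble.Sync); err != nil {
//		// handle err
//	}
//	_ = b.Close()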
  1337  
  1338  // Indexed returns true if the batch is indexed (i.e. supports read
  1339  // operations).
  1340  func (b *Batch) Indexed() bool {
  1341  	return b.index != nil
  1342  }
  1343  
  1344  // init ensures that the batch data slice is initialized to meet the
  1345  // minimum required size and allocates space for the batch header.
  1346  func (b *Batch) init(size int) {
  1347  	n := batchInitialSize
  1348  	for n < size {
  1349  		n *= 2
  1350  	}
  1351  	if cap(b.data) < n {
  1352  		b.data = rawalloc.New(batchHeaderLen, n)
  1353  	}
  1354  	b.data = b.data[:batchHeaderLen]
  1355  	// Zero the sequence number in the header.
  1356  	for i := 0; i < len(b.data); i++ {
  1357  		b.data[i] = 0
  1358  	}
  1359  }
  1360  
  1361  // Reset resets the batch for reuse. The underlying byte slice (that is
  1362  // returned by Repr()) may not be modified. It is only necessary to call this
  1363  // method if a batch is explicitly being reused. Close automatically takes care
  1364  // of releasing resources when appropriate for batches that are internally
  1365  // being reused.
  1366  func (b *Batch) Reset() {
  1367  	// Zero out the struct, retaining only the fields necessary for manual
  1368  	// reuse.
  1369  	b.batchInternal = batchInternal{
  1370  		data:           b.data,
  1371  		cmp:            b.cmp,
  1372  		formatKey:      b.formatKey,
  1373  		abbreviatedKey: b.abbreviatedKey,
  1374  		index:          b.index,
  1375  		db:             b.db,
  1376  	}
  1377  	b.applied.Store(false)
  1378  	if b.data != nil {
  1379  		if cap(b.data) > batchMaxRetainedSize {
  1380  			// If the capacity of the buffer is larger than our maximum
  1381  			// retention size, don't re-use it. Let it be GC-ed instead.
  1382  			// This prevents the memory from an unusually large batch from
  1383  			// being held on to indefinitely.
  1384  			b.data = nil
  1385  		} else {
  1386  			// Otherwise, reset the buffer for re-use.
  1387  			b.data = b.data[:batchHeaderLen]
  1388  			// Zero the sequence number in the header.
  1389  			for i := 0; i < len(b.data); i++ {
  1390  				b.data[i] = 0
  1391  			}
  1392  		}
  1393  	}
  1394  	if b.index != nil {
  1395  		b.index.Init(&b.data, b.cmp, b.abbreviatedKey)
  1396  	}
  1397  }
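
// A hedged sketch of explicit batch reuse via Reset (not part of this file),
// written from a caller's perspective with an already-open *pebble.DB named db:
//
//	b := db.NewBatch()
//	_ = b.Set([]byte("k1"), []byte("v1"), nil)
//	if err := b.Commit(pebble.Sync); err != nil {
//		// handle err
//	}
//	b.Reset() // reuse the batch; buffers up to batchMaxRetainedSize are retained
//	_ = b.Set([]byte("k2"), []byte("v2"), nil)
//	if err := b.Commit(pebble.Sync); err != nil {
//		// handle err
//	}
//	_ = b.Close()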
  1398  
  1399  // seqNumData returns the 8 byte little-endian sequence number. Zero means that
  1400  // the batch has not yet been applied.
  1401  func (b *Batch) seqNumData() []byte {
  1402  	return b.data[:8]
  1403  }
  1404  
  1405  // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
  1406  // means that the batch is invalid.
  1407  func (b *Batch) countData() []byte {
  1408  	return b.data[8:12]
  1409  }
  1410  
  1411  func (b *Batch) grow(n int) {
  1412  	newSize := len(b.data) + n
  1413  	if uint64(newSize) >= maxBatchSize {
  1414  		panic(ErrBatchTooLarge)
  1415  	}
  1416  	if newSize > cap(b.data) {
  1417  		newCap := 2 * cap(b.data)
  1418  		for newCap < newSize {
  1419  			newCap *= 2
  1420  		}
  1421  		newData := rawalloc.New(len(b.data), newCap)
  1422  		copy(newData, b.data)
  1423  		b.data = newData
  1424  	}
  1425  	b.data = b.data[:newSize]
  1426  }
  1427  
  1428  func (b *Batch) setSeqNum(seqNum uint64) {
  1429  	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
  1430  }
  1431  
  1432  // SeqNum returns the batch sequence number which is applied to the first
  1433  // record in the batch. The sequence number is incremented for each subsequent
  1434  // record. It returns zero if the batch is empty.
  1435  func (b *Batch) SeqNum() uint64 {
  1436  	if len(b.data) == 0 {
  1437  		b.init(batchHeaderLen)
  1438  	}
  1439  	return binary.LittleEndian.Uint64(b.seqNumData())
  1440  }
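
// The batchHeaderLen-byte header sliced out by seqNumData and countData is an
// 8-byte little-endian sequence number followed by a 4-byte little-endian
// count. As a rough illustration (not part of this file), a caller holding a
// serialized batch can decode the header directly:
//
//	repr := b.Repr() // includes the 12-byte header
//	seqNum := binary.LittleEndian.Uint64(repr[:8])
//	count := binary.LittleEndian.Uint32(repr[8:12])
//	_ = seqNum // zero until the batch has been applied
//	_ = count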
  1441  
  1442  func (b *Batch) setCount(v uint32) {
  1443  	b.count = uint64(v)
  1444  }
  1445  
  1446  // Count returns the count of memtable-modifying operations in this batch. All
  1447  // operations except LogData increment this count. For IngestSSTs, count
  1448  // indicates only the number of sstables ingested in the record; the batch
  1449  // isn't applied to the memtable.
  1450  func (b *Batch) Count() uint32 {
  1451  	if b.count > math.MaxUint32 {
  1452  		panic(ErrInvalidBatch)
  1453  	}
  1454  	return uint32(b.count)
  1455  }
  1456  
  1457  // Reader returns a BatchReader for the current batch contents. If the batch is
  1458  // mutated, the new entries will not be visible to the reader.
  1459  func (b *Batch) Reader() BatchReader {
  1460  	if len(b.data) == 0 {
  1461  		b.init(batchHeaderLen)
  1462  	}
  1463  	return b.data[batchHeaderLen:]
  1464  }
  1465  
  1466  func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) {
  1467  	// TODO(jackson): This will index out of bounds if there's no varint or an
  1468  	// invalid varint (eg, a single 0xff byte). Correcting will add a bit of
  1469  	// overhead. We could avoid that overhead whenever len(data) >=
  1470  // binary.MaxVarintLen32?
  1471  
  1472  	var v uint32
  1473  	var n int
  1474  	ptr := unsafe.Pointer(&data[0])
  1475  	if a := *((*uint8)(ptr)); a < 128 {
  1476  		v = uint32(a)
  1477  		n = 1
  1478  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1479  		v = uint32(b)<<7 | uint32(a)
  1480  		n = 2
  1481  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1482  		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1483  		n = 3
  1484  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1485  		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1486  		n = 4
  1487  	} else {
  1488  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1489  		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1490  		n = 5
  1491  	}
  1492  
  1493  	data = data[n:]
  1494  	if v > uint32(len(data)) {
  1495  		return nil, nil, false
  1496  	}
  1497  	return data[v:], data[:v], true
  1498  }
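
// batchDecodeStr is a hand-unrolled fast path for a uvarint length prefix
// followed by that many bytes. A bounds-checked sketch of the same framing
// (not part of this file), written against encoding/binary's generic decoder:
//
//	func safeBatchDecodeStr(data []byte) (rest, s []byte, ok bool) {
//		v, n := binary.Uvarint(data) // decode the length prefix
//		if n <= 0 || v > uint64(len(data)-n) {
//			return nil, nil, false
//		}
//		end := n + int(v)
//		return data[end:], data[n:end], true
//	}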
  1499  
  1500  // SyncWait is to be used in conjunction with DB.ApplyNoSyncWait.
  1501  func (b *Batch) SyncWait() error {
  1502  	now := time.Now()
  1503  	b.fsyncWait.Wait()
  1504  	if b.commitErr != nil {
  1505  		b.db = nil // prevent batch reuse on error
  1506  	}
  1507  	waitDuration := time.Since(now)
  1508  	b.commitStats.CommitWaitDuration += waitDuration
  1509  	b.commitStats.TotalDuration += waitDuration
  1510  	return b.commitErr
  1511  }
  1512  
  1513  // CommitStats returns stats related to committing the batch. Should be called
  1514  // after Batch.Commit or DB.Apply. If DB.ApplyNoSyncWait is used, it should be
  1515  // called after Batch.SyncWait.
  1516  func (b *Batch) CommitStats() BatchCommitStats {
  1517  	return b.commitStats
  1518  }
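
// A hedged sketch of the asynchronous commit flow (not part of this file),
// written from a caller's perspective with an already-open *pebble.DB named
// db; it assumes ApplyNoSyncWait requires a sync write option:
//
//	b := db.NewBatch()
//	_ = b.Set([]byte("k"), []byte("v"), nil)
//	if err := db.ApplyNoSyncWait(b, pebble.Sync); err != nil {
//		// handle err
//	}
//	// ... overlap other work with the WAL sync ...
//	if err := b.SyncWait(); err != nil {
//		// handle err
//	}
//	stats := b.CommitStats()
//	_ = stats.CommitWaitDuration
//	_ = b.Close()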
  1519  
  1520  // BatchReader iterates over the entries contained in a batch.
  1521  type BatchReader []byte
  1522  
  1523  // ReadBatch constructs a BatchReader from a batch representation.  The
  1524  // header is not validated. ReadBatch returns a new batch reader and the
  1525  // count of entries contained within the batch.
  1526  func ReadBatch(repr []byte) (r BatchReader, count uint32) {
  1527  	if len(repr) <= batchHeaderLen {
  1528  		return nil, count
  1529  	}
  1530  	count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen])
  1531  	return repr[batchHeaderLen:], count
  1532  }
  1533  
  1534  // Next returns the next entry in this batch, if there is one. If the reader has
  1535  // reached the end of the batch, Next returns ok=false and a nil error. If the
  1536  // batch is corrupt and the next entry is illegible, Next returns ok=false and a
  1537  // non-nil error.
  1538  func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) {
  1539  	if len(*r) == 0 {
  1540  		return 0, nil, nil, false, nil
  1541  	}
  1542  	kind = InternalKeyKind((*r)[0])
  1543  	if kind > InternalKeyKindMax {
  1544  		return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0])
  1545  	}
  1546  	*r, ukey, ok = batchDecodeStr((*r)[1:])
  1547  	if !ok {
  1548  		return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key")
  1549  	}
  1550  	switch kind {
  1551  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1552  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  1553  		InternalKeyKindDeleteSized:
  1554  		*r, value, ok = batchDecodeStr(*r)
  1555  		if !ok {
  1556  			return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind)
  1557  		}
  1558  	}
  1559  	return kind, ukey, value, true, nil
  1560  }
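
// A short sketch of walking a serialized batch with ReadBatch and Next (not
// part of this file), where repr is assumed to be a batch representation such
// as the one returned by Batch.Repr:
//
//	r, count := pebble.ReadBatch(repr)
//	_ = count // number of memtable-modifying records (excludes LogData)
//	for {
//		kind, key, value, ok, err := r.Next()
//		if err != nil {
//			// the batch is corrupt
//			break
//		}
//		if !ok {
//			break // reached the end of the batch
//		}
//		fmt.Printf("%s %q = %q\n", kind, key, value)
//	}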
  1561  
  1562  // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the
  1563  // two in sync.
  1564  type batchIter struct {
  1565  	cmp   Compare
  1566  	batch *Batch
  1567  	iter  batchskl.Iterator
  1568  	err   error
  1569  	// snapshot holds a batch "sequence number" at which the batch is being
  1570  	// read. This sequence number has the InternalKeySeqNumBatch bit set, so it
  1571  	// encodes an offset within the batch. Only batch entries earlier than the
  1572  	// offset are visible during iteration.
  1573  	snapshot uint64
  1574  }
  1575  
  1576  // batchIter implements the base.InternalIterator interface.
  1577  var _ base.InternalIterator = (*batchIter)(nil)
  1578  
  1579  func (i *batchIter) String() string {
  1580  	return "batch"
  1581  }
  1582  
  1583  func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
  1584  	// Ignore TrySeekUsingNext if the view of the batch changed.
  1585  	if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() {
  1586  		flags = flags.DisableTrySeekUsingNext()
  1587  	}
  1588  
  1589  	i.err = nil // clear cached iteration error
  1590  	ikey := i.iter.SeekGE(key, flags)
  1591  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1592  		ikey = i.iter.Next()
  1593  	}
  1594  	if ikey == nil {
  1595  		return nil, base.LazyValue{}
  1596  	}
  1597  	return ikey, base.MakeInPlaceValue(i.value())
  1598  }
  1599  
  1600  func (i *batchIter) SeekPrefixGE(
  1601  	prefix, key []byte, flags base.SeekGEFlags,
  1602  ) (*base.InternalKey, base.LazyValue) {
  1603  	i.err = nil // clear cached iteration error
  1604  	return i.SeekGE(key, flags)
  1605  }
  1606  
  1607  func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
  1608  	i.err = nil // clear cached iteration error
  1609  	ikey := i.iter.SeekLT(key)
  1610  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1611  		ikey = i.iter.Prev()
  1612  	}
  1613  	if ikey == nil {
  1614  		return nil, base.LazyValue{}
  1615  	}
  1616  	return ikey, base.MakeInPlaceValue(i.value())
  1617  }
  1618  
  1619  func (i *batchIter) First() (*InternalKey, base.LazyValue) {
  1620  	i.err = nil // clear cached iteration error
  1621  	ikey := i.iter.First()
  1622  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1623  		ikey = i.iter.Next()
  1624  	}
  1625  	if ikey == nil {
  1626  		return nil, base.LazyValue{}
  1627  	}
  1628  	return ikey, base.MakeInPlaceValue(i.value())
  1629  }
  1630  
  1631  func (i *batchIter) Last() (*InternalKey, base.LazyValue) {
  1632  	i.err = nil // clear cached iteration error
  1633  	ikey := i.iter.Last()
  1634  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1635  		ikey = i.iter.Prev()
  1636  	}
  1637  	if ikey == nil {
  1638  		return nil, base.LazyValue{}
  1639  	}
  1640  	return ikey, base.MakeInPlaceValue(i.value())
  1641  }
  1642  
  1643  func (i *batchIter) Next() (*InternalKey, base.LazyValue) {
  1644  	ikey := i.iter.Next()
  1645  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1646  		ikey = i.iter.Next()
  1647  	}
  1648  	if ikey == nil {
  1649  		return nil, base.LazyValue{}
  1650  	}
  1651  	return ikey, base.MakeInPlaceValue(i.value())
  1652  }
  1653  
  1654  func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
  1655  	// Because NextPrefix was invoked, `succKey` must be ≥ the key at i's current
  1656  	// position. Seek the arena iterator using TrySeekUsingNext.
  1657  	ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext())
  1658  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1659  		ikey = i.iter.Next()
  1660  	}
  1661  	if ikey == nil {
  1662  		return nil, base.LazyValue{}
  1663  	}
  1664  	return ikey, base.MakeInPlaceValue(i.value())
  1665  }
  1666  
  1667  func (i *batchIter) Prev() (*InternalKey, base.LazyValue) {
  1668  	ikey := i.iter.Prev()
  1669  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1670  		ikey = i.iter.Prev()
  1671  	}
  1672  	if ikey == nil {
  1673  		return nil, base.LazyValue{}
  1674  	}
  1675  	return ikey, base.MakeInPlaceValue(i.value())
  1676  }
  1677  
  1678  func (i *batchIter) value() []byte {
  1679  	offset, _, keyEnd := i.iter.KeyInfo()
  1680  	data := i.batch.data
  1681  	if len(data[offset:]) == 0 {
  1682  		i.err = base.CorruptionErrorf("corrupted batch")
  1683  		return nil
  1684  	}
  1685  
  1686  	switch InternalKeyKind(data[offset]) {
  1687  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1688  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  1689  		InternalKeyKindDeleteSized:
  1690  		_, value, ok := batchDecodeStr(data[keyEnd:])
  1691  		if !ok {
  1692  			return nil
  1693  		}
  1694  		return value
  1695  	default:
  1696  		return nil
  1697  	}
  1698  }
  1699  
  1700  func (i *batchIter) Error() error {
  1701  	return i.err
  1702  }
  1703  
  1704  func (i *batchIter) Close() error {
  1705  	_ = i.iter.Close()
  1706  	return i.err
  1707  }
  1708  
  1709  func (i *batchIter) SetBounds(lower, upper []byte) {
  1710  	i.iter.SetBounds(lower, upper)
  1711  }
  1712  
  1713  type flushableBatchEntry struct {
  1714  	// offset is the byte offset of the record within the batch repr.
  1715  	offset uint32
  1716  	// index is the 0-based ordinal number of the record within the batch. Used
  1717  	// to compute the seqnum for the record.
  1718  	index uint32
  1719  	// key{Start,End} are the start and end byte offsets of the key within the
  1720  	// batch repr. Cached to avoid decoding the key length on every
  1721  	// comparison. The value is stored starting at keyEnd.
  1722  	keyStart uint32
  1723  	keyEnd   uint32
  1724  }
  1725  
  1726  // flushableBatch wraps an existing batch and provides the interfaces needed
  1727  // for making the batch flushable (i.e. able to mimic a memtable).
  1728  type flushableBatch struct {
  1729  	cmp       Compare
  1730  	formatKey base.FormatKey
  1731  	data      []byte
  1732  
  1733  	// The base sequence number for the entries in the batch. This is the same
  1734  	// value as Batch.seqNum() and is cached here for performance.
  1735  	seqNum uint64
  1736  
  1737  	// A slice of offsets and indices for the entries in the batch. Used to
  1738  	// implement flushableBatchIter. Unlike the indexing on a normal batch, a
  1739  	// flushable batch is indexed such that batch entry i will be given the
  1740  	// sequence number flushableBatch.seqNum+i.
  1741  	//
  1742  	// Sorted in increasing order of key and decreasing order of offset (since
  1743  	// higher offsets correspond to higher sequence numbers).
  1744  	//
  1745  	// Does not include range deletion entries or range key entries.
  1746  	offsets []flushableBatchEntry
  1747  
  1748  	// Fragmented range deletion tombstones.
  1749  	tombstones []keyspan.Span
  1750  
  1751  	// Fragmented range keys.
  1752  	rangeKeys []keyspan.Span
  1753  }
  1754  
  1755  var _ flushable = (*flushableBatch)(nil)
  1756  
  1757  // newFlushableBatch creates a new batch that implements the flushable
  1758  // interface. This allows the batch to act like a memtable and be placed in the
  1759  // queue of flushable memtables. Note that the flushable batch takes ownership
  1760  // of the batch data.
  1761  func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) {
  1762  	b := &flushableBatch{
  1763  		data:      batch.data,
  1764  		cmp:       comparer.Compare,
  1765  		formatKey: comparer.FormatKey,
  1766  		offsets:   make([]flushableBatchEntry, 0, batch.Count()),
  1767  	}
  1768  	if b.data != nil {
  1769  		// Note that this sequence number is not correct when this batch has not
  1770  		// been applied since the sequence number has not been assigned yet. The
  1771  		// correct sequence number will be set later. But it is correct when the
  1772  		// batch is being replayed from the WAL.
  1773  		b.seqNum = batch.SeqNum()
  1774  	}
  1775  	var rangeDelOffsets []flushableBatchEntry
  1776  	var rangeKeyOffsets []flushableBatchEntry
  1777  	if len(b.data) > batchHeaderLen {
  1778  		// Non-empty batch.
  1779  		var index uint32
  1780  		for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
  1781  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
  1782  			kind, key, _, ok, err := iter.Next()
  1783  			if !ok {
  1784  				if err != nil {
  1785  					return nil, err
  1786  				}
  1787  				break
  1788  			}
  1789  			entry := flushableBatchEntry{
  1790  				offset: uint32(offset),
  1791  				index:  uint32(index),
  1792  			}
  1793  			if keySize := uint32(len(key)); keySize == 0 {
  1794  				// Must add 2 to the offset. One byte encodes `kind` and the next
  1795  				// byte encodes `0`, which is the length of the key.
  1796  				entry.keyStart = uint32(offset) + 2
  1797  				entry.keyEnd = entry.keyStart
  1798  			} else {
  1799  				entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
  1800  					uintptr(unsafe.Pointer(&b.data[0])))
  1801  				entry.keyEnd = entry.keyStart + keySize
  1802  			}
  1803  			switch kind {
  1804  			case InternalKeyKindRangeDelete:
  1805  				rangeDelOffsets = append(rangeDelOffsets, entry)
  1806  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1807  				rangeKeyOffsets = append(rangeKeyOffsets, entry)
  1808  			default:
  1809  				b.offsets = append(b.offsets, entry)
  1810  			}
  1811  		}
  1812  	}
  1813  
  1814  	// Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using
  1815  	// *flushableBatch's sort.Interface implementation.
  1816  	pointOffsets := b.offsets
  1817  	sort.Sort(b)
  1818  	b.offsets = rangeDelOffsets
  1819  	sort.Sort(b)
  1820  	b.offsets = rangeKeyOffsets
  1821  	sort.Sort(b)
  1822  	b.offsets = pointOffsets
  1823  
  1824  	if len(rangeDelOffsets) > 0 {
  1825  		frag := &keyspan.Fragmenter{
  1826  			Cmp:    b.cmp,
  1827  			Format: b.formatKey,
  1828  			Emit: func(s keyspan.Span) {
  1829  				b.tombstones = append(b.tombstones, s)
  1830  			},
  1831  		}
  1832  		it := &flushableBatchIter{
  1833  			batch:   b,
  1834  			data:    b.data,
  1835  			offsets: rangeDelOffsets,
  1836  			cmp:     b.cmp,
  1837  			index:   -1,
  1838  		}
  1839  		fragmentRangeDels(frag, it, len(rangeDelOffsets))
  1840  	}
  1841  	if len(rangeKeyOffsets) > 0 {
  1842  		frag := &keyspan.Fragmenter{
  1843  			Cmp:    b.cmp,
  1844  			Format: b.formatKey,
  1845  			Emit: func(s keyspan.Span) {
  1846  				b.rangeKeys = append(b.rangeKeys, s)
  1847  			},
  1848  		}
  1849  		it := &flushableBatchIter{
  1850  			batch:   b,
  1851  			data:    b.data,
  1852  			offsets: rangeKeyOffsets,
  1853  			cmp:     b.cmp,
  1854  			index:   -1,
  1855  		}
  1856  		if err := fragmentRangeKeys(frag, it, len(rangeKeyOffsets)); err != nil { return nil, err }
  1857  	}
  1858  	return b, nil
  1859  }
  1860  
  1861  func (b *flushableBatch) setSeqNum(seqNum uint64) {
  1862  	if b.seqNum != 0 {
  1863  		panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum))
  1864  	}
  1865  	b.seqNum = seqNum
  1866  	for i := range b.tombstones {
  1867  		for j := range b.tombstones[i].Keys {
  1868  			b.tombstones[i].Keys[j].Trailer = base.MakeTrailer(
  1869  				b.tombstones[i].Keys[j].SeqNum()+seqNum,
  1870  				b.tombstones[i].Keys[j].Kind(),
  1871  			)
  1872  		}
  1873  	}
  1874  	for i := range b.rangeKeys {
  1875  		for j := range b.rangeKeys[i].Keys {
  1876  			b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer(
  1877  				b.rangeKeys[i].Keys[j].SeqNum()+seqNum,
  1878  				b.rangeKeys[i].Keys[j].Kind(),
  1879  			)
  1880  		}
  1881  	}
  1882  }
  1883  
  1884  func (b *flushableBatch) Len() int {
  1885  	return len(b.offsets)
  1886  }
  1887  
  1888  func (b *flushableBatch) Less(i, j int) bool {
  1889  	ei := &b.offsets[i]
  1890  	ej := &b.offsets[j]
  1891  	ki := b.data[ei.keyStart:ei.keyEnd]
  1892  	kj := b.data[ej.keyStart:ej.keyEnd]
  1893  	switch c := b.cmp(ki, kj); {
  1894  	case c < 0:
  1895  		return true
  1896  	case c > 0:
  1897  		return false
  1898  	default:
  1899  		return ei.offset > ej.offset
  1900  	}
  1901  }
  1902  
  1903  func (b *flushableBatch) Swap(i, j int) {
  1904  	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
  1905  }
  1906  
  1907  // newIter is part of the flushable interface.
  1908  func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
  1909  	return &flushableBatchIter{
  1910  		batch:   b,
  1911  		data:    b.data,
  1912  		offsets: b.offsets,
  1913  		cmp:     b.cmp,
  1914  		index:   -1,
  1915  		lower:   o.GetLowerBound(),
  1916  		upper:   o.GetUpperBound(),
  1917  	}
  1918  }
  1919  
  1920  // newFlushIter is part of the flushable interface.
  1921  func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
  1922  	return &flushFlushableBatchIter{
  1923  		flushableBatchIter: flushableBatchIter{
  1924  			batch:   b,
  1925  			data:    b.data,
  1926  			offsets: b.offsets,
  1927  			cmp:     b.cmp,
  1928  			index:   -1,
  1929  		},
  1930  		bytesIterated: bytesFlushed,
  1931  	}
  1932  }
  1933  
  1934  // newRangeDelIter is part of the flushable interface.
  1935  func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator {
  1936  	if len(b.tombstones) == 0 {
  1937  		return nil
  1938  	}
  1939  	return keyspan.NewIter(b.cmp, b.tombstones)
  1940  }
  1941  
  1942  // newRangeKeyIter is part of the flushable interface.
  1943  func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator {
  1944  	if len(b.rangeKeys) == 0 {
  1945  		return nil
  1946  	}
  1947  	return keyspan.NewIter(b.cmp, b.rangeKeys)
  1948  }
  1949  
  1950  // containsRangeKeys is part of the flushable interface.
  1951  func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 }
  1952  
  1953  // inuseBytes is part of the flushable interface.
  1954  func (b *flushableBatch) inuseBytes() uint64 {
  1955  	return uint64(len(b.data) - batchHeaderLen)
  1956  }
  1957  
  1958  // totalBytes is part of the flushable interface.
  1959  func (b *flushableBatch) totalBytes() uint64 {
  1960  	return uint64(cap(b.data))
  1961  }
  1962  
  1963  // readyForFlush is part of the flushable interface.
  1964  func (b *flushableBatch) readyForFlush() bool {
  1965  	// A flushable batch is always ready for flush; it must be flushed together
  1966  	// with the previous memtable.
  1967  	return true
  1968  }
  1969  
  1970  // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
  1971  // two in sync.
  1972  type flushableBatchIter struct {
  1973  	// Members to be initialized by creator.
  1974  	batch *flushableBatch
  1975  	// The bytes backing the batch. Always the same as batch.data?
  1976  	data []byte
  1977  	// The sorted entries. This is not always equal to batch.offsets.
  1978  	offsets []flushableBatchEntry
  1979  	cmp     Compare
  1980  	// Must be initialized to -1. It is the index into offsets that represents
  1981  	// the current iterator position.
  1982  	index int
  1983  
  1984  	// For internal use by the implementation.
  1985  	key InternalKey
  1986  	err error
  1987  
  1988  	// Optionally initialize to bounds of iteration, if any.
  1989  	lower []byte
  1990  	upper []byte
  1991  }
  1992  
  1993  // flushableBatchIter implements the base.InternalIterator interface.
  1994  var _ base.InternalIterator = (*flushableBatchIter)(nil)
  1995  
  1996  func (i *flushableBatchIter) String() string {
  1997  	return "flushable-batch"
  1998  }
  1999  
  2000  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
  2001  // package. Ignore flags.TrySeekUsingNext() since we don't expect this
  2002  // optimization to provide much benefit here at the moment.
  2003  func (i *flushableBatchIter) SeekGE(
  2004  	key []byte, flags base.SeekGEFlags,
  2005  ) (*InternalKey, base.LazyValue) {
  2006  	i.err = nil // clear cached iteration error
  2007  	ikey := base.MakeSearchKey(key)
  2008  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  2009  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  2010  	})
  2011  	if i.index >= len(i.offsets) {
  2012  		return nil, base.LazyValue{}
  2013  	}
  2014  	i.key = i.getKey(i.index)
  2015  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2016  		i.index = len(i.offsets)
  2017  		return nil, base.LazyValue{}
  2018  	}
  2019  	return &i.key, i.value()
  2020  }
  2021  
  2022  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
  2023  // pebble package.
  2024  func (i *flushableBatchIter) SeekPrefixGE(
  2025  	prefix, key []byte, flags base.SeekGEFlags,
  2026  ) (*base.InternalKey, base.LazyValue) {
  2027  	return i.SeekGE(key, flags)
  2028  }
  2029  
  2030  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
  2031  // package.
  2032  func (i *flushableBatchIter) SeekLT(
  2033  	key []byte, flags base.SeekLTFlags,
  2034  ) (*InternalKey, base.LazyValue) {
  2035  	i.err = nil // clear cached iteration error
  2036  	ikey := base.MakeSearchKey(key)
  2037  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  2038  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  2039  	})
  2040  	i.index--
  2041  	if i.index < 0 {
  2042  		return nil, base.LazyValue{}
  2043  	}
  2044  	i.key = i.getKey(i.index)
  2045  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2046  		i.index = -1
  2047  		return nil, base.LazyValue{}
  2048  	}
  2049  	return &i.key, i.value()
  2050  }
  2051  
  2052  // First implements internalIterator.First, as documented in the pebble
  2053  // package.
  2054  func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) {
  2055  	i.err = nil // clear cached iteration error
  2056  	if len(i.offsets) == 0 {
  2057  		return nil, base.LazyValue{}
  2058  	}
  2059  	i.index = 0
  2060  	i.key = i.getKey(i.index)
  2061  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2062  		i.index = len(i.offsets)
  2063  		return nil, base.LazyValue{}
  2064  	}
  2065  	return &i.key, i.value()
  2066  }
  2067  
  2068  // Last implements internalIterator.Last, as documented in the pebble
  2069  // package.
  2070  func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) {
  2071  	i.err = nil // clear cached iteration error
  2072  	if len(i.offsets) == 0 {
  2073  		return nil, base.LazyValue{}
  2074  	}
  2075  	i.index = len(i.offsets) - 1
  2076  	i.key = i.getKey(i.index)
  2077  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2078  		i.index = -1
  2079  		return nil, base.LazyValue{}
  2080  	}
  2081  	return &i.key, i.value()
  2082  }
  2083  
  2084  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  2085  // flushableBatchIter.Next due to performance. Keep the two in sync.
  2086  func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) {
  2087  	if i.index == len(i.offsets) {
  2088  		return nil, base.LazyValue{}
  2089  	}
  2090  	i.index++
  2091  	if i.index == len(i.offsets) {
  2092  		return nil, base.LazyValue{}
  2093  	}
  2094  	i.key = i.getKey(i.index)
  2095  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2096  		i.index = len(i.offsets)
  2097  		return nil, base.LazyValue{}
  2098  	}
  2099  	return &i.key, i.value()
  2100  }
  2101  
  2102  func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) {
  2103  	if i.index < 0 {
  2104  		return nil, base.LazyValue{}
  2105  	}
  2106  	i.index--
  2107  	if i.index < 0 {
  2108  		return nil, base.LazyValue{}
  2109  	}
  2110  	i.key = i.getKey(i.index)
  2111  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2112  		i.index = -1
  2113  		return nil, base.LazyValue{}
  2114  	}
  2115  	return &i.key, i.value()
  2116  }
  2117  
  2118  // Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of
  2119  // flushableBatchIter.NextPrefix due to performance. Keep the two in sync.
  2120  func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
  2121  	return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext())
  2122  }
  2123  
  2124  func (i *flushableBatchIter) getKey(index int) InternalKey {
  2125  	e := &i.offsets[index]
  2126  	kind := InternalKeyKind(i.data[e.offset])
  2127  	key := i.data[e.keyStart:e.keyEnd]
  2128  	return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind)
  2129  }
  2130  
  2131  func (i *flushableBatchIter) value() base.LazyValue {
  2132  	p := i.data[i.offsets[i.index].offset:]
  2133  	if len(p) == 0 {
  2134  		i.err = base.CorruptionErrorf("corrupted batch")
  2135  		return base.LazyValue{}
  2136  	}
  2137  	kind := InternalKeyKind(p[0])
  2138  	if kind > InternalKeyKindMax {
  2139  		i.err = base.CorruptionErrorf("corrupted batch")
  2140  		return base.LazyValue{}
  2141  	}
  2142  	var value []byte
  2143  	var ok bool
  2144  	switch kind {
  2145  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  2146  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  2147  		InternalKeyKindDeleteSized:
  2148  		keyEnd := i.offsets[i.index].keyEnd
  2149  		_, value, ok = batchDecodeStr(i.data[keyEnd:])
  2150  		if !ok {
  2151  			i.err = base.CorruptionErrorf("corrupted batch")
  2152  			return base.LazyValue{}
  2153  		}
  2154  	}
  2155  	return base.MakeInPlaceValue(value)
  2156  }
  2157  
  2158  func (i *flushableBatchIter) Valid() bool {
  2159  	return i.index >= 0 && i.index < len(i.offsets)
  2160  }
  2161  
  2162  func (i *flushableBatchIter) Error() error {
  2163  	return i.err
  2164  }
  2165  
  2166  func (i *flushableBatchIter) Close() error {
  2167  	return i.err
  2168  }
  2169  
  2170  func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
  2171  	i.lower = lower
  2172  	i.upper = upper
  2173  }
  2174  
  2175  // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
  2176  // of number of bytes iterated.
  2177  type flushFlushableBatchIter struct {
  2178  	flushableBatchIter
  2179  	bytesIterated *uint64
  2180  }
  2181  
  2182  // flushFlushableBatchIter implements the base.InternalIterator interface.
  2183  var _ base.InternalIterator = (*flushFlushableBatchIter)(nil)
  2184  
  2185  func (i *flushFlushableBatchIter) String() string {
  2186  	return "flushable-batch"
  2187  }
  2188  
  2189  func (i *flushFlushableBatchIter) SeekGE(
  2190  	key []byte, flags base.SeekGEFlags,
  2191  ) (*InternalKey, base.LazyValue) {
  2192  	panic("pebble: SeekGE unimplemented")
  2193  }
  2194  
  2195  func (i *flushFlushableBatchIter) SeekPrefixGE(
  2196  	prefix, key []byte, flags base.SeekGEFlags,
  2197  ) (*base.InternalKey, base.LazyValue) {
  2198  	panic("pebble: SeekPrefixGE unimplemented")
  2199  }
  2200  
  2201  func (i *flushFlushableBatchIter) SeekLT(
  2202  	key []byte, flags base.SeekLTFlags,
  2203  ) (*InternalKey, base.LazyValue) {
  2204  	panic("pebble: SeekLT unimplemented")
  2205  }
  2206  
  2207  func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) {
  2208  	i.err = nil // clear cached iteration error
  2209  	key, val := i.flushableBatchIter.First()
  2210  	if key == nil {
  2211  		return nil, base.LazyValue{}
  2212  	}
  2213  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  2214  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  2215  	return key, val
  2216  }
  2217  
  2218  func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
  2219  	panic("pebble: NextPrefix unimplemented")
  2220  }
  2221  
  2222  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  2223  // flushableBatchIter.Next due to performance. Keep the two in sync.
  2224  func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) {
  2225  	if i.index == len(i.offsets) {
  2226  		return nil, base.LazyValue{}
  2227  	}
  2228  	i.index++
  2229  	if i.index == len(i.offsets) {
  2230  		return nil, base.LazyValue{}
  2231  	}
  2232  	i.key = i.getKey(i.index)
  2233  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  2234  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  2235  	return &i.key, i.value()
  2236  }
  2237  
  2238  func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) {
  2239  	panic("pebble: Prev unimplemented")
  2240  }
  2241  
  2242  func (i flushFlushableBatchIter) valueSize() uint64 {
  2243  	p := i.data[i.offsets[i.index].offset:]
  2244  	if len(p) == 0 {
  2245  		i.err = base.CorruptionErrorf("corrupted batch")
  2246  		return 0
  2247  	}
  2248  	kind := InternalKeyKind(p[0])
  2249  	if kind > InternalKeyKindMax {
  2250  		i.err = base.CorruptionErrorf("corrupted batch")
  2251  		return 0
  2252  	}
  2253  	var length uint64
  2254  	switch kind {
  2255  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
  2256  		keyEnd := i.offsets[i.index].keyEnd
  2257  		v, n := binary.Uvarint(i.data[keyEnd:])
  2258  		if n <= 0 {
  2259  			i.err = base.CorruptionErrorf("corrupted batch")
  2260  			return 0
  2261  		}
  2262  		length = v + uint64(n)
  2263  	}
  2264  	return length
  2265  }
  2266  
  2267  // batchSort returns iterators for the sorted contents of the batch. It is
  2268  // intended for testing use only. The batch.Sort dance is done to prevent
  2269  // exposing this method in the public pebble interface.
  2270  func batchSort(
  2271  	i interface{},
  2272  ) (
  2273  	points internalIterator,
  2274  	rangeDels keyspan.FragmentIterator,
  2275  	rangeKeys keyspan.FragmentIterator,
  2276  ) {
  2277  	b := i.(*Batch)
  2278  	if b.Indexed() {
  2279  		pointIter := b.newInternalIter(nil)
  2280  		rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64)
  2281  		rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64)
  2282  		return pointIter, rangeDelIter, rangeKeyIter
  2283  	}
  2284  	f, err := newFlushableBatch(b, b.db.opts.Comparer)
  2285  	if err != nil {
  2286  		panic(err)
  2287  	}
  2288  	return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil)
  2289  }
  2290  
  2291  func init() {
  2292  	private.BatchSort = batchSort
  2293  }