github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/batch.go

     1  // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"sort"
    14  	"sync"
    15  	"sync/atomic"
    16  	"time"
    17  	"unsafe"
    18  
    19  	"github.com/cockroachdb/errors"
    20  	"github.com/cockroachdb/pebble/internal/base"
    21  	"github.com/cockroachdb/pebble/internal/batchskl"
    22  	"github.com/cockroachdb/pebble/internal/humanize"
    23  	"github.com/cockroachdb/pebble/internal/invariants"
    24  	"github.com/cockroachdb/pebble/internal/keyspan"
    25  	"github.com/cockroachdb/pebble/internal/private"
    26  	"github.com/cockroachdb/pebble/internal/rangedel"
    27  	"github.com/cockroachdb/pebble/internal/rangekey"
    28  	"github.com/cockroachdb/pebble/internal/rawalloc"
    29  )
    30  
    31  const (
    32  	batchCountOffset     = 8
    33  	batchHeaderLen       = 12
    34  	batchInitialSize     = 1 << 10 // 1 KB
    35  	batchMaxRetainedSize = 1 << 20 // 1 MB
    36  	invalidBatchCount    = 1<<32 - 1
    37  	maxVarintLen32       = 5
    38  )
    39  
    40  // ErrNotIndexed means that a read operation on a batch failed because the
    41  // batch is not indexed and thus doesn't support reads.
    42  var ErrNotIndexed = errors.New("pebble: batch not indexed")
    43  
    44  // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
    45  var ErrInvalidBatch = base.MarkCorruptionError(errors.New("pebble: invalid batch"))
    46  
     47  // ErrBatchTooLarge indicates that a batch exceeds the maximum allowed batch size.
    48  var ErrBatchTooLarge = base.MarkCorruptionError(errors.Newf("pebble: batch too large: >= %s", humanize.Bytes.Uint64(maxBatchSize)))
    49  
    50  // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is
    51  // being inserted into the batch. Indexing is not performed on the specified key
    52  // until Finish is called, hence the name deferred. This struct lets the caller
    53  // copy or encode keys/values directly into the batch representation instead of
    54  // copying into an intermediary buffer then having pebble.Batch copy off of it.
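         //
         // As an illustrative sketch (k and v are placeholder byte slices, b is
         // a *Batch), the deferred variants are used as follows:
         //
         //	op := b.SetDeferred(len(k), len(v))
         //	copy(op.Key, k)
         //	copy(op.Value, v)
         //	if err := op.Finish(); err != nil {
         //		return err
         //	}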
    55  type DeferredBatchOp struct {
    56  	index *batchskl.Skiplist
    57  
    58  	// Key and Value point to parts of the binary batch representation where
    59  	// keys and values should be encoded/copied into. len(Key) and len(Value)
    60  	// bytes must be copied into these slices respectively before calling
    61  	// Finish(). Changing where these slices point to is not allowed.
    62  	Key, Value []byte
    63  	offset     uint32
    64  }
    65  
    66  // Finish completes the addition of this batch operation, and adds it to the
    67  // index if necessary. Must be called once (and exactly once) keys/values
    68  // have been filled into Key and Value. Not calling Finish or not
    69  // copying/encoding keys will result in an incomplete index, and calling Finish
    70  // twice may result in a panic.
    71  func (d DeferredBatchOp) Finish() error {
    72  	if d.index != nil {
    73  		if err := d.index.Add(d.offset); err != nil {
    74  			return err
    75  		}
    76  	}
    77  	return nil
    78  }
    79  
    80  // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets,
    81  // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch
    82  // implements the Reader interface, but only an indexed batch supports reading
    83  // (without error) via Get or NewIter. A non-indexed batch will return
    84  // ErrNotIndexed when read from. A batch is not safe for concurrent use, and
    85  // consumers should use a batch per goroutine or provide their own
    86  // synchronization.
    87  //
    88  // # Indexing
    89  //
    90  // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
    91  // allows iteration via an Iterator (see Batch.NewIter). The iterator provides
    92  // a merged view of the operations in the batch and the underlying
    93  // database. This is implemented by treating the batch as an additional layer
    94  // in the LSM where every entry in the batch is considered newer than any entry
    95  // in the underlying database (batch entries have the InternalKeySeqNumBatch
    96  // bit set). By treating the batch as an additional layer in the LSM, iteration
    97  // supports all batch operations (i.e. Set, Merge, Delete, DeleteRange,
    98  // RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
    99  //
   100  // The same key can be operated on multiple times in a batch, though only the
   101  // latest operation will be visible. For example, Put("a", "b"), Delete("a")
   102  // will cause the key "a" to not be visible in the batch. Put("a", "b"),
   103  // Put("a", "c") will cause a read of "a" to return the value "c".
   104  //
    105  // The batch index is implemented via a skiplist (internal/batchskl). While
   106  // the skiplist implementation is very fast, inserting into an indexed batch is
   107  // significantly slower than inserting into a non-indexed batch. Only use an
   108  // indexed batch if you require reading from it.
   109  //
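         // As an illustrative sketch (db is an open *DB; error handling elided),
         // reading through an indexed batch looks like:
         //
         //	b := db.NewIndexedBatch()
         //	_ = b.Set([]byte("a"), []byte("1"), nil)
         //	iter, _ := b.NewIter(nil)
         //	for iter.First(); iter.Valid(); iter.Next() {
         //		// Observes both the batch's mutations and the underlying DB state.
         //	}
         //	_ = iter.Close()
         //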
   110  // # Atomic commit
   111  //
   112  // The operations in a batch are persisted by calling Batch.Commit which is
   113  // equivalent to calling DB.Apply(batch). A batch is committed atomically by
   114  // writing the internal batch representation to the WAL, adding all of the
   115  // batch operations to the memtable associated with the WAL, and then
   116  // incrementing the visible sequence number so that subsequent reads can see
   117  // the effects of the batch operations. If WriteOptions.Sync is true, a call to
   118  // Batch.Commit will guarantee that the batch is persisted to disk before
   119  // returning. See commitPipeline for more on the implementation details.
   120  //
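         // As an illustrative sketch (db is an open *DB; Sync requests a durable
         // commit):
         //
         //	b := db.NewBatch()
         //	_ = b.Set([]byte("k"), []byte("v"), nil)
         //	if err := b.Commit(Sync); err != nil {
         //		// Handle the commit error.
         //	}
         //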
   121  // # Large batches
   122  //
   123  // The size of a batch is limited only by available memory (be aware that
    124  // indexed batches require considerable additional memory for the skiplist
   125  // structure). A given WAL file has a single memtable associated with it (this
   126  // restriction could be removed, but doing so is onerous and complex). And a
   127  // memtable has a fixed size due to the underlying fixed size arena. Note that
   128  // this differs from RocksDB where a memtable can grow arbitrarily large using
   129  // a list of arena chunks. In RocksDB this is accomplished by storing pointers
   130  // in the arena memory, but that isn't possible in Go.
   131  //
   132  // During Batch.Commit, a batch which is larger than a threshold (>
   133  // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
    134  // of memtables. A flushableBatch forces the WAL to be rotated, but that happens
    135  // anyway when the memtable becomes full, so this does not cause significant
   136  // WAL churn. Because the flushableBatch is readable as another layer in the
   137  // LSM, Batch.Commit returns as soon as the flushableBatch has been added to
   138  // the queue of memtables.
   139  //
   140  // Internally, a flushableBatch provides Iterator support by sorting the batch
   141  // contents (the batch is sorted once, when it is added to the memtable
   142  // queue). Sorting the batch contents and insertion of the contents into a
   143  // memtable have the same big-O time, but the constant factor dominates
   144  // here. Sorting is significantly faster and uses significantly less memory.
   145  //
   146  // # Internal representation
   147  //
   148  // The internal batch representation is a contiguous byte buffer with a fixed
   149  // 12-byte header, followed by a series of records.
   150  //
   151  //	+-------------+------------+--- ... ---+
   152  //	| SeqNum (8B) | Count (4B) |  Entries  |
   153  //	+-------------+------------+--- ... ---+
   154  //
   155  // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
   156  // strings (varstring):
   157  //
   158  //	+-----------+-----------------+-------------------+
   159  //	| Kind (1B) | Key (varstring) | Value (varstring) |
   160  //	+-----------+-----------------+-------------------+
   161  //
   162  // A varstring is a varint32 followed by N bytes of data. The Kind tags are
   163  // exactly those specified by InternalKeyKind. The following table shows the
   164  // format for records of each kind:
   165  //
   166  //	InternalKeyKindDelete         varstring
   167  //	InternalKeyKindLogData        varstring
   168  //	InternalKeyKindIngestSST      varstring
   169  //	InternalKeyKindSet            varstring varstring
   170  //	InternalKeyKindMerge          varstring varstring
   171  //	InternalKeyKindRangeDelete    varstring varstring
   172  //	InternalKeyKindRangeKeySet    varstring varstring
   173  //	InternalKeyKindRangeKeyUnset  varstring varstring
   174  //	InternalKeyKindRangeKeyDelete varstring varstring
   175  //
    176  // The intuitive understanding here is that the arguments to Delete, Set,
   177  // Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The
   178  // RangeKeySet and RangeKeyUnset operations are slightly more complicated,
   179  // encoding their end key, suffix and value [in the case of RangeKeySet] within
   180  // the Value varstring. For more information on the value encoding for
   181  // RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
   182  //
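         // As an illustrative sketch, the fixed header of a batch b can be
         // decoded with encoding/binary (both fields are little-endian):
         //
         //	repr := b.Repr()
         //	seqNum := binary.LittleEndian.Uint64(repr[:8])
         //	count := binary.LittleEndian.Uint32(repr[8:12])
         //	fmt.Printf("seqnum=%d count=%d\n", seqNum, count)
         //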
   183  // The internal batch representation is the on disk format for a batch in the
   184  // WAL, and thus stable. New record kinds may be added, but the existing ones
   185  // will not be modified.
   186  type Batch struct {
   187  	batchInternal
   188  	applied atomic.Bool
   189  }
   190  
   191  // batchInternal contains the set of fields within Batch that are non-atomic and
   192  // capable of being reset using a *b = batchInternal{} struct copy.
   193  type batchInternal struct {
   194  	// Data is the wire format of a batch's log entry:
   195  	//   - 8 bytes for a sequence number of the first batch element,
   196  	//     or zeroes if the batch has not yet been applied,
   197  	//   - 4 bytes for the count: the number of elements in the batch,
   198  	//     or "\xff\xff\xff\xff" if the batch is invalid,
   199  	//   - count elements, being:
   200  	//     - one byte for the kind
   201  	//     - the varint-string user key,
   202  	//     - the varint-string value (if kind != delete).
   203  	// The sequence number and count are stored in little-endian order.
   204  	//
   205  	// The data field can be (but is not guaranteed to be) nil for new
   206  	// batches. Large batches will set the data field to nil when committed as
   207  	// the data has been moved to a flushableBatch and inserted into the queue of
   208  	// memtables.
   209  	data           []byte
   210  	cmp            Compare
   211  	formatKey      base.FormatKey
   212  	abbreviatedKey AbbreviatedKey
   213  
   214  	// An upper bound on required space to add this batch to a memtable.
   215  	// Note that although batches are limited to 4 GiB in size, that limit
   216  	// applies to len(data), not the memtable size. The upper bound on the
   217  	// size of a memtable node is larger than the overhead of the batch's log
   218  	// encoding, so memTableSize is larger than len(data) and may overflow a
   219  	// uint32.
   220  	memTableSize uint64
   221  
   222  	// The db to which the batch will be committed. Do not change this field
   223  	// after the batch has been created as it might invalidate internal state.
   224  	// Batch.memTableSize is only refreshed if Batch.db is set. Setting db to
   225  	// nil once it has been set implies that the Batch has encountered an error.
   226  	db *DB
   227  
   228  	// The count of records in the batch. This count will be stored in the batch
   229  	// data whenever Repr() is called.
   230  	count uint64
   231  
   232  	// The count of range deletions in the batch. Updated every time a range
   233  	// deletion is added.
   234  	countRangeDels uint64
   235  
   236  	// The count of range key sets, unsets and deletes in the batch. Updated
   237  	// every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added.
   238  	countRangeKeys uint64
   239  
   240  	// A deferredOp struct, stored in the Batch so that a pointer can be returned
   241  	// from the *Deferred() methods rather than a value.
   242  	deferredOp DeferredBatchOp
   243  
   244  	// An optional skiplist keyed by offset into data of the entry.
   245  	index         *batchskl.Skiplist
   246  	rangeDelIndex *batchskl.Skiplist
   247  	rangeKeyIndex *batchskl.Skiplist
   248  
   249  	// Fragmented range deletion tombstones. Cached the first time a range
   250  	// deletion iterator is requested. The cache is invalidated whenever a new
   251  	// range deletion is added to the batch. This cache can only be used when
   252  	// opening an iterator to read at a batch sequence number >=
   253  	// tombstonesSeqNum. This is the case for all new iterators created over a
   254  	// batch but it's not the case for all cloned iterators.
   255  	tombstones       []keyspan.Span
   256  	tombstonesSeqNum uint64
   257  
   258  	// Fragmented range key spans. Cached the first time a range key iterator is
   259  	// requested. The cache is invalidated whenever a new range key
   260  	// (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be
   261  	// used when opening an iterator to read at a batch sequence number >=
    262  	// rangeKeysSeqNum. This is the case for all new iterators created over a
   263  	// batch but it's not the case for all cloned iterators.
   264  	rangeKeys       []keyspan.Span
   265  	rangeKeysSeqNum uint64
   266  
   267  	// The flushableBatch wrapper if the batch is too large to fit in the
   268  	// memtable.
   269  	flushable *flushableBatch
   270  
   271  	// minimumFormatMajorVersion indicates the format major version required in
   272  	// order to commit this batch. If an operation requires a particular format
   273  	// major version, it ratchets the batch's minimumFormatMajorVersion. When
   274  	// the batch is committed, this is validated against the database's current
   275  	// format major version.
   276  	minimumFormatMajorVersion FormatMajorVersion
   277  
   278  	// Synchronous Apply uses the commit WaitGroup for both publishing the
   279  	// seqnum and waiting for the WAL fsync (if needed). Asynchronous
   280  	// ApplyNoSyncWait, which implies WriteOptions.Sync is true, uses the commit
   281  	// WaitGroup for publishing the seqnum and the fsyncWait WaitGroup for
   282  	// waiting for the WAL fsync.
   283  	//
   284  	// TODO(sumeer): if we find that ApplyNoSyncWait in conjunction with
   285  	// SyncWait is causing higher memory usage because of the time duration
   286  	// between when the sync is already done, and a goroutine calls SyncWait
   287  	// (followed by Batch.Close), we could separate out {fsyncWait, commitErr}
   288  	// into a separate struct that is allocated separately (using another
   289  	// sync.Pool), and only that struct needs to outlive Batch.Close (which
   290  	// could then be called immediately after ApplyNoSyncWait). commitStats
   291  	// will also need to be in this separate struct.
   292  	commit    sync.WaitGroup
   293  	fsyncWait sync.WaitGroup
   294  
   295  	commitStats BatchCommitStats
   296  
   297  	commitErr error
   298  
    299  	// Position bools together to reduce the size of the struct.
   300  
   301  	// ingestedSSTBatch indicates that the batch contains one or more key kinds
   302  	// of InternalKeyKindIngestSST. If the batch contains key kinds of IngestSST
   303  	// then it will only contain key kinds of IngestSST.
   304  	ingestedSSTBatch bool
   305  
   306  	// committing is set to true when a batch begins to commit. It's used to
    307  	// ensure the batch is not mutated concurrently. It is deliberately not an
    308  	// atomic, so as to avoid the overhead on batch mutations. This is
   309  	// okay, because under correct usage this field will never be accessed
   310  	// concurrently. It's only under incorrect usage the memory accesses of this
   311  	// variable may violate memory safety. Since we don't use atomics here,
   312  	// false negatives are possible.
   313  	committing bool
   314  }
   315  
   316  // BatchCommitStats exposes stats related to committing a batch.
   317  //
   318  // NB: there is no Pebble internal tracing (using LoggerAndTracer) of slow
   319  // batch commits. The caller can use these stats to do their own tracing as
   320  // needed.
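         //
         // As an illustrative sketch (assuming the batch b was committed via
         // Batch.Commit and the stats are retrieved via Batch.CommitStats):
         //
         //	if err := b.Commit(Sync); err == nil {
         //		stats := b.CommitStats()
         //		if stats.TotalDuration > 10*time.Millisecond {
         //			// Trace or log the slow commit.
         //		}
         //	}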
   321  type BatchCommitStats struct {
   322  	// TotalDuration is the time spent in DB.{Apply,ApplyNoSyncWait} or
   323  	// Batch.Commit, plus the time waiting in Batch.SyncWait. If there is a gap
   324  	// between calling ApplyNoSyncWait and calling SyncWait, that gap could
   325  	// include some duration in which real work was being done for the commit
   326  	// and will not be included here. This missing time is considered acceptable
   327  	// since the goal of these stats is to understand user-facing latency.
   328  	//
   329  	// TotalDuration includes time spent in various queues both inside Pebble
   330  	// and outside Pebble (I/O queues, goroutine scheduler queue, mutex wait
   331  	// etc.). For some of these queues (which we consider important) the wait
   332  	// times are included below -- these expose low-level implementation detail
   333  	// and are meant for expert diagnosis and subject to change. There may be
   334  	// unaccounted time after subtracting those values from TotalDuration.
   335  	TotalDuration time.Duration
   336  	// SemaphoreWaitDuration is the wait time for semaphores in
   337  	// commitPipeline.Commit.
   338  	SemaphoreWaitDuration time.Duration
   339  	// WALQueueWaitDuration is the wait time for allocating memory blocks in the
   340  	// LogWriter (due to the LogWriter not writing fast enough). At the moment
    341  	// this duration is always zero because a single WAL will allow
   342  	// allocating memory blocks up to the entire memtable size. In the future,
   343  	// we may pipeline WALs and bound the WAL queued blocks separately, so this
   344  	// field is preserved for that possibility.
   345  	WALQueueWaitDuration time.Duration
   346  	// MemTableWriteStallDuration is the wait caused by a write stall due to too
   347  	// many memtables (due to not flushing fast enough).
   348  	MemTableWriteStallDuration time.Duration
   349  	// L0ReadAmpWriteStallDuration is the wait caused by a write stall due to
   350  	// high read amplification in L0 (due to not compacting fast enough out of
   351  	// L0).
   352  	L0ReadAmpWriteStallDuration time.Duration
   353  	// WALRotationDuration is the wait time for WAL rotation, which includes
   354  	// syncing and closing the old WAL and creating (or reusing) a new one.
   355  	WALRotationDuration time.Duration
   356  	// CommitWaitDuration is the wait for publishing the seqnum plus the
   357  	// duration for the WAL sync (if requested). The former should be tiny and
   358  	// one can assume that this is all due to the WAL sync.
   359  	CommitWaitDuration time.Duration
   360  }
   361  
   362  var _ Reader = (*Batch)(nil)
   363  var _ Writer = (*Batch)(nil)
   364  
   365  var batchPool = sync.Pool{
   366  	New: func() interface{} {
   367  		return &Batch{}
   368  	},
   369  }
   370  
   371  type indexedBatch struct {
   372  	batch Batch
   373  	index batchskl.Skiplist
   374  }
   375  
   376  var indexedBatchPool = sync.Pool{
   377  	New: func() interface{} {
   378  		return &indexedBatch{}
   379  	},
   380  }
   381  
   382  func newBatch(db *DB) *Batch {
   383  	b := batchPool.Get().(*Batch)
   384  	b.db = db
   385  	return b
   386  }
   387  
   388  func newBatchWithSize(db *DB, size int) *Batch {
   389  	b := newBatch(db)
   390  	if cap(b.data) < size {
   391  		b.data = rawalloc.New(0, size)
   392  	}
   393  	return b
   394  }
   395  
   396  func newIndexedBatch(db *DB, comparer *Comparer) *Batch {
   397  	i := indexedBatchPool.Get().(*indexedBatch)
   398  	i.batch.cmp = comparer.Compare
   399  	i.batch.formatKey = comparer.FormatKey
   400  	i.batch.abbreviatedKey = comparer.AbbreviatedKey
   401  	i.batch.db = db
   402  	i.batch.index = &i.index
   403  	i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey)
   404  	return &i.batch
   405  }
   406  
   407  func newIndexedBatchWithSize(db *DB, comparer *Comparer, size int) *Batch {
   408  	b := newIndexedBatch(db, comparer)
   409  	if cap(b.data) < size {
   410  		b.data = rawalloc.New(0, size)
   411  	}
   412  	return b
   413  }
   414  
   415  // nextSeqNum returns the batch "sequence number" that will be given to the next
   416  // key written to the batch. During iteration keys within an indexed batch are
   417  // given a sequence number consisting of their offset within the batch combined
   418  // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only
   419  // used during iteration, and the keys are assigned ordinary sequence numbers
   420  // when the batch is committed.
   421  func (b *Batch) nextSeqNum() uint64 {
   422  	return uint64(len(b.data)) | base.InternalKeySeqNumBatch
   423  }
   424  
   425  func (b *Batch) release() {
   426  	if b.db == nil {
   427  		// The batch was not created using newBatch or newIndexedBatch, or an error
   428  		// was encountered. We don't try to reuse batches that encountered an error
   429  		// because they might be stuck somewhere in the system and attempting to
   430  		// reuse such batches is a recipe for onerous debugging sessions. Instead,
   431  		// let the GC do its job.
   432  		return
   433  	}
   434  	b.db = nil
   435  
   436  	// NB: This is ugly (it would be cleaner if we could just assign a Batch{}),
    437  	// but necessary so that we can atomically clear the Batch.applied
   438  	// field. Without using an atomic to clear that field the Go race detector
   439  	// complains.
   440  	b.Reset()
   441  	b.cmp = nil
   442  	b.formatKey = nil
   443  	b.abbreviatedKey = nil
   444  
   445  	if b.index == nil {
   446  		batchPool.Put(b)
   447  	} else {
   448  		b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil
   449  		indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b)))
   450  	}
   451  }
   452  
   453  func (b *Batch) refreshMemTableSize() error {
   454  	b.memTableSize = 0
   455  	if len(b.data) < batchHeaderLen {
   456  		return nil
   457  	}
   458  
   459  	b.countRangeDels = 0
   460  	b.countRangeKeys = 0
   461  	b.minimumFormatMajorVersion = 0
   462  	for r := b.Reader(); ; {
   463  		kind, key, value, ok, err := r.Next()
   464  		if !ok {
   465  			if err != nil {
   466  				return err
   467  			}
   468  			break
   469  		}
   470  		switch kind {
   471  		case InternalKeyKindRangeDelete:
   472  			b.countRangeDels++
   473  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   474  			b.countRangeKeys++
   475  		case InternalKeyKindDeleteSized:
   476  			if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete {
   477  				b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete
   478  			}
   479  		case InternalKeyKindIngestSST:
   480  			if b.minimumFormatMajorVersion < FormatFlushableIngest {
   481  				b.minimumFormatMajorVersion = FormatFlushableIngest
   482  			}
   483  			// This key kind doesn't contribute to the memtable size.
   484  			continue
   485  		}
   486  		b.memTableSize += memTableEntrySize(len(key), len(value))
   487  	}
   488  	if b.countRangeKeys > 0 && b.minimumFormatMajorVersion < FormatRangeKeys {
   489  		b.minimumFormatMajorVersion = FormatRangeKeys
   490  	}
   491  	return nil
   492  }
   493  
   494  // Apply the operations contained in the batch to the receiver batch.
   495  //
   496  // It is safe to modify the contents of the arguments after Apply returns.
   497  func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error {
   498  	if b.ingestedSSTBatch {
   499  		panic("pebble: invalid batch application")
   500  	}
   501  	if len(batch.data) == 0 {
   502  		return nil
   503  	}
   504  	if len(batch.data) < batchHeaderLen {
   505  		return ErrInvalidBatch
   506  	}
   507  
   508  	offset := len(b.data)
   509  	if offset == 0 {
   510  		b.init(offset)
   511  		offset = batchHeaderLen
   512  	}
   513  	b.data = append(b.data, batch.data[batchHeaderLen:]...)
   514  
   515  	b.setCount(b.Count() + batch.Count())
   516  
   517  	if b.db != nil || b.index != nil {
   518  		// Only iterate over the new entries if we need to track memTableSize or in
   519  		// order to update the index.
   520  		for iter := BatchReader(b.data[offset:]); len(iter) > 0; {
   521  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
   522  			kind, key, value, ok, err := iter.Next()
   523  			if !ok {
   524  				if err != nil {
   525  					return err
   526  				}
   527  				break
   528  			}
   529  			switch kind {
   530  			case InternalKeyKindRangeDelete:
   531  				b.countRangeDels++
   532  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   533  				b.countRangeKeys++
   534  			case InternalKeyKindIngestSST:
   535  				panic("pebble: invalid key kind for batch")
   536  			}
   537  			if b.index != nil {
   538  				var err error
   539  				switch kind {
   540  				case InternalKeyKindRangeDelete:
   541  					b.tombstones = nil
   542  					b.tombstonesSeqNum = 0
   543  					if b.rangeDelIndex == nil {
   544  						b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   545  					}
   546  					err = b.rangeDelIndex.Add(uint32(offset))
   547  				case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   548  					b.rangeKeys = nil
   549  					b.rangeKeysSeqNum = 0
   550  					if b.rangeKeyIndex == nil {
   551  						b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   552  					}
   553  					err = b.rangeKeyIndex.Add(uint32(offset))
   554  				default:
   555  					err = b.index.Add(uint32(offset))
   556  				}
   557  				if err != nil {
   558  					return err
   559  				}
   560  			}
   561  			b.memTableSize += memTableEntrySize(len(key), len(value))
   562  		}
   563  	}
   564  	return nil
   565  }
   566  
   567  // Get gets the value for the given key. It returns ErrNotFound if the Batch
   568  // does not contain the key.
   569  //
   570  // The caller should not modify the contents of the returned slice, but it is
   571  // safe to modify the contents of the argument after Get returns. The returned
   572  // slice will remain valid until the returned Closer is closed. On success, the
   573  // caller MUST call closer.Close() or a memory leak will occur.
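         //
         // As an illustrative sketch (k is a placeholder key):
         //
         //	v, closer, err := b.Get(k)
         //	if err == nil {
         //		// Use v before closing; the slice is only valid until Close.
         //		_ = closer.Close()
         //	}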
   574  func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) {
   575  	if b.index == nil {
   576  		return nil, nil, ErrNotIndexed
   577  	}
   578  	return b.db.getInternal(key, b, nil /* snapshot */)
   579  }
   580  
   581  func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) {
   582  	if b.committing {
   583  		panic("pebble: batch already committing")
   584  	}
   585  	if len(b.data) == 0 {
   586  		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   587  	}
   588  	b.count++
   589  	b.memTableSize += memTableEntrySize(keyLen, valueLen)
   590  
   591  	pos := len(b.data)
   592  	b.deferredOp.offset = uint32(pos)
   593  	b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen)
   594  	b.data[pos] = byte(kind)
   595  	pos++
   596  
   597  	{
   598  		// TODO(peter): Manually inlined version binary.PutUvarint(). This is 20%
   599  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   600  		// versions show this to not be a performance win.
   601  		x := uint32(keyLen)
   602  		for x >= 0x80 {
   603  			b.data[pos] = byte(x) | 0x80
   604  			x >>= 7
   605  			pos++
   606  		}
   607  		b.data[pos] = byte(x)
   608  		pos++
   609  	}
   610  
   611  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   612  	pos += keyLen
   613  
   614  	{
   615  		// TODO(peter): Manually inlined version binary.PutUvarint(). This is 20%
   616  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   617  		// versions show this to not be a performance win.
   618  		x := uint32(valueLen)
   619  		for x >= 0x80 {
   620  			b.data[pos] = byte(x) | 0x80
   621  			x >>= 7
   622  			pos++
   623  		}
   624  		b.data[pos] = byte(x)
   625  		pos++
   626  	}
   627  
   628  	b.deferredOp.Value = b.data[pos : pos+valueLen]
   629  	// Shrink data since varints may be shorter than the upper bound.
   630  	b.data = b.data[:pos+valueLen]
   631  }
   632  
   633  func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) {
   634  	if b.committing {
   635  		panic("pebble: batch already committing")
   636  	}
   637  	if len(b.data) == 0 {
   638  		b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen)
   639  	}
   640  	b.count++
   641  	b.memTableSize += memTableEntrySize(keyLen, 0)
   642  
   643  	pos := len(b.data)
   644  	b.deferredOp.offset = uint32(pos)
   645  	b.grow(1 + maxVarintLen32 + keyLen)
   646  	b.data[pos] = byte(kind)
   647  	pos++
   648  
   649  	{
   650  		// TODO(peter): Manually inlined version binary.PutUvarint(). Remove if
   651  		// go1.13 or future versions show this to not be a performance win. See
   652  		// BenchmarkBatchSet.
   653  		x := uint32(keyLen)
   654  		for x >= 0x80 {
   655  			b.data[pos] = byte(x) | 0x80
   656  			x >>= 7
   657  			pos++
   658  		}
   659  		b.data[pos] = byte(x)
   660  		pos++
   661  	}
   662  
   663  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   664  	b.deferredOp.Value = nil
   665  
   666  	// Shrink data since varint may be shorter than the upper bound.
   667  	b.data = b.data[:pos+keyLen]
   668  }
   669  
   670  // AddInternalKey allows the caller to add an internal key of point key or range
   671  // key kinds (but not RangeDelete) to a batch. Passing in an internal key of
   672  // kind RangeDelete will result in a panic. Note that the seqnum in the internal
   673  // key is effectively ignored, even though the Kind is preserved. This is
   674  // because the batch format does not allow for a per-key seqnum to be specified,
   675  // only a batch-wide one.
   676  //
    677  // Note that non-indexed keys (InternalKeyKind{LogData,IngestSST}) are not
   678  // supported with this method as they require specialized logic.
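         //
         // As an illustrative sketch (the seqnum passed to MakeInternalKey is
         // ignored, as described above):
         //
         //	ik := base.MakeInternalKey([]byte("k"), 0, InternalKeyKindSet)
         //	_ = b.AddInternalKey(&ik, []byte("v"), nil)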
   679  func (b *Batch) AddInternalKey(key *base.InternalKey, value []byte, _ *WriteOptions) error {
   680  	keyLen := len(key.UserKey)
   681  	hasValue := false
   682  	switch kind := key.Kind(); kind {
   683  	case InternalKeyKindRangeDelete:
   684  		panic("unexpected range delete in AddInternalKey")
   685  	case InternalKeyKindSingleDelete, InternalKeyKindDelete:
   686  		b.prepareDeferredKeyRecord(keyLen, kind)
   687  		b.deferredOp.index = b.index
   688  	case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   689  		b.prepareDeferredKeyValueRecord(keyLen, len(value), kind)
   690  		hasValue = true
   691  		b.incrementRangeKeysCount()
   692  	default:
   693  		b.prepareDeferredKeyValueRecord(keyLen, len(value), kind)
   694  		hasValue = true
   695  		b.deferredOp.index = b.index
   696  	}
   697  	copy(b.deferredOp.Key, key.UserKey)
   698  	if hasValue {
   699  		copy(b.deferredOp.Value, value)
   700  	}
   701  
   702  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   703  	// in go1.13 will remove the need for this.
   704  	if b.index != nil {
   705  		if err := b.index.Add(b.deferredOp.offset); err != nil {
   706  			return err
   707  		}
   708  	}
   709  	return nil
   710  }
   711  
   712  // Set adds an action to the batch that sets the key to map to the value.
   713  //
   714  // It is safe to modify the contents of the arguments after Set returns.
   715  func (b *Batch) Set(key, value []byte, _ *WriteOptions) error {
   716  	deferredOp := b.SetDeferred(len(key), len(value))
   717  	copy(deferredOp.Key, key)
   718  	copy(deferredOp.Value, value)
   719  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   720  	// in go1.13 will remove the need for this.
   721  	if b.index != nil {
   722  		if err := b.index.Add(deferredOp.offset); err != nil {
   723  			return err
   724  		}
   725  	}
   726  	return nil
   727  }
   728  
   729  // SetDeferred is similar to Set in that it adds a set operation to the batch,
   730  // except it only takes in key/value lengths instead of complete slices,
   731  // letting the caller encode into those objects and then call Finish() on the
   732  // returned object.
   733  func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp {
   734  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet)
   735  	b.deferredOp.index = b.index
   736  	return &b.deferredOp
   737  }
   738  
   739  // Merge adds an action to the batch that merges the value at key with the new
   740  // value. The details of the merge are dependent upon the configured merge
   741  // operator.
   742  //
   743  // It is safe to modify the contents of the arguments after Merge returns.
   744  func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error {
   745  	deferredOp := b.MergeDeferred(len(key), len(value))
   746  	copy(deferredOp.Key, key)
   747  	copy(deferredOp.Value, value)
   748  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   749  	// in go1.13 will remove the need for this.
   750  	if b.index != nil {
   751  		if err := b.index.Add(deferredOp.offset); err != nil {
   752  			return err
   753  		}
   754  	}
   755  	return nil
   756  }
   757  
   758  // MergeDeferred is similar to Merge in that it adds a merge operation to the
   759  // batch, except it only takes in key/value lengths instead of complete slices,
   760  // letting the caller encode into those objects and then call Finish() on the
   761  // returned object.
   762  func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp {
   763  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge)
   764  	b.deferredOp.index = b.index
   765  	return &b.deferredOp
   766  }
   767  
   768  // Delete adds an action to the batch that deletes the entry for key.
   769  //
   770  // It is safe to modify the contents of the arguments after Delete returns.
   771  func (b *Batch) Delete(key []byte, _ *WriteOptions) error {
   772  	deferredOp := b.DeleteDeferred(len(key))
   773  	copy(deferredOp.Key, key)
   774  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   775  	// in go1.13 will remove the need for this.
   776  	if b.index != nil {
   777  		if err := b.index.Add(deferredOp.offset); err != nil {
   778  			return err
   779  		}
   780  	}
   781  	return nil
   782  }
   783  
   784  // DeleteDeferred is similar to Delete in that it adds a delete operation to
   785  // the batch, except it only takes in key/value lengths instead of complete
   786  // slices, letting the caller encode into those objects and then call Finish()
   787  // on the returned object.
   788  func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp {
   789  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete)
   790  	b.deferredOp.index = b.index
   791  	return &b.deferredOp
   792  }
   793  
   794  // DeleteSized behaves identically to Delete, but takes an additional
   795  // argument indicating the size of the value being deleted. DeleteSized
   796  // should be preferred when the caller has the expectation that there exists
   797  // a single internal KV pair for the key (eg, the key has not been
   798  // overwritten recently), and the caller knows the size of its value.
   799  //
   800  // DeleteSized will record the value size within the tombstone and use it to
   801  // inform compaction-picking heuristics which strive to reduce space
   802  // amplification in the LSM. This "calling your shot" mechanic allows the
   803  // storage engine to more accurately estimate and reduce space amplification.
   804  //
   805  // It is safe to modify the contents of the arguments after DeleteSized
   806  // returns.
   807  func (b *Batch) DeleteSized(key []byte, deletedValueSize uint32, _ *WriteOptions) error {
   808  	deferredOp := b.DeleteSizedDeferred(len(key), deletedValueSize)
   809  	copy(b.deferredOp.Key, key)
   810  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Check if in a
   811  	// later Go release this is unnecessary.
   812  	if b.index != nil {
   813  		if err := b.index.Add(deferredOp.offset); err != nil {
   814  			return err
   815  		}
   816  	}
   817  	return nil
   818  }
   819  
   820  // DeleteSizedDeferred is similar to DeleteSized in that it adds a sized delete
   821  // operation to the batch, except it only takes in key length instead of a
   822  // complete key slice, letting the caller encode into the DeferredBatchOp.Key
   823  // slice and then call Finish() on the returned object.
   824  func (b *Batch) DeleteSizedDeferred(keyLen int, deletedValueSize uint32) *DeferredBatchOp {
   825  	if b.minimumFormatMajorVersion < FormatDeleteSizedAndObsolete {
   826  		b.minimumFormatMajorVersion = FormatDeleteSizedAndObsolete
   827  	}
   828  
   829  	// Encode the sum of the key length and the value in the value.
   830  	v := uint64(deletedValueSize) + uint64(keyLen)
   831  
   832  	// Encode `v` as a varint.
   833  	var buf [binary.MaxVarintLen64]byte
   834  	n := 0
   835  	{
   836  		x := v
   837  		for x >= 0x80 {
   838  			buf[n] = byte(x) | 0x80
   839  			x >>= 7
   840  			n++
   841  		}
   842  		buf[n] = byte(x)
   843  		n++
   844  	}
   845  
   846  	// NB: In batch entries and sstable entries, values are stored as
   847  	// varstrings. Here, the value is itself a simple varint. This results in an
   848  	// unnecessary double layer of encoding:
   849  	//     varint(n) varint(deletedValueSize)
   850  	// The first varint will always be 1-byte, since a varint-encoded uint64
   851  	// will never exceed 128 bytes. This unnecessary extra byte and wrapping is
   852  	// preserved to avoid special casing across the database, and in particular
   853  	// in sstable block decoding which is performance sensitive.
   854  	b.prepareDeferredKeyValueRecord(keyLen, n, InternalKeyKindDeleteSized)
   855  	b.deferredOp.index = b.index
   856  	copy(b.deferredOp.Value, buf[:n])
   857  	return &b.deferredOp
   858  }
   859  
   860  // SingleDelete adds an action to the batch that single deletes the entry for key.
   861  // See Writer.SingleDelete for more details on the semantics of SingleDelete.
   862  //
   863  // It is safe to modify the contents of the arguments after SingleDelete returns.
   864  func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error {
   865  	deferredOp := b.SingleDeleteDeferred(len(key))
   866  	copy(deferredOp.Key, key)
   867  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   868  	// in go1.13 will remove the need for this.
   869  	if b.index != nil {
   870  		if err := b.index.Add(deferredOp.offset); err != nil {
   871  			return err
   872  		}
   873  	}
   874  	return nil
   875  }
   876  
   877  // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete
   878  // operation to the batch, except it only takes in key/value lengths instead of
   879  // complete slices, letting the caller encode into those objects and then call
   880  // Finish() on the returned object.
   881  func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp {
   882  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete)
   883  	b.deferredOp.index = b.index
   884  	return &b.deferredOp
   885  }
   886  
   887  // DeleteRange deletes all of the point keys (and values) in the range
   888  // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT
   889  // delete overlapping range keys (eg, keys set via RangeKeySet).
   890  //
   891  // It is safe to modify the contents of the arguments after DeleteRange
   892  // returns.
   893  func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error {
   894  	deferredOp := b.DeleteRangeDeferred(len(start), len(end))
   895  	copy(deferredOp.Key, start)
   896  	copy(deferredOp.Value, end)
   897  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   898  	// in go1.13 will remove the need for this.
   899  	if deferredOp.index != nil {
   900  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   901  			return err
   902  		}
   903  	}
   904  	return nil
   905  }
   906  
   907  // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range
   908  // operation to the batch, except it only takes in key lengths instead of
   909  // complete slices, letting the caller encode into those objects and then call
   910  // Finish() on the returned object. Note that DeferredBatchOp.Key should be
   911  // populated with the start key, and DeferredBatchOp.Value should be populated
   912  // with the end key.
   913  func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp {
   914  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete)
   915  	b.countRangeDels++
   916  	if b.index != nil {
   917  		b.tombstones = nil
   918  		b.tombstonesSeqNum = 0
   919  		// Range deletions are rare, so we lazily allocate the index for them.
   920  		if b.rangeDelIndex == nil {
   921  			b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   922  		}
   923  		b.deferredOp.index = b.rangeDelIndex
   924  	}
   925  	return &b.deferredOp
   926  }
   927  
   928  // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC
   929  // timestamp suffix to value. The suffix is optional. If any portion of the key
   930  // range [start, end) is already set by a range key with the same suffix value,
   931  // RangeKeySet overrides it.
   932  //
   933  // It is safe to modify the contents of the arguments after RangeKeySet returns.
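         //
         // As an illustrative sketch, setting an unsuffixed range key over
         // ["a", "z"):
         //
         //	_ = b.RangeKeySet([]byte("a"), []byte("z"), nil, []byte("v"), nil)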
   934  func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error {
   935  	if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil {
   936  		// RangeKeySet is only supported on prefix keys.
   937  		if b.db.opts.Comparer.Split(start) != len(start) {
   938  			panic("RangeKeySet called with suffixed start key")
   939  		}
   940  		if b.db.opts.Comparer.Split(end) != len(end) {
   941  			panic("RangeKeySet called with suffixed end key")
   942  		}
   943  	}
   944  	suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}}
   945  	internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:])
   946  
   947  	deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen)
   948  	copy(deferredOp.Key, start)
   949  	n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:])
   950  	if n != internalValueLen {
   951  		panic("unexpected internal value length mismatch")
   952  	}
   953  
   954  	// Manually inline DeferredBatchOp.Finish().
   955  	if deferredOp.index != nil {
   956  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   957  			return err
   958  		}
   959  	}
   960  	return nil
   961  }
   962  
   963  func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
   964  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet)
   965  	b.incrementRangeKeysCount()
   966  	return &b.deferredOp
   967  }
   968  
   969  func (b *Batch) incrementRangeKeysCount() {
   970  	b.countRangeKeys++
   971  	if b.minimumFormatMajorVersion < FormatRangeKeys {
   972  		b.minimumFormatMajorVersion = FormatRangeKeys
   973  	}
   974  	if b.index != nil {
   975  		b.rangeKeys = nil
   976  		b.rangeKeysSeqNum = 0
   977  		// Range keys are rare, so we lazily allocate the index for them.
   978  		if b.rangeKeyIndex == nil {
   979  			b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   980  		}
   981  		b.deferredOp.index = b.rangeKeyIndex
   982  	}
   983  }
   984  
   985  // RangeKeyUnset removes a range key mapping the key range [start, end) at the
   986  // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed
   987  // range key. RangeKeyUnset only removes portions of range keys that fall within
   988  // the [start, end) key span, and only range keys with suffixes that exactly
   989  // match the unset suffix.
   990  //
   991  // It is safe to modify the contents of the arguments after RangeKeyUnset
   992  // returns.
   993  func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error {
   994  	if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil {
   995  		// RangeKeyUnset is only supported on prefix keys.
   996  		if b.db.opts.Comparer.Split(start) != len(start) {
   997  			panic("RangeKeyUnset called with suffixed start key")
   998  		}
   999  		if b.db.opts.Comparer.Split(end) != len(end) {
  1000  			panic("RangeKeyUnset called with suffixed end key")
  1001  		}
  1002  	}
  1003  	suffixes := [1][]byte{suffix}
  1004  	internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:])
  1005  
  1006  	deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen)
  1007  	copy(deferredOp.Key, start)
  1008  	n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:])
  1009  	if n != internalValueLen {
  1010  		panic("unexpected internal value length mismatch")
  1011  	}
  1012  
  1013  	// Manually inline DeferredBatchOp.Finish()
  1014  	if deferredOp.index != nil {
  1015  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
  1016  			return err
  1017  		}
  1018  	}
  1019  	return nil
  1020  }
  1021  
  1022  func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
  1023  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset)
  1024  	b.incrementRangeKeysCount()
  1025  	return &b.deferredOp
  1026  }
  1027  
  1028  // RangeKeyDelete deletes all of the range keys in the range [start,end)
  1029  // (inclusive on start, exclusive on end). It does not delete point keys (for
  1030  // that use DeleteRange). RangeKeyDelete removes all range keys within the
  1031  // bounds, including those with or without suffixes.
  1032  //
  1033  // It is safe to modify the contents of the arguments after RangeKeyDelete
  1034  // returns.
  1035  func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error {
  1036  	if invariants.Enabled && b.db != nil && b.db.opts.Comparer.Split != nil {
  1037  		// RangeKeyDelete is only supported on prefix keys.
  1038  		if b.db.opts.Comparer.Split(start) != len(start) {
  1039  			panic("RangeKeyDelete called with suffixed start key")
  1040  		}
  1041  		if b.db.opts.Comparer.Split(end) != len(end) {
  1042  			panic("RangeKeyDelete called with suffixed end key")
  1043  		}
  1044  	}
  1045  	deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end))
  1046  	copy(deferredOp.Key, start)
  1047  	copy(deferredOp.Value, end)
  1048  	// Manually inline DeferredBatchOp.Finish().
  1049  	if deferredOp.index != nil {
  1050  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
  1051  			return err
  1052  		}
  1053  	}
  1054  	return nil
  1055  }
  1056  
  1057  // RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an
  1058  // operation to delete range keys to the batch, except it only takes in key
  1059  // lengths instead of complete slices, letting the caller encode into those
  1060  // objects and then call Finish() on the returned object. Note that
  1061  // DeferredBatchOp.Key should be populated with the start key, and
  1062  // DeferredBatchOp.Value should be populated with the end key.
  1063  func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp {
  1064  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete)
  1065  	b.incrementRangeKeysCount()
  1066  	return &b.deferredOp
  1067  }
  1068  
   1069  // LogData adds the specified data to the batch. The data will be written to the
  1070  // WAL, but not added to memtables or sstables. Log data is never indexed,
  1071  // which makes it useful for testing WAL performance.
  1072  //
  1073  // It is safe to modify the contents of the argument after LogData returns.
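         //
         // As an illustrative sketch:
         //
         //	_ = b.LogData([]byte("marker"), nil)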
  1074  func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
  1075  	origCount, origMemTableSize := b.count, b.memTableSize
  1076  	b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData)
  1077  	copy(b.deferredOp.Key, data)
  1078  	// Since LogData only writes to the WAL and does not affect the memtable, we
  1079  	// restore b.count and b.memTableSize to their origin values. Note that
  1080  	// Batch.count only refers to records that are added to the memtable.
  1081  	b.count, b.memTableSize = origCount, origMemTableSize
  1082  	return nil
  1083  }
  1084  
   1085  // ingestSST adds the FileNum for an sstable to the batch. The data will only be
  1086  // written to the WAL (not added to memtables or sstables).
  1087  func (b *Batch) ingestSST(fileNum base.FileNum) {
  1088  	if b.Empty() {
  1089  		b.ingestedSSTBatch = true
  1090  	} else if !b.ingestedSSTBatch {
  1091  		// Batch contains other key kinds.
  1092  		panic("pebble: invalid call to ingestSST")
  1093  	}
  1094  
  1095  	origMemTableSize := b.memTableSize
  1096  	var buf [binary.MaxVarintLen64]byte
  1097  	length := binary.PutUvarint(buf[:], uint64(fileNum))
  1098  	b.prepareDeferredKeyRecord(length, InternalKeyKindIngestSST)
  1099  	copy(b.deferredOp.Key, buf[:length])
  1100  	// Since IngestSST writes only to the WAL and does not affect the memtable,
  1101  	// we restore b.memTableSize to its original value. Note that Batch.count
  1102  	// is not reset because for the InternalKeyKindIngestSST the count is the
  1103  	// number of sstable paths which have been added to the batch.
  1104  	b.memTableSize = origMemTableSize
  1105  	b.minimumFormatMajorVersion = FormatFlushableIngest
  1106  }
  1107  
  1108  // Empty returns true if the batch is empty, and false otherwise.
  1109  func (b *Batch) Empty() bool {
  1110  	return len(b.data) <= batchHeaderLen
  1111  }
  1112  
  1113  // Len returns the current size of the batch in bytes.
  1114  func (b *Batch) Len() int {
  1115  	if len(b.data) <= batchHeaderLen {
  1116  		return batchHeaderLen
  1117  	}
  1118  	return len(b.data)
  1119  }
  1120  
  1121  // Repr returns the underlying batch representation. It is not safe to modify
  1122  // the contents. Reset() will not change the contents of the returned value,
  1123  // though any other mutation operation may do so.
  1124  func (b *Batch) Repr() []byte {
  1125  	if len(b.data) == 0 {
  1126  		b.init(batchHeaderLen)
  1127  	}
  1128  	binary.LittleEndian.PutUint32(b.countData(), b.Count())
  1129  	return b.data
  1130  }
  1131  
  1132  // SetRepr sets the underlying batch representation. The batch takes ownership
  1133  // of the supplied slice. It is not safe to modify it afterwards until the
  1134  // Batch is no longer in use.
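         //
         // As an illustrative sketch, a batch's wire format can be captured with
         // Repr and later loaded into another batch:
         //
         //	repr := b.Repr()
         //	var b2 Batch
         //	_ = b2.SetRepr(repr)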
  1135  func (b *Batch) SetRepr(data []byte) error {
  1136  	if len(data) < batchHeaderLen {
  1137  		return base.CorruptionErrorf("invalid batch")
  1138  	}
  1139  	b.data = data
  1140  	b.count = uint64(binary.LittleEndian.Uint32(b.countData()))
  1141  	var err error
  1142  	if b.db != nil {
  1143  		// Only track memTableSize for batches that will be committed to the DB.
  1144  		err = b.refreshMemTableSize()
  1145  	}
  1146  	return err
  1147  }
  1148  
  1149  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
  1150  // return false). The iterator can be positioned via a call to SeekGE,
  1151  // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
  1152  //
  1153  // The returned Iterator observes all of the Batch's existing mutations, but no
  1154  // later mutations. Its view can be refreshed via RefreshBatchSnapshot or
  1155  // SetOptions().
  1156  func (b *Batch) NewIter(o *IterOptions) (*Iterator, error) {
  1157  	return b.NewIterWithContext(context.Background(), o)
  1158  }
  1159  
  1160  // NewIterWithContext is like NewIter, and additionally accepts a context for
  1161  // tracing.
  1162  func (b *Batch) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) {
  1163  	if b.index == nil {
  1164  		return nil, ErrNotIndexed
  1165  	}
  1166  	return b.db.newIter(ctx, b, newIterOpts{}, o), nil
  1167  }
  1168  
  1169  // NewBatchOnlyIter constructs an iterator that only reads the contents of the
  1170  // batch, and does not overlay the batch mutations on top of the DB state.
  1171  //
  1172  // The returned Iterator observes all of the Batch's existing mutations, but
  1173  // no later mutations. Its view can be refreshed via RefreshBatchSnapshot or
  1174  // SetOptions().
  1175  func (b *Batch) NewBatchOnlyIter(ctx context.Context, o *IterOptions) (*Iterator, error) {
  1176  	if b.index == nil {
  1177  		return nil, ErrNotIndexed
  1178  	}
  1179  	return b.db.newIter(ctx, b, newIterOpts{batch: batchIterOpts{batchOnly: true}}, o), nil
  1180  }
  1181  
  1182  // newInternalIter creates a new internalIterator that iterates over the
  1183  // contents of the batch.
  1184  func (b *Batch) newInternalIter(o *IterOptions) *batchIter {
  1185  	iter := &batchIter{}
  1186  	b.initInternalIter(o, iter)
  1187  	return iter
  1188  }
  1189  
  1190  func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter) {
  1191  	*iter = batchIter{
  1192  		cmp:   b.cmp,
  1193  		batch: b,
  1194  		iter:  b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()),
  1195  		// NB: We explicitly do not propagate the batch snapshot to the point
  1196  		// key iterator. Filtering point keys within the batch iterator can
  1197  		// cause pathological behavior where a batch iterator advances
  1198  		// significantly farther than necessary filtering many batch keys that
  1199  		// are not visible at the batch sequence number. Instead, the merging
  1200  		// iterator enforces bounds.
  1201  		//
  1202  		// For example, consider an engine that contains the committed keys
  1203  		// 'bar' and 'bax', with no keys between them. Consider a batch
   1204  		// containing 1,000 keys within the range [a,z]. All of the
  1205  		// batch keys were added to the batch after the iterator was
  1206  		// constructed, so they are not visible to the iterator. A call to
  1207  		// SeekGE('bax') would seek the LSM iterators and discover the key
  1208  		// 'bax'. It would also seek the batch iterator, landing on the key
   1209  		// 'baz' but discovering that it's not visible. The batch iterator would
  1210  		// next through the rest of the batch's keys, only to discover there are
  1211  		// no visible keys greater than or equal to 'bax'.
  1212  		//
  1213  		// Filtering these batch points within the merging iterator ensures that
  1214  		// the batch iterator never needs to iterate beyond 'baz', because it
  1215  		// already found a smaller, visible key 'bax'.
  1216  		snapshot: base.InternalKeySeqNumMax,
  1217  	}
  1218  }
  1219  
  1220  func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
  1221  	// Construct an iterator even if rangeDelIndex is nil, because the iterator
  1222  	// may be refreshed later, so the container needs to exist.
  1223  	iter := new(keyspan.Iter)
  1224  	b.initRangeDelIter(o, iter, batchSnapshot)
  1225  	return iter
  1226  }
  1227  
  1228  func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
  1229  	if b.rangeDelIndex == nil {
  1230  		iter.Init(b.cmp, nil)
  1231  		return
  1232  	}
  1233  
  1234  	// Fragment the range tombstones the first time a range deletion iterator is
  1235  	// requested. The cached tombstones are invalidated if another range
  1236  	// deletion tombstone is added to the batch. This cache is only guaranteed
  1237  	// to be correct if we're opening an iterator to read at a batch sequence
  1238  	// number at least as high as tombstonesSeqNum. The cache is guaranteed to
  1239  	// include all tombstones up to tombstonesSeqNum, and if any additional
  1240  	// tombstones were added after that sequence number the cache would've been
  1241  	// cleared.
  1242  	nextSeqNum := b.nextSeqNum()
  1243  	if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot {
  1244  		iter.Init(b.cmp, b.tombstones)
  1245  		return
  1246  	}
  1247  
  1248  	tombstones := make([]keyspan.Span, 0, b.countRangeDels)
  1249  	frag := &keyspan.Fragmenter{
  1250  		Cmp:    b.cmp,
  1251  		Format: b.formatKey,
  1252  		Emit: func(s keyspan.Span) {
  1253  			tombstones = append(tombstones, s)
  1254  		},
  1255  	}
  1256  	it := &batchIter{
  1257  		cmp:      b.cmp,
  1258  		batch:    b,
  1259  		iter:     b.rangeDelIndex.NewIter(nil, nil),
  1260  		snapshot: batchSnapshot,
  1261  	}
  1262  	fragmentRangeDels(frag, it, int(b.countRangeDels))
  1263  	iter.Init(b.cmp, tombstones)
  1264  
  1265  	// If we just read all the tombstones in the batch (eg, batchSnapshot was
  1266  	// set to b.nextSeqNum()), then cache the tombstones so that a subsequent
  1267  	// call to initRangeDelIter may use them without refragmenting.
  1268  	if nextSeqNum == batchSnapshot {
  1269  		b.tombstones = tombstones
  1270  		b.tombstonesSeqNum = nextSeqNum
  1271  	}
  1272  }
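
        // Example (illustrative sketch, not part of this file): range deletions in
        // an indexed batch are fragmented lazily, the first time a range-deletion
        // iterator is needed, and the fragments are cached until another range
        // deletion is added. From a caller's perspective, assuming an open
        // *pebble.DB named db, reads through the batch observe the tombstone:
        //
        //	b := db.NewIndexedBatch()
        //	_ = b.Set([]byte("b"), []byte("v"), nil)
        //	_ = b.DeleteRange([]byte("a"), []byte("c"), nil)
        //	if _, closer, err := b.Get([]byte("b")); err == pebble.ErrNotFound {
        //		// "b" is shadowed by the batch's own range tombstone.
        //	} else if err == nil {
        //		_ = closer.Close()
        //	}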
  1273  
  1274  func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) {
  1275  	// The memory management here is a bit subtle. The keys and values returned
  1276  	// by the iterator are slices in Batch.data. Thus the fragmented tombstones
  1277  	// are slices within Batch.data. If additional entries are added to the
  1278  	// Batch, Batch.data may be reallocated. The references in the fragmented
  1279  	// tombstones will remain valid, pointing into the old Batch.data. GC for
  1280  	// the win.
  1281  
  1282  	// Use a single []keyspan.Key buffer to avoid allocating many
  1283  	// individual []keyspan.Key slices with a single element each.
  1284  	keyBuf := make([]keyspan.Key, 0, count)
  1285  	for key, val := it.First(); key != nil; key, val = it.Next() {
  1286  		s := rangedel.Decode(*key, val.InPlaceValue(), keyBuf)
  1287  		keyBuf = s.Keys[len(s.Keys):]
  1288  
  1289  		// Set a fixed capacity to avoid accidental overwriting.
  1290  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
  1291  		frag.Add(s)
  1292  	}
  1293  	frag.Finish()
  1294  }
  1295  
  1296  func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
  1297  	// Construct an iterator even if rangeKeyIndex is nil, because the iterator
  1298  	// may be refreshed later, so the container needs to exist.
  1299  	iter := new(keyspan.Iter)
  1300  	b.initRangeKeyIter(o, iter, batchSnapshot)
  1301  	return iter
  1302  }
  1303  
  1304  func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
  1305  	if b.rangeKeyIndex == nil {
  1306  		iter.Init(b.cmp, nil)
  1307  		return
  1308  	}
  1309  
  1310  	// Fragment the range keys the first time a range key iterator is requested.
  1311  	// The cached spans are invalidated if another range key is added to the
  1312  	// batch. This cache is only guaranteed to be correct if we're opening an
  1313  	// iterator to read at a batch sequence number at least as high as
  1314  	// rangeKeysSeqNum. The cache is guaranteed to include all range keys up to
  1315  	// rangeKeysSeqNum, and if any additional range keys were added after that
  1316  	// sequence number the cache would've been cleared.
  1317  	nextSeqNum := b.nextSeqNum()
  1318  	if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot {
  1319  		iter.Init(b.cmp, b.rangeKeys)
  1320  		return
  1321  	}
  1322  
  1323  	rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys)
  1324  	frag := &keyspan.Fragmenter{
  1325  		Cmp:    b.cmp,
  1326  		Format: b.formatKey,
  1327  		Emit: func(s keyspan.Span) {
  1328  			rangeKeys = append(rangeKeys, s)
  1329  		},
  1330  	}
  1331  	it := &batchIter{
  1332  		cmp:      b.cmp,
  1333  		batch:    b,
  1334  		iter:     b.rangeKeyIndex.NewIter(nil, nil),
  1335  		snapshot: batchSnapshot,
  1336  	}
  1337  	fragmentRangeKeys(frag, it, int(b.countRangeKeys))
  1338  	iter.Init(b.cmp, rangeKeys)
  1339  
  1340  	// If we just read all the range keys in the batch (eg, batchSnapshot was
  1341  	// set to b.nextSeqNum()), then cache the range keys so that a subsequent
  1342  	// call to initRangeKeyIter may use them without refragmenting.
  1343  	if nextSeqNum == batchSnapshot {
  1344  		b.rangeKeys = rangeKeys
  1345  		b.rangeKeysSeqNum = nextSeqNum
  1346  	}
  1347  }
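
        // Example (illustrative sketch, not part of this file): range keys written
        // to an indexed batch are fragmented and cached the same way. From a
        // caller's perspective, assuming an open *pebble.DB named db and the public
        // IterOptions/Iterator range-key API:
        //
        //	b := db.NewIndexedBatch()
        //	_ = b.RangeKeySet([]byte("a"), []byte("z"), nil, []byte("payload"), nil)
        //	it, err := b.NewIter(&pebble.IterOptions{
        //		KeyTypes: pebble.IterKeyTypePointsAndRanges,
        //	})
        //	if err != nil {
        //		return err
        //	}
        //	for valid := it.First(); valid; valid = it.Next() {
        //		if _, hasRange := it.HasPointAndRange(); hasRange {
        //			start, end := it.RangeBounds()
        //			fmt.Printf("range key [%s, %s)\n", start, end)
        //		}
        //	}
        //	_ = it.Close()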
  1348  
  1349  func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error {
  1350  	// The memory management here is a bit subtle. The keys and values
  1351  	// returned by the iterator are slices in Batch.data. Thus the
  1352  	// fragmented key spans are slices within Batch.data. If additional
  1353  	// entries are added to the Batch, Batch.data may be reallocated. The
  1354  	// references in the fragmented keys will remain valid, pointing into
  1355  	// the old Batch.data. GC for the win.
  1356  
  1357  	// Use a single []keyspan.Key buffer to avoid allocating many
  1358  	// individual []keyspan.Key slices with a single element each.
  1359  	keyBuf := make([]keyspan.Key, 0, count)
  1360  	for ik, val := it.First(); ik != nil; ik, val = it.Next() {
  1361  		s, err := rangekey.Decode(*ik, val.InPlaceValue(), keyBuf)
  1362  		if err != nil {
  1363  			return err
  1364  		}
  1365  		keyBuf = s.Keys[len(s.Keys):]
  1366  
  1367  		// Set a fixed capacity to avoid accidental overwriting.
  1368  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
  1369  		frag.Add(s)
  1370  	}
  1371  	frag.Finish()
  1372  	return nil
  1373  }
  1374  
  1375  // Commit applies the batch to its parent writer.
  1376  func (b *Batch) Commit(o *WriteOptions) error {
  1377  	return b.db.Apply(b, o)
  1378  }
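
        // Example (illustrative sketch, not part of this file): the typical write
        // path from a caller's perspective, assuming an open *pebble.DB named db.
        // Commit is shorthand for db.Apply(b, opts).
        //
        //	b := db.NewBatch()
        //	_ = b.Set([]byte("k"), []byte("v"), nil)
        //	_ = b.Delete([]byte("old"), nil)
        //	if err := b.Commit(pebble.Sync); err != nil {
        //		return err
        //	}
        //	_ = b.Close()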
  1379  
  1380  // Close closes the batch without committing it.
  1381  func (b *Batch) Close() error {
  1382  	b.release()
  1383  	return nil
  1384  }
  1385  
  1386  // Indexed returns true if the batch is indexed (i.e. supports read
  1387  // operations).
  1388  func (b *Batch) Indexed() bool {
  1389  	return b.index != nil
  1390  }
  1391  
  1392  // init ensures that the batch data slice is initialized to meet the
  1393  // minimum required size and allocates space for the batch header.
  1394  func (b *Batch) init(size int) {
  1395  	n := batchInitialSize
  1396  	for n < size {
  1397  		n *= 2
  1398  	}
  1399  	if cap(b.data) < n {
  1400  		b.data = rawalloc.New(batchHeaderLen, n)
  1401  	}
  1402  	b.data = b.data[:batchHeaderLen]
  1403  	clear(b.data) // Zero the sequence number and count in the header
  1404  }
  1405  
  1406  // Reset resets the batch for reuse. The underlying byte slice (that is
  1407  // returned by Repr()) may not be modified. It is only necessary to call this
  1408  // method if a batch is explicitly being reused. Close automatically takes care
  1409  // of releasing resources when appropriate for batches that are internally
  1410  // being reused.
  1411  func (b *Batch) Reset() {
  1412  	// Zero out the struct, retaining only the fields necessary for manual
  1413  	// reuse.
  1414  	b.batchInternal = batchInternal{
  1415  		data:           b.data,
  1416  		cmp:            b.cmp,
  1417  		formatKey:      b.formatKey,
  1418  		abbreviatedKey: b.abbreviatedKey,
  1419  		index:          b.index,
  1420  		db:             b.db,
  1421  	}
  1422  	b.applied.Store(false)
  1423  	if b.data != nil {
  1424  		if cap(b.data) > batchMaxRetainedSize {
  1425  			// If the capacity of the buffer is larger than our maximum
  1426  			// retention size, don't re-use it. Let it be GC-ed instead.
  1427  			// This prevents the memory from an unusually large batch from
  1428  			// being held on to indefinitely.
  1429  			b.data = nil
  1430  		} else {
  1431  			// Otherwise, reset the buffer for re-use.
  1432  			b.data = b.data[:batchHeaderLen]
  1433  			clear(b.data)
  1434  		}
  1435  	}
  1436  	if b.index != nil {
  1437  		b.index.Init(&b.data, b.cmp, b.abbreviatedKey)
  1438  	}
  1439  }
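
        // Example (illustrative sketch, not part of this file): explicitly reusing
        // a batch across commits. Reset retains the underlying buffer (up to
        // batchMaxRetainedSize), so repeated use avoids reallocation. The kvs slice
        // below is hypothetical.
        //
        //	b := db.NewBatch()
        //	defer b.Close()
        //	for _, kv := range kvs {
        //		b.Reset()
        //		_ = b.Set(kv.key, kv.value, nil)
        //		if err := b.Commit(pebble.NoSync); err != nil {
        //			return err
        //		}
        //	}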
  1440  
  1441  // seqNumData returns the 8 byte little-endian sequence number. Zero means that
  1442  // the batch has not yet been applied.
  1443  func (b *Batch) seqNumData() []byte {
  1444  	return b.data[:8]
  1445  }
  1446  
  1447  // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
  1448  // means that the batch is invalid.
  1449  func (b *Batch) countData() []byte {
  1450  	return b.data[8:12]
  1451  }
  1452  
  1453  func (b *Batch) grow(n int) {
  1454  	newSize := len(b.data) + n
  1455  	if uint64(newSize) >= maxBatchSize {
  1456  		panic(ErrBatchTooLarge)
  1457  	}
  1458  	if newSize > cap(b.data) {
  1459  		newCap := 2 * cap(b.data)
  1460  		for newCap < newSize {
  1461  			newCap *= 2
  1462  		}
  1463  		newData := rawalloc.New(len(b.data), newCap)
  1464  		copy(newData, b.data)
  1465  		b.data = newData
  1466  	}
  1467  	b.data = b.data[:newSize]
  1468  }
  1469  
  1470  func (b *Batch) setSeqNum(seqNum uint64) {
  1471  	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
  1472  }
  1473  
  1474  // SeqNum returns the batch sequence number which is applied to the first
  1475  // record in the batch. The sequence number is incremented for each subsequent
  1476  // record. It returns zero if the batch is empty.
  1477  func (b *Batch) SeqNum() uint64 {
  1478  	if len(b.data) == 0 {
  1479  		b.init(batchHeaderLen)
  1480  	}
  1481  	return binary.LittleEndian.Uint64(b.seqNumData())
  1482  }
  1483  
  1484  func (b *Batch) setCount(v uint32) {
  1485  	b.count = uint64(v)
  1486  }
  1487  
  1488  // Count returns the count of memtable-modifying operations in this batch. All
  1489  // operations except LogData increment this count. For IngestSST records, the
  1490  // count only indicates the number of sstables ingested in the record; such a
  1491  // batch isn't applied to the memtable.
  1492  func (b *Batch) Count() uint32 {
  1493  	if b.count > math.MaxUint32 {
  1494  		panic(ErrInvalidBatch)
  1495  	}
  1496  	return uint32(b.count)
  1497  }
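
        // Example (illustrative sketch, not part of this file): Count tracks only
        // memtable-modifying records, so LogData does not affect it.
        //
        //	b := db.NewBatch()
        //	_ = b.Set([]byte("a"), []byte("1"), nil)
        //	_ = b.Delete([]byte("b"), nil)
        //	_ = b.LogData([]byte("audit trail"), nil)
        //	fmt.Println(b.Count()) // prints 2; the LogData record is stored but not counted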
  1498  
  1499  // Reader returns a BatchReader for the current batch contents. If the batch is
  1500  // mutated, the new entries will not be visible to the reader.
  1501  func (b *Batch) Reader() BatchReader {
  1502  	if len(b.data) == 0 {
  1503  		b.init(batchHeaderLen)
  1504  	}
  1505  	return b.data[batchHeaderLen:]
  1506  }
  1507  
  1508  func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) {
  1509  	// TODO(jackson): This will index out of bounds if there's no varint or an
  1510  	// invalid varint (eg, a single 0xff byte). Correcting will add a bit of
  1511  	// overhead. We could avoid that overhead whenever len(data) >=
  1512  	// binary.MaxVarintLen32?
  1513  
  1514  	var v uint32
  1515  	var n int
  1516  	ptr := unsafe.Pointer(&data[0])
  1517  	if a := *((*uint8)(ptr)); a < 128 {
  1518  		v = uint32(a)
  1519  		n = 1
  1520  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1521  		v = uint32(b)<<7 | uint32(a)
  1522  		n = 2
  1523  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1524  		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1525  		n = 3
  1526  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1527  		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1528  		n = 4
  1529  	} else {
  1530  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1531  		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1532  		n = 5
  1533  	}
  1534  
  1535  	data = data[n:]
  1536  	if v > uint32(len(data)) {
  1537  		return nil, nil, false
  1538  	}
  1539  	return data[v:], data[:v], true
  1540  }
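
        // For comparison, a bounds-checked sketch of the same decode using the
        // standard library's varint decoder (illustrative only; the hand-rolled
        // version above avoids this overhead, and decodes 32-bit rather than
        // 64-bit varints):
        //
        //	func batchDecodeStrSafe(data []byte) (odata []byte, s []byte, ok bool) {
        //		v, n := binary.Uvarint(data)
        //		if n <= 0 || v > uint64(len(data)-n) {
        //			return nil, nil, false
        //		}
        //		data = data[n:]
        //		return data[v:], data[:v], true
        //	}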
  1541  
  1542  // SyncWait is to be used in conjunction with DB.ApplyNoSyncWait.
  1543  func (b *Batch) SyncWait() error {
  1544  	now := time.Now()
  1545  	b.fsyncWait.Wait()
  1546  	if b.commitErr != nil {
  1547  		b.db = nil // prevent batch reuse on error
  1548  	}
  1549  	waitDuration := time.Since(now)
  1550  	b.commitStats.CommitWaitDuration += waitDuration
  1551  	b.commitStats.TotalDuration += waitDuration
  1552  	return b.commitErr
  1553  }
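
        // Example (illustrative sketch, not part of this file): pipelining a commit
        // with DB.ApplyNoSyncWait and waiting for WAL durability later. Assumes an
        // open *pebble.DB named db; error handling is abbreviated.
        //
        //	b := db.NewBatch()
        //	_ = b.Set([]byte("k"), []byte("v"), nil)
        //	if err := db.ApplyNoSyncWait(b, pebble.Sync); err != nil {
        //		return err
        //	}
        //	// ... do other work while the fsync is in flight ...
        //	if err := b.SyncWait(); err != nil {
        //		return err
        //	}
        //	stats := b.CommitStats()
        //	_ = stats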
  1554  
  1555  // CommitStats returns stats related to committing the batch. Should be called
  1556  // after Batch.Commit or DB.Apply. If DB.ApplyNoSyncWait is used, it should be
  1557  // called after Batch.SyncWait.
  1558  func (b *Batch) CommitStats() BatchCommitStats {
  1559  	return b.commitStats
  1560  }
  1561  
  1562  // BatchReader iterates over the entries contained in a batch.
  1563  type BatchReader []byte
  1564  
  1565  // ReadBatch constructs a BatchReader from a batch representation.  The
  1566  // header is not validated. ReadBatch returns a new batch reader and the
  1567  // count of entries contained within the batch.
  1568  func ReadBatch(repr []byte) (r BatchReader, count uint32) {
  1569  	if len(repr) <= batchHeaderLen {
  1570  		return nil, count
  1571  	}
  1572  	count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen])
  1573  	return repr[batchHeaderLen:], count
  1574  }
  1575  
  1576  // Next returns the next entry in this batch, if there is one. If the reader has
  1577  // reached the end of the batch, Next returns ok=false and a nil error. If the
  1578  // batch is corrupt and the next entry is illegible, Next returns ok=false and a
  1579  // non-nil error.
  1580  func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool, err error) {
  1581  	if len(*r) == 0 {
  1582  		return 0, nil, nil, false, nil
  1583  	}
  1584  	kind = InternalKeyKind((*r)[0])
  1585  	if kind > InternalKeyKindMax {
  1586  		return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "invalid key kind 0x%x", (*r)[0])
  1587  	}
  1588  	*r, ukey, ok = batchDecodeStr((*r)[1:])
  1589  	if !ok {
  1590  		return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding user key")
  1591  	}
  1592  	switch kind {
  1593  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1594  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  1595  		InternalKeyKindDeleteSized:
  1596  		*r, value, ok = batchDecodeStr(*r)
  1597  		if !ok {
  1598  			return 0, nil, nil, false, errors.Wrapf(ErrInvalidBatch, "decoding %s value", kind)
  1599  		}
  1600  	}
  1601  	return kind, ukey, value, true, nil
  1602  }
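
        // Example (illustrative sketch, not part of this file): walking a batch's
        // raw representation. The same loop works for a reader obtained from
        // Batch.Reader or from ReadBatch; repr below is a hypothetical batch
        // representation (e.g. from Batch.Repr).
        //
        //	r, count := pebble.ReadBatch(repr)
        //	_ = count
        //	for {
        //		kind, ukey, value, ok, err := r.Next()
        //		if err != nil {
        //			return err
        //		}
        //		if !ok {
        //			break
        //		}
        //		fmt.Printf("%s %q = %q\n", kind, ukey, value)
        //	}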
  1603  
  1604  // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the
  1605  // two in sync.
  1606  type batchIter struct {
  1607  	cmp   Compare
  1608  	batch *Batch
  1609  	iter  batchskl.Iterator
  1610  	err   error
  1611  	// snapshot holds a batch "sequence number" at which the batch is being
  1612  	// read. This sequence number has the InternalKeySeqNumBatch bit set, so it
  1613  	// encodes an offset within the batch. Only batch entries earlier than the
  1614  	// offset are visible during iteration.
  1615  	snapshot uint64
  1616  }
  1617  
  1618  // batchIter implements the base.InternalIterator interface.
  1619  var _ base.InternalIterator = (*batchIter)(nil)
  1620  
  1621  func (i *batchIter) String() string {
  1622  	return "batch"
  1623  }
  1624  
  1625  func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
  1626  	// Ignore TrySeekUsingNext if the view of the batch changed.
  1627  	if flags.TrySeekUsingNext() && flags.BatchJustRefreshed() {
  1628  		flags = flags.DisableTrySeekUsingNext()
  1629  	}
  1630  
  1631  	i.err = nil // clear cached iteration error
  1632  	ikey := i.iter.SeekGE(key, flags)
  1633  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1634  		ikey = i.iter.Next()
  1635  	}
  1636  	if ikey == nil {
  1637  		return nil, base.LazyValue{}
  1638  	}
  1639  	return ikey, base.MakeInPlaceValue(i.value())
  1640  }
  1641  
  1642  func (i *batchIter) SeekPrefixGE(
  1643  	prefix, key []byte, flags base.SeekGEFlags,
  1644  ) (*base.InternalKey, base.LazyValue) {
  1645  	i.err = nil // clear cached iteration error
  1646  	return i.SeekGE(key, flags)
  1647  }
  1648  
  1649  func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
  1650  	i.err = nil // clear cached iteration error
  1651  	ikey := i.iter.SeekLT(key)
  1652  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1653  		ikey = i.iter.Prev()
  1654  	}
  1655  	if ikey == nil {
  1656  		return nil, base.LazyValue{}
  1657  	}
  1658  	return ikey, base.MakeInPlaceValue(i.value())
  1659  }
  1660  
  1661  func (i *batchIter) First() (*InternalKey, base.LazyValue) {
  1662  	i.err = nil // clear cached iteration error
  1663  	ikey := i.iter.First()
  1664  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1665  		ikey = i.iter.Next()
  1666  	}
  1667  	if ikey == nil {
  1668  		return nil, base.LazyValue{}
  1669  	}
  1670  	return ikey, base.MakeInPlaceValue(i.value())
  1671  }
  1672  
  1673  func (i *batchIter) Last() (*InternalKey, base.LazyValue) {
  1674  	i.err = nil // clear cached iteration error
  1675  	ikey := i.iter.Last()
  1676  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1677  		ikey = i.iter.Prev()
  1678  	}
  1679  	if ikey == nil {
  1680  		return nil, base.LazyValue{}
  1681  	}
  1682  	return ikey, base.MakeInPlaceValue(i.value())
  1683  }
  1684  
  1685  func (i *batchIter) Next() (*InternalKey, base.LazyValue) {
  1686  	ikey := i.iter.Next()
  1687  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1688  		ikey = i.iter.Next()
  1689  	}
  1690  	if ikey == nil {
  1691  		return nil, base.LazyValue{}
  1692  	}
  1693  	return ikey, base.MakeInPlaceValue(i.value())
  1694  }
  1695  
  1696  func (i *batchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
  1697  	// Because NextPrefix was invoked, `succKey` must be ≥ the key at i's current
  1698  	// position. Seek the arena iterator using TrySeekUsingNext.
  1699  	ikey := i.iter.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext())
  1700  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1701  		ikey = i.iter.Next()
  1702  	}
  1703  	if ikey == nil {
  1704  		return nil, base.LazyValue{}
  1705  	}
  1706  	return ikey, base.MakeInPlaceValue(i.value())
  1707  }
  1708  
  1709  func (i *batchIter) Prev() (*InternalKey, base.LazyValue) {
  1710  	ikey := i.iter.Prev()
  1711  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1712  		ikey = i.iter.Prev()
  1713  	}
  1714  	if ikey == nil {
  1715  		return nil, base.LazyValue{}
  1716  	}
  1717  	return ikey, base.MakeInPlaceValue(i.value())
  1718  }
  1719  
  1720  func (i *batchIter) value() []byte {
  1721  	offset, _, keyEnd := i.iter.KeyInfo()
  1722  	data := i.batch.data
  1723  	if len(data[offset:]) == 0 {
  1724  		i.err = base.CorruptionErrorf("corrupted batch")
  1725  		return nil
  1726  	}
  1727  
  1728  	switch InternalKeyKind(data[offset]) {
  1729  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1730  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  1731  		InternalKeyKindDeleteSized:
  1732  		_, value, ok := batchDecodeStr(data[keyEnd:])
  1733  		if !ok {
  1734  			return nil
  1735  		}
  1736  		return value
  1737  	default:
  1738  		return nil
  1739  	}
  1740  }
  1741  
  1742  func (i *batchIter) Error() error {
  1743  	return i.err
  1744  }
  1745  
  1746  func (i *batchIter) Close() error {
  1747  	_ = i.iter.Close()
  1748  	return i.err
  1749  }
  1750  
  1751  func (i *batchIter) SetBounds(lower, upper []byte) {
  1752  	i.iter.SetBounds(lower, upper)
  1753  }
  1754  
  1755  func (i *batchIter) SetContext(_ context.Context) {}
  1756  
  1757  type flushableBatchEntry struct {
  1758  	// offset is the byte offset of the record within the batch repr.
  1759  	offset uint32
  1760  	// index is the 0-based ordinal number of the record within the batch. Used
  1761  	// to compute the seqnum for the record.
  1762  	index uint32
  1763  	// key{Start,End} are the start and end byte offsets of the key within the
  1764  	// batch repr. Cached to avoid decoding the key length on every
  1765  	// comparison. The value is stored starting at keyEnd.
  1766  	keyStart uint32
  1767  	keyEnd   uint32
  1768  }
  1769  
  1770  // flushableBatch wraps an existing batch and provides the interfaces needed
  1771  // for making the batch flushable (i.e. able to mimic a memtable).
  1772  type flushableBatch struct {
  1773  	cmp       Compare
  1774  	formatKey base.FormatKey
  1775  	data      []byte
  1776  
  1777  	// The base sequence number for the entries in the batch. This is the same
  1778  	// value as Batch.seqNum() and is cached here for performance.
  1779  	seqNum uint64
  1780  
  1781  	// A slice of offsets and indices for the entries in the batch. Used to
  1782  	// implement flushableBatchIter. Unlike the indexing on a normal batch, a
  1783  	// flushable batch is indexed such that batch entry i will be given the
  1784  	// sequence number flushableBatch.seqNum+i.
  1785  	//
  1786  	// Sorted in increasing order of key and decreasing order of offset (since
  1787  	// higher offsets correspond to higher sequence numbers).
  1788  	//
  1789  	// Does not include range deletion entries or range key entries.
  1790  	offsets []flushableBatchEntry
  1791  
  1792  	// Fragmented range deletion tombstones.
  1793  	tombstones []keyspan.Span
  1794  
  1795  	// Fragmented range keys.
  1796  	rangeKeys []keyspan.Span
  1797  }
  1798  
  1799  var _ flushable = (*flushableBatch)(nil)
  1800  
  1801  // newFlushableBatch creates a new batch that implements the flushable
  1802  // interface. This allows the batch to act like a memtable and be placed in the
  1803  // queue of flushable memtables. Note that the flushable batch takes ownership
  1804  // of the batch data.
  1805  func newFlushableBatch(batch *Batch, comparer *Comparer) (*flushableBatch, error) {
  1806  	b := &flushableBatch{
  1807  		data:      batch.data,
  1808  		cmp:       comparer.Compare,
  1809  		formatKey: comparer.FormatKey,
  1810  		offsets:   make([]flushableBatchEntry, 0, batch.Count()),
  1811  	}
  1812  	if b.data != nil {
  1813  		// Note that this sequence number is not correct when this batch has not
  1814  		// been applied, since the sequence number has not been assigned yet. The
  1815  		// correct sequence number will be set later. But it is correct when the
  1816  		// batch is being replayed from the WAL.
  1817  		b.seqNum = batch.SeqNum()
  1818  	}
  1819  	var rangeDelOffsets []flushableBatchEntry
  1820  	var rangeKeyOffsets []flushableBatchEntry
  1821  	if len(b.data) > batchHeaderLen {
  1822  		// Non-empty batch.
  1823  		var index uint32
  1824  		for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
  1825  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
  1826  			kind, key, _, ok, err := iter.Next()
  1827  			if !ok {
  1828  				if err != nil {
  1829  					return nil, err
  1830  				}
  1831  				break
  1832  			}
  1833  			entry := flushableBatchEntry{
  1834  				offset: uint32(offset),
  1835  				index:  uint32(index),
  1836  			}
  1837  			if keySize := uint32(len(key)); keySize == 0 {
  1838  				// Must add 2 to the offset. One byte encodes `kind` and the next
  1839  				// byte encodes `0`, which is the length of the key.
  1840  				entry.keyStart = uint32(offset) + 2
  1841  				entry.keyEnd = entry.keyStart
  1842  			} else {
  1843  				entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
  1844  					uintptr(unsafe.Pointer(&b.data[0])))
  1845  				entry.keyEnd = entry.keyStart + keySize
  1846  			}
  1847  			switch kind {
  1848  			case InternalKeyKindRangeDelete:
  1849  				rangeDelOffsets = append(rangeDelOffsets, entry)
  1850  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1851  				rangeKeyOffsets = append(rangeKeyOffsets, entry)
  1852  			default:
  1853  				b.offsets = append(b.offsets, entry)
  1854  			}
  1855  		}
  1856  	}
  1857  
  1858  	// Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using
  1859  	// *flushableBatch's sort.Interface implementation.
  1860  	pointOffsets := b.offsets
  1861  	sort.Sort(b)
  1862  	b.offsets = rangeDelOffsets
  1863  	sort.Sort(b)
  1864  	b.offsets = rangeKeyOffsets
  1865  	sort.Sort(b)
  1866  	b.offsets = pointOffsets
  1867  
  1868  	if len(rangeDelOffsets) > 0 {
  1869  		frag := &keyspan.Fragmenter{
  1870  			Cmp:    b.cmp,
  1871  			Format: b.formatKey,
  1872  			Emit: func(s keyspan.Span) {
  1873  				b.tombstones = append(b.tombstones, s)
  1874  			},
  1875  		}
  1876  		it := &flushableBatchIter{
  1877  			batch:   b,
  1878  			data:    b.data,
  1879  			offsets: rangeDelOffsets,
  1880  			cmp:     b.cmp,
  1881  			index:   -1,
  1882  		}
  1883  		fragmentRangeDels(frag, it, len(rangeDelOffsets))
  1884  	}
  1885  	if len(rangeKeyOffsets) > 0 {
  1886  		frag := &keyspan.Fragmenter{
  1887  			Cmp:    b.cmp,
  1888  			Format: b.formatKey,
  1889  			Emit: func(s keyspan.Span) {
  1890  				b.rangeKeys = append(b.rangeKeys, s)
  1891  			},
  1892  		}
  1893  		it := &flushableBatchIter{
  1894  			batch:   b,
  1895  			data:    b.data,
  1896  			offsets: rangeKeyOffsets,
  1897  			cmp:     b.cmp,
  1898  			index:   -1,
  1899  		}
  1900  		fragmentRangeKeys(frag, it, len(rangeKeyOffsets))
  1901  	}
  1902  	return b, nil
  1903  }
  1904  
  1905  func (b *flushableBatch) setSeqNum(seqNum uint64) {
  1906  	if b.seqNum != 0 {
  1907  		panic(fmt.Sprintf("pebble: flushableBatch.seqNum already set: %d", b.seqNum))
  1908  	}
  1909  	b.seqNum = seqNum
  1910  	for i := range b.tombstones {
  1911  		for j := range b.tombstones[i].Keys {
  1912  			b.tombstones[i].Keys[j].Trailer = base.MakeTrailer(
  1913  				b.tombstones[i].Keys[j].SeqNum()+seqNum,
  1914  				b.tombstones[i].Keys[j].Kind(),
  1915  			)
  1916  		}
  1917  	}
  1918  	for i := range b.rangeKeys {
  1919  		for j := range b.rangeKeys[i].Keys {
  1920  			b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer(
  1921  				b.rangeKeys[i].Keys[j].SeqNum()+seqNum,
  1922  				b.rangeKeys[i].Keys[j].Kind(),
  1923  			)
  1924  		}
  1925  	}
  1926  }
  1927  
  1928  func (b *flushableBatch) Len() int {
  1929  	return len(b.offsets)
  1930  }
  1931  
  1932  func (b *flushableBatch) Less(i, j int) bool {
  1933  	ei := &b.offsets[i]
  1934  	ej := &b.offsets[j]
  1935  	ki := b.data[ei.keyStart:ei.keyEnd]
  1936  	kj := b.data[ej.keyStart:ej.keyEnd]
  1937  	switch c := b.cmp(ki, kj); {
  1938  	case c < 0:
  1939  		return true
  1940  	case c > 0:
  1941  		return false
  1942  	default:
  1943  		return ei.offset > ej.offset
  1944  	}
  1945  }
  1946  
  1947  func (b *flushableBatch) Swap(i, j int) {
  1948  	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
  1949  }
  1950  
  1951  // newIter is part of the flushable interface.
  1952  func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
  1953  	return &flushableBatchIter{
  1954  		batch:   b,
  1955  		data:    b.data,
  1956  		offsets: b.offsets,
  1957  		cmp:     b.cmp,
  1958  		index:   -1,
  1959  		lower:   o.GetLowerBound(),
  1960  		upper:   o.GetUpperBound(),
  1961  	}
  1962  }
  1963  
  1964  // newFlushIter is part of the flushable interface.
  1965  func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
  1966  	return &flushFlushableBatchIter{
  1967  		flushableBatchIter: flushableBatchIter{
  1968  			batch:   b,
  1969  			data:    b.data,
  1970  			offsets: b.offsets,
  1971  			cmp:     b.cmp,
  1972  			index:   -1,
  1973  		},
  1974  		bytesIterated: bytesFlushed,
  1975  	}
  1976  }
  1977  
  1978  // newRangeDelIter is part of the flushable interface.
  1979  func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator {
  1980  	if len(b.tombstones) == 0 {
  1981  		return nil
  1982  	}
  1983  	return keyspan.NewIter(b.cmp, b.tombstones)
  1984  }
  1985  
  1986  // newRangeKeyIter is part of the flushable interface.
  1987  func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator {
  1988  	if len(b.rangeKeys) == 0 {
  1989  		return nil
  1990  	}
  1991  	return keyspan.NewIter(b.cmp, b.rangeKeys)
  1992  }
  1993  
  1994  // containsRangeKeys is part of the flushable interface.
  1995  func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 }
  1996  
  1997  // inuseBytes is part of the flushable interface.
  1998  func (b *flushableBatch) inuseBytes() uint64 {
  1999  	return uint64(len(b.data) - batchHeaderLen)
  2000  }
  2001  
  2002  // totalBytes is part of the flushable interface.
  2003  func (b *flushableBatch) totalBytes() uint64 {
  2004  	return uint64(cap(b.data))
  2005  }
  2006  
  2007  // readyForFlush is part of the flushable interface.
  2008  func (b *flushableBatch) readyForFlush() bool {
  2009  	// A flushable batch is always ready for flush; it must be flushed together
  2010  	// with the previous memtable.
  2011  	return true
  2012  }
  2013  
  2014  // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
  2015  // two in sync.
  2016  type flushableBatchIter struct {
  2017  	// Members to be initialized by creator.
  2018  	batch *flushableBatch
  2019  	// The bytes backing the batch. Always the same as batch.data?
  2020  	data []byte
  2021  	// The sorted entries. This is not always equal to batch.offsets.
  2022  	offsets []flushableBatchEntry
  2023  	cmp     Compare
  2024  	// Must be initialized to -1. It is the index into offsets that represents
  2025  	// the current iterator position.
  2026  	index int
  2027  
  2028  	// For internal use by the implementation.
  2029  	key InternalKey
  2030  	err error
  2031  
  2032  	// Optionally initialize to bounds of iteration, if any.
  2033  	lower []byte
  2034  	upper []byte
  2035  }
  2036  
  2037  // flushableBatchIter implements the base.InternalIterator interface.
  2038  var _ base.InternalIterator = (*flushableBatchIter)(nil)
  2039  
  2040  func (i *flushableBatchIter) String() string {
  2041  	return "flushable-batch"
  2042  }
  2043  
  2044  // SeekGE implements internalIterator.SeekGE, as documented in the pebble
  2045  // package. Ignore flags.TrySeekUsingNext() since we don't expect this
  2046  // optimization to provide much benefit here at the moment.
  2047  func (i *flushableBatchIter) SeekGE(
  2048  	key []byte, flags base.SeekGEFlags,
  2049  ) (*InternalKey, base.LazyValue) {
  2050  	i.err = nil // clear cached iteration error
  2051  	ikey := base.MakeSearchKey(key)
  2052  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  2053  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  2054  	})
  2055  	if i.index >= len(i.offsets) {
  2056  		return nil, base.LazyValue{}
  2057  	}
  2058  	i.key = i.getKey(i.index)
  2059  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2060  		i.index = len(i.offsets)
  2061  		return nil, base.LazyValue{}
  2062  	}
  2063  	return &i.key, i.value()
  2064  }
  2065  
  2066  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
  2067  // pebble package.
  2068  func (i *flushableBatchIter) SeekPrefixGE(
  2069  	prefix, key []byte, flags base.SeekGEFlags,
  2070  ) (*base.InternalKey, base.LazyValue) {
  2071  	return i.SeekGE(key, flags)
  2072  }
  2073  
  2074  // SeekLT implements internalIterator.SeekLT, as documented in the pebble
  2075  // package.
  2076  func (i *flushableBatchIter) SeekLT(
  2077  	key []byte, flags base.SeekLTFlags,
  2078  ) (*InternalKey, base.LazyValue) {
  2079  	i.err = nil // clear cached iteration error
  2080  	ikey := base.MakeSearchKey(key)
  2081  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  2082  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  2083  	})
  2084  	i.index--
  2085  	if i.index < 0 {
  2086  		return nil, base.LazyValue{}
  2087  	}
  2088  	i.key = i.getKey(i.index)
  2089  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2090  		i.index = -1
  2091  		return nil, base.LazyValue{}
  2092  	}
  2093  	return &i.key, i.value()
  2094  }
  2095  
  2096  // First implements internalIterator.First, as documented in the pebble
  2097  // package.
  2098  func (i *flushableBatchIter) First() (*InternalKey, base.LazyValue) {
  2099  	i.err = nil // clear cached iteration error
  2100  	if len(i.offsets) == 0 {
  2101  		return nil, base.LazyValue{}
  2102  	}
  2103  	i.index = 0
  2104  	i.key = i.getKey(i.index)
  2105  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2106  		i.index = len(i.offsets)
  2107  		return nil, base.LazyValue{}
  2108  	}
  2109  	return &i.key, i.value()
  2110  }
  2111  
  2112  // Last implements internalIterator.Last, as documented in the pebble
  2113  // package.
  2114  func (i *flushableBatchIter) Last() (*InternalKey, base.LazyValue) {
  2115  	i.err = nil // clear cached iteration error
  2116  	if len(i.offsets) == 0 {
  2117  		return nil, base.LazyValue{}
  2118  	}
  2119  	i.index = len(i.offsets) - 1
  2120  	i.key = i.getKey(i.index)
  2121  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2122  		i.index = -1
  2123  		return nil, base.LazyValue{}
  2124  	}
  2125  	return &i.key, i.value()
  2126  }
  2127  
  2128  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  2129  // flushableBatchIter.Next for performance. Keep the two in sync.
  2130  func (i *flushableBatchIter) Next() (*InternalKey, base.LazyValue) {
  2131  	if i.index == len(i.offsets) {
  2132  		return nil, base.LazyValue{}
  2133  	}
  2134  	i.index++
  2135  	if i.index == len(i.offsets) {
  2136  		return nil, base.LazyValue{}
  2137  	}
  2138  	i.key = i.getKey(i.index)
  2139  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  2140  		i.index = len(i.offsets)
  2141  		return nil, base.LazyValue{}
  2142  	}
  2143  	return &i.key, i.value()
  2144  }
  2145  
  2146  func (i *flushableBatchIter) Prev() (*InternalKey, base.LazyValue) {
  2147  	if i.index < 0 {
  2148  		return nil, base.LazyValue{}
  2149  	}
  2150  	i.index--
  2151  	if i.index < 0 {
  2152  		return nil, base.LazyValue{}
  2153  	}
  2154  	i.key = i.getKey(i.index)
  2155  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  2156  		i.index = -1
  2157  		return nil, base.LazyValue{}
  2158  	}
  2159  	return &i.key, i.value()
  2160  }
  2161  
  2162  // Note: flushFlushableBatchIter.NextPrefix mirrors the implementation of
  2163  // flushableBatchIter.NextPrefix for performance. Keep the two in sync.
  2164  func (i *flushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
  2165  	return i.SeekGE(succKey, base.SeekGEFlagsNone.EnableTrySeekUsingNext())
  2166  }
  2167  
  2168  func (i *flushableBatchIter) getKey(index int) InternalKey {
  2169  	e := &i.offsets[index]
  2170  	kind := InternalKeyKind(i.data[e.offset])
  2171  	key := i.data[e.keyStart:e.keyEnd]
  2172  	return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind)
  2173  }
  2174  
  2175  func (i *flushableBatchIter) value() base.LazyValue {
  2176  	p := i.data[i.offsets[i.index].offset:]
  2177  	if len(p) == 0 {
  2178  		i.err = base.CorruptionErrorf("corrupted batch")
  2179  		return base.LazyValue{}
  2180  	}
  2181  	kind := InternalKeyKind(p[0])
  2182  	if kind > InternalKeyKindMax {
  2183  		i.err = base.CorruptionErrorf("corrupted batch")
  2184  		return base.LazyValue{}
  2185  	}
  2186  	var value []byte
  2187  	var ok bool
  2188  	switch kind {
  2189  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  2190  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete,
  2191  		InternalKeyKindDeleteSized:
  2192  		keyEnd := i.offsets[i.index].keyEnd
  2193  		_, value, ok = batchDecodeStr(i.data[keyEnd:])
  2194  		if !ok {
  2195  			i.err = base.CorruptionErrorf("corrupted batch")
  2196  			return base.LazyValue{}
  2197  		}
  2198  	}
  2199  	return base.MakeInPlaceValue(value)
  2200  }
  2201  
  2202  func (i *flushableBatchIter) Valid() bool {
  2203  	return i.index >= 0 && i.index < len(i.offsets)
  2204  }
  2205  
  2206  func (i *flushableBatchIter) Error() error {
  2207  	return i.err
  2208  }
  2209  
  2210  func (i *flushableBatchIter) Close() error {
  2211  	return i.err
  2212  }
  2213  
  2214  func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
  2215  	i.lower = lower
  2216  	i.upper = upper
  2217  }
  2218  
  2219  func (i *flushableBatchIter) SetContext(_ context.Context) {}
  2220  
  2221  // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
  2222  // of the number of bytes iterated.
  2223  type flushFlushableBatchIter struct {
  2224  	flushableBatchIter
  2225  	bytesIterated *uint64
  2226  }
  2227  
  2228  // flushFlushableBatchIter implements the base.InternalIterator interface.
  2229  var _ base.InternalIterator = (*flushFlushableBatchIter)(nil)
  2230  
  2231  func (i *flushFlushableBatchIter) String() string {
  2232  	return "flushable-batch"
  2233  }
  2234  
  2235  func (i *flushFlushableBatchIter) SeekGE(
  2236  	key []byte, flags base.SeekGEFlags,
  2237  ) (*InternalKey, base.LazyValue) {
  2238  	panic("pebble: SeekGE unimplemented")
  2239  }
  2240  
  2241  func (i *flushFlushableBatchIter) SeekPrefixGE(
  2242  	prefix, key []byte, flags base.SeekGEFlags,
  2243  ) (*base.InternalKey, base.LazyValue) {
  2244  	panic("pebble: SeekPrefixGE unimplemented")
  2245  }
  2246  
  2247  func (i *flushFlushableBatchIter) SeekLT(
  2248  	key []byte, flags base.SeekLTFlags,
  2249  ) (*InternalKey, base.LazyValue) {
  2250  	panic("pebble: SeekLT unimplemented")
  2251  }
  2252  
  2253  func (i *flushFlushableBatchIter) First() (*InternalKey, base.LazyValue) {
  2254  	i.err = nil // clear cached iteration error
  2255  	key, val := i.flushableBatchIter.First()
  2256  	if key == nil {
  2257  		return nil, base.LazyValue{}
  2258  	}
  2259  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  2260  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  2261  	return key, val
  2262  }
  2263  
  2264  func (i *flushFlushableBatchIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
  2265  	panic("pebble: NextPrefix unimplemented")
  2266  }
  2267  
  2268  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  2269  // flushableBatchIter.Next due to performance. Keep the two in sync.
  2270  func (i *flushFlushableBatchIter) Next() (*InternalKey, base.LazyValue) {
  2271  	if i.index == len(i.offsets) {
  2272  		return nil, base.LazyValue{}
  2273  	}
  2274  	i.index++
  2275  	if i.index == len(i.offsets) {
  2276  		return nil, base.LazyValue{}
  2277  	}
  2278  	i.key = i.getKey(i.index)
  2279  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  2280  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  2281  	return &i.key, i.value()
  2282  }
  2283  
  2284  func (i flushFlushableBatchIter) Prev() (*InternalKey, base.LazyValue) {
  2285  	panic("pebble: Prev unimplemented")
  2286  }
  2287  
  2288  func (i flushFlushableBatchIter) valueSize() uint64 {
  2289  	p := i.data[i.offsets[i.index].offset:]
  2290  	if len(p) == 0 {
  2291  		i.err = base.CorruptionErrorf("corrupted batch")
  2292  		return 0
  2293  	}
  2294  	kind := InternalKeyKind(p[0])
  2295  	if kind > InternalKeyKindMax {
  2296  		i.err = base.CorruptionErrorf("corrupted batch")
  2297  		return 0
  2298  	}
  2299  	var length uint64
  2300  	switch kind {
  2301  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
  2302  		keyEnd := i.offsets[i.index].keyEnd
  2303  		v, n := binary.Uvarint(i.data[keyEnd:])
  2304  		if n <= 0 {
  2305  			i.err = base.CorruptionErrorf("corrupted batch")
  2306  			return 0
  2307  		}
  2308  		length = v + uint64(n)
  2309  	}
  2310  	return length
  2311  }
  2312  
  2313  // batchSort returns iterators for the sorted contents of the batch. It is
  2314  // intended for testing use only. The batch.Sort dance is done to prevent
  2315  // exposing this method in the public pebble interface.
  2316  func batchSort(
  2317  	i interface{},
  2318  ) (
  2319  	points internalIterator,
  2320  	rangeDels keyspan.FragmentIterator,
  2321  	rangeKeys keyspan.FragmentIterator,
  2322  ) {
  2323  	b := i.(*Batch)
  2324  	if b.Indexed() {
  2325  		pointIter := b.newInternalIter(nil)
  2326  		rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64)
  2327  		rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64)
  2328  		return pointIter, rangeDelIter, rangeKeyIter
  2329  	}
  2330  	f, err := newFlushableBatch(b, b.db.opts.Comparer)
  2331  	if err != nil {
  2332  		panic(err)
  2333  	}
  2334  	return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil)
  2335  }
  2336  
  2337  func init() {
  2338  	private.BatchSort = batchSort
  2339  }