github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/batch.go

github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/batch.go (about)

     1  // Copyright 2012 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"arena"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"io"
    12  	"math"
    13  	"sort"
    14  	"sync"
    15  	"sync/atomic"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/errors"
    19  	"github.com/zuoyebang/bitalostable/internal/base"
    20  	"github.com/zuoyebang/bitalostable/internal/batchskl"
    21  	"github.com/zuoyebang/bitalostable/internal/humanize"
    22  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    23  	"github.com/zuoyebang/bitalostable/internal/manual"
    24  	"github.com/zuoyebang/bitalostable/internal/private"
    25  	"github.com/zuoyebang/bitalostable/internal/rangedel"
    26  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    27  	"github.com/zuoyebang/bitalostable/internal/rawalloc"
    28  )
    29  
    30  const (
    31  	batchCountOffset     = 8
    32  	batchHeaderLen       = 12
    33  	batchInitialSize     = 1 << 10 // 1 KB
    34  	batchMaxRetainedSize = 1 << 20 // 1 MB
    35  	invalidBatchCount    = 1<<32 - 1
    36  	maxVarintLen32       = 5
    37  )
    38  
    39  // ErrNotIndexed means that a read operation on a batch failed because the
    40  // batch is not indexed and thus doesn't support reads.
    41  var ErrNotIndexed = errors.New("bitalostable: batch not indexed")
    42  
    43  // ErrInvalidBatch indicates that a batch is invalid or otherwise corrupted.
    44  var ErrInvalidBatch = errors.New("bitalostable: invalid batch")
    45  
    46  // ErrBatchTooLarge indicates that a batch is invalid or otherwise corrupted.
    47  var ErrBatchTooLarge = errors.Newf("bitalostable: batch too large: >= %s", humanize.Uint64(maxBatchSize))
    48  
    49  // DeferredBatchOp represents a batch operation (eg. set, merge, delete) that is
    50  // being inserted into the batch. Indexing is not performed on the specified key
    51  // until Finish is called, hence the name deferred. This struct lets the caller
    52  // copy or encode keys/values directly into the batch representation instead of
    53  // copying into an intermediary buffer then having bitalostable.Batch copy off of it.
    54  type DeferredBatchOp struct {
    55  	index *batchskl.Skiplist
    56  
    57  	// Key and Value point to parts of the binary batch representation where
    58  	// keys and values should be encoded/copied into. len(Key) and len(Value)
    59  	// bytes must be copied into these slices respectively before calling
    60  	// Finish(). Changing where these slices point to is not allowed.
    61  	Key, Value []byte
    62  	offset     uint32
    63  }
    64  
    65  // Finish completes the addition of this batch operation, and adds it to the
    66  // index if necessary. Must be called once (and exactly once) keys/values
    67  // have been filled into Key and Value. Not calling Finish or not
    68  // copying/encoding keys will result in an incomplete index, and calling Finish
    69  // twice may result in a panic.
    70  func (d DeferredBatchOp) Finish() error {
    71  	if d.index != nil {
    72  		if err := d.index.Add(d.offset); err != nil {
    73  			return err
    74  		}
    75  	}
    76  	return nil
    77  }
    78  
    79  // A Batch is a sequence of Sets, Merges, Deletes, DeleteRanges, RangeKeySets,
    80  // RangeKeyUnsets, and/or RangeKeyDeletes that are applied atomically. Batch
    81  // implements the Reader interface, but only an indexed batch supports reading
    82  // (without error) via Get or NewIter. A non-indexed batch will return
    83  // ErrNotIndexed when read from. A batch is not safe for concurrent use, and
    84  // consumers should use a batch per goroutine or provide their own
    85  // synchronization.
    86  //
    87  // # Indexing
    88  //
    89  // Batches can be optionally indexed (see DB.NewIndexedBatch). An indexed batch
    90  // allows iteration via an Iterator (see Batch.NewIter). The iterator provides
    91  // a merged view of the operations in the batch and the underlying
    92  // database. This is implemented by treating the batch as an additional layer
    93  // in the LSM where every entry in the batch is considered newer than any entry
    94  // in the underlying database (batch entries have the InternalKeySeqNumBatch
    95  // bit set). By treating the batch as an additional layer in the LSM, iteration
    96  // supports all batch operations (i.e. Set, Merge, Delete, DeleteRange,
    97  // RangeKeySet, RangeKeyUnset, RangeKeyDelete) with minimal effort.
    98  //
    99  // The same key can be operated on multiple times in a batch, though only the
   100  // latest operation will be visible. For example, Put("a", "b"), Delete("a")
   101  // will cause the key "a" to not be visible in the batch. Put("a", "b"),
   102  // Put("a", "c") will cause a read of "a" to return the value "c".
   103  //
   104  // The batch index is implemented via an skiplist (internal/batchskl). While
   105  // the skiplist implementation is very fast, inserting into an indexed batch is
   106  // significantly slower than inserting into a non-indexed batch. Only use an
   107  // indexed batch if you require reading from it.
   108  //
   109  // # Atomic commit
   110  //
   111  // The operations in a batch are persisted by calling Batch.Commit which is
   112  // equivalent to calling DB.Apply(batch). A batch is committed atomically by
   113  // writing the internal batch representation to the WAL, adding all of the
   114  // batch operations to the memtable associated with the WAL, and then
   115  // incrementing the visible sequence number so that subsequent reads can see
   116  // the effects of the batch operations. If WriteOptions.Sync is true, a call to
   117  // Batch.Commit will guarantee that the batch is persisted to disk before
   118  // returning. See commitPipeline for more on the implementation details.
   119  //
   120  // # Large batches
   121  //
   122  // The size of a batch is limited only by available memory (be aware that
   123  // indexed batches require considerably additional memory for the skiplist
   124  // structure). A given WAL file has a single memtable associated with it (this
   125  // restriction could be removed, but doing so is onerous and complex). And a
   126  // memtable has a fixed size due to the underlying fixed size arena. Note that
   127  // this differs from RocksDB where a memtable can grow arbitrarily large using
   128  // a list of arena chunks. In RocksDB this is accomplished by storing pointers
   129  // in the arena memory, but that isn't possible in Go.
   130  //
   131  // During Batch.Commit, a batch which is larger than a threshold (>
   132  // MemTableSize/2) is wrapped in a flushableBatch and inserted into the queue
   133  // of memtables. A flushableBatch forces WAL to be rotated, but that happens
   134  // anyways when the memtable becomes full so this does not cause significant
   135  // WAL churn. Because the flushableBatch is readable as another layer in the
   136  // LSM, Batch.Commit returns as soon as the flushableBatch has been added to
   137  // the queue of memtables.
   138  //
   139  // Internally, a flushableBatch provides Iterator support by sorting the batch
   140  // contents (the batch is sorted once, when it is added to the memtable
   141  // queue). Sorting the batch contents and insertion of the contents into a
   142  // memtable have the same big-O time, but the constant factor dominates
   143  // here. Sorting is significantly faster and uses significantly less memory.
   144  //
   145  // # Internal representation
   146  //
   147  // The internal batch representation is a contiguous byte buffer with a fixed
   148  // 12-byte header, followed by a series of records.
   149  //
   150  //	+-------------+------------+--- ... ---+
   151  //	| SeqNum (8B) | Count (4B) |  Entries  |
   152  //	+-------------+------------+--- ... ---+
   153  //
   154  // Each record has a 1-byte kind tag prefix, followed by 1 or 2 length prefixed
   155  // strings (varstring):
   156  //
   157  //	+-----------+-----------------+-------------------+
   158  //	| Kind (1B) | Key (varstring) | Value (varstring) |
   159  //	+-----------+-----------------+-------------------+
   160  //
   161  // A varstring is a varint32 followed by N bytes of data. The Kind tags are
   162  // exactly those specified by InternalKeyKind. The following table shows the
   163  // format for records of each kind:
   164  //
   165  //	InternalKeyKindDelete         varstring
   166  //	InternalKeyKindLogData        varstring
   167  //	InternalKeyKindSet            varstring varstring
   168  //	InternalKeyKindMerge          varstring varstring
   169  //	InternalKeyKindRangeDelete    varstring varstring
   170  //	InternalKeyKindRangeKeySet    varstring varstring
   171  //	InternalKeyKindRangeKeyUnset  varstring varstring
   172  //	InternalKeyKindRangeKeyDelete varstring varstring
   173  //
   174  // The intuitive understanding here are that the arguments to Delete, Set,
   175  // Merge, DeleteRange and RangeKeyDelete are encoded into the batch. The
   176  // RangeKeySet and RangeKeyUnset operations are slightly more complicated,
   177  // encoding their end key, suffix and value [in the case of RangeKeySet] within
   178  // the Value varstring. For more information on the value encoding for
   179  // RangeKeySet and RangeKeyUnset, see the internal/rangekey package.
   180  //
   181  // The internal batch representation is the on disk format for a batch in the
   182  // WAL, and thus stable. New record kinds may be added, but the existing ones
   183  // will not be modified.
   184  type Batch struct {
   185  	// Data is the wire format of a batch's log entry:
   186  	//   - 8 bytes for a sequence number of the first batch element,
   187  	//     or zeroes if the batch has not yet been applied,
   188  	//   - 4 bytes for the count: the number of elements in the batch,
   189  	//     or "\xff\xff\xff\xff" if the batch is invalid,
   190  	//   - count elements, being:
   191  	//     - one byte for the kind
   192  	//     - the varint-string user key,
   193  	//     - the varint-string value (if kind != delete).
   194  	// The sequence number and count are stored in little-endian order.
   195  	//
   196  	// The data field can be (but is not guaranteed to be) nil for new
   197  	// batches. Large batches will set the data field to nil when committed as
   198  	// the data has been moved to a flushableBatch and inserted into the queue of
   199  	// memtables.
   200  	data    []byte
   201  	alloc   []byte
   202  	isFlush bool
   203  
   204  	cmp            Compare
   205  	formatKey      base.FormatKey
   206  	abbreviatedKey AbbreviatedKey
   207  
   208  	// An upper bound on required space to add this batch to a memtable.
   209  	// Note that although batches are limited to 4 GiB in size, that limit
   210  	// applies to len(data), not the memtable size. The upper bound on the
   211  	// size of a memtable node is larger than the overhead of the batch's log
   212  	// encoding, so memTableSize is larger than len(data) and may overflow a
   213  	// uint32.
   214  	memTableSize uint64
   215  
   216  	// The db to which the batch will be committed. Do not change this field
   217  	// after the batch has been created as it might invalidate internal state.
   218  	db *DB
   219  
   220  	// The count of records in the batch. This count will be stored in the batch
   221  	// data whenever Repr() is called.
   222  	count uint64
   223  
   224  	// The count of range deletions in the batch. Updated every time a range
   225  	// deletion is added.
   226  	countRangeDels uint64
   227  
   228  	// The count of range key sets, unsets and deletes in the batch. Updated
   229  	// every time a RANGEKEYSET, RANGEKEYUNSET or RANGEKEYDEL key is added.
   230  	countRangeKeys uint64
   231  
   232  	// A deferredOp struct, stored in the Batch so that a pointer can be returned
   233  	// from the *Deferred() methods rather than a value.
   234  	deferredOp DeferredBatchOp
   235  
   236  	// An optional skiplist keyed by offset into data of the entry.
   237  	index         *batchskl.Skiplist
   238  	rangeDelIndex *batchskl.Skiplist
   239  	rangeKeyIndex *batchskl.Skiplist
   240  
   241  	// Fragmented range deletion tombstones. Cached the first time a range
   242  	// deletion iterator is requested. The cache is invalidated whenever a new
   243  	// range deletion is added to the batch. This cache can only be used when
   244  	// opening an iterator to read at a batch sequence number >=
   245  	// tombstonesSeqNum. This is the case for all new iterators created over a
   246  	// batch but it's not the case for all cloned iterators.
   247  	tombstones       []keyspan.Span
   248  	tombstonesSeqNum uint64
   249  
   250  	// Fragmented range key spans. Cached the first time a range key iterator is
   251  	// requested. The cache is invalidated whenever a new range key
   252  	// (RangeKey{Set,Unset,Del}) is added to the batch. This cache can only be
   253  	// used when opening an iterator to read at a batch sequence number >=
   254  	// tombstonesSeqNum. This is the case for all new iterators created over a
   255  	// batch but it's not the case for all cloned iterators.
   256  	rangeKeys       []keyspan.Span
   257  	rangeKeysSeqNum uint64
   258  
   259  	// The flushableBatch wrapper if the batch is too large to fit in the
   260  	// memtable.
   261  	flushable *flushableBatch
   262  
   263  	commit    sync.WaitGroup
   264  	commitErr error
   265  	applied   uint32 // updated atomically
   266  }
   267  
   268  var _ Reader = (*Batch)(nil)
   269  var _ Writer = (*Batch)(nil)
   270  
   271  var batchPool = sync.Pool{
   272  	New: func() interface{} {
   273  		return &Batch{}
   274  	},
   275  }
   276  
   277  type indexedBatch struct {
   278  	batch Batch
   279  	index batchskl.Skiplist
   280  }
   281  
   282  var indexedBatchPool = sync.Pool{
   283  	New: func() interface{} {
   284  		return &indexedBatch{}
   285  	},
   286  }
   287  
   288  func newBatch(db *DB) *Batch {
   289  	b := batchPool.Get().(*Batch)
   290  	b.db = db
   291  	b.isFlush = false
   292  	return b
   293  }
   294  
   295  func newFlushBatch(db *DB, n int) *Batch {
   296  	b := batchPool.Get().(*Batch)
   297  	b.db = db
   298  	b.isFlush = true
   299  	b.alloc = manual.New(n)
   300  	b.data = b.alloc[:batchHeaderLen]
   301  	b.setCount(0)
   302  	b.setSeqNum(0)
   303  	return b
   304  }
   305  
   306  func newIndexedBatch(db *DB, comparer *Comparer) *Batch {
   307  	i := indexedBatchPool.Get().(*indexedBatch)
   308  	i.batch.cmp = comparer.Compare
   309  	i.batch.formatKey = comparer.FormatKey
   310  	i.batch.abbreviatedKey = comparer.AbbreviatedKey
   311  	i.batch.db = db
   312  	i.batch.index = &i.index
   313  	i.batch.index.Init(&i.batch.data, i.batch.cmp, i.batch.abbreviatedKey)
   314  	i.batch.isFlush = false
   315  	return &i.batch
   316  }
   317  
   318  // nextSeqNum returns the batch "sequence number" that will be given to the next
   319  // key written to the batch. During iteration keys within an indexed batch are
   320  // given a sequence number consisting of their offset within the batch combined
   321  // with the base.InternalKeySeqNumBatch bit. These sequence numbers are only
   322  // used during iteration, and the keys are assigned ordinary sequence numbers
   323  // when the batch is committed.
   324  func (b *Batch) nextSeqNum() uint64 {
   325  	return uint64(len(b.data)) | base.InternalKeySeqNumBatch
   326  }
   327  
   328  func (b *Batch) release() {
   329  	if b.db == nil {
   330  		// The batch was not created using newBatch or newIndexedBatch, or an error
   331  		// was encountered. We don't try to reuse batches that encountered an error
   332  		// because they might be stuck somewhere in the system and attempting to
   333  		// reuse such batches is a recipe for onerous debugging sessions. Instead,
   334  		// let the GC do its job.
   335  		return
   336  	}
   337  	b.db = nil
   338  
   339  	// NB: This is ugly (it would be cleaner if we could just assign a Batch{}),
   340  	// but necessary so that we can use atomic.StoreUint32 for the Batch.applied
   341  	// field. Without using an atomic to clear that field the Go race detector
   342  	// complains.
   343  	b.Reset()
   344  	b.cmp = nil
   345  	b.formatKey = nil
   346  	b.abbreviatedKey = nil
   347  	b.isFlush = false
   348  	b.alloc = nil
   349  
   350  	if b.index == nil {
   351  		batchPool.Put(b)
   352  	} else {
   353  		b.index, b.rangeDelIndex, b.rangeKeyIndex = nil, nil, nil
   354  		indexedBatchPool.Put((*indexedBatch)(unsafe.Pointer(b)))
   355  	}
   356  }
   357  
   358  func (b *Batch) refreshMemTableSize() {
   359  	b.memTableSize = 0
   360  	if len(b.data) < batchHeaderLen {
   361  		return
   362  	}
   363  
   364  	b.countRangeDels = 0
   365  	b.countRangeKeys = 0
   366  	for r := b.Reader(); ; {
   367  		kind, key, value, ok := r.Next()
   368  		if !ok {
   369  			break
   370  		}
   371  		b.memTableSize += memTableEntrySize(len(key), len(value))
   372  		switch kind {
   373  		case InternalKeyKindRangeDelete:
   374  			b.countRangeDels++
   375  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   376  			b.countRangeKeys++
   377  		}
   378  	}
   379  }
   380  
   381  // Apply the operations contained in the batch to the receiver batch.
   382  //
   383  // It is safe to modify the contents of the arguments after Apply returns.
   384  func (b *Batch) Apply(batch *Batch, _ *WriteOptions) error {
   385  	if len(batch.data) == 0 {
   386  		return nil
   387  	}
   388  	if len(batch.data) < batchHeaderLen {
   389  		return base.CorruptionErrorf("bitalostable: invalid batch")
   390  	}
   391  
   392  	offset := len(b.data)
   393  	if offset == 0 {
   394  		b.init(offset)
   395  		offset = batchHeaderLen
   396  	}
   397  	b.data = append(b.data, batch.data[batchHeaderLen:]...)
   398  
   399  	b.setCount(b.Count() + batch.Count())
   400  
   401  	if b.db != nil || b.index != nil {
   402  		// Only iterate over the new entries if we need to track memTableSize or in
   403  		// order to update the index.
   404  		for iter := BatchReader(b.data[offset:]); len(iter) > 0; {
   405  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
   406  			kind, key, value, ok := iter.Next()
   407  			if !ok {
   408  				break
   409  			}
   410  			switch kind {
   411  			case InternalKeyKindRangeDelete:
   412  				b.countRangeDels++
   413  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   414  				b.countRangeKeys++
   415  			}
   416  			if b.index != nil {
   417  				var err error
   418  				switch kind {
   419  				case InternalKeyKindRangeDelete:
   420  					b.tombstones = nil
   421  					b.tombstonesSeqNum = 0
   422  					if b.rangeDelIndex == nil {
   423  						b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   424  					}
   425  					err = b.rangeDelIndex.Add(uint32(offset))
   426  				case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   427  					b.rangeKeys = nil
   428  					b.rangeKeysSeqNum = 0
   429  					if b.rangeKeyIndex == nil {
   430  						b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   431  					}
   432  					err = b.rangeKeyIndex.Add(uint32(offset))
   433  				default:
   434  					err = b.index.Add(uint32(offset))
   435  				}
   436  				if err != nil {
   437  					return err
   438  				}
   439  			}
   440  			b.memTableSize += memTableEntrySize(len(key), len(value))
   441  		}
   442  	}
   443  	return nil
   444  }
   445  
   446  // Get gets the value for the given key. It returns ErrNotFound if the Batch
   447  // does not contain the key.
   448  //
   449  // The caller should not modify the contents of the returned slice, but it is
   450  // safe to modify the contents of the argument after Get returns. The returned
   451  // slice will remain valid until the returned Closer is closed. On success, the
   452  // caller MUST call closer.Close() or a memory leak will occur.
   453  func (b *Batch) Get(key []byte) ([]byte, io.Closer, error) {
   454  	if b.index == nil {
   455  		return nil, nil, ErrNotIndexed
   456  	}
   457  	return b.db.getInternal(key, b, nil /* snapshot */)
   458  }
   459  
   460  func (b *Batch) prepareDeferredKeyValueRecord(keyLen, valueLen int, kind InternalKeyKind) {
   461  	if len(b.data) == 0 {
   462  		b.init(keyLen + valueLen + 2*binary.MaxVarintLen64 + batchHeaderLen)
   463  	}
   464  	b.count++
   465  	b.memTableSize += memTableEntrySize(keyLen, valueLen)
   466  
   467  	pos := len(b.data)
   468  	b.deferredOp.offset = uint32(pos)
   469  	b.grow(1 + 2*maxVarintLen32 + keyLen + valueLen)
   470  	b.data[pos] = byte(kind)
   471  	pos++
   472  
   473  	{
   474  		// TODO(peter): Manually inlined version binary.PutUvarint(). This is 20%
   475  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   476  		// versions show this to not be a performance win.
   477  		x := uint32(keyLen)
   478  		for x >= 0x80 {
   479  			b.data[pos] = byte(x) | 0x80
   480  			x >>= 7
   481  			pos++
   482  		}
   483  		b.data[pos] = byte(x)
   484  		pos++
   485  	}
   486  
   487  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   488  	pos += keyLen
   489  
   490  	{
   491  		// TODO(peter): Manually inlined version binary.PutUvarint(). This is 20%
   492  		// faster on BenchmarkBatchSet on go1.13. Remove if go1.14 or future
   493  		// versions show this to not be a performance win.
   494  		x := uint32(valueLen)
   495  		for x >= 0x80 {
   496  			b.data[pos] = byte(x) | 0x80
   497  			x >>= 7
   498  			pos++
   499  		}
   500  		b.data[pos] = byte(x)
   501  		pos++
   502  	}
   503  
   504  	b.deferredOp.Value = b.data[pos : pos+valueLen]
   505  	// Shrink data since varints may be shorter than the upper bound.
   506  	b.data = b.data[:pos+valueLen]
   507  }
   508  
   509  func (b *Batch) prepareDeferredKeyRecord(keyLen int, kind InternalKeyKind) {
   510  	if len(b.data) == 0 {
   511  		b.init(keyLen + binary.MaxVarintLen64 + batchHeaderLen)
   512  	}
   513  	b.count++
   514  	b.memTableSize += memTableEntrySize(keyLen, 0)
   515  
   516  	pos := len(b.data)
   517  	b.deferredOp.offset = uint32(pos)
   518  	b.grow(1 + maxVarintLen32 + keyLen)
   519  	b.data[pos] = byte(kind)
   520  	pos++
   521  
   522  	{
   523  		// TODO(peter): Manually inlined version binary.PutUvarint(). Remove if
   524  		// go1.13 or future versions show this to not be a performance win. See
   525  		// BenchmarkBatchSet.
   526  		x := uint32(keyLen)
   527  		for x >= 0x80 {
   528  			b.data[pos] = byte(x) | 0x80
   529  			x >>= 7
   530  			pos++
   531  		}
   532  		b.data[pos] = byte(x)
   533  		pos++
   534  	}
   535  
   536  	b.deferredOp.Key = b.data[pos : pos+keyLen]
   537  	b.deferredOp.Value = nil
   538  
   539  	// Shrink data since varint may be shorter than the upper bound.
   540  	b.data = b.data[:pos+keyLen]
   541  }
   542  
   543  // Set adds an action to the batch that sets the key to map to the value.
   544  //
   545  // It is safe to modify the contents of the arguments after Set returns.
   546  func (b *Batch) Set(key, value []byte, _ *WriteOptions) error {
   547  	deferredOp := b.SetDeferred(len(key), len(value))
   548  	copy(deferredOp.Key, key)
   549  	copy(deferredOp.Value, value)
   550  	return deferredOp.Finish()
   551  }
   552  
   553  func (b *Batch) SetMultiValue(key []byte, values ...[]byte) error {
   554  	var valueLen int
   555  	for i := range values {
   556  		valueLen += len(values[i])
   557  	}
   558  	deferredOp := b.SetDeferred(len(key), valueLen)
   559  	copy(deferredOp.Key, key)
   560  	pos := 0
   561  	for j := range values {
   562  		pos += copy(deferredOp.Value[pos:], values[j])
   563  	}
   564  	return deferredOp.Finish()
   565  }
   566  
   567  // SetDeferred is similar to Set in that it adds a set operation to the batch,
   568  // except it only takes in key/value lengths instead of complete slices,
   569  // letting the caller encode into those objects and then call Finish() on the
   570  // returned object.
   571  func (b *Batch) SetDeferred(keyLen, valueLen int) *DeferredBatchOp {
   572  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindSet)
   573  	b.deferredOp.index = b.index
   574  	return &b.deferredOp
   575  }
   576  
   577  // Merge adds an action to the batch that merges the value at key with the new
   578  // value. The details of the merge are dependent upon the configured merge
   579  // operator.
   580  //
   581  // It is safe to modify the contents of the arguments after Merge returns.
   582  func (b *Batch) Merge(key, value []byte, _ *WriteOptions) error {
   583  	deferredOp := b.MergeDeferred(len(key), len(value))
   584  	copy(deferredOp.Key, key)
   585  	copy(deferredOp.Value, value)
   586  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   587  	// in go1.13 will remove the need for this.
   588  	if b.index != nil {
   589  		if err := b.index.Add(deferredOp.offset); err != nil {
   590  			return err
   591  		}
   592  	}
   593  	return nil
   594  }
   595  
   596  // MergeDeferred is similar to Merge in that it adds a merge operation to the
   597  // batch, except it only takes in key/value lengths instead of complete slices,
   598  // letting the caller encode into those objects and then call Finish() on the
   599  // returned object.
   600  func (b *Batch) MergeDeferred(keyLen, valueLen int) *DeferredBatchOp {
   601  	b.prepareDeferredKeyValueRecord(keyLen, valueLen, InternalKeyKindMerge)
   602  	b.deferredOp.index = b.index
   603  	return &b.deferredOp
   604  }
   605  
   606  // Delete adds an action to the batch that deletes the entry for key.
   607  //
   608  // It is safe to modify the contents of the arguments after Delete returns.
   609  func (b *Batch) Delete(key []byte, _ *WriteOptions) error {
   610  	deferredOp := b.DeleteDeferred(len(key))
   611  	copy(deferredOp.Key, key)
   612  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   613  	// in go1.13 will remove the need for this.
   614  	if b.index != nil {
   615  		if err := b.index.Add(deferredOp.offset); err != nil {
   616  			return err
   617  		}
   618  	}
   619  	return nil
   620  }
   621  
   622  // DeleteDeferred is similar to Delete in that it adds a delete operation to
   623  // the batch, except it only takes in key/value lengths instead of complete
   624  // slices, letting the caller encode into those objects and then call Finish()
   625  // on the returned object.
   626  func (b *Batch) DeleteDeferred(keyLen int) *DeferredBatchOp {
   627  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindDelete)
   628  	b.deferredOp.index = b.index
   629  	return &b.deferredOp
   630  }
   631  
   632  // SingleDelete adds an action to the batch that single deletes the entry for key.
   633  // See Writer.SingleDelete for more details on the semantics of SingleDelete.
   634  //
   635  // It is safe to modify the contents of the arguments after SingleDelete returns.
   636  func (b *Batch) SingleDelete(key []byte, _ *WriteOptions) error {
   637  	deferredOp := b.SingleDeleteDeferred(len(key))
   638  	copy(deferredOp.Key, key)
   639  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   640  	// in go1.13 will remove the need for this.
   641  	if b.index != nil {
   642  		if err := b.index.Add(deferredOp.offset); err != nil {
   643  			return err
   644  		}
   645  	}
   646  	return nil
   647  }
   648  
   649  // SingleDeleteDeferred is similar to SingleDelete in that it adds a single delete
   650  // operation to the batch, except it only takes in key/value lengths instead of
   651  // complete slices, letting the caller encode into those objects and then call
   652  // Finish() on the returned object.
   653  func (b *Batch) SingleDeleteDeferred(keyLen int) *DeferredBatchOp {
   654  	b.prepareDeferredKeyRecord(keyLen, InternalKeyKindSingleDelete)
   655  	b.deferredOp.index = b.index
   656  	return &b.deferredOp
   657  }
   658  
   659  // DeleteRange deletes all of the point keys (and values) in the range
   660  // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT
   661  // delete overlapping range keys (eg, keys set via RangeKeySet).
   662  //
   663  // It is safe to modify the contents of the arguments after DeleteRange
   664  // returns.
   665  func (b *Batch) DeleteRange(start, end []byte, _ *WriteOptions) error {
   666  	deferredOp := b.DeleteRangeDeferred(len(start), len(end))
   667  	copy(deferredOp.Key, start)
   668  	copy(deferredOp.Value, end)
   669  	// TODO(peter): Manually inline DeferredBatchOp.Finish(). Mid-stack inlining
   670  	// in go1.13 will remove the need for this.
   671  	if deferredOp.index != nil {
   672  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   673  			return err
   674  		}
   675  	}
   676  	return nil
   677  }
   678  
   679  // DeleteRangeDeferred is similar to DeleteRange in that it adds a delete range
   680  // operation to the batch, except it only takes in key lengths instead of
   681  // complete slices, letting the caller encode into those objects and then call
   682  // Finish() on the returned object. Note that DeferredBatchOp.Key should be
   683  // populated with the start key, and DeferredBatchOp.Value should be populated
   684  // with the end key.
   685  func (b *Batch) DeleteRangeDeferred(startLen, endLen int) *DeferredBatchOp {
   686  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeDelete)
   687  	b.countRangeDels++
   688  	if b.index != nil {
   689  		b.tombstones = nil
   690  		b.tombstonesSeqNum = 0
   691  		// Range deletions are rare, so we lazily allocate the index for them.
   692  		if b.rangeDelIndex == nil {
   693  			b.rangeDelIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   694  		}
   695  		b.deferredOp.index = b.rangeDelIndex
   696  	}
   697  	return &b.deferredOp
   698  }
   699  
   700  // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC
   701  // timestamp suffix to value. The suffix is optional. If any portion of the key
   702  // range [start, end) is already set by a range key with the same suffix value,
   703  // RangeKeySet overrides it.
   704  //
   705  // It is safe to modify the contents of the arguments after RangeKeySet returns.
   706  func (b *Batch) RangeKeySet(start, end, suffix, value []byte, _ *WriteOptions) error {
   707  	suffixValues := [1]rangekey.SuffixValue{{Suffix: suffix, Value: value}}
   708  	internalValueLen := rangekey.EncodedSetValueLen(end, suffixValues[:])
   709  
   710  	deferredOp := b.rangeKeySetDeferred(len(start), internalValueLen)
   711  	copy(deferredOp.Key, start)
   712  	n := rangekey.EncodeSetValue(deferredOp.Value, end, suffixValues[:])
   713  	if n != internalValueLen {
   714  		panic("unexpected internal value length mismatch")
   715  	}
   716  
   717  	// Manually inline DeferredBatchOp.Finish().
   718  	if deferredOp.index != nil {
   719  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   720  			return err
   721  		}
   722  	}
   723  	return nil
   724  }
   725  
   726  func (b *Batch) rangeKeySetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
   727  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeySet)
   728  	b.incrementRangeKeysCount()
   729  	return &b.deferredOp
   730  }
   731  
   732  func (b *Batch) incrementRangeKeysCount() {
   733  	b.countRangeKeys++
   734  	if b.index != nil {
   735  		b.rangeKeys = nil
   736  		b.rangeKeysSeqNum = 0
   737  		// Range keys are rare, so we lazily allocate the index for them.
   738  		if b.rangeKeyIndex == nil {
   739  			b.rangeKeyIndex = batchskl.NewSkiplist(&b.data, b.cmp, b.abbreviatedKey)
   740  		}
   741  		b.deferredOp.index = b.rangeKeyIndex
   742  	}
   743  }
   744  
   745  // RangeKeyUnset removes a range key mapping the key range [start, end) at the
   746  // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed
   747  // range key. RangeKeyUnset only removes portions of range keys that fall within
   748  // the [start, end) key span, and only range keys with suffixes that exactly
   749  // match the unset suffix.
   750  //
   751  // It is safe to modify the contents of the arguments after RangeKeyUnset
   752  // returns.
   753  func (b *Batch) RangeKeyUnset(start, end, suffix []byte, _ *WriteOptions) error {
   754  	suffixes := [1][]byte{suffix}
   755  	internalValueLen := rangekey.EncodedUnsetValueLen(end, suffixes[:])
   756  
   757  	deferredOp := b.rangeKeyUnsetDeferred(len(start), internalValueLen)
   758  	copy(deferredOp.Key, start)
   759  	n := rangekey.EncodeUnsetValue(deferredOp.Value, end, suffixes[:])
   760  	if n != internalValueLen {
   761  		panic("unexpected internal value length mismatch")
   762  	}
   763  
   764  	// Manually inline DeferredBatchOp.Finish()
   765  	if deferredOp.index != nil {
   766  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   767  			return err
   768  		}
   769  	}
   770  	return nil
   771  }
   772  
   773  func (b *Batch) rangeKeyUnsetDeferred(startLen, internalValueLen int) *DeferredBatchOp {
   774  	b.prepareDeferredKeyValueRecord(startLen, internalValueLen, InternalKeyKindRangeKeyUnset)
   775  	b.incrementRangeKeysCount()
   776  	return &b.deferredOp
   777  }
   778  
   779  // RangeKeyDelete deletes all of the range keys in the range [start,end)
   780  // (inclusive on start, exclusive on end). It does not delete point keys (for
   781  // that use DeleteRange). RangeKeyDelete removes all range keys within the
   782  // bounds, including those with or without suffixes.
   783  //
   784  // It is safe to modify the contents of the arguments after RangeKeyDelete
   785  // returns.
   786  func (b *Batch) RangeKeyDelete(start, end []byte, _ *WriteOptions) error {
   787  	deferredOp := b.RangeKeyDeleteDeferred(len(start), len(end))
   788  	copy(deferredOp.Key, start)
   789  	copy(deferredOp.Value, end)
   790  	// Manually inline DeferredBatchOp.Finish().
   791  	if deferredOp.index != nil {
   792  		if err := deferredOp.index.Add(deferredOp.offset); err != nil {
   793  			return err
   794  		}
   795  	}
   796  	return nil
   797  }
   798  
   799  // RangeKeyDeleteDeferred is similar to RangeKeyDelete in that it adds an
   800  // operation to delete range keys to the batch, except it only takes in key
   801  // lengths instead of complete slices, letting the caller encode into those
   802  // objects and then call Finish() on the returned object. Note that
   803  // DeferredBatchOp.Key should be populated with the start key, and
   804  // DeferredBatchOp.Value should be populated with the end key.
   805  func (b *Batch) RangeKeyDeleteDeferred(startLen, endLen int) *DeferredBatchOp {
   806  	b.prepareDeferredKeyValueRecord(startLen, endLen, InternalKeyKindRangeKeyDelete)
   807  	b.incrementRangeKeysCount()
   808  	return &b.deferredOp
   809  }
   810  
   811  // LogData adds the specified to the batch. The data will be written to the
   812  // WAL, but not added to memtables or sstables. Log data is never indexed,
   813  // which makes it useful for testing WAL performance.
   814  //
   815  // It is safe to modify the contents of the argument after LogData returns.
   816  func (b *Batch) LogData(data []byte, _ *WriteOptions) error {
   817  	origCount, origMemTableSize := b.count, b.memTableSize
   818  	b.prepareDeferredKeyRecord(len(data), InternalKeyKindLogData)
   819  	copy(b.deferredOp.Key, data)
   820  	// Since LogData only writes to the WAL and does not affect the memtable, we
   821  	// restore b.count and b.memTableSize to their origin values. Note that
   822  	// Batch.count only refers to records that are added to the memtable.
   823  	b.count, b.memTableSize = origCount, origMemTableSize
   824  	return nil
   825  }
   826  
   827  // Empty returns true if the batch is empty, and false otherwise.
   828  func (b *Batch) Empty() bool {
   829  	return len(b.data) <= batchHeaderLen
   830  }
   831  
   832  // Len returns the current size of the batch in bytes.
   833  func (b *Batch) Len() int {
   834  	if len(b.data) <= batchHeaderLen {
   835  		return batchHeaderLen
   836  	}
   837  	return len(b.data)
   838  }
   839  
   840  // Repr returns the underlying batch representation. It is not safe to modify
   841  // the contents. Reset() will not change the contents of the returned value,
   842  // though any other mutation operation may do so.
   843  func (b *Batch) Repr() []byte {
   844  	if len(b.data) == 0 {
   845  		b.init(batchHeaderLen)
   846  	}
   847  	binary.LittleEndian.PutUint32(b.countData(), b.Count())
   848  	return b.data
   849  }
   850  
   851  // SetRepr sets the underlying batch representation. The batch takes ownership
   852  // of the supplied slice. It is not safe to modify it afterwards until the
   853  // Batch is no longer in use.
   854  func (b *Batch) SetRepr(data []byte) error {
   855  	if len(data) < batchHeaderLen {
   856  		return base.CorruptionErrorf("invalid batch")
   857  	}
   858  	b.data = data
   859  	b.count = uint64(binary.LittleEndian.Uint32(b.countData()))
   860  	if b.db != nil {
   861  		// Only track memTableSize for batches that will be committed to the DB.
   862  		b.refreshMemTableSize()
   863  	}
   864  	return nil
   865  }
   866  
   867  // NewIter returns an iterator that is unpositioned (Iterator.Valid() will
   868  // return false). The iterator can be positioned via a call to SeekGE,
   869  // SeekPrefixGE, SeekLT, First or Last. Only indexed batches support iterators.
   870  //
   871  // The returned Iterator observes all of the Batch's existing mutations, but no
   872  // later mutations. Its view can be refreshed via RefreshBatchSnapshot or
   873  // SetOptions().
   874  func (b *Batch) NewIter(o *IterOptions) *Iterator {
   875  	if b.index == nil {
   876  		return &Iterator{err: ErrNotIndexed}
   877  	}
   878  	return b.db.newIterInternal(b, nil /* snapshot */, o)
   879  }
   880  
   881  // newInternalIter creates a new internalIterator that iterates over the
   882  // contents of the batch.
   883  func (b *Batch) newInternalIter(o *IterOptions) *batchIter {
   884  	iter := &batchIter{}
   885  	b.initInternalIter(o, iter, b.nextSeqNum())
   886  	return iter
   887  }
   888  
   889  func (b *Batch) initInternalIter(o *IterOptions, iter *batchIter, batchSnapshot uint64) {
   890  	*iter = batchIter{
   891  		cmp:      b.cmp,
   892  		batch:    b,
   893  		iter:     b.index.NewIter(o.GetLowerBound(), o.GetUpperBound()),
   894  		snapshot: batchSnapshot,
   895  	}
   896  }
   897  
   898  func (b *Batch) newRangeDelIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
   899  	// Construct an iterator even if rangeDelIndex is nil, because it is allowed
   900  	// to refresh later, so we need the container to exist.
   901  	iter := new(keyspan.Iter)
   902  	b.initRangeDelIter(o, iter, batchSnapshot)
   903  	return iter
   904  }
   905  
   906  func (b *Batch) initRangeDelIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
   907  	if b.rangeDelIndex == nil {
   908  		iter.Init(b.cmp, nil)
   909  		return
   910  	}
   911  
   912  	// Fragment the range tombstones the first time a range deletion iterator is
   913  	// requested. The cached tombstones are invalidated if another range
   914  	// deletion tombstone is added to the batch. This cache is only guaranteed
   915  	// to be correct if we're opening an iterator to read at a batch sequence
   916  	// number at least as high as tombstonesSeqNum. The cache is guaranteed to
   917  	// include all tombstones up to tombstonesSeqNum, and if any additional
   918  	// tombstones were added after that sequence number the cache would've been
   919  	// cleared.
   920  	nextSeqNum := b.nextSeqNum()
   921  	if b.tombstones != nil && b.tombstonesSeqNum <= batchSnapshot {
   922  		iter.Init(b.cmp, b.tombstones)
   923  		return
   924  	}
   925  
   926  	tombstones := make([]keyspan.Span, 0, b.countRangeDels)
   927  	frag := &keyspan.Fragmenter{
   928  		Cmp:    b.cmp,
   929  		Format: b.formatKey,
   930  		Emit: func(s keyspan.Span) {
   931  			tombstones = append(tombstones, s)
   932  		},
   933  	}
   934  	it := &batchIter{
   935  		cmp:      b.cmp,
   936  		batch:    b,
   937  		iter:     b.rangeDelIndex.NewIter(nil, nil),
   938  		snapshot: batchSnapshot,
   939  	}
   940  	fragmentRangeDels(frag, it, int(b.countRangeDels))
   941  	iter.Init(b.cmp, tombstones)
   942  
   943  	// If we just read all the tombstones in the batch (eg, batchSnapshot was
   944  	// set to b.nextSeqNum()), then cache the tombstones so that a subsequent
   945  	// call to initRangeDelIter may use them without refragmenting.
   946  	if nextSeqNum == batchSnapshot {
   947  		b.tombstones = tombstones
   948  		b.tombstonesSeqNum = nextSeqNum
   949  	}
   950  }
   951  
   952  func fragmentRangeDels(frag *keyspan.Fragmenter, it internalIterator, count int) {
   953  	// The memory management here is a bit subtle. The keys and values returned
   954  	// by the iterator are slices in Batch.data. Thus the fragmented tombstones
   955  	// are slices within Batch.data. If additional entries are added to the
   956  	// Batch, Batch.data may be reallocated. The references in the fragmented
   957  	// tombstones will remain valid, pointing into the old Batch.data. GC for
   958  	// the win.
   959  
   960  	// Use a single []keyspan.Key buffer to avoid allocating many
   961  	// individual []keyspan.Key slices with a single element each.
   962  	keyBuf := make([]keyspan.Key, 0, count)
   963  	for key, val := it.First(); key != nil; key, val = it.Next() {
   964  		s := rangedel.Decode(*key, val, keyBuf)
   965  		keyBuf = s.Keys[len(s.Keys):]
   966  
   967  		// Set a fixed capacity to avoid accidental overwriting.
   968  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
   969  		frag.Add(s)
   970  	}
   971  	frag.Finish()
   972  }
   973  
   974  func (b *Batch) newRangeKeyIter(o *IterOptions, batchSnapshot uint64) *keyspan.Iter {
   975  	// Construct an iterator even if rangeKeyIndex is nil, because it is allowed
   976  	// to refresh later, so we need the container to exist.
   977  	iter := new(keyspan.Iter)
   978  	b.initRangeKeyIter(o, iter, batchSnapshot)
   979  	return iter
   980  }
   981  
   982  func (b *Batch) initRangeKeyIter(_ *IterOptions, iter *keyspan.Iter, batchSnapshot uint64) {
   983  	if b.rangeKeyIndex == nil {
   984  		iter.Init(b.cmp, nil)
   985  		return
   986  	}
   987  
   988  	// Fragment the range keys the first time a range key iterator is requested.
   989  	// The cached spans are invalidated if another range key is added to the
   990  	// batch. This cache is only guaranteed to be correct if we're opening an
   991  	// iterator to read at a batch sequence number at least as high as
   992  	// rangeKeysSeqNum. The cache is guaranteed to include all range keys up to
   993  	// rangeKeysSeqNum, and if any additional range keys were added after that
   994  	// sequence number the cache would've been cleared.
   995  	nextSeqNum := b.nextSeqNum()
   996  	if b.rangeKeys != nil && b.rangeKeysSeqNum <= batchSnapshot {
   997  		iter.Init(b.cmp, b.rangeKeys)
   998  		return
   999  	}
  1000  
  1001  	rangeKeys := make([]keyspan.Span, 0, b.countRangeKeys)
  1002  	frag := &keyspan.Fragmenter{
  1003  		Cmp:    b.cmp,
  1004  		Format: b.formatKey,
  1005  		Emit: func(s keyspan.Span) {
  1006  			rangeKeys = append(rangeKeys, s)
  1007  		},
  1008  	}
  1009  	it := &batchIter{
  1010  		cmp:      b.cmp,
  1011  		batch:    b,
  1012  		iter:     b.rangeKeyIndex.NewIter(nil, nil),
  1013  		snapshot: batchSnapshot,
  1014  	}
  1015  	fragmentRangeKeys(frag, it, int(b.countRangeKeys))
  1016  	iter.Init(b.cmp, rangeKeys)
  1017  
  1018  	// If we just read all the range keys in the batch (eg, batchSnapshot was
  1019  	// set to b.nextSeqNum()), then cache the range keys so that a subsequent
  1020  	// call to initRangeKeyIter may use them without refragmenting.
  1021  	if nextSeqNum == batchSnapshot {
  1022  		b.rangeKeys = rangeKeys
  1023  		b.rangeKeysSeqNum = nextSeqNum
  1024  	}
  1025  }
  1026  
  1027  func fragmentRangeKeys(frag *keyspan.Fragmenter, it internalIterator, count int) error {
  1028  	// The memory management here is a bit subtle. The keys and values
  1029  	// returned by the iterator are slices in Batch.data. Thus the
  1030  	// fragmented key spans are slices within Batch.data. If additional
  1031  	// entries are added to the Batch, Batch.data may be reallocated. The
  1032  	// references in the fragmented keys will remain valid, pointing into
  1033  	// the old Batch.data. GC for the win.
  1034  
  1035  	// Use a single []keyspan.Key buffer to avoid allocating many
  1036  	// individual []keyspan.Key slices with a single element each.
  1037  	keyBuf := make([]keyspan.Key, 0, count)
  1038  	for ik, val := it.First(); ik != nil; ik, val = it.Next() {
  1039  		s, err := rangekey.Decode(*ik, val, keyBuf)
  1040  		if err != nil {
  1041  			return err
  1042  		}
  1043  		keyBuf = s.Keys[len(s.Keys):]
  1044  
  1045  		// Set a fixed capacity to avoid accidental overwriting.
  1046  		s.Keys = s.Keys[:len(s.Keys):len(s.Keys)]
  1047  		frag.Add(s)
  1048  	}
  1049  	frag.Finish()
  1050  	return nil
  1051  }
  1052  
  1053  // Commit applies the batch to its parent writer.
  1054  func (b *Batch) Commit(o *WriteOptions) error {
  1055  	return b.db.Apply(b, o)
  1056  }
  1057  
  1058  // Close closes the batch without committing it.
  1059  func (b *Batch) Close() error {
  1060  	b.release()
  1061  	return nil
  1062  }
  1063  
  1064  func (b *Batch) AllocFree() {
  1065  	if b.alloc != nil {
  1066  		manual.Free(b.alloc)
  1067  	}
  1068  	b.alloc = nil
  1069  }
  1070  
  1071  // Indexed returns true if the batch is indexed (i.e. supports read
  1072  // operations).
  1073  func (b *Batch) Indexed() bool {
  1074  	return b.index != nil
  1075  }
  1076  
  1077  func (b *Batch) init(cap int) {
  1078  	n := batchInitialSize
  1079  	for n < cap {
  1080  		n *= 2
  1081  	}
  1082  	b.data = rawalloc.New(batchHeaderLen, n)
  1083  	b.setCount(0)
  1084  	b.setSeqNum(0)
  1085  	b.data = b.data[:batchHeaderLen]
  1086  }
  1087  
  1088  // Reset resets the batch for reuse. The underlying byte slice (that is
  1089  // returned by Repr()) is not modified. It is only necessary to call this
  1090  // method if a batch is explicitly being reused. Close automatically takes are
  1091  // of releasing resources when appropriate for batches that are internally
  1092  // being reused.
  1093  func (b *Batch) Reset() {
  1094  	b.count = 0
  1095  	b.countRangeDels = 0
  1096  	b.countRangeKeys = 0
  1097  	b.memTableSize = 0
  1098  	b.deferredOp = DeferredBatchOp{}
  1099  	b.tombstones = nil
  1100  	b.tombstonesSeqNum = 0
  1101  	b.rangeKeys = nil
  1102  	b.rangeKeysSeqNum = 0
  1103  	b.flushable = nil
  1104  	b.commit = sync.WaitGroup{}
  1105  	b.commitErr = nil
  1106  	atomic.StoreUint32(&b.applied, 0)
  1107  	if b.data != nil {
  1108  		if cap(b.data) > batchMaxRetainedSize || b.isFlush {
  1109  			// If the capacity of the buffer is larger than our maximum
  1110  			// retention size, don't re-use it. Let it be GC-ed instead.
  1111  			// This prevents the memory from an unusually large batch from
  1112  			// being held on to indefinitely.
  1113  			b.data = nil
  1114  		} else {
  1115  			// Otherwise, reset the buffer for re-use.
  1116  			b.data = b.data[:batchHeaderLen]
  1117  			b.setSeqNum(0)
  1118  		}
  1119  	}
  1120  	if b.index != nil {
  1121  		b.index.Init(&b.data, b.cmp, b.abbreviatedKey)
  1122  		b.rangeDelIndex = nil
  1123  		b.rangeKeyIndex = nil
  1124  	}
  1125  }
  1126  
  1127  // seqNumData returns the 8 byte little-endian sequence number. Zero means that
  1128  // the batch has not yet been applied.
  1129  func (b *Batch) seqNumData() []byte {
  1130  	return b.data[:8]
  1131  }
  1132  
  1133  // countData returns the 4 byte little-endian count data. "\xff\xff\xff\xff"
  1134  // means that the batch is invalid.
  1135  func (b *Batch) countData() []byte {
  1136  	return b.data[8:12]
  1137  }
  1138  
  1139  func (b *Batch) grow(n int) {
  1140  	newSize := len(b.data) + n
  1141  	if uint64(newSize) >= maxBatchSize {
  1142  		panic(ErrBatchTooLarge)
  1143  	}
  1144  	if newSize > cap(b.data) {
  1145  		newCap := 2 * cap(b.data)
  1146  		for newCap < newSize {
  1147  			newCap *= 2
  1148  		}
  1149  		newData := rawalloc.New(len(b.data), newCap)
  1150  		copy(newData, b.data)
  1151  		b.data = newData
  1152  	}
  1153  	b.data = b.data[:newSize]
  1154  }
  1155  
  1156  func (b *Batch) setSeqNum(seqNum uint64) {
  1157  	binary.LittleEndian.PutUint64(b.seqNumData(), seqNum)
  1158  }
  1159  
  1160  // SeqNum returns the batch sequence number which is applied to the first
  1161  // record in the batch. The sequence number is incremented for each subsequent
  1162  // record. It returns zero if the batch is empty.
  1163  func (b *Batch) SeqNum() uint64 {
  1164  	if len(b.data) == 0 {
  1165  		b.init(batchHeaderLen)
  1166  	}
  1167  	return binary.LittleEndian.Uint64(b.seqNumData())
  1168  }
  1169  
  1170  func (b *Batch) setCount(v uint32) {
  1171  	b.count = uint64(v)
  1172  }
  1173  
  1174  // Count returns the count of memtable-modifying operations in this batch. All
  1175  // operations with the except of LogData increment this count.
  1176  func (b *Batch) Count() uint32 {
  1177  	if b.count > math.MaxUint32 {
  1178  		panic(ErrInvalidBatch)
  1179  	}
  1180  	return uint32(b.count)
  1181  }
  1182  
  1183  // Reader returns a BatchReader for the current batch contents. If the batch is
  1184  // mutated, the new entries will not be visible to the reader.
  1185  func (b *Batch) Reader() BatchReader {
  1186  	if len(b.data) == 0 {
  1187  		b.init(batchHeaderLen)
  1188  	}
  1189  	return b.data[batchHeaderLen:]
  1190  }
  1191  
  1192  func batchDecodeStr(data []byte) (odata []byte, s []byte, ok bool) {
  1193  	var v uint32
  1194  	var n int
  1195  	ptr := unsafe.Pointer(&data[0])
  1196  	if a := *((*uint8)(ptr)); a < 128 {
  1197  		v = uint32(a)
  1198  		n = 1
  1199  	} else if a, b := a&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 1))); b < 128 {
  1200  		v = uint32(b)<<7 | uint32(a)
  1201  		n = 2
  1202  	} else if b, c := b&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 2))); c < 128 {
  1203  		v = uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1204  		n = 3
  1205  	} else if c, d := c&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 3))); d < 128 {
  1206  		v = uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1207  		n = 4
  1208  	} else {
  1209  		d, e := d&0x7f, *((*uint8)(unsafe.Pointer(uintptr(ptr) + 4)))
  1210  		v = uint32(e)<<28 | uint32(d)<<21 | uint32(c)<<14 | uint32(b)<<7 | uint32(a)
  1211  		n = 5
  1212  	}
  1213  
  1214  	data = data[n:]
  1215  	if v > uint32(len(data)) {
  1216  		return nil, nil, false
  1217  	}
  1218  	return data[v:], data[:v], true
  1219  }
  1220  
  1221  // BatchReader iterates over the entries contained in a batch.
  1222  type BatchReader []byte
  1223  
  1224  // ReadBatch constructs a BatchReader from a batch representation.  The
  1225  // header is not validated. ReadBatch returns a new batch reader and the
  1226  // count of entries contained within the batch.
  1227  func ReadBatch(repr []byte) (r BatchReader, count uint32) {
  1228  	if len(repr) <= batchHeaderLen {
  1229  		return nil, count
  1230  	}
  1231  	count = binary.LittleEndian.Uint32(repr[batchCountOffset:batchHeaderLen])
  1232  	return repr[batchHeaderLen:], count
  1233  }
  1234  
  1235  // Next returns the next entry in this batch. The final return value is false
  1236  // if the batch is corrupt. The end of batch is reached when len(r)==0.
  1237  func (r *BatchReader) Next() (kind InternalKeyKind, ukey []byte, value []byte, ok bool) {
  1238  	if len(*r) == 0 {
  1239  		return 0, nil, nil, false
  1240  	}
  1241  	kind = InternalKeyKind((*r)[0])
  1242  	if kind > InternalKeyKindMax {
  1243  		return 0, nil, nil, false
  1244  	}
  1245  	*r, ukey, ok = batchDecodeStr((*r)[1:])
  1246  	if !ok {
  1247  		return 0, nil, nil, false
  1248  	}
  1249  	switch kind {
  1250  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1251  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1252  		*r, value, ok = batchDecodeStr(*r)
  1253  		if !ok {
  1254  			return 0, nil, nil, false
  1255  		}
  1256  	}
  1257  	return kind, ukey, value, true
  1258  }
  1259  
  1260  // Note: batchIter mirrors the implementation of flushableBatchIter. Keep the
  1261  // two in sync.
  1262  type batchIter struct {
  1263  	cmp   Compare
  1264  	batch *Batch
  1265  	iter  batchskl.Iterator
  1266  	err   error
  1267  	// snapshot holds a batch "sequence number" at which the batch is being
  1268  	// read. This sequence number has the InternalKeySeqNumBatch bit set, so it
  1269  	// encodes an offset within the batch. Only batch entries earlier than the
  1270  	// offset are visible during iteration.
  1271  	snapshot uint64
  1272  }
  1273  
  1274  // batchIter implements the base.InternalIterator interface.
  1275  var _ base.InternalIterator = (*batchIter)(nil)
  1276  
  1277  func (i *batchIter) String() string {
  1278  	return "batch"
  1279  }
  1280  
  1281  func (i *batchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
  1282  	// Ignore trySeekUsingNext since the batch may have changed, so using Next
  1283  	// would be incorrect.
  1284  	i.err = nil
  1285  	ikey := i.iter.SeekGE(key)
  1286  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1287  		ikey = i.iter.Next()
  1288  	}
  1289  	if ikey == nil {
  1290  		return nil, nil
  1291  	}
  1292  	return ikey, i.Value()
  1293  }
  1294  
  1295  func (i *batchIter) SeekPrefixGE(
  1296  	prefix, key []byte, flags base.SeekGEFlags,
  1297  ) (*base.InternalKey, []byte) {
  1298  	i.err = nil
  1299  	return i.SeekGE(key, flags)
  1300  }
  1301  
  1302  func (i *batchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
  1303  	i.err = nil
  1304  	ikey := i.iter.SeekLT(key)
  1305  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1306  		ikey = i.iter.Prev()
  1307  	}
  1308  	if ikey == nil {
  1309  		return nil, nil
  1310  	}
  1311  	return ikey, i.Value()
  1312  }
  1313  
  1314  func (i *batchIter) First() (*InternalKey, []byte) {
  1315  	i.err = nil
  1316  	ikey := i.iter.First()
  1317  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1318  		ikey = i.iter.Next()
  1319  	}
  1320  	if ikey == nil {
  1321  		return nil, nil
  1322  	}
  1323  	return ikey, i.Value()
  1324  }
  1325  
  1326  func (i *batchIter) Last() (*InternalKey, []byte) {
  1327  	i.err = nil
  1328  	ikey := i.iter.Last()
  1329  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1330  		ikey = i.iter.Prev()
  1331  	}
  1332  	if ikey == nil {
  1333  		return nil, nil
  1334  	}
  1335  	return ikey, i.Value()
  1336  }
  1337  
  1338  func (i *batchIter) Next() (*InternalKey, []byte) {
  1339  	ikey := i.iter.Next()
  1340  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1341  		ikey = i.iter.Next()
  1342  	}
  1343  	if ikey == nil {
  1344  		return nil, nil
  1345  	}
  1346  	return ikey, i.Value()
  1347  }
  1348  
  1349  func (i *batchIter) Prev() (*InternalKey, []byte) {
  1350  	ikey := i.iter.Prev()
  1351  	for ikey != nil && ikey.SeqNum() >= i.snapshot {
  1352  		ikey = i.iter.Prev()
  1353  	}
  1354  	if ikey == nil {
  1355  		return nil, nil
  1356  	}
  1357  	return ikey, i.Value()
  1358  }
  1359  
  1360  func (i *batchIter) Key() *InternalKey {
  1361  	return i.iter.Key()
  1362  }
  1363  
  1364  func (i *batchIter) Value() []byte {
  1365  	offset, _, keyEnd := i.iter.KeyInfo()
  1366  	data := i.batch.data
  1367  	if len(data[offset:]) == 0 {
  1368  		i.err = base.CorruptionErrorf("corrupted batch")
  1369  		return nil
  1370  	}
  1371  
  1372  	switch InternalKeyKind(data[offset]) {
  1373  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1374  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1375  		_, value, ok := batchDecodeStr(data[keyEnd:])
  1376  		if !ok {
  1377  			return nil
  1378  		}
  1379  		return value
  1380  	default:
  1381  		return nil
  1382  	}
  1383  }
  1384  
  1385  func (i *batchIter) Valid() bool {
  1386  	return i.iter.Valid()
  1387  }
  1388  
  1389  func (i *batchIter) Error() error {
  1390  	return i.err
  1391  }
  1392  
  1393  func (i *batchIter) Close() error {
  1394  	_ = i.iter.Close()
  1395  	return i.err
  1396  }
  1397  
  1398  func (i *batchIter) SetBounds(lower, upper []byte) {
  1399  	i.iter.SetBounds(lower, upper)
  1400  }
  1401  
  1402  type flushableBatchEntry struct {
  1403  	// offset is the byte offset of the record within the batch repr.
  1404  	offset uint32
  1405  	// index is the 0-based ordinal number of the record within the batch. Used
  1406  	// to compute the seqnum for the record.
  1407  	index uint32
  1408  	// key{Start,End} are the start and end byte offsets of the key within the
  1409  	// batch repr. Cached to avoid decoding the key length on every
  1410  	// comparison. The value is stored starting at keyEnd.
  1411  	keyStart uint32
  1412  	keyEnd   uint32
  1413  }
  1414  
  1415  // flushableBatch wraps an existing batch and provides the interfaces needed
  1416  // for making the batch flushable (i.e. able to mimic a memtable).
  1417  type flushableBatch struct {
  1418  	cmp       Compare
  1419  	formatKey base.FormatKey
  1420  	data      []byte
  1421  
  1422  	// The base sequence number for the entries in the batch. This is the same
  1423  	// value as Batch.seqNum() and is cached here for performance.
  1424  	seqNum uint64
  1425  
  1426  	// A slice of offsets and indices for the entries in the batch. Used to
  1427  	// implement flushableBatchIter. Unlike the indexing on a normal batch, a
  1428  	// flushable batch is indexed such that batch entry i will be given the
  1429  	// sequence number flushableBatch.seqNum+i.
  1430  	//
  1431  	// Sorted in increasing order of key and decreasing order of offset (since
  1432  	// higher offsets correspond to higher sequence numbers).
  1433  	//
  1434  	// Does not include range deletion entries or range key entries.
  1435  	offsets      []flushableBatchEntry
  1436  	offsetsArena *arena.Arena
  1437  
  1438  	// Fragmented range deletion tombstones.
  1439  	tombstones []keyspan.Span
  1440  
  1441  	// Fragmented range keys.
  1442  	rangeKeys []keyspan.Span
  1443  }
  1444  
  1445  var _ flushable = (*flushableBatch)(nil)
  1446  
  1447  // newFlushableBatch creates a new batch that implements the flushable
  1448  // interface. This allows the batch to act like a memtable and be placed in the
  1449  // queue of flushable memtables. Note that the flushable batch takes ownership
  1450  // of the batch data.
  1451  func newFlushableBatch(batch *Batch, comparer *Comparer) *flushableBatch {
  1452  	b := &flushableBatch{
  1453  		data:         batch.data,
  1454  		cmp:          comparer.Compare,
  1455  		formatKey:    comparer.FormatKey,
  1456  		offsetsArena: arena.NewArena(),
  1457  	}
  1458  	b.offsets = arena.MakeSlice[flushableBatchEntry](b.offsetsArena, 0, int(batch.Count()))
  1459  	if b.data != nil {
  1460  		// Note that this sequence number is not correct when this batch has not
  1461  		// been applied since the sequence number has not been assigned yet. The
  1462  		// correct sequence number will be set later. But it is correct when the
  1463  		// batch is being replayed from the WAL.
  1464  		b.seqNum = batch.SeqNum()
  1465  	}
  1466  	var rangeDelOffsets []flushableBatchEntry
  1467  	var rangeKeyOffsets []flushableBatchEntry
  1468  	if len(b.data) > batchHeaderLen {
  1469  		// Non-empty batch.
  1470  		var index uint32
  1471  		for iter := BatchReader(b.data[batchHeaderLen:]); len(iter) > 0; index++ {
  1472  			offset := uintptr(unsafe.Pointer(&iter[0])) - uintptr(unsafe.Pointer(&b.data[0]))
  1473  			kind, key, _, ok := iter.Next()
  1474  			if !ok {
  1475  				break
  1476  			}
  1477  			entry := flushableBatchEntry{
  1478  				offset: uint32(offset),
  1479  				index:  uint32(index),
  1480  			}
  1481  			if keySize := uint32(len(key)); keySize == 0 {
  1482  				// Must add 2 to the offset. One byte encodes `kind` and the next
  1483  				// byte encodes `0`, which is the length of the key.
  1484  				entry.keyStart = uint32(offset) + 2
  1485  				entry.keyEnd = entry.keyStart
  1486  			} else {
  1487  				entry.keyStart = uint32(uintptr(unsafe.Pointer(&key[0])) -
  1488  					uintptr(unsafe.Pointer(&b.data[0])))
  1489  				entry.keyEnd = entry.keyStart + keySize
  1490  			}
  1491  			switch kind {
  1492  			case InternalKeyKindRangeDelete:
  1493  				rangeDelOffsets = append(rangeDelOffsets, entry)
  1494  			case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1495  				rangeKeyOffsets = append(rangeKeyOffsets, entry)
  1496  			default:
  1497  				b.offsets = append(b.offsets, entry)
  1498  			}
  1499  		}
  1500  	}
  1501  
  1502  	// Sort all of offsets, rangeDelOffsets and rangeKeyOffsets, using *batch's
  1503  	// sort.Interface implementation.
  1504  	pointOffsets := b.offsets
  1505  	sort.Sort(b)
  1506  	b.offsets = rangeDelOffsets
  1507  	sort.Sort(b)
  1508  	b.offsets = rangeKeyOffsets
  1509  	sort.Sort(b)
  1510  	b.offsets = pointOffsets
  1511  
  1512  	if len(rangeDelOffsets) > 0 {
  1513  		frag := &keyspan.Fragmenter{
  1514  			Cmp:    b.cmp,
  1515  			Format: b.formatKey,
  1516  			Emit: func(s keyspan.Span) {
  1517  				b.tombstones = append(b.tombstones, s)
  1518  			},
  1519  		}
  1520  		it := &flushableBatchIter{
  1521  			batch:   b,
  1522  			data:    b.data,
  1523  			offsets: rangeDelOffsets,
  1524  			cmp:     b.cmp,
  1525  			index:   -1,
  1526  		}
  1527  		fragmentRangeDels(frag, it, len(rangeDelOffsets))
  1528  	}
  1529  	if len(rangeKeyOffsets) > 0 {
  1530  		frag := &keyspan.Fragmenter{
  1531  			Cmp:    b.cmp,
  1532  			Format: b.formatKey,
  1533  			Emit: func(s keyspan.Span) {
  1534  				b.rangeKeys = append(b.rangeKeys, s)
  1535  			},
  1536  		}
  1537  		it := &flushableBatchIter{
  1538  			batch:   b,
  1539  			data:    b.data,
  1540  			offsets: rangeKeyOffsets,
  1541  			cmp:     b.cmp,
  1542  			index:   -1,
  1543  		}
  1544  		fragmentRangeKeys(frag, it, len(rangeKeyOffsets))
  1545  	}
  1546  	return b
  1547  }
  1548  
  1549  func (b *flushableBatch) release() {
  1550  	b.offsetsArena.Free()
  1551  	b.data = nil
  1552  	b.offsetsArena = nil
  1553  	b.offsets = nil
  1554  	b.tombstones = nil
  1555  	b.rangeKeys = nil
  1556  }
  1557  
  1558  func (b *flushableBatch) setSeqNum(seqNum uint64) {
  1559  	if b.seqNum != 0 {
  1560  		panic(fmt.Sprintf("bitalostable: flushableBatch.seqNum already set: %d", b.seqNum))
  1561  	}
  1562  	b.seqNum = seqNum
  1563  	for i := range b.tombstones {
  1564  		for j := range b.tombstones[i].Keys {
  1565  			b.tombstones[i].Keys[j].Trailer = base.MakeTrailer(
  1566  				b.tombstones[i].Keys[j].SeqNum()+seqNum,
  1567  				b.tombstones[i].Keys[j].Kind(),
  1568  			)
  1569  		}
  1570  	}
  1571  	for i := range b.rangeKeys {
  1572  		for j := range b.rangeKeys[i].Keys {
  1573  			b.rangeKeys[i].Keys[j].Trailer = base.MakeTrailer(
  1574  				b.rangeKeys[i].Keys[j].SeqNum()+seqNum,
  1575  				b.rangeKeys[i].Keys[j].Kind(),
  1576  			)
  1577  		}
  1578  	}
  1579  }
  1580  
  1581  func (b *flushableBatch) Len() int {
  1582  	return len(b.offsets)
  1583  }
  1584  
  1585  func (b *flushableBatch) Less(i, j int) bool {
  1586  	ei := &b.offsets[i]
  1587  	ej := &b.offsets[j]
  1588  	ki := b.data[ei.keyStart:ei.keyEnd]
  1589  	kj := b.data[ej.keyStart:ej.keyEnd]
  1590  	switch c := b.cmp(ki, kj); {
  1591  	case c < 0:
  1592  		return true
  1593  	case c > 0:
  1594  		return false
  1595  	default:
  1596  		return ei.offset > ej.offset
  1597  	}
  1598  }
  1599  
  1600  func (b *flushableBatch) Swap(i, j int) {
  1601  	b.offsets[i], b.offsets[j] = b.offsets[j], b.offsets[i]
  1602  }
  1603  
  1604  func (b *flushableBatch) newIter(o *IterOptions) internalIterator {
  1605  	return &flushableBatchIter{
  1606  		batch:   b,
  1607  		data:    b.data,
  1608  		offsets: b.offsets,
  1609  		cmp:     b.cmp,
  1610  		index:   -1,
  1611  		lower:   o.GetLowerBound(),
  1612  		upper:   o.GetUpperBound(),
  1613  	}
  1614  }
  1615  
  1616  func (b *flushableBatch) newFlushIter(o *IterOptions, bytesFlushed *uint64) internalIterator {
  1617  	return &flushFlushableBatchIter{
  1618  		flushableBatchIter: flushableBatchIter{
  1619  			batch:   b,
  1620  			data:    b.data,
  1621  			offsets: b.offsets,
  1622  			cmp:     b.cmp,
  1623  			index:   -1,
  1624  		},
  1625  		bytesIterated: bytesFlushed,
  1626  	}
  1627  }
  1628  
  1629  func (b *flushableBatch) newRangeDelIter(o *IterOptions) keyspan.FragmentIterator {
  1630  	if len(b.tombstones) == 0 {
  1631  		return nil
  1632  	}
  1633  	return keyspan.NewIter(b.cmp, b.tombstones)
  1634  }
  1635  
  1636  func (b *flushableBatch) newRangeKeyIter(o *IterOptions) keyspan.FragmentIterator {
  1637  	if len(b.rangeKeys) == 0 {
  1638  		return nil
  1639  	}
  1640  	return keyspan.NewIter(b.cmp, b.rangeKeys)
  1641  }
  1642  
  1643  func (b *flushableBatch) containsRangeKeys() bool { return len(b.rangeKeys) > 0 }
  1644  
  1645  func (b *flushableBatch) inuseBytes() uint64 {
  1646  	return uint64(len(b.data) - batchHeaderLen)
  1647  }
  1648  
  1649  func (b *flushableBatch) totalBytes() uint64 {
  1650  	return uint64(cap(b.data))
  1651  }
  1652  
  1653  func (b *flushableBatch) readyForFlush() bool {
  1654  	return true
  1655  }
  1656  
  1657  // Note: flushableBatchIter mirrors the implementation of batchIter. Keep the
  1658  // two in sync.
  1659  type flushableBatchIter struct {
  1660  	// Members to be initialized by creator.
  1661  	batch *flushableBatch
  1662  	// The bytes backing the batch. Always the same as batch.data?
  1663  	data []byte
  1664  	// The sorted entries. This is not always equal to batch.offsets.
  1665  	offsets []flushableBatchEntry
  1666  	cmp     Compare
  1667  	// Must be initialized to -1. It is the index into offsets that represents
  1668  	// the current iterator position.
  1669  	index int
  1670  
  1671  	// For internal use by the implementation.
  1672  	key InternalKey
  1673  	err error
  1674  
  1675  	// Optionally initialize to bounds of iteration, if any.
  1676  	lower []byte
  1677  	upper []byte
  1678  }
  1679  
  1680  // flushableBatchIter implements the base.InternalIterator interface.
  1681  var _ base.InternalIterator = (*flushableBatchIter)(nil)
  1682  
  1683  func (i *flushableBatchIter) String() string {
  1684  	return "flushable-batch"
  1685  }
  1686  
  1687  // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable
  1688  // package. Ignore flags.TrySeekUsingNext() since we don't expect this
  1689  // optimization to provide much benefit here at the moment.
  1690  func (i *flushableBatchIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
  1691  	i.err = nil
  1692  	ikey := base.MakeSearchKey(key)
  1693  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  1694  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  1695  	})
  1696  	if i.index >= len(i.offsets) {
  1697  		return nil, nil
  1698  	}
  1699  	i.key = i.getKey(i.index)
  1700  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1701  		i.index = len(i.offsets)
  1702  		return nil, nil
  1703  	}
  1704  	return &i.key, i.Value()
  1705  }
  1706  
  1707  // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
  1708  // bitalostable package.
  1709  func (i *flushableBatchIter) SeekPrefixGE(
  1710  	prefix, key []byte, flags base.SeekGEFlags,
  1711  ) (*base.InternalKey, []byte) {
  1712  	return i.SeekGE(key, flags)
  1713  }
  1714  
  1715  // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable
  1716  // package.
  1717  func (i *flushableBatchIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
  1718  	i.err = nil
  1719  	ikey := base.MakeSearchKey(key)
  1720  	i.index = sort.Search(len(i.offsets), func(j int) bool {
  1721  		return base.InternalCompare(i.cmp, ikey, i.getKey(j)) <= 0
  1722  	})
  1723  	i.index--
  1724  	if i.index < 0 {
  1725  		return nil, nil
  1726  	}
  1727  	i.key = i.getKey(i.index)
  1728  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1729  		i.index = -1
  1730  		return nil, nil
  1731  	}
  1732  	return &i.key, i.Value()
  1733  }
  1734  
  1735  // First implements internalIterator.First, as documented in the bitalostable
  1736  // package.
  1737  func (i *flushableBatchIter) First() (*InternalKey, []byte) {
  1738  	i.err = nil
  1739  	if len(i.offsets) == 0 {
  1740  		return nil, nil
  1741  	}
  1742  	i.index = 0
  1743  	i.key = i.getKey(i.index)
  1744  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1745  		i.index = len(i.offsets)
  1746  		return nil, nil
  1747  	}
  1748  	return &i.key, i.Value()
  1749  }
  1750  
  1751  // Last implements internalIterator.Last, as documented in the bitalostable
  1752  // package.
  1753  func (i *flushableBatchIter) Last() (*InternalKey, []byte) {
  1754  	i.err = nil
  1755  	if len(i.offsets) == 0 {
  1756  		return nil, nil
  1757  	}
  1758  	i.index = len(i.offsets) - 1
  1759  	i.key = i.getKey(i.index)
  1760  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1761  		i.index = -1
  1762  		return nil, nil
  1763  	}
  1764  	return &i.key, i.Value()
  1765  }
  1766  
  1767  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  1768  // flushableBatchIter.Next due to performance. Keep the two in sync.
  1769  func (i *flushableBatchIter) Next() (*InternalKey, []byte) {
  1770  	if i.index == len(i.offsets) {
  1771  		return nil, nil
  1772  	}
  1773  	i.index++
  1774  	if i.index == len(i.offsets) {
  1775  		return nil, nil
  1776  	}
  1777  	i.key = i.getKey(i.index)
  1778  	if i.upper != nil && i.cmp(i.key.UserKey, i.upper) >= 0 {
  1779  		i.index = len(i.offsets)
  1780  		return nil, nil
  1781  	}
  1782  	return &i.key, i.Value()
  1783  }
  1784  
  1785  func (i *flushableBatchIter) Prev() (*InternalKey, []byte) {
  1786  	if i.index < 0 {
  1787  		return nil, nil
  1788  	}
  1789  	i.index--
  1790  	if i.index < 0 {
  1791  		return nil, nil
  1792  	}
  1793  	i.key = i.getKey(i.index)
  1794  	if i.lower != nil && i.cmp(i.key.UserKey, i.lower) < 0 {
  1795  		i.index = -1
  1796  		return nil, nil
  1797  	}
  1798  	return &i.key, i.Value()
  1799  }
  1800  
  1801  func (i *flushableBatchIter) getKey(index int) InternalKey {
  1802  	e := &i.offsets[index]
  1803  	kind := InternalKeyKind(i.data[e.offset])
  1804  	key := i.data[e.keyStart:e.keyEnd]
  1805  	return base.MakeInternalKey(key, i.batch.seqNum+uint64(e.index), kind)
  1806  }
  1807  
  1808  func (i *flushableBatchIter) Key() *InternalKey {
  1809  	return &i.key
  1810  }
  1811  
  1812  func (i *flushableBatchIter) Value() []byte {
  1813  	p := i.data[i.offsets[i.index].offset:]
  1814  	if len(p) == 0 {
  1815  		i.err = base.CorruptionErrorf("corrupted batch")
  1816  		return nil
  1817  	}
  1818  	kind := InternalKeyKind(p[0])
  1819  	if kind > InternalKeyKindMax {
  1820  		i.err = base.CorruptionErrorf("corrupted batch")
  1821  		return nil
  1822  	}
  1823  	var value []byte
  1824  	var ok bool
  1825  	switch kind {
  1826  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete,
  1827  		InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  1828  		keyEnd := i.offsets[i.index].keyEnd
  1829  		_, value, ok = batchDecodeStr(i.data[keyEnd:])
  1830  		if !ok {
  1831  			i.err = base.CorruptionErrorf("corrupted batch")
  1832  			return nil
  1833  		}
  1834  	}
  1835  	return value
  1836  }
  1837  
  1838  func (i *flushableBatchIter) Valid() bool {
  1839  	return i.index >= 0 && i.index < len(i.offsets)
  1840  }
  1841  
  1842  func (i *flushableBatchIter) Error() error {
  1843  	return i.err
  1844  }
  1845  
  1846  func (i *flushableBatchIter) Close() error {
  1847  	return i.err
  1848  }
  1849  
  1850  func (i *flushableBatchIter) SetBounds(lower, upper []byte) {
  1851  	i.lower = lower
  1852  	i.upper = upper
  1853  }
  1854  
  1855  // flushFlushableBatchIter is similar to flushableBatchIter but it keeps track
  1856  // of number of bytes iterated.
  1857  type flushFlushableBatchIter struct {
  1858  	flushableBatchIter
  1859  	bytesIterated *uint64
  1860  }
  1861  
  1862  // flushFlushableBatchIter implements the base.InternalIterator interface.
  1863  var _ base.InternalIterator = (*flushFlushableBatchIter)(nil)
  1864  
  1865  func (i *flushFlushableBatchIter) String() string {
  1866  	return "flushable-batch"
  1867  }
  1868  
  1869  func (i *flushFlushableBatchIter) SeekGE(
  1870  	key []byte, flags base.SeekGEFlags,
  1871  ) (*InternalKey, []byte) {
  1872  	panic("bitalostable: SeekGE unimplemented")
  1873  }
  1874  
  1875  func (i *flushFlushableBatchIter) SeekPrefixGE(
  1876  	prefix, key []byte, flags base.SeekGEFlags,
  1877  ) (*base.InternalKey, []byte) {
  1878  	panic("bitalostable: SeekPrefixGE unimplemented")
  1879  }
  1880  
  1881  func (i *flushFlushableBatchIter) SeekLT(
  1882  	key []byte, flags base.SeekLTFlags,
  1883  ) (*InternalKey, []byte) {
  1884  	panic("bitalostable: SeekLT unimplemented")
  1885  }
  1886  
  1887  func (i *flushFlushableBatchIter) First() (*InternalKey, []byte) {
  1888  	i.err = nil
  1889  	key, val := i.flushableBatchIter.First()
  1890  	if key == nil {
  1891  		return nil, nil
  1892  	}
  1893  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  1894  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  1895  	return key, val
  1896  }
  1897  
  1898  // Note: flushFlushableBatchIter.Next mirrors the implementation of
  1899  // flushableBatchIter.Next due to performance. Keep the two in sync.
  1900  func (i *flushFlushableBatchIter) Next() (*InternalKey, []byte) {
  1901  	if i.index == len(i.offsets) {
  1902  		return nil, nil
  1903  	}
  1904  	i.index++
  1905  	if i.index == len(i.offsets) {
  1906  		return nil, nil
  1907  	}
  1908  	i.key = i.getKey(i.index)
  1909  	entryBytes := i.offsets[i.index].keyEnd - i.offsets[i.index].offset
  1910  	*i.bytesIterated += uint64(entryBytes) + i.valueSize()
  1911  	return &i.key, i.Value()
  1912  }
  1913  
  1914  func (i flushFlushableBatchIter) Prev() (*InternalKey, []byte) {
  1915  	panic("bitalostable: Prev unimplemented")
  1916  }
  1917  
  1918  func (i flushFlushableBatchIter) valueSize() uint64 {
  1919  	p := i.data[i.offsets[i.index].offset:]
  1920  	if len(p) == 0 {
  1921  		i.err = base.CorruptionErrorf("corrupted batch")
  1922  		return 0
  1923  	}
  1924  	kind := InternalKeyKind(p[0])
  1925  	if kind > InternalKeyKindMax {
  1926  		i.err = base.CorruptionErrorf("corrupted batch")
  1927  		return 0
  1928  	}
  1929  	var length uint64
  1930  	switch kind {
  1931  	case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindRangeDelete:
  1932  		keyEnd := i.offsets[i.index].keyEnd
  1933  		v, n := binary.Uvarint(i.data[keyEnd:])
  1934  		if n <= 0 {
  1935  			i.err = base.CorruptionErrorf("corrupted batch")
  1936  			return 0
  1937  		}
  1938  		length = v + uint64(n)
  1939  	}
  1940  	return length
  1941  }
  1942  
  1943  // batchSort returns iterators for the sorted contents of the batch. It is
  1944  // intended for testing use only. The batch.Sort dance is done to prevent
  1945  // exposing this method in the public bitalostable interface.
  1946  func batchSort(
  1947  	i interface{},
  1948  ) (
  1949  	points internalIterator,
  1950  	rangeDels keyspan.FragmentIterator,
  1951  	rangeKeys keyspan.FragmentIterator,
  1952  ) {
  1953  	b := i.(*Batch)
  1954  	if b.Indexed() {
  1955  		pointIter := b.newInternalIter(nil)
  1956  		rangeDelIter := b.newRangeDelIter(nil, math.MaxUint64)
  1957  		rangeKeyIter := b.newRangeKeyIter(nil, math.MaxUint64)
  1958  		return pointIter, rangeDelIter, rangeKeyIter
  1959  	}
  1960  	f := newFlushableBatch(b, b.db.opts.Comparer)
  1961  	return f.newIter(nil), f.newRangeDelIter(nil), f.newRangeKeyIter(nil)
  1962  }
  1963  
  1964  func init() {
  1965  	private.BatchSort = batchSort
  1966  }