github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/batch.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
	"encoding/binary"

	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble"
)

// BatchType represents the type of an entry in an encoded RocksDB batch.
type BatchType byte

// These constants come from rocksdb/db/dbformat.h.
const (
	BatchTypeDeletion BatchType = 0x0
	BatchTypeValue    BatchType = 0x1
	BatchTypeMerge    BatchType = 0x2
	BatchTypeLogData  BatchType = 0x3
	// BatchTypeColumnFamilyDeletion       BatchType = 0x4
	// BatchTypeColumnFamilyValue          BatchType = 0x5
	// BatchTypeColumnFamilyMerge          BatchType = 0x6
	BatchTypeSingleDeletion BatchType = 0x7
	// BatchTypeColumnFamilySingleDeletion BatchType = 0x8
	// BatchTypeBeginPrepareXID            BatchType = 0x9
	// BatchTypeEndPrepareXID              BatchType = 0xA
	// BatchTypeCommitXID                  BatchType = 0xB
	// BatchTypeRollbackXID                BatchType = 0xC
	// BatchTypeNoop                       BatchType = 0xD
	// BatchTypeColumnFamilyRangeDeletion  BatchType = 0xE
	BatchTypeRangeDeletion BatchType = 0xF
	// BatchTypeColumnFamilyBlobIndex      BatchType = 0x10
	// BatchTypeBlobIndex                  BatchType = 0x11
	// BatchMaxValue                       BatchType = 0x7F
)

const (
	// The batch header is composed of an 8-byte sequence number (all zeroes)
	// and a 4-byte count of the number of entries in the batch.
	headerSize       int = 12
	countPos             = 8
	initialBatchSize     = 1 << 10 // 1 KB
)
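
// As an illustrative sketch of the header layout (values are hypothetical): a
// repr holding two entries begins with the 12 bytes
//
//   repr[0:8]  = 00 00 00 00 00 00 00 00   (sequence number, always zero here)
//   repr[8:12] = 02 00 00 00               (entry count, little endian)
//
// followed immediately by the encoded records.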

// RocksDBBatchBuilder is used to construct the RocksDB batch representation.
// From the RocksDB code, the representation of a batch is:
//
//   WriteBatch::rep_ :=
//      sequence: fixed64
//      count: fixed32
//      data: record[count]
//   record :=
//      kTypeValue varstring varstring
//      kTypeDeletion varstring
//      [...] (see BatchType)
//   varstring :=
//      len: varint32
//      data: uint8[len]
//
// The RocksDBBatchBuilder code currently only supports kTypeValue
// (BatchTypeValue), kTypeDeletion (BatchTypeDeletion), kTypeMerge
// (BatchTypeMerge), and kTypeSingleDeletion (BatchTypeSingleDeletion)
// operations. Before a batch is written to the RocksDB write-ahead-log,
// the sequence number is 0. The "fixed32" format is little endian.
//
// The keys encoded into the batch are MVCC keys: a string key with a timestamp
// suffix. MVCC keys are encoded as:
//
//   <key>[<wall_time>[<logical>]]<#timestamp-bytes>
//
// The <wall_time> and <logical> portions of the key are encoded as 64 and
// 32-bit big-endian integers. A custom RocksDB comparator is used to maintain
// the desired ordering as these keys do not sort lexicographically correctly.
// Note that the encoding of these keys needs to match up with the encoding in
// rocksdb/db.cc:EncodeKey().
type RocksDBBatchBuilder struct {
	batch   pebble.Batch
	logData bool
}

func (b *RocksDBBatchBuilder) reset() {
	b.batch.Reset()
	b.logData = false
}

// Finish returns the constructed batch representation. After calling Finish,
// the builder may be used to construct another batch, but the returned []byte
// is only valid until the next builder method is called.
func (b *RocksDBBatchBuilder) Finish() []byte {
	repr := b.batch.Repr()
	b.reset()

	return repr
}
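
// A minimal usage sketch for the builder (the key and value below are
// hypothetical; roachpb is assumed to be in scope):
//
//   var b RocksDBBatchBuilder
//   b.Put(MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 1}}, []byte("val"))
//   b.Clear(MVCCKey{Key: roachpb.Key("b")})
//   repr := b.Finish()
//
// repr now holds the 12-byte header followed by one kTypeValue record and one
// kTypeDeletion record, and the builder has been reset for reuse.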

// Len returns the number of bytes currently in the repr under construction.
func (b *RocksDBBatchBuilder) Len() int {
	return len(b.batch.Repr())
}

var _ = (*RocksDBBatchBuilder).Len

// getRepr constructs the batch representation and returns it.
func (b *RocksDBBatchBuilder) getRepr() []byte {
	return b.batch.Repr()
}

// Put sets the given key to the value provided.
//
// It is safe to modify the contents of the arguments after Put returns.
func (b *RocksDBBatchBuilder) Put(key MVCCKey, value []byte) {
	keyLen := key.Len()
	deferredOp := b.batch.SetDeferred(keyLen, len(value))
	encodeKeyToBuf(deferredOp.Key, key, keyLen)
	copy(deferredOp.Value, value)
	// NB: the batch is not indexed, obviating the need to call
	// deferredOp.Finish.
}

// Merge is a high-performance write operation used for values which are
// accumulated over several writes. Multiple values can be merged sequentially
// into a single key; a subsequent read will return a "merged" value which is
// computed from the original merged values.
//
// It is safe to modify the contents of the arguments after Merge returns.
func (b *RocksDBBatchBuilder) Merge(key MVCCKey, value []byte) {
	keyLen := key.Len()
	deferredOp := b.batch.MergeDeferred(keyLen, len(value))
	encodeKeyToBuf(deferredOp.Key, key, keyLen)
	copy(deferredOp.Value, value)
	// NB: the batch is not indexed, obviating the need to call
	// deferredOp.Finish.
}

// Clear removes the item from the db with the given key.
//
// It is safe to modify the contents of the arguments after Clear returns.
func (b *RocksDBBatchBuilder) Clear(key MVCCKey) {
	keyLen := key.Len()
	deferredOp := b.batch.DeleteDeferred(keyLen)
	encodeKeyToBuf(deferredOp.Key, key, keyLen)
	// NB: the batch is not indexed, obviating the need to call
	// deferredOp.Finish.
}

// SingleClear removes the most recent item from the db with the given key.
//
// It is safe to modify the contents of the arguments after SingleClear returns.
func (b *RocksDBBatchBuilder) SingleClear(key MVCCKey) {
	keyLen := key.Len()
	deferredOp := b.batch.SingleDeleteDeferred(keyLen)
	encodeKeyToBuf(deferredOp.Key, key, keyLen)
	// NB: the batch is not indexed, obviating the need to call
	// deferredOp.Finish.
}

// LogData adds a blob of log data to the batch. It will be written to the WAL,
// but otherwise uninterpreted by RocksDB.
//
// It is safe to modify the contents of the arguments after LogData returns.
func (b *RocksDBBatchBuilder) LogData(data []byte) {
	_ = b.batch.LogData(data, nil)
	b.logData = true
}

// ApplyRepr applies the mutations in repr to the current batch.
//
// It is safe to modify the contents of the arguments after ApplyRepr
// returns.
func (b *RocksDBBatchBuilder) ApplyRepr(repr []byte) error {
	b2 := &pebble.Batch{}
	if err := b2.SetRepr(repr); err != nil {
		return err
	}

	return b.batch.Apply(b2, nil)
}
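
// For example, ApplyRepr can fold the output of one builder into another (a
// sketch with hypothetical keys k1, k2 and values v1, v2; error handling
// elided):
//
//   var b1, b2 RocksDBBatchBuilder
//   b1.Put(k1, v1)
//   b2.Put(k2, v2)
//   _ = b1.ApplyRepr(b2.Finish()) // b1 now contains both mutations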

// Count returns the count of memtable-modifying operations in this batch.
func (b *RocksDBBatchBuilder) Count() uint32 {
	return b.batch.Count()
}

// EncodeKey encodes an engine.MVCCKey into the RocksDB representation. This
// encoding must match with the encoding in engine/db.cc:EncodeKey().
func EncodeKey(key MVCCKey) []byte {
	keyLen := key.Len()
	buf := make([]byte, keyLen)
	encodeKeyToBuf(buf, key, keyLen)
	return buf
}

// EncodeKeyToBuf encodes an engine.MVCCKey into the RocksDB representation,
// reusing buf if it is large enough. This encoding must match with the
// encoding in engine/db.cc:EncodeKey().
func EncodeKeyToBuf(buf []byte, key MVCCKey) []byte {
	keyLen := key.Len()
	if cap(buf) < keyLen {
		buf = make([]byte, keyLen)
	} else {
		buf = buf[:keyLen]
	}
	encodeKeyToBuf(buf, key, keyLen)
	return buf
}

// encodeKeyToBuf encodes key into buf, which must have length keyLen
// (i.e. key.Len()); see EncodeKey.
func encodeKeyToBuf(buf []byte, key MVCCKey, keyLen int) {
	const (
		timestampSentinelLen = 1
		walltimeEncodedLen   = 8
		logicalEncodedLen    = 4
	)

	copy(buf, key.Key)

	pos := len(key.Key)
	timestampLength := keyLen - pos - 1
	if timestampLength > 0 {
		buf[pos] = 0
		pos += timestampSentinelLen
		binary.BigEndian.PutUint64(buf[pos:], uint64(key.Timestamp.WallTime))
		pos += walltimeEncodedLen
		if key.Timestamp.Logical != 0 {
			binary.BigEndian.PutUint32(buf[pos:], uint32(key.Timestamp.Logical))
			pos += logicalEncodedLen
		}
	}
	buf[len(buf)-1] = byte(timestampLength)
}
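
// A worked example of the encoding above (bytes shown in hex, key chosen
// arbitrarily): the key "ab" at WallTime=5, Logical=0 encodes to
//
//   61 62                     raw key "ab"
//   00                        timestamp sentinel byte
//   00 00 00 00 00 00 00 05   wall time, big endian
//   09                        #timestamp-bytes: 1 sentinel + 8 wall-time bytes
//
// A key with an empty timestamp is simply the raw key followed by a single
// 0x00 length byte.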

func encodeTimestamp(ts hlc.Timestamp) []byte {
	_, encodedTS, _ := enginepb.SplitMVCCKey(EncodeKey(MVCCKey{Timestamp: ts}))
	return encodedTS
}

// DecodeMVCCKey decodes an engine.MVCCKey from its serialized representation.
// This decoding must match engine/db.cc:DecodeKey().
func DecodeMVCCKey(encodedKey []byte) (MVCCKey, error) {
	k, ts, err := enginepb.DecodeKey(encodedKey)
	return MVCCKey{k, ts}, err
}
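
// DecodeMVCCKey inverts EncodeKey; a quick round-trip sketch (the key is
// hypothetical, roachpb assumed to be in scope):
//
//   k := MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 1, Logical: 2}}
//   dec, err := DecodeMVCCKey(EncodeKey(k))
//   // err == nil and dec is equal to k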

// rocksDBBatchDecodeHeader decodes the header of a RocksDB batch repr,
// returning the count of entries in the batch and a pebble.BatchReader for
// iterating the batch's records.
func rocksDBBatchDecodeHeader(repr []byte) (count int, orepr pebble.BatchReader, err error) {
	if len(repr) < headerSize {
		return 0, nil, errors.Errorf("batch repr too small: %d < %d", len(repr), headerSize)
	}
	seq := binary.LittleEndian.Uint64(repr[:countPos])
	if seq != 0 {
		return 0, nil, errors.Errorf("bad sequence: expected 0, but found %d", seq)
	}
	count = int(binary.LittleEndian.Uint32(repr[countPos:headerSize]))
	return count, pebble.MakeBatchReader(repr), nil
}

// RocksDBBatchReader is used to iterate the entries in a RocksDB batch
// representation.
//
// Example:
//
//   r, err := NewRocksDBBatchReader(...)
//   if err != nil {
//     return err
//   }
//   for r.Next() {
//     switch r.BatchType() {
//     case BatchTypeDeletion:
//       fmt.Printf("delete(%x)", r.Key())
//     case BatchTypeValue:
//       fmt.Printf("put(%x,%x)", r.Key(), r.Value())
//     case BatchTypeMerge:
//       fmt.Printf("merge(%x,%x)", r.Key(), r.Value())
//     case BatchTypeSingleDeletion:
//       fmt.Printf("single_delete(%x)", r.Key())
//     case BatchTypeRangeDeletion:
//       fmt.Printf("delete_range(%x,%x)", r.Key(), r.Value())
//     }
//   }
//   if err := r.Error(); err != nil {
//     return err
//   }
type RocksDBBatchReader struct {
	batchReader pebble.BatchReader

	// The error encountered during iteration, if any.
	err error

	// The total number of entries, decoded from the batch header.
	count int

	// The following all represent the current entry and are updated by Next.
	// `value` is not applicable for BatchTypeDeletion or BatchTypeSingleDeletion.
	// `value` indicates the end key for BatchTypeRangeDeletion.
	typ   BatchType
	key   []byte
	value []byte
}

// NewRocksDBBatchReader creates a RocksDBBatchReader from the given repr and
// verifies the header.
func NewRocksDBBatchReader(repr []byte) (*RocksDBBatchReader, error) {
	count, batchReader, err := rocksDBBatchDecodeHeader(repr)
	if err != nil {
		return nil, err
	}
	return &RocksDBBatchReader{batchReader: batchReader, count: count}, nil
}

// Count returns the declared number of entries in the batch.
func (r *RocksDBBatchReader) Count() int {
	return r.count
}

// Error returns the error, if any, which the iterator encountered.
func (r *RocksDBBatchReader) Error() error {
	return r.err
}

// BatchType returns the type of the current batch entry.
func (r *RocksDBBatchReader) BatchType() BatchType {
	return r.typ
}

// Key returns the key of the current batch entry.
func (r *RocksDBBatchReader) Key() []byte {
	return r.key
}

// decodeMVCCKey decodes an encoded key into an MVCCKey; see DecodeMVCCKey.
func decodeMVCCKey(k []byte) (MVCCKey, error) {
	k, ts, err := enginepb.DecodeKey(k)
	return MVCCKey{k, ts}, err
}

// MVCCKey returns the MVCC key of the current batch entry.
func (r *RocksDBBatchReader) MVCCKey() (MVCCKey, error) {
	return decodeMVCCKey(r.Key())
}

// Value returns the value of the current batch entry. Value panics if the
// BatchType is BatchTypeDeletion or BatchTypeSingleDeletion.
func (r *RocksDBBatchReader) Value() []byte {
	if r.typ == BatchTypeDeletion || r.typ == BatchTypeSingleDeletion {
		panic("cannot call Value on a deletion entry")
	}
	return r.value
}

// MVCCEndKey returns the MVCC end key of the current batch entry. It panics if
// the BatchType is not BatchTypeRangeDeletion.
func (r *RocksDBBatchReader) MVCCEndKey() (MVCCKey, error) {
	if r.typ != BatchTypeRangeDeletion {
		panic("can only call MVCCEndKey on a range deletion entry")
	}
	return decodeMVCCKey(r.Value())
}

// Next advances to the next entry in the batch, returning false when there are
// no more entries to iterate.
func (r *RocksDBBatchReader) Next() bool {
	kind, ukey, value, ok := r.batchReader.Next()

	r.typ = BatchType(kind)
	r.key = ukey
	r.value = value

	return ok
}

// RocksDBBatchCount provides an efficient way to get the count of mutations
// in a RocksDB Batch representation.
func RocksDBBatchCount(repr []byte) (int, error) {
	if len(repr) < headerSize {
		return 0, errors.Errorf("batch repr too small: %d < %d", len(repr), headerSize)
	}
	return int(binary.LittleEndian.Uint32(repr[countPos:headerSize])), nil
}
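
// Unlike iterating with a RocksDBBatchReader, RocksDBBatchCount only inspects
// the 12-byte header, so its cost is independent of the number of entries. A
// brief sketch (repr is assumed to come from a builder's Finish):
//
//   n, err := RocksDBBatchCount(repr)
//   // n is the entry count recorded in the batch header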