github.com/cockroachdb/pebble@v1.1.2/cmd/pebble/mvcc.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"bytes"
     9  
    10  	"github.com/cockroachdb/pebble"
    11  	"github.com/cockroachdb/pebble/internal/bytealloc"
    12  )
    13  
    14  // MVCC encoding and decoding routines adapted from CockroachDB sources. Used
    15  // to perform apples-to-apples benchmarking for CockroachDB's usage of RocksDB.
    16  
    17  var mvccComparer = &pebble.Comparer{
    18  	Compare: mvccCompare,
    19  
    20  	AbbreviatedKey: func(k []byte) uint64 {
    21  		key, _, ok := mvccSplitKey(k)
    22  		if !ok {
    23  			return 0
    24  		}
    25  		return pebble.DefaultComparer.AbbreviatedKey(key)
    26  	},
    27  
    28  	Equal: func(a, b []byte) bool {
    29  		return mvccCompare(a, b) == 0
    30  	},
    31  
    32  	Separator: func(dst, a, b []byte) []byte {
    33  		aKey, _, ok := mvccSplitKey(a)
    34  		if !ok {
    35  			return append(dst, a...)
    36  		}
    37  		bKey, _, ok := mvccSplitKey(b)
    38  		if !ok {
    39  			return append(dst, a...)
    40  		}
    41  		// If the keys are the same just return a.
    42  		if bytes.Equal(aKey, bKey) {
    43  			return append(dst, a...)
    44  		}
    45  		n := len(dst)
    46  		// MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
    47  		// pebble.DefaultComparer, so reuse the latter's Separator implementation.
    48  		dst = pebble.DefaultComparer.Separator(dst, aKey, bKey)
    49  		// Did it pick a separator different than aKey -- if it did not we can't do better than a.
    50  		buf := dst[n:]
    51  		if bytes.Equal(aKey, buf) {
    52  			return append(dst[:n], a...)
    53  		}
    54  		// The separator is > aKey, so we only need to add the timestamp sentinel.
    55  		return append(dst, 0)
    56  	},
    57  
    58  	Successor: func(dst, a []byte) []byte {
    59  		aKey, _, ok := mvccSplitKey(a)
    60  		if !ok {
    61  			return append(dst, a...)
    62  		}
    63  		n := len(dst)
    64  		// MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as
    65  		// pebble.DefaultComparer, so reuse the latter's Successor implementation.
    66  		dst = pebble.DefaultComparer.Successor(dst, aKey)
    67  		// Did it pick a successor different than aKey -- if it did not we can't do better than a.
    68  		buf := dst[n:]
    69  		if bytes.Equal(aKey, buf) {
    70  			return append(dst[:n], a...)
    71  		}
    72  		// The successor is > aKey, so we only need to add the timestamp sentinel.
    73  		return append(dst, 0)
    74  	},
    75  
    76  	Split: func(k []byte) int {
    77  		key, _, ok := mvccSplitKey(k)
    78  		if !ok {
    79  			return len(k)
    80  		}
    81  		// This matches the behavior of libroach/KeyPrefix. RocksDB requires that
    82  		// keys generated via a SliceTransform be comparable with normal encoded
    83  		// MVCC keys. Encoded MVCC keys have a suffix indicating the number of
    84  		// bytes of timestamp data. MVCC keys without a timestamp have a suffix of
    85  		// 0. We're careful in EncodeKey to make sure that the user-key always has
    86  		// a trailing 0. If there is no timestamp this falls out naturally. If
    87  		// there is a timestamp we prepend a 0 to the encoded timestamp data.
    88  		return len(key) + 1
    89  	},
    90  
    91  	Name: "cockroach_comparator",
    92  }
    93  
    94  func mvccSplitKey(mvccKey []byte) (key []byte, ts []byte, ok bool) {
    95  	if len(mvccKey) == 0 {
    96  		return nil, nil, false
    97  	}
    98  	n := len(mvccKey) - 1
    99  	tsLen := int(mvccKey[n])
   100  	if n < tsLen {
   101  		return nil, nil, false
   102  	}
   103  	key = mvccKey[:n-tsLen]
   104  	if tsLen > 0 {
   105  		ts = mvccKey[n-tsLen+1 : len(mvccKey)-1]
   106  	}
   107  	return key, ts, true
   108  }
   109  
   110  func mvccCompare(a, b []byte) int {
   111  	// NB: For performance, this routine manually splits the key into the
   112  	// user-key and timestamp components rather than using SplitMVCCKey. Don't
   113  	// try this at home kids: use SplitMVCCKey.
   114  
   115  	aEnd := len(a) - 1
   116  	bEnd := len(b) - 1
   117  	if aEnd < 0 || bEnd < 0 {
   118  		// This should never happen unless there is some sort of corruption of
   119  		// the keys. This is a little bizarre, but the behavior exactly matches
   120  		// engine/db.cc:DBComparator.
   121  		return bytes.Compare(a, b)
   122  	}
   123  
   124  	// Compute the index of the separator between the key and the timestamp.
   125  	aSep := aEnd - int(a[aEnd])
   126  	bSep := bEnd - int(b[bEnd])
   127  	if aSep < 0 || bSep < 0 {
   128  		// This should never happen unless there is some sort of corruption of
   129  		// the keys. This is a little bizarre, but the behavior exactly matches
   130  		// engine/db.cc:DBComparator.
   131  		return bytes.Compare(a, b)
   132  	}
   133  
   134  	// Compare the "user key" part of the key.
   135  	if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 {
   136  		return c
   137  	}
   138  
   139  	// Compare the timestamp part of the key.
   140  	aTS := a[aSep:aEnd]
   141  	bTS := b[bSep:bEnd]
   142  	if len(aTS) == 0 {
   143  		if len(bTS) == 0 {
   144  			return 0
   145  		}
   146  		return -1
   147  	} else if len(bTS) == 0 {
   148  		return 1
   149  	}
   150  	return bytes.Compare(bTS, aTS)
   151  }
   152  
   153  // <key>\x00[<wall_time>[<logical>]]<#timestamp-bytes>
   154  func mvccEncode(dst, key []byte, walltime uint64, logical uint32) []byte {
   155  	dst = append(dst, key...)
   156  	dst = append(dst, 0)
   157  	if walltime != 0 || logical != 0 {
   158  		extra := byte(1 + 8)
   159  		dst = encodeUint64Ascending(dst, walltime)
   160  		if logical != 0 {
   161  			dst = encodeUint32Ascending(dst, logical)
   162  			extra += 4
   163  		}
   164  		dst = append(dst, extra)
   165  	}
   166  	return dst
   167  }
   168  
   169  func mvccForwardScan(d DB, start, end, ts []byte) (int, int64) {
   170  	it := d.NewIter(&pebble.IterOptions{
   171  		LowerBound: mvccEncode(nil, start, 0, 0),
   172  		UpperBound: mvccEncode(nil, end, 0, 0),
   173  	})
   174  	defer it.Close()
   175  
   176  	var data bytealloc.A
   177  	var count int
   178  	var nbytes int64
   179  
   180  	for valid := it.First(); valid; valid = it.Next() {
   181  		key, keyTS, _ := mvccSplitKey(it.Key())
   182  		if bytes.Compare(keyTS, ts) <= 0 {
   183  			data, _ = data.Copy(key)
   184  			data, _ = data.Copy(it.Value())
   185  		}
   186  		count++
   187  		nbytes += int64(len(it.Key()) + len(it.Value()))
   188  	}
   189  	return count, nbytes
   190  }
   191  
   192  func mvccReverseScan(d DB, start, end, ts []byte) (int, int64) {
   193  	it := d.NewIter(&pebble.IterOptions{
   194  		LowerBound: mvccEncode(nil, start, 0, 0),
   195  		UpperBound: mvccEncode(nil, end, 0, 0),
   196  	})
   197  	defer it.Close()
   198  
   199  	var data bytealloc.A
   200  	var count int
   201  	var nbytes int64
   202  
   203  	for valid := it.Last(); valid; valid = it.Prev() {
   204  		key, keyTS, _ := mvccSplitKey(it.Key())
   205  		if bytes.Compare(keyTS, ts) <= 0 {
   206  			data, _ = data.Copy(key)
   207  			data, _ = data.Copy(it.Value())
   208  		}
   209  		count++
   210  		nbytes += int64(len(it.Key()) + len(it.Value()))
   211  	}
   212  	return count, nbytes
   213  }
   214  
   215  var fauxMVCCMerger = &pebble.Merger{
   216  	Name: "cockroach_merge_operator",
   217  	Merge: func(key, value []byte) (pebble.ValueMerger, error) {
   218  		// This merger is used by the compact benchmark and use the
   219  		// pebble default value merger to concatenate values.
   220  		// It shouldn't materially affect the benchmarks.
   221  		return pebble.DefaultMerger.Merge(key, value)
   222  	},
   223  }