github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/cmd/pebble/mvcc.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package main 6 7 import ( 8 "bytes" 9 10 "github.com/cockroachdb/pebble" 11 "github.com/cockroachdb/pebble/internal/bytealloc" 12 ) 13 14 // MVCC encoding and decoding routines adapted from CockroachDB sources. Used 15 // to perform apples-to-apples benchmarking for CockroachDB's usage of RocksDB. 16 17 var mvccComparer = &pebble.Comparer{ 18 Compare: mvccCompare, 19 20 AbbreviatedKey: func(k []byte) uint64 { 21 key, _, ok := mvccSplitKey(k) 22 if !ok { 23 return 0 24 } 25 return pebble.DefaultComparer.AbbreviatedKey(key) 26 }, 27 28 Equal: func(a, b []byte) bool { 29 return mvccCompare(a, b) == 0 30 }, 31 32 Separator: func(dst, a, b []byte) []byte { 33 aKey, _, ok := mvccSplitKey(a) 34 if !ok { 35 return append(dst, a...) 36 } 37 bKey, _, ok := mvccSplitKey(b) 38 if !ok { 39 return append(dst, a...) 40 } 41 // If the keys are the same just return a. 42 if bytes.Equal(aKey, bKey) { 43 return append(dst, a...) 44 } 45 n := len(dst) 46 // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as 47 // pebble.DefaultComparer, so reuse the latter's Separator implementation. 48 dst = pebble.DefaultComparer.Separator(dst, aKey, bKey) 49 // Did it pick a separator different than aKey -- if it did not we can't do better than a. 50 buf := dst[n:] 51 if bytes.Equal(aKey, buf) { 52 return append(dst[:n], a...) 53 } 54 // The separator is > aKey, so we only need to add the timestamp sentinel. 55 return append(dst, 0) 56 }, 57 58 Successor: func(dst, a []byte) []byte { 59 aKey, _, ok := mvccSplitKey(a) 60 if !ok { 61 return append(dst, a...) 62 } 63 n := len(dst) 64 // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as 65 // pebble.DefaultComparer, so reuse the latter's Successor implementation. 66 dst = pebble.DefaultComparer.Successor(dst, aKey) 67 // Did it pick a successor different than aKey -- if it did not we can't do better than a. 68 buf := dst[n:] 69 if bytes.Equal(aKey, buf) { 70 return append(dst[:n], a...) 71 } 72 // The successor is > aKey, so we only need to add the timestamp sentinel. 73 return append(dst, 0) 74 }, 75 76 Split: func(k []byte) int { 77 key, _, ok := mvccSplitKey(k) 78 if !ok { 79 return len(k) 80 } 81 // This matches the behavior of libroach/KeyPrefix. RocksDB requires that 82 // keys generated via a SliceTransform be comparable with normal encoded 83 // MVCC keys. Encoded MVCC keys have a suffix indicating the number of 84 // bytes of timestamp data. MVCC keys without a timestamp have a suffix of 85 // 0. We're careful in EncodeKey to make sure that the user-key always has 86 // a trailing 0. If there is no timestamp this falls out naturally. If 87 // there is a timestamp we prepend a 0 to the encoded timestamp data. 88 return len(key) + 1 89 }, 90 91 Name: "cockroach_comparator", 92 } 93 94 func mvccSplitKey(mvccKey []byte) (key []byte, ts []byte, ok bool) { 95 if len(mvccKey) == 0 { 96 return nil, nil, false 97 } 98 n := len(mvccKey) - 1 99 tsLen := int(mvccKey[n]) 100 if n < tsLen { 101 return nil, nil, false 102 } 103 key = mvccKey[:n-tsLen] 104 if tsLen > 0 { 105 ts = mvccKey[n-tsLen+1 : len(mvccKey)-1] 106 } 107 return key, ts, true 108 } 109 110 func mvccCompare(a, b []byte) int { 111 // NB: For performance, this routine manually splits the key into the 112 // user-key and timestamp components rather than using SplitMVCCKey. Don't 113 // try this at home kids: use SplitMVCCKey. 114 115 aEnd := len(a) - 1 116 bEnd := len(b) - 1 117 if aEnd < 0 || bEnd < 0 { 118 // This should never happen unless there is some sort of corruption of 119 // the keys. This is a little bizarre, but the behavior exactly matches 120 // engine/db.cc:DBComparator. 121 return bytes.Compare(a, b) 122 } 123 124 // Compute the index of the separator between the key and the timestamp. 125 aSep := aEnd - int(a[aEnd]) 126 bSep := bEnd - int(b[bEnd]) 127 if aSep < 0 || bSep < 0 { 128 // This should never happen unless there is some sort of corruption of 129 // the keys. This is a little bizarre, but the behavior exactly matches 130 // engine/db.cc:DBComparator. 131 return bytes.Compare(a, b) 132 } 133 134 // Compare the "user key" part of the key. 135 if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 { 136 return c 137 } 138 139 // Compare the timestamp part of the key. 140 aTS := a[aSep:aEnd] 141 bTS := b[bSep:bEnd] 142 if len(aTS) == 0 { 143 if len(bTS) == 0 { 144 return 0 145 } 146 return -1 147 } else if len(bTS) == 0 { 148 return 1 149 } 150 return bytes.Compare(bTS, aTS) 151 } 152 153 // <key>\x00[<wall_time>[<logical>]]<#timestamp-bytes> 154 func mvccEncode(dst, key []byte, walltime uint64, logical uint32) []byte { 155 dst = append(dst, key...) 156 dst = append(dst, 0) 157 if walltime != 0 || logical != 0 { 158 extra := byte(1 + 8) 159 dst = encodeUint64Ascending(dst, walltime) 160 if logical != 0 { 161 dst = encodeUint32Ascending(dst, logical) 162 extra += 4 163 } 164 dst = append(dst, extra) 165 } 166 return dst 167 } 168 169 func mvccForwardScan(d DB, start, end, ts []byte) (int, int64) { 170 it := d.NewIter(&pebble.IterOptions{ 171 LowerBound: mvccEncode(nil, start, 0, 0), 172 UpperBound: mvccEncode(nil, end, 0, 0), 173 }) 174 defer it.Close() 175 176 var data bytealloc.A 177 var count int 178 var nbytes int64 179 180 for valid := it.First(); valid; valid = it.Next() { 181 key, keyTS, _ := mvccSplitKey(it.Key()) 182 if bytes.Compare(keyTS, ts) <= 0 { 183 data, _ = data.Copy(key) 184 data, _ = data.Copy(it.Value()) 185 } 186 count++ 187 nbytes += int64(len(it.Key()) + len(it.Value())) 188 } 189 return count, nbytes 190 } 191 192 func mvccReverseScan(d DB, start, end, ts []byte) (int, int64) { 193 it := d.NewIter(&pebble.IterOptions{ 194 LowerBound: mvccEncode(nil, start, 0, 0), 195 UpperBound: mvccEncode(nil, end, 0, 0), 196 }) 197 defer it.Close() 198 199 var data bytealloc.A 200 var count int 201 var nbytes int64 202 203 for valid := it.Last(); valid; valid = it.Prev() { 204 key, keyTS, _ := mvccSplitKey(it.Key()) 205 if bytes.Compare(keyTS, ts) <= 0 { 206 data, _ = data.Copy(key) 207 data, _ = data.Copy(it.Value()) 208 } 209 count++ 210 nbytes += int64(len(it.Key()) + len(it.Value())) 211 } 212 return count, nbytes 213 } 214 215 var fauxMVCCMerger = &pebble.Merger{ 216 Name: "cockroach_merge_operator", 217 Merge: func(key, value []byte) (pebble.ValueMerger, error) { 218 // This merger is used by the compact benchmark and use the 219 // pebble default value merger to concatenate values. 220 // It shouldn't materially affect the benchmarks. 221 return pebble.DefaultMerger.Merge(key, value) 222 }, 223 }