// Source: github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/rangekey/rangekey.go

// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

// Package rangekey provides facilities for encoding, decoding and merging range
// keys.
//
// Range keys map a span of keyspace `[start, end)`, at an optional suffix, to a
// value.
//
// # Encoding
//
// Unlike other Pebble keys, range keys encode several fields of information:
// start key, end key, suffix and value. Internally within Pebble and its
// sstables, all keys including range keys are represented as a key-value tuple.
// Range keys map to internal key-value tuples by mapping the start key to the
// key and encoding the remainder of the fields in the value.
//
// ## `RANGEKEYSET`
//
// A `RANGEKEYSET` represents one or more range keys set over a single region of
// user key space. Each represented range key must have a unique suffix. A
// `RANGEKEYSET` encapsulates a start key, an end key and a set of SuffixValue
// pairs.
//
// A `RANGEKEYSET` key's user key holds the start key. Its value is a varstring
// end key, followed by a set of SuffixValue pairs. A `RANGEKEYSET` may have
// multiple SuffixValue pairs if the keyspan was set at multiple unique suffix
// values.
//
// ## `RANGEKEYUNSET`
//
// A `RANGEKEYUNSET` represents the removal of range keys at specific suffixes
// over a single region of user key space. A `RANGEKEYUNSET` encapsulates a
// start key, an end key and a set of suffixes.
//
// A `RANGEKEYUNSET` key's user key holds the start key. Its value is a
// varstring end key, followed by a set of suffixes. A `RANGEKEYUNSET` may have
// multiple suffixes if the keyspan was unset at multiple unique suffixes.
40 // 41 // ## `RANGEKEYDEL` 42 // 43 // A `RANGEKEYDEL` represents the removal of all range keys over a single region 44 // of user key space, regardless of suffix. A `RANGEKEYDEL` encapsulates a 45 // start key and an end key. The end key is stored in the value, without any 46 // varstring length prefixing. 47 package rangekey 48 49 // TODO(jackson): Document the encoding of RANGEKEYSET and RANGEKEYUNSET values 50 // once we're confident they're stable. 51 52 import ( 53 "encoding/binary" 54 55 "github.com/cockroachdb/errors" 56 "github.com/cockroachdb/pebble/internal/base" 57 "github.com/cockroachdb/pebble/internal/keyspan" 58 ) 59 60 // Encode takes a Span containing only range keys. It invokes the provided 61 // closure with the encoded internal keys that represent the Span's state. The 62 // keys and values passed to emit are only valid until the closure returns. 63 // If emit returns an error, Encode stops and returns the error. 64 func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error { 65 enc := Encoder{Emit: emit} 66 return enc.Encode(s) 67 } 68 69 // An Encoder encodes range keys into their on-disk InternalKey format. An 70 // Encoder holds internal buffers, reused between Emit calls. 71 type Encoder struct { 72 Emit func(base.InternalKey, []byte) error 73 buf []byte 74 unsets [][]byte 75 sets []SuffixValue 76 } 77 78 // Encode takes a Span containing only range keys. It invokes the Encoder's Emit 79 // closure with the encoded internal keys that represent the Span's state. The 80 // keys and values passed to emit are only valid until the closure returns. If 81 // Emit returns an error, Encode stops and returns the error. 82 // 83 // The encoded key-value pair passed to Emit is only valid until the closure 84 // completes. 
85 func (e *Encoder) Encode(s *keyspan.Span) error { 86 if s.Empty() { 87 return nil 88 } 89 90 // This for loop iterates through the span's keys, which are sorted by 91 // sequence number descending, grouping them into sequence numbers. All keys 92 // with identical sequence numbers are flushed together. 93 var del bool 94 var seqNum uint64 95 for i := range s.Keys { 96 if i == 0 || s.Keys[i].SeqNum() != seqNum { 97 if i > 0 { 98 // Flush all the existing internal keys that exist at seqNum. 99 if err := e.flush(s, seqNum, del); err != nil { 100 return err 101 } 102 } 103 104 // Reset sets, unsets, del. 105 seqNum = s.Keys[i].SeqNum() 106 del = false 107 e.sets = e.sets[:0] 108 e.unsets = e.unsets[:0] 109 } 110 111 switch s.Keys[i].Kind() { 112 case base.InternalKeyKindRangeKeySet: 113 e.sets = append(e.sets, SuffixValue{ 114 Suffix: s.Keys[i].Suffix, 115 Value: s.Keys[i].Value, 116 }) 117 case base.InternalKeyKindRangeKeyUnset: 118 e.unsets = append(e.unsets, s.Keys[i].Suffix) 119 case base.InternalKeyKindRangeKeyDelete: 120 del = true 121 default: 122 return base.CorruptionErrorf("pebble: %s key kind is not a range key", s.Keys[i].Kind()) 123 } 124 } 125 return e.flush(s, seqNum, del) 126 } 127 128 // flush constructs internal keys for accumulated key state, and emits the 129 // internal keys. 
130 func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error { 131 if len(e.sets) > 0 { 132 ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet) 133 l := EncodedSetValueLen(s.End, e.sets) 134 if l > cap(e.buf) { 135 e.buf = make([]byte, l) 136 } 137 EncodeSetValue(e.buf[:l], s.End, e.sets) 138 if err := e.Emit(ik, e.buf[:l]); err != nil { 139 return err 140 } 141 } 142 if len(e.unsets) > 0 { 143 ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyUnset) 144 l := EncodedUnsetValueLen(s.End, e.unsets) 145 if l > cap(e.buf) { 146 e.buf = make([]byte, l) 147 } 148 EncodeUnsetValue(e.buf[:l], s.End, e.unsets) 149 if err := e.Emit(ik, e.buf[:l]); err != nil { 150 return err 151 } 152 } 153 if del { 154 ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyDelete) 155 // s.End is stored directly in the value for RangeKeyDeletes. 156 if err := e.Emit(ik, s.End); err != nil { 157 return err 158 } 159 } 160 return nil 161 } 162 163 // Decode takes an internal key pair encoding range key(s) and returns a decoded 164 // keyspan containing the keys. If keysDst is provided, keys will be appended to 165 // keysDst. 166 func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error) { 167 var s keyspan.Span 168 169 // Hydrate the user key bounds. 170 s.Start = ik.UserKey 171 var ok bool 172 s.End, v, ok = DecodeEndKey(ik.Kind(), v) 173 if !ok { 174 return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key end from %s", ik.Kind()) 175 } 176 s.Keys = keysDst 177 178 // Hydrate the contents of the range key(s). 
179 switch ik.Kind() { 180 case base.InternalKeyKindRangeKeySet: 181 for len(v) > 0 { 182 var sv SuffixValue 183 sv, v, ok = decodeSuffixValue(v) 184 if !ok { 185 return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple") 186 } 187 s.Keys = append(s.Keys, keyspan.Key{ 188 Trailer: ik.Trailer, 189 Suffix: sv.Suffix, 190 Value: sv.Value, 191 }) 192 } 193 case base.InternalKeyKindRangeKeyUnset: 194 for len(v) > 0 { 195 var suffix []byte 196 suffix, v, ok = decodeSuffix(v) 197 if !ok { 198 return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key unset suffix") 199 } 200 s.Keys = append(s.Keys, keyspan.Key{ 201 Trailer: ik.Trailer, 202 Suffix: suffix, 203 }) 204 } 205 case base.InternalKeyKindRangeKeyDelete: 206 if len(v) > 0 { 207 return keyspan.Span{}, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data") 208 } 209 s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer}) 210 default: 211 return keyspan.Span{}, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind()) 212 } 213 return s, nil 214 } 215 216 // SuffixValue represents a tuple of a suffix and a corresponding value. A 217 // physical RANGEKEYSET key may contain many logical RangeKeySets, each 218 // represented with a separate SuffixValue tuple. 219 type SuffixValue struct { 220 Suffix []byte 221 Value []byte 222 } 223 224 // encodedSetSuffixValuesLen precomputes the length of the given slice of 225 // SuffixValues, when encoded for a RangeKeySet. It may be used to construct a 226 // buffer of the appropriate size before encoding. 
227 func encodedSetSuffixValuesLen(suffixValues []SuffixValue) int { 228 var n int 229 for i := 0; i < len(suffixValues); i++ { 230 n += lenVarint(len(suffixValues[i].Suffix)) 231 n += len(suffixValues[i].Suffix) 232 n += lenVarint(len(suffixValues[i].Value)) 233 n += len(suffixValues[i].Value) 234 } 235 return n 236 } 237 238 // encodeSetSuffixValues encodes a slice of SuffixValues for a RangeKeySet into 239 // dst. The length of dst must be greater than or equal to 240 // encodedSetSuffixValuesLen. encodeSetSuffixValues returns the number of bytes 241 // written, which should always equal the EncodedSetValueLen with the same 242 // arguments. 243 func encodeSetSuffixValues(dst []byte, suffixValues []SuffixValue) int { 244 // Encode the list of (suffix, value-len) tuples. 245 var n int 246 for i := 0; i < len(suffixValues); i++ { 247 // Encode the length of the suffix. 248 n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Suffix))) 249 250 // Encode the suffix itself. 251 n += copy(dst[n:], suffixValues[i].Suffix) 252 253 // Encode the value length. 254 n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Value))) 255 256 // Encode the value itself. 257 n += copy(dst[n:], suffixValues[i].Value) 258 } 259 return n 260 } 261 262 // EncodedSetValueLen precomputes the length of a RangeKeySet's value when 263 // encoded. It may be used to construct a buffer of the appropriate size before 264 // encoding. 265 func EncodedSetValueLen(endKey []byte, suffixValues []SuffixValue) int { 266 n := lenVarint(len(endKey)) 267 n += len(endKey) 268 n += encodedSetSuffixValuesLen(suffixValues) 269 return n 270 } 271 272 // EncodeSetValue encodes a RangeKeySet's value into dst. The length of dst must 273 // be greater than or equal to EncodedSetValueLen. EncodeSetValue returns the 274 // number of bytes written, which should always equal the EncodedSetValueLen 275 // with the same arguments. 
276 func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int { 277 // First encode the end key as a varstring. 278 n := binary.PutUvarint(dst, uint64(len(endKey))) 279 n += copy(dst[n:], endKey) 280 n += encodeSetSuffixValues(dst[n:], suffixValues) 281 return n 282 } 283 284 // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET, 285 // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets 286 // encode the range key, plus additional data in the value. 287 func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, ok bool) { 288 switch kind { 289 case base.InternalKeyKindRangeKeyDelete: 290 // No splitting is necessary for range key deletes. The value is the end 291 // key, and there is no additional associated value. 292 return data, nil, true 293 case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset: 294 v, n := binary.Uvarint(data) 295 if n <= 0 || uint64(n)+v >= uint64(len(data)) { 296 return nil, nil, false 297 } 298 endKey, value = data[n:n+int(v)], data[n+int(v):] 299 return endKey, value, true 300 default: 301 panic(errors.Newf("key kind %s is not a range key kind", kind)) 302 } 303 } 304 305 // decodeSuffixValue decodes a single encoded SuffixValue from a RangeKeySet's 306 // split value. The end key must have already been stripped from the 307 // RangeKeySet's value (see DecodeEndKey). 308 func decodeSuffixValue(data []byte) (sv SuffixValue, rest []byte, ok bool) { 309 // Decode the suffix. 310 sv.Suffix, data, ok = decodeVarstring(data) 311 if !ok { 312 return SuffixValue{}, nil, false 313 } 314 // Decode the value. 315 sv.Value, data, ok = decodeVarstring(data) 316 if !ok { 317 return SuffixValue{}, nil, false 318 } 319 return sv, data, true 320 } 321 322 // encodedUnsetSuffixesLen precomputes the length of the given slice of 323 // suffixes, when encoded for a RangeKeyUnset. 
It may be used to construct a 324 // buffer of the appropriate size before encoding. 325 func encodedUnsetSuffixesLen(suffixes [][]byte) int { 326 var n int 327 for i := 0; i < len(suffixes); i++ { 328 n += lenVarint(len(suffixes[i])) 329 n += len(suffixes[i]) 330 } 331 return n 332 } 333 334 // encodeUnsetSuffixes encodes a slice of suffixes for a RangeKeyUnset into dst. 335 // The length of dst must be greater than or equal to EncodedUnsetSuffixesLen. 336 // EncodeUnsetSuffixes returns the number of bytes written, which should always 337 // equal the EncodedUnsetSuffixesLen with the same arguments. 338 func encodeUnsetSuffixes(dst []byte, suffixes [][]byte) int { 339 // Encode the list of (suffix, value-len) tuples. 340 var n int 341 for i := 0; i < len(suffixes); i++ { 342 // Encode the length of the suffix. 343 n += binary.PutUvarint(dst[n:], uint64(len(suffixes[i]))) 344 345 // Encode the suffix itself. 346 n += copy(dst[n:], suffixes[i]) 347 } 348 return n 349 } 350 351 // EncodedUnsetValueLen precomputes the length of a RangeKeyUnset's value when 352 // encoded. It may be used to construct a buffer of the appropriate size before 353 // encoding. 354 func EncodedUnsetValueLen(endKey []byte, suffixes [][]byte) int { 355 n := lenVarint(len(endKey)) 356 n += len(endKey) 357 n += encodedUnsetSuffixesLen(suffixes) 358 return n 359 } 360 361 // EncodeUnsetValue encodes a RangeKeyUnset's value into dst. The length of dst 362 // must be greater than or equal to EncodedUnsetValueLen. EncodeUnsetValue 363 // returns the number of bytes written, which should always equal the 364 // EncodedUnsetValueLen with the same arguments. 365 func EncodeUnsetValue(dst []byte, endKey []byte, suffixes [][]byte) int { 366 // First encode the end key as a varstring. 
367 n := binary.PutUvarint(dst, uint64(len(endKey))) 368 n += copy(dst[n:], endKey) 369 n += encodeUnsetSuffixes(dst[n:], suffixes) 370 return n 371 } 372 373 // decodeSuffix decodes a single suffix from the beginning of data. If decoding 374 // suffixes from a RangeKeyUnset's value, the end key must have already been 375 // stripped from the RangeKeyUnset's value (see DecodeEndKey). 376 func decodeSuffix(data []byte) (suffix, rest []byte, ok bool) { 377 return decodeVarstring(data) 378 } 379 380 func decodeVarstring(data []byte) (v, rest []byte, ok bool) { 381 // Decode the length of the string. 382 l, n := binary.Uvarint(data) 383 if n <= 0 { 384 return nil, nil, ok 385 } 386 387 // Extract the string itself. 388 return data[n : n+int(l)], data[n+int(l):], true 389 } 390 391 // IsRangeKey returns true if the given key kind is one of the range key kinds. 392 func IsRangeKey(kind base.InternalKeyKind) bool { 393 switch kind { 394 case base.InternalKeyKindRangeKeyDelete, 395 base.InternalKeyKindRangeKeyUnset, 396 base.InternalKeyKindRangeKeySet: 397 return true 398 default: 399 return false 400 } 401 } 402 403 func lenVarint(v int) (n int) { 404 x := uint32(v) 405 n++ 406 for x >= 0x80 { 407 x >>= 7 408 n++ 409 } 410 return n 411 }