github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/rangekey/rangekey.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  // Package rangekey provides facilities for encoding, decoding and merging range
     6  // keys.
     7  //
     8  // Range keys map a span of keyspace `[start, end)`, at an optional suffix, to a
     9  // value.
    10  //
    11  // # Encoding
    12  //
    13  // Unlike other Pebble keys, range keys encode several fields of information:
    14  // start key, end key, suffix and value. Internally within Pebble and its
    15  // sstables, all keys including range keys are represented as a key-value tuple.
    16  // Range keys map to internal key-value tuples by mapping the start key to the
    17  // key and encoding the remainder of the fields in the value.
    18  //
    19  // ## `RANGEKEYSET`
    20  //
    21  // A `RANGEKEYSET` represents one or more range keys set over a single region of
    22  // user key space. Each represented range key must have a unique suffix.  A
    23  // `RANGEKEYSET` encapsulates a start key, an end key and a set of SuffixValue
    24  // pairs.
    25  //
    26  // A `RANGEKEYSET` key's user key holds the start key. Its value is a varstring
    27  // end key, followed by a set of SuffixValue pairs. A `RANGEKEYSET` may have
    28  // multiple SuffixValue pairs if the keyspan was set at multiple unique suffix
    29  // values.
    30  //
    31  // ## `RANGEKEYUNSET`
    32  //
    33  // A `RANGEKEYUNSET` represents the removal of range keys at specific suffixes
    34  // over a single region of user key space. A `RANGEKEYUNSET` encapsulates a
    35  // start key, an end key and a set of suffixes.
    36  //
    37  // A `RANGEKEYUNSET` key's user key holds the start key. Its value is a
    38  // varstring end key, followed by a set of suffixes. A `RANGEKEYUNSET` may have
    39  // multiple suffixes if the keyspan was unset at multiple unique suffixes.
    40  //
    41  // ## `RANGEKEYDEL`
    42  //
    43  // A `RANGEKEYDEL` represents the removal of all range keys over a single region
    44  // of user key space, regardless of suffix. A `RANGEKEYDEL` encapsulates a
    45  // start key and an end key. The end key is stored in the value, without any
    46  // varstring length prefixing.
    47  package rangekey
    48  
    49  // TODO(jackson): Document the encoding of RANGEKEYSET and RANGEKEYUNSET values
    50  // once we're confident they're stable.
    51  
    52  import (
    53  	"encoding/binary"
    54  
    55  	"github.com/cockroachdb/errors"
    56  	"github.com/cockroachdb/pebble/internal/base"
    57  	"github.com/cockroachdb/pebble/internal/keyspan"
    58  )
    59  
    60  // Encode takes a Span containing only range keys. It invokes the provided
    61  // closure with the encoded internal keys that represent the Span's state. The
    62  // keys and values passed to emit are only valid until the closure returns.
    63  // If emit returns an error, Encode stops and returns the error.
    64  func Encode(s *keyspan.Span, emit func(k base.InternalKey, v []byte) error) error {
    65  	enc := Encoder{Emit: emit}
    66  	return enc.Encode(s)
    67  }
    68  
    69  // An Encoder encodes range keys into their on-disk InternalKey format. An
    70  // Encoder holds internal buffers, reused between Emit calls.
    71  type Encoder struct {
    72  	Emit   func(base.InternalKey, []byte) error
    73  	buf    []byte
    74  	unsets [][]byte
    75  	sets   []SuffixValue
    76  }
    77  
    78  // Encode takes a Span containing only range keys. It invokes the Encoder's Emit
    79  // closure with the encoded internal keys that represent the Span's state. The
    80  // keys and values passed to emit are only valid until the closure returns.  If
    81  // Emit returns an error, Encode stops and returns the error.
    82  //
    83  // The encoded key-value pair passed to Emit is only valid until the closure
    84  // completes.
    85  func (e *Encoder) Encode(s *keyspan.Span) error {
    86  	if s.Empty() {
    87  		return nil
    88  	}
    89  
    90  	// This for loop iterates through the span's keys, which are sorted by
    91  	// sequence number descending, grouping them into sequence numbers. All keys
    92  	// with identical sequence numbers are flushed together.
    93  	var del bool
    94  	var seqNum uint64
    95  	for i := range s.Keys {
    96  		if i == 0 || s.Keys[i].SeqNum() != seqNum {
    97  			if i > 0 {
    98  				// Flush all the existing internal keys that exist at seqNum.
    99  				if err := e.flush(s, seqNum, del); err != nil {
   100  					return err
   101  				}
   102  			}
   103  
   104  			// Reset sets, unsets, del.
   105  			seqNum = s.Keys[i].SeqNum()
   106  			del = false
   107  			e.sets = e.sets[:0]
   108  			e.unsets = e.unsets[:0]
   109  		}
   110  
   111  		switch s.Keys[i].Kind() {
   112  		case base.InternalKeyKindRangeKeySet:
   113  			e.sets = append(e.sets, SuffixValue{
   114  				Suffix: s.Keys[i].Suffix,
   115  				Value:  s.Keys[i].Value,
   116  			})
   117  		case base.InternalKeyKindRangeKeyUnset:
   118  			e.unsets = append(e.unsets, s.Keys[i].Suffix)
   119  		case base.InternalKeyKindRangeKeyDelete:
   120  			del = true
   121  		default:
   122  			return base.CorruptionErrorf("pebble: %s key kind is not a range key", s.Keys[i].Kind())
   123  		}
   124  	}
   125  	return e.flush(s, seqNum, del)
   126  }
   127  
   128  // flush constructs internal keys for accumulated key state, and emits the
   129  // internal keys.
   130  func (e *Encoder) flush(s *keyspan.Span, seqNum uint64, del bool) error {
   131  	if len(e.sets) > 0 {
   132  		ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeySet)
   133  		l := EncodedSetValueLen(s.End, e.sets)
   134  		if l > cap(e.buf) {
   135  			e.buf = make([]byte, l)
   136  		}
   137  		EncodeSetValue(e.buf[:l], s.End, e.sets)
   138  		if err := e.Emit(ik, e.buf[:l]); err != nil {
   139  			return err
   140  		}
   141  	}
   142  	if len(e.unsets) > 0 {
   143  		ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyUnset)
   144  		l := EncodedUnsetValueLen(s.End, e.unsets)
   145  		if l > cap(e.buf) {
   146  			e.buf = make([]byte, l)
   147  		}
   148  		EncodeUnsetValue(e.buf[:l], s.End, e.unsets)
   149  		if err := e.Emit(ik, e.buf[:l]); err != nil {
   150  			return err
   151  		}
   152  	}
   153  	if del {
   154  		ik := base.MakeInternalKey(s.Start, seqNum, base.InternalKeyKindRangeKeyDelete)
   155  		// s.End is stored directly in the value for RangeKeyDeletes.
   156  		if err := e.Emit(ik, s.End); err != nil {
   157  			return err
   158  		}
   159  	}
   160  	return nil
   161  }
   162  
   163  // Decode takes an internal key pair encoding range key(s) and returns a decoded
   164  // keyspan containing the keys. If keysDst is provided, keys will be appended to
   165  // keysDst.
   166  func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) (keyspan.Span, error) {
   167  	var s keyspan.Span
   168  
   169  	// Hydrate the user key bounds.
   170  	s.Start = ik.UserKey
   171  	var ok bool
   172  	s.End, v, ok = DecodeEndKey(ik.Kind(), v)
   173  	if !ok {
   174  		return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key end from %s", ik.Kind())
   175  	}
   176  	s.Keys = keysDst
   177  
   178  	// Hydrate the contents of the range key(s).
   179  	switch ik.Kind() {
   180  	case base.InternalKeyKindRangeKeySet:
   181  		for len(v) > 0 {
   182  			var sv SuffixValue
   183  			sv, v, ok = decodeSuffixValue(v)
   184  			if !ok {
   185  				return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key suffix-value tuple")
   186  			}
   187  			s.Keys = append(s.Keys, keyspan.Key{
   188  				Trailer: ik.Trailer,
   189  				Suffix:  sv.Suffix,
   190  				Value:   sv.Value,
   191  			})
   192  		}
   193  	case base.InternalKeyKindRangeKeyUnset:
   194  		for len(v) > 0 {
   195  			var suffix []byte
   196  			suffix, v, ok = decodeSuffix(v)
   197  			if !ok {
   198  				return keyspan.Span{}, base.CorruptionErrorf("pebble: unable to decode range key unset suffix")
   199  			}
   200  			s.Keys = append(s.Keys, keyspan.Key{
   201  				Trailer: ik.Trailer,
   202  				Suffix:  suffix,
   203  			})
   204  		}
   205  	case base.InternalKeyKindRangeKeyDelete:
   206  		if len(v) > 0 {
   207  			return keyspan.Span{}, base.CorruptionErrorf("pebble: RANGEKEYDELs must not contain additional data")
   208  		}
   209  		s.Keys = append(s.Keys, keyspan.Key{Trailer: ik.Trailer})
   210  	default:
   211  		return keyspan.Span{}, base.CorruptionErrorf("pebble: %s is not a range key", ik.Kind())
   212  	}
   213  	return s, nil
   214  }
   215  
   216  // SuffixValue represents a tuple of a suffix and a corresponding value. A
   217  // physical RANGEKEYSET key may contain many logical RangeKeySets, each
   218  // represented with a separate SuffixValue tuple.
   219  type SuffixValue struct {
   220  	Suffix []byte
   221  	Value  []byte
   222  }
   223  
   224  // encodedSetSuffixValuesLen precomputes the length of the given slice of
   225  // SuffixValues, when encoded for a RangeKeySet. It may be used to construct a
   226  // buffer of the appropriate size before encoding.
   227  func encodedSetSuffixValuesLen(suffixValues []SuffixValue) int {
   228  	var n int
   229  	for i := 0; i < len(suffixValues); i++ {
   230  		n += lenVarint(len(suffixValues[i].Suffix))
   231  		n += len(suffixValues[i].Suffix)
   232  		n += lenVarint(len(suffixValues[i].Value))
   233  		n += len(suffixValues[i].Value)
   234  	}
   235  	return n
   236  }
   237  
   238  // encodeSetSuffixValues encodes a slice of SuffixValues for a RangeKeySet into
   239  // dst. The length of dst must be greater than or equal to
   240  // encodedSetSuffixValuesLen. encodeSetSuffixValues returns the number of bytes
   241  // written, which should always equal the EncodedSetValueLen with the same
   242  // arguments.
   243  func encodeSetSuffixValues(dst []byte, suffixValues []SuffixValue) int {
   244  	// Encode the list of (suffix, value-len) tuples.
   245  	var n int
   246  	for i := 0; i < len(suffixValues); i++ {
   247  		// Encode the length of the suffix.
   248  		n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Suffix)))
   249  
   250  		// Encode the suffix itself.
   251  		n += copy(dst[n:], suffixValues[i].Suffix)
   252  
   253  		// Encode the value length.
   254  		n += binary.PutUvarint(dst[n:], uint64(len(suffixValues[i].Value)))
   255  
   256  		// Encode the value itself.
   257  		n += copy(dst[n:], suffixValues[i].Value)
   258  	}
   259  	return n
   260  }
   261  
   262  // EncodedSetValueLen precomputes the length of a RangeKeySet's value when
   263  // encoded. It may be used to construct a buffer of the appropriate size before
   264  // encoding.
   265  func EncodedSetValueLen(endKey []byte, suffixValues []SuffixValue) int {
   266  	n := lenVarint(len(endKey))
   267  	n += len(endKey)
   268  	n += encodedSetSuffixValuesLen(suffixValues)
   269  	return n
   270  }
   271  
   272  // EncodeSetValue encodes a RangeKeySet's value into dst. The length of dst must
   273  // be greater than or equal to EncodedSetValueLen. EncodeSetValue returns the
   274  // number of bytes written, which should always equal the EncodedSetValueLen
   275  // with the same arguments.
   276  func EncodeSetValue(dst []byte, endKey []byte, suffixValues []SuffixValue) int {
   277  	// First encode the end key as a varstring.
   278  	n := binary.PutUvarint(dst, uint64(len(endKey)))
   279  	n += copy(dst[n:], endKey)
   280  	n += encodeSetSuffixValues(dst[n:], suffixValues)
   281  	return n
   282  }
   283  
   284  // DecodeEndKey reads the end key from the beginning of a range key (RANGEKEYSET,
   285  // RANGEKEYUNSET or RANGEKEYDEL)'s physical encoded value. Both sets and unsets
   286  // encode the range key, plus additional data in the value.
   287  func DecodeEndKey(kind base.InternalKeyKind, data []byte) (endKey, value []byte, ok bool) {
   288  	switch kind {
   289  	case base.InternalKeyKindRangeKeyDelete:
   290  		// No splitting is necessary for range key deletes. The value is the end
   291  		// key, and there is no additional associated value.
   292  		return data, nil, true
   293  	case base.InternalKeyKindRangeKeySet, base.InternalKeyKindRangeKeyUnset:
   294  		v, n := binary.Uvarint(data)
   295  		if n <= 0 || uint64(n)+v >= uint64(len(data)) {
   296  			return nil, nil, false
   297  		}
   298  		endKey, value = data[n:n+int(v)], data[n+int(v):]
   299  		return endKey, value, true
   300  	default:
   301  		panic(errors.Newf("key kind %s is not a range key kind", kind))
   302  	}
   303  }
   304  
   305  // decodeSuffixValue decodes a single encoded SuffixValue from a RangeKeySet's
   306  // split value. The end key must have already been stripped from the
   307  // RangeKeySet's value (see DecodeEndKey).
   308  func decodeSuffixValue(data []byte) (sv SuffixValue, rest []byte, ok bool) {
   309  	// Decode the suffix.
   310  	sv.Suffix, data, ok = decodeVarstring(data)
   311  	if !ok {
   312  		return SuffixValue{}, nil, false
   313  	}
   314  	// Decode the value.
   315  	sv.Value, data, ok = decodeVarstring(data)
   316  	if !ok {
   317  		return SuffixValue{}, nil, false
   318  	}
   319  	return sv, data, true
   320  }
   321  
   322  // encodedUnsetSuffixesLen precomputes the length of the given slice of
   323  // suffixes, when encoded for a RangeKeyUnset. It may be used to construct a
   324  // buffer of the appropriate size before encoding.
   325  func encodedUnsetSuffixesLen(suffixes [][]byte) int {
   326  	var n int
   327  	for i := 0; i < len(suffixes); i++ {
   328  		n += lenVarint(len(suffixes[i]))
   329  		n += len(suffixes[i])
   330  	}
   331  	return n
   332  }
   333  
   334  // encodeUnsetSuffixes encodes a slice of suffixes for a RangeKeyUnset into dst.
   335  // The length of dst must be greater than or equal to EncodedUnsetSuffixesLen.
   336  // EncodeUnsetSuffixes returns the number of bytes written, which should always
   337  // equal the EncodedUnsetSuffixesLen with the same arguments.
   338  func encodeUnsetSuffixes(dst []byte, suffixes [][]byte) int {
   339  	// Encode the list of (suffix, value-len) tuples.
   340  	var n int
   341  	for i := 0; i < len(suffixes); i++ {
   342  		//  Encode the length of the suffix.
   343  		n += binary.PutUvarint(dst[n:], uint64(len(suffixes[i])))
   344  
   345  		// Encode the suffix itself.
   346  		n += copy(dst[n:], suffixes[i])
   347  	}
   348  	return n
   349  }
   350  
   351  // EncodedUnsetValueLen precomputes the length of a RangeKeyUnset's value when
   352  // encoded.  It may be used to construct a buffer of the appropriate size before
   353  // encoding.
   354  func EncodedUnsetValueLen(endKey []byte, suffixes [][]byte) int {
   355  	n := lenVarint(len(endKey))
   356  	n += len(endKey)
   357  	n += encodedUnsetSuffixesLen(suffixes)
   358  	return n
   359  }
   360  
   361  // EncodeUnsetValue encodes a RangeKeyUnset's value into dst. The length of dst
   362  // must be greater than or equal to EncodedUnsetValueLen. EncodeUnsetValue
   363  // returns the number of bytes written, which should always equal the
   364  // EncodedUnsetValueLen with the same arguments.
   365  func EncodeUnsetValue(dst []byte, endKey []byte, suffixes [][]byte) int {
   366  	// First encode the end key as a varstring.
   367  	n := binary.PutUvarint(dst, uint64(len(endKey)))
   368  	n += copy(dst[n:], endKey)
   369  	n += encodeUnsetSuffixes(dst[n:], suffixes)
   370  	return n
   371  }
   372  
   373  // decodeSuffix decodes a single suffix from the beginning of data. If decoding
   374  // suffixes from a RangeKeyUnset's value, the end key must have already been
   375  // stripped from the RangeKeyUnset's value (see DecodeEndKey).
   376  func decodeSuffix(data []byte) (suffix, rest []byte, ok bool) {
   377  	return decodeVarstring(data)
   378  }
   379  
   380  func decodeVarstring(data []byte) (v, rest []byte, ok bool) {
   381  	// Decode the length of the string.
   382  	l, n := binary.Uvarint(data)
   383  	if n <= 0 {
   384  		return nil, nil, ok
   385  	}
   386  
   387  	// Extract the string itself.
   388  	return data[n : n+int(l)], data[n+int(l):], true
   389  }
   390  
   391  // IsRangeKey returns true if the given key kind is one of the range key kinds.
   392  func IsRangeKey(kind base.InternalKeyKind) bool {
   393  	switch kind {
   394  	case base.InternalKeyKindRangeKeyDelete,
   395  		base.InternalKeyKindRangeKeyUnset,
   396  		base.InternalKeyKindRangeKeySet:
   397  		return true
   398  	default:
   399  		return false
   400  	}
   401  }
   402  
   403  func lenVarint(v int) (n int) {
   404  	x := uint32(v)
   405  	n++
   406  	for x >= 0x80 {
   407  		x >>= 7
   408  		n++
   409  	}
   410  	return n
   411  }