github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/base/internal.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package base // import "github.com/cockroachdb/pebble/internal/base" 6 7 import ( 8 "cmp" 9 "encoding/binary" 10 "fmt" 11 "strconv" 12 "strings" 13 14 "github.com/cockroachdb/redact" 15 ) 16 17 const ( 18 // SeqNumZero is the zero sequence number, set by compactions if they can 19 // guarantee there are no keys underneath an internal key. 20 SeqNumZero = uint64(0) 21 // SeqNumStart is the first sequence number assigned to a key. Sequence 22 // numbers 1-9 are reserved for potential future use. 23 SeqNumStart = uint64(10) 24 ) 25 26 // InternalKeyKind enumerates the kind of key: a deletion tombstone, a set 27 // value, a merged value, etc. 28 type InternalKeyKind uint8 29 30 // These constants are part of the file format, and should not be changed. 31 const ( 32 InternalKeyKindDelete InternalKeyKind = 0 33 InternalKeyKindSet InternalKeyKind = 1 34 InternalKeyKindMerge InternalKeyKind = 2 35 InternalKeyKindLogData InternalKeyKind = 3 36 //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4 37 //InternalKeyKindColumnFamilyValue InternalKeyKind = 5 38 //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6 39 40 // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization 41 // solely for compactions (to reduce write amp and space amp). Readers other 42 // than compactions should treat SINGLEDEL as equivalent to a DEL. 43 // Historically, it was simpler for readers other than compactions to treat 44 // SINGLEDEL as equivalent to DEL, but as of the introduction of 45 // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for 46 // correctness. 47 InternalKeyKindSingleDelete InternalKeyKind = 7 48 //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8 49 //InternalKeyKindBeginPrepareXID InternalKeyKind = 9 50 //InternalKeyKindEndPrepareXID InternalKeyKind = 10 51 //InternalKeyKindCommitXID InternalKeyKind = 11 52 //InternalKeyKindRollbackXID InternalKeyKind = 12 53 //InternalKeyKindNoop InternalKeyKind = 13 54 //InternalKeyKindColumnFamilyRangeDelete InternalKeyKind = 14 55 InternalKeyKindRangeDelete InternalKeyKind = 15 56 //InternalKeyKindColumnFamilyBlobIndex InternalKeyKind = 16 57 //InternalKeyKindBlobIndex InternalKeyKind = 17 58 59 // InternalKeyKindSeparator is a key used for separator / successor keys 60 // written to sstable block indexes. 61 // 62 // NOTE: the RocksDB value has been repurposed. This was done to ensure that 63 // keys written to block indexes with value "17" (when 17 happened to be the 64 // max value, and InternalKeyKindMax was therefore set to 17), remain stable 65 // when new key kinds are supported in Pebble. 66 InternalKeyKindSeparator InternalKeyKind = 17 67 68 // InternalKeyKindSetWithDelete keys are SET keys that have met with a 69 // DELETE or SINGLEDEL key in a prior compaction. This key kind is 70 // specific to Pebble. See 71 // https://github.com/cockroachdb/pebble/issues/1255. 72 InternalKeyKindSetWithDelete InternalKeyKind = 18 73 74 // InternalKeyKindRangeKeyDelete removes all range keys within a key range. 75 // See the internal/rangekey package for more details. 76 InternalKeyKindRangeKeyDelete InternalKeyKind = 19 77 // InternalKeyKindRangeKeySet and InternalKeyKindRangeUnset represent 78 // keys that set and unset values associated with ranges of key 79 // space. See the internal/rangekey package for more details. 80 InternalKeyKindRangeKeyUnset InternalKeyKind = 20 81 InternalKeyKindRangeKeySet InternalKeyKind = 21 82 83 // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to 84 // the WAL entry for ingested sstables that are added to the flushable 85 // queue. This InternalKeyKind cannot appear, amongst other key kinds in a 86 // batch, or in an sstable. 87 InternalKeyKindIngestSST InternalKeyKind = 22 88 89 // InternalKeyKindDeleteSized keys behave identically to 90 // InternalKeyKindDelete keys, except that they hold an associated uint64 91 // value indicating the (len(key)+len(value)) of the shadowed entry the 92 // tombstone is expected to delete. This value is used to inform compaction 93 // heuristics, but is not required to be accurate for correctness. 94 InternalKeyKindDeleteSized InternalKeyKind = 23 95 96 // This maximum value isn't part of the file format. Future extensions may 97 // increase this value. 98 // 99 // When constructing an internal key to pass to DB.Seek{GE,LE}, 100 // internalKeyComparer sorts decreasing by kind (after sorting increasing by 101 // user key and decreasing by sequence number). Thus, use InternalKeyKindMax, 102 // which sorts 'less than or equal to' any other valid internalKeyKind, when 103 // searching for any kind of internal key formed by a certain user key and 104 // seqNum. 105 InternalKeyKindMax InternalKeyKind = 23 106 107 // Internal to the sstable format. Not exposed by any sstable iterator. 108 // Declared here to prevent definition of valid key kinds that set this bit. 109 InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64 110 InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191 111 112 // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a 113 // zero sequence number. 114 InternalKeyZeroSeqnumMaxTrailer = uint64(255) 115 116 // A marker for an invalid key. 117 InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask 118 119 // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers 120 // which prevents those entries from being excluded from iteration. 121 InternalKeySeqNumBatch = uint64(1 << 55) 122 123 // InternalKeySeqNumMax is the largest valid sequence number. 124 InternalKeySeqNumMax = uint64(1<<56 - 1) 125 126 // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel 127 // key. This sequence number and kind are used for the upper stable boundary 128 // when a range deletion tombstone is the largest key in an sstable. This is 129 // necessary because sstable boundaries are inclusive, while the end key of a 130 // range deletion tombstone is exclusive. 131 InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete) 132 133 // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This 134 // sequence number and kind are used during interleaved range key and point 135 // iteration to allow an iterator to stop at range key start keys where 136 // there exists no point key. 137 InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet) 138 ) 139 140 // Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax 141 const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1) 142 143 var internalKeyKindNames = []string{ 144 InternalKeyKindDelete: "DEL", 145 InternalKeyKindSet: "SET", 146 InternalKeyKindMerge: "MERGE", 147 InternalKeyKindLogData: "LOGDATA", 148 InternalKeyKindSingleDelete: "SINGLEDEL", 149 InternalKeyKindRangeDelete: "RANGEDEL", 150 InternalKeyKindSeparator: "SEPARATOR", 151 InternalKeyKindSetWithDelete: "SETWITHDEL", 152 InternalKeyKindRangeKeySet: "RANGEKEYSET", 153 InternalKeyKindRangeKeyUnset: "RANGEKEYUNSET", 154 InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", 155 InternalKeyKindIngestSST: "INGESTSST", 156 InternalKeyKindDeleteSized: "DELSIZED", 157 InternalKeyKindInvalid: "INVALID", 158 } 159 160 func (k InternalKeyKind) String() string { 161 if int(k) < len(internalKeyKindNames) { 162 return internalKeyKindNames[k] 163 } 164 return fmt.Sprintf("UNKNOWN:%d", k) 165 } 166 167 // SafeFormat implements redact.SafeFormatter. 168 func (k InternalKeyKind) SafeFormat(w redact.SafePrinter, _ rune) { 169 w.Print(redact.SafeString(k.String())) 170 } 171 172 // InternalKey is a key used for the in-memory and on-disk partial DBs that 173 // make up a pebble DB. 174 // 175 // It consists of the user key (as given by the code that uses package pebble) 176 // followed by 8-bytes of metadata: 177 // - 1 byte for the type of internal key: delete or set, 178 // - 7 bytes for a uint56 sequence number, in little-endian format. 179 type InternalKey struct { 180 UserKey []byte 181 Trailer uint64 182 } 183 184 // InvalidInternalKey is an invalid internal key for which Valid() will return 185 // false. 186 var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid) 187 188 // MakeInternalKey constructs an internal key from a specified user key, 189 // sequence number and kind. 190 func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey { 191 return InternalKey{ 192 UserKey: userKey, 193 Trailer: (seqNum << 8) | uint64(kind), 194 } 195 } 196 197 // MakeTrailer constructs an internal key trailer from the specified sequence 198 // number and kind. 199 func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 { 200 return (seqNum << 8) | uint64(kind) 201 } 202 203 // MakeSearchKey constructs an internal key that is appropriate for searching 204 // for a the specified user key. The search key contain the maximal sequence 205 // number and kind ensuring that it sorts before any other internal keys for 206 // the same user key. 207 func MakeSearchKey(userKey []byte) InternalKey { 208 return InternalKey{ 209 UserKey: userKey, 210 Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax), 211 } 212 } 213 214 // MakeRangeDeleteSentinelKey constructs an internal key that is a range 215 // deletion sentinel key, used as the upper boundary for an sstable when a 216 // range deletion is the largest key in an sstable. 217 func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey { 218 return InternalKey{ 219 UserKey: userKey, 220 Trailer: InternalKeyRangeDeleteSentinel, 221 } 222 } 223 224 // MakeExclusiveSentinelKey constructs an internal key that is an 225 // exclusive sentinel key, used as the upper boundary for an sstable 226 // when a ranged key is the largest key in an sstable. 227 func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey { 228 return InternalKey{ 229 UserKey: userKey, 230 Trailer: (InternalKeySeqNumMax << 8) | uint64(kind), 231 } 232 } 233 234 var kindsMap = map[string]InternalKeyKind{ 235 "DEL": InternalKeyKindDelete, 236 "SINGLEDEL": InternalKeyKindSingleDelete, 237 "RANGEDEL": InternalKeyKindRangeDelete, 238 "LOGDATA": InternalKeyKindLogData, 239 "SET": InternalKeyKindSet, 240 "MERGE": InternalKeyKindMerge, 241 "INVALID": InternalKeyKindInvalid, 242 "SEPARATOR": InternalKeyKindSeparator, 243 "SETWITHDEL": InternalKeyKindSetWithDelete, 244 "RANGEKEYSET": InternalKeyKindRangeKeySet, 245 "RANGEKEYUNSET": InternalKeyKindRangeKeyUnset, 246 "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, 247 "INGESTSST": InternalKeyKindIngestSST, 248 "DELSIZED": InternalKeyKindDeleteSized, 249 } 250 251 // ParseInternalKey parses the string representation of an internal key. The 252 // format is <user-key>.<kind>.<seq-num>. If the seq-num starts with a "b" it 253 // is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set). 254 func ParseInternalKey(s string) InternalKey { 255 x := strings.Split(s, ".") 256 ukey := x[0] 257 kind, ok := kindsMap[x[1]] 258 if !ok { 259 panic(fmt.Sprintf("unknown kind: %q", x[1])) 260 } 261 j := 0 262 if x[2][0] == 'b' { 263 j = 1 264 } 265 seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64) 266 if x[2][0] == 'b' { 267 seqNum |= InternalKeySeqNumBatch 268 } 269 return MakeInternalKey([]byte(ukey), seqNum, kind) 270 } 271 272 // ParseKind parses the string representation of an internal key kind. 273 func ParseKind(s string) InternalKeyKind { 274 kind, ok := kindsMap[s] 275 if !ok { 276 panic(fmt.Sprintf("unknown kind: %q", s)) 277 } 278 return kind 279 } 280 281 // InternalTrailerLen is the number of bytes used to encode InternalKey.Trailer. 282 const InternalTrailerLen = 8 283 284 // DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode(). 285 func DecodeInternalKey(encodedKey []byte) InternalKey { 286 n := len(encodedKey) - InternalTrailerLen 287 var trailer uint64 288 if n >= 0 { 289 trailer = binary.LittleEndian.Uint64(encodedKey[n:]) 290 encodedKey = encodedKey[:n:n] 291 } else { 292 trailer = uint64(InternalKeyKindInvalid) 293 encodedKey = nil 294 } 295 return InternalKey{ 296 UserKey: encodedKey, 297 Trailer: trailer, 298 } 299 } 300 301 // InternalCompare compares two internal keys using the specified comparison 302 // function. For equal user keys, internal keys compare in descending sequence 303 // number order. For equal user keys and sequence numbers, internal keys 304 // compare in descending kind order (this may happen in practice among range 305 // keys). 306 func InternalCompare(userCmp Compare, a, b InternalKey) int { 307 if x := userCmp(a.UserKey, b.UserKey); x != 0 { 308 return x 309 } 310 // Reverse order for trailer comparison. 311 return cmp.Compare(b.Trailer, a.Trailer) 312 } 313 314 // Encode encodes the receiver into the buffer. The buffer must be large enough 315 // to hold the encoded data. See InternalKey.Size(). 316 func (k InternalKey) Encode(buf []byte) { 317 i := copy(buf, k.UserKey) 318 binary.LittleEndian.PutUint64(buf[i:], k.Trailer) 319 } 320 321 // EncodeTrailer returns the trailer encoded to an 8-byte array. 322 func (k InternalKey) EncodeTrailer() [8]byte { 323 var buf [8]byte 324 binary.LittleEndian.PutUint64(buf[:], k.Trailer) 325 return buf 326 } 327 328 // Separator returns a separator key such that k <= x && x < other, where less 329 // than is consistent with the Compare function. The buf parameter may be used 330 // to store the returned InternalKey.UserKey, though it is valid to pass a 331 // nil. See the Separator type for details on separator keys. 332 func (k InternalKey) Separator( 333 cmp Compare, sep Separator, buf []byte, other InternalKey, 334 ) InternalKey { 335 buf = sep(buf, k.UserKey, other.UserKey) 336 if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { 337 // The separator user key is physically shorter than k.UserKey (if it is 338 // longer, we'll continue to use "k"), but logically after. Tack on the max 339 // sequence number to the shortened user key. Note that we could tack on 340 // any sequence number and kind here to create a valid separator key. We 341 // use the max sequence number to match the behavior of LevelDB and 342 // RocksDB. 343 return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) 344 } 345 return k 346 } 347 348 // Successor returns a successor key such that k <= x. A simple implementation 349 // may return k unchanged. The buf parameter may be used to store the returned 350 // InternalKey.UserKey, though it is valid to pass a nil. 351 func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey { 352 buf = succ(buf, k.UserKey) 353 if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { 354 // The successor user key is physically shorter that k.UserKey (if it is 355 // longer, we'll continue to use "k"), but logically after. Tack on the max 356 // sequence number to the shortened user key. Note that we could tack on 357 // any sequence number and kind here to create a valid separator key. We 358 // use the max sequence number to match the behavior of LevelDB and 359 // RocksDB. 360 return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) 361 } 362 return k 363 } 364 365 // Size returns the encoded size of the key. 366 func (k InternalKey) Size() int { 367 return len(k.UserKey) + 8 368 } 369 370 // SetSeqNum sets the sequence number component of the key. 371 func (k *InternalKey) SetSeqNum(seqNum uint64) { 372 k.Trailer = (seqNum << 8) | (k.Trailer & 0xff) 373 } 374 375 // SeqNum returns the sequence number component of the key. 376 func (k InternalKey) SeqNum() uint64 { 377 return k.Trailer >> 8 378 } 379 380 // SeqNumFromTrailer returns the sequence number component of a trailer. 381 func SeqNumFromTrailer(t uint64) uint64 { 382 return t >> 8 383 } 384 385 // Visible returns true if the key is visible at the specified snapshot 386 // sequence number. 387 func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool { 388 return Visible(k.SeqNum(), snapshot, batchSnapshot) 389 } 390 391 // Visible returns true if a key with the provided sequence number is visible at 392 // the specified snapshot sequence numbers. 393 func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool { 394 // There are two snapshot sequence numbers, one for committed keys and one 395 // for batch keys. If a seqNum is less than `snapshot`, then seqNum 396 // corresponds to a committed key that is visible. If seqNum has its batch 397 // bit set, then seqNum corresponds to an uncommitted batch key. Its 398 // visible if its snapshot is less than batchSnapshot. 399 // 400 // There's one complication. The maximal sequence number 401 // (`InternalKeySeqNumMax`) is used across Pebble for exclusive sentinel 402 // keys and other purposes. The maximal sequence number has its batch bit 403 // set, but it can never be < `batchSnapshot`, since there is no expressible 404 // larger snapshot. We dictate that the maximal sequence number is always 405 // visible. 406 return seqNum < snapshot || 407 ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) || 408 seqNum == InternalKeySeqNumMax 409 } 410 411 // SetKind sets the kind component of the key. 412 func (k *InternalKey) SetKind(kind InternalKeyKind) { 413 k.Trailer = (k.Trailer &^ 0xff) | uint64(kind) 414 } 415 416 // Kind returns the kind component of the key. 417 func (k InternalKey) Kind() InternalKeyKind { 418 return TrailerKind(k.Trailer) 419 } 420 421 // TrailerKind returns the key kind of the key trailer. 422 func TrailerKind(trailer uint64) InternalKeyKind { 423 return InternalKeyKind(trailer & 0xff) 424 } 425 426 // Valid returns true if the key has a valid kind. 427 func (k InternalKey) Valid() bool { 428 return k.Kind() <= InternalKeyKindMax 429 } 430 431 // Clone clones the storage for the UserKey component of the key. 432 func (k InternalKey) Clone() InternalKey { 433 if len(k.UserKey) == 0 { 434 return k 435 } 436 return InternalKey{ 437 UserKey: append([]byte(nil), k.UserKey...), 438 Trailer: k.Trailer, 439 } 440 } 441 442 // CopyFrom converts this InternalKey into a clone of the passed-in InternalKey, 443 // reusing any space already used for the current UserKey. 444 func (k *InternalKey) CopyFrom(k2 InternalKey) { 445 k.UserKey = append(k.UserKey[:0], k2.UserKey...) 446 k.Trailer = k2.Trailer 447 } 448 449 // String returns a string representation of the key. 450 func (k InternalKey) String() string { 451 return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind()) 452 } 453 454 // Pretty returns a formatter for the key. 455 func (k InternalKey) Pretty(f FormatKey) fmt.Formatter { 456 return prettyInternalKey{k, f} 457 } 458 459 // IsExclusiveSentinel returns whether this internal key excludes point keys 460 // with the same user key if used as an end boundary. See the comment on 461 // InternalKeyRangeDeletionSentinel. 462 func (k InternalKey) IsExclusiveSentinel() bool { 463 switch kind := k.Kind(); kind { 464 case InternalKeyKindRangeDelete: 465 return k.Trailer == InternalKeyRangeDeleteSentinel 466 case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: 467 return (k.Trailer >> 8) == InternalKeySeqNumMax 468 default: 469 return false 470 } 471 } 472 473 type prettyInternalKey struct { 474 InternalKey 475 formatKey FormatKey 476 } 477 478 func (k prettyInternalKey) Format(s fmt.State, c rune) { 479 if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax { 480 fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind()) 481 } else { 482 fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind()) 483 } 484 } 485 486 // ParsePrettyInternalKey parses the pretty string representation of an 487 // internal key. The format is <user-key>#<seq-num>,<kind>. 488 func ParsePrettyInternalKey(s string) InternalKey { 489 x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' }) 490 ukey := x[0] 491 kind, ok := kindsMap[x[2]] 492 if !ok { 493 panic(fmt.Sprintf("unknown kind: %q", x[2])) 494 } 495 var seqNum uint64 496 if x[1] == "max" || x[1] == "inf" { 497 seqNum = InternalKeySeqNumMax 498 } else { 499 seqNum, _ = strconv.ParseUint(x[1], 10, 64) 500 } 501 return MakeInternalKey([]byte(ukey), seqNum, kind) 502 }