github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/base/internal.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package base // import "github.com/cockroachdb/pebble/internal/base" 6 7 import ( 8 "encoding/binary" 9 "fmt" 10 "strconv" 11 "strings" 12 13 "github.com/cockroachdb/redact" 14 ) 15 16 const ( 17 // SeqNumZero is the zero sequence number, set by compactions if they can 18 // guarantee there are no keys underneath an internal key. 19 SeqNumZero = uint64(0) 20 // SeqNumStart is the first sequence number assigned to a key. Sequence 21 // numbers 1-9 are reserved for potential future use. 22 SeqNumStart = uint64(10) 23 ) 24 25 // InternalKeyKind enumerates the kind of key: a deletion tombstone, a set 26 // value, a merged value, etc. 27 type InternalKeyKind uint8 28 29 // These constants are part of the file format, and should not be changed. 30 const ( 31 InternalKeyKindDelete InternalKeyKind = 0 32 InternalKeyKindSet InternalKeyKind = 1 33 InternalKeyKindMerge InternalKeyKind = 2 34 InternalKeyKindLogData InternalKeyKind = 3 35 //InternalKeyKindColumnFamilyDeletion InternalKeyKind = 4 36 //InternalKeyKindColumnFamilyValue InternalKeyKind = 5 37 //InternalKeyKindColumnFamilyMerge InternalKeyKind = 6 38 39 // InternalKeyKindSingleDelete (SINGLEDEL) is a performance optimization 40 // solely for compactions (to reduce write amp and space amp). Readers other 41 // than compactions should treat SINGLEDEL as equivalent to a DEL. 42 // Historically, it was simpler for readers other than compactions to treat 43 // SINGLEDEL as equivalent to DEL, but as of the introduction of 44 // InternalKeyKindSSTableInternalObsoleteBit, this is also necessary for 45 // correctness. 46 InternalKeyKindSingleDelete InternalKeyKind = 7 47 //InternalKeyKindColumnFamilySingleDelete InternalKeyKind = 8 48 //InternalKeyKindBeginPrepareXID InternalKeyKind = 9 49 //InternalKeyKindEndPrepareXID InternalKeyKind = 10 50 //InternalKeyKindCommitXID InternalKeyKind = 11 51 //InternalKeyKindRollbackXID InternalKeyKind = 12 52 //InternalKeyKindNoop InternalKeyKind = 13 53 //InternalKeyKindColumnFamilyRangeDelete InternalKeyKind = 14 54 InternalKeyKindRangeDelete InternalKeyKind = 15 55 //InternalKeyKindColumnFamilyBlobIndex InternalKeyKind = 16 56 //InternalKeyKindBlobIndex InternalKeyKind = 17 57 58 // InternalKeyKindSeparator is a key used for separator / successor keys 59 // written to sstable block indexes. 60 // 61 // NOTE: the RocksDB value has been repurposed. This was done to ensure that 62 // keys written to block indexes with value "17" (when 17 happened to be the 63 // max value, and InternalKeyKindMax was therefore set to 17), remain stable 64 // when new key kinds are supported in Pebble. 65 InternalKeyKindSeparator InternalKeyKind = 17 66 67 // InternalKeyKindSetWithDelete keys are SET keys that have met with a 68 // DELETE or SINGLEDEL key in a prior compaction. This key kind is 69 // specific to Pebble. See 70 // https://github.com/cockroachdb/pebble/issues/1255. 71 InternalKeyKindSetWithDelete InternalKeyKind = 18 72 73 // InternalKeyKindRangeKeyDelete removes all range keys within a key range. 74 // See the internal/rangekey package for more details. 75 InternalKeyKindRangeKeyDelete InternalKeyKind = 19 76 // InternalKeyKindRangeKeySet and InternalKeyKindRangeUnset represent 77 // keys that set and unset values associated with ranges of key 78 // space. See the internal/rangekey package for more details. 79 InternalKeyKindRangeKeyUnset InternalKeyKind = 20 80 InternalKeyKindRangeKeySet InternalKeyKind = 21 81 82 // InternalKeyKindIngestSST is used to distinguish a batch that corresponds to 83 // the WAL entry for ingested sstables that are added to the flushable 84 // queue. This InternalKeyKind cannot appear, amongst other key kinds in a 85 // batch, or in an sstable. 86 InternalKeyKindIngestSST InternalKeyKind = 22 87 88 // InternalKeyKindDeleteSized keys behave identically to 89 // InternalKeyKindDelete keys, except that they hold an associated uint64 90 // value indicating the (len(key)+len(value)) of the shadowed entry the 91 // tombstone is expected to delete. This value is used to inform compaction 92 // heuristics, but is not required to be accurate for correctness. 93 InternalKeyKindDeleteSized InternalKeyKind = 23 94 95 // This maximum value isn't part of the file format. Future extensions may 96 // increase this value. 97 // 98 // When constructing an internal key to pass to DB.Seek{GE,LE}, 99 // internalKeyComparer sorts decreasing by kind (after sorting increasing by 100 // user key and decreasing by sequence number). Thus, use InternalKeyKindMax, 101 // which sorts 'less than or equal to' any other valid internalKeyKind, when 102 // searching for any kind of internal key formed by a certain user key and 103 // seqNum. 104 InternalKeyKindMax InternalKeyKind = 23 105 106 // Internal to the sstable format. Not exposed by any sstable iterator. 107 // Declared here to prevent definition of valid key kinds that set this bit. 108 InternalKeyKindSSTableInternalObsoleteBit InternalKeyKind = 64 109 InternalKeyKindSSTableInternalObsoleteMask InternalKeyKind = 191 110 111 // InternalKeyZeroSeqnumMaxTrailer is the largest trailer with a 112 // zero sequence number. 113 InternalKeyZeroSeqnumMaxTrailer = uint64(255) 114 115 // A marker for an invalid key. 116 InternalKeyKindInvalid InternalKeyKind = InternalKeyKindSSTableInternalObsoleteMask 117 118 // InternalKeySeqNumBatch is a bit that is set on batch sequence numbers 119 // which prevents those entries from being excluded from iteration. 120 InternalKeySeqNumBatch = uint64(1 << 55) 121 122 // InternalKeySeqNumMax is the largest valid sequence number. 123 InternalKeySeqNumMax = uint64(1<<56 - 1) 124 125 // InternalKeyRangeDeleteSentinel is the marker for a range delete sentinel 126 // key. This sequence number and kind are used for the upper stable boundary 127 // when a range deletion tombstone is the largest key in an sstable. This is 128 // necessary because sstable boundaries are inclusive, while the end key of a 129 // range deletion tombstone is exclusive. 130 InternalKeyRangeDeleteSentinel = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeDelete) 131 132 // InternalKeyBoundaryRangeKey is the marker for a range key boundary. This 133 // sequence number and kind are used during interleaved range key and point 134 // iteration to allow an iterator to stop at range key start keys where 135 // there exists no point key. 136 InternalKeyBoundaryRangeKey = (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindRangeKeySet) 137 ) 138 139 // Assert InternalKeyKindSSTableInternalObsoleteBit > InternalKeyKindMax 140 const _ = uint(InternalKeyKindSSTableInternalObsoleteBit - InternalKeyKindMax - 1) 141 142 var internalKeyKindNames = []string{ 143 InternalKeyKindDelete: "DEL", 144 InternalKeyKindSet: "SET", 145 InternalKeyKindMerge: "MERGE", 146 InternalKeyKindLogData: "LOGDATA", 147 InternalKeyKindSingleDelete: "SINGLEDEL", 148 InternalKeyKindRangeDelete: "RANGEDEL", 149 InternalKeyKindSeparator: "SEPARATOR", 150 InternalKeyKindSetWithDelete: "SETWITHDEL", 151 InternalKeyKindRangeKeySet: "RANGEKEYSET", 152 InternalKeyKindRangeKeyUnset: "RANGEKEYUNSET", 153 InternalKeyKindRangeKeyDelete: "RANGEKEYDEL", 154 InternalKeyKindIngestSST: "INGESTSST", 155 InternalKeyKindDeleteSized: "DELSIZED", 156 InternalKeyKindInvalid: "INVALID", 157 } 158 159 func (k InternalKeyKind) String() string { 160 if int(k) < len(internalKeyKindNames) { 161 return internalKeyKindNames[k] 162 } 163 return fmt.Sprintf("UNKNOWN:%d", k) 164 } 165 166 // SafeFormat implements redact.SafeFormatter. 167 func (k InternalKeyKind) SafeFormat(w redact.SafePrinter, _ rune) { 168 w.Print(redact.SafeString(k.String())) 169 } 170 171 // InternalKey is a key used for the in-memory and on-disk partial DBs that 172 // make up a pebble DB. 173 // 174 // It consists of the user key (as given by the code that uses package pebble) 175 // followed by 8-bytes of metadata: 176 // - 1 byte for the type of internal key: delete or set, 177 // - 7 bytes for a uint56 sequence number, in little-endian format. 178 type InternalKey struct { 179 UserKey []byte 180 Trailer uint64 181 } 182 183 // InvalidInternalKey is an invalid internal key for which Valid() will return 184 // false. 185 var InvalidInternalKey = MakeInternalKey(nil, 0, InternalKeyKindInvalid) 186 187 // MakeInternalKey constructs an internal key from a specified user key, 188 // sequence number and kind. 189 func MakeInternalKey(userKey []byte, seqNum uint64, kind InternalKeyKind) InternalKey { 190 return InternalKey{ 191 UserKey: userKey, 192 Trailer: (seqNum << 8) | uint64(kind), 193 } 194 } 195 196 // MakeTrailer constructs an internal key trailer from the specified sequence 197 // number and kind. 198 func MakeTrailer(seqNum uint64, kind InternalKeyKind) uint64 { 199 return (seqNum << 8) | uint64(kind) 200 } 201 202 // MakeSearchKey constructs an internal key that is appropriate for searching 203 // for a the specified user key. The search key contain the maximal sequence 204 // number and kind ensuring that it sorts before any other internal keys for 205 // the same user key. 206 func MakeSearchKey(userKey []byte) InternalKey { 207 return InternalKey{ 208 UserKey: userKey, 209 Trailer: (InternalKeySeqNumMax << 8) | uint64(InternalKeyKindMax), 210 } 211 } 212 213 // MakeRangeDeleteSentinelKey constructs an internal key that is a range 214 // deletion sentinel key, used as the upper boundary for an sstable when a 215 // range deletion is the largest key in an sstable. 216 func MakeRangeDeleteSentinelKey(userKey []byte) InternalKey { 217 return InternalKey{ 218 UserKey: userKey, 219 Trailer: InternalKeyRangeDeleteSentinel, 220 } 221 } 222 223 // MakeExclusiveSentinelKey constructs an internal key that is an 224 // exclusive sentinel key, used as the upper boundary for an sstable 225 // when a ranged key is the largest key in an sstable. 226 func MakeExclusiveSentinelKey(kind InternalKeyKind, userKey []byte) InternalKey { 227 return InternalKey{ 228 UserKey: userKey, 229 Trailer: (InternalKeySeqNumMax << 8) | uint64(kind), 230 } 231 } 232 233 var kindsMap = map[string]InternalKeyKind{ 234 "DEL": InternalKeyKindDelete, 235 "SINGLEDEL": InternalKeyKindSingleDelete, 236 "RANGEDEL": InternalKeyKindRangeDelete, 237 "LOGDATA": InternalKeyKindLogData, 238 "SET": InternalKeyKindSet, 239 "MERGE": InternalKeyKindMerge, 240 "INVALID": InternalKeyKindInvalid, 241 "SEPARATOR": InternalKeyKindSeparator, 242 "SETWITHDEL": InternalKeyKindSetWithDelete, 243 "RANGEKEYSET": InternalKeyKindRangeKeySet, 244 "RANGEKEYUNSET": InternalKeyKindRangeKeyUnset, 245 "RANGEKEYDEL": InternalKeyKindRangeKeyDelete, 246 "INGESTSST": InternalKeyKindIngestSST, 247 "DELSIZED": InternalKeyKindDeleteSized, 248 } 249 250 // ParseInternalKey parses the string representation of an internal key. The 251 // format is <user-key>.<kind>.<seq-num>. If the seq-num starts with a "b" it 252 // is marked as a batch-seq-num (i.e. the InternalKeySeqNumBatch bit is set). 253 func ParseInternalKey(s string) InternalKey { 254 x := strings.Split(s, ".") 255 ukey := x[0] 256 kind, ok := kindsMap[x[1]] 257 if !ok { 258 panic(fmt.Sprintf("unknown kind: %q", x[1])) 259 } 260 j := 0 261 if x[2][0] == 'b' { 262 j = 1 263 } 264 seqNum, _ := strconv.ParseUint(x[2][j:], 10, 64) 265 if x[2][0] == 'b' { 266 seqNum |= InternalKeySeqNumBatch 267 } 268 return MakeInternalKey([]byte(ukey), seqNum, kind) 269 } 270 271 // ParseKind parses the string representation of an internal key kind. 272 func ParseKind(s string) InternalKeyKind { 273 kind, ok := kindsMap[s] 274 if !ok { 275 panic(fmt.Sprintf("unknown kind: %q", s)) 276 } 277 return kind 278 } 279 280 // InternalTrailerLen is the number of bytes used to encode InternalKey.Trailer. 281 const InternalTrailerLen = 8 282 283 // DecodeInternalKey decodes an encoded internal key. See InternalKey.Encode(). 284 func DecodeInternalKey(encodedKey []byte) InternalKey { 285 n := len(encodedKey) - InternalTrailerLen 286 var trailer uint64 287 if n >= 0 { 288 trailer = binary.LittleEndian.Uint64(encodedKey[n:]) 289 encodedKey = encodedKey[:n:n] 290 } else { 291 trailer = uint64(InternalKeyKindInvalid) 292 encodedKey = nil 293 } 294 return InternalKey{ 295 UserKey: encodedKey, 296 Trailer: trailer, 297 } 298 } 299 300 // InternalCompare compares two internal keys using the specified comparison 301 // function. For equal user keys, internal keys compare in descending sequence 302 // number order. For equal user keys and sequence numbers, internal keys 303 // compare in descending kind order (this may happen in practice among range 304 // keys). 305 func InternalCompare(userCmp Compare, a, b InternalKey) int { 306 if x := userCmp(a.UserKey, b.UserKey); x != 0 { 307 return x 308 } 309 if a.Trailer > b.Trailer { 310 return -1 311 } 312 if a.Trailer < b.Trailer { 313 return 1 314 } 315 return 0 316 } 317 318 // Encode encodes the receiver into the buffer. The buffer must be large enough 319 // to hold the encoded data. See InternalKey.Size(). 320 func (k InternalKey) Encode(buf []byte) { 321 i := copy(buf, k.UserKey) 322 binary.LittleEndian.PutUint64(buf[i:], k.Trailer) 323 } 324 325 // EncodeTrailer returns the trailer encoded to an 8-byte array. 326 func (k InternalKey) EncodeTrailer() [8]byte { 327 var buf [8]byte 328 binary.LittleEndian.PutUint64(buf[:], k.Trailer) 329 return buf 330 } 331 332 // Separator returns a separator key such that k <= x && x < other, where less 333 // than is consistent with the Compare function. The buf parameter may be used 334 // to store the returned InternalKey.UserKey, though it is valid to pass a 335 // nil. See the Separator type for details on separator keys. 336 func (k InternalKey) Separator( 337 cmp Compare, sep Separator, buf []byte, other InternalKey, 338 ) InternalKey { 339 buf = sep(buf, k.UserKey, other.UserKey) 340 if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { 341 // The separator user key is physically shorter than k.UserKey (if it is 342 // longer, we'll continue to use "k"), but logically after. Tack on the max 343 // sequence number to the shortened user key. Note that we could tack on 344 // any sequence number and kind here to create a valid separator key. We 345 // use the max sequence number to match the behavior of LevelDB and 346 // RocksDB. 347 return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) 348 } 349 return k 350 } 351 352 // Successor returns a successor key such that k <= x. A simple implementation 353 // may return k unchanged. The buf parameter may be used to store the returned 354 // InternalKey.UserKey, though it is valid to pass a nil. 355 func (k InternalKey) Successor(cmp Compare, succ Successor, buf []byte) InternalKey { 356 buf = succ(buf, k.UserKey) 357 if len(buf) <= len(k.UserKey) && cmp(k.UserKey, buf) < 0 { 358 // The successor user key is physically shorter that k.UserKey (if it is 359 // longer, we'll continue to use "k"), but logically after. Tack on the max 360 // sequence number to the shortened user key. Note that we could tack on 361 // any sequence number and kind here to create a valid separator key. We 362 // use the max sequence number to match the behavior of LevelDB and 363 // RocksDB. 364 return MakeInternalKey(buf, InternalKeySeqNumMax, InternalKeyKindSeparator) 365 } 366 return k 367 } 368 369 // Size returns the encoded size of the key. 370 func (k InternalKey) Size() int { 371 return len(k.UserKey) + 8 372 } 373 374 // SetSeqNum sets the sequence number component of the key. 375 func (k *InternalKey) SetSeqNum(seqNum uint64) { 376 k.Trailer = (seqNum << 8) | (k.Trailer & 0xff) 377 } 378 379 // SeqNum returns the sequence number component of the key. 380 func (k InternalKey) SeqNum() uint64 { 381 return k.Trailer >> 8 382 } 383 384 // SeqNumFromTrailer returns the sequence number component of a trailer. 385 func SeqNumFromTrailer(t uint64) uint64 { 386 return t >> 8 387 } 388 389 // Visible returns true if the key is visible at the specified snapshot 390 // sequence number. 391 func (k InternalKey) Visible(snapshot, batchSnapshot uint64) bool { 392 return Visible(k.SeqNum(), snapshot, batchSnapshot) 393 } 394 395 // Visible returns true if a key with the provided sequence number is visible at 396 // the specified snapshot sequence numbers. 397 func Visible(seqNum uint64, snapshot, batchSnapshot uint64) bool { 398 // There are two snapshot sequence numbers, one for committed keys and one 399 // for batch keys. If a seqNum is less than `snapshot`, then seqNum 400 // corresponds to a committed key that is visible. If seqNum has its batch 401 // bit set, then seqNum corresponds to an uncommitted batch key. Its 402 // visible if its snapshot is less than batchSnapshot. 403 // 404 // There's one complication. The maximal sequence number 405 // (`InternalKeySeqNumMax`) is used across Pebble for exclusive sentinel 406 // keys and other purposes. The maximal sequence number has its batch bit 407 // set, but it can never be < `batchSnapshot`, since there is no expressible 408 // larger snapshot. We dictate that the maximal sequence number is always 409 // visible. 410 return seqNum < snapshot || 411 ((seqNum&InternalKeySeqNumBatch) != 0 && seqNum < batchSnapshot) || 412 seqNum == InternalKeySeqNumMax 413 } 414 415 // SetKind sets the kind component of the key. 416 func (k *InternalKey) SetKind(kind InternalKeyKind) { 417 k.Trailer = (k.Trailer &^ 0xff) | uint64(kind) 418 } 419 420 // Kind returns the kind component of the key. 421 func (k InternalKey) Kind() InternalKeyKind { 422 return TrailerKind(k.Trailer) 423 } 424 425 // TrailerKind returns the key kind of the key trailer. 426 func TrailerKind(trailer uint64) InternalKeyKind { 427 return InternalKeyKind(trailer & 0xff) 428 } 429 430 // Valid returns true if the key has a valid kind. 431 func (k InternalKey) Valid() bool { 432 return k.Kind() <= InternalKeyKindMax 433 } 434 435 // Clone clones the storage for the UserKey component of the key. 436 func (k InternalKey) Clone() InternalKey { 437 if len(k.UserKey) == 0 { 438 return k 439 } 440 return InternalKey{ 441 UserKey: append([]byte(nil), k.UserKey...), 442 Trailer: k.Trailer, 443 } 444 } 445 446 // CopyFrom converts this InternalKey into a clone of the passed-in InternalKey, 447 // reusing any space already used for the current UserKey. 448 func (k *InternalKey) CopyFrom(k2 InternalKey) { 449 k.UserKey = append(k.UserKey[:0], k2.UserKey...) 450 k.Trailer = k2.Trailer 451 } 452 453 // String returns a string representation of the key. 454 func (k InternalKey) String() string { 455 return fmt.Sprintf("%s#%d,%d", FormatBytes(k.UserKey), k.SeqNum(), k.Kind()) 456 } 457 458 // Pretty returns a formatter for the key. 459 func (k InternalKey) Pretty(f FormatKey) fmt.Formatter { 460 return prettyInternalKey{k, f} 461 } 462 463 // IsExclusiveSentinel returns whether this internal key excludes point keys 464 // with the same user key if used as an end boundary. See the comment on 465 // InternalKeyRangeDeletionSentinel. 466 func (k InternalKey) IsExclusiveSentinel() bool { 467 switch kind := k.Kind(); kind { 468 case InternalKeyKindRangeDelete: 469 return k.Trailer == InternalKeyRangeDeleteSentinel 470 case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet: 471 return (k.Trailer >> 8) == InternalKeySeqNumMax 472 default: 473 return false 474 } 475 } 476 477 type prettyInternalKey struct { 478 InternalKey 479 formatKey FormatKey 480 } 481 482 func (k prettyInternalKey) Format(s fmt.State, c rune) { 483 if seqNum := k.SeqNum(); seqNum == InternalKeySeqNumMax { 484 fmt.Fprintf(s, "%s#inf,%s", k.formatKey(k.UserKey), k.Kind()) 485 } else { 486 fmt.Fprintf(s, "%s#%d,%s", k.formatKey(k.UserKey), k.SeqNum(), k.Kind()) 487 } 488 } 489 490 // ParsePrettyInternalKey parses the pretty string representation of an 491 // internal key. The format is <user-key>#<seq-num>,<kind>. 492 func ParsePrettyInternalKey(s string) InternalKey { 493 x := strings.FieldsFunc(s, func(c rune) bool { return c == '#' || c == ',' }) 494 ukey := x[0] 495 kind, ok := kindsMap[x[2]] 496 if !ok { 497 panic(fmt.Sprintf("unknown kind: %q", x[2])) 498 } 499 var seqNum uint64 500 if x[1] == "max" || x[1] == "inf" { 501 seqNum = InternalKeySeqNumMax 502 } else { 503 seqNum, _ = strconv.ParseUint(x[1], 10, 64) 504 } 505 return MakeInternalKey([]byte(ukey), seqNum, kind) 506 }