github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/keyspan/span.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package keyspan // import "github.com/cockroachdb/pebble/internal/keyspan" 6 7 import ( 8 "bytes" 9 "fmt" 10 "sort" 11 "strconv" 12 "strings" 13 "unicode" 14 15 "github.com/cockroachdb/pebble/internal/base" 16 ) 17 18 // Span represents a set of keys over a span of user key space. All of the keys 19 // within a Span are applied across the span's key span indicated by Start and 20 // End. Each internal key applied over the user key span appears as a separate 21 // Key, with its own kind and sequence number. Optionally, each Key may also 22 // have a Suffix and/or Value. 23 // 24 // Note that the start user key is inclusive and the end user key is exclusive. 25 // 26 // Currently the only supported key kinds are: 27 // 28 // RANGEDEL, RANGEKEYSET, RANGEKEYUNSET, RANGEKEYDEL. 29 type Span struct { 30 // Start and End encode the user key range of all the contained items, with 31 // an inclusive start key and exclusive end key. Both Start and End must be 32 // non-nil, or both nil if representing an invalid Span. 33 Start, End []byte 34 // Keys holds the set of keys applied over the [Start, End) user key range. 35 // Keys is sorted by (SeqNum, Kind) descending, unless otherwise specified 36 // by the context. If SeqNum and Kind are equal, the order of Keys is 37 // undefined. Keys may be empty, even if Start and End are non-nil. 38 // 39 // Keys are a decoded representation of the internal keys stored in batches 40 // or sstable blocks. A single internal key in a range key block may produce 41 // several decoded Keys. 42 Keys []Key 43 KeysOrder KeysOrder 44 } 45 46 // KeysOrder describes the ordering of Keys within a Span. 47 type KeysOrder int8 48 49 const ( 50 // ByTrailerDesc indicates a Span's keys are sorted by Trailer descending. 51 // This is the default ordering, and the ordering used during physical 52 // storage. 53 ByTrailerDesc KeysOrder = iota 54 // BySuffixAsc indicates a Span's keys are sorted by Suffix ascending. This 55 // ordering is used during user iteration of range keys. 56 BySuffixAsc 57 ) 58 59 // Key represents a single key applied over a span of user keys. A Key is 60 // contained by a Span which specifies the span of user keys over which the Key 61 // is applied. 62 type Key struct { 63 // Trailer contains the key kind and sequence number. 64 Trailer uint64 65 // Suffix holds an optional suffix associated with the key. This is only 66 // non-nil for RANGEKEYSET and RANGEKEYUNSET keys. 67 Suffix []byte 68 // Value holds a logical value associated with the Key. It is NOT the 69 // internal value stored in a range key or range deletion block. This is 70 // only non-nil for RANGEKEYSET keys. 71 Value []byte 72 } 73 74 // SeqNum returns the sequence number component of the key. 75 func (k Key) SeqNum() uint64 { 76 return k.Trailer >> 8 77 } 78 79 // VisibleAt returns true if the provided key is visible at the provided 80 // snapshot sequence number. It interprets batch sequence numbers as always 81 // visible, because non-visible batch span keys are filtered when they're 82 // fragmented. 83 func (k Key) VisibleAt(snapshot uint64) bool { 84 seq := k.SeqNum() 85 return seq < snapshot || seq&base.InternalKeySeqNumBatch != 0 86 } 87 88 // Kind returns the kind component of the key. 89 func (k Key) Kind() base.InternalKeyKind { 90 return base.InternalKeyKind(k.Trailer & 0xff) 91 } 92 93 // Equal returns true if this Key is equal to the given key. Two keys are said 94 // to be equal if the two Keys have equal trailers, suffix and value. Suffix 95 // comparison uses the provided base.Compare func. Value comparison is bytewise. 96 func (k Key) Equal(equal base.Equal, b Key) bool { 97 return k.Trailer == b.Trailer && 98 equal(k.Suffix, b.Suffix) && 99 bytes.Equal(k.Value, b.Value) 100 } 101 102 // Valid returns true if the span is defined. 103 func (s *Span) Valid() bool { 104 return s.Start != nil && s.End != nil 105 } 106 107 // Empty returns true if the span does not contain any keys. An empty span may 108 // still be Valid. A non-empty span must be Valid. 109 // 110 // An Empty span may be produced by Visible, or be produced by iterators in 111 // order to surface the gaps between keys. 112 func (s *Span) Empty() bool { 113 return s == nil || len(s.Keys) == 0 114 } 115 116 // SmallestKey returns the smallest internal key defined by the span's keys. 117 // It requires the Span's keys be in ByTrailerDesc order. It panics if the span 118 // contains no keys or its keys are sorted in a different order. 119 func (s *Span) SmallestKey() base.InternalKey { 120 if len(s.Keys) == 0 { 121 panic("pebble: Span contains no keys") 122 } else if s.KeysOrder != ByTrailerDesc { 123 panic("pebble: span's keys unexpectedly not in trailer order") 124 } 125 // The first key has the highest (sequence number,kind) tuple. 126 return base.InternalKey{ 127 UserKey: s.Start, 128 Trailer: s.Keys[0].Trailer, 129 } 130 } 131 132 // LargestKey returns the largest internal key defined by the span's keys. The 133 // returned key will always be a "sentinel key" at the end boundary. The 134 // "sentinel key" models the exclusive end boundary by returning an InternalKey 135 // with the maximal sequence number, ensuring all InternalKeys with the same 136 // user key sort after the sentinel key. 137 // 138 // It requires the Span's keys be in ByTrailerDesc order. It panics if the span 139 // contains no keys or its keys are sorted in a different order. 140 func (s *Span) LargestKey() base.InternalKey { 141 if len(s.Keys) == 0 { 142 panic("pebble: Span contains no keys") 143 } else if s.KeysOrder != ByTrailerDesc { 144 panic("pebble: span's keys unexpectedly not in trailer order") 145 } 146 // The last key has the lowest (sequence number,kind) tuple. 147 kind := s.Keys[len(s.Keys)-1].Kind() 148 return base.MakeExclusiveSentinelKey(kind, s.End) 149 } 150 151 // SmallestSeqNum returns the smallest sequence number of a key contained within 152 // the span. It requires the Span's keys be in ByTrailerDesc order. It panics if 153 // the span contains no keys or its keys are sorted in a different order. 154 func (s *Span) SmallestSeqNum() uint64 { 155 if len(s.Keys) == 0 { 156 panic("pebble: Span contains no keys") 157 } else if s.KeysOrder != ByTrailerDesc { 158 panic("pebble: span's keys unexpectedly not in trailer order") 159 } 160 161 return s.Keys[len(s.Keys)-1].SeqNum() 162 } 163 164 // LargestSeqNum returns the largest sequence number of a key contained within 165 // the span. It requires the Span's keys be in ByTrailerDesc order. It panics if 166 // the span contains no keys or its keys are sorted in a different order. 167 func (s *Span) LargestSeqNum() uint64 { 168 if len(s.Keys) == 0 { 169 panic("pebble: Span contains no keys") 170 } else if s.KeysOrder != ByTrailerDesc { 171 panic("pebble: span's keys unexpectedly not in trailer order") 172 } 173 return s.Keys[0].SeqNum() 174 } 175 176 // TODO(jackson): Replace most of the calls to Visible with more targeted calls 177 // that avoid the need to construct a new Span. 178 179 // Visible returns a span with the subset of keys visible at the provided 180 // sequence number. It requires the Span's keys be in ByTrailerDesc order. It 181 // panics if the span's keys are sorted in a different order. 182 // 183 // Visible may incur an allocation, so callers should prefer targeted, 184 // non-allocating methods when possible. 185 func (s Span) Visible(snapshot uint64) Span { 186 if s.KeysOrder != ByTrailerDesc { 187 panic("pebble: span's keys unexpectedly not in trailer order") 188 } 189 190 ret := Span{Start: s.Start, End: s.End} 191 if len(s.Keys) == 0 { 192 return ret 193 } 194 195 // Keys from indexed batches may force an allocation. The Keys slice is 196 // ordered by sequence number, so ordinarily we can return the trailing 197 // subslice containing keys with sequence numbers less than `seqNum`. 198 // 199 // However, batch keys are special. Only visible batch keys are included 200 // when an Iterator's batch spans are fragmented. They must always be 201 // visible. 202 // 203 // Batch keys can create a sandwich of visible batch keys at the beginning 204 // of the slice and visible committed keys at the end of the slice, forcing 205 // us to allocate a new slice and copy the contents. 206 // 207 // Care is taking to only incur an allocation only when batch keys and 208 // visible keys actually sandwich non-visible keys. 209 210 // lastBatchIdx and lastNonVisibleIdx are set to the last index of a batch 211 // key and a non-visible key respectively. 212 lastBatchIdx := -1 213 lastNonVisibleIdx := -1 214 for i := range s.Keys { 215 if seqNum := s.Keys[i].SeqNum(); seqNum&base.InternalKeySeqNumBatch != 0 { 216 // Batch key. Always visible. 217 lastBatchIdx = i 218 } else if seqNum >= snapshot { 219 // This key is not visible. 220 lastNonVisibleIdx = i 221 } 222 } 223 224 // In the following comments: b = batch, h = hidden, v = visible (committed). 225 switch { 226 case lastNonVisibleIdx == -1: 227 // All keys are visible. 228 // 229 // [b b b], [v v v] and [b b b v v v] 230 ret.Keys = s.Keys 231 case lastBatchIdx == -1: 232 // There are no batch keys, so we can return the continuous subslice 233 // starting after the last non-visible Key. 234 // 235 // h h h [v v v] 236 ret.Keys = s.Keys[lastNonVisibleIdx+1:] 237 case lastNonVisibleIdx == len(s.Keys)-1: 238 // While we have a batch key and non-visible keys, there are no 239 // committed visible keys. The 'sandwich' is missing the bottom layer, 240 // so we can return the continuous sublice at the beginning. 241 // 242 // [b b b] h h h 243 ret.Keys = s.Keys[0 : lastBatchIdx+1] 244 default: 245 // This is the problematic sandwich case. Allocate a new slice, copying 246 // the batch keys and the visible keys into it. 247 // 248 // [b b b] h h h [v v v] 249 ret.Keys = make([]Key, (lastBatchIdx+1)+(len(s.Keys)-lastNonVisibleIdx-1)) 250 copy(ret.Keys, s.Keys[:lastBatchIdx+1]) 251 copy(ret.Keys[lastBatchIdx+1:], s.Keys[lastNonVisibleIdx+1:]) 252 } 253 return ret 254 } 255 256 // VisibleAt returns true if the span contains a key visible at the provided 257 // snapshot. Keys with sequence numbers with the batch bit set are treated as 258 // always visible. 259 // 260 // VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if 261 // the span's keys are sorted in a different order. 262 func (s *Span) VisibleAt(snapshot uint64) bool { 263 if s.KeysOrder != ByTrailerDesc { 264 panic("pebble: span's keys unexpectedly not in trailer order") 265 } 266 if len(s.Keys) == 0 { 267 return false 268 } else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 { 269 // Only visible batch keys are included when an Iterator's batch spans 270 // are fragmented. They must always be visible. 271 return true 272 } else { 273 // Otherwise we check the last key. Since keys are ordered decreasing in 274 // sequence number, the last key has the lowest sequence number of any 275 // of the span's keys. If any of the keys are visible, the last key must 276 // be visible. Or put differently: if the last key is not visible, then 277 // no key is visible. 278 return s.Keys[len(s.Keys)-1].SeqNum() < snapshot 279 } 280 } 281 282 // ShallowClone returns the span with a Keys slice owned by the span itself. 283 // None of the key byte slices are cloned (see Span.DeepClone). 284 func (s *Span) ShallowClone() Span { 285 c := Span{ 286 Start: s.Start, 287 End: s.End, 288 Keys: make([]Key, len(s.Keys)), 289 KeysOrder: s.KeysOrder, 290 } 291 copy(c.Keys, s.Keys) 292 return c 293 } 294 295 // DeepClone clones the span, creating copies of all contained slices. DeepClone 296 // is intended for non-production code paths like tests, the level checker, etc 297 // because it is allocation heavy. 298 func (s *Span) DeepClone() Span { 299 c := Span{ 300 Start: make([]byte, len(s.Start)), 301 End: make([]byte, len(s.End)), 302 Keys: make([]Key, len(s.Keys)), 303 KeysOrder: s.KeysOrder, 304 } 305 copy(c.Start, s.Start) 306 copy(c.End, s.End) 307 for i := range s.Keys { 308 c.Keys[i].Trailer = s.Keys[i].Trailer 309 if len(s.Keys[i].Suffix) > 0 { 310 c.Keys[i].Suffix = make([]byte, len(s.Keys[i].Suffix)) 311 copy(c.Keys[i].Suffix, s.Keys[i].Suffix) 312 } 313 if len(s.Keys[i].Value) > 0 { 314 c.Keys[i].Value = make([]byte, len(s.Keys[i].Value)) 315 copy(c.Keys[i].Value, s.Keys[i].Value) 316 } 317 } 318 return c 319 } 320 321 // Contains returns true if the specified key resides within the span's bounds. 322 func (s *Span) Contains(cmp base.Compare, key []byte) bool { 323 return cmp(s.Start, key) <= 0 && cmp(key, s.End) < 0 324 } 325 326 // Covers returns true if the span covers keys at seqNum. 327 // 328 // Covers requires the Span's keys be in ByTrailerDesc order. It panics if the 329 // span's keys are sorted in a different order. 330 func (s Span) Covers(seqNum uint64) bool { 331 if s.KeysOrder != ByTrailerDesc { 332 panic("pebble: span's keys unexpectedly not in trailer order") 333 } 334 return !s.Empty() && s.Keys[0].SeqNum() > seqNum 335 } 336 337 // CoversAt returns true if the span contains a key that is visible at the 338 // provided snapshot sequence number, and that key's sequence number is higher 339 // than seqNum. 340 // 341 // Keys with sequence numbers with the batch bit set are treated as always 342 // visible. 343 // 344 // CoversAt requires the Span's keys be in ByTrailerDesc order. It panics if the 345 // span's keys are sorted in a different order. 346 func (s *Span) CoversAt(snapshot, seqNum uint64) bool { 347 if s.KeysOrder != ByTrailerDesc { 348 panic("pebble: span's keys unexpectedly not in trailer order") 349 } 350 // NB: A key is visible at `snapshot` if its sequence number is strictly 351 // less than `snapshot`. See base.Visible. 352 for i := range s.Keys { 353 if kseq := s.Keys[i].SeqNum(); kseq&base.InternalKeySeqNumBatch != 0 { 354 // Only visible batch keys are included when an Iterator's batch spans 355 // are fragmented. They must always be visible. 356 return kseq > seqNum 357 } else if kseq < snapshot { 358 return kseq > seqNum 359 } 360 } 361 return false 362 } 363 364 // String returns a string representation of the span. 365 func (s Span) String() string { 366 return fmt.Sprint(prettySpan{Span: s, formatKey: base.DefaultFormatter}) 367 } 368 369 // Pretty returns a formatter for the span. 370 func (s Span) Pretty(f base.FormatKey) fmt.Formatter { 371 // TODO(jackson): Take a base.FormatValue to format Key.Value too. 372 return prettySpan{s, f} 373 } 374 375 type prettySpan struct { 376 Span 377 formatKey base.FormatKey 378 } 379 380 func (s prettySpan) Format(fs fmt.State, c rune) { 381 if !s.Valid() { 382 fmt.Fprintf(fs, "<invalid>") 383 return 384 } 385 fmt.Fprintf(fs, "%s-%s:{", s.formatKey(s.Start), s.formatKey(s.End)) 386 for i, k := range s.Keys { 387 if i > 0 { 388 fmt.Fprint(fs, " ") 389 } 390 fmt.Fprintf(fs, "(#%d,%s", k.SeqNum(), k.Kind()) 391 if len(k.Suffix) > 0 || len(k.Value) > 0 { 392 fmt.Fprintf(fs, ",%s", k.Suffix) 393 } 394 if len(k.Value) > 0 { 395 fmt.Fprintf(fs, ",%s", k.Value) 396 } 397 fmt.Fprint(fs, ")") 398 } 399 fmt.Fprintf(fs, "}") 400 } 401 402 // SortKeysByTrailer sorts a keys slice by trailer. 403 func SortKeysByTrailer(keys *[]Key) { 404 // NB: keys is a pointer to a slice instead of a slice to avoid `sorted` 405 // escaping to the heap. 406 sorted := (*keysBySeqNumKind)(keys) 407 sort.Sort(sorted) 408 } 409 410 // KeysBySuffix implements sort.Interface, sorting its member Keys slice to by 411 // Suffix in the order dictated by Cmp. 412 type KeysBySuffix struct { 413 Cmp base.Compare 414 Keys []Key 415 } 416 417 func (s *KeysBySuffix) Len() int { return len(s.Keys) } 418 func (s *KeysBySuffix) Less(i, j int) bool { return s.Cmp(s.Keys[i].Suffix, s.Keys[j].Suffix) < 0 } 419 func (s *KeysBySuffix) Swap(i, j int) { s.Keys[i], s.Keys[j] = s.Keys[j], s.Keys[i] } 420 421 // ParseSpan parses the string representation of a Span. It's intended for 422 // tests. ParseSpan panics if passed a malformed span representation. 423 func ParseSpan(input string) Span { 424 var s Span 425 parts := strings.FieldsFunc(input, func(r rune) bool { 426 switch r { 427 case '-', ':', '{', '}': 428 return true 429 default: 430 return unicode.IsSpace(r) 431 } 432 }) 433 s.Start, s.End = []byte(parts[0]), []byte(parts[1]) 434 435 // Each of the remaining parts represents a single Key. 436 s.Keys = make([]Key, 0, len(parts)-2) 437 for _, p := range parts[2:] { 438 keyFields := strings.FieldsFunc(p, func(r rune) bool { 439 switch r { 440 case '#', ',', '(', ')': 441 return true 442 default: 443 return unicode.IsSpace(r) 444 } 445 }) 446 447 var k Key 448 // Parse the sequence number. 449 seqNum, err := strconv.ParseUint(keyFields[0], 10, 64) 450 if err != nil { 451 panic(fmt.Sprintf("invalid sequence number: %q: %s", keyFields[0], err)) 452 } 453 // Parse the key kind. 454 kind := base.ParseKind(keyFields[1]) 455 k.Trailer = base.MakeTrailer(seqNum, kind) 456 // Parse the optional suffix. 457 if len(keyFields) >= 3 { 458 k.Suffix = []byte(keyFields[2]) 459 } 460 // Parse the optional value. 461 if len(keyFields) >= 4 { 462 k.Value = []byte(keyFields[3]) 463 } 464 s.Keys = append(s.Keys, k) 465 } 466 return s 467 }