github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/l0_sublevels.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package manifest

import (
	"bytes"
	stdcmp "cmp"
	"fmt"
	"math"
	"slices"
	"sort"
	"strings"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
)

// errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental
// sublevel generation optimization failed, and NewL0Sublevels must be called.
var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used")

// Intervals are of the form [start, end) with no gap between intervals. Each
// file overlaps perfectly with a sequence of intervals. This perfect overlap
// occurs because the union of file boundary keys is used to pick intervals.
// However, the largest key in a file is inclusive, so when it is used as an
// interval end key, the actual key is ImmediateSuccessor(key). We don't have
// the ImmediateSuccessor function to do this computation, so we instead keep
// an isLargest bool to remind the code about this fact. This is used for
// comparisons in the following manner:
//   - intervalKey{k, false} < intervalKey{k, true}
//   - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}.
//
// Note that the file's largest key is exclusive if the internal key
// has a trailer matching the rangedel sentinel key. In this case, we set
// isLargest to false for end interval computation.
//
// For example, consider three files with bounds [a,e], [b,g], and [e,j]. The
// interval keys produced would be intervalKey{a, false}, intervalKey{b, false},
// intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and
// intervalKey{j, true}, resulting in intervals
// [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and
// [(g, true), (j, true)). The first file overlaps with the first three
// perfectly, the second file overlaps with the second through to fourth
// intervals, and the third file overlaps with the last three.
//
// The intervals are indexed starting from 0, with the index of the interval
// being the index of the start key of the interval.
//
// In addition to helping with compaction picking, we use interval indices
// to assign each file an interval range once. Subsequent operations, say
// picking overlapping files for a compaction, only need to use the index
// numbers and so avoid expensive byte slice comparisons.
type intervalKey struct {
	key       []byte
	isLargest bool
}

// intervalKeyTemp is used in the sortAndSweep step. It contains additional
// metadata which is used to generate the {min,max}IntervalIndex for files.
type intervalKeyTemp struct {
	intervalKey intervalKey
	fileMeta    *FileMetadata
	isEndKey    bool
}

func (i *intervalKeyTemp) setFileIntervalIndex(idx int) {
	if i.isEndKey {
		// This is the right endpoint of some file interval, so the
		// file.maxIntervalIndex must be idx - 1 as maxIntervalIndex is
		// inclusive.
		i.fileMeta.maxIntervalIndex = idx - 1
		return
	}
	// This is the left endpoint for some file interval, so the
	// file.minIntervalIndex must be idx.
	i.fileMeta.minIntervalIndex = idx
}
func intervalKeyCompare(cmp Compare, a, b intervalKey) int {
	rv := cmp(a.key, b.key)
	if rv == 0 {
		if a.isLargest && !b.isLargest {
			return +1
		}
		if !a.isLargest && b.isLargest {
			return -1
		}
	}
	return rv
}
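
// exampleIntervalKeyOrder is a small illustrative sketch of the ordering
// contract documented above: ties on the user key are broken by isLargest,
// with isLargest=false ordering first. It assumes bytes.Compare as the
// user-key comparator and is not used by the package.
//
//lint:ignore U1000 - illustrative example only
func exampleIntervalKeyOrder() []intervalKey {
	keys := []intervalKey{
		{key: []byte("e"), isLargest: true},
		{key: []byte("e"), isLargest: false},
		{key: []byte("b"), isLargest: false},
	}
	slices.SortFunc(keys, func(a, b intervalKey) int {
		return intervalKeyCompare(bytes.Compare, a, b)
	})
	// keys is now {b,false}, {e,false}, {e,true}.
	return keys
}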
type intervalKeySorter struct {
	keys []intervalKeyTemp
	cmp  Compare
}

func (s intervalKeySorter) Len() int { return len(s.keys) }
func (s intervalKeySorter) Less(i, j int) bool {
	return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0
}
func (s intervalKeySorter) Swap(i, j int) {
	s.keys[i], s.keys[j] = s.keys[j], s.keys[i]
}

// sortAndSweep will sort the intervalKeys using intervalKeySorter, remove
// duplicate interval keys, and set the {min,max}IntervalIndex for the files.
func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp {
	if len(keys) == 0 {
		return nil
	}
	sorter := intervalKeySorter{keys: keys, cmp: cmp}
	sort.Sort(sorter)

	// intervalKeys are generated using the file bounds. Specifically, there
	// are 2 intervalKeys for each file, and len(keys) = 2 * number of files.
	// Each `intervalKeyTemp` stores information about which file it was
	// generated from, and whether the key represents the end key of the file.
	// So, as we're deduplicating the `keys` slice, we're guaranteed to iterate
	// over the interval keys belonging to each of the files. Since the
	// file.{min,max}IntervalIndex points to the position of the file's bounds
	// in the deduplicated `keys` slice, we can determine
	// file.{min,max}IntervalIndex during the iteration.
	i := 0
	j := 0
	for i < len(keys) {
		// Loop invariant: j <= i.
		currKey := keys[i]
		keys[j] = keys[i]

		for {
			keys[i].setFileIntervalIndex(j)
			i++
			if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 {
				break
			}
		}
		j++
	}
	return keys[:j]
}
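
// exampleSortAndSweep is a small illustrative sketch of sortAndSweep on two
// hypothetical files [a,c] and [b,d], assuming bytes.Compare as the user-key
// comparator. It is not used by the package.
//
//lint:ignore U1000 - illustrative example only
func exampleSortAndSweep() []intervalKeyTemp {
	f1, f2 := &FileMetadata{}, &FileMetadata{}
	keys := []intervalKeyTemp{
		{intervalKey: intervalKey{key: []byte("c"), isLargest: true}, fileMeta: f1, isEndKey: true},
		{intervalKey: intervalKey{key: []byte("a")}, fileMeta: f1},
		{intervalKey: intervalKey{key: []byte("d"), isLargest: true}, fileMeta: f2, isEndKey: true},
		{intervalKey: intervalKey{key: []byte("b")}, fileMeta: f2},
	}
	keys = sortAndSweep(keys, bytes.Compare)
	// keys now holds the distinct interval start keys a, b, (c,true) and
	// (d,true), and the sweep has stamped f1 with interval range [0,1] and
	// f2 with [1,2].
	return keys
}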
// A key interval of the form [start, end). The end is not represented here
// since it is implicit in the start of the next interval. The last interval
// is an exception but we don't need to ever look up the end of that interval;
// the last fileInterval will only act as an end key marker. The set of
// intervals is const after initialization.
type fileInterval struct {
	index    int
	startKey intervalKey

	// True iff some file in this interval is compacting to base. Such
	// intervals cannot have any files participate in L0 -> Lbase compactions.
	isBaseCompacting bool

	// The min and max interval indices across all the files that overlap with
	// this interval. Inclusive on both sides.
	filesMinIntervalIndex int
	filesMaxIntervalIndex int

	// True if another interval that has a file extending into this interval
	// is undergoing a compaction into Lbase. In other words, this bool is
	// true if any interval in [filesMinIntervalIndex, filesMaxIntervalIndex]
	// has isBaseCompacting set to true. This lets the compaction picker
	// de-prioritize this interval for picking compactions, since there's a
	// high chance that a base compaction with a sufficient height of
	// sublevels rooted at this interval could not be chosen due to the
	// ongoing base compaction in the other interval. If the file straddling
	// the two intervals is at a sufficiently high sublevel (with enough
	// compactible files below it to satisfy minCompactionDepth), this is not
	// an issue, but to optimize for quickly picking base compactions far away
	// from other base compactions, this bool is used as a heuristic (but not
	// as a complete disqualifier).
	intervalRangeIsBaseCompacting bool

	// All files in this interval, in increasing sublevel order.
	files []*FileMetadata

	// len(files) - compactingFileCount is the stack depth that requires
	// starting new compactions. This metric is not precise since the
	// compactingFileCount can include files that are part of N (where N > 1)
	// intra-L0 compactions, so the stack depth after those complete will be
	// len(files) - compactingFileCount + N. We ignore this imprecision since
	// we don't want to track which files are part of which intra-L0
	// compaction.
	compactingFileCount int

	// Interpolated from files in this interval. For files spanning multiple
	// intervals, we assume an equal distribution of bytes across all those
	// intervals.
	estimatedBytes uint64
}

// Helper type for any cases requiring a bool slice.
type bitSet []bool

func newBitSet(n int) bitSet {
	return make([]bool, n)
}

func (b *bitSet) markBit(i int) {
	(*b)[i] = true
}

func (b *bitSet) markBits(start, end int) {
	for i := start; i < end; i++ {
		(*b)[i] = true
	}
}

func (b *bitSet) clearAllBits() {
	for i := range *b {
		(*b)[i] = false
	}
}

// L0Compaction describes an active compaction with inputs from L0.
type L0Compaction struct {
	Smallest  InternalKey
	Largest   InternalKey
	IsIntraL0 bool
}

// L0Sublevels represents a sublevel view of SSTables in L0. Tables in one
// sublevel are non-overlapping in key ranges, and keys in higher-indexed
// sublevels shadow older versions in lower-indexed sublevels. These
// invariants are similar to the regular level invariants, except with higher
// indexed sublevels having newer keys as opposed to lower indexed levels.
//
// There is no limit to the number of sublevels that can exist in L0 at any
// time, however read and compaction performance is best when there are as
// few sublevels as possible.
type L0Sublevels struct {
	// Levels are ordered from oldest sublevel to youngest sublevel in the
	// outer slice, and the inner slice contains non-overlapping files for
	// that sublevel in increasing key order. Levels is constructed from
	// levelFiles and is used by callers that require a LevelSlice. The below
	// two fields are treated as immutable once created in NewL0Sublevels.
	Levels     []LevelSlice
	levelFiles [][]*FileMetadata

	cmp       Compare
	formatKey base.FormatKey

	fileBytes uint64
	// All the L0 files, ordered from oldest to youngest.
	levelMetadata *LevelMetadata

	// The file intervals in increasing key order.
	orderedIntervals []fileInterval

	// Keys to break flushes at.
	flushSplitUserKeys [][]byte

	// Only used to check invariants.
	addL0FilesCalled bool
}

type sublevelSorter []*FileMetadata

// Len implements sort.Interface.
func (sl sublevelSorter) Len() int {
	return len(sl)
}

// Less implements sort.Interface.
func (sl sublevelSorter) Less(i, j int) bool {
	return sl[i].minIntervalIndex < sl[j].minIntervalIndex
}

// Swap implements sort.Interface.
func (sl sublevelSorter) Swap(i, j int) {
	sl[i], sl[j] = sl[j], sl[i]
}

// NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files.
// These files must all be in L0 and must be sorted by seqnum (see
// SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are
// exceeded in the range of intervals since the last flush split key, a flush
// split key is added.
//
// This method can be called without DB.mu being held, so any DB.mu protected
// fields in FileMetadata cannot be accessed here, such as Compacting and
// IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo
// instead.
func NewL0Sublevels(
	levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64,
) (*L0Sublevels, error) {
	s := &L0Sublevels{cmp: cmp, formatKey: formatKey}
	s.levelMetadata = levelMetadata
	keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len())
	iter := levelMetadata.Iter()
	for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() {
		f.L0Index = i
		keys = append(keys, intervalKeyTemp{
			intervalKey: intervalKey{key: f.Smallest.UserKey},
			fileMeta:    f,
			isEndKey:    false,
		})
		keys = append(keys, intervalKeyTemp{
			intervalKey: intervalKey{
				key:       f.Largest.UserKey,
				isLargest: !f.Largest.IsExclusiveSentinel(),
			},
			fileMeta: f,
			isEndKey: true,
		})
	}
	keys = sortAndSweep(keys, cmp)
	// All interval indices reference s.orderedIntervals.
	s.orderedIntervals = make([]fileInterval, len(keys))
	for i := range keys {
		s.orderedIntervals[i] = fileInterval{
			index:                 i,
			startKey:              keys[i].intervalKey,
			filesMinIntervalIndex: i,
			filesMaxIntervalIndex: i,
		}
	}
	// Initialize minIntervalIndex and maxIntervalIndex for each file, and use
	// that to update intervals.
	for f := iter.First(); f != nil; f = iter.Next() {
		if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil {
			return nil, err
		}
	}
	// Sort each sublevel in increasing key order.
	for i := range s.levelFiles {
		sort.Sort(sublevelSorter(s.levelFiles[i]))
	}

	// Construct a parallel slice of sublevel B-Trees.
	// TODO(jackson): Consolidate and only use the B-Trees.
	for _, sublevelFiles := range s.levelFiles {
		tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles)
		s.Levels = append(s.Levels, ls)
		tr.Release()
	}

	s.calculateFlushSplitKeys(flushSplitMaxBytes)
	return s, nil
}
// mergeIntervals merges the given new intervalKeys with an existing slice of
// old fileIntervals, appending onto result. It returns the new result and a
// slice of ints mapping old interval indices to new ones. The added
// intervalKeys do not need to be sorted; they get sorted and deduped in this
// function.
func mergeIntervals(
	old, result []fileInterval, added []intervalKeyTemp, compare Compare,
) ([]fileInterval, []int) {
	sorter := intervalKeySorter{keys: added, cmp: compare}
	sort.Sort(sorter)

	oldToNewMap := make([]int, len(old))
	i := 0
	j := 0

	for i < len(old) || j < len(added) {
		for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 {
			added[j].setFileIntervalIndex(len(result) - 1)
			j++
		}
		if i >= len(old) && j >= len(added) {
			break
		}
		var cmp int
		if i >= len(old) {
			cmp = +1
		}
		if j >= len(added) {
			cmp = -1
		}
		if cmp == 0 {
			cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey)
		}
		switch {
		case cmp <= 0:
			// Shallow-copy the existing interval.
			newInterval := old[i]
			result = append(result, newInterval)
			oldToNewMap[i] = len(result) - 1
			i++
			if cmp == 0 {
				added[j].setFileIntervalIndex(len(result) - 1)
				j++
			}
		case cmp > 0:
			var prevInterval fileInterval
			// Insert a new interval for a newly-added file. prevInterval, if
			// non-zero, will be "inherited"; we copy its files as those
			// extend into this interval.
			if len(result) > 0 {
				prevInterval = result[len(result)-1]
			}
			newInterval := fileInterval{
				index:                 len(result),
				startKey:              added[j].intervalKey,
				filesMinIntervalIndex: len(result),
				filesMaxIntervalIndex: len(result),

				// estimatedBytes gets recalculated later on, as the number of
				// intervals over which the file bytes are interpolated has
				// changed.
				estimatedBytes: 0,
				// Copy the below attributes from prevInterval.
				files:                         append([]*FileMetadata(nil), prevInterval.files...),
				isBaseCompacting:              prevInterval.isBaseCompacting,
				intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting,
				compactingFileCount:           prevInterval.compactingFileCount,
			}
			result = append(result, newInterval)
			added[j].setFileIntervalIndex(len(result) - 1)
			j++
		}
	}
	return result, oldToNewMap
}
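
// exampleMergeIntervals is a small illustrative sketch of mergeIntervals,
// assuming bytes.Compare as the user-key comparator: a single new start key
// "c" is merged into existing intervals starting at "a" and "e". It is not
// used by the package.
//
//lint:ignore U1000 - illustrative example only
func exampleMergeIntervals() ([]fileInterval, []int) {
	old := []fileInterval{
		{index: 0, startKey: intervalKey{key: []byte("a")}},
		{index: 1, startKey: intervalKey{key: []byte("e")}},
	}
	added := []intervalKeyTemp{
		{intervalKey: intervalKey{key: []byte("c")}, fileMeta: &FileMetadata{}},
	}
	result, oldToNewMap := mergeIntervals(old, nil, added, bytes.Compare)
	// result holds intervals starting at a, c, e; oldToNewMap is [0, 2],
	// recording that the old interval starting at "e" shifted from index 1
	// to index 2.
	return result, oldToNewMap
}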
// AddL0Files incrementally builds a new L0Sublevels for when the only change
// since the receiver L0Sublevels was an addition of the specified files, with
// no L0 deletions. The common case of this is an ingestion or a flush. These
// files can "sit on top" of existing sublevels, creating at most one new
// sublevel for a flush (and possibly multiple for an ingestion), and at most
// 2*len(files) additions to s.orderedIntervals. No files must have been
// deleted from L0, and the added files must all be newer in sequence numbers
// than existing files in L0Sublevels. The files parameter must be sorted in
// seqnum order. The levelMetadata parameter corresponds to the new L0 post
// addition of files. This method is meant to be significantly more
// performant than NewL0Sublevels.
//
// Note that this function can only be called once on a given receiver; it
// appends to some slices in s, which is only safe when done once. This is
// okay, as the common case (generating a new L0Sublevels after a
// flush/ingestion) is only going to necessitate one call of this method on a
// given receiver. The returned value, if non-nil, can then have
// [*L0Sublevels.AddL0Files] called on it again, and so on. If
// [errInvalidL0SublevelsOpt] is returned as an error, it likely means the
// optimization could not be applied (i.e. files added were older than files
// already in the sublevels, which is possible around ingestions and in
// tests). For example, it can happen when an ingested file was ingested
// without queueing a flush since it did not actually overlap with any keys
// in the memtable. Later on the memtable was flushed, and the memtable had
// keys spanning around the ingested file, producing a flushed file that
// overlapped with the ingested file in file bounds but not in keys. It's
// possible for that flushed file to have a lower LargestSeqNum than the
// ingested file if all the additions after the ingestion were to another
// flushed file that was split into a separate sstable during flush. Any
// other non-nil error means [L0Sublevels] generation failed in the same way
// as [NewL0Sublevels] would likely fail.
func (s *L0Sublevels) AddL0Files(
	files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata,
) (*L0Sublevels, error) {
	if invariants.Enabled && s.addL0FilesCalled {
		panic("AddL0Files called twice on the same receiver")
	}
	s.addL0FilesCalled = true

	// Start with a shallow copy of s.
	newVal := &L0Sublevels{}
	*newVal = *s

	newVal.addL0FilesCalled = false
	newVal.levelMetadata = levelMetadata
	// Deep copy levelFiles and Levels, as they are mutated and sorted below.
	// Shallow copies of slices that we just append to are okay.
	newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles))
	for i := range s.levelFiles {
		newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i]))
		copy(newVal.levelFiles[i], s.levelFiles[i])
	}
	newVal.Levels = make([]LevelSlice, len(s.Levels))
	copy(newVal.Levels, s.Levels)

	fileKeys := make([]intervalKeyTemp, 0, 2*len(files))
	for _, f := range files {
		left := intervalKeyTemp{
			intervalKey: intervalKey{key: f.Smallest.UserKey},
			fileMeta:    f,
		}
		right := intervalKeyTemp{
			intervalKey: intervalKey{
				key:       f.Largest.UserKey,
				isLargest: !f.Largest.IsExclusiveSentinel(),
			},
			fileMeta: f,
			isEndKey: true,
		}
		fileKeys = append(fileKeys, left, right)
	}
	keys := make([]fileInterval, 0, 2*levelMetadata.Len())
	var oldToNewMap []int
	// We can avoid the sortAndSweep step on the combined length of
	// s.orderedIntervals and fileKeys by treating this as a merge of two
	// sorted runs, fileKeys and s.orderedIntervals, into `keys` which will
	// form newVal.orderedIntervals.
	keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp)
	if invariants.Enabled {
		for i := 1; i < len(keys); i++ {
			if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 {
				panic("keys not sorted correctly")
			}
		}
	}
	newVal.orderedIntervals = keys
	// Update indices in s.orderedIntervals for fileIntervals we retained.
	for _, newIdx := range oldToNewMap {
		newInterval := &keys[newIdx]
		newInterval.index = newIdx
		// This code, and related code in the for loop below, adjusts
		// files{Min,Max}IntervalIndex just for interval indices shifting due
		// to new intervals, and not for any of the new files being added to
		// the same intervals. The goal is to produce a state of the system
		// that's accurate for all existing files, and has all the new
		// intervals to support new files. Once that's done, we can just call
		// addFileToSublevels to adjust all relevant intervals for new files.
		newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex]
		// maxIntervalIndexes are special. Since it's an inclusive end bound,
		// we actually have to map it to the _next_ old interval's new
		// previous interval. This logic is easier to understand if you see
		// [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex,
		// f.maxIntervalIndex+1). The other case to remember is when the
		// interval is completely empty (i.e. len(newInterval.files) == 0); in
		// that case we want to refer back to ourselves regardless of
		// additions to the right of us.
		if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 {
			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1
		} else {
			// newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1.
			newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex]
		}
	}
	// Loop through all instances of new intervals added between two old
	// intervals and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of
	// new intervals to reflect that of adjacent old intervals.
	{
		// We can skip cases where new intervals were added to the left of all
		// existing intervals (e.g. if the first entry in oldToNewMap is
		// oldToNewMap[0] >= 1). Those intervals will only contain newly added
		// files and will have their parameters adjusted down in
		// addFileToSublevels. The same can also be said about new intervals
		// that are to the right of all existing intervals.
		lastIdx := 0
		for _, newIdx := range oldToNewMap {
			for i := lastIdx + 1; i < newIdx; i++ {
				minIntervalIndex := i
				maxIntervalIndex := i
				if keys[lastIdx].filesMaxIntervalIndex != lastIdx {
					// The last old interval has files extending into keys[i].
					minIntervalIndex = keys[lastIdx].filesMinIntervalIndex
					maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex
				}

				keys[i].filesMinIntervalIndex = minIntervalIndex
				keys[i].filesMaxIntervalIndex = maxIntervalIndex
			}
			lastIdx = newIdx
		}
	}
	// Go through old files and update interval indices.
	//
	// TODO(bilal): This is the only place in this method where we loop
	// through all existing files, which could be much more in number than
	// newly added files. See if we can avoid the need for this, either by
	// getting rid of f.minIntervalIndex and f.maxIntervalIndex and
	// calculating them on the fly with a binary search, or by only looping
	// through files to the right of the first interval touched by this
	// method.
	for sublevel := range s.Levels {
		s.Levels[sublevel].Each(func(f *FileMetadata) {
			oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
			oldMinIntervalIndex := f.minIntervalIndex
			f.minIntervalIndex = oldToNewMap[f.minIntervalIndex]
			// maxIntervalIndex is special. Since it's an inclusive end bound,
			// we actually have to map it to the _next_ old interval's new
			// previous interval. This logic is easier to understand if you
			// see [f.minIntervalIndex, f.maxIntervalIndex] as
			// [f.minIntervalIndex, f.maxIntervalIndex+1).
			f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1
			newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1
			// Recalculate estimatedBytes for all old files across new
			// intervals, but only if new intervals were added in between.
			if oldIntervalDelta != newIntervalDelta {
				// j is incremented so that oldToNewMap[j] points to the next
				// old interval. This is used to distinguish between old
				// intervals (i.e. ones where we need to subtract
				// f.Size/oldIntervalDelta) from new ones (where we don't
				// need to subtract). In both cases we need to add
				// f.Size/newIntervalDelta.
				j := oldMinIntervalIndex
				for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
					if oldToNewMap[j] == i {
						newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta)
						j++
					}
					newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta)
				}
			}
		})
	}
	updatedSublevels := make([]int, 0)
	// Update interval indices for new files.
	for i, f := range files {
		f.L0Index = s.levelMetadata.Len() + i
		if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil {
			return nil, err
		}
		updatedSublevels = append(updatedSublevels, f.SubLevel)
	}

	// Sort and deduplicate updatedSublevels.
	sort.Ints(updatedSublevels)
	{
		j := 0
		for i := 1; i < len(updatedSublevels); i++ {
			if updatedSublevels[i] != updatedSublevels[j] {
				j++
				updatedSublevels[j] = updatedSublevels[i]
			}
		}
		updatedSublevels = updatedSublevels[:j+1]
	}

	// Sort each updated sublevel in increasing key order.
	for _, sublevel := range updatedSublevels {
		sort.Sort(sublevelSorter(newVal.levelFiles[sublevel]))
	}

	// Construct a parallel slice of sublevel B-Trees.
	// TODO(jackson): Consolidate and only use the B-Trees.
	for _, sublevel := range updatedSublevels {
		tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
		if sublevel == len(newVal.Levels) {
			newVal.Levels = append(newVal.Levels, ls)
		} else {
			// sublevel < len(s.Levels). If this panics, updatedSublevels was
			// not populated correctly.
			newVal.Levels[sublevel] = ls
		}
		tr.Release()
	}

	newVal.flushSplitUserKeys = nil
	newVal.calculateFlushSplitKeys(flushSplitMaxBytes)
	return newVal, nil
}

// addFileToSublevels is called during L0Sublevels generation, and adds f to
// the correct sublevel's levelFiles, the relevant intervals' files slices,
// and sets interval indices on f. This method, if called successively on
// multiple files, _must_ be called on successively newer files (by seqnum).
// If checkInvariant is true, it could check for this in some cases and
// return [errInvalidL0SublevelsOpt] if that invariant isn't held.
func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error {
	// This is a simple and not very accurate estimate of the number of
	// bytes this SSTable contributes to the intervals it is a part of.
	//
	// TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval
	// bounds to get a better estimate for each interval.
	interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1)
	s.fileBytes += f.Size
	subLevel := 0
	// Update state in every fileInterval for this file.
	for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
		interval := &s.orderedIntervals[i]
		if len(interval.files) > 0 {
			if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum {
				// We are sliding this file "underneath" an existing file.
				// Throw away and start over in NewL0Sublevels.
				return errInvalidL0SublevelsOpt
			}
			// interval.files is sorted by sublevels, from lowest to highest.
			// AddL0Files can only add files at sublevels higher than existing
			// files in the same key intervals.
			if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel {
				subLevel = maxSublevel + 1
			}
		}
		interval.estimatedBytes += interpolatedBytes
		if f.minIntervalIndex < interval.filesMinIntervalIndex {
			interval.filesMinIntervalIndex = f.minIntervalIndex
		}
		if f.maxIntervalIndex > interval.filesMaxIntervalIndex {
			interval.filesMaxIntervalIndex = f.maxIntervalIndex
		}
		interval.files = append(interval.files, f)
	}
	f.SubLevel = subLevel
	if subLevel > len(s.levelFiles) {
		return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles))
	}
	if subLevel == len(s.levelFiles) {
		s.levelFiles = append(s.levelFiles, []*FileMetadata{f})
	} else {
		s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f)
	}
	return nil
}

func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) {
	var cumulativeBytes uint64
	// Multiply flushSplitMaxBytes by the number of sublevels. This prevents
	// excessive flush splitting when the number of sublevels increases.
	flushSplitMaxBytes *= int64(len(s.levelFiles))
	for i := 0; i < len(s.orderedIntervals); i++ {
		interval := &s.orderedIntervals[i]
		if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) &&
			(len(s.flushSplitUserKeys) == 0 ||
				!bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) {
			s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key)
			cumulativeBytes = 0
		}
		cumulativeBytes += s.orderedIntervals[i].estimatedBytes
	}
}
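
// exampleFlushSplitBudget is a small illustrative sketch that mirrors the
// accumulation rule used by calculateFlushSplitKeys above, over plain byte
// counts rather than L0Sublevels state: a split is emitted before interval i
// whenever the running byte total exceeds the budget, and the total then
// resets to zero. It is not used by the package.
//
//lint:ignore U1000 - illustrative example only
func exampleFlushSplitBudget(intervalBytes []uint64, budget uint64) (splitsBefore []int) {
	var cumulative uint64
	for i, b := range intervalBytes {
		if budget > 0 && cumulative > budget {
			splitsBefore = append(splitsBefore, i)
			cumulative = 0
		}
		cumulative += b
	}
	return splitsBefore
}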
// InitCompactingFileInfo initializes internal flags relating to compacting
// files. Must be called after sublevel initialization.
//
// Requires DB.mu *and* the manifest lock to be held.
func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) {
	for i := range s.orderedIntervals {
		s.orderedIntervals[i].compactingFileCount = 0
		s.orderedIntervals[i].isBaseCompacting = false
		s.orderedIntervals[i].intervalRangeIsBaseCompacting = false
	}

	iter := s.levelMetadata.Iter()
	for f := iter.First(); f != nil; f = iter.Next() {
		if invariants.Enabled {
			if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) {
				panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
					s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey)))
			}
			if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) {
				panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s",
					s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Largest.UserKey)))
			}
		}
		if !f.IsCompacting() {
			continue
		}
		if invariants.Enabled {
			if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 {
				panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum,
					s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key,
					f.Smallest.UserKey, f.Largest.UserKey))
			}
		}
		for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
			interval := &s.orderedIntervals[i]
			interval.compactingFileCount++
			if !f.IsIntraL0Compacting {
				// If f.Compacting && !f.IsIntraL0Compacting, this file is
				// being compacted to Lbase.
				interval.isBaseCompacting = true
			}
		}
	}

	// Some intervals may be base compacting without the files contained
	// within those intervals being marked as compacting. This is possible if
	// the files were added after the compaction initiated, and the active
	// compaction files straddle the input file. Mark these intervals as base
	// compacting.
	for _, c := range inProgress {
		startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false}
		endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()}
		start, _ := slices.BinarySearchFunc(s.orderedIntervals, startIK, func(a fileInterval, b intervalKey) int {
			return intervalKeyCompare(s.cmp, a.startKey, b)
		})
		end, _ := slices.BinarySearchFunc(s.orderedIntervals, endIK, func(a fileInterval, b intervalKey) int {
			return intervalKeyCompare(s.cmp, a.startKey, b)
		})
		for i := start; i < end && i < len(s.orderedIntervals); i++ {
			interval := &s.orderedIntervals[i]
			if !c.IsIntraL0 {
				interval.isBaseCompacting = true
			}
		}
	}

	min := 0
	for i := range s.orderedIntervals {
		interval := &s.orderedIntervals[i]
		if interval.isBaseCompacting {
			minIndex := interval.filesMinIntervalIndex
			if minIndex < min {
				minIndex = min
			}
			for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ {
				min = j
				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
			}
		}
	}
}
// String produces a string containing useful debug information. Useful in
// test code and debugging.
func (s *L0Sublevels) String() string {
	return s.describe(false)
}

func (s *L0Sublevels) describe(verbose bool) string {
	var buf strings.Builder
	fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [",
		s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys))
	for i := range s.flushSplitUserKeys {
		fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i]))
		if i < len(s.flushSplitUserKeys)-1 {
			fmt.Fprintf(&buf, ", ")
		}
	}
	fmt.Fprintln(&buf, "]")
	numCompactingFiles := 0
	for i := len(s.levelFiles) - 1; i >= 0; i-- {
		maxIntervals := 0
		sumIntervals := 0
		var totalBytes uint64
		for _, f := range s.levelFiles[i] {
			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
			if intervals > maxIntervals {
				maxIntervals = intervals
			}
			sumIntervals += intervals
			totalBytes += f.Size
			if f.IsCompacting() {
				numCompactingFiles++
			}
		}
		fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n",
			i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex,
			s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex)
		for _, f := range s.levelFiles[i] {
			intervals := f.maxIntervalIndex - f.minIntervalIndex + 1
			if verbose {
				fmt.Fprintf(&buf, "\t%s\n", f)
			}
			if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) {
				var intervalsBytes uint64
				for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ {
					intervalsBytes += s.orderedIntervals[k].estimatedBytes
				}
				fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n",
					f.FileNum, f.minIntervalIndex, f.maxIntervalIndex,
					float64(intervalsBytes)/float64(s.fileBytes))
			}
		}
	}

	lastCompactingIntervalStart := -1
	fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles)
	i := 0
	foundBaseCompactingIntervals := false
	for ; i < len(s.orderedIntervals); i++ {
		interval := &s.orderedIntervals[i]
		if len(interval.files) == 0 {
			continue
		}
		if !interval.isBaseCompacting {
			if lastCompactingIntervalStart != -1 {
				if foundBaseCompactingIntervals {
					buf.WriteString(", ")
				}
				fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
				foundBaseCompactingIntervals = true
			}
			lastCompactingIntervalStart = -1
		} else {
			if lastCompactingIntervalStart == -1 {
				lastCompactingIntervalStart = i
			}
		}
	}
	if lastCompactingIntervalStart != -1 {
		if foundBaseCompactingIntervals {
			buf.WriteString(", ")
		}
		fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1)
	} else if !foundBaseCompactingIntervals {
		fmt.Fprintf(&buf, "none")
	}
	fmt.Fprintln(&buf, "")
	return buf.String()
}
// ReadAmplification returns the contribution of L0Sublevels to the read
// amplification for any particular point key. It is the maximum height of
// any tracked fileInterval. This is always less than or equal to the number
// of sublevels.
func (s *L0Sublevels) ReadAmplification() int {
	amp := 0
	for i := range s.orderedIntervals {
		interval := &s.orderedIntervals[i]
		fileCount := len(interval.files)
		if amp < fileCount {
			amp = fileCount
		}
	}
	return amp
}

// UserKeyRange encodes a key range in user key space. A UserKeyRange's Start
// and End boundaries are both inclusive.
type UserKeyRange struct {
	Start, End []byte
}

// InUseKeyRanges returns the merged table bounds of L0 files overlapping the
// provided user key range. The returned key ranges are sorted and
// nonoverlapping.
func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange {
	// Binary search to find the provided keys within the intervals.
	startIK := intervalKey{key: smallest, isLargest: false}
	endIK := intervalKey{key: largest, isLargest: true}
	start := sort.Search(len(s.orderedIntervals), func(i int) bool {
		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0
	})
	if start > 0 {
		// Back up to the first interval with a start key <= startIK.
		start--
	}
	end := sort.Search(len(s.orderedIntervals), func(i int) bool {
		return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0
	})

	var keyRanges []UserKeyRange
	var curr *UserKeyRange
	for i := start; i < end; {
		// Intervals with no files are not in use and can be skipped, once we
		// end the current UserKeyRange.
		if len(s.orderedIntervals[i].files) == 0 {
			curr = nil
			i++
			continue
		}

		// If curr is nil, start a new in-use key range.
		if curr == nil {
			keyRanges = append(keyRanges, UserKeyRange{
				Start: s.orderedIntervals[i].startKey.key,
			})
			curr = &keyRanges[len(keyRanges)-1]
		}

		// If the filesMaxIntervalIndex is not the current index, we can jump
		// to the max index, knowing that all intermediary intervals are
		// overlapped by some file.
		if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i {
			// Note that end may be less than or equal to maxIdx if we're
			// concerned with a key range that ends before the interval at
			// maxIdx starts. We must set curr.End now, before making that
			// leap, because this iteration may be the last.
			i = maxIdx
			curr.End = s.orderedIntervals[i+1].startKey.key
			continue
		}

		// No files overlapping with this interval overlap with the next
		// interval. Update the current end to be the next interval's start
		// key. Note that curr is not necessarily finished, because there may
		// be an abutting non-empty interval.
		curr.End = s.orderedIntervals[i+1].startKey.key
		i++
	}
	return keyRanges
}

// FlushSplitKeys returns a slice of user keys to split flushes at. Used by
// flushes to avoid writing sstables that straddle these split keys. These
// should be interpreted as the keys to start the next sstable (not the last
// key to include in the prev sstable). These are user keys so that range
// tombstones can be properly truncated (untruncated range tombstones are not
// permitted for L0 files).
func (s *L0Sublevels) FlushSplitKeys() [][]byte {
	return s.flushSplitUserKeys
}

// MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of
// sublevels after all ongoing compactions run to completion. Used by the
// compaction picker to decide the compaction score for L0. There is no
// scoring for intra-L0 compactions -- they only run if the L0 score is high
// but we're unable to pick an L0 -> Lbase compaction.
func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int {
	depth := 0
	for i := range s.orderedIntervals {
		interval := &s.orderedIntervals[i]
		intervalDepth := len(interval.files) - interval.compactingFileCount
		if depth < intervalDepth {
			depth = intervalDepth
		}
	}
	return depth
}

// Only for temporary debugging in the absence of proper tests.
//
// TODO(bilal): Simplify away the debugging statements in this method, and
// make this a pure sanity checker.
//
//lint:ignore U1000 - useful for debugging
func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error {
	includedFiles := newBitSet(s.levelMetadata.Len())
	fileIntervalsByLevel := make([]struct {
		min int
		max int
	}, len(s.levelFiles))
	for i := range fileIntervalsByLevel {
		fileIntervalsByLevel[i].min = math.MaxInt32
		fileIntervalsByLevel[i].max = 0
	}
	var topLevel int
	var increment int
	var limitReached func(int) bool
	if c.isIntraL0 {
		topLevel = len(s.levelFiles) - 1
		increment = +1
		limitReached = func(level int) bool {
			return level == len(s.levelFiles)
		}
	} else {
		topLevel = 0
		increment = -1
		limitReached = func(level int) bool {
			return level < 0
		}
	}
	for _, f := range c.Files {
		if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex {
			fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex
		}
		if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex {
			fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex
		}
		includedFiles.markBit(f.L0Index)
		if c.isIntraL0 {
			if topLevel > f.SubLevel {
				topLevel = f.SubLevel
			}
		} else {
			if topLevel < f.SubLevel {
				topLevel = f.SubLevel
			}
		}
	}
	min := fileIntervalsByLevel[topLevel].min
	max := fileIntervalsByLevel[topLevel].max
	for level := topLevel; !limitReached(level); level += increment {
		if fileIntervalsByLevel[level].min < min {
			min = fileIntervalsByLevel[level].min
		}
		if fileIntervalsByLevel[level].max > max {
			max = fileIntervalsByLevel[level].max
		}
		index, _ := slices.BinarySearchFunc(s.levelFiles[level], min, func(a *FileMetadata, b int) int {
			return stdcmp.Compare(a.maxIntervalIndex, b)
		})
		// start := index
		for ; index < len(s.levelFiles[level]); index++ {
			f := s.levelFiles[level][index]
			if f.minIntervalIndex > max {
				break
			}
			if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum {
				return errors.Errorf(
					"sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d",
					f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum,
					f.LargestSeqNum)
			}
			if !includedFiles[f.L0Index] {
				var buf strings.Builder
				fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n",
					c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval,
					f.minIntervalIndex, f.maxIntervalIndex,
					f.FileNum, f.IsCompacting(), s)
				fmt.Fprintf(&buf, "files included:\n")
				for _, f := range c.Files {
					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
				}
				fmt.Fprintf(&buf, "files added:\n")
				for _, f := range c.filesAdded {
					fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n",
						f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex)
				}
				return errors.New(buf.String())
			}
		}
	}
	return nil
}

// UpdateStateForStartedCompaction updates internal L0Sublevels state for a
// recently started compaction. isBase specifies if this is a base compaction;
// if false, this is assumed to be an intra-L0 compaction. The specified
// compaction must be involving L0 SSTables. It's assumed that the Compacting
// and IsIntraL0Compacting fields are already set on all [FileMetadata]s
// passed in.
func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error {
	minIntervalIndex := -1
	maxIntervalIndex := 0
	for i := range inputs {
		iter := inputs[i].Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ {
				interval := &s.orderedIntervals[i]
				interval.compactingFileCount++
			}
			if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 {
				minIntervalIndex = f.minIntervalIndex
			}
			if f.maxIntervalIndex > maxIntervalIndex {
				maxIntervalIndex = f.maxIntervalIndex
			}
		}
	}
	if isBase {
		for i := minIntervalIndex; i <= maxIntervalIndex; i++ {
			interval := &s.orderedIntervals[i]
			interval.isBaseCompacting = isBase
			for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ {
				s.orderedIntervals[j].intervalRangeIsBaseCompacting = true
			}
		}
	}
	return nil
}

// L0CompactionFiles represents a candidate set of L0 files for compaction.
// Also referred to as "lcf". Contains state information useful for
// generating the compaction (such as Files), as well as for picking between
// candidate compactions (e.g. fileBytes and
// seedIntervalStackDepthReduction).
type L0CompactionFiles struct {
	Files []*FileMetadata

	FilesIncluded bitSet
	// A "seed interval" is an interval with a high stack depth that was
	// chosen to bootstrap this compaction candidate.
	// seedIntervalStackDepthReduction is the number of sublevels that have a
	// file in the seed interval that is a part of this compaction.
	seedIntervalStackDepthReduction int
	// For base compactions, seedIntervalMinLevel is 0, and for intra-L0
	// compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest
	// sublevel.
	seedIntervalMinLevel int
	seedIntervalMaxLevel int
	// Index of the seed interval.
	seedInterval int
	// Sum of file sizes for all files in this compaction.
	fileBytes uint64
	// Intervals with index [minIntervalIndex, maxIntervalIndex] are
	// participating in this compaction; it's the union set of all intervals
	// overlapped by participating files.
	minIntervalIndex int
	maxIntervalIndex int

	// Set for intra-L0 compactions. SSTables with sequence numbers greater
	// than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions.
	isIntraL0               bool
	earliestUnflushedSeqNum uint64

	// For debugging purposes only. Used in checkCompaction().
	preExtensionMinInterval int
	preExtensionMaxInterval int
	filesAdded              []*FileMetadata
}

// Clone allocates a new L0CompactionFiles, with the same underlying data.
// Note that the two FileMetadata slices contain values that point to the
// same underlying FileMetadata objects. This is safe because these objects
// are read-only.
func (l *L0CompactionFiles) Clone() *L0CompactionFiles {
	oldLcf := *l
	return &oldLcf
}

// String merely prints the starting address of the first file, if it exists.
func (l *L0CompactionFiles) String() string {
	if len(l.Files) > 0 {
		return fmt.Sprintf("First File Address: %p", &l.Files[0])
	}
	return ""
}

// addFile adds the specified file to the LCF.
func (l *L0CompactionFiles) addFile(f *FileMetadata) {
	if l.FilesIncluded[f.L0Index] {
		return
	}
	l.FilesIncluded.markBit(f.L0Index)
	l.Files = append(l.Files, f)
	l.filesAdded = append(l.filesAdded, f)
	l.fileBytes += f.Size
	if f.minIntervalIndex < l.minIntervalIndex {
		l.minIntervalIndex = f.minIntervalIndex
	}
	if f.maxIntervalIndex > l.maxIntervalIndex {
		l.maxIntervalIndex = f.maxIntervalIndex
	}
}
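
// exampleAddFile is a small illustrative sketch of addFile's dedup behavior:
// adding the same file twice is a no-op because FilesIncluded tracks files
// by L0Index. The file and LCF here are hypothetical values constructed for
// illustration; the function is not used by the package.
//
//lint:ignore U1000 - illustrative example only
func exampleAddFile() *L0CompactionFiles {
	f := &FileMetadata{Size: 10}
	f.L0Index = 0
	lcf := &L0CompactionFiles{
		FilesIncluded:    newBitSet(1),
		minIntervalIndex: math.MaxInt32,
	}
	lcf.addFile(f)
	lcf.addFile(f) // no-op: f.L0Index is already marked in FilesIncluded
	// lcf.fileBytes == 10 and len(lcf.Files) == 1.
	return lcf
}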
// Helper to order intervals being considered for compaction.
type intervalAndScore struct {
	interval int
	score    int
}
type intervalSorterByDecreasingScore []intervalAndScore

func (is intervalSorterByDecreasingScore) Len() int { return len(is) }
func (is intervalSorterByDecreasingScore) Less(i, j int) bool {
	return is[i].score > is[j].score
}
func (is intervalSorterByDecreasingScore) Swap(i, j int) {
	is[i], is[j] = is[j], is[i]
}

// Compactions:
//
// The sub-levels and intervals can be visualized in 2 dimensions as the X
// axis containing intervals in increasing order and the Y axis containing
// sub-levels (older to younger). The intervals can be sparse with respect to
// sub-levels. We observe that the system is typically under severe pressure
// in L0 during large numbers of ingestions where most files added to L0 are
// narrow and non-overlapping.
//
// L0.1     d---g
// L0.0  c--e  g--j o--s u--x
//
// As opposed to a case with a lot of wide, overlapping L0 files:
//
// L0.3      d-----------r
// L0.2  c--------o
// L0.1    b-----------q
// L0.0  a----------------x
//
// In that case we expect the rectangle represented in the good visualization
// above (i.e. the first one) to be wide and short, and not too sparse (most
// intervals will have fileCount close to the sub-level count), which would
// make it amenable to concurrent L0 -> Lbase compactions.
//
// L0 -> Lbase: The high-level goal of an L0 -> Lbase compaction is to reduce
// stack depth, by compacting files in the intervals with the highest
// (fileCount - compactingCount). Additionally, we would like compactions to
// not involve a huge number of files, so that they finish quickly, and to
// allow for concurrent L0 -> Lbase compactions when needed. In order to
// achieve these goals we would like compactions to visualize as capturing
// thin and tall rectangles. The approach below is to consider intervals in
// some order and then try to construct a compaction using the interval. The
// first interval we can construct a compaction for is the compaction that is
// started. There can be multiple heuristics in choosing the ordering of the
// intervals -- the code uses one heuristic that worked well for a large
// ingestion stemming from a cockroachdb import, but additional
// experimentation is necessary to pick a general heuristic. Additionally,
// the compaction that gets picked may not be as desirable as one that could
// be constructed later in terms of reducing stack depth (since adding more
// files to the compaction can get blocked by needing to encompass files that
// are already being compacted). So an alternative would be to try to
// construct more than one compaction and pick the best one.
//
// Here's a visualization of an ideal L0->LBase compaction selection:
//
// L0.3  a--d    g-j
// L0.2         f--j           r-t
// L0.1   b-d  e---j
// L0.0  a--d   f--j  l--o  p-----x
//
// Lbase a--------i       m---------w
//
// The [g,j] interval has the highest stack depth, so it would have the
// highest priority for selecting a base compaction candidate. Assuming none
// of the files are already compacting, this is the compaction that will be
// chosen:
//
//        _______
// L0.3  a--d   | g-j|
// L0.2         | f--j|        r-t
// L0.1   b-d   |e---j|
// L0.0  a--d   | f--j|  l--o  p-----x
//
// Lbase a--------i       m---------w
//
// Note that running this compaction will mark the a--i file in Lbase as
// compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds
// of that base file, it'll expand the compaction to also include all L0
// files in the a-d interval. The resultant compaction would then be:
//
//       _____________
// L0.3 |a--d    g-j |
// L0.2 |        f--j|         r-t
// L0.1 | b-d   e---j|
// L0.0 |a--d    f--j|  l--o  p-----x
//
// Lbase a--------i       m---------w
//
// The next best interval for base compaction would therefore be the one
// including r--t in L0.2 and p--x in L0.0, and both this compaction and the
// one picked earlier can run in parallel. This is assuming
// minCompactionDepth >= 2, otherwise the second compaction has too little
// depth to pick.
//
//       _____________
// L0.3 |a--d    g-j |         _________
// L0.2 |        f--j|        |  r-t   |
// L0.1 | b-d   e---j|        |        |
// L0.0 |a--d    f--j|  l--o  |p-----x |
//
// Lbase a--------i       m---------w
//
// Note that when ExtendL0ForBaseCompactionTo is called, the compaction
// expands to the following, given that the [l,o] file can be added without
// including additional files in Lbase:
//
//       _____________
// L0.3 |a--d    g-j |         _________
// L0.2 |        f--j|        |  r-t   |
// L0.1 | b-d   e---j|________|        |
// L0.0 |a--d    f--j||l--o    p-----x |
//
// Lbase a--------i       m---------w
//
// If an additional file existed in LBase that overlapped with [l,o], it
// would be excluded from the compaction. Concretely:
//
//       _____________
// L0.3 |a--d    g-j |         _________
// L0.2 |        f--j|        |  r-t   |
// L0.1 | b-d   e---j|        |        |
// L0.0 |a--d    f--j|  l--o  |p-----x |
//
// Lbase a--------ij--lm---------w
//
// Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to
// pick a compaction, PickIntraL0Compaction will be used to pick an
// intra-L0 compaction. Similar to L0 -> Lbase compactions, we want to allow
// for multiple intra-L0 compactions and not generate wide output files that
// hinder later concurrency of L0 -> Lbase compactions. Also, compactions
// that produce wide files don't reduce stack depth -- they represent wide
// rectangles in our visualization, which means many intervals have their
// depth reduced by a small amount. Typically, L0 files have non-overlapping
// sequence numbers, and sticking to that invariant would require us to
// consider intra-L0 compactions that proceed from youngest to oldest files,
// which could result in the aforementioned undesirable wide rectangle shape.
// But this non-overlapping sequence number invariant is already relaxed in
// RocksDB -- sstables are primarily ordered by their largest sequence
// number. So we can arrange for intra-L0 compactions to capture thin and
// tall rectangles starting with the top of the stack (youngest files). Like
// the L0 -> Lbase case we order the intervals using a heuristic and consider
// each in turn. The same comment about better L0 -> Lbase heuristics and not
// being greedy applies here.
//
// Going back to a modified version of our example from earlier, let's say
// these are the base compactions in progress:
//
//        _______
// L0.3  a--d   | g-j|         _________
// L0.2         | f--j|       |  r-t   |
// L0.1   b-d   |e---j|       |        |
// L0.0  a--d   | f--j|  l--o |p-----x |
//
// Lbase a---------i      m---------w
//
// Since both LBase files are compacting, the only L0 compaction that can be
// picked is an intra-L0 compaction. For this, the b--d interval has the
// highest stack depth (3), and starting with a--d in L0.3 as the seed file,
// we can iterate downward and build this compaction, assuming all files in
// that interval are not compacting and have a highest sequence number less
// than earliestUnflushedSeqNum:
//
//        _______
// L0.3 |a--d|  | g-j|         _________
// L0.2 |    |  | f--j|       |  r-t   |
// L0.1 | b-d|  |e---j|       |        |
// L0.0 |a--d|  | f--j|  l--o |p-----x |
// ------
// Lbase a---------i      m---------w

// PickBaseCompaction picks a base compaction based on the above specified
// heuristics, for the specified Lbase files and a minimum depth of
// overlapping files that can be selected for compaction. Returns nil if no
// compaction is possible.
func (s *L0Sublevels) PickBaseCompaction(
	minCompactionDepth int, baseFiles LevelSlice,
) (*L0CompactionFiles, error) {
	// For LBase compactions, we consider intervals in a greedy manner in the
	// following order:
	// - Intervals that are unlikely to be blocked due to ongoing L0 -> Lbase
	//   compactions. These are the ones with !isBaseCompacting &&
	//   !intervalRangeIsBaseCompacting.
	// - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting.
	//
	// The ordering heuristic exists just to avoid wasted work. Ideally, we
	// would consider all intervals with isBaseCompacting = false, construct
	// a compaction for each, compare the constructed compactions, and pick
	// the best one. If microbenchmarks show that we can afford this cost we
	// can eliminate this heuristic.
1389 scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals))
1390 sublevelCount := len(s.levelFiles)
1391 for i := range s.orderedIntervals {
1392 interval := &s.orderedIntervals[i]
1393 depth := len(interval.files) - interval.compactingFileCount
1394 if interval.isBaseCompacting || minCompactionDepth > depth {
1395 continue
1396 }
1397 if interval.intervalRangeIsBaseCompacting {
1398 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth})
1399 } else {
1400 // Prioritize this interval by incrementing the score by the number
1401 // of sublevels.
1402 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount})
1403 }
1404 }
1405 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
1406
1407 // Optimization to avoid considering different intervals that
1408 // are likely to choose the same seed file. Again this is just
1409 // to reduce wasted work.
1410 consideredIntervals := newBitSet(len(s.orderedIntervals))
1411 for _, scoredInterval := range scoredIntervals {
1412 interval := &s.orderedIntervals[scoredInterval.interval]
1413 if consideredIntervals[interval.index] {
1414 continue
1415 }
1416
1417 // Pick the seed file for the interval as the file
1418 // in the lowest sub-level.
1419 f := interval.files[0]
1420 // Don't bother considering the intervals that are covered by the seed
1421 // file since they are likely nearby. Note that it is possible that
1422 // those intervals have seed files at lower sub-levels so could be
1423 // viable for compaction.
1424 if f == nil {
1425 return nil, errors.New("no seed file found in sublevel intervals")
1426 }
1427 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
1428 if f.IsCompacting() {
1429 if f.IsIntraL0Compacting {
1430 // If we're picking a base compaction and we came across a seed
1431 // file candidate that's being intra-L0 compacted, skip the
1432 // interval instead of erroring out.
1433 continue
1434 }
1435 // We chose a compaction seed file that should not be compacting.
1436 // Usually means the score is not accurately accounting for files
1437 // already compacting, or internal state is inconsistent.
1438 return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum)
1439 }
1440
1441 c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth)
1442 if c != nil {
1443 // Check if the chosen compaction overlaps with any files in Lbase
1444 // that have Compacting = true. If that's the case, this compaction
1445 // cannot be chosen.
1446 baseIter := baseFiles.Iter()
1447 // An interval starting at ImmediateSuccessor(key) can never be the
1448 // first interval of a compaction since no file can start at that
1449 // interval.
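// To illustrate the check below (an added example): if the compaction spans
// intervals [3, 7], the iterator is positioned at the first Lbase file whose
// range reaches interval 3's start key, and the loop stops, roughly, once a
// file begins at or beyond interval 8's start key, which acts as the
// exclusive end bound.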
1450 m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key)
1451
1452 var baseCompacting bool
1453 for ; m != nil && !baseCompacting; m = baseIter.Next() {
1454 cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key)
1455 // The compaction ends at the exclusive bound of c.maxIntervalIndex+1.
1456 if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) {
1457 break
1458 }
1459 baseCompacting = baseCompacting || m.IsCompacting()
1460 }
1461 if baseCompacting {
1462 continue
1463 }
1464 return c, nil
1465 }
1466 }
1467 return nil, nil
1468 }
1469
1470 // Helper function for building an L0 -> Lbase compaction using a seed interval
1471 // and seed file in that seed interval.
1472 func (s *L0Sublevels) baseCompactionUsingSeed(
1473 f *FileMetadata, intervalIndex int, minCompactionDepth int,
1474 ) *L0CompactionFiles {
1475 c := &L0CompactionFiles{
1476 FilesIncluded: newBitSet(s.levelMetadata.Len()),
1477 seedInterval: intervalIndex,
1478 seedIntervalMinLevel: 0,
1479 minIntervalIndex: f.minIntervalIndex,
1480 maxIntervalIndex: f.maxIntervalIndex,
1481 }
1482 c.addFile(f)
1483
1484 // The first iteration of this loop builds the compaction at the seed file's
1485 // sublevel. Future iterations expand on this compaction by stacking more
1486 // files from intervalIndex and repeating. This is an optional activity, so
1487 // when it fails we can fall back to the last successful candidate.
1488 var lastCandidate *L0CompactionFiles
1489 interval := &s.orderedIntervals[intervalIndex]
1490
1491 for i := 0; i < len(interval.files); i++ {
1492 f2 := interval.files[i]
1493 sl := f2.SubLevel
1494 c.seedIntervalStackDepthReduction++
1495 c.seedIntervalMaxLevel = sl
1496 c.addFile(f2)
1497 // The seed file is in the lowest sublevel in the seed interval, but it
1498 // may overlap with other files in even lower sublevels. For correctness
1499 // we need to grow our interval to include those files, and capture all
1500 // files in the next level that fall in this extended interval and so
1501 // on. This can result in a triangular shape like the following where
1502 // again the X axis is the key intervals and the Y axis is oldest to
1503 // youngest. Note that it is not necessary for correctness to fill out
1504 // the shape at the higher sub-levels to make it more rectangular since
1505 // the invariant only requires that younger versions of a key not be
1506 // moved to Lbase while leaving behind older versions.
1507 // -
1508 // ---
1509 // -----
1510 // It may be better for performance to have a more rectangular shape
1511 // since the files being left behind will overlap with the same Lbase
1512 // key range as that of this compaction. But there is also the danger
1513 // that in trying to construct a more rectangular shape we will be
1514 // forced to pull in a file that is already compacting. We expect
1515 // extendCandidateToRectangle to eventually be called on this compaction
1516 // if it's chosen, at which point we would iterate backward and choose
1517 // those files. This logic is similar to compaction.grow for non-L0
1518 // compactions.
1519 done := false
1520 for currLevel := sl - 1; currLevel >= 0; currLevel-- {
1521 if !s.extendFiles(currLevel, math.MaxUint64, c) {
1522 // Failed to extend due to ongoing compaction.
1523 done = true
1524 break
1525 }
1526 }
1527 if done {
1528 break
1529 }
1530 // Observed some compactions using > 1GB from L0 in an import
1531 // experiment.
// Very long-running compactions are not great as they
1532 // reduce concurrency while they run, and take a while to produce
1533 // results, though they're sometimes unavoidable. There is a tradeoff
1534 // here in that adding more depth is more efficient in reducing stack
1535 // depth, but long-running compactions reduce flexibility in what can
1536 // run concurrently in L0 and even Lbase -> Lbase+1. Growth to more than
1537 // 150% of the last candidate compaction's size in bytes (along with a
1538 // total compaction size in excess of 100MB), or a total compaction size
1539 // beyond a hard limit of 500MB, is grounds for rejecting this
1540 // candidate. This lets us prefer slow growths as we add files, while
1541 // still having a hard limit. Note that if this is the first compaction
1542 // candidate to reach a stack depth reduction of minCompactionDepth or
1543 // higher, this candidate will be chosen regardless.
1544 if lastCandidate == nil {
1545 lastCandidate = &L0CompactionFiles{}
1546 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
1547 c.fileBytes > 100<<20 &&
1548 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
1549 break
1550 }
1551 *lastCandidate = *c
1552 }
1553 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
1554 lastCandidate.FilesIncluded.clearAllBits()
1555 for _, f := range lastCandidate.Files {
1556 lastCandidate.FilesIncluded.markBit(f.L0Index)
1557 }
1558 return lastCandidate
1559 }
1560 return nil
1561 }
1562
1563 // Expands fields in the provided L0CompactionFiles instance (cFiles) to
1564 // include overlapping files in the specified sublevel. Returns true if the
1565 // compaction is possible (i.e. does not conflict with any base/intra-L0
1566 // compacting files).
1567 func (s *L0Sublevels) extendFiles(
1568 sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles,
1569 ) bool {
1570 index, _ := slices.BinarySearchFunc(s.levelFiles[sl], cFiles.minIntervalIndex, func(a *FileMetadata, b int) int {
1571 return stdcmp.Compare(a.maxIntervalIndex, b)
1572 })
1573 for ; index < len(s.levelFiles[sl]); index++ {
1574 f := s.levelFiles[sl][index]
1575 if f.minIntervalIndex > cFiles.maxIntervalIndex {
1576 break
1577 }
1578 if f.IsCompacting() {
1579 return false
1580 }
1581 // Skip over files that are newer than earliestUnflushedSeqNum. This is
1582 // okay because this compaction can just pretend these files are not in
1583 // L0 yet. These files must be in higher sublevels than any overlapping
1584 // files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output
1585 // of the compaction will also go in a lower (older) sublevel than this
1586 // file by definition.
1587 if f.LargestSeqNum >= earliestUnflushedSeqNum {
1588 continue
1589 }
1590 cFiles.addFile(f)
1591 }
1592 return true
1593 }
1594
1595 // PickIntraL0Compaction picks an intra-L0 compaction for files in this
1596 // sublevel. This method is only called when a base compaction cannot be chosen.
1597 // See comment above [PickBaseCompaction] for heuristics involved in this
1598 // selection.
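//
// A hedged usage sketch (illustrative only; sublevels, baseFiles, minDepth,
// and earliestUnflushedSeqNum are assumed stand-ins for the caller's state):
//
//	c, err := sublevels.PickBaseCompaction(minDepth, baseFiles)
//	if err == nil && c == nil {
//		// No L0 -> Lbase compaction was possible; try to reduce stack
//		// depth within L0 instead.
//		c, err = sublevels.PickIntraL0Compaction(earliestUnflushedSeqNum, minDepth)
//	}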
1599 func (s *L0Sublevels) PickIntraL0Compaction(
1600 earliestUnflushedSeqNum uint64, minCompactionDepth int,
1601 ) (*L0CompactionFiles, error) {
1602 scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals))
1603 for i := range s.orderedIntervals {
1604 interval := &s.orderedIntervals[i]
1605 depth := len(interval.files) - interval.compactingFileCount
1606 if minCompactionDepth > depth {
1607 continue
1608 }
1609 scoredIntervals[i] = intervalAndScore{interval: i, score: depth}
1610 }
1611 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals))
1612
1613 // Optimization to avoid considering different intervals that are likely to
1614 // choose the same seed file. Again this is just to reduce wasted work.
1615 consideredIntervals := newBitSet(len(s.orderedIntervals))
1616 for _, scoredInterval := range scoredIntervals {
1617 interval := &s.orderedIntervals[scoredInterval.interval]
1618 if consideredIntervals[interval.index] {
1619 continue
1620 }
1621
1622 var f *FileMetadata
1623 // Pick the seed file for the interval as the file in the highest
1624 // sub-level.
1625 stackDepthReduction := scoredInterval.score
1626 for i := len(interval.files) - 1; i >= 0; i-- {
1627 f = interval.files[i]
1628 if f.IsCompacting() {
1629 break
1630 }
1631 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1)
1632 // Can this be the seed file? Files with newer sequence numbers than
1633 // earliestUnflushedSeqNum cannot be in the compaction.
1634 if f.LargestSeqNum >= earliestUnflushedSeqNum {
1635 stackDepthReduction--
1636 if stackDepthReduction == 0 {
1637 break
1638 }
1639 } else {
1640 break
1641 }
1642 }
1643 if stackDepthReduction < minCompactionDepth {
1644 // Can't use this interval.
1645 continue
1646 }
1647
1648 if f == nil {
1649 return nil, errors.New("no seed file found in sublevel intervals")
1650 }
1651 if f.IsCompacting() {
1652 // This file could be in a concurrent intra-L0 or base compaction.
1653 // Try another interval.
1654 continue
1655 }
1656
1657 // We have a seed file. Build a compaction off of that seed.
1658 c := s.intraL0CompactionUsingSeed(
1659 f, interval.index, earliestUnflushedSeqNum, minCompactionDepth)
1660 if c != nil {
1661 return c, nil
1662 }
1663 }
1664 return nil, nil
1665 }
1666
1667 func (s *L0Sublevels) intraL0CompactionUsingSeed(
1668 f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int,
1669 ) *L0CompactionFiles {
1670 // We know that all the files that overlap with intervalIndex have
1671 // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals
1672 // we need to exclude files >= earliestUnflushedSeqNum
1673
1674 c := &L0CompactionFiles{
1675 FilesIncluded: newBitSet(s.levelMetadata.Len()),
1676 seedInterval: intervalIndex,
1677 seedIntervalMaxLevel: len(s.levelFiles) - 1,
1678 minIntervalIndex: f.minIntervalIndex,
1679 maxIntervalIndex: f.maxIntervalIndex,
1680 isIntraL0: true,
1681 earliestUnflushedSeqNum: earliestUnflushedSeqNum,
1682 }
1683 c.addFile(f)
1684
1685 var lastCandidate *L0CompactionFiles
1686 interval := &s.orderedIntervals[intervalIndex]
1687 slIndex := len(interval.files) - 1
1688 for {
1689 if interval.files[slIndex] == f {
1690 break
1691 }
1692 slIndex--
1693 }
1694 // The first iteration of this loop produces an intra-L0 compaction at the
1695 // seed level. Iterations after that optionally add to the compaction by
1696 // stacking more files from intervalIndex and repeating.
// This is an optional
1697 // activity, so when it fails we can fall back to the last successful
1698 // candidate. The code stops adding when it can't add more, or when
1699 // fileBytes grows too large.
1700 for ; slIndex >= 0; slIndex-- {
1701 f2 := interval.files[slIndex]
1702 sl := f2.SubLevel
1703 if f2.IsCompacting() {
1704 break
1705 }
1706 c.seedIntervalStackDepthReduction++
1707 c.seedIntervalMinLevel = sl
1708 c.addFile(f2)
1709 // The seed file captures all files in the higher level that fall in the
1710 // range of intervals. That may extend the range of intervals so for
1711 // correctness we need to capture all files in the next higher level
1712 // that fall in this extended interval and so on. This can result in an
1713 // inverted triangular shape like the following where again the X axis
1714 // is the key intervals and the Y axis is oldest to youngest. Note that
1715 // it is not necessary for correctness to fill out the shape at lower
1716 // sub-levels to make it more rectangular since the invariant only
1717 // requires that if we move an older seqnum for key k into a file that
1718 // has a higher seqnum, we also move all younger seqnums for that key k
1719 // into that file.
1720 // -----
1721 // ---
1722 // -
1723 // It may be better for performance to have a more rectangular shape
1724 // since it will reduce the stack depth for more intervals. But there is
1725 // also the danger that in explicitly trying to construct a more
1726 // rectangular shape we will be forced to pull in a file that is already
1727 // compacting. We assume that the performance concern is not a practical
1728 // issue.
1729 done := false
1730 for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ {
1731 if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) {
1732 // Failed to extend due to ongoing compaction.
1733 done = true
1734 break
1735 }
1736 }
1737 if done {
1738 break
1739 }
1740 if lastCandidate == nil {
1741 lastCandidate = &L0CompactionFiles{}
1742 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth &&
1743 c.fileBytes > 100<<20 &&
1744 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) {
1745 break
1746 }
1747 *lastCandidate = *c
1748 }
1749 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth {
1750 lastCandidate.FilesIncluded.clearAllBits()
1751 for _, f := range lastCandidate.Files {
1752 lastCandidate.FilesIncluded.markBit(f.L0Index)
1753 }
1754 s.extendCandidateToRectangle(
1755 lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false)
1756 return lastCandidate
1757 }
1758 return nil
1759 }
1760
1761 // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate
1762 // L0CompactionFiles to optionally cover more files in L0 without "touching" any
1763 // of the passed-in keys (i.e. the smallest/largest bounds are exclusive), as
1764 // including any user keys for those internal keys could require choosing more
1765 // files in LBase, which is undesirable. Unbounded start/end keys are indicated
1766 // by passing in the InvalidInternalKey.
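//
// For instance, in the visualization near the top of this comment section:
// after the compaction rooted at the g-j interval is picked, calling this
// method with the bounds of the a--i Lbase file lets the candidate grow left
// to include the a-d files, since doing so pulls in no additional Lbase file.
// (This is an added restatement of the earlier example, not new behavior.)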
1767 func (s *L0Sublevels) ExtendL0ForBaseCompactionTo(
1768 smallest, largest InternalKey, candidate *L0CompactionFiles,
1769 ) bool {
1770 firstIntervalIndex := 0
1771 lastIntervalIndex := len(s.orderedIntervals) - 1
1772 if smallest.Kind() != base.InternalKeyKindInvalid {
1773 if smallest.Trailer == base.InternalKeyRangeDeleteSentinel {
1774 // Starting at smallest.UserKey == interval.startKey is okay.
1775 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
1776 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
1777 })
1778 } else {
1779 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
1780 // Need to start at >= smallest since if we widen too much we may miss
1781 // an Lbase file that overlaps with an L0 file that will get picked in
1782 // this widening, which would be bad. This interval will not start with
1783 // an immediate successor key.
1784 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0
1785 })
1786 }
1787 }
1788 if largest.Kind() != base.InternalKeyKindInvalid {
1789 // First interval that starts at or beyond the largest. This interval will not
1790 // start with an immediate successor key.
1791 lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool {
1792 return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0
1793 })
1794 // Right now, lastIntervalIndex has a startKey that extends beyond largest.
1795 // The previous interval, by definition, has an end key higher than largest.
1796 // Iterate back twice to get the last interval that's completely within
1797 // (smallest, largest). Except in the case where we went past the end of the
1798 // list; in that case, the last interval to include is the very last
1799 // interval in the list.
1800 if lastIntervalIndex < len(s.orderedIntervals) {
1801 lastIntervalIndex--
1802 }
1803 lastIntervalIndex--
1804 }
1805 if lastIntervalIndex < firstIntervalIndex {
1806 return false
1807 }
1808 return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true)
1809 }
1810
1811 // Best-effort attempt to make the compaction include more files in the
1812 // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and
1813 // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel.
1814 //
1815 // This is strictly an optional extension; at any point where we can't feasibly
1816 // add more files, the sublevel iteration can be halted early and candidate will
1817 // still be a correct compaction candidate.
1818 //
1819 // Consider this scenario (original candidate is inside the rectangle), with
1820 // isBase = true and interval bounds a-j (from the union of base file bounds and
1821 // that of compaction candidate):
1822 //
1823 // _______
1824 // L0.3 a--d | g-j|
1825 // L0.2 | f--j| r-t
1826 // L0.1 b-d |e---j|
1827 // L0.0 a--d | f--j| l--o p-----x
1828 //
1829 // Lbase a--------i m---------w
1830 //
1831 // This method will iterate from the bottom up. At L0.0, it will add a--d since
1832 // it's in the bounds, then add b-d, then a--d, and so on, to produce this:
1833 //
1834 // _____________
1835 // L0.3 |a--d g-j|
1836 // L0.2 | f--j| r-t
1837 // L0.1 | b-d e---j|
1838 // L0.0 |a--d f--j| l--o p-----x
1839 //
1840 // Lbase a-------i m---------w
1841 //
1842 // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b,
1843 // bb-c, and cc-d, of which bb-c is compacting.
// Let's also add another sublevel
1844 // L0.4 with some files, none of which are compacting:
1845 //
1846 // L0.4 a------c ca--d _______
1847 // L0.3 a-b bb-c cc-d | g-j|
1848 // L0.2 | f--j| r-t
1849 // L0.1 b----------d |e---j|
1850 // L0.0 a------------d | f--j| l--o p-----x
1851 //
1852 // Lbase a------------------i m---------w
1853 //
1854 // This method then needs to choose between the left side of L0.3 bb-c (i.e.
1855 // a-b) and the right side (i.e. cc-d and g-j) for inclusion in this compaction.
1856 // Since the right side has more files as well as one file that has already been
1857 // picked, it gets chosen at that sublevel, resulting in this intermediate
1858 // compaction:
1859 //
1860 // L0.4 a------c ca--d
1861 // ______________
1862 // L0.3 a-b bb-c| cc-d g-j|
1863 // L0.2 _________| f--j| r-t
1864 // L0.1 | b----------d e---j|
1865 // L0.0 |a------------d f--j| l--o p-----x
1866 //
1867 // Lbase a------------------i m---------w
1868 //
1869 // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are
1870 // actually ca-j, since ca is the next interval start key after the end interval
1871 // of bb-c. This would result in only ca-d being chosen at that sublevel, even
1872 // though a--c is also not compacting. This is the final result:
1873 //
1874 // ______________
1875 // L0.4 a------c|ca--d |
1876 // L0.3 a-b bb-c| cc-d g-j|
1877 // L0.2 _________| f--j| r-t
1878 // L0.1 | b----------d e---j|
1879 // L0.0 |a------------d f--j| l--o p-----x
1880 //
1881 // Lbase a------------------i m---------w
1882 //
1883 // TODO(bilal): Add more targeted tests for this method, through
1884 // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed.
1885 func (s *L0Sublevels) extendCandidateToRectangle(
1886 minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool,
1887 ) bool {
1888 candidate.preExtensionMinInterval = candidate.minIntervalIndex
1889 candidate.preExtensionMaxInterval = candidate.maxIntervalIndex
1890 // Extend {min,max}IntervalIndex to include all of the candidate's current
1891 // bounds.
1892 if minIntervalIndex > candidate.minIntervalIndex {
1893 minIntervalIndex = candidate.minIntervalIndex
1894 }
1895 if maxIntervalIndex < candidate.maxIntervalIndex {
1896 maxIntervalIndex = candidate.maxIntervalIndex
1897 }
1898 var startLevel, increment, endLevel int
1899 if isBase {
1900 startLevel = 0
1901 increment = +1
1902 // seedIntervalMaxLevel is inclusive, while endLevel is exclusive.
1903 endLevel = candidate.seedIntervalMaxLevel + 1
1904 } else {
1905 startLevel = len(s.levelFiles) - 1
1906 increment = -1
1907 // seedIntervalMinLevel is inclusive, while endLevel is exclusive.
1908 endLevel = candidate.seedIntervalMinLevel - 1
1909 }
1910 // Stats for files added by this method.
1911 addedCount := 0
1912 // Iterate from the oldest sub-level for L0 -> Lbase and youngest sub-level
1913 // for intra-L0. The idea here is that anything that can't be included from
1914 // that level constrains what can be included from the next level. This
1915 // change in constraint is directly incorporated into minIntervalIndex,
1916 // maxIntervalIndex.
1917 for sl := startLevel; sl != endLevel; sl += increment {
1918 files := s.levelFiles[sl]
1919 // Find the first file that overlaps with minIntervalIndex.
1920 index := sort.Search(len(files), func(i int) bool {
1921 return minIntervalIndex <= files[i].maxIntervalIndex
1922 })
1923 // Track the files that are fully within the current constraint of
1924 // [minIntervalIndex, maxIntervalIndex].
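// As a worked example of the narrowing below (indices illustrative only):
// with the current constraint [2, 9], a file spanning intervals [0, 4] cannot
// be included, and it narrows the left edge to minIntervalIndex = 5 for this
// and all later sublevels; similarly, a file spanning [8, 11] narrows the
// right edge to maxIntervalIndex = 7.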
1925 firstIndex := -1
1926 lastIndex := -1
1927 for ; index < len(files); index++ {
1928 f := files[index]
1929 if f.minIntervalIndex > maxIntervalIndex {
1930 break
1931 }
1932 include := true
1933 // Extends out on the left so can't be included. This narrows what
1934 // we can include in the next level.
1935 if f.minIntervalIndex < minIntervalIndex {
1936 include = false
1937 minIntervalIndex = f.maxIntervalIndex + 1
1938 }
1939 // Extends out on the right so can't be included.
1940 if f.maxIntervalIndex > maxIntervalIndex {
1941 include = false
1942 maxIntervalIndex = f.minIntervalIndex - 1
1943 }
1944 if !include {
1945 continue
1946 }
1947 if firstIndex == -1 {
1948 firstIndex = index
1949 }
1950 lastIndex = index
1951 }
1952 if minIntervalIndex > maxIntervalIndex {
1953 // We excluded files that prevent continuation.
1954 break
1955 }
1956 if firstIndex < 0 {
1957 // No files to add in this sub-level.
1958 continue
1959 }
1960 // We have the files in [firstIndex, lastIndex] as potential for
1961 // inclusion. Some of these may already have been picked. Some of them
1962 // may already be compacting. The latter is tricky since we have to
1963 // decide whether to contract minIntervalIndex or maxIntervalIndex when
1964 // we encounter an already compacting file. We pick the longest sequence
1965 // between firstIndex and lastIndex of non-compacting files -- this is
1966 // represented by [candidateNonCompactingFirst,
1967 // candidateNonCompactingLast].
1968 nonCompactingFirst := -1
1969 currentRunHasAlreadyPickedFiles := false
1970 candidateNonCompactingFirst := -1
1971 candidateNonCompactingLast := -1
1972 candidateHasAlreadyPickedFiles := false
1973 for index = firstIndex; index <= lastIndex; index++ {
1974 f := files[index]
1975 if f.IsCompacting() {
1976 if nonCompactingFirst != -1 {
1977 last := index - 1
1978 // Prioritize runs of consecutive non-compacting files that
1979 // have files that have already been picked. That is to say,
1980 // if candidateHasAlreadyPickedFiles == true, we stick with
1981 // it, and if currentRunHasAlreadyPickedFiles == true, we
1982 // pick that run even if it contains fewer files than the
1983 // previous candidate.
1984 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
1985 currentRunHasAlreadyPickedFiles ||
1986 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
1987 candidateNonCompactingFirst = nonCompactingFirst
1988 candidateNonCompactingLast = last
1989 candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles
1990 }
1991 }
1992 nonCompactingFirst = -1
1993 currentRunHasAlreadyPickedFiles = false
1994 continue
1995 }
1996 if nonCompactingFirst == -1 {
1997 nonCompactingFirst = index
1998 }
1999 if candidate.FilesIncluded[f.L0Index] {
2000 currentRunHasAlreadyPickedFiles = true
2001 }
2002 }
2003 // Logic duplicated from inside the for loop above.
2004 if nonCompactingFirst != -1 {
2005 last := index - 1
2006 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 ||
2007 currentRunHasAlreadyPickedFiles ||
2008 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) {
2009 candidateNonCompactingFirst = nonCompactingFirst
2010 candidateNonCompactingLast = last
2011 }
2012 }
2013 if candidateNonCompactingFirst == -1 {
2014 // All files are compacting. There will be gaps that we could
2015 // exploit to continue, but don't bother.
2016 break
2017 }
2018 // May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level.
2019 if candidateNonCompactingFirst > firstIndex {
2020 minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1
2021 }
2022 if candidateNonCompactingLast < lastIndex {
2023 maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1
2024 }
2025 for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ {
2026 f := files[index]
2027 if f.IsCompacting() {
2028 // TODO(bilal): Do a logger.Fatalf instead of a panic, for
2029 // cleaner unwinding and error messages.
2030 panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum))
2031 }
2032 if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum {
2033 continue
2034 }
2035 if !candidate.FilesIncluded[f.L0Index] {
2036 addedCount++
2037 candidate.addFile(f)
2038 }
2039 }
2040 }
2041 return addedCount > 0
2042 }
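
// pickL0CompactionExample is an illustrative sketch, not part of the original
// file: it shows how the two exported pickers above are typically sequenced.
// The function name and the minCompactionDepth constant are assumptions made
// for the example; PickBaseCompaction and PickIntraL0Compaction are the real
// methods defined above.
func pickL0CompactionExample(
	s *L0Sublevels, baseFiles LevelSlice, earliestUnflushedSeqNum uint64,
) (*L0CompactionFiles, error) {
	// A minimum stack depth reduction of 2, chosen purely for illustration.
	const minCompactionDepth = 2
	// First preference: an L0 -> Lbase compaction, which actually moves data
	// out of L0.
	c, err := s.PickBaseCompaction(minCompactionDepth, baseFiles)
	if err != nil || c != nil {
		return c, err
	}
	// Otherwise fall back to an intra-L0 compaction, which only reduces the
	// stack depth; files at or above earliestUnflushedSeqNum are excluded.
	return s.PickIntraL0Compaction(earliestUnflushedSeqNum, minCompactionDepth)
}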