github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/manifest/l0_sublevels.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package manifest 6 7 import ( 8 "bytes" 9 "fmt" 10 "math" 11 "sort" 12 "strings" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/base" 16 "github.com/cockroachdb/pebble/internal/invariants" 17 ) 18 19 // errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental 20 // sublevel generation optimization failed, and NewL0Sublevels must be called. 21 var errInvalidL0SublevelsOpt = errors.New("pebble: L0 sublevel generation optimization cannot be used") 22 23 // Intervals are of the form [start, end) with no gap between intervals. Each 24 // file overlaps perfectly with a sequence of intervals. This perfect overlap 25 // occurs because the union of file boundary keys is used to pick intervals. 26 // However the largest key in a file is inclusive, so when it is used as 27 // an interval, the actual key is ImmediateSuccessor(key). We don't have the 28 // ImmediateSuccessor function to do this computation, so we instead keep an 29 // isLargest bool to remind the code about this fact. This is used for 30 // comparisons in the following manner: 31 // - intervalKey{k, false} < intervalKey{k, true} 32 // - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}. 33 // 34 // Note that the file's largest key is exclusive if the internal key 35 // has a trailer matching the rangedel sentinel key. In this case, we set 36 // isLargest to false for end interval computation. 37 // 38 // For example, consider three files with bounds [a,e], [b,g], and [e,j]. The 39 // interval keys produced would be intervalKey{a, false}, intervalKey{b, false}, 40 // intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and 41 // intervalKey{j, true}, resulting in intervals 42 // [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and 43 // [(g, true), (j, true)). The first file overlaps with the first three 44 // perfectly, the second file overlaps with the second through to fourth 45 // intervals, and the third file overlaps with the last three. 46 // 47 // The intervals are indexed starting from 0, with the index of the interval 48 // being the index of the start key of the interval. 49 // 50 // In addition to helping with compaction picking, we use interval indices 51 // to assign each file an interval range once. Subsequent operations, say 52 // picking overlapping files for a compaction, only need to use the index 53 // numbers and so avoid expensive byte slice comparisons. 54 type intervalKey struct { 55 key []byte 56 isLargest bool 57 } 58 59 // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata 60 // which is used to generate the {min,max}IntervalIndex for files. 61 type intervalKeyTemp struct { 62 intervalKey intervalKey 63 fileMeta *FileMetadata 64 isEndKey bool 65 } 66 67 func (i *intervalKeyTemp) setFileIntervalIndex(idx int) { 68 if i.isEndKey { 69 // This is the right endpoint of some file interval, so the 70 // file.maxIntervalIndex must be j - 1 as maxIntervalIndex is 71 // inclusive. 72 i.fileMeta.maxIntervalIndex = idx - 1 73 return 74 } 75 // This is the left endpoint for some file interval, so the 76 // file.minIntervalIndex must be j. 77 i.fileMeta.minIntervalIndex = idx 78 } 79 80 func intervalKeyCompare(cmp Compare, a, b intervalKey) int { 81 rv := cmp(a.key, b.key) 82 if rv == 0 { 83 if a.isLargest && !b.isLargest { 84 return +1 85 } 86 if !a.isLargest && b.isLargest { 87 return -1 88 } 89 } 90 return rv 91 } 92 93 type intervalKeySorter struct { 94 keys []intervalKeyTemp 95 cmp Compare 96 } 97 98 func (s intervalKeySorter) Len() int { return len(s.keys) } 99 func (s intervalKeySorter) Less(i, j int) bool { 100 return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0 101 } 102 func (s intervalKeySorter) Swap(i, j int) { 103 s.keys[i], s.keys[j] = s.keys[j], s.keys[i] 104 } 105 106 // sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the 107 // duplicate fileIntervals, and set the {min, max}IntervalIndex for the files. 108 func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp { 109 if len(keys) == 0 { 110 return nil 111 } 112 sorter := intervalKeySorter{keys: keys, cmp: cmp} 113 sort.Sort(sorter) 114 115 // intervalKeys are generated using the file bounds. Specifically, there are 116 // 2 intervalKeys for each file, and len(keys) = 2 * number of files. Each 117 // `intervalKeyTemp` stores information about which file it was generated 118 // from, and whether the key represents the end key of the file. So, as 119 // we're deduplicating the `keys` slice, we're guaranteed to iterate over 120 // the interval keys belonging to each of the files. Since the 121 // file.{min,max}IntervalIndex points to the position of the files bounds in 122 // the deduplicated `keys` slice, we can determine 123 // file.{min,max}IntervalIndex during the iteration. 124 i := 0 125 j := 0 126 for i < len(keys) { 127 // loop invariant: j <= i 128 currKey := keys[i] 129 keys[j] = keys[i] 130 131 for { 132 keys[i].setFileIntervalIndex(j) 133 i++ 134 if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 { 135 break 136 } 137 } 138 j++ 139 } 140 return keys[:j] 141 } 142 143 // A key interval of the form [start, end). The end is not represented here 144 // since it is implicit in the start of the next interval. The last interval is 145 // an exception but we don't need to ever lookup the end of that interval; the 146 // last fileInterval will only act as an end key marker. The set of intervals 147 // is const after initialization. 148 type fileInterval struct { 149 index int 150 startKey intervalKey 151 152 // True iff some file in this interval is compacting to base. Such intervals 153 // cannot have any files participate in L0 -> Lbase compactions. 154 isBaseCompacting bool 155 156 // The min and max intervals index across all the files that overlap with 157 // this interval. Inclusive on both sides. 158 filesMinIntervalIndex int 159 filesMaxIntervalIndex int 160 161 // True if another interval that has a file extending into this interval is 162 // undergoing a compaction into Lbase. In other words, this bool is true if 163 // any interval in [filesMinIntervalIndex, filesMaxIntervalIndex] has 164 // isBaseCompacting set to true. This lets the compaction picker 165 // de-prioritize this interval for picking compactions, since there's a high 166 // chance that a base compaction with a sufficient height of sublevels 167 // rooted at this interval could not be chosen due to the ongoing base 168 // compaction in the other interval. If the file straddling the two 169 // intervals is at a sufficiently high sublevel (with enough compactible 170 // files below it to satisfy minCompactionDepth), this is not an issue, but 171 // to optimize for quickly picking base compactions far away from other base 172 // compactions, this bool is used as a heuristic (but not as a complete 173 // disqualifier). 174 intervalRangeIsBaseCompacting bool 175 176 // All files in this interval, in increasing sublevel order. 177 files []*FileMetadata 178 179 // len(files) - compactingFileCount is the stack depth that requires 180 // starting new compactions. This metric is not precise since the 181 // compactingFileCount can include files that are part of N (where N > 1) 182 // intra-L0 compactions, so the stack depth after those complete will be 183 // len(files) - compactingFileCount + N. We ignore this imprecision since we 184 // don't want to track which files are part of which intra-L0 compaction. 185 compactingFileCount int 186 187 // Interpolated from files in this interval. For files spanning multiple 188 // intervals, we assume an equal distribution of bytes across all those 189 // intervals. 190 estimatedBytes uint64 191 } 192 193 // Helper type for any cases requiring a bool slice. 194 type bitSet []bool 195 196 func newBitSet(n int) bitSet { 197 return make([]bool, n) 198 } 199 200 func (b *bitSet) markBit(i int) { 201 (*b)[i] = true 202 } 203 204 func (b *bitSet) markBits(start, end int) { 205 for i := start; i < end; i++ { 206 (*b)[i] = true 207 } 208 } 209 210 func (b *bitSet) clearAllBits() { 211 for i := range *b { 212 (*b)[i] = false 213 } 214 } 215 216 // L0Compaction describes an active compaction with inputs from L0. 217 type L0Compaction struct { 218 Smallest InternalKey 219 Largest InternalKey 220 IsIntraL0 bool 221 } 222 223 // L0Sublevels represents a sublevel view of SSTables in L0. Tables in one 224 // sublevel are non-overlapping in key ranges, and keys in higher-indexed 225 // sublevels shadow older versions in lower-indexed sublevels. These invariants 226 // are similar to the regular level invariants, except with higher indexed 227 // sublevels having newer keys as opposed to lower indexed levels. 228 // 229 // There is no limit to the number of sublevels that can exist in L0 at any 230 // time, however read and compaction performance is best when there are as few 231 // sublevels as possible. 232 type L0Sublevels struct { 233 // Levels are ordered from oldest sublevel to youngest sublevel in the 234 // outer slice, and the inner slice contains non-overlapping files for 235 // that sublevel in increasing key order. Levels is constructed from 236 // levelFiles and is used by callers that require a LevelSlice. The below two 237 // fields are treated as immutable once created in NewL0Sublevels. 238 Levels []LevelSlice 239 levelFiles [][]*FileMetadata 240 241 cmp Compare 242 formatKey base.FormatKey 243 244 fileBytes uint64 245 // All the L0 files, ordered from oldest to youngest. 246 levelMetadata *LevelMetadata 247 248 // The file intervals in increasing key order. 249 orderedIntervals []fileInterval 250 251 // Keys to break flushes at. 252 flushSplitUserKeys [][]byte 253 254 // Only used to check invariants. 255 addL0FilesCalled bool 256 } 257 258 type sublevelSorter []*FileMetadata 259 260 // Len implements sort.Interface. 261 func (sl sublevelSorter) Len() int { 262 return len(sl) 263 } 264 265 // Less implements sort.Interface. 266 func (sl sublevelSorter) Less(i, j int) bool { 267 return sl[i].minIntervalIndex < sl[j].minIntervalIndex 268 } 269 270 // Swap implements sort.Interface. 271 func (sl sublevelSorter) Swap(i, j int) { 272 sl[i], sl[j] = sl[j], sl[i] 273 } 274 275 // NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files. 276 // These files must all be in L0 and must be sorted by seqnum (see 277 // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are 278 // exceeded in the range of intervals since the last flush split key, a flush 279 // split key is added. 280 // 281 // This method can be called without DB.mu being held, so any DB.mu protected 282 // fields in FileMetadata cannot be accessed here, such as Compacting and 283 // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo 284 // instead. 285 func NewL0Sublevels( 286 levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64, 287 ) (*L0Sublevels, error) { 288 s := &L0Sublevels{cmp: cmp, formatKey: formatKey} 289 s.levelMetadata = levelMetadata 290 keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len()) 291 iter := levelMetadata.Iter() 292 for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() { 293 f.L0Index = i 294 keys = append(keys, intervalKeyTemp{ 295 intervalKey: intervalKey{key: f.Smallest.UserKey}, 296 fileMeta: f, 297 isEndKey: false, 298 }) 299 keys = append(keys, intervalKeyTemp{ 300 intervalKey: intervalKey{ 301 key: f.Largest.UserKey, 302 isLargest: !f.Largest.IsExclusiveSentinel(), 303 }, 304 fileMeta: f, 305 isEndKey: true, 306 }) 307 } 308 keys = sortAndSweep(keys, cmp) 309 // All interval indices reference s.orderedIntervals. 310 s.orderedIntervals = make([]fileInterval, len(keys)) 311 for i := range keys { 312 s.orderedIntervals[i] = fileInterval{ 313 index: i, 314 startKey: keys[i].intervalKey, 315 filesMinIntervalIndex: i, 316 filesMaxIntervalIndex: i, 317 } 318 } 319 // Initialize minIntervalIndex and maxIntervalIndex for each file, and use that 320 // to update intervals. 321 for f := iter.First(); f != nil; f = iter.Next() { 322 if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil { 323 return nil, err 324 } 325 } 326 // Sort each sublevel in increasing key order. 327 for i := range s.levelFiles { 328 sort.Sort(sublevelSorter(s.levelFiles[i])) 329 } 330 331 // Construct a parallel slice of sublevel B-Trees. 332 // TODO(jackson): Consolidate and only use the B-Trees. 333 for _, sublevelFiles := range s.levelFiles { 334 tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles) 335 s.Levels = append(s.Levels, ls) 336 tr.Release() 337 } 338 339 s.calculateFlushSplitKeys(flushSplitMaxBytes) 340 return s, nil 341 } 342 343 // Helper function to merge new intervalKeys into an existing slice of old 344 // fileIntervals, into result. Returns the new result and a slice of ints 345 // mapping old interval indices to new ones. The added intervalKeys do not need 346 // to be sorted; they get sorted and deduped in this function. 347 func mergeIntervals( 348 old, result []fileInterval, added []intervalKeyTemp, compare Compare, 349 ) ([]fileInterval, []int) { 350 sorter := intervalKeySorter{keys: added, cmp: compare} 351 sort.Sort(sorter) 352 353 oldToNewMap := make([]int, len(old)) 354 i := 0 355 j := 0 356 357 for i < len(old) || j < len(added) { 358 for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 { 359 added[j].setFileIntervalIndex(len(result) - 1) 360 j++ 361 } 362 if i >= len(old) && j >= len(added) { 363 break 364 } 365 var cmp int 366 if i >= len(old) { 367 cmp = +1 368 } 369 if j >= len(added) { 370 cmp = -1 371 } 372 if cmp == 0 { 373 cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey) 374 } 375 switch { 376 case cmp <= 0: 377 // Shallow-copy the existing interval. 378 newInterval := old[i] 379 result = append(result, newInterval) 380 oldToNewMap[i] = len(result) - 1 381 i++ 382 if cmp == 0 { 383 added[j].setFileIntervalIndex(len(result) - 1) 384 j++ 385 } 386 case cmp > 0: 387 var prevInterval fileInterval 388 // Insert a new interval for a newly-added file. prevInterval, if 389 // non-zero, will be "inherited"; we copy its files as those extend 390 // into this interval. 391 if len(result) > 0 { 392 prevInterval = result[len(result)-1] 393 } 394 newInterval := fileInterval{ 395 index: len(result), 396 startKey: added[j].intervalKey, 397 filesMinIntervalIndex: len(result), 398 filesMaxIntervalIndex: len(result), 399 400 // estimatedBytes gets recalculated later on, as the number of intervals 401 // the file bytes are interpolated over has changed. 402 estimatedBytes: 0, 403 // Copy the below attributes from prevInterval. 404 files: append([]*FileMetadata(nil), prevInterval.files...), 405 isBaseCompacting: prevInterval.isBaseCompacting, 406 intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting, 407 compactingFileCount: prevInterval.compactingFileCount, 408 } 409 result = append(result, newInterval) 410 added[j].setFileIntervalIndex(len(result) - 1) 411 j++ 412 } 413 } 414 return result, oldToNewMap 415 } 416 417 // AddL0Files incrementally builds a new L0Sublevels for when the only change 418 // since the receiver L0Sublevels was an addition of the specified files, with 419 // no L0 deletions. The common case of this is an ingestion or a flush. These 420 // files can "sit on top" of existing sublevels, creating at most one new 421 // sublevel for a flush (and possibly multiple for an ingestion), and at most 422 // 2*len(files) additions to s.orderedIntervals. No files must have been deleted 423 // from L0, and the added files must all be newer in sequence numbers than 424 // existing files in L0Sublevels. The files parameter must be sorted in seqnum 425 // order. The levelMetadata parameter corresponds to the new L0 post addition of 426 // files. This method is meant to be significantly more performant than 427 // NewL0Sublevels. 428 // 429 // Note that this function can only be called once on a given receiver; it 430 // appends to some slices in s which is only safe when done once. This is okay, 431 // as the common case (generating a new L0Sublevels after a flush/ingestion) is 432 // only going to necessitate one call of this method on a given receiver. The 433 // returned value, if non-nil, can then have [*L0Sublevels.AddL0Files] called on 434 // it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error, 435 // it likely means the optimization could not be applied (i.e. files added were 436 // older than files already in the sublevels, which is possible around 437 // ingestions and in tests). Eg. it can happen when an ingested file was 438 // ingested without queueing a flush since it did not actually overlap with any 439 // keys in the memtable. Later on the memtable was flushed, and the memtable had 440 // keys spanning around the ingested file, producing a flushed file that 441 // overlapped with the ingested file in file bounds but not in keys. It's 442 // possible for that flushed file to have a lower LargestSeqNum than the 443 // ingested file if all the additions after the ingestion were to another 444 // flushed file that was split into a separate sstable during flush. Any other 445 // non-nil error means [L0Sublevels] generation failed in the same way as 446 // [NewL0Sublevels] would likely fail. 447 func (s *L0Sublevels) AddL0Files( 448 files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata, 449 ) (*L0Sublevels, error) { 450 if invariants.Enabled && s.addL0FilesCalled { 451 panic("AddL0Files called twice on the same receiver") 452 } 453 s.addL0FilesCalled = true 454 455 // Start with a shallow copy of s. 456 newVal := &L0Sublevels{} 457 *newVal = *s 458 459 newVal.addL0FilesCalled = false 460 newVal.levelMetadata = levelMetadata 461 // Deep copy levelFiles and Levels, as they are mutated and sorted below. 462 // Shallow copies of slices that we just append to, are okay. 463 newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles)) 464 for i := range s.levelFiles { 465 newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i])) 466 copy(newVal.levelFiles[i], s.levelFiles[i]) 467 } 468 newVal.Levels = make([]LevelSlice, len(s.Levels)) 469 copy(newVal.Levels, s.Levels) 470 471 fileKeys := make([]intervalKeyTemp, 0, 2*len(files)) 472 for _, f := range files { 473 left := intervalKeyTemp{ 474 intervalKey: intervalKey{key: f.Smallest.UserKey}, 475 fileMeta: f, 476 } 477 right := intervalKeyTemp{ 478 intervalKey: intervalKey{ 479 key: f.Largest.UserKey, 480 isLargest: !f.Largest.IsExclusiveSentinel(), 481 }, 482 fileMeta: f, 483 isEndKey: true, 484 } 485 fileKeys = append(fileKeys, left, right) 486 } 487 keys := make([]fileInterval, 0, 2*levelMetadata.Len()) 488 var oldToNewMap []int 489 // We can avoid the sortAndSweep step on the combined length of 490 // s.orderedIntervals and fileKeys by treating this as a merge of two sorted 491 // runs, fileKeys and s.orderedIntervals, into `keys` which will form 492 // newVal.orderedIntervals. 493 keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp) 494 if invariants.Enabled { 495 for i := 1; i < len(keys); i++ { 496 if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 { 497 panic("keys not sorted correctly") 498 } 499 } 500 } 501 newVal.orderedIntervals = keys 502 // Update indices in s.orderedIntervals for fileIntervals we retained. 503 for _, newIdx := range oldToNewMap { 504 newInterval := &keys[newIdx] 505 newInterval.index = newIdx 506 // This code, and related code in the for loop below, adjusts 507 // files{Min,Max}IntervalIndex just for interval indices shifting due to 508 // new intervals, and not for any of the new files being added to the 509 // same intervals. The goal is to produce a state of the system that's 510 // accurate for all existing files, and has all the new intervals to 511 // support new files. Once that's done, we can just call 512 // addFileToSublevel to adjust all relevant intervals for new files. 513 newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex] 514 // maxIntervalIndexes are special. Since it's an inclusive end bound, we 515 // actually have to map it to the _next_ old interval's new previous 516 // interval. This logic is easier to understand if you see 517 // [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex, 518 // f.maxIntervalIndex+1). The other case to remember is when the 519 // interval is completely empty (i.e. len(newInterval.files) == 0); in 520 // that case we want to refer back to ourselves regardless of additions 521 // to the right of us. 522 if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 { 523 newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1 524 } else { 525 // newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1. 526 newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex] 527 } 528 } 529 // Loop through all instances of new intervals added between two old 530 // intervals and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of 531 // new intervals to reflect that of adjacent old intervals. 532 { 533 // We can skip cases where new intervals were added to the left of all 534 // existing intervals (eg. if the first entry in oldToNewMap is 535 // oldToNewMap[0] >= 1). Those intervals will only contain newly added 536 // files and will have their parameters adjusted down in 537 // addFileToSublevels. The same can also be said about new intervals 538 // that are to the right of all existing intervals. 539 lastIdx := 0 540 for _, newIdx := range oldToNewMap { 541 for i := lastIdx + 1; i < newIdx; i++ { 542 minIntervalIndex := i 543 maxIntervalIndex := i 544 if keys[lastIdx].filesMaxIntervalIndex != lastIdx { 545 // Last old interval has files extending into keys[i]. 546 minIntervalIndex = keys[lastIdx].filesMinIntervalIndex 547 maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex 548 } 549 550 keys[i].filesMinIntervalIndex = minIntervalIndex 551 keys[i].filesMaxIntervalIndex = maxIntervalIndex 552 } 553 lastIdx = newIdx 554 } 555 } 556 // Go through old files and update interval indices. 557 // 558 // TODO(bilal): This is the only place in this method where we loop through 559 // all existing files, which could be much more in number than newly added 560 // files. See if we can avoid the need for this, either by getting rid of 561 // f.minIntervalIndex and f.maxIntervalIndex and calculating them on the fly 562 // with a binary search, or by only looping through files to the right of 563 // the first interval touched by this method. 564 for sublevel := range s.Levels { 565 s.Levels[sublevel].Each(func(f *FileMetadata) { 566 oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 567 oldMinIntervalIndex := f.minIntervalIndex 568 f.minIntervalIndex = oldToNewMap[f.minIntervalIndex] 569 // maxIntervalIndex is special. Since it's an inclusive end bound, 570 // we actually have to map it to the _next_ old interval's new 571 // previous interval. This logic is easier to understand if you see 572 // [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex, 573 // f.maxIntervalIndex+1). 574 f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1 575 newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 576 // Recalculate estimatedBytes for all old files across new 577 // intervals, but only if new intervals were added in between. 578 if oldIntervalDelta != newIntervalDelta { 579 // j is incremented so that oldToNewMap[j] points to the next 580 // old interval. This is used to distinguish between old 581 // intervals (i.e. ones where we need to subtract 582 // f.Size/oldIntervalDelta) from new ones (where we don't need 583 // to subtract). In both cases we need to add 584 // f.Size/newIntervalDelta. 585 j := oldMinIntervalIndex 586 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 587 if oldToNewMap[j] == i { 588 newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta) 589 j++ 590 } 591 newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta) 592 } 593 } 594 }) 595 } 596 updatedSublevels := make([]int, 0) 597 // Update interval indices for new files. 598 for i, f := range files { 599 f.L0Index = s.levelMetadata.Len() + i 600 if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil { 601 return nil, err 602 } 603 updatedSublevels = append(updatedSublevels, f.SubLevel) 604 } 605 606 // Sort and deduplicate updatedSublevels. 607 sort.Ints(updatedSublevels) 608 { 609 j := 0 610 for i := 1; i < len(updatedSublevels); i++ { 611 if updatedSublevels[i] != updatedSublevels[j] { 612 j++ 613 updatedSublevels[j] = updatedSublevels[i] 614 } 615 } 616 updatedSublevels = updatedSublevels[:j+1] 617 } 618 619 // Sort each updated sublevel in increasing key order. 620 for _, sublevel := range updatedSublevels { 621 sort.Sort(sublevelSorter(newVal.levelFiles[sublevel])) 622 } 623 624 // Construct a parallel slice of sublevel B-Trees. 625 // TODO(jackson): Consolidate and only use the B-Trees. 626 for _, sublevel := range updatedSublevels { 627 tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) 628 if sublevel == len(newVal.Levels) { 629 newVal.Levels = append(newVal.Levels, ls) 630 } else { 631 // sublevel < len(s.Levels). If this panics, updatedSublevels was not 632 // populated correctly. 633 newVal.Levels[sublevel] = ls 634 } 635 tr.Release() 636 } 637 638 newVal.flushSplitUserKeys = nil 639 newVal.calculateFlushSplitKeys(flushSplitMaxBytes) 640 return newVal, nil 641 } 642 643 // addFileToSublevels is called during L0Sublevels generation, and adds f to the 644 // correct sublevel's levelFiles, the relevant intervals' files slices, and sets 645 // interval indices on f. This method, if called successively on multiple files, 646 // _must_ be called on successively newer files (by seqnum). If checkInvariant 647 // is true, it could check for this in some cases and return 648 // [errInvalidL0SublevelsOpt] if that invariant isn't held. 649 func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error { 650 // This is a simple and not very accurate estimate of the number of 651 // bytes this SSTable contributes to the intervals it is a part of. 652 // 653 // TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval 654 // bounds to get a better estimate for each interval. 655 interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1) 656 s.fileBytes += f.Size 657 subLevel := 0 658 // Update state in every fileInterval for this file. 659 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 660 interval := &s.orderedIntervals[i] 661 if len(interval.files) > 0 { 662 if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum { 663 // We are sliding this file "underneath" an existing file. Throw away 664 // and start over in NewL0Sublevels. 665 return errInvalidL0SublevelsOpt 666 } 667 // interval.files is sorted by sublevels, from lowest to highest. 668 // AddL0Files can only add files at sublevels higher than existing files 669 // in the same key intervals. 670 if maxSublevel := interval.files[len(interval.files)-1].SubLevel; subLevel <= maxSublevel { 671 subLevel = maxSublevel + 1 672 } 673 } 674 interval.estimatedBytes += interpolatedBytes 675 if f.minIntervalIndex < interval.filesMinIntervalIndex { 676 interval.filesMinIntervalIndex = f.minIntervalIndex 677 } 678 if f.maxIntervalIndex > interval.filesMaxIntervalIndex { 679 interval.filesMaxIntervalIndex = f.maxIntervalIndex 680 } 681 interval.files = append(interval.files, f) 682 } 683 f.SubLevel = subLevel 684 if subLevel > len(s.levelFiles) { 685 return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles)) 686 } 687 if subLevel == len(s.levelFiles) { 688 s.levelFiles = append(s.levelFiles, []*FileMetadata{f}) 689 } else { 690 s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f) 691 } 692 return nil 693 } 694 695 func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { 696 var cumulativeBytes uint64 697 // Multiply flushSplitMaxBytes by the number of sublevels. This prevents 698 // excessive flush splitting when the number of sublevels increases. 699 flushSplitMaxBytes *= int64(len(s.levelFiles)) 700 for i := 0; i < len(s.orderedIntervals); i++ { 701 interval := &s.orderedIntervals[i] 702 if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) && 703 (len(s.flushSplitUserKeys) == 0 || 704 !bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) { 705 s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key) 706 cumulativeBytes = 0 707 } 708 cumulativeBytes += s.orderedIntervals[i].estimatedBytes 709 } 710 } 711 712 // InitCompactingFileInfo initializes internal flags relating to compacting 713 // files. Must be called after sublevel initialization. 714 // 715 // Requires DB.mu *and* the manifest lock to be held. 716 func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { 717 for i := range s.orderedIntervals { 718 s.orderedIntervals[i].compactingFileCount = 0 719 s.orderedIntervals[i].isBaseCompacting = false 720 s.orderedIntervals[i].intervalRangeIsBaseCompacting = false 721 } 722 723 iter := s.levelMetadata.Iter() 724 for f := iter.First(); f != nil; f = iter.Next() { 725 if invariants.Enabled { 726 if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) { 727 panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", 728 s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey))) 729 } 730 if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) { 731 panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", 732 s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey))) 733 } 734 } 735 if !f.IsCompacting() { 736 continue 737 } 738 if invariants.Enabled { 739 if s.cmp(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) != 0 || s.cmp(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) != 0 { 740 panic(fmt.Sprintf("file %s has inconsistent L0 Sublevel interval bounds: %s-%s, %s-%s", f.FileNum, 741 s.orderedIntervals[f.minIntervalIndex].startKey.key, s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, 742 f.Smallest.UserKey, f.Largest.UserKey)) 743 } 744 } 745 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 746 interval := &s.orderedIntervals[i] 747 interval.compactingFileCount++ 748 if !f.IsIntraL0Compacting { 749 // If f.Compacting && !f.IsIntraL0Compacting, this file is 750 // being compacted to Lbase. 751 interval.isBaseCompacting = true 752 } 753 } 754 } 755 756 // Some intervals may be base compacting without the files contained within 757 // those intervals being marked as compacting. This is possible if the files 758 // were added after the compaction initiated, and the active compaction 759 // files straddle the input file. Mark these intervals as base compacting. 760 for _, c := range inProgress { 761 startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false} 762 endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()} 763 start := sort.Search(len(s.orderedIntervals), func(i int) bool { 764 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0 765 }) 766 end := sort.Search(len(s.orderedIntervals), func(i int) bool { 767 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) >= 0 768 }) 769 for i := start; i < end && i < len(s.orderedIntervals); i++ { 770 interval := &s.orderedIntervals[i] 771 if !c.IsIntraL0 { 772 interval.isBaseCompacting = true 773 } 774 } 775 } 776 777 min := 0 778 for i := range s.orderedIntervals { 779 interval := &s.orderedIntervals[i] 780 if interval.isBaseCompacting { 781 minIndex := interval.filesMinIntervalIndex 782 if minIndex < min { 783 minIndex = min 784 } 785 for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ { 786 min = j 787 s.orderedIntervals[j].intervalRangeIsBaseCompacting = true 788 } 789 } 790 } 791 } 792 793 // String produces a string containing useful debug information. Useful in test 794 // code and debugging. 795 func (s *L0Sublevels) String() string { 796 return s.describe(false) 797 } 798 799 func (s *L0Sublevels) describe(verbose bool) string { 800 var buf strings.Builder 801 fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [", 802 s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys)) 803 for i := range s.flushSplitUserKeys { 804 fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i])) 805 if i < len(s.flushSplitUserKeys)-1 { 806 fmt.Fprintf(&buf, ", ") 807 } 808 } 809 fmt.Fprintln(&buf, "]") 810 numCompactingFiles := 0 811 for i := len(s.levelFiles) - 1; i >= 0; i-- { 812 maxIntervals := 0 813 sumIntervals := 0 814 var totalBytes uint64 815 for _, f := range s.levelFiles[i] { 816 intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 817 if intervals > maxIntervals { 818 maxIntervals = intervals 819 } 820 sumIntervals += intervals 821 totalBytes += f.Size 822 if f.IsCompacting() { 823 numCompactingFiles++ 824 } 825 } 826 fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n", 827 i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex, 828 s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex) 829 for _, f := range s.levelFiles[i] { 830 intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 831 if verbose { 832 fmt.Fprintf(&buf, "\t%s\n", f) 833 } 834 if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) { 835 var intervalsBytes uint64 836 for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ { 837 intervalsBytes += s.orderedIntervals[k].estimatedBytes 838 } 839 fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n", 840 f.FileNum, f.minIntervalIndex, f.maxIntervalIndex, 841 float64(intervalsBytes)/float64(s.fileBytes)) 842 } 843 } 844 } 845 846 lastCompactingIntervalStart := -1 847 fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles) 848 i := 0 849 foundBaseCompactingIntervals := false 850 for ; i < len(s.orderedIntervals); i++ { 851 interval := &s.orderedIntervals[i] 852 if len(interval.files) == 0 { 853 continue 854 } 855 if !interval.isBaseCompacting { 856 if lastCompactingIntervalStart != -1 { 857 if foundBaseCompactingIntervals { 858 buf.WriteString(", ") 859 } 860 fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) 861 foundBaseCompactingIntervals = true 862 } 863 lastCompactingIntervalStart = -1 864 } else { 865 if lastCompactingIntervalStart == -1 { 866 lastCompactingIntervalStart = i 867 } 868 } 869 } 870 if lastCompactingIntervalStart != -1 { 871 if foundBaseCompactingIntervals { 872 buf.WriteString(", ") 873 } 874 fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) 875 } else if !foundBaseCompactingIntervals { 876 fmt.Fprintf(&buf, "none") 877 } 878 fmt.Fprintln(&buf, "") 879 return buf.String() 880 } 881 882 // ReadAmplification returns the contribution of L0Sublevels to the read 883 // amplification for any particular point key. It is the maximum height of any 884 // tracked fileInterval. This is always less than or equal to the number of 885 // sublevels. 886 func (s *L0Sublevels) ReadAmplification() int { 887 amp := 0 888 for i := range s.orderedIntervals { 889 interval := &s.orderedIntervals[i] 890 fileCount := len(interval.files) 891 if amp < fileCount { 892 amp = fileCount 893 } 894 } 895 return amp 896 } 897 898 // UserKeyRange encodes a key range in user key space. A UserKeyRange's Start 899 // and End boundaries are both inclusive. 900 type UserKeyRange struct { 901 Start, End []byte 902 } 903 904 // InUseKeyRanges returns the merged table bounds of L0 files overlapping the 905 // provided user key range. The returned key ranges are sorted and 906 // nonoverlapping. 907 func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { 908 // Binary search to find the provided keys within the intervals. 909 startIK := intervalKey{key: smallest, isLargest: false} 910 endIK := intervalKey{key: largest, isLargest: true} 911 start := sort.Search(len(s.orderedIntervals), func(i int) bool { 912 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0 913 }) 914 if start > 0 { 915 // Back up to the first interval with a start key <= startIK. 916 start-- 917 } 918 end := sort.Search(len(s.orderedIntervals), func(i int) bool { 919 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0 920 }) 921 922 var keyRanges []UserKeyRange 923 var curr *UserKeyRange 924 for i := start; i < end; { 925 // Intervals with no files are not in use and can be skipped, once we 926 // end the current UserKeyRange. 927 if len(s.orderedIntervals[i].files) == 0 { 928 curr = nil 929 i++ 930 continue 931 } 932 933 // If curr is nil, start a new in-use key range. 934 if curr == nil { 935 keyRanges = append(keyRanges, UserKeyRange{ 936 Start: s.orderedIntervals[i].startKey.key, 937 }) 938 curr = &keyRanges[len(keyRanges)-1] 939 } 940 941 // If the filesMaxIntervalIndex is not the current index, we can jump to 942 // the max index, knowing that all intermediary intervals are overlapped 943 // by some file. 944 if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i { 945 // Note that end may be less than or equal to maxIdx if we're 946 // concerned with a key range that ends before the interval at 947 // maxIdx starts. We must set curr.End now, before making that leap, 948 // because this iteration may be the last. 949 i = maxIdx 950 curr.End = s.orderedIntervals[i+1].startKey.key 951 continue 952 } 953 954 // No files overlapping with this interval overlap with the next 955 // interval. Update the current end to be the next interval's start key. 956 // Note that curr is not necessarily finished, because there may be an 957 // abutting non-empty interval. 958 curr.End = s.orderedIntervals[i+1].startKey.key 959 i++ 960 } 961 return keyRanges 962 } 963 964 // FlushSplitKeys returns a slice of user keys to split flushes at. Used by 965 // flushes to avoid writing sstables that straddle these split keys. These 966 // should be interpreted as the keys to start the next sstable (not the last key 967 // to include in the prev sstable). These are user keys so that range tombstones 968 // can be properly truncated (untruncated range tombstones are not permitted for 969 // L0 files). 970 func (s *L0Sublevels) FlushSplitKeys() [][]byte { 971 return s.flushSplitUserKeys 972 } 973 974 // MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of 975 // sublevels after all ongoing compactions run to completion. Used by compaction 976 // picker to decide compaction score for L0. There is no scoring for intra-L0 977 // compactions -- they only run if L0 score is high but we're unable to pick an 978 // L0 -> Lbase compaction. 979 func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int { 980 depth := 0 981 for i := range s.orderedIntervals { 982 interval := &s.orderedIntervals[i] 983 intervalDepth := len(interval.files) - interval.compactingFileCount 984 if depth < intervalDepth { 985 depth = intervalDepth 986 } 987 } 988 return depth 989 } 990 991 // Only for temporary debugging in the absence of proper tests. 992 // 993 // TODO(bilal): Simplify away the debugging statements in this method, and make 994 // this a pure sanity checker. 995 // 996 //lint:ignore U1000 - useful for debugging 997 func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { 998 includedFiles := newBitSet(s.levelMetadata.Len()) 999 fileIntervalsByLevel := make([]struct { 1000 min int 1001 max int 1002 }, len(s.levelFiles)) 1003 for i := range fileIntervalsByLevel { 1004 fileIntervalsByLevel[i].min = math.MaxInt32 1005 fileIntervalsByLevel[i].max = 0 1006 } 1007 var topLevel int 1008 var increment int 1009 var limitReached func(int) bool 1010 if c.isIntraL0 { 1011 topLevel = len(s.levelFiles) - 1 1012 increment = +1 1013 limitReached = func(level int) bool { 1014 return level == len(s.levelFiles) 1015 } 1016 } else { 1017 topLevel = 0 1018 increment = -1 1019 limitReached = func(level int) bool { 1020 return level < 0 1021 } 1022 } 1023 for _, f := range c.Files { 1024 if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex { 1025 fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex 1026 } 1027 if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex { 1028 fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex 1029 } 1030 includedFiles.markBit(f.L0Index) 1031 if c.isIntraL0 { 1032 if topLevel > f.SubLevel { 1033 topLevel = f.SubLevel 1034 } 1035 } else { 1036 if topLevel < f.SubLevel { 1037 topLevel = f.SubLevel 1038 } 1039 } 1040 } 1041 min := fileIntervalsByLevel[topLevel].min 1042 max := fileIntervalsByLevel[topLevel].max 1043 for level := topLevel; !limitReached(level); level += increment { 1044 if fileIntervalsByLevel[level].min < min { 1045 min = fileIntervalsByLevel[level].min 1046 } 1047 if fileIntervalsByLevel[level].max > max { 1048 max = fileIntervalsByLevel[level].max 1049 } 1050 index := sort.Search(len(s.levelFiles[level]), func(i int) bool { 1051 return s.levelFiles[level][i].maxIntervalIndex >= min 1052 }) 1053 // start := index 1054 for ; index < len(s.levelFiles[level]); index++ { 1055 f := s.levelFiles[level][index] 1056 if f.minIntervalIndex > max { 1057 break 1058 } 1059 if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum { 1060 return errors.Errorf( 1061 "sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d", 1062 f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, 1063 f.LargestSeqNum) 1064 } 1065 if !includedFiles[f.L0Index] { 1066 var buf strings.Builder 1067 fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n", 1068 c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval, 1069 f.minIntervalIndex, f.maxIntervalIndex, 1070 f.FileNum, f.IsCompacting(), s) 1071 fmt.Fprintf(&buf, "files included:\n") 1072 for _, f := range c.Files { 1073 fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", 1074 f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) 1075 } 1076 fmt.Fprintf(&buf, "files added:\n") 1077 for _, f := range c.filesAdded { 1078 fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", 1079 f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) 1080 } 1081 return errors.New(buf.String()) 1082 } 1083 } 1084 } 1085 return nil 1086 } 1087 1088 // UpdateStateForStartedCompaction updates internal L0Sublevels state for a 1089 // recently started compaction. isBase specifies if this is a base compaction; 1090 // if false, this is assumed to be an intra-L0 compaction. The specified 1091 // compaction must be involving L0 SSTables. It's assumed that the Compacting 1092 // and IsIntraL0Compacting fields are already set on all [FileMetadata]s passed 1093 // in. 1094 func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error { 1095 minIntervalIndex := -1 1096 maxIntervalIndex := 0 1097 for i := range inputs { 1098 iter := inputs[i].Iter() 1099 for f := iter.First(); f != nil; f = iter.Next() { 1100 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 1101 interval := &s.orderedIntervals[i] 1102 interval.compactingFileCount++ 1103 } 1104 if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 { 1105 minIntervalIndex = f.minIntervalIndex 1106 } 1107 if f.maxIntervalIndex > maxIntervalIndex { 1108 maxIntervalIndex = f.maxIntervalIndex 1109 } 1110 } 1111 } 1112 if isBase { 1113 for i := minIntervalIndex; i <= maxIntervalIndex; i++ { 1114 interval := &s.orderedIntervals[i] 1115 interval.isBaseCompacting = isBase 1116 for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ { 1117 s.orderedIntervals[j].intervalRangeIsBaseCompacting = true 1118 } 1119 } 1120 } 1121 return nil 1122 } 1123 1124 // L0CompactionFiles represents a candidate set of L0 files for compaction. Also 1125 // referred to as "lcf". Contains state information useful for generating the 1126 // compaction (such as Files), as well as for picking between candidate 1127 // compactions (eg. fileBytes and seedIntervalStackDepthReduction). 1128 type L0CompactionFiles struct { 1129 Files []*FileMetadata 1130 1131 FilesIncluded bitSet 1132 // A "seed interval" is an interval with a high stack depth that was chosen 1133 // to bootstrap this compaction candidate. seedIntervalStackDepthReduction 1134 // is the number of sublevels that have a file in the seed interval that is 1135 // a part of this compaction. 1136 seedIntervalStackDepthReduction int 1137 // For base compactions, seedIntervalMinLevel is 0, and for intra-L0 1138 // compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest 1139 // sublevel. 1140 seedIntervalMinLevel int 1141 seedIntervalMaxLevel int 1142 // Index of the seed interval. 1143 seedInterval int 1144 // Sum of file sizes for all files in this compaction. 1145 fileBytes uint64 1146 // Intervals with index [minIntervalIndex, maxIntervalIndex] are 1147 // participating in this compaction; it's the union set of all intervals 1148 // overlapped by participating files. 1149 minIntervalIndex int 1150 maxIntervalIndex int 1151 1152 // Set for intra-L0 compactions. SSTables with sequence numbers greater 1153 // than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions. 1154 isIntraL0 bool 1155 earliestUnflushedSeqNum uint64 1156 1157 // For debugging purposes only. Used in checkCompaction(). 1158 preExtensionMinInterval int 1159 preExtensionMaxInterval int 1160 filesAdded []*FileMetadata 1161 } 1162 1163 // Clone allocates a new L0CompactionFiles, with the same underlying data. Note 1164 // that the two fileMetadata slices contain values that point to the same 1165 // underlying fileMetadata object. This is safe because these objects are read 1166 // only. 1167 func (l *L0CompactionFiles) Clone() *L0CompactionFiles { 1168 oldLcf := *l 1169 return &oldLcf 1170 } 1171 1172 // String merely prints the starting address of the first file, if it exists. 1173 func (l *L0CompactionFiles) String() string { 1174 if len(l.Files) > 0 { 1175 return fmt.Sprintf("First File Address: %p", &l.Files[0]) 1176 } 1177 return "" 1178 } 1179 1180 // addFile adds the specified file to the LCF. 1181 func (l *L0CompactionFiles) addFile(f *FileMetadata) { 1182 if l.FilesIncluded[f.L0Index] { 1183 return 1184 } 1185 l.FilesIncluded.markBit(f.L0Index) 1186 l.Files = append(l.Files, f) 1187 l.filesAdded = append(l.filesAdded, f) 1188 l.fileBytes += f.Size 1189 if f.minIntervalIndex < l.minIntervalIndex { 1190 l.minIntervalIndex = f.minIntervalIndex 1191 } 1192 if f.maxIntervalIndex > l.maxIntervalIndex { 1193 l.maxIntervalIndex = f.maxIntervalIndex 1194 } 1195 } 1196 1197 // Helper to order intervals being considered for compaction. 1198 type intervalAndScore struct { 1199 interval int 1200 score int 1201 } 1202 type intervalSorterByDecreasingScore []intervalAndScore 1203 1204 func (is intervalSorterByDecreasingScore) Len() int { return len(is) } 1205 func (is intervalSorterByDecreasingScore) Less(i, j int) bool { 1206 return is[i].score > is[j].score 1207 } 1208 func (is intervalSorterByDecreasingScore) Swap(i, j int) { 1209 is[i], is[j] = is[j], is[i] 1210 } 1211 1212 // Compactions: 1213 // 1214 // The sub-levels and intervals can be visualized in 2 dimensions as the X axis 1215 // containing intervals in increasing order and the Y axis containing sub-levels 1216 // (older to younger). The intervals can be sparse wrt sub-levels. We observe 1217 // that the system is typically under severe pressure in L0 during large numbers 1218 // of ingestions where most files added to L0 are narrow and non-overlapping. 1219 // 1220 // L0.1 d---g 1221 // L0.0 c--e g--j o--s u--x 1222 // 1223 // As opposed to a case with a lot of wide, overlapping L0 files: 1224 // 1225 // L0.3 d-----------r 1226 // L0.2 c--------o 1227 // L0.1 b-----------q 1228 // L0.0 a----------------x 1229 // 1230 // In that case we expect the rectangle represented in the good visualization 1231 // above (i.e. the first one) to be wide and short, and not too sparse (most 1232 // intervals will have fileCount close to the sub-level count), which would make 1233 // it amenable to concurrent L0 -> Lbase compactions. 1234 // 1235 // L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce 1236 // stack depth, by compacting files in the intervals with the highest (fileCount 1237 // - compactingCount). Additionally, we would like compactions to not involve a 1238 // huge number of files, so that they finish quickly, and to allow for 1239 // concurrent L0 -> Lbase compactions when needed. In order to achieve these 1240 // goals we would like compactions to visualize as capturing thin and tall 1241 // rectangles. The approach below is to consider intervals in some order and 1242 // then try to construct a compaction using the interval. The first interval we 1243 // can construct a compaction for is the compaction that is started. There can 1244 // be multiple heuristics in choosing the ordering of the intervals -- the code 1245 // uses one heuristic that worked well for a large ingestion stemming from a 1246 // cockroachdb import, but additional experimentation is necessary to pick a 1247 // general heuristic. Additionally, the compaction that gets picked may be not 1248 // as desirable as one that could be constructed later in terms of reducing 1249 // stack depth (since adding more files to the compaction can get blocked by 1250 // needing to encompass files that are already being compacted). So an 1251 // alternative would be to try to construct more than one compaction and pick 1252 // the best one. 1253 // 1254 // Here's a visualization of an ideal L0->LBase compaction selection: 1255 // 1256 // L0.3 a--d g-j 1257 // L0.2 f--j r-t 1258 // L0.1 b-d e---j 1259 // L0.0 a--d f--j l--o p-----x 1260 // 1261 // Lbase a--------i m---------w 1262 // 1263 // The [g,j] interval has the highest stack depth, so it would have the highest 1264 // priority for selecting a base compaction candidate. Assuming none of the 1265 // files are already compacting, this is the compaction that will be chosen: 1266 // 1267 // _______ 1268 // L0.3 a--d | g-j| 1269 // L0.2 | f--j| r-t 1270 // L0.1 b-d |e---j| 1271 // L0.0 a--d | f--j| l--o p-----x 1272 // 1273 // Lbase a--------i m---------w 1274 // 1275 // Note that running this compaction will mark the a--i file in Lbase as 1276 // compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds of 1277 // that base file, it'll expand the compaction to also include all L0 files in 1278 // the a-d interval. The resultant compaction would then be: 1279 // 1280 // _____________ 1281 // L0.3 |a--d g-j| 1282 // L0.2 | f--j| r-t 1283 // L0.1 | b-d e---j| 1284 // L0.0 |a--d f--j| l--o p-----x 1285 // 1286 // Lbase a--------i m---------w 1287 // 1288 // The next best interval for base compaction would therefore be the one 1289 // including r--t in L0.2 and p--x in L0.0, and both this compaction and the one 1290 // picked earlier can run in parallel. This is assuming minCompactionDepth >= 2, 1291 // otherwise the second compaction has too little depth to pick. 1292 // 1293 // _____________ 1294 // L0.3 |a--d g-j| _________ 1295 // L0.2 | f--j| | r-t | 1296 // L0.1 | b-d e---j| | | 1297 // L0.0 |a--d f--j| l--o |p-----x| 1298 // 1299 // Lbase a--------i m---------w 1300 // 1301 // Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands 1302 // to the following, given that the [l,o] file can be added without including 1303 // additional files in Lbase: 1304 // 1305 // _____________ 1306 // L0.3 |a--d g-j| _________ 1307 // L0.2 | f--j| | r-t | 1308 // L0.1 | b-d e---j|______| | 1309 // L0.0 |a--d f--j||l--o p-----x| 1310 // 1311 // Lbase a--------i m---------w 1312 // 1313 // If an additional file existed in LBase that overlapped with [l,o], it would 1314 // be excluded from the compaction. Concretely: 1315 // 1316 // _____________ 1317 // L0.3 |a--d g-j| _________ 1318 // L0.2 | f--j| | r-t | 1319 // L0.1 | b-d e---j| | | 1320 // L0.0 |a--d f--j| l--o |p-----x| 1321 // 1322 // Lbase a--------ij--lm---------w 1323 // 1324 // Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to pick 1325 // a compaction, PickIntraL0Compaction will be used to pick an intra-L0 1326 // compaction. Similar to L0 -> Lbase compactions, we want to allow for multiple 1327 // intra-L0 compactions and not generate wide output files that hinder later 1328 // concurrency of L0 -> Lbase compactions. Also compactions that produce wide 1329 // files don't reduce stack depth -- they represent wide rectangles in our 1330 // visualization, which means many intervals have their depth reduced by a small 1331 // amount. Typically, L0 files have non-overlapping sequence numbers, and 1332 // sticking to that invariant would require us to consider intra-L0 compactions 1333 // that proceed from youngest to oldest files, which could result in the 1334 // aforementioned undesirable wide rectangle shape. But this non-overlapping 1335 // sequence number is already relaxed in RocksDB -- sstables are primarily 1336 // ordered by their largest sequence number. So we can arrange for intra-L0 1337 // compactions to capture thin and tall rectangles starting with the top of the 1338 // stack (youngest files). Like the L0 -> Lbase case we order the intervals 1339 // using a heuristic and consider each in turn. The same comment about better L0 1340 // -> Lbase heuristics and not being greedy applies here. 1341 // 1342 // Going back to a modified version of our example from earlier, let's say these 1343 // are the base compactions in progress: 1344 // _______ 1345 // L0.3 a--d | g-j| _________ 1346 // L0.2 | f--j| | r-t | 1347 // L0.1 b-d |e---j| | | 1348 // L0.0 a--d | f--j| l--o |p-----x| 1349 // 1350 // Lbase a---------i m---------w 1351 // 1352 // Since both LBase files are compacting, the only L0 compaction that can be 1353 // picked is an intra-L0 compaction. For this, the b--d interval has the highest 1354 // stack depth (3), and starting with a--d in L0.3 as the seed file, we can 1355 // iterate downward and build this compaction, assuming all files in that 1356 // interval are not compacting and have a highest sequence number less than 1357 // earliestUnflushedSeqNum: 1358 // 1359 // _______ 1360 // L0.3 |a--d| | g-j| _________ 1361 // L0.2 | | | f--j| | r-t | 1362 // L0.1 | b-d| |e---j| | | 1363 // L0.0 |a--d| | f--j| l--o |p-----x| 1364 // ------ 1365 // Lbase a---------i m---------w 1366 // 1367 1368 // PickBaseCompaction picks a base compaction based on the above specified 1369 // heuristics, for the specified Lbase files and a minimum depth of overlapping 1370 // files that can be selected for compaction. Returns nil if no compaction is 1371 // possible. 1372 func (s *L0Sublevels) PickBaseCompaction( 1373 minCompactionDepth int, baseFiles LevelSlice, 1374 ) (*L0CompactionFiles, error) { 1375 // For LBase compactions, we consider intervals in a greedy manner in the 1376 // following order: 1377 // - Intervals that are unlikely to be blocked due 1378 // to ongoing L0 -> Lbase compactions. These are the ones with 1379 // !isBaseCompacting && !intervalRangeIsBaseCompacting. 1380 // - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting. 1381 // 1382 // The ordering heuristic exists just to avoid wasted work. Ideally, 1383 // we would consider all intervals with isBaseCompacting = false and 1384 // construct a compaction for it and compare the constructed compactions 1385 // and pick the best one. If microbenchmarks show that we can afford 1386 // this cost we can eliminate this heuristic. 1387 scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals)) 1388 sublevelCount := len(s.levelFiles) 1389 for i := range s.orderedIntervals { 1390 interval := &s.orderedIntervals[i] 1391 depth := len(interval.files) - interval.compactingFileCount 1392 if interval.isBaseCompacting || minCompactionDepth > depth { 1393 continue 1394 } 1395 if interval.intervalRangeIsBaseCompacting { 1396 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth}) 1397 } else { 1398 // Prioritize this interval by incrementing the score by the number 1399 // of sublevels. 1400 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount}) 1401 } 1402 } 1403 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) 1404 1405 // Optimization to avoid considering different intervals that 1406 // are likely to choose the same seed file. Again this is just 1407 // to reduce wasted work. 1408 consideredIntervals := newBitSet(len(s.orderedIntervals)) 1409 for _, scoredInterval := range scoredIntervals { 1410 interval := &s.orderedIntervals[scoredInterval.interval] 1411 if consideredIntervals[interval.index] { 1412 continue 1413 } 1414 1415 // Pick the seed file for the interval as the file 1416 // in the lowest sub-level. 1417 f := interval.files[0] 1418 // Don't bother considering the intervals that are covered by the seed 1419 // file since they are likely nearby. Note that it is possible that 1420 // those intervals have seed files at lower sub-levels so could be 1421 // viable for compaction. 1422 if f == nil { 1423 return nil, errors.New("no seed file found in sublevel intervals") 1424 } 1425 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) 1426 if f.IsCompacting() { 1427 if f.IsIntraL0Compacting { 1428 // If we're picking a base compaction and we came across a seed 1429 // file candidate that's being intra-L0 compacted, skip the 1430 // interval instead of erroring out. 1431 continue 1432 } 1433 // We chose a compaction seed file that should not be compacting. 1434 // Usually means the score is not accurately accounting for files 1435 // already compacting, or internal state is inconsistent. 1436 return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum) 1437 } 1438 1439 c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth) 1440 if c != nil { 1441 // Check if the chosen compaction overlaps with any files in Lbase 1442 // that have Compacting = true. If that's the case, this compaction 1443 // cannot be chosen. 1444 baseIter := baseFiles.Iter() 1445 // An interval starting at ImmediateSuccessor(key) can never be the 1446 // first interval of a compaction since no file can start at that 1447 // interval. 1448 m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key) 1449 1450 var baseCompacting bool 1451 for ; m != nil && !baseCompacting; m = baseIter.Next() { 1452 cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) 1453 // Compaction is ending at exclusive bound of c.maxIntervalIndex+1 1454 if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) { 1455 break 1456 } 1457 baseCompacting = baseCompacting || m.IsCompacting() 1458 } 1459 if baseCompacting { 1460 continue 1461 } 1462 return c, nil 1463 } 1464 } 1465 return nil, nil 1466 } 1467 1468 // Helper function for building an L0 -> Lbase compaction using a seed interval 1469 // and seed file in that seed interval. 1470 func (s *L0Sublevels) baseCompactionUsingSeed( 1471 f *FileMetadata, intervalIndex int, minCompactionDepth int, 1472 ) *L0CompactionFiles { 1473 c := &L0CompactionFiles{ 1474 FilesIncluded: newBitSet(s.levelMetadata.Len()), 1475 seedInterval: intervalIndex, 1476 seedIntervalMinLevel: 0, 1477 minIntervalIndex: f.minIntervalIndex, 1478 maxIntervalIndex: f.maxIntervalIndex, 1479 } 1480 c.addFile(f) 1481 1482 // The first iteration of this loop builds the compaction at the seed file's 1483 // sublevel. Future iterations expand on this compaction by stacking more 1484 // files from intervalIndex and repeating. This is an optional activity so 1485 // when it fails we can fallback to the last successful candidate. 1486 var lastCandidate *L0CompactionFiles 1487 interval := &s.orderedIntervals[intervalIndex] 1488 1489 for i := 0; i < len(interval.files); i++ { 1490 f2 := interval.files[i] 1491 sl := f2.SubLevel 1492 c.seedIntervalStackDepthReduction++ 1493 c.seedIntervalMaxLevel = sl 1494 c.addFile(f2) 1495 // The seed file is in the lowest sublevel in the seed interval, but it 1496 // may overlap with other files in even lower sublevels. For correctness 1497 // we need to grow our interval to include those files, and capture all 1498 // files in the next level that fall in this extended interval and so 1499 // on. This can result in a triangular shape like the following where 1500 // again the X axis is the key intervals and the Y axis is oldest to 1501 // youngest. Note that it is not necessary for correctness to fill out 1502 // the shape at the higher sub-levels to make it more rectangular since 1503 // the invariant only requires that younger versions of a key not be 1504 // moved to Lbase while leaving behind older versions. 1505 // - 1506 // --- 1507 // ----- 1508 // It may be better for performance to have a more rectangular shape 1509 // since the files being left behind will overlap with the same Lbase 1510 // key range as that of this compaction. But there is also the danger 1511 // that in trying to construct a more rectangular shape we will be 1512 // forced to pull in a file that is already compacting. We expect 1513 // extendCandidateToRectangle to eventually be called on this compaction 1514 // if it's chosen, at which point we would iterate backward and choose 1515 // those files. This logic is similar to compaction.grow for non-L0 1516 // compactions. 1517 done := false 1518 for currLevel := sl - 1; currLevel >= 0; currLevel-- { 1519 if !s.extendFiles(currLevel, math.MaxUint64, c) { 1520 // Failed to extend due to ongoing compaction. 1521 done = true 1522 break 1523 } 1524 } 1525 if done { 1526 break 1527 } 1528 // Observed some compactions using > 1GB from L0 in an import 1529 // experiment. Very long running compactions are not great as they 1530 // reduce concurrency while they run, and take a while to produce 1531 // results, though they're sometimes unavoidable. There is a tradeoff 1532 // here in that adding more depth is more efficient in reducing stack 1533 // depth, but long running compactions reduce flexibility in what can 1534 // run concurrently in L0 and even Lbase -> Lbase+1. An increase more 1535 // than 150% in bytes since the last candidate compaction (along with a 1536 // total compaction size in excess of 100mb), or a total compaction size 1537 // beyond a hard limit of 500mb, is criteria for rejecting this 1538 // candidate. This lets us prefer slow growths as we add files, while 1539 // still having a hard limit. Note that if this is the first compaction 1540 // candidate to reach a stack depth reduction of minCompactionDepth or 1541 // higher, this candidate will be chosen regardless. 1542 if lastCandidate == nil { 1543 lastCandidate = &L0CompactionFiles{} 1544 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && 1545 c.fileBytes > 100<<20 && 1546 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { 1547 break 1548 } 1549 *lastCandidate = *c 1550 } 1551 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { 1552 lastCandidate.FilesIncluded.clearAllBits() 1553 for _, f := range lastCandidate.Files { 1554 lastCandidate.FilesIncluded.markBit(f.L0Index) 1555 } 1556 return lastCandidate 1557 } 1558 return nil 1559 } 1560 1561 // Expands fields in the provided L0CompactionFiles instance (cFiles) to 1562 // include overlapping files in the specified sublevel. Returns true if the 1563 // compaction is possible (i.e. does not conflict with any base/intra-L0 1564 // compacting files). 1565 func (s *L0Sublevels) extendFiles( 1566 sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles, 1567 ) bool { 1568 index := sort.Search(len(s.levelFiles[sl]), func(i int) bool { 1569 return s.levelFiles[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex 1570 }) 1571 for ; index < len(s.levelFiles[sl]); index++ { 1572 f := s.levelFiles[sl][index] 1573 if f.minIntervalIndex > cFiles.maxIntervalIndex { 1574 break 1575 } 1576 if f.IsCompacting() { 1577 return false 1578 } 1579 // Skip over files that are newer than earliestUnflushedSeqNum. This is 1580 // okay because this compaction can just pretend these files are not in 1581 // L0 yet. These files must be in higher sublevels than any overlapping 1582 // files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output 1583 // of the compaction will also go in a lower (older) sublevel than this 1584 // file by definition. 1585 if f.LargestSeqNum >= earliestUnflushedSeqNum { 1586 continue 1587 } 1588 cFiles.addFile(f) 1589 } 1590 return true 1591 } 1592 1593 // PickIntraL0Compaction picks an intra-L0 compaction for files in this 1594 // sublevel. This method is only called when a base compaction cannot be chosen. 1595 // See comment above [PickBaseCompaction] for heuristics involved in this 1596 // selection. 1597 func (s *L0Sublevels) PickIntraL0Compaction( 1598 earliestUnflushedSeqNum uint64, minCompactionDepth int, 1599 ) (*L0CompactionFiles, error) { 1600 scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals)) 1601 for i := range s.orderedIntervals { 1602 interval := &s.orderedIntervals[i] 1603 depth := len(interval.files) - interval.compactingFileCount 1604 if minCompactionDepth > depth { 1605 continue 1606 } 1607 scoredIntervals[i] = intervalAndScore{interval: i, score: depth} 1608 } 1609 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) 1610 1611 // Optimization to avoid considering different intervals that are likely to 1612 // choose the same seed file. Again this is just to reduce wasted work. 1613 consideredIntervals := newBitSet(len(s.orderedIntervals)) 1614 for _, scoredInterval := range scoredIntervals { 1615 interval := &s.orderedIntervals[scoredInterval.interval] 1616 if consideredIntervals[interval.index] { 1617 continue 1618 } 1619 1620 var f *FileMetadata 1621 // Pick the seed file for the interval as the file in the highest 1622 // sub-level. 1623 stackDepthReduction := scoredInterval.score 1624 for i := len(interval.files) - 1; i >= 0; i-- { 1625 f = interval.files[i] 1626 if f.IsCompacting() { 1627 break 1628 } 1629 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) 1630 // Can this be the seed file? Files with newer sequence numbers than 1631 // earliestUnflushedSeqNum cannot be in the compaction. 1632 if f.LargestSeqNum >= earliestUnflushedSeqNum { 1633 stackDepthReduction-- 1634 if stackDepthReduction == 0 { 1635 break 1636 } 1637 } else { 1638 break 1639 } 1640 } 1641 if stackDepthReduction < minCompactionDepth { 1642 // Can't use this interval. 1643 continue 1644 } 1645 1646 if f == nil { 1647 return nil, errors.New("no seed file found in sublevel intervals") 1648 } 1649 if f.IsCompacting() { 1650 // This file could be in a concurrent intra-L0 or base compaction. 1651 // Try another interval. 1652 continue 1653 } 1654 1655 // We have a seed file. Build a compaction off of that seed. 1656 c := s.intraL0CompactionUsingSeed( 1657 f, interval.index, earliestUnflushedSeqNum, minCompactionDepth) 1658 if c != nil { 1659 return c, nil 1660 } 1661 } 1662 return nil, nil 1663 } 1664 1665 func (s *L0Sublevels) intraL0CompactionUsingSeed( 1666 f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int, 1667 ) *L0CompactionFiles { 1668 // We know that all the files that overlap with intervalIndex have 1669 // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals 1670 // we need to exclude files >= earliestUnflushedSeqNum 1671 1672 c := &L0CompactionFiles{ 1673 FilesIncluded: newBitSet(s.levelMetadata.Len()), 1674 seedInterval: intervalIndex, 1675 seedIntervalMaxLevel: len(s.levelFiles) - 1, 1676 minIntervalIndex: f.minIntervalIndex, 1677 maxIntervalIndex: f.maxIntervalIndex, 1678 isIntraL0: true, 1679 earliestUnflushedSeqNum: earliestUnflushedSeqNum, 1680 } 1681 c.addFile(f) 1682 1683 var lastCandidate *L0CompactionFiles 1684 interval := &s.orderedIntervals[intervalIndex] 1685 slIndex := len(interval.files) - 1 1686 for { 1687 if interval.files[slIndex] == f { 1688 break 1689 } 1690 slIndex-- 1691 } 1692 // The first iteration of this loop produces an intra-L0 compaction at the 1693 // seed level. Iterations after that optionally add to the compaction by 1694 // stacking more files from intervalIndex and repeating. This is an optional 1695 // activity so when it fails we can fallback to the last successful 1696 // candidate. The code stops adding when it can't add more, or when 1697 // fileBytes grows too large. 1698 for ; slIndex >= 0; slIndex-- { 1699 f2 := interval.files[slIndex] 1700 sl := f2.SubLevel 1701 if f2.IsCompacting() { 1702 break 1703 } 1704 c.seedIntervalStackDepthReduction++ 1705 c.seedIntervalMinLevel = sl 1706 c.addFile(f2) 1707 // The seed file captures all files in the higher level that fall in the 1708 // range of intervals. That may extend the range of intervals so for 1709 // correctness we need to capture all files in the next higher level 1710 // that fall in this extended interval and so on. This can result in an 1711 // inverted triangular shape like the following where again the X axis 1712 // is the key intervals and the Y axis is oldest to youngest. Note that 1713 // it is not necessary for correctness to fill out the shape at lower 1714 // sub-levels to make it more rectangular since the invariant only 1715 // requires that if we move an older seqnum for key k into a file that 1716 // has a higher seqnum, we also move all younger seqnums for that key k 1717 // into that file. 1718 // ----- 1719 // --- 1720 // - 1721 // It may be better for performance to have a more rectangular shape 1722 // since it will reduce the stack depth for more intervals. But there is 1723 // also the danger that in explicitly trying to construct a more 1724 // rectangular shape we will be forced to pull in a file that is already 1725 // compacting. We assume that the performance concern is not a practical 1726 // issue. 1727 done := false 1728 for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ { 1729 if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) { 1730 // Failed to extend due to ongoing compaction. 1731 done = true 1732 break 1733 } 1734 } 1735 if done { 1736 break 1737 } 1738 if lastCandidate == nil { 1739 lastCandidate = &L0CompactionFiles{} 1740 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && 1741 c.fileBytes > 100<<20 && 1742 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { 1743 break 1744 } 1745 *lastCandidate = *c 1746 } 1747 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { 1748 lastCandidate.FilesIncluded.clearAllBits() 1749 for _, f := range lastCandidate.Files { 1750 lastCandidate.FilesIncluded.markBit(f.L0Index) 1751 } 1752 s.extendCandidateToRectangle( 1753 lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false) 1754 return lastCandidate 1755 } 1756 return nil 1757 } 1758 1759 // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate 1760 // L0CompactionFiles to optionally cover more files in L0 without "touching" any 1761 // of the passed-in keys (i.e. the smallest/largest bounds are exclusive), as 1762 // including any user keys for those internal keys could require choosing more 1763 // files in LBase which is undesirable. Unbounded start/end keys are indicated 1764 // by passing in the InvalidInternalKey. 1765 func (s *L0Sublevels) ExtendL0ForBaseCompactionTo( 1766 smallest, largest InternalKey, candidate *L0CompactionFiles, 1767 ) bool { 1768 firstIntervalIndex := 0 1769 lastIntervalIndex := len(s.orderedIntervals) - 1 1770 if smallest.Kind() != base.InternalKeyKindInvalid { 1771 if smallest.Trailer == base.InternalKeyRangeDeleteSentinel { 1772 // Starting at smallest.UserKey == interval.startKey is okay. 1773 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1774 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 1775 }) 1776 } else { 1777 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1778 // Need to start at >= smallest since if we widen too much we may miss 1779 // an Lbase file that overlaps with an L0 file that will get picked in 1780 // this widening, which would be bad. This interval will not start with 1781 // an immediate successor key. 1782 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0 1783 }) 1784 } 1785 } 1786 if largest.Kind() != base.InternalKeyKindInvalid { 1787 // First interval that starts at or beyond the largest. This interval will not 1788 // start with an immediate successor key. 1789 lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1790 return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 1791 }) 1792 // Right now, lastIntervalIndex has a startKey that extends beyond largest. 1793 // The previous interval, by definition, has an end key higher than largest. 1794 // Iterate back twice to get the last interval that's completely within 1795 // (smallest, largest). Except in the case where we went past the end of the 1796 // list; in that case, the last interval to include is the very last 1797 // interval in the list. 1798 if lastIntervalIndex < len(s.orderedIntervals) { 1799 lastIntervalIndex-- 1800 } 1801 lastIntervalIndex-- 1802 } 1803 if lastIntervalIndex < firstIntervalIndex { 1804 return false 1805 } 1806 return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true) 1807 } 1808 1809 // Best-effort attempt to make the compaction include more files in the 1810 // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and 1811 // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel. 1812 // 1813 // This is strictly an optional extension; at any point where we can't feasibly 1814 // add more files, the sublevel iteration can be halted early and candidate will 1815 // still be a correct compaction candidate. 1816 // 1817 // Consider this scenario (original candidate is inside the rectangle), with 1818 // isBase = true and interval bounds a-j (from the union of base file bounds and 1819 // that of compaction candidate): 1820 // 1821 // _______ 1822 // L0.3 a--d | g-j| 1823 // L0.2 | f--j| r-t 1824 // L0.1 b-d |e---j| 1825 // L0.0 a--d | f--j| l--o p-----x 1826 // 1827 // Lbase a--------i m---------w 1828 // 1829 // This method will iterate from the bottom up. At L0.0, it will add a--d since 1830 // it's in the bounds, then add b-d, then a--d, and so on, to produce this: 1831 // 1832 // _____________ 1833 // L0.3 |a--d g-j| 1834 // L0.2 | f--j| r-t 1835 // L0.1 | b-d e---j| 1836 // L0.0 |a--d f--j| l--o p-----x 1837 // 1838 // Lbase a-------i m---------w 1839 // 1840 // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b, 1841 // bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel 1842 // L0.4 with some files, all of which aren't compacting: 1843 // 1844 // L0.4 a------c ca--d _______ 1845 // L0.3 a-b bb-c cc-d | g-j| 1846 // L0.2 | f--j| r-t 1847 // L0.1 b----------d |e---j| 1848 // L0.0 a------------d | f--j| l--o p-----x 1849 // 1850 // Lbase a------------------i m---------w 1851 // 1852 // This method then needs to choose between the left side of L0.3 bb-c (i.e. 1853 // a-b), or the right side (i.e. cc-d and g-j) for inclusion in this compaction. 1854 // Since the right side has more files as well as one file that has already been 1855 // picked, it gets chosen at that sublevel, resulting in this intermediate 1856 // compaction: 1857 // 1858 // L0.4 a------c ca--d 1859 // ______________ 1860 // L0.3 a-b bb-c| cc-d g-j| 1861 // L0.2 _________| f--j| r-t 1862 // L0.1 | b----------d e---j| 1863 // L0.0 |a------------d f--j| l--o p-----x 1864 // 1865 // Lbase a------------------i m---------w 1866 // 1867 // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are 1868 // actually ca-j, since ca is the next interval start key after the end interval 1869 // of bb-c. This would result in only ca-d being chosen at that sublevel, even 1870 // though a--c is also not compacting. This is the final result: 1871 // 1872 // ______________ 1873 // L0.4 a------c|ca--d | 1874 // L0.3 a-b bb-c| cc-d g-j| 1875 // L0.2 _________| f--j| r-t 1876 // L0.1 | b----------d e---j| 1877 // L0.0 |a------------d f--j| l--o p-----x 1878 // 1879 // Lbase a------------------i m---------w 1880 // 1881 // TODO(bilal): Add more targeted tests for this method, through 1882 // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed. 1883 func (s *L0Sublevels) extendCandidateToRectangle( 1884 minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool, 1885 ) bool { 1886 candidate.preExtensionMinInterval = candidate.minIntervalIndex 1887 candidate.preExtensionMaxInterval = candidate.maxIntervalIndex 1888 // Extend {min,max}IntervalIndex to include all of the candidate's current 1889 // bounds. 1890 if minIntervalIndex > candidate.minIntervalIndex { 1891 minIntervalIndex = candidate.minIntervalIndex 1892 } 1893 if maxIntervalIndex < candidate.maxIntervalIndex { 1894 maxIntervalIndex = candidate.maxIntervalIndex 1895 } 1896 var startLevel, increment, endLevel int 1897 if isBase { 1898 startLevel = 0 1899 increment = +1 1900 // seedIntervalMaxLevel is inclusive, while endLevel is exclusive. 1901 endLevel = candidate.seedIntervalMaxLevel + 1 1902 } else { 1903 startLevel = len(s.levelFiles) - 1 1904 increment = -1 1905 // seedIntervalMinLevel is inclusive, while endLevel is exclusive. 1906 endLevel = candidate.seedIntervalMinLevel - 1 1907 } 1908 // Stats for files. 1909 addedCount := 0 1910 // Iterate from the oldest sub-level for L0 -> Lbase and youngest sub-level 1911 // for intra-L0. The idea here is that anything that can't be included from 1912 // that level constrains what can be included from the next level. This 1913 // change in constraint is directly incorporated into minIntervalIndex, 1914 // maxIntervalIndex. 1915 for sl := startLevel; sl != endLevel; sl += increment { 1916 files := s.levelFiles[sl] 1917 // Find the first file that overlaps with minIntervalIndex. 1918 index := sort.Search(len(files), func(i int) bool { 1919 return minIntervalIndex <= files[i].maxIntervalIndex 1920 }) 1921 // Track the files that are fully within the current constraint of 1922 // [minIntervalIndex, maxIntervalIndex]. 1923 firstIndex := -1 1924 lastIndex := -1 1925 for ; index < len(files); index++ { 1926 f := files[index] 1927 if f.minIntervalIndex > maxIntervalIndex { 1928 break 1929 } 1930 include := true 1931 // Extends out on the left so can't be included. This narrows what 1932 // we can included in the next level. 1933 if f.minIntervalIndex < minIntervalIndex { 1934 include = false 1935 minIntervalIndex = f.maxIntervalIndex + 1 1936 } 1937 // Extends out on the right so can't be included. 1938 if f.maxIntervalIndex > maxIntervalIndex { 1939 include = false 1940 maxIntervalIndex = f.minIntervalIndex - 1 1941 } 1942 if !include { 1943 continue 1944 } 1945 if firstIndex == -1 { 1946 firstIndex = index 1947 } 1948 lastIndex = index 1949 } 1950 if minIntervalIndex > maxIntervalIndex { 1951 // We excluded files that prevent continuation. 1952 break 1953 } 1954 if firstIndex < 0 { 1955 // No files to add in this sub-level. 1956 continue 1957 } 1958 // We have the files in [firstIndex, lastIndex] as potential for 1959 // inclusion. Some of these may already have been picked. Some of them 1960 // may be already compacting. The latter is tricky since we have to 1961 // decide whether to contract minIntervalIndex or maxIntervalIndex when 1962 // we encounter an already compacting file. We pick the longest sequence 1963 // between firstIndex and lastIndex of non-compacting files -- this is 1964 // represented by [candidateNonCompactingFirst, 1965 // candidateNonCompactingLast]. 1966 nonCompactingFirst := -1 1967 currentRunHasAlreadyPickedFiles := false 1968 candidateNonCompactingFirst := -1 1969 candidateNonCompactingLast := -1 1970 candidateHasAlreadyPickedFiles := false 1971 for index = firstIndex; index <= lastIndex; index++ { 1972 f := files[index] 1973 if f.IsCompacting() { 1974 if nonCompactingFirst != -1 { 1975 last := index - 1 1976 // Prioritize runs of consecutive non-compacting files that 1977 // have files that have already been picked. That is to say, 1978 // if candidateHasAlreadyPickedFiles == true, we stick with 1979 // it, and if currentRunHasAlreadyPickedfiles == true, we 1980 // pick that run even if it contains fewer files than the 1981 // previous candidate. 1982 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || 1983 currentRunHasAlreadyPickedFiles || 1984 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { 1985 candidateNonCompactingFirst = nonCompactingFirst 1986 candidateNonCompactingLast = last 1987 candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles 1988 } 1989 } 1990 nonCompactingFirst = -1 1991 currentRunHasAlreadyPickedFiles = false 1992 continue 1993 } 1994 if nonCompactingFirst == -1 { 1995 nonCompactingFirst = index 1996 } 1997 if candidate.FilesIncluded[f.L0Index] { 1998 currentRunHasAlreadyPickedFiles = true 1999 } 2000 } 2001 // Logic duplicated from inside the for loop above. 2002 if nonCompactingFirst != -1 { 2003 last := index - 1 2004 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || 2005 currentRunHasAlreadyPickedFiles || 2006 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { 2007 candidateNonCompactingFirst = nonCompactingFirst 2008 candidateNonCompactingLast = last 2009 } 2010 } 2011 if candidateNonCompactingFirst == -1 { 2012 // All files are compacting. There will be gaps that we could 2013 // exploit to continue, but don't bother. 2014 break 2015 } 2016 // May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level. 2017 if candidateNonCompactingFirst > firstIndex { 2018 minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1 2019 } 2020 if candidateNonCompactingLast < lastIndex { 2021 maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1 2022 } 2023 for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ { 2024 f := files[index] 2025 if f.IsCompacting() { 2026 // TODO(bilal): Do a logger.Fatalf instead of a panic, for 2027 // cleaner unwinding and error messages. 2028 panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum)) 2029 } 2030 if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum { 2031 continue 2032 } 2033 if !candidate.FilesIncluded[f.L0Index] { 2034 addedCount++ 2035 candidate.addFile(f) 2036 } 2037 } 2038 } 2039 return addedCount > 0 2040 }