github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/manifest/l0_sublevels.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package manifest 6 7 import ( 8 "bytes" 9 "fmt" 10 "math" 11 "sort" 12 "strings" 13 14 "github.com/cockroachdb/errors" 15 "github.com/zuoyebang/bitalostable/internal/base" 16 "github.com/zuoyebang/bitalostable/internal/invariants" 17 ) 18 19 // errInvalidL0SublevelsOpt is for use in AddL0Files when the incremental 20 // sublevel generation optimization failed, and NewL0Sublevels must be called. 21 var errInvalidL0SublevelsOpt = errors.New("bitalostable: L0 sublevel generation optimization cannot be used") 22 23 // Intervals are of the form [start, end) with no gap between intervals. Each 24 // file overlaps perfectly with a sequence of intervals. This perfect overlap 25 // occurs because the union of file boundary keys is used to pick intervals. 26 // However the largest key in a file is inclusive, so when it is used as 27 // an interval, the actual key is ImmediateSuccessor(key). We don't have the 28 // ImmediateSuccessor function to do this computation, so we instead keep an 29 // isLargest bool to remind the code about this fact. This is used for 30 // comparisons in the following manner: 31 // - intervalKey{k, false} < intervalKey{k, true} 32 // - k1 < k2 -> intervalKey{k1, _} < intervalKey{k2, _}. 33 // 34 // Note that the file's largest key is exclusive if the internal key 35 // has a trailer matching the rangedel sentinel key. In this case, we set 36 // isLargest to false for end interval computation. 37 // 38 // For example, consider three files with bounds [a,e], [b,g], and [e,j]. The 39 // interval keys produced would be intervalKey{a, false}, intervalKey{b, false}, 40 // intervalKey{e, false}, intervalKey{e, true}, intervalKey{g, true} and 41 // intervalKey{j, true}, resulting in intervals 42 // [a, b), [b, (e, false)), [(e,false), (e, true)), [(e, true), (g, true)) and 43 // [(g, true), (j, true)). The first file overlaps with the first three 44 // perfectly, the second file overlaps with the second through to fourth 45 // intervals, and the third file overlaps with the last three. 46 // 47 // The intervals are indexed starting from 0, with the index of the interval 48 // being the index of the start key of the interval. 49 // 50 // In addition to helping with compaction picking, we use interval indices 51 // to assign each file an interval range once. Subsequent operations, say 52 // picking overlapping files for a compaction, only need to use the index 53 // numbers and so avoid expensive byte slice comparisons. 54 type intervalKey struct { 55 key []byte 56 isLargest bool 57 } 58 59 // intervalKeyTemp is used in the sortAndSweep step. It contains additional metadata 60 // which is used to generate the {min,max}IntervalIndex for files. 61 type intervalKeyTemp struct { 62 intervalKey intervalKey 63 fileMeta *FileMetadata 64 isEndKey bool 65 } 66 67 func (i *intervalKeyTemp) setFileIntervalIndex(idx int) { 68 if i.isEndKey { 69 // This is the right endpoint of some file interval, so the 70 // file.maxIntervalIndex must be j - 1 as maxIntervalIndex is 71 // inclusive. 72 i.fileMeta.maxIntervalIndex = idx - 1 73 return 74 } 75 // This is the left endpoint for some file interval, so the 76 // file.minIntervalIndex must be j. 77 i.fileMeta.minIntervalIndex = idx 78 } 79 80 func intervalKeyCompare(cmp Compare, a, b intervalKey) int { 81 rv := cmp(a.key, b.key) 82 if rv == 0 { 83 if a.isLargest && !b.isLargest { 84 return +1 85 } 86 if !a.isLargest && b.isLargest { 87 return -1 88 } 89 } 90 return rv 91 } 92 93 type intervalKeySorter struct { 94 keys []intervalKeyTemp 95 cmp Compare 96 } 97 98 func (s intervalKeySorter) Len() int { return len(s.keys) } 99 func (s intervalKeySorter) Less(i, j int) bool { 100 return intervalKeyCompare(s.cmp, s.keys[i].intervalKey, s.keys[j].intervalKey) < 0 101 } 102 func (s intervalKeySorter) Swap(i, j int) { 103 s.keys[i], s.keys[j] = s.keys[j], s.keys[i] 104 } 105 106 // sortAndSweep will sort the intervalKeys using intervalKeySorter, remove the 107 // duplicate fileIntervals, and set the {min, max}IntervalIndex for the files. 108 func sortAndSweep(keys []intervalKeyTemp, cmp Compare) []intervalKeyTemp { 109 if len(keys) == 0 { 110 return nil 111 } 112 sorter := intervalKeySorter{keys: keys, cmp: cmp} 113 sort.Sort(sorter) 114 115 // intervalKeys are generated using the file bounds. Specifically, there are 2 intervalKeys 116 // for each file, and len(keys) = 2 * number of files. Each intervalKeyTemp stores information 117 // about which file it was generated from, and whether the key represents the end key of the file. 118 // So, as we're deduplicating the `keys` slice, we're guaranteed to iterate over the interval 119 // keys belonging to each of the files. Since the file.{min,max}IntervalIndex points to the position 120 // of the files bounds in the deduplicated `keys` slice, we can determine file.{min,max}IntervalIndex 121 // during the iteration. 122 i := 0 123 j := 0 124 for i < len(keys) { 125 // loop invariant: j <= i 126 currKey := keys[i] 127 keys[j] = keys[i] 128 129 for { 130 keys[i].setFileIntervalIndex(j) 131 i++ 132 if i >= len(keys) || intervalKeyCompare(cmp, currKey.intervalKey, keys[i].intervalKey) != 0 { 133 break 134 } 135 } 136 j++ 137 } 138 return keys[:j] 139 } 140 141 // A key interval of the form [start, end). The end is not represented here 142 // since it is implicit in the start of the next interval. The last interval is 143 // an exception but we don't need to ever lookup the end of that interval; the 144 // last fileInterval will only act as an end key marker. The set of intervals 145 // is const after initialization. 146 type fileInterval struct { 147 index int 148 startKey intervalKey 149 150 // True iff some file in this interval is compacting to base. Such intervals 151 // cannot have any files participate in L0 -> Lbase compactions. 152 isBaseCompacting bool 153 154 // The min and max intervals index across all the files that overlap with this 155 // interval. Inclusive on both sides. 156 filesMinIntervalIndex int 157 filesMaxIntervalIndex int 158 159 // True if another interval that has a file extending into this interval is 160 // undergoing a compaction into Lbase. In other words, this bool is true 161 // if any interval in [filesMinIntervalIndex, 162 // filesMaxIntervalIndex] has isBaseCompacting set to true. This 163 // lets the compaction picker de-prioritize this interval for picking 164 // compactions, since there's a high chance that a base compaction with a 165 // sufficient height of sublevels rooted at this interval could not be 166 // chosen due to the ongoing base compaction in the 167 // other interval. If the file straddling the two intervals is at a 168 // sufficiently high sublevel (with enough compactible files below it to 169 // satisfy minCompactionDepth), this is not an issue, but to optimize for 170 // quickly picking base compactions far away from other base compactions, 171 // this bool is used as a heuristic (but not as a complete disqualifier). 172 intervalRangeIsBaseCompacting bool 173 174 // All files in this interval, in increasing sublevel order. 175 files []*FileMetadata 176 177 // len(files) - compactingFileCount is the stack depth that requires 178 // starting new compactions. This metric is not precise since the 179 // compactingFileCount can include files that are part of N (where N > 1) 180 // intra-L0 compactions, so the stack depth after those complete will be 181 // len(files) - compactingFileCount + N. We ignore this imprecision since 182 // we don't want to track which files are part of which intra-L0 183 // compaction. 184 compactingFileCount int 185 186 // Interpolated from files in this interval. For files spanning multiple 187 // intervals, we assume an equal distribution of bytes across all those 188 // intervals. 189 estimatedBytes uint64 190 } 191 192 // Helper type for any cases requiring a bool slice. 193 type bitSet []bool 194 195 func newBitSet(n int) bitSet { 196 return make([]bool, n) 197 } 198 199 func (b *bitSet) markBit(i int) { 200 (*b)[i] = true 201 } 202 203 func (b *bitSet) markBits(start, end int) { 204 for i := start; i < end; i++ { 205 (*b)[i] = true 206 } 207 } 208 209 func (b *bitSet) clearAllBits() { 210 for i := range *b { 211 (*b)[i] = false 212 } 213 } 214 215 // L0Compaction describes an active compaction with inputs from L0. 216 type L0Compaction struct { 217 Smallest InternalKey 218 Largest InternalKey 219 IsIntraL0 bool 220 } 221 222 // L0Sublevels represents a sublevel view of SSTables in L0. Tables in one 223 // sublevel are non-overlapping in key ranges, and keys in higher-indexed 224 // sublevels shadow older versions in lower-indexed sublevels. These invariants 225 // are similar to the regular level invariants, except with higher indexed 226 // sublevels having newer keys as opposed to lower indexed levels. 227 // 228 // There is no limit to the number of sublevels that can exist in L0 at any 229 // time, however read and compaction performance is best when there are as few 230 // sublevels as possible. 231 type L0Sublevels struct { 232 // Levels are ordered from oldest sublevel to youngest sublevel in the 233 // outer slice, and the inner slice contains non-overlapping files for 234 // that sublevel in increasing key order. Levels is constructed from 235 // levelFiles and is used by callers that require a LevelSlice. The below two 236 // fields are treated as immutable once created in NewL0Sublevels. 237 Levels []LevelSlice 238 levelFiles [][]*FileMetadata 239 240 cmp Compare 241 formatKey base.FormatKey 242 243 fileBytes uint64 244 // All the L0 files, ordered from oldest to youngest. 245 levelMetadata *LevelMetadata 246 247 // The file intervals in increasing key order. 248 orderedIntervals []fileInterval 249 250 // Keys to break flushes at. 251 flushSplitUserKeys [][]byte 252 253 // Only used to check invariants. 254 addL0FilesCalled bool 255 } 256 257 type sublevelSorter []*FileMetadata 258 259 // Len implements sort.Interface. 260 func (sl sublevelSorter) Len() int { 261 return len(sl) 262 } 263 264 // Less implements sort.Interface. 265 func (sl sublevelSorter) Less(i, j int) bool { 266 return sl[i].minIntervalIndex < sl[j].minIntervalIndex 267 } 268 269 // Swap implements sort.Interface. 270 func (sl sublevelSorter) Swap(i, j int) { 271 sl[i], sl[j] = sl[j], sl[i] 272 } 273 274 // NewL0Sublevels creates an L0Sublevels instance for a given set of L0 files. 275 // These files must all be in L0 and must be sorted by seqnum (see 276 // SortBySeqNum). During interval iteration, when flushSplitMaxBytes bytes are 277 // exceeded in the range of intervals since the last flush split key, a flush 278 // split key is added. 279 // 280 // This method can be called without DB.mu being held, so any DB.mu protected 281 // fields in FileMetadata cannot be accessed here, such as Compacting and 282 // IsIntraL0Compacting. Those fields are accessed in InitCompactingFileInfo 283 // instead. 284 func NewL0Sublevels( 285 levelMetadata *LevelMetadata, cmp Compare, formatKey base.FormatKey, flushSplitMaxBytes int64, 286 ) (*L0Sublevels, error) { 287 s := &L0Sublevels{cmp: cmp, formatKey: formatKey} 288 s.levelMetadata = levelMetadata 289 keys := make([]intervalKeyTemp, 0, 2*s.levelMetadata.Len()) 290 iter := levelMetadata.Iter() 291 for i, f := 0, iter.First(); f != nil; i, f = i+1, iter.Next() { 292 f.L0Index = i 293 keys = append(keys, intervalKeyTemp{ 294 intervalKey: intervalKey{key: f.Smallest.UserKey}, 295 fileMeta: f, 296 isEndKey: false, 297 }) 298 keys = append(keys, intervalKeyTemp{ 299 intervalKey: intervalKey{ 300 key: f.Largest.UserKey, 301 isLargest: !f.Largest.IsExclusiveSentinel(), 302 }, 303 fileMeta: f, 304 isEndKey: true, 305 }) 306 } 307 keys = sortAndSweep(keys, cmp) 308 // All interval indices reference s.orderedIntervals. 309 s.orderedIntervals = make([]fileInterval, len(keys)) 310 for i := range keys { 311 s.orderedIntervals[i] = fileInterval{ 312 index: i, 313 startKey: keys[i].intervalKey, 314 filesMinIntervalIndex: i, 315 filesMaxIntervalIndex: i, 316 } 317 } 318 // Initialize minIntervalIndex and maxIntervalIndex for each file, and use that 319 // to update intervals. 320 for f := iter.First(); f != nil; f = iter.Next() { 321 if err := s.addFileToSublevels(f, false /* checkInvariant */); err != nil { 322 return nil, err 323 } 324 } 325 // Sort each sublevel in increasing key order. 326 for i := range s.levelFiles { 327 sort.Sort(sublevelSorter(s.levelFiles[i])) 328 } 329 330 // Construct a parallel slice of sublevel B-Trees. 331 // TODO(jackson): Consolidate and only use the B-Trees. 332 for _, sublevelFiles := range s.levelFiles { 333 tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles) 334 s.Levels = append(s.Levels, ls) 335 tr.release() 336 } 337 338 s.calculateFlushSplitKeys(flushSplitMaxBytes) 339 return s, nil 340 } 341 342 // Helper function to merge new intervalKeys into an existing slice 343 // of old fileIntervals, into result. Returns the new result and a slice of ints 344 // mapping old interval indices to new ones. The added intervalKeys do not 345 // need to be sorted; they get sorted and deduped in this function. 346 func mergeIntervals( 347 old, result []fileInterval, added []intervalKeyTemp, compare Compare, 348 ) ([]fileInterval, []int) { 349 sorter := intervalKeySorter{keys: added, cmp: compare} 350 sort.Sort(sorter) 351 352 oldToNewMap := make([]int, len(old)) 353 i := 0 354 j := 0 355 356 for i < len(old) || j < len(added) { 357 for j > 0 && j < len(added) && intervalKeyCompare(compare, added[j-1].intervalKey, added[j].intervalKey) == 0 { 358 added[j].setFileIntervalIndex(len(result) - 1) 359 j++ 360 } 361 if i >= len(old) && j >= len(added) { 362 break 363 } 364 var cmp int 365 if i >= len(old) { 366 cmp = +1 367 } 368 if j >= len(added) { 369 cmp = -1 370 } 371 if cmp == 0 { 372 cmp = intervalKeyCompare(compare, old[i].startKey, added[j].intervalKey) 373 } 374 switch { 375 case cmp <= 0: 376 // Shallow-copy the existing interval. 377 newInterval := old[i] 378 result = append(result, newInterval) 379 oldToNewMap[i] = len(result) - 1 380 i++ 381 if cmp == 0 { 382 added[j].setFileIntervalIndex(len(result) - 1) 383 j++ 384 } 385 case cmp > 0: 386 var prevInterval fileInterval 387 // Insert a new interval for a newly-added file. prevInterval, if 388 // non-zero, will be "inherited"; we copy its files as those extend 389 // into this interval. 390 if len(result) > 0 { 391 prevInterval = result[len(result)-1] 392 } 393 newInterval := fileInterval{ 394 index: len(result), 395 startKey: added[j].intervalKey, 396 filesMinIntervalIndex: len(result), 397 filesMaxIntervalIndex: len(result), 398 399 // estimatedBytes gets recalculated later on, as the number of intervals 400 // the file bytes are interpolated over has changed. 401 estimatedBytes: 0, 402 // Copy the below attributes from prevInterval. 403 files: append([]*FileMetadata(nil), prevInterval.files...), 404 isBaseCompacting: prevInterval.isBaseCompacting, 405 intervalRangeIsBaseCompacting: prevInterval.intervalRangeIsBaseCompacting, 406 compactingFileCount: prevInterval.compactingFileCount, 407 } 408 result = append(result, newInterval) 409 added[j].setFileIntervalIndex(len(result) - 1) 410 j++ 411 } 412 } 413 return result, oldToNewMap 414 } 415 416 // AddL0Files incrementally builds a new L0Sublevels for when the only 417 // change since the receiver L0Sublevels was an addition of the specified files, 418 // with no L0 deletions. The common case of this is an ingestion or a flush. 419 // These files can "sit on top" of existing sublevels, creating at most one 420 // new sublevel for a flush (and possibly multiple for an ingestion), and at 421 // most 2*len(files) additions to s.orderedIntervals. No files must have been 422 // deleted from L0, and the added files must all be newer in sequence numbers 423 // than existing files in L0Sublevels. The files parameter must be sorted in 424 // seqnum order. The levelMetadata parameter corresponds to the new L0 post 425 // addition of files. This method is meant to be significantly more performant 426 // than NewL0Sublevels. 427 // 428 // Note that this function can only be called once on a given receiver; it 429 // appends to some slices in s which is only safe when done once. This is okay, 430 // as the common case (generating a new L0Sublevels after a flush/ingestion) is 431 // only going to necessitate one call of this method on a given receiver. The 432 // returned value, if non-nil, can then have AddL0Files called on it again, and 433 // so on. If errInvalidL0SublevelsOpt is returned as an error, it likely means 434 // the optimization could not be applied (i.e. files added were older than files 435 // already in the sublevels, which is possible around ingestions and in tests). 436 // Eg. it can happen when an ingested file was ingested without queueing a flush 437 // since it did not actually overlap with any keys in the memtable. Later on the 438 // memtable was flushed, and the memtable had keys spanning around the ingested 439 // file, producing a flushed file that overlapped with the ingested file in file 440 // bounds but not in keys. It's possible for that flushed file to have a lower 441 // LargestSeqNum than the ingested file if all the additions after the ingestion 442 // were to another flushed file that was split into a separate sstable during 443 // flush. Any other non-nil error means L0Sublevels generation failed in the same 444 // way as NewL0Sublevels would likely fail. 445 func (s *L0Sublevels) AddL0Files( 446 files []*FileMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata, 447 ) (*L0Sublevels, error) { 448 if invariants.Enabled && s.addL0FilesCalled { 449 panic("AddL0Files called twice on the same receiver") 450 } 451 s.addL0FilesCalled = true 452 453 // Start with a shallow copy of s. 454 newVal := &L0Sublevels{} 455 *newVal = *s 456 457 newVal.addL0FilesCalled = false 458 newVal.levelMetadata = levelMetadata 459 // Deep copy levelFiles and Levels, as they are mutated and sorted below. 460 // Shallow copies of slices that we just append to, are okay. 461 newVal.levelFiles = make([][]*FileMetadata, len(s.levelFiles)) 462 for i := range s.levelFiles { 463 newVal.levelFiles[i] = make([]*FileMetadata, len(s.levelFiles[i])) 464 copy(newVal.levelFiles[i], s.levelFiles[i]) 465 } 466 newVal.Levels = make([]LevelSlice, len(s.Levels)) 467 copy(newVal.Levels, s.Levels) 468 469 fileKeys := make([]intervalKeyTemp, 0, 2*len(files)) 470 for _, f := range files { 471 left := intervalKeyTemp{ 472 intervalKey: intervalKey{key: f.Smallest.UserKey}, 473 fileMeta: f, 474 } 475 right := intervalKeyTemp{ 476 intervalKey: intervalKey{ 477 key: f.Largest.UserKey, 478 isLargest: !f.Largest.IsExclusiveSentinel(), 479 }, 480 fileMeta: f, 481 isEndKey: true, 482 } 483 fileKeys = append(fileKeys, left, right) 484 } 485 keys := make([]fileInterval, 0, 2*levelMetadata.Len()) 486 var oldToNewMap []int 487 // We can avoid the sortAndSweep step on the combined length of 488 // s.orderedIntervals and fileKeys by treating this as a merge of two 489 // sorted runs, fileKeys and s.orderedIntervals, into `keys` which will form 490 // newVal.orderedIntervals. 491 keys, oldToNewMap = mergeIntervals(s.orderedIntervals, keys, fileKeys, s.cmp) 492 if invariants.Enabled { 493 for i := 1; i < len(keys); i++ { 494 if intervalKeyCompare(newVal.cmp, keys[i-1].startKey, keys[i].startKey) >= 0 { 495 panic("keys not sorted correctly") 496 } 497 } 498 } 499 newVal.orderedIntervals = keys 500 // Update indices in s.orderedIntervals for fileIntervals we retained. 501 for _, newIdx := range oldToNewMap { 502 newInterval := &keys[newIdx] 503 newInterval.index = newIdx 504 // This code, and related code in the for loop below, adjusts 505 // files{Min,Max}IntervalIndex just for interval indices shifting due to new 506 // intervals, and not for any of the new files being added to the same 507 // intervals. The goal is to produce a state of the system that's accurate 508 // for all existing files, and has all the new intervals to support new 509 // files. Once that's done, we can just call addFileToSublevel to adjust 510 // all relevant intervals for new files. 511 newInterval.filesMinIntervalIndex = oldToNewMap[newInterval.filesMinIntervalIndex] 512 // maxIntervalIndexes are special. Since it's an inclusive end bound, we 513 // actually have to map it to the _next_ old interval's new previous 514 // interval. This logic is easier to understand if you see 515 // [f.minIntervalIndex, f.maxIntervalIndex] as [f.minIntervalIndex, 516 // f.maxIntervalIndex+1). The other case to remember is when the interval is 517 // completely empty (i.e. len(newInterval.files) == 0); in that case we want 518 // to refer back to ourselves regardless of additions to the right of us. 519 if newInterval.filesMaxIntervalIndex < len(oldToNewMap)-1 && len(newInterval.files) > 0 { 520 newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex+1] - 1 521 } else { 522 // newInterval.filesMaxIntervalIndex == len(oldToNewMap)-1. 523 newInterval.filesMaxIntervalIndex = oldToNewMap[newInterval.filesMaxIntervalIndex] 524 } 525 } 526 // Loop through all instances of new intervals added between two old intervals 527 // and expand [filesMinIntervalIndex, filesMaxIntervalIndex] of new intervals 528 // to reflect that of adjacent old intervals. 529 { 530 // We can skip cases where new intervals were added to the left of all 531 // existing intervals (eg. if the first entry in oldToNewMap is 532 // oldToNewMap[0] >= 1). Those intervals will only contain newly added files 533 // and will have their parameters adjusted down in addFileToSublevels. The 534 // same can also be said about new intervals that are to the right of all 535 // existing intervals. 536 lastIdx := 0 537 for _, newIdx := range oldToNewMap { 538 for i := lastIdx + 1; i < newIdx; i++ { 539 minIntervalIndex := i 540 maxIntervalIndex := i 541 if keys[lastIdx].filesMaxIntervalIndex != lastIdx { 542 // Last old interval has files extending into keys[i]. 543 minIntervalIndex = keys[lastIdx].filesMinIntervalIndex 544 maxIntervalIndex = keys[lastIdx].filesMaxIntervalIndex 545 } 546 547 keys[i].filesMinIntervalIndex = minIntervalIndex 548 keys[i].filesMaxIntervalIndex = maxIntervalIndex 549 } 550 lastIdx = newIdx 551 } 552 } 553 // Go through old files and update interval indices. 554 // 555 // TODO(bilal): This is the only place in this method where we loop through 556 // all existing files, which could be much more in number than newly added 557 // files. See if we can avoid the need for this, either by getting rid of 558 // f.minIntervalIndex and f.maxIntervalIndex and calculating them on the 559 // fly with a binary search, or by only looping through files to the right 560 // of the first interval touched by this method. 561 for sublevel := range s.Levels { 562 s.Levels[sublevel].Each(func(f *FileMetadata) { 563 oldIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 564 oldMinIntervalIndex := f.minIntervalIndex 565 f.minIntervalIndex = oldToNewMap[f.minIntervalIndex] 566 // maxIntervalIndex is special. Since it's an inclusive end bound, we 567 // actually have to map it to the _next_ old interval's new previous 568 // interval. This logic is easier to understand if you see 569 // [f.minIntervalIndex, f.maxIntervalIndex] as 570 // [f.minIntervalIndex, f.maxIntervalIndex+1). 571 f.maxIntervalIndex = oldToNewMap[f.maxIntervalIndex+1] - 1 572 newIntervalDelta := f.maxIntervalIndex - f.minIntervalIndex + 1 573 // Recalculate estimatedBytes for all old files across new intervals, but 574 // only if new intervals were added in between. 575 if oldIntervalDelta != newIntervalDelta { 576 // j is incremented so that oldToNewMap[j] points to the next old 577 // interval. This is used to distinguish between old intervals (i.e. 578 // ones where we need to subtract f.Size/oldIntervalDelta) from new 579 // ones (where we don't need to subtract). In both cases we need to add 580 // f.Size/newIntervalDelta. 581 j := oldMinIntervalIndex 582 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 583 if oldToNewMap[j] == i { 584 newVal.orderedIntervals[i].estimatedBytes -= f.Size / uint64(oldIntervalDelta) 585 j++ 586 } 587 newVal.orderedIntervals[i].estimatedBytes += f.Size / uint64(newIntervalDelta) 588 } 589 } 590 }) 591 } 592 updatedSublevels := make([]int, 0) 593 // Update interval indices for new files. 594 for i, f := range files { 595 f.L0Index = s.levelMetadata.Len() + i 596 if err := newVal.addFileToSublevels(f, true /* checkInvariant */); err != nil { 597 return nil, err 598 } 599 updatedSublevels = append(updatedSublevels, f.SubLevel) 600 } 601 602 // Sort and deduplicate updatedSublevels. 603 sort.Ints(updatedSublevels) 604 { 605 j := 0 606 for i := 1; i < len(updatedSublevels); i++ { 607 if updatedSublevels[i] != updatedSublevels[j] { 608 j++ 609 updatedSublevels[j] = updatedSublevels[i] 610 } 611 } 612 updatedSublevels = updatedSublevels[:j+1] 613 } 614 615 // Sort each updated sublevel in increasing key order. 616 for _, sublevel := range updatedSublevels { 617 sort.Sort(sublevelSorter(newVal.levelFiles[sublevel])) 618 } 619 620 // Construct a parallel slice of sublevel B-Trees. 621 // TODO(jackson): Consolidate and only use the B-Trees. 622 for _, sublevel := range updatedSublevels { 623 tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) 624 if sublevel == len(newVal.Levels) { 625 newVal.Levels = append(newVal.Levels, ls) 626 } else { 627 // sublevel < len(s.Levels). If this panics, updatedSublevels was not 628 // populated correctly. 629 newVal.Levels[sublevel] = ls 630 } 631 tr.release() 632 } 633 634 newVal.flushSplitUserKeys = nil 635 newVal.calculateFlushSplitKeys(flushSplitMaxBytes) 636 return newVal, nil 637 } 638 639 // addFileToSublevels is called during L0Sublevels generation, and adds f to 640 // the correct sublevel's levelFiles, the relevant intervals' files slices, and 641 // sets interval indices on f. This method, if called successively on multiple 642 // files, _must_ be called on successively newer files (by seqnum). If 643 // checkInvariant is true, it could check for this in some cases and return 644 // errInvalidL0SublevelsOpt if that invariant isn't held. 645 func (s *L0Sublevels) addFileToSublevels(f *FileMetadata, checkInvariant bool) error { 646 // This is a simple and not very accurate estimate of the number of 647 // bytes this SSTable contributes to the intervals it is a part of. 648 // 649 // TODO(bilal): Call EstimateDiskUsage in sstable.Reader with interval 650 // bounds to get a better estimate for each interval. 651 interpolatedBytes := f.Size / uint64(f.maxIntervalIndex-f.minIntervalIndex+1) 652 s.fileBytes += f.Size 653 subLevel := 0 654 // Update state in every fileInterval for this file. 655 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 656 interval := &s.orderedIntervals[i] 657 if len(interval.files) > 0 && 658 subLevel <= interval.files[len(interval.files)-1].SubLevel { 659 if checkInvariant && interval.files[len(interval.files)-1].LargestSeqNum > f.LargestSeqNum { 660 // We are sliding this file "underneath" an existing file. Throw away 661 // and start over in NewL0Sublevels. 662 return errInvalidL0SublevelsOpt 663 } 664 subLevel = interval.files[len(interval.files)-1].SubLevel + 1 665 } 666 interval.estimatedBytes += interpolatedBytes 667 if f.minIntervalIndex < interval.filesMinIntervalIndex { 668 interval.filesMinIntervalIndex = f.minIntervalIndex 669 } 670 if f.maxIntervalIndex > interval.filesMaxIntervalIndex { 671 interval.filesMaxIntervalIndex = f.maxIntervalIndex 672 } 673 interval.files = append(interval.files, f) 674 } 675 f.SubLevel = subLevel 676 if subLevel > len(s.levelFiles) { 677 return errors.Errorf("chose a sublevel beyond allowed range of sublevels: %d vs 0-%d", subLevel, len(s.levelFiles)) 678 } 679 if subLevel == len(s.levelFiles) { 680 s.levelFiles = append(s.levelFiles, []*FileMetadata{f}) 681 } else { 682 s.levelFiles[subLevel] = append(s.levelFiles[subLevel], f) 683 } 684 return nil 685 } 686 687 func (s *L0Sublevels) calculateFlushSplitKeys(flushSplitMaxBytes int64) { 688 var cumulativeBytes uint64 689 // Multiply flushSplitMaxBytes by the number of sublevels. This prevents 690 // excessive flush splitting when the number of sublevels increases. 691 flushSplitMaxBytes *= int64(len(s.levelFiles)) 692 for i := 0; i < len(s.orderedIntervals); i++ { 693 interval := &s.orderedIntervals[i] 694 if flushSplitMaxBytes > 0 && cumulativeBytes > uint64(flushSplitMaxBytes) && 695 (len(s.flushSplitUserKeys) == 0 || 696 !bytes.Equal(interval.startKey.key, s.flushSplitUserKeys[len(s.flushSplitUserKeys)-1])) { 697 s.flushSplitUserKeys = append(s.flushSplitUserKeys, interval.startKey.key) 698 cumulativeBytes = 0 699 } 700 cumulativeBytes += s.orderedIntervals[i].estimatedBytes 701 } 702 } 703 704 // InitCompactingFileInfo initializes internal flags relating to compacting 705 // files. Must be called after sublevel initialization. 706 // 707 // Requires DB.mu to be held. 708 func (s *L0Sublevels) InitCompactingFileInfo(inProgress []L0Compaction) { 709 for i := range s.orderedIntervals { 710 s.orderedIntervals[i].compactingFileCount = 0 711 s.orderedIntervals[i].isBaseCompacting = false 712 s.orderedIntervals[i].intervalRangeIsBaseCompacting = false 713 } 714 715 iter := s.levelMetadata.Iter() 716 for f := iter.First(); f != nil; f = iter.Next() { 717 if invariants.Enabled { 718 if !bytes.Equal(s.orderedIntervals[f.minIntervalIndex].startKey.key, f.Smallest.UserKey) { 719 panic(fmt.Sprintf("f.minIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", 720 s.formatKey(s.orderedIntervals[f.minIntervalIndex].startKey.key), s.formatKey(f.Smallest.UserKey))) 721 } 722 if !bytes.Equal(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key, f.Largest.UserKey) { 723 panic(fmt.Sprintf("f.maxIntervalIndex in FileMetadata out of sync with intervals in L0Sublevels: %s != %s", 724 s.formatKey(s.orderedIntervals[f.maxIntervalIndex+1].startKey.key), s.formatKey(f.Smallest.UserKey))) 725 } 726 } 727 if !f.IsCompacting() { 728 continue 729 } 730 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 731 interval := &s.orderedIntervals[i] 732 interval.compactingFileCount++ 733 if !f.IsIntraL0Compacting { 734 // If f.Compacting && !f.IsIntraL0Compacting, this file is 735 // being compacted to Lbase. 736 interval.isBaseCompacting = true 737 } 738 } 739 } 740 741 // Some intervals may be base compacting without the files contained 742 // within those intervals being marked as compacting. This is possible if 743 // the files were added after the compaction initiated, and the active 744 // compaction files straddle the input file. Mark these intervals as base 745 // compacting. 746 for _, c := range inProgress { 747 startIK := intervalKey{key: c.Smallest.UserKey, isLargest: false} 748 endIK := intervalKey{key: c.Largest.UserKey, isLargest: !c.Largest.IsExclusiveSentinel()} 749 start := sort.Search(len(s.orderedIntervals), func(i int) bool { 750 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) >= 0 751 }) 752 end := sort.Search(len(s.orderedIntervals), func(i int) bool { 753 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) >= 0 754 }) 755 for i := start; i < end && i < len(s.orderedIntervals); i++ { 756 interval := &s.orderedIntervals[i] 757 if !c.IsIntraL0 { 758 interval.isBaseCompacting = true 759 } 760 } 761 } 762 763 min := 0 764 for i := range s.orderedIntervals { 765 interval := &s.orderedIntervals[i] 766 if interval.isBaseCompacting { 767 minIndex := interval.filesMinIntervalIndex 768 if minIndex < min { 769 minIndex = min 770 } 771 for j := minIndex; j <= interval.filesMaxIntervalIndex; j++ { 772 min = j 773 s.orderedIntervals[j].intervalRangeIsBaseCompacting = true 774 } 775 } 776 } 777 } 778 779 // String produces a string containing useful debug information. Useful in test 780 // code and debugging. 781 func (s *L0Sublevels) String() string { 782 return s.describe(false) 783 } 784 785 func (s *L0Sublevels) describe(verbose bool) string { 786 var buf strings.Builder 787 fmt.Fprintf(&buf, "file count: %d, sublevels: %d, intervals: %d\nflush split keys(%d): [", 788 s.levelMetadata.Len(), len(s.levelFiles), len(s.orderedIntervals), len(s.flushSplitUserKeys)) 789 for i := range s.flushSplitUserKeys { 790 fmt.Fprintf(&buf, "%s", s.formatKey(s.flushSplitUserKeys[i])) 791 if i < len(s.flushSplitUserKeys)-1 { 792 fmt.Fprintf(&buf, ", ") 793 } 794 } 795 fmt.Fprintln(&buf, "]") 796 numCompactingFiles := 0 797 for i := len(s.levelFiles) - 1; i >= 0; i-- { 798 maxIntervals := 0 799 sumIntervals := 0 800 var totalBytes uint64 801 for _, f := range s.levelFiles[i] { 802 intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 803 if intervals > maxIntervals { 804 maxIntervals = intervals 805 } 806 sumIntervals += intervals 807 totalBytes += f.Size 808 if f.IsCompacting() { 809 numCompactingFiles++ 810 } 811 } 812 fmt.Fprintf(&buf, "0.%d: file count: %d, bytes: %d, width (mean, max): %0.1f, %d, interval range: [%d, %d]\n", 813 i, len(s.levelFiles[i]), totalBytes, float64(sumIntervals)/float64(len(s.levelFiles[i])), maxIntervals, s.levelFiles[i][0].minIntervalIndex, 814 s.levelFiles[i][len(s.levelFiles[i])-1].maxIntervalIndex) 815 for _, f := range s.levelFiles[i] { 816 intervals := f.maxIntervalIndex - f.minIntervalIndex + 1 817 if verbose { 818 fmt.Fprintf(&buf, "\t%s\n", f) 819 } 820 if s.levelMetadata.Len() > 50 && intervals*3 > len(s.orderedIntervals) { 821 var intervalsBytes uint64 822 for k := f.minIntervalIndex; k <= f.maxIntervalIndex; k++ { 823 intervalsBytes += s.orderedIntervals[k].estimatedBytes 824 } 825 fmt.Fprintf(&buf, "wide file: %d, [%d, %d], byte fraction: %f\n", 826 f.FileNum, f.minIntervalIndex, f.maxIntervalIndex, 827 float64(intervalsBytes)/float64(s.fileBytes)) 828 } 829 } 830 } 831 832 lastCompactingIntervalStart := -1 833 fmt.Fprintf(&buf, "compacting file count: %d, base compacting intervals: ", numCompactingFiles) 834 i := 0 835 foundBaseCompactingIntervals := false 836 for ; i < len(s.orderedIntervals); i++ { 837 interval := &s.orderedIntervals[i] 838 if len(interval.files) == 0 { 839 continue 840 } 841 if !interval.isBaseCompacting { 842 if lastCompactingIntervalStart != -1 { 843 if foundBaseCompactingIntervals { 844 buf.WriteString(", ") 845 } 846 fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) 847 foundBaseCompactingIntervals = true 848 } 849 lastCompactingIntervalStart = -1 850 } else { 851 if lastCompactingIntervalStart == -1 { 852 lastCompactingIntervalStart = i 853 } 854 } 855 } 856 if lastCompactingIntervalStart != -1 { 857 if foundBaseCompactingIntervals { 858 buf.WriteString(", ") 859 } 860 fmt.Fprintf(&buf, "[%d, %d]", lastCompactingIntervalStart, i-1) 861 } else if !foundBaseCompactingIntervals { 862 fmt.Fprintf(&buf, "none") 863 } 864 fmt.Fprintln(&buf, "") 865 return buf.String() 866 } 867 868 // ReadAmplification returns the contribution of L0Sublevels to the read 869 // amplification for any particular point key. It is the maximum height of any 870 // tracked fileInterval. This is always less than or equal to the number of 871 // sublevels. 872 func (s *L0Sublevels) ReadAmplification() int { 873 amp := 0 874 for i := range s.orderedIntervals { 875 interval := &s.orderedIntervals[i] 876 fileCount := len(interval.files) 877 if amp < fileCount { 878 amp = fileCount 879 } 880 } 881 return amp 882 } 883 884 // UserKeyRange encodes a key range in user key space. A UserKeyRange's Start 885 // and End boundaries are both inclusive. 886 type UserKeyRange struct { 887 Start, End []byte 888 } 889 890 // InUseKeyRanges returns the merged table bounds of L0 files overlapping the 891 // provided user key range. The returned key ranges are sorted and 892 // nonoverlapping. 893 func (s *L0Sublevels) InUseKeyRanges(smallest, largest []byte) []UserKeyRange { 894 // Binary search to find the provided keys within the intervals. 895 startIK := intervalKey{key: smallest, isLargest: false} 896 endIK := intervalKey{key: largest, isLargest: true} 897 start := sort.Search(len(s.orderedIntervals), func(i int) bool { 898 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, startIK) > 0 899 }) 900 if start > 0 { 901 // Back up to the first interval with a start key <= startIK. 902 start-- 903 } 904 end := sort.Search(len(s.orderedIntervals), func(i int) bool { 905 return intervalKeyCompare(s.cmp, s.orderedIntervals[i].startKey, endIK) > 0 906 }) 907 908 var keyRanges []UserKeyRange 909 var curr *UserKeyRange 910 for i := start; i < end; { 911 // Intervals with no files are not in use and can be skipped, once we 912 // end the current UserKeyRange. 913 if len(s.orderedIntervals[i].files) == 0 { 914 curr = nil 915 i++ 916 continue 917 } 918 919 // If curr is nil, start a new in-use key range. 920 if curr == nil { 921 keyRanges = append(keyRanges, UserKeyRange{ 922 Start: s.orderedIntervals[i].startKey.key, 923 }) 924 curr = &keyRanges[len(keyRanges)-1] 925 } 926 927 // If the filesMaxIntervalIndex is not the current index, we can jump 928 // to the max index, knowing that all intermediary intervals are 929 // overlapped by some file. 930 if maxIdx := s.orderedIntervals[i].filesMaxIntervalIndex; maxIdx != i { 931 // Note that end may be less than or equal to maxIdx if we're 932 // concerned with a key range that ends before the interval at 933 // maxIdx starts. We must set curr.End now, before making that 934 // leap, because this iteration may be the last. 935 i = maxIdx 936 curr.End = s.orderedIntervals[i+1].startKey.key 937 continue 938 } 939 940 // No files overlapping with this interval overlap with the next 941 // interval. Update the current end to be the next interval's start 942 // key. Note that curr is not necessarily finished, because there may 943 // be an abutting non-empty interval. 944 curr.End = s.orderedIntervals[i+1].startKey.key 945 i++ 946 } 947 return keyRanges 948 } 949 950 // FlushSplitKeys returns a slice of user keys to split flushes at. 951 // Used by flushes to avoid writing sstables that straddle these split keys. 952 // These should be interpreted as the keys to start the next sstable (not the 953 // last key to include in the prev sstable). These are user keys so that 954 // range tombstones can be properly truncated (untruncated range tombstones 955 // are not permitted for L0 files). 956 func (s *L0Sublevels) FlushSplitKeys() [][]byte { 957 return s.flushSplitUserKeys 958 } 959 960 // MaxDepthAfterOngoingCompactions returns an estimate of maximum depth of 961 // sublevels after all ongoing compactions run to completion. Used by compaction 962 // picker to decide compaction score for L0. There is no scoring for intra-L0 963 // compactions -- they only run if L0 score is high but we're unable to pick an 964 // L0 -> Lbase compaction. 965 func (s *L0Sublevels) MaxDepthAfterOngoingCompactions() int { 966 depth := 0 967 for i := range s.orderedIntervals { 968 interval := &s.orderedIntervals[i] 969 intervalDepth := len(interval.files) - interval.compactingFileCount 970 if depth < intervalDepth { 971 depth = intervalDepth 972 } 973 } 974 return depth 975 } 976 977 // Only for temporary debugging in the absence of proper tests. 978 // 979 // TODO(bilal): Simplify away the debugging statements in this method, and make 980 // this a pure sanity checker. 981 // 982 //lint:ignore U1000 - useful for debugging 983 func (s *L0Sublevels) checkCompaction(c *L0CompactionFiles) error { 984 includedFiles := newBitSet(s.levelMetadata.Len()) 985 fileIntervalsByLevel := make([]struct { 986 min int 987 max int 988 }, len(s.levelFiles)) 989 for i := range fileIntervalsByLevel { 990 fileIntervalsByLevel[i].min = math.MaxInt32 991 fileIntervalsByLevel[i].max = 0 992 } 993 var topLevel int 994 var increment int 995 var limitReached func(int) bool 996 if c.isIntraL0 { 997 topLevel = len(s.levelFiles) - 1 998 increment = +1 999 limitReached = func(level int) bool { 1000 return level == len(s.levelFiles) 1001 } 1002 } else { 1003 topLevel = 0 1004 increment = -1 1005 limitReached = func(level int) bool { 1006 return level < 0 1007 } 1008 } 1009 for _, f := range c.Files { 1010 if fileIntervalsByLevel[f.SubLevel].min > f.minIntervalIndex { 1011 fileIntervalsByLevel[f.SubLevel].min = f.minIntervalIndex 1012 } 1013 if fileIntervalsByLevel[f.SubLevel].max < f.maxIntervalIndex { 1014 fileIntervalsByLevel[f.SubLevel].max = f.maxIntervalIndex 1015 } 1016 includedFiles.markBit(f.L0Index) 1017 if c.isIntraL0 { 1018 if topLevel > f.SubLevel { 1019 topLevel = f.SubLevel 1020 } 1021 } else { 1022 if topLevel < f.SubLevel { 1023 topLevel = f.SubLevel 1024 } 1025 } 1026 } 1027 min := fileIntervalsByLevel[topLevel].min 1028 max := fileIntervalsByLevel[topLevel].max 1029 for level := topLevel; !limitReached(level); level += increment { 1030 if fileIntervalsByLevel[level].min < min { 1031 min = fileIntervalsByLevel[level].min 1032 } 1033 if fileIntervalsByLevel[level].max > max { 1034 max = fileIntervalsByLevel[level].max 1035 } 1036 index := sort.Search(len(s.levelFiles[level]), func(i int) bool { 1037 return s.levelFiles[level][i].maxIntervalIndex >= min 1038 }) 1039 // start := index 1040 for ; index < len(s.levelFiles[level]); index++ { 1041 f := s.levelFiles[level][index] 1042 if f.minIntervalIndex > max { 1043 break 1044 } 1045 if c.isIntraL0 && f.LargestSeqNum >= c.earliestUnflushedSeqNum { 1046 return errors.Errorf( 1047 "sstable %s in compaction has sequence numbers higher than the earliest unflushed seqnum %d: %d-%d", 1048 f.FileNum, c.earliestUnflushedSeqNum, f.SmallestSeqNum, 1049 f.LargestSeqNum) 1050 } 1051 if !includedFiles[f.L0Index] { 1052 var buf strings.Builder 1053 fmt.Fprintf(&buf, "bug %t, seed interval: %d: level %d, sl index %d, f.index %d, min %d, max %d, pre-min %d, pre-max %d, f.min %d, f.max %d, filenum: %d, isCompacting: %t\n%s\n", 1054 c.isIntraL0, c.seedInterval, level, index, f.L0Index, min, max, c.preExtensionMinInterval, c.preExtensionMaxInterval, 1055 f.minIntervalIndex, f.maxIntervalIndex, 1056 f.FileNum, f.IsCompacting(), s) 1057 fmt.Fprintf(&buf, "files included:\n") 1058 for _, f := range c.Files { 1059 fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", 1060 f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) 1061 } 1062 fmt.Fprintf(&buf, "files added:\n") 1063 for _, f := range c.filesAdded { 1064 fmt.Fprintf(&buf, "filenum: %d, sl: %d, index: %d, [%d, %d]\n", 1065 f.FileNum, f.SubLevel, f.L0Index, f.minIntervalIndex, f.maxIntervalIndex) 1066 } 1067 return errors.New(buf.String()) 1068 } 1069 } 1070 } 1071 return nil 1072 } 1073 1074 // UpdateStateForStartedCompaction updates internal L0Sublevels state for a 1075 // recently started compaction. isBase specifies if this is a base compaction; 1076 // if false, this is assumed to be an intra-L0 compaction. The specified 1077 // compaction must be involving L0 SSTables. It's assumed that the Compacting 1078 // and IsIntraL0Compacting fields are already set on all FileMetadatas passed 1079 // in. 1080 func (s *L0Sublevels) UpdateStateForStartedCompaction(inputs []LevelSlice, isBase bool) error { 1081 minIntervalIndex := -1 1082 maxIntervalIndex := 0 1083 for i := range inputs { 1084 iter := inputs[i].Iter() 1085 for f := iter.First(); f != nil; f = iter.Next() { 1086 for i := f.minIntervalIndex; i <= f.maxIntervalIndex; i++ { 1087 interval := &s.orderedIntervals[i] 1088 interval.compactingFileCount++ 1089 } 1090 if f.minIntervalIndex < minIntervalIndex || minIntervalIndex == -1 { 1091 minIntervalIndex = f.minIntervalIndex 1092 } 1093 if f.maxIntervalIndex > maxIntervalIndex { 1094 maxIntervalIndex = f.maxIntervalIndex 1095 } 1096 } 1097 } 1098 if isBase { 1099 for i := minIntervalIndex; i <= maxIntervalIndex; i++ { 1100 interval := &s.orderedIntervals[i] 1101 interval.isBaseCompacting = isBase 1102 for j := interval.filesMinIntervalIndex; j <= interval.filesMaxIntervalIndex; j++ { 1103 s.orderedIntervals[j].intervalRangeIsBaseCompacting = true 1104 } 1105 } 1106 } 1107 return nil 1108 } 1109 1110 // L0CompactionFiles represents a candidate set of L0 files for compaction. 1111 // Also referred to as "lcf". Contains state information useful 1112 // for generating the compaction (such as Files), as well as for picking 1113 // between candidate compactions (eg. fileBytes and 1114 // seedIntervalStackDepthReduction). 1115 type L0CompactionFiles struct { 1116 Files []*FileMetadata 1117 1118 FilesIncluded bitSet 1119 // A "seed interval" is an interval with a high stack depth that was chosen 1120 // to bootstrap this compaction candidate. seedIntervalStackDepthReduction 1121 // is the number of sublevels that have a file in the seed interval that is 1122 // a part of this compaction. 1123 seedIntervalStackDepthReduction int 1124 // For base compactions, seedIntervalMinLevel is 0, and for intra-L0 1125 // compactions, seedIntervalMaxLevel is len(s.Files)-1 i.e. the highest 1126 // sublevel. 1127 seedIntervalMinLevel int 1128 seedIntervalMaxLevel int 1129 // Index of the seed interval. 1130 seedInterval int 1131 // Sum of file sizes for all files in this compaction. 1132 fileBytes uint64 1133 // Intervals with index [minIntervalIndex, maxIntervalIndex] are 1134 // participating in this compaction; it's the union set of all intervals 1135 // overlapped by participating files. 1136 minIntervalIndex int 1137 maxIntervalIndex int 1138 1139 // Set for intra-L0 compactions. SSTables with sequence numbers greater 1140 // than earliestUnflushedSeqNum cannot be a part of intra-L0 compactions. 1141 isIntraL0 bool 1142 earliestUnflushedSeqNum uint64 1143 1144 // For debugging purposes only. Used in checkCompaction(). 1145 preExtensionMinInterval int 1146 preExtensionMaxInterval int 1147 filesAdded []*FileMetadata 1148 } 1149 1150 // addFile adds the specified file to the LCF. 1151 func (l *L0CompactionFiles) addFile(f *FileMetadata) { 1152 if l.FilesIncluded[f.L0Index] { 1153 return 1154 } 1155 l.FilesIncluded.markBit(f.L0Index) 1156 l.Files = append(l.Files, f) 1157 l.filesAdded = append(l.filesAdded, f) 1158 l.fileBytes += f.Size 1159 if f.minIntervalIndex < l.minIntervalIndex { 1160 l.minIntervalIndex = f.minIntervalIndex 1161 } 1162 if f.maxIntervalIndex > l.maxIntervalIndex { 1163 l.maxIntervalIndex = f.maxIntervalIndex 1164 } 1165 } 1166 1167 // Helper to order intervals being considered for compaction. 1168 type intervalAndScore struct { 1169 interval int 1170 score int 1171 } 1172 type intervalSorterByDecreasingScore []intervalAndScore 1173 1174 func (is intervalSorterByDecreasingScore) Len() int { return len(is) } 1175 func (is intervalSorterByDecreasingScore) Less(i, j int) bool { 1176 return is[i].score > is[j].score 1177 } 1178 func (is intervalSorterByDecreasingScore) Swap(i, j int) { 1179 is[i], is[j] = is[j], is[i] 1180 } 1181 1182 // Compactions: 1183 // 1184 // The sub-levels and intervals can be visualized in 2 dimensions as the X 1185 // axis containing intervals in increasing order and the Y axis containing 1186 // sub-levels (older to younger). The intervals can be sparse wrt sub-levels. 1187 // We observe that the system is typically under severe pressure in L0 during 1188 // large numbers of ingestions where most files added to L0 are narrow and 1189 // non-overlapping. 1190 // 1191 // L0.1 d---g 1192 // L0.0 c--e g--j o--s u--x 1193 // 1194 // As opposed to a case with a lot of wide, overlapping L0 files: 1195 // 1196 // L0.3 d-----------r 1197 // L0.2 c--------o 1198 // L0.1 b-----------q 1199 // L0.0 a----------------x 1200 // 1201 // In that case we expect the rectangle represented in the good visualization 1202 // above (i.e. the first one) to be wide and short, and not too sparse (most 1203 // intervals will have fileCount close to the sub-level count), which would make 1204 // it amenable to concurrent L0 -> Lbase compactions. 1205 // 1206 // L0 -> Lbase: The high-level goal of a L0 -> Lbase compaction is to reduce 1207 // stack depth, by compacting files in the intervals with the highest 1208 // (fileCount - compactingCount). Additionally, we would like compactions to 1209 // not involve a huge number of files, so that they finish quickly, and to 1210 // allow for concurrent L0 -> Lbase compactions when needed. In order to 1211 // achieve these goals we would like compactions to visualize as capturing 1212 // thin and tall rectangles. The approach below is to consider intervals in 1213 // some order and then try to construct a compaction using the interval. The 1214 // first interval we can construct a compaction for is the compaction that is 1215 // started. There can be multiple heuristics in choosing the ordering of the 1216 // intervals -- the code uses one heuristic that worked well for a large 1217 // ingestion stemming from a cockroachdb import, but additional experimentation 1218 // is necessary to pick a general heuristic. Additionally, the compaction that 1219 // gets picked may be not as desirable as one that could be constructed later 1220 // in terms of reducing stack depth (since adding more files to the compaction 1221 // can get blocked by needing to encompass files that are already being 1222 // compacted). So an alternative would be to try to construct more than one 1223 // compaction and pick the best one. 1224 // 1225 // Here's a visualization of an ideal L0->LBase compaction selection: 1226 // 1227 // L0.3 a--d g-j 1228 // L0.2 f--j r-t 1229 // L0.1 b-d e---j 1230 // L0.0 a--d f--j l--o p-----x 1231 // 1232 // Lbase a--------i m---------w 1233 // 1234 // The [g,j] interval has the highest stack depth, so it would have the highest 1235 // priority for selecting a base compaction candidate. Assuming none of the 1236 // files are already compacting, this is the compaction that will be chosen: 1237 // 1238 // _______ 1239 // L0.3 a--d | g-j| 1240 // L0.2 | f--j| r-t 1241 // L0.1 b-d |e---j| 1242 // L0.0 a--d | f--j| l--o p-----x 1243 // 1244 // Lbase a--------i m---------w 1245 // 1246 // Note that running this compaction will mark the a--i file in Lbase as 1247 // compacting, and when ExtendL0ForBaseCompactionTo is called with the bounds 1248 // of that base file, it'll expand the compaction to also include all L0 files 1249 // in the a-d interval. The resultant compaction would then be: 1250 // 1251 // _____________ 1252 // L0.3 |a--d g-j| 1253 // L0.2 | f--j| r-t 1254 // L0.1 | b-d e---j| 1255 // L0.0 |a--d f--j| l--o p-----x 1256 // 1257 // Lbase a--------i m---------w 1258 // 1259 // The next best interval for base compaction would therefore 1260 // be the one including r--t in L0.2 and p--x in L0.0, and both this compaction 1261 // and the one picked earlier can run in parallel. This is assuming 1262 // minCompactionDepth >= 2, otherwise the second compaction has too little 1263 // depth to pick. 1264 // 1265 // _____________ 1266 // L0.3 |a--d g-j| _________ 1267 // L0.2 | f--j| | r-t | 1268 // L0.1 | b-d e---j| | | 1269 // L0.0 |a--d f--j| l--o |p-----x| 1270 // 1271 // Lbase a--------i m---------w 1272 // 1273 // Note that when ExtendL0ForBaseCompactionTo is called, the compaction expands 1274 // to the following, given that the [l,o] file can be added without including 1275 // additional files in Lbase: 1276 // 1277 // _____________ 1278 // L0.3 |a--d g-j| _________ 1279 // L0.2 | f--j| | r-t | 1280 // L0.1 | b-d e---j|______| | 1281 // L0.0 |a--d f--j||l--o p-----x| 1282 // 1283 // Lbase a--------i m---------w 1284 // 1285 // If an additional file existed in LBase that overlapped with [l,o], it would 1286 // be excluded from the compaction. Concretely: 1287 // 1288 // _____________ 1289 // L0.3 |a--d g-j| _________ 1290 // L0.2 | f--j| | r-t | 1291 // L0.1 | b-d e---j| | | 1292 // L0.0 |a--d f--j| l--o |p-----x| 1293 // 1294 // Lbase a--------ij--lm---------w 1295 // 1296 // Intra-L0: If the L0 score is high, but PickBaseCompaction() is unable to 1297 // pick a compaction, PickIntraL0Compaction will be used to pick an intra-L0 1298 // compaction. Similar to L0 -> Lbase compactions, we want to allow for 1299 // multiple intra-L0 compactions and not generate wide output files that 1300 // hinder later concurrency of L0 -> Lbase compactions. Also compactions 1301 // that produce wide files don't reduce stack depth -- they represent wide 1302 // rectangles in our visualization, which means many intervals have their 1303 // depth reduced by a small amount. Typically, L0 files have non-overlapping 1304 // sequence numbers, and sticking to that invariant would require us to 1305 // consider intra-L0 compactions that proceed from youngest to oldest files, 1306 // which could result in the aforementioned undesirable wide rectangle 1307 // shape. But this non-overlapping sequence number is already relaxed in 1308 // RocksDB -- sstables are primarily ordered by their largest sequence 1309 // number. So we can arrange for intra-L0 compactions to capture thin and 1310 // tall rectangles starting with the top of the stack (youngest files). 1311 // Like the L0 -> Lbase case we order the intervals using a heuristic and 1312 // consider each in turn. The same comment about better L0 -> Lbase heuristics 1313 // and not being greedy applies here. 1314 // 1315 // Going back to a modified version of our example from earlier, let's say these 1316 // are the base compactions in progress: 1317 // _______ 1318 // L0.3 a--d | g-j| _________ 1319 // L0.2 | f--j| | r-t | 1320 // L0.1 b-d |e---j| | | 1321 // L0.0 a--d | f--j| l--o |p-----x| 1322 // 1323 // Lbase a---------i m---------w 1324 // 1325 // Since both LBase files are compacting, the only L0 compaction that can be 1326 // picked is an intra-L0 compaction. For this, the b--d interval has the highest 1327 // stack depth (3), and starting with a--d in L0.3 as the seed file, we can 1328 // iterate downward and build this compaction, assuming all files in that 1329 // interval are not compacting and have a highest sequence number less than 1330 // earliestUnflushedSeqNum: 1331 // 1332 // _______ 1333 // L0.3 |a--d| | g-j| _________ 1334 // L0.2 | | | f--j| | r-t | 1335 // L0.1 | b-d| |e---j| | | 1336 // L0.0 |a--d| | f--j| l--o |p-----x| 1337 // ------ 1338 // Lbase a---------i m---------w 1339 // 1340 1341 // PickBaseCompaction picks a base compaction based on the above specified 1342 // heuristics, for the specified Lbase files and a minimum depth of overlapping 1343 // files that can be selected for compaction. Returns nil if no compaction is 1344 // possible. 1345 func (s *L0Sublevels) PickBaseCompaction( 1346 minCompactionDepth int, baseFiles LevelSlice, 1347 ) (*L0CompactionFiles, error) { 1348 // For LBase compactions, we consider intervals in a greedy manner in the 1349 // following order: 1350 // - Intervals that are unlikely to be blocked due 1351 // to ongoing L0 -> Lbase compactions. These are the ones with 1352 // !isBaseCompacting && !intervalRangeIsBaseCompacting. 1353 // - Intervals that are !isBaseCompacting && intervalRangeIsBaseCompacting. 1354 // 1355 // The ordering heuristic exists just to avoid wasted work. Ideally, 1356 // we would consider all intervals with isBaseCompacting = false and 1357 // construct a compaction for it and compare the constructed compactions 1358 // and pick the best one. If microbenchmarks show that we can afford 1359 // this cost we can eliminate this heuristic. 1360 scoredIntervals := make([]intervalAndScore, 0, len(s.orderedIntervals)) 1361 sublevelCount := len(s.levelFiles) 1362 for i := range s.orderedIntervals { 1363 interval := &s.orderedIntervals[i] 1364 depth := len(interval.files) - interval.compactingFileCount 1365 if interval.isBaseCompacting || minCompactionDepth > depth { 1366 continue 1367 } 1368 if interval.intervalRangeIsBaseCompacting { 1369 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth}) 1370 } else { 1371 // Prioritize this interval by incrementing the score by the number 1372 // of sublevels. 1373 scoredIntervals = append(scoredIntervals, intervalAndScore{interval: i, score: depth + sublevelCount}) 1374 } 1375 } 1376 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) 1377 1378 // Optimization to avoid considering different intervals that 1379 // are likely to choose the same seed file. Again this is just 1380 // to reduce wasted work. 1381 consideredIntervals := newBitSet(len(s.orderedIntervals)) 1382 for _, scoredInterval := range scoredIntervals { 1383 interval := &s.orderedIntervals[scoredInterval.interval] 1384 if consideredIntervals[interval.index] { 1385 continue 1386 } 1387 1388 // Pick the seed file for the interval as the file 1389 // in the lowest sub-level. 1390 f := interval.files[0] 1391 // Don't bother considering the intervals that are 1392 // covered by the seed file since they are likely 1393 // nearby. Note that it is possible that those intervals 1394 // have seed files at lower sub-levels so could be 1395 // viable for compaction. 1396 if f == nil { 1397 return nil, errors.New("no seed file found in sublevel intervals") 1398 } 1399 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) 1400 if f.IsCompacting() { 1401 if f.IsIntraL0Compacting { 1402 // If we're picking a base compaction and we came across a 1403 // seed file candidate that's being intra-L0 compacted, skip 1404 // the interval instead of erroring out. 1405 continue 1406 } 1407 // We chose a compaction seed file that should not be 1408 // compacting. Usually means the score is not accurately 1409 // accounting for files already compacting, or internal state is 1410 // inconsistent. 1411 return nil, errors.Errorf("file %s chosen as seed file for compaction should not be compacting", f.FileNum) 1412 } 1413 1414 c := s.baseCompactionUsingSeed(f, interval.index, minCompactionDepth) 1415 if c != nil { 1416 // Check if the chosen compaction overlaps with any files 1417 // in Lbase that have Compacting = true. If that's the case, 1418 // this compaction cannot be chosen. 1419 baseIter := baseFiles.Iter() 1420 // An interval starting at ImmediateSuccessor(key) can never be the 1421 // first interval of a compaction since no file can start at that 1422 // interval. 1423 m := baseIter.SeekGE(s.cmp, s.orderedIntervals[c.minIntervalIndex].startKey.key) 1424 1425 var baseCompacting bool 1426 for ; m != nil && !baseCompacting; m = baseIter.Next() { 1427 cmp := s.cmp(m.Smallest.UserKey, s.orderedIntervals[c.maxIntervalIndex+1].startKey.key) 1428 // Compaction is ending at exclusive bound of c.maxIntervalIndex+1 1429 if cmp > 0 || (cmp == 0 && !s.orderedIntervals[c.maxIntervalIndex+1].startKey.isLargest) { 1430 break 1431 } 1432 baseCompacting = baseCompacting || m.IsCompacting() 1433 } 1434 if baseCompacting { 1435 continue 1436 } 1437 return c, nil 1438 } 1439 } 1440 return nil, nil 1441 } 1442 1443 // Helper function for building an L0 -> Lbase compaction using a seed interval 1444 // and seed file in that seed interval. 1445 func (s *L0Sublevels) baseCompactionUsingSeed( 1446 f *FileMetadata, intervalIndex int, minCompactionDepth int, 1447 ) *L0CompactionFiles { 1448 c := &L0CompactionFiles{ 1449 FilesIncluded: newBitSet(s.levelMetadata.Len()), 1450 seedInterval: intervalIndex, 1451 seedIntervalMinLevel: 0, 1452 minIntervalIndex: f.minIntervalIndex, 1453 maxIntervalIndex: f.maxIntervalIndex, 1454 } 1455 c.addFile(f) 1456 1457 // The first iteration of this loop builds the compaction at the seed file's 1458 // sublevel. Future iterations expand on this compaction by stacking 1459 // more files from intervalIndex and repeating. This is an 1460 // optional activity so when it fails we can fallback to the last 1461 // successful candidate. 1462 var lastCandidate *L0CompactionFiles 1463 interval := &s.orderedIntervals[intervalIndex] 1464 1465 for i := 0; i < len(interval.files); i++ { 1466 f2 := interval.files[i] 1467 sl := f2.SubLevel 1468 c.seedIntervalStackDepthReduction++ 1469 c.seedIntervalMaxLevel = sl 1470 c.addFile(f2) 1471 // The seed file is in the lowest sublevel in the seed interval, but it may 1472 // overlap with other files in even lower sublevels. For 1473 // correctness we need to grow our interval to include those files, and 1474 // capture all files in the next level that fall in this extended interval 1475 // and so on. This can result in a triangular shape like the following 1476 // where again the X axis is the key intervals and the Y axis 1477 // is oldest to youngest. Note that it is not necessary for 1478 // correctness to fill out the shape at the higher sub-levels 1479 // to make it more rectangular since the invariant only requires 1480 // that younger versions of a key not be moved to Lbase while 1481 // leaving behind older versions. 1482 // - 1483 // --- 1484 // ----- 1485 // It may be better for performance to have a more rectangular 1486 // shape since the files being left behind will overlap with the 1487 // same Lbase key range as that of this compaction. But there is 1488 // also the danger that in trying to construct a more rectangular 1489 // shape we will be forced to pull in a file that is already 1490 // compacting. We expect extendCandidateToRectangle to eventually be called 1491 // on this compaction if it's chosen, at which point we would iterate 1492 // backward and choose those files. This logic is similar to compaction.grow 1493 // for non-L0 compactions. 1494 done := false 1495 for currLevel := sl - 1; currLevel >= 0; currLevel-- { 1496 if !s.extendFiles(currLevel, math.MaxUint64, c) { 1497 // Failed to extend due to ongoing compaction. 1498 done = true 1499 break 1500 } 1501 } 1502 if done { 1503 break 1504 } 1505 // Observed some compactions using > 1GB from L0 in an import 1506 // experiment. Very long running compactions are not great as they 1507 // reduce concurrency while they run, and take a while to produce 1508 // results, though they're sometimes unavoidable. There is a tradeoff 1509 // here in that adding more depth is more efficient in reducing stack 1510 // depth, but long running compactions reduce flexibility in what can 1511 // run concurrently in L0 and even Lbase -> Lbase+1. An increase more 1512 // than 150% in bytes since the last candidate compaction (along with a 1513 // total compaction size in excess of 100mb), or a total compaction 1514 // size beyond a hard limit of 500mb, is criteria for rejecting this 1515 // candidate. This lets us prefer slow growths as we add files, while 1516 // still having a hard limit. Note that if this is the first compaction 1517 // candidate to reach a stack depth reduction of minCompactionDepth or 1518 // higher, this candidate will be chosen regardless. 1519 if lastCandidate == nil { 1520 lastCandidate = &L0CompactionFiles{} 1521 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && 1522 c.fileBytes > 100<<20 && 1523 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { 1524 break 1525 } 1526 *lastCandidate = *c 1527 } 1528 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { 1529 lastCandidate.FilesIncluded.clearAllBits() 1530 for _, f := range lastCandidate.Files { 1531 lastCandidate.FilesIncluded.markBit(f.L0Index) 1532 } 1533 return lastCandidate 1534 } 1535 return nil 1536 } 1537 1538 // Expands fields in the provided L0CompactionFiles instance (cFiles) to 1539 // include overlapping files in the specified sublevel. Returns true if the 1540 // compaction is possible (i.e. does not conflict with any base/intra-L0 1541 // compacting files). 1542 func (s *L0Sublevels) extendFiles( 1543 sl int, earliestUnflushedSeqNum uint64, cFiles *L0CompactionFiles, 1544 ) bool { 1545 index := sort.Search(len(s.levelFiles[sl]), func(i int) bool { 1546 return s.levelFiles[sl][i].maxIntervalIndex >= cFiles.minIntervalIndex 1547 }) 1548 for ; index < len(s.levelFiles[sl]); index++ { 1549 f := s.levelFiles[sl][index] 1550 if f.minIntervalIndex > cFiles.maxIntervalIndex { 1551 break 1552 } 1553 if f.IsCompacting() { 1554 return false 1555 } 1556 // Skip over files that are newer than earliestUnflushedSeqNum. This is 1557 // okay because this compaction can just pretend these files are not in 1558 // L0 yet. These files must be in higher sublevels than any overlapping 1559 // files with f.LargestSeqNum < earliestUnflushedSeqNum, and the output 1560 // of the compaction will also go in a lower (older) sublevel than this 1561 // file by definition. 1562 if f.LargestSeqNum >= earliestUnflushedSeqNum { 1563 continue 1564 } 1565 cFiles.addFile(f) 1566 } 1567 return true 1568 } 1569 1570 // PickIntraL0Compaction picks an intra-L0 compaction for files in this 1571 // sublevel. This method is only called when a base compaction cannot be chosen. 1572 // See comment above PickBaseCompaction for heuristics involved in this 1573 // selection. 1574 func (s *L0Sublevels) PickIntraL0Compaction( 1575 earliestUnflushedSeqNum uint64, minCompactionDepth int, 1576 ) (*L0CompactionFiles, error) { 1577 scoredIntervals := make([]intervalAndScore, len(s.orderedIntervals)) 1578 for i := range s.orderedIntervals { 1579 interval := &s.orderedIntervals[i] 1580 depth := len(interval.files) - interval.compactingFileCount 1581 if minCompactionDepth > depth { 1582 continue 1583 } 1584 scoredIntervals[i] = intervalAndScore{interval: i, score: depth} 1585 } 1586 sort.Sort(intervalSorterByDecreasingScore(scoredIntervals)) 1587 1588 // Optimization to avoid considering different intervals that 1589 // are likely to choose the same seed file. Again this is just 1590 // to reduce wasted work. 1591 consideredIntervals := newBitSet(len(s.orderedIntervals)) 1592 for _, scoredInterval := range scoredIntervals { 1593 interval := &s.orderedIntervals[scoredInterval.interval] 1594 if consideredIntervals[interval.index] { 1595 continue 1596 } 1597 1598 var f *FileMetadata 1599 // Pick the seed file for the interval as the file 1600 // in the highest sub-level. 1601 stackDepthReduction := scoredInterval.score 1602 for i := len(interval.files) - 1; i >= 0; i-- { 1603 f = interval.files[i] 1604 if f.IsCompacting() { 1605 break 1606 } 1607 consideredIntervals.markBits(f.minIntervalIndex, f.maxIntervalIndex+1) 1608 // Can this be the seed file? Files with newer sequence 1609 // numbers than earliestUnflushedSeqNum cannot be in 1610 // the compaction. 1611 if f.LargestSeqNum >= earliestUnflushedSeqNum { 1612 stackDepthReduction-- 1613 if stackDepthReduction == 0 { 1614 break 1615 } 1616 } else { 1617 break 1618 } 1619 } 1620 if stackDepthReduction < minCompactionDepth { 1621 // Can't use this interval. 1622 continue 1623 } 1624 1625 if f == nil { 1626 return nil, errors.New("no seed file found in sublevel intervals") 1627 } 1628 if f.IsCompacting() { 1629 // This file could be in a concurrent intra-L0 or base compaction. 1630 // Try another interval. 1631 continue 1632 } 1633 1634 // We have a seed file. Build a compaction off of that seed. 1635 c := s.intraL0CompactionUsingSeed( 1636 f, interval.index, earliestUnflushedSeqNum, minCompactionDepth) 1637 if c != nil { 1638 return c, nil 1639 } 1640 } 1641 return nil, nil 1642 } 1643 1644 func (s *L0Sublevels) intraL0CompactionUsingSeed( 1645 f *FileMetadata, intervalIndex int, earliestUnflushedSeqNum uint64, minCompactionDepth int, 1646 ) *L0CompactionFiles { 1647 // We know that all the files that overlap with intervalIndex have 1648 // LargestSeqNum < earliestUnflushedSeqNum, but for other intervals 1649 // we need to exclude files >= earliestUnflushedSeqNum 1650 1651 c := &L0CompactionFiles{ 1652 FilesIncluded: newBitSet(s.levelMetadata.Len()), 1653 seedInterval: intervalIndex, 1654 seedIntervalMaxLevel: len(s.levelFiles) - 1, 1655 minIntervalIndex: f.minIntervalIndex, 1656 maxIntervalIndex: f.maxIntervalIndex, 1657 isIntraL0: true, 1658 earliestUnflushedSeqNum: earliestUnflushedSeqNum, 1659 } 1660 c.addFile(f) 1661 1662 var lastCandidate *L0CompactionFiles 1663 interval := &s.orderedIntervals[intervalIndex] 1664 slIndex := len(interval.files) - 1 1665 for { 1666 if interval.files[slIndex] == f { 1667 break 1668 } 1669 slIndex-- 1670 } 1671 // The first iteration of this loop produces an intra-L0 compaction at the 1672 // seed level. Iterations after that optionally add to the compaction by 1673 // stacking more files from intervalIndex and repeating. This is an 1674 // optional activity so when it fails we can fallback to the last 1675 // successful candidate. The code stops adding when it can't add more, or 1676 // when fileBytes grows too large. 1677 for ; slIndex >= 0; slIndex-- { 1678 f2 := interval.files[slIndex] 1679 sl := f2.SubLevel 1680 if f2.IsCompacting() { 1681 break 1682 } 1683 c.seedIntervalStackDepthReduction++ 1684 c.seedIntervalMinLevel = sl 1685 c.addFile(f2) 1686 // The seed file captures all files in the higher level that fall in the 1687 // range of intervals. That may extend the range of intervals so for 1688 // correctness we need to capture all files in the next higher level that 1689 // fall in this extended interval and so on. This can result in an 1690 // inverted triangular shape like the following where again the X axis is the 1691 // key intervals and the Y axis is oldest to youngest. Note that it is not 1692 // necessary for correctness to fill out the shape at lower sub-levels to 1693 // make it more rectangular since the invariant only requires that if we 1694 // move an older seqnum for key k into a file that has a higher seqnum, we 1695 // also move all younger seqnums for that key k into that file. 1696 // ----- 1697 // --- 1698 // - 1699 // 1700 // It may be better for performance to have a more rectangular shape since 1701 // it will reduce the stack depth for more intervals. But there is also 1702 // the danger that in explicitly trying to construct a more rectangular 1703 // shape we will be forced to pull in a file that is already compacting. 1704 // We assume that the performance concern is not a practical issue. 1705 done := false 1706 for currLevel := sl + 1; currLevel < len(s.levelFiles); currLevel++ { 1707 if !s.extendFiles(currLevel, earliestUnflushedSeqNum, c) { 1708 // Failed to extend due to ongoing compaction. 1709 done = true 1710 break 1711 } 1712 } 1713 if done { 1714 break 1715 } 1716 if lastCandidate == nil { 1717 lastCandidate = &L0CompactionFiles{} 1718 } else if lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth && 1719 c.fileBytes > 100<<20 && 1720 (float64(c.fileBytes)/float64(lastCandidate.fileBytes) > 1.5 || c.fileBytes > 500<<20) { 1721 break 1722 } 1723 *lastCandidate = *c 1724 } 1725 if lastCandidate != nil && lastCandidate.seedIntervalStackDepthReduction >= minCompactionDepth { 1726 lastCandidate.FilesIncluded.clearAllBits() 1727 for _, f := range lastCandidate.Files { 1728 lastCandidate.FilesIncluded.markBit(f.L0Index) 1729 } 1730 s.extendCandidateToRectangle( 1731 lastCandidate.minIntervalIndex, lastCandidate.maxIntervalIndex, lastCandidate, false) 1732 return lastCandidate 1733 } 1734 return nil 1735 } 1736 1737 // ExtendL0ForBaseCompactionTo extends the specified base compaction candidate 1738 // L0CompactionFiles to optionally cover more files in L0 without "touching" 1739 // any of the passed-in keys (i.e. the smallest/largest bounds are exclusive), 1740 // as including any user keys for those internal keys 1741 // could require choosing more files in LBase which is undesirable. Unbounded 1742 // start/end keys are indicated by passing in the InvalidInternalKey. 1743 func (s *L0Sublevels) ExtendL0ForBaseCompactionTo( 1744 smallest, largest InternalKey, candidate *L0CompactionFiles, 1745 ) bool { 1746 firstIntervalIndex := 0 1747 lastIntervalIndex := len(s.orderedIntervals) - 1 1748 if smallest.Kind() != base.InternalKeyKindInvalid { 1749 if smallest.Trailer == base.InternalKeyRangeDeleteSentinel { 1750 // Starting at smallest.UserKey == interval.startKey is okay. 1751 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1752 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 1753 }) 1754 } else { 1755 firstIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1756 // Need to start at >= smallest since if we widen too much we may miss 1757 // an Lbase file that overlaps with an L0 file that will get picked in 1758 // this widening, which would be bad. This interval will not start with 1759 // an immediate successor key. 1760 return s.cmp(smallest.UserKey, s.orderedIntervals[i].startKey.key) < 0 1761 }) 1762 } 1763 } 1764 if largest.Kind() != base.InternalKeyKindInvalid { 1765 // First interval that starts at or beyond the largest. This interval will not 1766 // start with an immediate successor key. 1767 lastIntervalIndex = sort.Search(len(s.orderedIntervals), func(i int) bool { 1768 return s.cmp(largest.UserKey, s.orderedIntervals[i].startKey.key) <= 0 1769 }) 1770 // Right now, lastIntervalIndex has a startKey that extends beyond largest. 1771 // The previous interval, by definition, has an end key higher than largest. 1772 // Iterate back twice to get the last interval that's completely within 1773 // (smallest, largest). Except in the case where we went past the end of the 1774 // list; in that case, the last interval to include is the very last 1775 // interval in the list. 1776 if lastIntervalIndex < len(s.orderedIntervals) { 1777 lastIntervalIndex-- 1778 } 1779 lastIntervalIndex-- 1780 } 1781 if lastIntervalIndex < firstIntervalIndex { 1782 return false 1783 } 1784 return s.extendCandidateToRectangle(firstIntervalIndex, lastIntervalIndex, candidate, true) 1785 } 1786 1787 // Best-effort attempt to make the compaction include more files in the 1788 // rectangle defined by [minIntervalIndex, maxIntervalIndex] on the X axis and 1789 // bounded on the Y axis by seedIntervalMinLevel and seedIntervalMaxLevel. 1790 // 1791 // This is strictly an optional extension; at any point where we can't feasibly 1792 // add more files, the sublevel iteration can be halted early and candidate will 1793 // still be a correct compaction candidate. 1794 // 1795 // Consider this scenario (original candidate is inside the rectangle), with 1796 // isBase = true and interval bounds a-j (from the union of base file bounds and 1797 // that of compaction candidate): 1798 // 1799 // _______ 1800 // L0.3 a--d | g-j| 1801 // L0.2 | f--j| r-t 1802 // L0.1 b-d |e---j| 1803 // L0.0 a--d | f--j| l--o p-----x 1804 // 1805 // Lbase a--------i m---------w 1806 // 1807 // This method will iterate from the bottom up. At L0.0, it will add a--d since 1808 // it's in the bounds, then add b-d, then a--d, and so on, to produce this: 1809 // 1810 // _____________ 1811 // L0.3 |a--d g-j| 1812 // L0.2 | f--j| r-t 1813 // L0.1 | b-d e---j| 1814 // L0.0 |a--d f--j| l--o p-----x 1815 // 1816 // Lbase a-------i m---------w 1817 // 1818 // Let's assume that, instead of a--d in the top sublevel, we had 3 files, a-b, 1819 // bb-c, and cc-d, of which bb-c is compacting. Let's also add another sublevel 1820 // L0.4 with some files, all of which aren't compacting: 1821 // 1822 // L0.4 a------c ca--d _______ 1823 // L0.3 a-b bb-c cc-d | g-j| 1824 // L0.2 | f--j| r-t 1825 // L0.1 b----------d |e---j| 1826 // L0.0 a------------d | f--j| l--o p-----x 1827 // 1828 // Lbase a------------------i m---------w 1829 // 1830 // This method then needs to choose between the left side of L0.3 bb-c 1831 // (i.e. a-b), or the right side (i.e. cc-d and g-j) for inclusion in this 1832 // compaction. Since the right side has more files as well as one file that has 1833 // already been picked, it gets chosen at that sublevel, resulting in this 1834 // intermediate compaction: 1835 // 1836 // L0.4 a------c ca--d 1837 // ______________ 1838 // L0.3 a-b bb-c| cc-d g-j| 1839 // L0.2 _________| f--j| r-t 1840 // L0.1 | b----------d e---j| 1841 // L0.0 |a------------d f--j| l--o p-----x 1842 // 1843 // Lbase a------------------i m---------w 1844 // 1845 // Since bb-c had to be excluded at L0.3, the interval bounds for L0.4 are 1846 // actually ca-j, since ca is the next interval start key after the end interval 1847 // of bb-c. This would result in only ca-d being chosen at that sublevel, even 1848 // though a--c is also not compacting. This is the final result: 1849 // 1850 // ______________ 1851 // L0.4 a------c|ca--d | 1852 // L0.3 a-b bb-c| cc-d g-j| 1853 // L0.2 _________| f--j| r-t 1854 // L0.1 | b----------d e---j| 1855 // L0.0 |a------------d f--j| l--o p-----x 1856 // 1857 // Lbase a------------------i m---------w 1858 // 1859 // TODO(bilal): Add more targeted tests for this method, through 1860 // ExtendL0ForBaseCompactionTo and intraL0CompactionUsingSeed. 1861 func (s *L0Sublevels) extendCandidateToRectangle( 1862 minIntervalIndex int, maxIntervalIndex int, candidate *L0CompactionFiles, isBase bool, 1863 ) bool { 1864 candidate.preExtensionMinInterval = candidate.minIntervalIndex 1865 candidate.preExtensionMaxInterval = candidate.maxIntervalIndex 1866 // Extend {min,max}IntervalIndex to include all of the candidate's current 1867 // bounds. 1868 if minIntervalIndex > candidate.minIntervalIndex { 1869 minIntervalIndex = candidate.minIntervalIndex 1870 } 1871 if maxIntervalIndex < candidate.maxIntervalIndex { 1872 maxIntervalIndex = candidate.maxIntervalIndex 1873 } 1874 var startLevel, increment, endLevel int 1875 if isBase { 1876 startLevel = 0 1877 increment = +1 1878 // seedIntervalMaxLevel is inclusive, while endLevel is exclusive. 1879 endLevel = candidate.seedIntervalMaxLevel + 1 1880 } else { 1881 startLevel = len(s.levelFiles) - 1 1882 increment = -1 1883 // seedIntervalMinLevel is inclusive, while endLevel is exclusive. 1884 endLevel = candidate.seedIntervalMinLevel - 1 1885 } 1886 // Stats for files. 1887 addedCount := 0 1888 // Iterate from the oldest sub-level for L0 -> Lbase and youngest 1889 // sub-level for intra-L0. The idea here is that anything that can't 1890 // be included from that level constrains what can be included from 1891 // the next level. This change in constraint is directly incorporated 1892 // into minIntervalIndex, maxIntervalIndex. 1893 for sl := startLevel; sl != endLevel; sl += increment { 1894 files := s.levelFiles[sl] 1895 // Find the first file that overlaps with minIntervalIndex. 1896 index := sort.Search(len(files), func(i int) bool { 1897 return minIntervalIndex <= files[i].maxIntervalIndex 1898 }) 1899 // Track the files that are fully within the current constraint 1900 // of [minIntervalIndex, maxIntervalIndex]. 1901 firstIndex := -1 1902 lastIndex := -1 1903 for ; index < len(files); index++ { 1904 f := files[index] 1905 if f.minIntervalIndex > maxIntervalIndex { 1906 break 1907 } 1908 include := true 1909 // Extends out on the left so can't be included. This narrows 1910 // what we can included in the next level. 1911 if f.minIntervalIndex < minIntervalIndex { 1912 include = false 1913 minIntervalIndex = f.maxIntervalIndex + 1 1914 } 1915 // Extends out on the right so can't be included. 1916 if f.maxIntervalIndex > maxIntervalIndex { 1917 include = false 1918 maxIntervalIndex = f.minIntervalIndex - 1 1919 } 1920 if !include { 1921 continue 1922 } 1923 if firstIndex == -1 { 1924 firstIndex = index 1925 } 1926 lastIndex = index 1927 } 1928 if minIntervalIndex > maxIntervalIndex { 1929 // We excluded files that prevent continuation. 1930 break 1931 } 1932 if firstIndex < 0 { 1933 // No files to add in this sub-level. 1934 continue 1935 } 1936 // We have the files in [firstIndex, lastIndex] as potential for 1937 // inclusion. Some of these may already have been picked. Some 1938 // of them may be already compacting. The latter is tricky since 1939 // we have to decide whether to contract minIntervalIndex or 1940 // maxIntervalIndex when we encounter an already compacting file. 1941 // We pick the longest sequence between firstIndex 1942 // and lastIndex of non-compacting files -- this is represented by 1943 // [candidateNonCompactingFirst, candidateNonCompactingLast]. 1944 nonCompactingFirst := -1 1945 currentRunHasAlreadyPickedFiles := false 1946 candidateNonCompactingFirst := -1 1947 candidateNonCompactingLast := -1 1948 candidateHasAlreadyPickedFiles := false 1949 for index = firstIndex; index <= lastIndex; index++ { 1950 f := files[index] 1951 if f.IsCompacting() { 1952 if nonCompactingFirst != -1 { 1953 last := index - 1 1954 // Prioritize runs of consecutive non-compacting files that 1955 // have files that have already been picked. That is to say, 1956 // if candidateHasAlreadyPickedFiles == true, we stick with 1957 // it, and if currentRunHasAlreadyPickedfiles == true, we 1958 // pick that run even if it contains fewer files than the 1959 // previous candidate. 1960 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || 1961 currentRunHasAlreadyPickedFiles || 1962 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { 1963 candidateNonCompactingFirst = nonCompactingFirst 1964 candidateNonCompactingLast = last 1965 candidateHasAlreadyPickedFiles = currentRunHasAlreadyPickedFiles 1966 } 1967 } 1968 nonCompactingFirst = -1 1969 currentRunHasAlreadyPickedFiles = false 1970 continue 1971 } 1972 if nonCompactingFirst == -1 { 1973 nonCompactingFirst = index 1974 } 1975 if candidate.FilesIncluded[f.L0Index] { 1976 currentRunHasAlreadyPickedFiles = true 1977 } 1978 } 1979 // Logic duplicated from inside the for loop above. 1980 if nonCompactingFirst != -1 { 1981 last := index - 1 1982 if !candidateHasAlreadyPickedFiles && (candidateNonCompactingFirst == -1 || 1983 currentRunHasAlreadyPickedFiles || 1984 (last-nonCompactingFirst) > (candidateNonCompactingLast-candidateNonCompactingFirst)) { 1985 candidateNonCompactingFirst = nonCompactingFirst 1986 candidateNonCompactingLast = last 1987 } 1988 } 1989 if candidateNonCompactingFirst == -1 { 1990 // All files are compacting. There will be gaps that we could exploit 1991 // to continue, but don't bother. 1992 break 1993 } 1994 // May need to shrink [minIntervalIndex, maxIntervalIndex] for the next level. 1995 if candidateNonCompactingFirst > firstIndex { 1996 minIntervalIndex = files[candidateNonCompactingFirst-1].maxIntervalIndex + 1 1997 } 1998 if candidateNonCompactingLast < lastIndex { 1999 maxIntervalIndex = files[candidateNonCompactingLast+1].minIntervalIndex - 1 2000 } 2001 for index := candidateNonCompactingFirst; index <= candidateNonCompactingLast; index++ { 2002 f := files[index] 2003 if f.IsCompacting() { 2004 // TODO(bilal): Do a logger.Fatalf instead of a panic, for 2005 // cleaner unwinding and error messages. 2006 panic(fmt.Sprintf("expected %s to not be compacting", f.FileNum)) 2007 } 2008 if candidate.isIntraL0 && f.LargestSeqNum >= candidate.earliestUnflushedSeqNum { 2009 continue 2010 } 2011 if !candidate.FilesIncluded[f.L0Index] { 2012 addedCount++ 2013 candidate.addFile(f) 2014 } 2015 } 2016 } 2017 return addedCount > 0 2018 }