github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/table_stats.go

// Copyright 2020 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"fmt"
	"math"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/sstable"
)

// In-memory statistics about tables help inform compaction picking, but may
// be expensive to calculate or load from disk. Every time a database is
// opened, these statistics must be reloaded or recalculated. To minimize
// impact on user activity and compactions, we load these statistics
// asynchronously in the background and store loaded statistics in each
// table's *FileMetadata.
//
// This file implements the asynchronous loading of statistics by maintaining
// a list of files that require statistics, alongside their LSM levels.
// Whenever new files are added to the LSM, the files are appended to
// d.mu.tableStats.pending. If a stats collection job is not currently
// running, one is started in a separate goroutine.
//
// The stats collection job grabs and clears the pending list, computes table
// statistics relative to the current readState and updates the tables' file
// metadata. New pending files may accumulate during a stats collection job,
// so a completing job triggers a new job if necessary. Only one job runs at a
// time.
//
// When an existing database is opened, all files lack in-memory statistics.
// These files' stats are loaded incrementally whenever the pending list is
// empty by scanning a current readState for files missing statistics. Once a
// job completes a scan without finding any remaining files without
// statistics, it flips a `loadedInitial` flag. From then on, the stats
// collection job only needs to load statistics for new files appended to the
// pending list.

func (d *DB) maybeCollectTableStatsLocked() {
	if d.shouldCollectTableStatsLocked() {
		go d.collectTableStats()
	}
}

// updateTableStatsLocked is called when new files are introduced, after the
// read state has been updated. It may trigger a new stat collection.
// DB.mu must be locked when calling.
func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
	var needStats bool
	for _, nf := range newFiles {
		if !nf.Meta.StatsValidLocked() {
			needStats = true
			break
		}
	}
	if !needStats {
		return
	}

	d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
	d.maybeCollectTableStatsLocked()
}

func (d *DB) shouldCollectTableStatsLocked() bool {
	return !d.mu.tableStats.loading &&
		d.closed.Load() == nil &&
		!d.opts.private.disableTableStats &&
		(len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
}

// collectTableStats runs a table stats collection job, returning true if the
// invocation did the collection work, false otherwise (e.g. if another job was
// already running).
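// It is always launched from maybeCollectTableStatsLocked in its own
// goroutine; the d.mu.tableStats.loading flag ensures that at most one
// collection job runs at a time.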
func (d *DB) collectTableStats() bool {
	const maxTableStatsPerScan = 50

	d.mu.Lock()
	if !d.shouldCollectTableStatsLocked() {
		d.mu.Unlock()
		return false
	}

	pending := d.mu.tableStats.pending
	d.mu.tableStats.pending = nil
	d.mu.tableStats.loading = true
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	loadedInitial := d.mu.tableStats.loadedInitial
	// Drop DB.mu before performing IO.
	d.mu.Unlock()

	// Every run of collectTableStats either collects stats from the pending
	// list (if non-empty) or from scanning the version (loadedInitial is
	// false). This job only runs if at least one of those conditions holds.

	// Grab a read state to scan for tables.
	rs := d.loadReadState()
	var collected []collectedStats
	var hints []deleteCompactionHint
	if len(pending) > 0 {
		collected, hints = d.loadNewFileStats(rs, pending)
	} else {
		var moreRemain bool
		var buf [maxTableStatsPerScan]collectedStats
		collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
		loadedInitial = !moreRemain
	}
	rs.unref()

	// Update the FileMetadata with the loaded stats while holding d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.tableStats.loading = false
	if loadedInitial && !d.mu.tableStats.loadedInitial {
		d.mu.tableStats.loadedInitial = loadedInitial
		d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
			JobID: jobID,
		})
	}

	maybeCompact := false
	for _, c := range collected {
		c.fileMetadata.Stats = c.TableStats
		maybeCompact = maybeCompact || c.TableStats.RangeDeletionsBytesEstimate > 0
		c.fileMetadata.StatsMarkValid()
	}
	d.mu.tableStats.cond.Broadcast()
	d.maybeCollectTableStatsLocked()
	if len(hints) > 0 {
		// Verify that all of the hint tombstones' files still exist in the
		// current version. Otherwise, the tombstone itself may have been
		// compacted into L6 and more recent keys may have had their sequence
		// numbers zeroed.
		//
		// Note that it's possible that the tombstone file is being compacted
		// presently. In that case, the file will be present in v. When the
		// compaction finishes compacting the tombstone file, it will detect
		// and clear the hint.
		//
		// See DB.maybeUpdateDeleteCompactionHints.
		v := d.mu.versions.currentVersion()
		keepHints := hints[:0]
		for _, h := range hints {
			if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
				keepHints = append(keepHints, h)
			}
		}
		d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
	}
	if maybeCompact {
		d.maybeScheduleCompaction()
	}
	return true
}

type collectedStats struct {
	*fileMetadata
	manifest.TableStats
}

func (d *DB) loadNewFileStats(
	rs *readState, pending []manifest.NewFileEntry,
) ([]collectedStats, []deleteCompactionHint) {
	var hints []deleteCompactionHint
	collected := make([]collectedStats, 0, len(pending))
	for _, nf := range pending {
		// A file's stats might have been populated by an earlier call to
		// loadNewFileStats if the file was moved.
		// NB: We're not holding d.mu which protects f.Stats, but only
		// collectTableStats updates f.Stats for active files, and we
		// ensure only one goroutine runs it at a time through
		// d.mu.tableStats.loading.
		if nf.Meta.StatsValidLocked() {
			continue
		}

		// The file isn't guaranteed to still be live in the readState's
		// version. It may have been deleted or moved. Skip it if it's not in
		// the expected level.
		if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
			continue
		}

		stats, newHints, err := d.loadTableStats(rs.current, nf.Level, nf.Meta)
		if err != nil {
			d.opts.EventListener.BackgroundError(err)
			continue
		}
		// NB: We don't update the FileMetadata yet, because we aren't
		// holding DB.mu. We'll copy it to the FileMetadata after we're
		// finished with IO.
		collected = append(collected, collectedStats{
			fileMetadata: nf.Meta,
			TableStats:   stats,
		})
		hints = append(hints, newHints...)
	}
	return collected, hints
}

// scanReadStateTableStats is run by an active stat collection job when there
// are no pending new files, but there might be files that existed at Open for
// which we haven't loaded table stats.
func (d *DB) scanReadStateTableStats(
	rs *readState, fill []collectedStats,
) ([]collectedStats, []deleteCompactionHint, bool) {
	moreRemain := false
	var hints []deleteCompactionHint
	for l, levelMetadata := range rs.current.Levels {
		iter := levelMetadata.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			// NB: We're not holding d.mu which protects f.Stats, but only the
			// active stats collection job updates f.Stats for active files,
			// and we ensure only one goroutine runs it at a time through
			// d.mu.tableStats.loading. This makes it safe to read validity
			// through f.Stats.ValidLocked despite not holding d.mu.
			if f.StatsValidLocked() {
				continue
			}

			// Limit how much work we do per read state. The older the read
			// state is, the higher the likelihood files are no longer being
			// used in the current version. If we've exhausted our allowance,
			// return true for the last return value to signal there's more
			// work to do.
			if len(fill) == cap(fill) {
				moreRemain = true
				return fill, hints, moreRemain
			}

			stats, newHints, err := d.loadTableStats(rs.current, l, f)
			if err != nil {
				// Set `moreRemain` so we'll try again.
				moreRemain = true
				d.opts.EventListener.BackgroundError(err)
				continue
			}
			fill = append(fill, collectedStats{
				fileMetadata: f,
				TableStats:   stats,
			})
			hints = append(hints, newHints...)
		}
	}
	return fill, hints, moreRemain
}

func (d *DB) loadTableStats(
	v *version, level int, meta *fileMetadata,
) (manifest.TableStats, []deleteCompactionHint, error) {
	var stats manifest.TableStats
	var compactionHints []deleteCompactionHint
	err := d.tableCache.withReader(meta, func(r *sstable.Reader) (err error) {
		stats.NumEntries = r.Properties.NumEntries
		stats.NumDeletions = r.Properties.NumDeletions
		if r.Properties.NumPointDeletions() > 0 {
			if err = d.loadTablePointKeyStats(r, v, level, meta, &stats); err != nil {
				return
			}
		}
		if r.Properties.NumRangeDeletions > 0 || r.Properties.NumRangeKeyDels > 0 {
			if compactionHints, err = d.loadTableRangeDelStats(r, v, level, meta, &stats); err != nil {
				return
			}
		}
		// TODO(travers): Once we have real-world data, consider collecting
		// additional stats that may provide improved heuristics for compaction
		// picking.
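		// NumRangeKeySets is copied from the table properties unconditionally;
		// it is later aggregated across the LSM by countRangeKeySetFragments
		// via the rangeKeySetsAnnotator.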
		stats.NumRangeKeySets = r.Properties.NumRangeKeySets
		return
	})
	if err != nil {
		return stats, nil, err
	}
	return stats, compactionHints, nil
}

// loadTablePointKeyStats calculates the point key statistics for the given
// table. The provided manifest.TableStats are updated.
func (d *DB) loadTablePointKeyStats(
	r *sstable.Reader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) error {
	// TODO(jackson): If the file has a wide keyspace, the average
	// value size beneath the entire file might not be representative
	// of the size of the keys beneath the point tombstones.
	// We could write the ranges of 'clusters' of point tombstones to
	// an sstable property and call averageValueSizeBeneath for each of
	// these narrower ranges to improve the estimate.
	avgKeySize, avgValSize, err := d.averageEntrySizeBeneath(v, level, meta)
	if err != nil {
		return err
	}
	stats.PointDeletionsBytesEstimate =
		pointDeletionsBytesEstimate(&r.Properties, avgKeySize, avgValSize)
	return nil
}

// loadTableRangeDelStats calculates the range deletion and range key deletion
// statistics for the given table.
func (d *DB) loadTableRangeDelStats(
	r *sstable.Reader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) ([]deleteCompactionHint, error) {
	iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
	if err != nil {
		return nil, err
	}
	defer iter.Close()
	var compactionHints []deleteCompactionHint
	// We iterate over the defragmented range tombstones and range key deletions,
	// which ensures we don't double count ranges deleted at different sequence
	// numbers. Also, merging abutting tombstones reduces the number of calls to
	// estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
	// our overall estimate.
	for s := iter.First(); s != nil; s = iter.Next() {
		start, end := s.Start, s.End
		// We only need to consider deletion size estimates for tables that contain
		// point keys.
		var hasPoints bool
		for _, k := range s.Keys {
			if k.Kind() == base.InternalKeyKindRangeDelete {
				hasPoints = true
				break
			}
		}

		// If the file is in the last level of the LSM, there is no data beneath
		// it. The fact that there is still a range tombstone in a bottommost file
		// suggests that an open snapshot kept the tombstone around. Estimate disk
		// usage within the file itself.
		// NOTE: If the span `s` wholly contains a table containing range keys,
		// the returned size estimate will be slightly inflated by the range key
		// block. However, in practice, range keys are expected to be rare, and
		// the size of the range key block relative to the overall size of the
		// table is expected to be small.
		if hasPoints && level == numLevels-1 {
			size, err := r.EstimateDiskUsage(start, end)
			if err != nil {
				return nil, err
			}
			stats.RangeDeletionsBytesEstimate += size

			// As the file is in the bottommost level, there is no need to collect a
			// deletion hint.
			continue
		}

		// While the size estimates for point keys should only be updated if this
		// span contains a range del, the sequence numbers are required for the
		// hint. Unconditionally descend, but conditionally update the estimates.
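		// compactionHintFromKeys classifies the span by the kinds of keys it
		// contains (point deletions only, range key deletions only, or a mix).
		// estimateReclaimedSizeBeneath uses this classification to decide
		// whether an overlapping file contributes to the size estimate, the
		// hint sequence number, or both.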
		hintType := compactionHintFromKeys(s.Keys)
		estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
		if err != nil {
			return nil, err
		}
		stats.RangeDeletionsBytesEstimate += estimate

		// If any files were completely contained within the range,
		// hintSeqNum is the smallest sequence number contained in any
		// such file.
		if hintSeqNum == math.MaxUint64 {
			continue
		}
		hint := deleteCompactionHint{
			hintType:                hintType,
			start:                   make([]byte, len(start)),
			end:                     make([]byte, len(end)),
			tombstoneFile:           meta,
			tombstoneLevel:          level,
			tombstoneLargestSeqNum:  s.LargestSeqNum(),
			tombstoneSmallestSeqNum: s.SmallestSeqNum(),
			fileSmallestSeqNum:      hintSeqNum,
		}
		copy(hint.start, start)
		copy(hint.end, end)
		compactionHints = append(compactionHints, hint)
	}
	return compactionHints, err
}

func (d *DB) averageEntrySizeBeneath(
	v *version, level int, meta *fileMetadata,
) (avgKeySize, avgValueSize uint64, err error) {
	// Find all files in lower levels that overlap with meta,
	// summing their value sizes and entry counts.
	var fileSum, keySum, valSum, entryCount uint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			err := d.tableCache.withReader(file, func(r *sstable.Reader) (err error) {
				fileSum += file.Size
				entryCount += r.Properties.NumEntries
				keySum += r.Properties.RawKeySize
				valSum += r.Properties.RawValueSize
				return nil
			})
			if err != nil {
				return 0, 0, err
			}
		}
	}
	if entryCount == 0 {
		return 0, 0, nil
	}
	// RawKeySize and RawValueSize are uncompressed totals. Scale them
	// according to the data size to account for compression, index blocks and
	// metadata overhead. Eg:
	//
	//      Compression rate      ×  Average uncompressed key size
	//
	//                 ↓
	//
	//           FileSize              RawKeySize
	//   -----------------------  ×  ----------
	//   RawKeySize+RawValueSize      NumEntries
	//
	// We refactor the calculation to avoid error from rounding/truncation.
	totalSizePerEntry := fileSum / entryCount
	uncompressedSum := keySum + valSum
	avgKeySize = keySum * totalSizePerEntry / uncompressedSum
	avgValueSize = valSum * totalSizePerEntry / uncompressedSum
	return avgKeySize, avgValueSize, err
}

func (d *DB) estimateReclaimedSizeBeneath(
	v *version, level int, start, end []byte, hintType deleteCompactionHintType,
) (estimate uint64, hintSeqNum uint64, err error) {
	// Find all files in lower levels that overlap with the deleted range
	// [start, end).
	//
	// An overlapping file might be completely contained by the range
	// tombstone, in which case we can count the entire file size in
	// our estimate without doing any additional I/O.
	//
	// Otherwise, estimating the range for the file requires
	// additional I/O to read the file's index blocks.
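	// hintSeqNum is initialized to math.MaxUint64 as a sentinel. It is lowered
	// to the smallest sequence number of any file wholly contained within
	// [start, end) whose key kinds allow it (see updateHints below). If it is
	// still math.MaxUint64 on return, the caller does not record a deletion
	// hint for this span.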
	hintSeqNum = math.MaxUint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			startCmp := d.cmp(start, file.Smallest.UserKey)
			endCmp := d.cmp(file.Largest.UserKey, end)
			if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
				// The range fully contains the file, so skip looking it up in table
				// cache/looking at its indexes and add the full file size. Whether the
				// disk estimate and hint seqnums are updated depends on a) the type of
				// hint that requested the estimate and b) the keys contained in this
				// current file.
				var updateEstimates, updateHints bool
				switch hintType {
				case deleteCompactionHintTypePointKeyOnly:
					// The range deletion byte estimates should only be updated if this
					// table contains point keys. This ends up being an overestimate in
					// the case that the table also has range keys, but such keys are
					// expected to contribute a negligible amount of the table's overall
					// size, relative to point keys.
					if file.HasPointKeys {
						updateEstimates = true
					}
					// As the initiating span contained only range dels, hints can only be
					// updated if this table does _not_ contain range keys.
					if !file.HasRangeKeys {
						updateHints = true
					}
				case deleteCompactionHintTypeRangeKeyOnly:
					// The initiating span contained only range key dels. The estimates
					// apply only to point keys, and are therefore not updated.
					updateEstimates = false
					// As the initiating span contained only range key dels, hints can
					// only be updated if this table does _not_ contain point keys.
					if !file.HasPointKeys {
						updateHints = true
					}
				case deleteCompactionHintTypePointAndRangeKey:
					// Always update the estimates and hints, as this hint type can drop a
					// file, irrespective of the mixture of keys. Similar to above, the
					// range del bytes estimate is an overestimate.
					updateEstimates, updateHints = true, true
				default:
					panic(fmt.Sprintf("bitalostable: unknown hint type %s", hintType))
				}
				if updateEstimates {
					estimate += file.Size
				}
				if updateHints && hintSeqNum > file.SmallestSeqNum {
					hintSeqNum = file.SmallestSeqNum
				}
			} else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
				// Partial overlap.
				if hintType == deleteCompactionHintTypeRangeKeyOnly {
					// If the hint that generated this overlap contains only range keys,
					// there is no need to calculate disk usage, as the reclaimable space
					// is expected to be minimal relative to point keys.
					continue
				}
				var size uint64
				err := d.tableCache.withReader(file, func(r *sstable.Reader) (err error) {
					size, err = r.EstimateDiskUsage(start, end)
					return err
				})
				if err != nil {
					return 0, hintSeqNum, err
				}
				estimate += size
			}
		}
	}
	return estimate, hintSeqNum, nil
}

func maybeSetStatsFromProperties(meta *fileMetadata, props *sstable.Properties) bool {
	// If a table contains range deletions or range key deletions, we defer the
	// stats collection. There are two main reasons for this:
	//
	// 1. Estimating the potential for reclaimed space due to a range deletion
	//    tombstone requires scanning the LSM - a potentially expensive operation
	//    that should be deferred.
	// 2. Range deletions and / or range key deletions present an opportunity to
	//    compute "deletion hints", which also requires a scan of the LSM to
	//    compute tables that would be eligible for deletion.
	//
	// These two tasks are deferred to the table stats collector goroutine.
	if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
		return false
	}

	// If a table is more than 10% point deletions, don't calculate the
	// PointDeletionsBytesEstimate statistic using our limited knowledge. The
	// table stats collector can populate the stats and calculate an average
	// value size across all the tables beneath the table in the LSM, which
	// will be more accurate.
	if props.NumDeletions > props.NumEntries/10 {
		return false
	}

	var pointEstimate uint64
	if props.NumEntries > 0 {
		// Use the file's own average key and value sizes as an estimate. This
		// doesn't require any additional IO and since the number of point
		// deletions in the file is low, the error introduced by this crude
		// estimate is expected to be small.
		avgKeySize, avgValSize := estimateEntrySizes(meta.Size, props)
		pointEstimate = pointDeletionsBytesEstimate(props, avgKeySize, avgValSize)
	}

	meta.Stats.NumEntries = props.NumEntries
	meta.Stats.NumDeletions = props.NumDeletions
	meta.Stats.NumRangeKeySets = props.NumRangeKeySets
	meta.Stats.PointDeletionsBytesEstimate = pointEstimate
	meta.Stats.RangeDeletionsBytesEstimate = 0
	meta.StatsMarkValid()
	return true
}

func pointDeletionsBytesEstimate(props *sstable.Properties, avgKeySize, avgValSize uint64) uint64 {
	if props.NumEntries == 0 {
		return 0
	}
	// Estimate the potential space to reclaim using the table's own
	// properties. There may or may not be keys covered by any individual
	// point tombstone. If not, compacting the point tombstone into L6 will at
	// least allow us to drop the point deletion key and will reclaim the key
	// bytes. If there are covered key(s), we also get to drop key and value
	// bytes for each covered key.
	//
	// We estimate assuming that each point tombstone on average covers 1 key.
	// This is almost certainly an overestimate, but that's probably okay
	// because point tombstones can slow range iterations even when they don't
	// cover a key. It may be beneficial in the future to more accurately
	// estimate which tombstones cover keys and which do not.
	numPointDels := props.NumPointDeletions()
	return numPointDels*avgKeySize + numPointDels*(avgKeySize+avgValSize)
}

func estimateEntrySizes(
	fileSize uint64, props *sstable.Properties,
) (avgKeySize, avgValSize uint64) {
	// RawKeySize and RawValueSize are uncompressed totals. Scale them
	// according to the data size to account for compression, index blocks and
	// metadata overhead. Eg:
	//
	//      Compression rate      ×  Average uncompressed key size
	//
	//                 ↓
	//
	//           FileSize              RawKeySize
	//   -----------------------  ×  ----------
	//   RawKeySize+RawValueSize      NumEntries
	//
	// We refactor the calculation to avoid error from rounding/truncation.
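	//
	// For example, with purely illustrative numbers: if fileSize is 1<<20,
	// RawKeySize is 2<<20, RawValueSize is 6<<20 and NumEntries is 100000,
	// then fileSizePerEntry = 10 and the scaled averages below come out to
	// avgKeySize = 2 and avgValSize = 7 bytes after integer truncation.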
	fileSizePerEntry := fileSize / props.NumEntries
	uncompressedSum := props.RawKeySize + props.RawValueSize
	avgKeySize = props.RawKeySize * fileSizePerEntry / uncompressedSum
	avgValSize = props.RawValueSize * fileSizePerEntry / uncompressedSum
	return avgKeySize, avgValSize
}

// newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
// returns "ranged deletion" spans for a single table, providing a combined view
// of both range deletion and range key deletion spans. The
// tableRangedDeletionIter is intended for use in the specific case of computing
// the statistics and deleteCompactionHints for a single table.
//
// As an example, consider the following set of spans from the range deletion
// and range key blocks of a table:
//
//	      |---------|     |---------|         |-------| RANGEKEYDELs
//	|-----------|-------------|           |-----|       RANGEDELs
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// The tableRangedDeletionIter produces the following set of output spans, where
// '1' indicates a span containing only range deletions, '2' is a span
// containing only range key deletions, and '3' is a span containing a mixture
// of both range deletions and range key deletions.
//
//	   1       3       1    3    2          1  3   2
//	|-----|---------|-----|---|-----|     |---|-|-----|
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Algorithm.
//
// The iterator first defragments the range deletion and range key blocks
// separately. During this defragmentation, the range key block is also filtered
// so that keys other than range key deletes are ignored. The range delete and
// range key delete keyspaces are then merged.
//
// Note that the only fragmentation introduced by merging is from where a range
// del span overlaps with a range key del span. Within the bounds of any overlap
// there is guaranteed to be no further fragmentation, as the constituent spans
// have already been defragmented. To the left and right of any overlap, the
// same reasoning applies. For example,
//
//	         |--------|         |-------| RANGEKEYDEL
//	|---------------------------|         RANGEDEL
//	|----1---|----3---|----1----|---2---| Merged, fragmented spans.
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Any fragmented abutting spans produced by the merging iter will be of
// differing types (i.e. a transition from a span with homogenous key kinds to a
// heterogeneous span, or a transition from a span with exclusively range dels
// to a span with exclusively range key dels). Therefore, further
// defragmentation is not required.
//
// Each span returned by the tableRangedDeletionIter will have at most four keys,
// corresponding to the largest and smallest sequence numbers encountered across
// the range deletes and range key deletes that comprised the merged spans.
func newCombinedDeletionKeyspanIter(
	comparer *base.Comparer, r *sstable.Reader, m *fileMetadata,
) (keyspan.FragmentIterator, error) {
	// The range del iter and range key iter are each wrapped in their own
	// defragmenting iter. For each iter, abutting spans can always be merged.
	var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
	// Reduce keys by maintaining a slice of at most length two, corresponding to
	// the largest and smallest keys in the defragmented span. This maintains the
	// contract that the emitted slice is sorted by (SeqNum, Kind) descending.
	reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
		if len(current) == 0 && len(incoming) == 0 {
			// While this should never occur in practice, a defensive return is used
			// here to preserve correctness.
			return current
		}
		var largest, smallest keyspan.Key
		var set bool
		for _, keys := range [2][]keyspan.Key{current, incoming} {
			if len(keys) == 0 {
				continue
			}
			first, last := keys[0], keys[len(keys)-1]
			if !set {
				largest, smallest = first, last
				set = true
				continue
			}
			if first.Trailer > largest.Trailer {
				largest = first
			}
			if last.Trailer < smallest.Trailer {
				smallest = last
			}
		}
		if largest.Equal(comparer.Equal, smallest) {
			current = append(current[:0], largest)
		} else {
			current = append(current[:0], largest, smallest)
		}
		return current
	}

	// The separate iters for the range dels and range keys are wrapped in a
	// merging iter to join the keyspaces into a single keyspace. The separate
	// iters are only added if the particular key kind is present.
	mIter := &keyspan.MergingIter{}
	var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
		if in.KeysOrder != keyspan.ByTrailerDesc {
			panic("bitalostable: combined deletion iter encountered keys in non-trailer descending order")
		}
		out.Start, out.End = in.Start, in.End
		out.Keys = append(out.Keys[:0], in.Keys...)
		out.KeysOrder = keyspan.ByTrailerDesc
		// NB: The order of by-trailer descending may have been violated,
		// because we've layered rangekey and rangedel iterators from the same
		// sstable into the same keyspan.MergingIter. The MergingIter will
		// return the keys in the order that the child iterators were provided.
		// Sort the keys to ensure they're sorted by trailer descending.
		keyspan.SortKeysByTrailer(&out.Keys)
		return nil
	})
	mIter.Init(comparer.Compare, transform)

	iter, err := r.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer)
		iter = dIter
		// Truncate tombstones to the containing file's bounds if necessary.
		// See docs/range_deletions.md for why this is necessary.
		iter = keyspan.Truncate(
			comparer.Compare, iter, m.Smallest.UserKey, m.Largest.UserKey, nil, nil,
		)
		mIter.AddLevel(iter)
	}

	iter, err = r.NewRawRangeKeyIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		// Wrap the range key iterator in a filter that elides keys other than range
		// key deletions.
		iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
			out.Start, out.End = in.Start, in.End
			out.Keys = out.Keys[:0]
			for _, k := range in.Keys {
				if k.Kind() != base.InternalKeyKindRangeKeyDelete {
					continue
				}
				out.Keys = append(out.Keys, k)
			}
			return len(out.Keys) > 0
		})
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer)
		iter = dIter
		mIter.AddLevel(iter)
	}

	return mIter, nil
}

// rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of range key fragments. Its annotation type
// is a *uint64. The count of range key sets may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a file's
// stats have been loaded.
type rangeKeySetsAnnotator struct{}

var _ manifest.Annotator = rangeKeySetsAnnotator{}

func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a rangeKeySetsAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumRangeKeySets
	return vptr, f.StatsValidLocked()
}

func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
// files of the LSM. It only counts keys in files for which table stats have
// been loaded. It uses a b-tree annotator to cache intermediate values between
// calculations when possible.
func countRangeKeySetFragments(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.RangeKeyLevels[l].Empty() {
			continue
		}
		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
	}
	return count
}