github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/table_stats.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"math"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/sstable"
)

// In-memory statistics about tables help inform compaction picking, but may
// be expensive to calculate or load from disk. Every time a database is
// opened, these statistics must be reloaded or recalculated. To minimize
// impact on user activity and compactions, we load these statistics
// asynchronously in the background and store loaded statistics in each
// table's *FileMetadata.
//
// This file implements the asynchronous loading of statistics by maintaining
// a list of files that require statistics, alongside their LSM levels.
// Whenever new files are added to the LSM, the files are appended to
// d.mu.tableStats.pending. If a stats collection job is not currently
// running, one is started in a separate goroutine.
//
// The stats collection job grabs and clears the pending list, computes table
// statistics relative to the current readState and updates the tables' file
// metadata. New pending files may accumulate during a stats collection job,
// so a completing job triggers a new job if necessary. Only one job runs at a
// time.
//
// When an existing database is opened, all files lack in-memory statistics.
// These files' stats are loaded incrementally whenever the pending list is
// empty by scanning a current readState for files missing statistics. Once a
// job completes a scan without finding any remaining files without
// statistics, it flips a `loadedInitial` flag. From then on, the stats
// collection job only needs to load statistics for new files appended to the
// pending list.

func (d *DB) maybeCollectTableStatsLocked() {
	if d.shouldCollectTableStatsLocked() {
		go d.collectTableStats()
	}
}

// updateTableStatsLocked is called when new files are introduced, after the
// read state has been updated. It may trigger a new stat collection.
// DB.mu must be locked when calling.
func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
	var needStats bool
	for _, nf := range newFiles {
		if !nf.Meta.StatsValid() {
			needStats = true
			break
		}
	}
	if !needStats {
		return
	}

	d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
	d.maybeCollectTableStatsLocked()
}

func (d *DB) shouldCollectTableStatsLocked() bool {
	return !d.mu.tableStats.loading &&
		d.closed.Load() == nil &&
		!d.opts.private.disableTableStats &&
		(len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
}

// collectTableStats runs a table stats collection job, returning true if the
// invocation did the collection work, false otherwise (e.g. if another job was
// already running).
func (d *DB) collectTableStats() bool {
	const maxTableStatsPerScan = 50

	d.mu.Lock()
	if !d.shouldCollectTableStatsLocked() {
		d.mu.Unlock()
		return false
	}

	pending := d.mu.tableStats.pending
	d.mu.tableStats.pending = nil
	d.mu.tableStats.loading = true
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	loadedInitial := d.mu.tableStats.loadedInitial
	// Drop DB.mu before performing IO.
	d.mu.Unlock()

	// Every run of collectTableStats either collects stats from the pending
	// list (if non-empty) or from scanning the version (loadedInitial is
	// false). This job only runs if at least one of those conditions holds.

	// Grab a read state to scan for tables.
	rs := d.loadReadState()
	var collected []collectedStats
	var hints []deleteCompactionHint
	if len(pending) > 0 {
		collected, hints = d.loadNewFileStats(rs, pending)
	} else {
		var moreRemain bool
		var buf [maxTableStatsPerScan]collectedStats
		collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
		loadedInitial = !moreRemain
	}
	rs.unref()

	// Update the FileMetadata with the loaded stats while holding d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.tableStats.loading = false
	if loadedInitial && !d.mu.tableStats.loadedInitial {
		d.mu.tableStats.loadedInitial = loadedInitial
		d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
			JobID: jobID,
		})
	}

	maybeCompact := false
	for _, c := range collected {
		c.fileMetadata.Stats = c.TableStats
		maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
		c.fileMetadata.StatsMarkValid()
	}
	d.mu.tableStats.cond.Broadcast()
	d.maybeCollectTableStatsLocked()
	if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
		// Verify that all of the hint tombstones' files still exist in the
		// current version. Otherwise, the tombstone itself may have been
		// compacted into L6 and more recent keys may have had their sequence
		// numbers zeroed.
		//
		// Note that it's possible that the tombstone file is being compacted
		// presently. In that case, the file will be present in v. When the
		// compaction finishes compacting the tombstone file, it will detect
		// and clear the hint.
		//
		// See DB.maybeUpdateDeleteCompactionHints.
		v := d.mu.versions.currentVersion()
		keepHints := hints[:0]
		for _, h := range hints {
			if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
				keepHints = append(keepHints, h)
			}
		}
		d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
	}
	if maybeCompact {
		d.maybeScheduleCompaction()
	}
	return true
}

type collectedStats struct {
	*fileMetadata
	manifest.TableStats
}

func (d *DB) loadNewFileStats(
	rs *readState, pending []manifest.NewFileEntry,
) ([]collectedStats, []deleteCompactionHint) {
	var hints []deleteCompactionHint
	collected := make([]collectedStats, 0, len(pending))
	for _, nf := range pending {
		// A file's stats might have been populated by an earlier call to
		// loadNewFileStats if the file was moved.
		// NB: We're not holding d.mu which protects f.Stats, but only
		// collectTableStats updates f.Stats for active files, and we
		// ensure only one goroutine runs it at a time through
		// d.mu.tableStats.loading.
		if nf.Meta.StatsValid() {
			continue
		}

		// The file isn't guaranteed to still be live in the readState's
		// version. It may have been deleted or moved. Skip it if it's not in
		// the expected level.
		if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
			continue
		}

		stats, newHints, err := d.loadTableStats(
			rs.current, nf.Level,
			nf.Meta,
		)
		if err != nil {
			d.opts.EventListener.BackgroundError(err)
			continue
		}
		// NB: We don't update the FileMetadata yet, because we aren't
		// holding DB.mu. We'll copy it to the FileMetadata after we're
		// finished with IO.
		collected = append(collected, collectedStats{
			fileMetadata: nf.Meta,
			TableStats:   stats,
		})
		hints = append(hints, newHints...)
	}
	return collected, hints
}

// scanReadStateTableStats is run by an active stat collection job when there
// are no pending new files, but there might be files that existed at Open for
// which we haven't loaded table stats.
func (d *DB) scanReadStateTableStats(
	rs *readState, fill []collectedStats,
) ([]collectedStats, []deleteCompactionHint, bool) {
	moreRemain := false
	var hints []deleteCompactionHint
	sizesChecked := make(map[base.DiskFileNum]struct{})
	for l, levelMetadata := range rs.current.Levels {
		iter := levelMetadata.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			// NB: We're not holding d.mu which protects f.Stats, but only the
			// active stats collection job updates f.Stats for active files,
			// and we ensure only one goroutine runs it at a time through
			// d.mu.tableStats.loading. This makes it safe to read validity
			// through f.Stats.ValidLocked despite not holding d.mu.
			if f.StatsValid() {
				continue
			}

			// Limit how much work we do per read state. The older the read
			// state is, the higher the likelihood files are no longer being
			// used in the current version. If we've exhausted our allowance,
			// return true for the last return value to signal there's more
			// work to do.
			if len(fill) == cap(fill) {
				moreRemain = true
				return fill, hints, moreRemain
			}

			// If the file is remote and neither SharedForeign nor External, we
			// should check that its size matches. This is because
			// checkConsistency skips over remote files.
			//
			// SharedForeign and External files are skipped as their sizes are
			// allowed to have a mismatch; the size stored in the FileBacking is
			// just the part of the file that is referenced by this Pebble
			// instance, not the size of the whole object.
			objMeta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
			if err != nil {
				// Set `moreRemain` so we'll try again.
				moreRemain = true
				d.opts.EventListener.BackgroundError(err)
				continue
			}

			shouldCheckSize := objMeta.IsRemote() &&
				!d.objProvider.IsSharedForeign(objMeta) &&
				!objMeta.IsExternal()
			if _, ok := sizesChecked[f.FileBacking.DiskFileNum]; !ok && shouldCheckSize {
				size, err := d.objProvider.Size(objMeta)
				fileSize := f.FileBacking.Size
				if err != nil {
					moreRemain = true
					d.opts.EventListener.BackgroundError(err)
					continue
				}
				if size != int64(fileSize) {
					err := errors.Errorf(
						"during consistency check in loadTableStats: L%d: %s: object size mismatch (%s): %d (provider) != %d (MANIFEST)",
						errors.Safe(l), f.FileNum, d.objProvider.Path(objMeta),
						errors.Safe(size), errors.Safe(fileSize))
					d.opts.EventListener.BackgroundError(err)
					d.opts.Logger.Fatalf("%s", err)
				}

				sizesChecked[f.FileBacking.DiskFileNum] = struct{}{}
			}

			stats, newHints, err := d.loadTableStats(
				rs.current, l, f,
			)
			if err != nil {
				// Set `moreRemain` so we'll try again.
				moreRemain = true
				d.opts.EventListener.BackgroundError(err)
				continue
			}
			fill = append(fill, collectedStats{
				fileMetadata: f,
				TableStats:   stats,
			})
			hints = append(hints, newHints...)
		}
	}
	return fill, hints, moreRemain
}

func (d *DB) loadTableStats(
	v *version, level int, meta *fileMetadata,
) (manifest.TableStats, []deleteCompactionHint, error) {
	var stats manifest.TableStats
	var compactionHints []deleteCompactionHint
	err := d.tableCache.withCommonReader(
		meta, func(r sstable.CommonReader) (err error) {
			props := r.CommonProperties()
			stats.NumEntries = props.NumEntries
			stats.NumDeletions = props.NumDeletions
			if props.NumPointDeletions() > 0 {
				if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
					return
				}
			}
			if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
				if compactionHints, err = d.loadTableRangeDelStats(
					r, v, level, meta, &stats,
				); err != nil {
					return
				}
			}
			// TODO(travers): Once we have real-world data, consider collecting
			// additional stats that may provide improved heuristics for compaction
			// picking.
			stats.NumRangeKeySets = props.NumRangeKeySets
			stats.ValueBlocksSize = props.ValueBlocksSize
			return
		})
	if err != nil {
		return stats, nil, err
	}
	return stats, compactionHints, nil
}

// loadTablePointKeyStats calculates the point key statistics for the given
// table. The provided manifest.TableStats are updated.
func (d *DB) loadTablePointKeyStats(
	props *sstable.CommonProperties,
	v *version,
	level int,
	meta *fileMetadata,
	stats *manifest.TableStats,
) error {
	// TODO(jackson): If the file has a wide keyspace, the average
	// value size beneath the entire file might not be representative
	// of the size of the keys beneath the point tombstones.
	// We could write the ranges of 'clusters' of point tombstones to
	// a sstable property and call averageValueSizeBeneath for each of
	// these narrower ranges to improve the estimate.
	avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
	if err != nil {
		return err
	}
	stats.PointDeletionsBytesEstimate =
		pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
	return nil
}

// loadTableRangeDelStats calculates the range deletion and range key deletion
// statistics for the given table.
func (d *DB) loadTableRangeDelStats(
	r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) ([]deleteCompactionHint, error) {
	iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
	if err != nil {
		return nil, err
	}
	defer iter.Close()
	var compactionHints []deleteCompactionHint
	// We iterate over the defragmented range tombstones and range key deletions,
	// which ensures we don't double count ranges deleted at different sequence
	// numbers. Also, merging abutting tombstones reduces the number of calls to
	// estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
	// our overall estimate.
	for s := iter.First(); s != nil; s = iter.Next() {
		start, end := s.Start, s.End
		// We only need to consider deletion size estimates for tables that contain
		// RANGEDELs.
		var maxRangeDeleteSeqNum uint64
		for _, k := range s.Keys {
			if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
				maxRangeDeleteSeqNum = k.SeqNum()
				break
			}
		}

		// If the file is in the last level of the LSM, there is no data beneath
		// it. The fact that there is still a range tombstone in a bottommost file
		// indicates two possibilities:
		// 1. an open snapshot kept the tombstone around, and the data the
		//    tombstone deletes is contained within the file itself.
		// 2. the file was ingested.
		// In the first case, we'd like to estimate disk usage within the file
		// itself since compacting the file will drop that covered data. In the
		// second case, we expect that compacting the file will NOT drop any
		// data and rewriting the file is a waste of write bandwidth. We can
		// distinguish these cases by looking at the file metadata's sequence
		// numbers. A file's range deletions can only delete data within the
		// file at lower sequence numbers. All keys in an ingested sstable adopt
		// the same sequence number, preventing tombstones from deleting keys
		// within the same file. We check here if the largest RANGEDEL sequence
		// number is greater than the file's smallest sequence number. If it is,
		// the RANGEDEL could conceivably (although inconclusively) delete data
		// within the same file.
		//
		// Note that this heuristic is imperfect. If a table containing a range
		// deletion is ingested into L5 and subsequently compacted into L6 but
		// an open snapshot prevents elision of covered keys in L6, the
		// resulting RangeDeletionsBytesEstimate will incorrectly include all
		// covered keys.
		//
		// TODO(jackson): We could prevent the above error in the heuristic by
		// computing the file's RangeDeletionsBytesEstimate during the
		// compaction itself. It's unclear how common this is.
		//
		// NOTE: If the span `s` wholly contains a table containing range keys,
		// the returned size estimate will be slightly inflated by the range key
		// block. However, in practice, range keys are expected to be rare, and
		// the size of the range key block relative to the overall size of the
		// table is expected to be small.
		if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
			size, err := r.EstimateDiskUsage(start, end)
			if err != nil {
				return nil, err
			}
			stats.RangeDeletionsBytesEstimate += size

			// As the file is in the bottommost level, there is no need to collect a
			// deletion hint.
			continue
		}

		// While the size estimates for point keys should only be updated if this
		// span contains a range del, the sequence numbers are required for the
		// hint. Unconditionally descend, but conditionally update the estimates.
		hintType := compactionHintFromKeys(s.Keys)
		estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
		if err != nil {
			return nil, err
		}
		stats.RangeDeletionsBytesEstimate += estimate

		// If any files were completely contained within the range,
		// hintSeqNum is the smallest sequence number contained in any
		// such file.
		if hintSeqNum == math.MaxUint64 {
			continue
		}
		hint := deleteCompactionHint{
			hintType:                hintType,
			start:                   make([]byte, len(start)),
			end:                     make([]byte, len(end)),
			tombstoneFile:           meta,
			tombstoneLevel:          level,
			tombstoneLargestSeqNum:  s.LargestSeqNum(),
			tombstoneSmallestSeqNum: s.SmallestSeqNum(),
			fileSmallestSeqNum:      hintSeqNum,
		}
		copy(hint.start, start)
		copy(hint.end, end)
		compactionHints = append(compactionHints, hint)
	}
	return compactionHints, err
}

func (d *DB) estimateSizesBeneath(
	v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
) (avgValueLogicalSize, compressionRatio float64, err error) {
	// Find all files in lower levels that overlap with meta,
	// summing their value sizes and entry counts.
	file := meta
	var fileSum, keySum, valSum, entryCount uint64
	// Include the file itself. This is important because in some instances, the
	// computed compression ratio is applied to the tombstones contained within
	// `meta` itself. If there are no files beneath `meta` in the LSM, we would
	// calculate a compression ratio of 0 which is not accurate for the file's
	// own tombstones.
	fileSum += file.Size
	entryCount += fileProps.NumEntries
	keySum += fileProps.RawKeySize
	valSum += fileProps.RawValueSize

	addPhysicalTableStats := func(r *sstable.Reader) (err error) {
		fileSum += file.Size
		entryCount += r.Properties.NumEntries
		keySum += r.Properties.RawKeySize
		valSum += r.Properties.RawValueSize
		return nil
	}
	addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
		fileSum += file.Size
		entryCount += file.Stats.NumEntries
		keySum += v.Properties.RawKeySize
		valSum += v.Properties.RawValueSize
		return nil
	}

	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		iter := overlaps.Iter()
		for file = iter.First(); file != nil; file = iter.Next() {
			var err error
			if file.Virtual {
				err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
			} else {
				err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
			}
			if err != nil {
				return 0, 0, err
			}
		}
	}
	if entryCount == 0 {
		return 0, 0, nil
	}
	// RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
	// the value sum according to the data size to account for compression,
	// index blocks and metadata overhead. Eg:
	//
	//	Compression rate        ×  Average uncompressed value size
	//
	//	                        ↓
	//
	//	     FileSize              RawValueSize
	//	-----------------------  × ------------
	//	RawKeySize+RawValueSize     NumEntries
	//
	// We return the average logical value size and the compression ratio,
	// leaving the scaling to the caller. This allows the caller to perform
	// additional compression ratio scaling if necessary.
	uncompressedSum := float64(keySum + valSum)
	compressionRatio = float64(fileSum) / uncompressedSum
	avgValueLogicalSize = (float64(valSum) / float64(entryCount))
	return avgValueLogicalSize, compressionRatio, nil
}
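
// exampleSizeRatios is a hypothetical, illustrative sketch (it is not part of
// Pebble and is not called anywhere) that mirrors the arithmetic at the end of
// estimateSizesBeneath using plain integers: the compression ratio is the
// physical file bytes divided by the uncompressed key+value bytes, and the
// average logical value size is the uncompressed value bytes divided by the
// entry count. For example, 10 MB of physical files holding 100,000 entries
// with 8 MB of raw keys and 32 MB of raw values yields a compression ratio of
// 0.25 and an average logical value size of 320 bytes.
func exampleSizeRatios(
	fileSum, keySum, valSum, entryCount uint64,
) (avgValLogicalSize, compressionRatio float64) {
	if entryCount == 0 {
		// Mirrors the guard in estimateSizesBeneath: no entries, no estimate.
		return 0, 0
	}
	compressionRatio = float64(fileSum) / float64(keySum+valSum)
	avgValLogicalSize = float64(valSum) / float64(entryCount)
	return avgValLogicalSize, compressionRatio
}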

func (d *DB) estimateReclaimedSizeBeneath(
	v *version, level int, start, end []byte, hintType deleteCompactionHintType,
) (estimate uint64, hintSeqNum uint64, err error) {
	// Find all files in lower levels that overlap with the deleted range
	// [start, end).
	//
	// An overlapping file might be completely contained by the range
	// tombstone, in which case we can count the entire file size in
	// our estimate without doing any additional I/O.
	//
	// Otherwise, estimating the range for the file requires
	// additional I/O to read the file's index blocks.
	hintSeqNum = math.MaxUint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			startCmp := d.cmp(start, file.Smallest.UserKey)
			endCmp := d.cmp(file.Largest.UserKey, end)
			if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
				// The range fully contains the file, so skip looking it up in table
				// cache/looking at its indexes and add the full file size. Whether the
				// disk estimate and hint seqnums are updated depends on a) the type of
				// hint that requested the estimate and b) the keys contained in this
				// current file.
				var updateEstimates, updateHints bool
				switch hintType {
				case deleteCompactionHintTypePointKeyOnly:
					// The range deletion byte estimates should only be updated if this
					// table contains point keys. This ends up being an overestimate in
					// the case that the table also has range keys, but such keys are
					// expected to contribute a negligible amount of the table's overall
					// size, relative to point keys.
					if file.HasPointKeys {
						updateEstimates = true
					}
					// As the initiating span contained only range dels, hints can only be
					// updated if this table does _not_ contain range keys.
					if !file.HasRangeKeys {
						updateHints = true
					}
				case deleteCompactionHintTypeRangeKeyOnly:
					// The initiating span contained only range key dels. The estimates
					// apply only to point keys, and are therefore not updated.
					updateEstimates = false
					// As the initiating span contained only range key dels, hints can
					// only be updated if this table does _not_ contain point keys.
					if !file.HasPointKeys {
						updateHints = true
					}
				case deleteCompactionHintTypePointAndRangeKey:
					// Always update the estimates and hints, as this hint type can drop a
					// file, irrespective of the mixture of keys. Similar to above, the
					// range del bytes estimate is an overestimate.
					updateEstimates, updateHints = true, true
				default:
					panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
				}
				if updateEstimates {
					estimate += file.Size
				}
				if updateHints && hintSeqNum > file.SmallestSeqNum {
					hintSeqNum = file.SmallestSeqNum
				}
			} else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
				// Partial overlap.
				if hintType == deleteCompactionHintTypeRangeKeyOnly {
					// If the hint that generated this overlap contains only range keys,
					// there is no need to calculate disk usage, as the reclaimable space
					// is expected to be minimal relative to point keys.
					continue
				}
				var size uint64
				var err error
				if file.Virtual {
					err = d.tableCache.withVirtualReader(
						file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
							size, err = r.EstimateDiskUsage(start, end)
							return err
						})
				} else {
					err = d.tableCache.withReader(
						file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
							size, err = r.EstimateDiskUsage(start, end)
							return err
						})
				}

				if err != nil {
					return 0, hintSeqNum, err
				}
				estimate += size
			}
		}
	}
	return estimate, hintSeqNum, nil
}

func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
	// If a table contains range deletions or range key deletions, we defer the
	// stats collection. There are two main reasons for this:
	//
	// 1. Estimating the potential for reclaimed space due to a range deletion
	//    tombstone requires scanning the LSM - a potentially expensive operation
	//    that should be deferred.
	// 2. Range deletions and / or range key deletions present an opportunity to
	//    compute "deletion hints", which also requires a scan of the LSM to
	//    compute tables that would be eligible for deletion.
	//
	// These two tasks are deferred to the table stats collector goroutine.
	if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
		return false
	}

	// If a table is more than 10% point deletions without user-provided size
	// estimates, don't calculate the PointDeletionsBytesEstimate statistic
	// using our limited knowledge. The table stats collector can populate the
	// stats and calculate an average value size across all the tables beneath
	// the table in the LSM, which will be more accurate.
	if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
		return false
	}

	var pointEstimate uint64
	if props.NumEntries > 0 {
		// Use the file's own average key and value sizes as an estimate. This
		// doesn't require any additional IO and since the number of point
		// deletions in the file is low, the error introduced by this crude
		// estimate is expected to be small.
		commonProps := &props.CommonProperties
		avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
		pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
	}

	meta.Stats.NumEntries = props.NumEntries
	meta.Stats.NumDeletions = props.NumDeletions
	meta.Stats.NumRangeKeySets = props.NumRangeKeySets
	meta.Stats.PointDeletionsBytesEstimate = pointEstimate
	meta.Stats.RangeDeletionsBytesEstimate = 0
	meta.Stats.ValueBlocksSize = props.ValueBlocksSize
	meta.StatsMarkValid()
	return true
}

func pointDeletionsBytesEstimate(
	fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
) (estimate uint64) {
	if props.NumEntries == 0 {
		return 0
	}
	numPointDels := props.NumPointDeletions()
	if numPointDels == 0 {
		return 0
	}
	// Estimate the potential space to reclaim using the table's own properties.
	// There may or may not be keys covered by any individual point tombstone.
	// If not, compacting the point tombstone into L6 will at least allow us to
	// drop the point deletion key and will reclaim the tombstone's key bytes.
	// If there are covered key(s), we also get to drop key and value bytes for
	// each covered key.
	//
	// Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
	// uncompressed size of entries that will be elided by fully compacting the
	// tombstone. For these tombstones, there's no guesswork: we use the
	// RawPointTombstoneValueSize property, which is the sum of all these
	// tombstones' encoded values.
	//
	// For un-sized point tombstones (DELs), we estimate assuming that each
	// point tombstone on average covers 1 key and using average value sizes.
	// This is almost certainly an overestimate, but that's probably okay
	// because point tombstones can slow range iterations even when they don't
	// cover a key.
	//
	// TODO(jackson): This logic doesn't directly incorporate fixed per-key
	// overhead (8-byte trailer, plus at least 1 byte encoding the length of the
	// key and 1 byte encoding the length of the value). This overhead is
	// indirectly incorporated through the compression ratios, but that results
	// in the overhead being smeared per key-byte and value-byte, rather than
	// per-entry. This per-key fixed overhead can be nontrivial, especially for
	// dense swaths of point tombstones. Give some thought as to whether we
	// should directly include fixed per-key overhead in the calculations.

	// Below, we calculate the tombstone contributions and the shadowed keys'
	// contributions separately.
	var tombstonesLogicalSize float64
	var shadowedLogicalSize float64

	// 1. Calculate the contribution of the tombstone keys themselves.
	if props.RawPointTombstoneKeySize > 0 {
		tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
	} else {
		// This sstable predates the existence of the RawPointTombstoneKeySize
		// property. We can use the average key size within the file itself and
		// the count of point deletions to estimate the size.
		tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
	}

	// 2. Calculate the contribution of the keys shadowed by tombstones.
	//
	// 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
	// tombstones encode the size of both the key and value of the shadowed KV
	// entries. These sizes are aggregated into a sstable property.
	shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)

	// 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
	// keys.
	numUnsizedDels := numPointDels - props.NumSizedDeletions
	{
		// The shadowed keys have the same exact user keys as the tombstones
		// themselves, so we can use the `tombstonesLogicalSize` we computed
		// earlier as an estimate. There's a complication that
		// `tombstonesLogicalSize` may include DELSIZED keys we already
		// accounted for.
		shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)

		// Calculate the contribution of the deleted values. The caller has
		// already computed an average logical size (possibly computed across
		// many sstables).
		shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
	}

	// Scale both tombstone and shadowed totals by logical:physical ratios to
	// account for compression, metadata overhead, etc.
	//
	//	 Physical             FileSize
	//	----------  =  -----------------------
	//	 Logical       RawKeySize+RawValueSize
	//
	return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
}

func estimatePhysicalSizes(
	fileSize uint64, props *sstable.CommonProperties,
) (avgValLogicalSize, compressionRatio float64) {
	// RawKeySize and RawValueSize are uncompressed totals. Scale according to
	// the data size to account for compression, index blocks and metadata
	// overhead. Eg:
	//
	//	Compression rate        ×  Average uncompressed value size
	//
	//	                        ↓
	//
	//	     FileSize              RawValSize
	//	-----------------------  × ----------
	//	RawKeySize+RawValueSize    NumEntries
	//
	uncompressedSum := props.RawKeySize + props.RawValueSize
	compressionRatio = float64(fileSize) / float64(uncompressedSum)
	avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
	return avgValLogicalSize, compressionRatio
}
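
// exampleUnsizedPointDelEstimate is a hypothetical, illustrative sketch (not
// part of Pebble and not called anywhere) of the DEL-only branch of
// pointDeletionsBytesEstimate above, for a file with no DELSIZED tombstones
// and no RawPointTombstoneKeySize property. Each un-sized tombstone is assumed
// to shadow one key of the same size and one value of average logical size;
// the logical total is then scaled by the compression ratio to approximate the
// physical bytes reclaimed by compacting the tombstones to the bottom of the
// LSM.
func exampleUnsizedPointDelEstimate(
	numPointDels, rawKeySize, numEntries uint64, avgValLogicalSize, compressionRatio float64,
) uint64 {
	if numEntries == 0 || numPointDels == 0 {
		return 0
	}
	// Logical bytes of the tombstone keys themselves, estimated from the
	// file's average key size.
	tombstonesLogicalSize := float64(numPointDels * rawKeySize / numEntries)
	// Logical bytes of the shadowed keys (same user keys as the tombstones)
	// and of the shadowed values (one average-sized value per tombstone).
	shadowedLogicalSize := tombstonesLogicalSize + float64(numPointDels)*avgValLogicalSize
	return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
}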

// newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
// returns "ranged deletion" spans for a single table, providing a combined view
// of both range deletion and range key deletion spans. The
// tableRangedDeletionIter is intended for use in the specific case of computing
// the statistics and deleteCompactionHints for a single table.
//
// As an example, consider the following set of spans from the range deletion
// and range key blocks of a table:
//
//	      |---------|     |---------|         |-------| RANGEKEYDELs
//	|-----------|-------------|          |-----|       RANGEDELs
//	__________________________________________________________
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// The tableRangedDeletionIter produces the following set of output spans, where
// '1' indicates a span containing only range deletions, '2' is a span
// containing only range key deletions, and '3' is a span containing a mixture
// of both range deletions and range key deletions.
//
//	   1       3       1    3    2          1  3   2
//	|-----|---------|-----|---|-----|     |---|-|-----|
//	__________________________________________________________
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Algorithm.
//
// The iterator first defragments the range deletion and range key blocks
// separately. During this defragmentation, the range key block is also filtered
// so that keys other than range key deletes are ignored. The range delete and
// range key delete keyspaces are then merged.
//
// Note that the only fragmentation introduced by merging is from where a range
// del span overlaps with a range key del span. Within the bounds of any overlap
// there is guaranteed to be no further fragmentation, as the constituent spans
// have already been defragmented. To the left and right of any overlap, the
// same reasoning applies. For example,
//
//	         |--------|         |-------| RANGEKEYDEL
//	|---------------------------|         RANGEDEL
//	|----1---|----3---|----1----|---2---| Merged, fragmented spans.
//	__________________________________________________________
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Any fragmented abutting spans produced by the merging iter will be of
// differing types (i.e. a transition from a span with homogeneous key kinds to
// a heterogeneous span, or a transition from a span with exclusively range dels
// to a span with exclusively range key dels). Therefore, further
// defragmentation is not required.
//
// Each span returned by the tableRangeDeletionIter will have at most four keys,
// corresponding to the largest and smallest sequence numbers encountered across
// the range deletes and range key deletes that comprised the merged spans.
func newCombinedDeletionKeyspanIter(
	comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
) (keyspan.FragmentIterator, error) {
	// The range del iter and range key iter are each wrapped in their own
	// defragmenting iter. For each iter, abutting spans can always be merged.
	var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
	// Reduce keys by maintaining a slice of at most length two, corresponding to
	// the largest and smallest keys in the defragmented span. This maintains the
	// contract that the emitted slice is sorted by (SeqNum, Kind) descending.
	reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
		if len(current) == 0 && len(incoming) == 0 {
			// While this should never occur in practice, a defensive return is used
			// here to preserve correctness.
			return current
		}
		var largest, smallest keyspan.Key
		var set bool
		for _, keys := range [2][]keyspan.Key{current, incoming} {
			if len(keys) == 0 {
				continue
			}
			first, last := keys[0], keys[len(keys)-1]
			if !set {
				largest, smallest = first, last
				set = true
				continue
			}
			if first.Trailer > largest.Trailer {
				largest = first
			}
			if last.Trailer < smallest.Trailer {
				smallest = last
			}
		}
		if largest.Equal(comparer.Equal, smallest) {
			current = append(current[:0], largest)
		} else {
			current = append(current[:0], largest, smallest)
		}
		return current
	}

	// The separate iters for the range dels and range keys are wrapped in a
	// merging iter to join the keyspaces into a single keyspace. The separate
	// iters are only added if the particular key kind is present.
	mIter := &keyspan.MergingIter{}
	var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
		if in.KeysOrder != keyspan.ByTrailerDesc {
			panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
		}
		out.Start, out.End = in.Start, in.End
		out.Keys = append(out.Keys[:0], in.Keys...)
		out.KeysOrder = keyspan.ByTrailerDesc
		// NB: The order of by-trailer descending may have been violated,
		// because we've layered rangekey and rangedel iterators from the same
		// sstable into the same keyspan.MergingIter. The MergingIter will
		// return the keys in the order that the child iterators were provided.
		// Sort the keys to ensure they're sorted by trailer descending.
		keyspan.SortKeysByTrailer(&out.Keys)
		return nil
	})
	mIter.Init(comparer.Compare, transform, new(keyspan.MergingBuffers))

	iter, err := cr.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
		iter = dIter
		// Truncate tombstones to the containing file's bounds if necessary.
		// See docs/range_deletions.md for why this is necessary.
		iter = keyspan.Truncate(
			comparer.Compare, iter, m.Smallest.UserKey, m.Largest.UserKey,
			nil, nil, false, /* panicOnUpperTruncate */
		)
		mIter.AddLevel(iter)
	}

	iter, err = cr.NewRawRangeKeyIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		// Wrap the range key iterator in a filter that elides keys other than range
		// key deletions.
		iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
			out.Start, out.End = in.Start, in.End
			out.Keys = out.Keys[:0]
			for _, k := range in.Keys {
				if k.Kind() != base.InternalKeyKindRangeKeyDelete {
					continue
				}
				out.Keys = append(out.Keys, k)
			}
			return len(out.Keys) > 0
		}, comparer.Compare)
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
		iter = dIter
		mIter.AddLevel(iter)
	}

	return mIter, nil
}

// rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of range key fragments. Its annotation type
// is a *uint64. The count of range key sets may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a file's
// stats have been loaded.
type rangeKeySetsAnnotator struct{}

var _ manifest.Annotator = rangeKeySetsAnnotator{}

func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a rangeKeySetsAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumRangeKeySets
	return vptr, f.StatsValid()
}

func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
// files of the LSM. It only counts keys in files for which table stats have
// been loaded. It uses a b-tree annotator to cache intermediate values between
// calculations when possible.
func countRangeKeySetFragments(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.RangeKeyLevels[l].Empty() {
			continue
		}
		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
	}
	return count
}

// tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
// keys). Its annotation type is a *uint64. The count of tombstones may change
// once a table's stats are loaded asynchronously, so its values are marked as
// cacheable only if a file's stats have been loaded.
type tombstonesAnnotator struct{}

var _ manifest.Annotator = tombstonesAnnotator{}

func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a tombstonesAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumDeletions
	return vptr, f.StatsValid()
}

func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
// internal keys across all files of the LSM. It only counts keys in files for
// which table stats have been loaded. It uses a b-tree annotator to cache
// intermediate values between calculations when possible.
func countTombstones(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.Levels[l].Empty() {
			continue
		}
		count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
	}
	return count
}

// valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
// nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
// type is a *uint64. The value block size may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a
// file's stats have been loaded.
type valueBlocksSizeAnnotator struct{}

var _ manifest.Annotator = valueBlocksSizeAnnotator{}

func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a valueBlocksSizeAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.ValueBlocksSize
	return vptr, f.StatsValid()
}

func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
// files for a level of the LSM. It only includes the size for files for which
// table stats have been loaded. It uses a b-tree annotator to cache
// intermediate values between calculations when possible. It must not be
// called concurrently.
//
// REQUIRES: 0 <= level < numLevels.
func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
	if v.Levels[level].Empty() {
		return 0
	}
	return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
}
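
// exampleTotalValueBlocksSize is a hypothetical helper (not part of Pebble's
// API and not called anywhere) showing how the per-level accessor above can be
// combined to obtain an LSM-wide total of Properties.ValueBlocksSize, counting
// only files whose stats have already been loaded.
func exampleTotalValueBlocksSize(v *version) (total uint64) {
	for l := 0; l < numLevels; l++ {
		total += valueBlocksSizeForLevel(v, l)
	}
	return total
}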