github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/table_stats.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"math"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/sstable"
)

// In-memory statistics about tables help inform compaction picking, but may
// be expensive to calculate or load from disk. Every time a database is
// opened, these statistics must be reloaded or recalculated. To minimize
// impact on user activity and compactions, we load these statistics
// asynchronously in the background and store loaded statistics in each
// table's *FileMetadata.
//
// This file implements the asynchronous loading of statistics by maintaining
// a list of files that require statistics, alongside their LSM levels.
// Whenever new files are added to the LSM, the files are appended to
// d.mu.tableStats.pending. If a stats collection job is not currently
// running, one is started in a separate goroutine.
//
// The stats collection job grabs and clears the pending list, computes table
// statistics relative to the current readState and updates the tables' file
// metadata. New pending files may accumulate during a stats collection job,
// so a completing job triggers a new job if necessary. Only one job runs at a
// time.
//
// When an existing database is opened, all files lack in-memory statistics.
// These files' stats are loaded incrementally whenever the pending list is
// empty by scanning a current readState for files missing statistics. Once a
// job completes a scan without finding any remaining files without
// statistics, it flips a `loadedInitial` flag. From then on, the stats
// collection job only needs to load statistics for new files appended to the
// pending list.

func (d *DB) maybeCollectTableStatsLocked() {
	if d.shouldCollectTableStatsLocked() {
		go d.collectTableStats()
	}
}

// updateTableStatsLocked is called when new files are introduced, after the
// read state has been updated. It may trigger a new stat collection.
// DB.mu must be locked when calling.
func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
	var needStats bool
	for _, nf := range newFiles {
		if !nf.Meta.StatsValid() {
			needStats = true
			break
		}
	}
	if !needStats {
		return
	}

	d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
	d.maybeCollectTableStatsLocked()
}

func (d *DB) shouldCollectTableStatsLocked() bool {
	return !d.mu.tableStats.loading &&
		d.closed.Load() == nil &&
		!d.opts.private.disableTableStats &&
		(len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
}

// collectTableStats runs a table stats collection job, returning true if the
// invocation did the collection work, false otherwise (e.g. if another job was
// already running).
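//
// In outline, the job follows a single-flight pattern: grab and clear the
// pending list and set the loading flag under d.mu, perform the IO without the
// mutex, then publish results and re-trigger if more work accumulated. The
// sketch below is a simplified restatement of the body that follows (error
// handling, the initial scan and deletion hints are elided):
//
//	d.mu.Lock()
//	pending := d.mu.tableStats.pending
//	d.mu.tableStats.pending = nil
//	d.mu.tableStats.loading = true // at most one job at a time
//	d.mu.Unlock()
//
//	// ... compute stats against a readState, without holding d.mu ...
//
//	d.mu.Lock()
//	d.mu.tableStats.loading = false
//	// Copy collected stats into the files' metadata.
//	d.maybeCollectTableStatsLocked() // restart if new files arrived meanwhile
//	d.mu.Unlock()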
func (d *DB) collectTableStats() bool {
	const maxTableStatsPerScan = 50

	d.mu.Lock()
	if !d.shouldCollectTableStatsLocked() {
		d.mu.Unlock()
		return false
	}

	pending := d.mu.tableStats.pending
	d.mu.tableStats.pending = nil
	d.mu.tableStats.loading = true
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	loadedInitial := d.mu.tableStats.loadedInitial
	// Drop DB.mu before performing IO.
	d.mu.Unlock()

	// Every run of collectTableStats either collects stats from the pending
	// list (if non-empty) or from scanning the version (loadedInitial is
	// false). This job only runs if at least one of those conditions holds.

	// Grab a read state to scan for tables.
	rs := d.loadReadState()
	var collected []collectedStats
	var hints []deleteCompactionHint
	if len(pending) > 0 {
		collected, hints = d.loadNewFileStats(rs, pending)
	} else {
		var moreRemain bool
		var buf [maxTableStatsPerScan]collectedStats
		collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
		loadedInitial = !moreRemain
	}
	rs.unref()

	// Update the FileMetadata with the loaded stats while holding d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.tableStats.loading = false
	if loadedInitial && !d.mu.tableStats.loadedInitial {
		d.mu.tableStats.loadedInitial = loadedInitial
		d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
			JobID: jobID,
		})
	}

	maybeCompact := false
	for _, c := range collected {
		c.fileMetadata.Stats = c.TableStats
		maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
		c.fileMetadata.StatsMarkValid()
	}
	d.mu.tableStats.cond.Broadcast()
	d.maybeCollectTableStatsLocked()
	if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
		// Verify that all of the hint tombstones' files still exist in the
		// current version. Otherwise, the tombstone itself may have been
		// compacted into L6 and more recent keys may have had their sequence
		// numbers zeroed.
		//
		// Note that it's possible that the tombstone file is being compacted
		// presently. In that case, the file will be present in v. When the
		// compaction finishes compacting the tombstone file, it will detect
		// and clear the hint.
		//
		// See DB.maybeUpdateDeleteCompactionHints.
		v := d.mu.versions.currentVersion()
		keepHints := hints[:0]
		for _, h := range hints {
			if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
				keepHints = append(keepHints, h)
			}
		}
		d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
	}
	if maybeCompact {
		d.maybeScheduleCompaction()
	}
	return true
}

type collectedStats struct {
	*fileMetadata
	manifest.TableStats
}

func (d *DB) loadNewFileStats(
	rs *readState, pending []manifest.NewFileEntry,
) ([]collectedStats, []deleteCompactionHint) {
	var hints []deleteCompactionHint
	collected := make([]collectedStats, 0, len(pending))
	for _, nf := range pending {
		// A file's stats might have been populated by an earlier call to
		// loadNewFileStats if the file was moved.
		// NB: We're not holding d.mu which protects f.Stats, but only
		// collectTableStats updates f.Stats for active files, and we
		// ensure only one goroutine runs it at a time through
		// d.mu.tableStats.loading.
		if nf.Meta.StatsValid() {
			continue
		}

		// The file isn't guaranteed to still be live in the readState's
		// version. It may have been deleted or moved. Skip it if it's not in
		// the expected level.
		if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
			continue
		}

		stats, newHints, err := d.loadTableStats(
			rs.current, nf.Level,
			nf.Meta,
		)
		if err != nil {
			d.opts.EventListener.BackgroundError(err)
			continue
		}
		// NB: We don't update the FileMetadata yet, because we aren't
		// holding DB.mu. We'll copy it to the FileMetadata after we're
		// finished with IO.
		collected = append(collected, collectedStats{
			fileMetadata: nf.Meta,
			TableStats:   stats,
		})
		hints = append(hints, newHints...)
	}
	return collected, hints
}

// scanReadStateTableStats is run by an active stat collection job when there
// are no pending new files, but there might be files that existed at Open for
// which we haven't loaded table stats.
func (d *DB) scanReadStateTableStats(
	rs *readState, fill []collectedStats,
) ([]collectedStats, []deleteCompactionHint, bool) {
	moreRemain := false
	var hints []deleteCompactionHint
	for l, levelMetadata := range rs.current.Levels {
		iter := levelMetadata.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			// NB: We're not holding d.mu which protects f.Stats, but only the
			// active stats collection job updates f.Stats for active files,
			// and we ensure only one goroutine runs it at a time through
			// d.mu.tableStats.loading. This makes it safe to read validity
			// through f.Stats.ValidLocked despite not holding d.mu.
			if f.StatsValid() {
				continue
			}

			// Limit how much work we do per read state. The older the read
			// state is, the higher the likelihood files are no longer being
			// used in the current version. If we've exhausted our allowance,
			// return true for the last return value to signal there's more
			// work to do.
			if len(fill) == cap(fill) {
				moreRemain = true
				return fill, hints, moreRemain
			}

			stats, newHints, err := d.loadTableStats(
				rs.current, l, f,
			)
			if err != nil {
				// Set `moreRemain` so we'll try again.
				moreRemain = true
				d.opts.EventListener.BackgroundError(err)
				continue
			}
			fill = append(fill, collectedStats{
				fileMetadata: f,
				TableStats:   stats,
			})
			hints = append(hints, newHints...)
		}
	}
	return fill, hints, moreRemain
}

func (d *DB) loadTableStats(
	v *version, level int, meta *fileMetadata,
) (manifest.TableStats, []deleteCompactionHint, error) {
	var stats manifest.TableStats
	var compactionHints []deleteCompactionHint
	err := d.tableCache.withCommonReader(
		meta, func(r sstable.CommonReader) (err error) {
			props := r.CommonProperties()
			stats.NumEntries = props.NumEntries
			stats.NumDeletions = props.NumDeletions
			if props.NumPointDeletions() > 0 {
				if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
					return
				}
			}
			if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
				if compactionHints, err = d.loadTableRangeDelStats(
					r, v, level, meta, &stats,
				); err != nil {
					return
				}
			}
			// TODO(travers): Once we have real-world data, consider collecting
			// additional stats that may provide improved heuristics for compaction
			// picking.
			stats.NumRangeKeySets = props.NumRangeKeySets
			stats.ValueBlocksSize = props.ValueBlocksSize
			return
		})
	if err != nil {
		return stats, nil, err
	}
	return stats, compactionHints, nil
}

// loadTablePointKeyStats calculates the point key statistics for the given
// table. The provided manifest.TableStats are updated.
func (d *DB) loadTablePointKeyStats(
	props *sstable.CommonProperties,
	v *version,
	level int,
	meta *fileMetadata,
	stats *manifest.TableStats,
) error {
	// TODO(jackson): If the file has a wide keyspace, the average
	// value size beneath the entire file might not be representative
	// of the size of the keys beneath the point tombstones.
	// We could write the ranges of 'clusters' of point tombstones to
	// a sstable property and call averageValueSizeBeneath for each of
	// these narrower ranges to improve the estimate.
	avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
	if err != nil {
		return err
	}
	stats.PointDeletionsBytesEstimate =
		pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
	return nil
}

// loadTableRangeDelStats calculates the range deletion and range key deletion
// statistics for the given table.
func (d *DB) loadTableRangeDelStats(
	r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) ([]deleteCompactionHint, error) {
	iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
	if err != nil {
		return nil, err
	}
	defer iter.Close()
	var compactionHints []deleteCompactionHint
	// We iterate over the defragmented range tombstones and range key deletions,
	// which ensures we don't double count ranges deleted at different sequence
	// numbers. Also, merging abutting tombstones reduces the number of calls to
	// estimateReclaimedSizeBeneath, which is costly, and improves the accuracy of
	// our overall estimate.
	for s := iter.First(); s != nil; s = iter.Next() {
		start, end := s.Start, s.End
		// We only need to consider deletion size estimates for tables that contain
		// RANGEDELs.
		var maxRangeDeleteSeqNum uint64
		for _, k := range s.Keys {
			if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
				maxRangeDeleteSeqNum = k.SeqNum()
				break
			}
		}

		// If the file is in the last level of the LSM, there is no data beneath
		// it. The fact that there is still a range tombstone in a bottommost file
		// indicates two possibilities:
		// 1. an open snapshot kept the tombstone around, and the data the
		//    tombstone deletes is contained within the file itself.
		// 2. the file was ingested.
		// In the first case, we'd like to estimate disk usage within the file
		// itself since compacting the file will drop that covered data. In the
		// second case, we expect that compacting the file will NOT drop any
		// data and rewriting the file is a waste of write bandwidth. We can
		// distinguish these cases by looking at the file metadata's sequence
		// numbers. A file's range deletions can only delete data within the
		// file at lower sequence numbers. All keys in an ingested sstable adopt
		// the same sequence number, preventing tombstones from deleting keys
		// within the same file.
		// We check here if the largest RANGEDEL sequence
		// number is greater than the file's smallest sequence number. If it is,
		// the RANGEDEL could conceivably (although inconclusively) delete data
		// within the same file.
		//
		// Note that this heuristic is imperfect. If a table containing a range
		// deletion is ingested into L5 and subsequently compacted into L6 but
		// an open snapshot prevents elision of covered keys in L6, the
		// resulting RangeDeletionsBytesEstimate will incorrectly include all
		// covered keys.
		//
		// TODO(jackson): We could prevent the above error in the heuristic by
		// computing the file's RangeDeletionsBytesEstimate during the
		// compaction itself. It's unclear how common this is.
		//
		// NOTE: If the span `s` wholly contains a table containing range keys,
		// the returned size estimate will be slightly inflated by the range key
		// block. However, in practice, range keys are expected to be rare, and
		// the size of the range key block relative to the overall size of the
		// table is expected to be small.
		if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
			size, err := r.EstimateDiskUsage(start, end)
			if err != nil {
				return nil, err
			}
			stats.RangeDeletionsBytesEstimate += size

			// As the file is in the bottommost level, there is no need to collect a
			// deletion hint.
			continue
		}

		// While the size estimates for point keys should only be updated if this
		// span contains a range del, the sequence numbers are required for the
		// hint. Unconditionally descend, but conditionally update the estimates.
		hintType := compactionHintFromKeys(s.Keys)
		estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
		if err != nil {
			return nil, err
		}
		stats.RangeDeletionsBytesEstimate += estimate

		// If any files were completely contained within the range,
		// hintSeqNum is the smallest sequence number contained in any
		// such file.
		if hintSeqNum == math.MaxUint64 {
			continue
		}
		hint := deleteCompactionHint{
			hintType:                hintType,
			start:                   make([]byte, len(start)),
			end:                     make([]byte, len(end)),
			tombstoneFile:           meta,
			tombstoneLevel:          level,
			tombstoneLargestSeqNum:  s.LargestSeqNum(),
			tombstoneSmallestSeqNum: s.SmallestSeqNum(),
			fileSmallestSeqNum:      hintSeqNum,
		}
		copy(hint.start, start)
		copy(hint.end, end)
		compactionHints = append(compactionHints, hint)
	}
	return compactionHints, err
}

func (d *DB) estimateSizesBeneath(
	v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
) (avgValueLogicalSize, compressionRatio float64, err error) {
	// Find all files in lower levels that overlap with meta,
	// summing their value sizes and entry counts.
	file := meta
	var fileSum, keySum, valSum, entryCount uint64
	// Include the file itself. This is important because in some instances, the
	// computed compression ratio is applied to the tombstones contained within
	// `meta` itself. If there are no files beneath `meta` in the LSM, we would
	// calculate a compression ratio of 0 which is not accurate for the file's
	// own tombstones.
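	// For example (hypothetical numbers): for a 10 MiB table whose own raw key
	// and value bytes sum to 25 MiB and with nothing beneath it in the LSM,
	// including the file itself yields a compression ratio of 10/25 = 0.4,
	// which can then be applied to the file's own tombstones.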
	fileSum += file.Size
	entryCount += fileProps.NumEntries
	keySum += fileProps.RawKeySize
	valSum += fileProps.RawValueSize

	addPhysicalTableStats := func(r *sstable.Reader) (err error) {
		fileSum += file.Size
		entryCount += r.Properties.NumEntries
		keySum += r.Properties.RawKeySize
		valSum += r.Properties.RawValueSize
		return nil
	}
	addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
		fileSum += file.Size
		entryCount += file.Stats.NumEntries
		keySum += v.Properties.RawKeySize
		valSum += v.Properties.RawValueSize
		return nil
	}

	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		iter := overlaps.Iter()
		for file = iter.First(); file != nil; file = iter.Next() {
			var err error
			if file.Virtual {
				err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
			} else {
				err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
			}
			if err != nil {
				return 0, 0, err
			}
		}
	}
	if entryCount == 0 {
		return 0, 0, nil
	}
	// RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
	// the value sum according to the data size to account for compression,
	// index blocks and metadata overhead. Eg:
	//
	//	Compression rate        ×  Average uncompressed value size
	//
	//	                        ↓
	//
	//	     FileSize              RawValueSize
	//	-----------------------  ×  ------------
	//	RawKeySize+RawValueSize      NumEntries
	//
	// We return the average logical value size and the compression ratio,
	// leaving the scaling to the caller. This allows the caller to perform
	// additional compression ratio scaling if necessary.
	uncompressedSum := float64(keySum + valSum)
	compressionRatio = float64(fileSum) / uncompressedSum
	avgValueLogicalSize = (float64(valSum) / float64(entryCount))
	return avgValueLogicalSize, compressionRatio, nil
}

func (d *DB) estimateReclaimedSizeBeneath(
	v *version, level int, start, end []byte, hintType deleteCompactionHintType,
) (estimate uint64, hintSeqNum uint64, err error) {
	// Find all files in lower levels that overlap with the deleted range
	// [start, end).
	//
	// An overlapping file might be completely contained by the range
	// tombstone, in which case we can count the entire file size in
	// our estimate without doing any additional I/O.
	//
	// Otherwise, estimating the range for the file requires
	// additional I/O to read the file's index blocks.
	hintSeqNum = math.MaxUint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			startCmp := d.cmp(start, file.Smallest.UserKey)
			endCmp := d.cmp(file.Largest.UserKey, end)
			if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
				// The range fully contains the file, so skip looking it up in table
				// cache/looking at its indexes and add the full file size. Whether the
				// disk estimate and hint seqnums are updated depends on a) the type of
				// hint that requested the estimate and b) the keys contained in this
				// current file.
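				//
				// In summary (matching the switch below):
				//   - point-key-only hint: update the estimate if the file has
				//     point keys; update the hint only if the file has no range keys.
				//   - range-key-only hint: never update the estimate; update the
				//     hint only if the file has no point keys.
				//   - point-and-range-key hint: always update both.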
				var updateEstimates, updateHints bool
				switch hintType {
				case deleteCompactionHintTypePointKeyOnly:
					// The range deletion byte estimates should only be updated if this
					// table contains point keys. This ends up being an overestimate in
					// the case that the table also has range keys, but such keys are
					// expected to contribute a negligible amount of the table's overall
					// size, relative to point keys.
					if file.HasPointKeys {
						updateEstimates = true
					}
					// As the initiating span contained only range dels, hints can only be
					// updated if this table does _not_ contain range keys.
					if !file.HasRangeKeys {
						updateHints = true
					}
				case deleteCompactionHintTypeRangeKeyOnly:
					// The initiating span contained only range key dels. The estimates
					// apply only to point keys, and are therefore not updated.
					updateEstimates = false
					// As the initiating span contained only range key dels, hints can
					// only be updated if this table does _not_ contain point keys.
					if !file.HasPointKeys {
						updateHints = true
					}
				case deleteCompactionHintTypePointAndRangeKey:
					// Always update the estimates and hints, as this hint type can drop a
					// file, irrespective of the mixture of keys. Similar to above, the
					// range del bytes estimate is an overestimate.
					updateEstimates, updateHints = true, true
				default:
					panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
				}
				if updateEstimates {
					estimate += file.Size
				}
				if updateHints && hintSeqNum > file.SmallestSeqNum {
					hintSeqNum = file.SmallestSeqNum
				}
			} else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
				// Partial overlap.
				if hintType == deleteCompactionHintTypeRangeKeyOnly {
					// If the hint that generated this overlap contains only range keys,
					// there is no need to calculate disk usage, as the reclaimable space
					// is expected to be minimal relative to point keys.
					continue
				}
				var size uint64
				var err error
				if file.Virtual {
					err = d.tableCache.withVirtualReader(
						file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
							size, err = r.EstimateDiskUsage(start, end)
							return err
						})
				} else {
					err = d.tableCache.withReader(
						file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
							size, err = r.EstimateDiskUsage(start, end)
							return err
						})
				}

				if err != nil {
					return 0, hintSeqNum, err
				}
				estimate += size
			}
		}
	}
	return estimate, hintSeqNum, nil
}

func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
	// If a table contains range deletions or range key deletions, we defer the
	// stats collection. There are two main reasons for this:
	//
	// 1. Estimating the potential for reclaimed space due to a range deletion
	//    tombstone requires scanning the LSM - a potentially expensive operation
	//    that should be deferred.
	// 2. Range deletions and / or range key deletions present an opportunity to
	//    compute "deletion hints", which also requires a scan of the LSM to
	//    compute tables that would be eligible for deletion.
	//
	// These two tasks are deferred to the table stats collector goroutine.
	if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
		return false
	}

	// If a table is more than 10% point deletions without user-provided size
	// estimates, don't calculate the PointDeletionsBytesEstimate statistic
	// using our limited knowledge. The table stats collector can populate the
	// stats and calculate an average value size across all the tables beneath
	// the table in the LSM, which will be more accurate.
	if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
		return false
	}

	var pointEstimate uint64
	if props.NumEntries > 0 {
		// Use the file's own average key and value sizes as an estimate. This
		// doesn't require any additional IO and since the number of point
		// deletions in the file is low, the error introduced by this crude
		// estimate is expected to be small.
		commonProps := &props.CommonProperties
		avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
		pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
	}

	meta.Stats.NumEntries = props.NumEntries
	meta.Stats.NumDeletions = props.NumDeletions
	meta.Stats.NumRangeKeySets = props.NumRangeKeySets
	meta.Stats.PointDeletionsBytesEstimate = pointEstimate
	meta.Stats.RangeDeletionsBytesEstimate = 0
	meta.Stats.ValueBlocksSize = props.ValueBlocksSize
	meta.StatsMarkValid()
	return true
}

func pointDeletionsBytesEstimate(
	fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
) (estimate uint64) {
	if props.NumEntries == 0 {
		return 0
	}
	numPointDels := props.NumPointDeletions()
	if numPointDels == 0 {
		return 0
	}
	// Estimate the potential space to reclaim using the table's own properties.
	// There may or may not be keys covered by any individual point tombstone.
	// If not, compacting the point tombstone into L6 will at least allow us to
	// drop the point deletion key and will reclaim the tombstone's key bytes.
	// If there are covered key(s), we also get to drop key and value bytes for
	// each covered key.
	//
	// Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
	// uncompressed size of entries that will be elided by fully compacting the
	// tombstone. For these tombstones, there's no guesswork: we use the
	// RawPointTombstoneValueSizeHint property which is the sum of all these
	// tombstones' encoded values.
	//
	// For un-sized point tombstones (DELs), we estimate assuming that each
	// point tombstone on average covers 1 key and using average value sizes.
	// This is almost certainly an overestimate, but that's probably okay
	// because point tombstones can slow range iterations even when they don't
	// cover a key.
	//
	// TODO(jackson): This logic doesn't directly incorporate fixed per-key
	// overhead (8-byte trailer, plus at least 1 byte encoding the length of the
	// key and 1 byte encoding the length of the value). This overhead is
	// indirectly incorporated through the compression ratios, but that results
	// in the overhead being smeared per key-byte and value-byte, rather than
	// per-entry. This per-key fixed overhead can be nontrivial, especially for
	// dense swaths of point tombstones. Give some thought as to whether we
	// should directly include fixed per-key overhead in the calculations.
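	//
	// As a worked example with hypothetical numbers: a 50 KB table with
	// NumEntries=1000, 100 unsized DELs (no DELSIZEDs), RawKeySize=20 KB and
	// RawValueSize=80 KB has a compression ratio of 50/(20+80) = 0.5 and an
	// average logical value size of 80 B. The tombstone keys contribute
	// 100×20 B = 2 KB, the shadowed keys another 2 KB, and the shadowed values
	// 100×80 B = 8 KB, for an estimate of (2+2+8) KB × 0.5 = 6 KB.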

	// Below, we calculate the tombstone contributions and the shadowed keys'
	// contributions separately.
	var tombstonesLogicalSize float64
	var shadowedLogicalSize float64

	// 1. Calculate the contribution of the tombstone keys themselves.
	if props.RawPointTombstoneKeySize > 0 {
		tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
	} else {
		// This sstable predates the existence of the RawPointTombstoneKeySize
		// property. We can use the average key size within the file itself and
		// the count of point deletions to estimate the size.
		tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
	}

	// 2. Calculate the contribution of the keys shadowed by tombstones.
	//
	// 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
	// tombstones encode the size of both the key and value of the shadowed KV
	// entries. These sizes are aggregated into a sstable property.
	shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)

	// 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
	// keys.
	numUnsizedDels := numPointDels - props.NumSizedDeletions
	{
		// The shadowed keys have the same exact user keys as the tombstones
		// themselves, so we can use the `tombstonesLogicalSize` we computed
		// earlier as an estimate. There's a complication that
		// `tombstonesLogicalSize` may include DELSIZED keys we already
		// accounted for.
		shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)

		// Calculate the contribution of the deleted values. The caller has
		// already computed an average logical size (possibly computed across
		// many sstables).
		shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
	}

	// Scale both tombstone and shadowed totals by logical:physical ratios to
	// account for compression, metadata overhead, etc.
	//
	//	Physical             FileSize
	//	----------- = -----------------------
	//	 Logical      RawKeySize+RawValueSize
	//
	return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
}

func estimatePhysicalSizes(
	fileSize uint64, props *sstable.CommonProperties,
) (avgValLogicalSize, compressionRatio float64) {
	// RawKeySize and RawValueSize are uncompressed totals. Scale according to
	// the data size to account for compression, index blocks and metadata
	// overhead. Eg:
	//
	//	Compression rate        ×  Average uncompressed value size
	//
	//	                        ↓
	//
	//	     FileSize              RawValSize
	//	-----------------------  ×  ----------
	//	RawKeySize+RawValueSize     NumEntries
	//
	uncompressedSum := props.RawKeySize + props.RawValueSize
	compressionRatio = float64(fileSize) / float64(uncompressedSum)
	avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
	return avgValLogicalSize, compressionRatio
}

// newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
// returns "ranged deletion" spans for a single table, providing a combined view
// of both range deletion and range key deletion spans. The
// tableRangedDeletionIter is intended for use in the specific case of computing
// the statistics and deleteCompactionHints for a single table.
753 // 754 // As an example, consider the following set of spans from the range deletion 755 // and range key blocks of a table: 756 // 757 // |---------| |---------| |-------| RANGEKEYDELs 758 // |-----------|-------------| |-----| RANGEDELs 759 // __________________________________________________________ 760 // a b c d e f g h i j k l m n o p q r s t u v w x y z 761 // 762 // The tableRangedDeletionIter produces the following set of output spans, where 763 // '1' indicates a span containing only range deletions, '2' is a span 764 // containing only range key deletions, and '3' is a span containing a mixture 765 // of both range deletions and range key deletions. 766 // 767 // 1 3 1 3 2 1 3 2 768 // |-----|---------|-----|---|-----| |---|-|-----| 769 // __________________________________________________________ 770 // a b c d e f g h i j k l m n o p q r s t u v w x y z 771 // 772 // Algorithm. 773 // 774 // The iterator first defragments the range deletion and range key blocks 775 // separately. During this defragmentation, the range key block is also filtered 776 // so that keys other than range key deletes are ignored. The range delete and 777 // range key delete keyspaces are then merged. 778 // 779 // Note that the only fragmentation introduced by merging is from where a range 780 // del span overlaps with a range key del span. Within the bounds of any overlap 781 // there is guaranteed to be no further fragmentation, as the constituent spans 782 // have already been defragmented. To the left and right of any overlap, the 783 // same reasoning applies. For example, 784 // 785 // |--------| |-------| RANGEKEYDEL 786 // |---------------------------| RANGEDEL 787 // |----1---|----3---|----1----|---2---| Merged, fragmented spans. 788 // __________________________________________________________ 789 // a b c d e f g h i j k l m n o p q r s t u v w x y z 790 // 791 // Any fragmented abutting spans produced by the merging iter will be of 792 // differing types (i.e. a transition from a span with homogenous key kinds to a 793 // heterogeneous span, or a transition from a span with exclusively range dels 794 // to a span with exclusively range key dels). Therefore, further 795 // defragmentation is not required. 796 // 797 // Each span returned by the tableRangeDeletionIter will have at most four keys, 798 // corresponding to the largest and smallest sequence numbers encountered across 799 // the range deletes and range keys deletes that comprised the merged spans. 800 func newCombinedDeletionKeyspanIter( 801 comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata, 802 ) (keyspan.FragmentIterator, error) { 803 // The range del iter and range key iter are each wrapped in their own 804 // defragmenting iter. For each iter, abutting spans can always be merged. 805 var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true }) 806 // Reduce keys by maintaining a slice of at most length two, corresponding to 807 // the largest and smallest keys in the defragmented span. This maintains the 808 // contract that the emitted slice is sorted by (SeqNum, Kind) descending. 809 reducer := func(current, incoming []keyspan.Key) []keyspan.Key { 810 if len(current) == 0 && len(incoming) == 0 { 811 // While this should never occur in practice, a defensive return is used 812 // here to preserve correctness. 
			return current
		}
		var largest, smallest keyspan.Key
		var set bool
		for _, keys := range [2][]keyspan.Key{current, incoming} {
			if len(keys) == 0 {
				continue
			}
			first, last := keys[0], keys[len(keys)-1]
			if !set {
				largest, smallest = first, last
				set = true
				continue
			}
			if first.Trailer > largest.Trailer {
				largest = first
			}
			if last.Trailer < smallest.Trailer {
				smallest = last
			}
		}
		if largest.Equal(comparer.Equal, smallest) {
			current = append(current[:0], largest)
		} else {
			current = append(current[:0], largest, smallest)
		}
		return current
	}

	// The separate iters for the range dels and range keys are wrapped in a
	// merging iter to join the keyspaces into a single keyspace. The separate
	// iters are only added if the particular key kind is present.
	mIter := &keyspan.MergingIter{}
	var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
		if in.KeysOrder != keyspan.ByTrailerDesc {
			panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
		}
		out.Start, out.End = in.Start, in.End
		out.Keys = append(out.Keys[:0], in.Keys...)
		out.KeysOrder = keyspan.ByTrailerDesc
		// NB: The order of by-trailer descending may have been violated,
		// because we've layered rangekey and rangedel iterators from the same
		// sstable into the same keyspan.MergingIter. The MergingIter will
		// return the keys in the order that the child iterators were provided.
		// Sort the keys to ensure they're sorted by trailer descending.
		keyspan.SortKeysByTrailer(&out.Keys)
		return nil
	})
	mIter.Init(comparer.Compare, transform, new(keyspan.MergingBuffers))

	iter, err := cr.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
		iter = dIter
		// Truncate tombstones to the containing file's bounds if necessary.
		// See docs/range_deletions.md for why this is necessary.
		iter = keyspan.Truncate(
			comparer.Compare, iter, m.Smallest.UserKey, m.Largest.UserKey,
			nil, nil, false, /* panicOnUpperTruncate */
		)
		mIter.AddLevel(iter)
	}

	iter, err = cr.NewRawRangeKeyIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		// Wrap the range key iterator in a filter that elides keys other than range
		// key deletions.
		iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
			out.Start, out.End = in.Start, in.End
			out.Keys = out.Keys[:0]
			for _, k := range in.Keys {
				if k.Kind() != base.InternalKeyKindRangeKeyDelete {
					continue
				}
				out.Keys = append(out.Keys, k)
			}
			return len(out.Keys) > 0
		}, comparer.Compare)
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
		iter = dIter
		mIter.AddLevel(iter)
	}

	return mIter, nil
}

// rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of range key fragments. Its annotation type
// is a *uint64. The count of range key sets may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a file's
// stats have been loaded.
type rangeKeySetsAnnotator struct{}

var _ manifest.Annotator = rangeKeySetsAnnotator{}

func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a rangeKeySetsAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumRangeKeySets
	return vptr, f.StatsValid()
}

func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
// files of the LSM. It only counts keys in files for which table stats have
// been loaded. It uses a b-tree annotator to cache intermediate values between
// calculations when possible.
func countRangeKeySetFragments(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.RangeKeyLevels[l].Empty() {
			continue
		}
		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
	}
	return count
}

// tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
// keys). Its annotation type is a *uint64. The count of tombstones may change
// once a table's stats are loaded asynchronously, so its values are marked as
// cacheable only if a file's stats have been loaded.
type tombstonesAnnotator struct{}

var _ manifest.Annotator = tombstonesAnnotator{}

func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a tombstonesAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumDeletions
	return vptr, f.StatsValid()
}

func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
// internal keys across all files of the LSM. It only counts keys in files for
// which table stats have been loaded. It uses a b-tree annotator to cache
// intermediate values between calculations when possible.
func countTombstones(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.Levels[l].Empty() {
			continue
		}
		count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
	}
	return count
}

// valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
// nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
// type is a *uint64. The value block size may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a
// file's stats have been loaded.
type valueBlocksSizeAnnotator struct{}

var _ manifest.Annotator = valueBlocksSizeAnnotator{}

func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a valueBlocksSizeAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.ValueBlocksSize
	return vptr, f.StatsValid()
}

func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
// files for a level of the LSM. It only includes the size for files for which
// table stats have been loaded. It uses a b-tree annotator to cache
// intermediate values between calculations when possible. It must not be
// called concurrently.
//
// REQUIRES: 0 <= level < numLevels.
func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
	if v.Levels[level].Empty() {
		return 0
	}
	return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
}
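// The three annotators above are intentionally identical in shape. As an
// illustrative sketch only (not part of the upstream file), aggregating any
// other TableStats field follows the same Zero/Accumulate/Merge pattern; for
// example, a hypothetical annotator summing PointDeletionsBytesEstimate:

type pointDelsBytesAnnotator struct{}

var _ manifest.Annotator = pointDelsBytesAnnotator{}

func (a pointDelsBytesAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a pointDelsBytesAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.PointDeletionsBytesEstimate
	// Only cacheable once the file's stats have been loaded; the estimate may
	// change from zero after asynchronous stats collection completes.
	return vptr, f.StatsValid()
}

func (a pointDelsBytesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}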