github.com/cockroachdb/pebble@v1.1.2/table_stats.go

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"github.com/cockroachdb/pebble/internal/base"
    12  	"github.com/cockroachdb/pebble/internal/keyspan"
    13  	"github.com/cockroachdb/pebble/internal/manifest"
    14  	"github.com/cockroachdb/pebble/sstable"
    15  )
    16  
    17  // In-memory statistics about tables help inform compaction picking, but may
    18  // be expensive to calculate or load from disk. Every time a database is
    19  // opened, these statistics must be reloaded or recalculated. To minimize
    20  // impact on user activity and compactions, we load these statistics
    21  // asynchronously in the background and store loaded statistics in each
    22  // table's *FileMetadata.
    23  //
    24  // This file implements the asynchronous loading of statistics by maintaining
    25  // a list of files that require statistics, alongside their LSM levels.
    26  // Whenever new files are added to the LSM, the files are appended to
    27  // d.mu.tableStats.pending. If a stats collection job is not currently
    28  // running, one is started in a separate goroutine.
    29  //
    30  // The stats collection job grabs and clears the pending list, computes table
    31  // statistics relative to the current readState and updates the tables' file
    32  // metadata. New pending files may accumulate during a stats collection job,
    33  // so a completing job triggers a new job if necessary. Only one job runs at a
    34  // time.
    35  //
    36  // When an existing database is opened, all files lack in-memory statistics.
    37  // These files' stats are loaded incrementally whenever the pending list is
    38  // empty by scanning a current readState for files missing statistics. Once a
    39  // job completes a scan without finding any remaining files without
    40  // statistics, it flips a `loadedInitial` flag. From then on, the stats
    41  // collection job only needs to load statistics for new files appended to the
    42  // pending list.
    43  
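// The sketch below is illustrative only and not part of the original file. It
// shows how a caller could block until the initial scan described above has
// completed, assuming (as the Broadcast at the end of collectTableStats
// suggests) that d.mu.tableStats.cond is a sync.Cond associated with DB.mu.
// The method name waitForInitialTableStats is hypothetical.
func (d *DB) waitForInitialTableStats() {
	d.mu.Lock()
	defer d.mu.Unlock()
	for !d.mu.tableStats.loadedInitial {
		// Each completing collection job broadcasts on the condition
		// variable; the loop re-checks loadedInitial on every wakeup.
		d.mu.tableStats.cond.Wait()
	}
}
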
    44  func (d *DB) maybeCollectTableStatsLocked() {
    45  	if d.shouldCollectTableStatsLocked() {
    46  		go d.collectTableStats()
    47  	}
    48  }
    49  
    50  // updateTableStatsLocked is called when new files are introduced, after the
    51  // read state has been updated. It may trigger a new stat collection.
    52  // DB.mu must be locked when calling.
    53  func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
    54  	var needStats bool
    55  	for _, nf := range newFiles {
    56  		if !nf.Meta.StatsValid() {
    57  			needStats = true
    58  			break
    59  		}
    60  	}
    61  	if !needStats {
    62  		return
    63  	}
    64  
    65  	d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
    66  	d.maybeCollectTableStatsLocked()
    67  }
    68  
    69  func (d *DB) shouldCollectTableStatsLocked() bool {
    70  	return !d.mu.tableStats.loading &&
    71  		d.closed.Load() == nil &&
    72  		!d.opts.private.disableTableStats &&
    73  		(len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
    74  }
    75  
    76  // collectTableStats runs a table stats collection job, returning true if the
    77  // invocation did the collection work, false otherwise (e.g. if another job was
    78  // already running).
    79  func (d *DB) collectTableStats() bool {
    80  	const maxTableStatsPerScan = 50
    81  
    82  	d.mu.Lock()
    83  	if !d.shouldCollectTableStatsLocked() {
    84  		d.mu.Unlock()
    85  		return false
    86  	}
    87  
    88  	pending := d.mu.tableStats.pending
    89  	d.mu.tableStats.pending = nil
    90  	d.mu.tableStats.loading = true
    91  	jobID := d.mu.nextJobID
    92  	d.mu.nextJobID++
    93  	loadedInitial := d.mu.tableStats.loadedInitial
    94  	// Drop DB.mu before performing IO.
    95  	d.mu.Unlock()
    96  
    97  	// Every run of collectTableStats either collects stats from the pending
    98  	// list (if non-empty) or from scanning the version (loadedInitial is
    99  	// false). This job only runs if at least one of those conditions holds.
   100  
   101  	// Grab a read state to scan for tables.
   102  	rs := d.loadReadState()
   103  	var collected []collectedStats
   104  	var hints []deleteCompactionHint
   105  	if len(pending) > 0 {
   106  		collected, hints = d.loadNewFileStats(rs, pending)
   107  	} else {
   108  		var moreRemain bool
   109  		var buf [maxTableStatsPerScan]collectedStats
   110  		collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
   111  		loadedInitial = !moreRemain
   112  	}
   113  	rs.unref()
   114  
   115  	// Update the FileMetadata with the loaded stats while holding d.mu.
   116  	d.mu.Lock()
   117  	defer d.mu.Unlock()
   118  	d.mu.tableStats.loading = false
   119  	if loadedInitial && !d.mu.tableStats.loadedInitial {
   120  		d.mu.tableStats.loadedInitial = loadedInitial
   121  		d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
   122  			JobID: jobID,
   123  		})
   124  	}
   125  
   126  	maybeCompact := false
   127  	for _, c := range collected {
   128  		c.fileMetadata.Stats = c.TableStats
   129  		maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0
   130  		c.fileMetadata.StatsMarkValid()
   131  	}
   132  	d.mu.tableStats.cond.Broadcast()
   133  	d.maybeCollectTableStatsLocked()
   134  	if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions {
   135  		// Verify that all of the hint tombstones' files still exist in the
   136  		// current version. Otherwise, the tombstone itself may have been
   137  		// compacted into L6 and more recent keys may have had their sequence
   138  		// numbers zeroed.
   139  		//
   140  		// Note that it's possible that the tombstone file is being compacted
   141  		// presently. In that case, the file will be present in v. When the
   142  		// compaction finishes compacting the tombstone file, it will detect
   143  		// and clear the hint.
   144  		//
   145  		// See DB.maybeUpdateDeleteCompactionHints.
   146  		v := d.mu.versions.currentVersion()
   147  		keepHints := hints[:0]
   148  		for _, h := range hints {
   149  			if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
   150  				keepHints = append(keepHints, h)
   151  			}
   152  		}
   153  		d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
   154  	}
   155  	if maybeCompact {
   156  		d.maybeScheduleCompaction()
   157  	}
   158  	return true
   159  }
   160  
   161  type collectedStats struct {
   162  	*fileMetadata
   163  	manifest.TableStats
   164  }
   165  
   166  func (d *DB) loadNewFileStats(
   167  	rs *readState, pending []manifest.NewFileEntry,
   168  ) ([]collectedStats, []deleteCompactionHint) {
   169  	var hints []deleteCompactionHint
   170  	collected := make([]collectedStats, 0, len(pending))
   171  	for _, nf := range pending {
   172  		// A file's stats might have been populated by an earlier call to
   173  		// loadNewFileStats if the file was moved.
   174  		// NB: We're not holding d.mu which protects f.Stats, but only
   175  		// collectTableStats updates f.Stats for active files, and we
   176  		// ensure only one goroutine runs it at a time through
   177  		// d.mu.tableStats.loading.
   178  		if nf.Meta.StatsValid() {
   179  			continue
   180  		}
   181  
   182  		// The file isn't guaranteed to still be live in the readState's
   183  		// version. It may have been deleted or moved. Skip it if it's not in
   184  		// the expected level.
   185  		if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
   186  			continue
   187  		}
   188  
   189  		stats, newHints, err := d.loadTableStats(
   190  			rs.current, nf.Level,
   191  			nf.Meta,
   192  		)
   193  		if err != nil {
   194  			d.opts.EventListener.BackgroundError(err)
   195  			continue
   196  		}
   197  		// NB: We don't update the FileMetadata yet, because we aren't
   198  		// holding DB.mu. We'll copy it to the FileMetadata after we're
   199  		// finished with IO.
   200  		collected = append(collected, collectedStats{
   201  			fileMetadata: nf.Meta,
   202  			TableStats:   stats,
   203  		})
   204  		hints = append(hints, newHints...)
   205  	}
   206  	return collected, hints
   207  }
   208  
   209  // scanReadStateTableStats is run by an active stat collection job when there
   210  // are no pending new files, but there might be files that existed at Open for
   211  // which we haven't loaded table stats.
   212  func (d *DB) scanReadStateTableStats(
   213  	rs *readState, fill []collectedStats,
   214  ) ([]collectedStats, []deleteCompactionHint, bool) {
   215  	moreRemain := false
   216  	var hints []deleteCompactionHint
   217  	for l, levelMetadata := range rs.current.Levels {
   218  		iter := levelMetadata.Iter()
   219  		for f := iter.First(); f != nil; f = iter.Next() {
   220  			// NB: We're not holding d.mu which protects f.Stats, but only the
   221  			// active stats collection job updates f.Stats for active files,
   222  			// and we ensure only one goroutine runs it at a time through
   223  			// d.mu.tableStats.loading. This makes it safe to read validity
   224  			// through f.StatsValid despite not holding d.mu.
   225  			if f.StatsValid() {
   226  				continue
   227  			}
   228  
   229  			// Limit how much work we do per read state. The older the read
   230  			// state is, the higher the likelihood files are no longer being
   231  			// used in the current version. If we've exhausted our allowance,
   232  			// return true for the last return value to signal there's more
   233  			// work to do.
   234  			if len(fill) == cap(fill) {
   235  				moreRemain = true
   236  				return fill, hints, moreRemain
   237  			}
   238  
   239  			stats, newHints, err := d.loadTableStats(
   240  				rs.current, l, f,
   241  			)
   242  			if err != nil {
   243  				// Set `moreRemain` so we'll try again.
   244  				moreRemain = true
   245  				d.opts.EventListener.BackgroundError(err)
   246  				continue
   247  			}
   248  			fill = append(fill, collectedStats{
   249  				fileMetadata: f,
   250  				TableStats:   stats,
   251  			})
   252  			hints = append(hints, newHints...)
   253  		}
   254  	}
   255  	return fill, hints, moreRemain
   256  }
   257  
   258  func (d *DB) loadTableStats(
   259  	v *version, level int, meta *fileMetadata,
   260  ) (manifest.TableStats, []deleteCompactionHint, error) {
   261  	var stats manifest.TableStats
   262  	var compactionHints []deleteCompactionHint
   263  	err := d.tableCache.withCommonReader(
   264  		meta, func(r sstable.CommonReader) (err error) {
   265  			props := r.CommonProperties()
   266  			stats.NumEntries = props.NumEntries
   267  			stats.NumDeletions = props.NumDeletions
   268  			if props.NumPointDeletions() > 0 {
   269  				if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil {
   270  					return
   271  				}
   272  			}
   273  			if props.NumRangeDeletions > 0 || props.NumRangeKeyDels > 0 {
   274  				if compactionHints, err = d.loadTableRangeDelStats(
   275  					r, v, level, meta, &stats,
   276  				); err != nil {
   277  					return
   278  				}
   279  			}
   280  			// TODO(travers): Once we have real-world data, consider collecting
   281  			// additional stats that may provide improved heuristics for compaction
   282  			// picking.
   283  			stats.NumRangeKeySets = props.NumRangeKeySets
   284  			stats.ValueBlocksSize = props.ValueBlocksSize
   285  			return
   286  		})
   287  	if err != nil {
   288  		return stats, nil, err
   289  	}
   290  	return stats, compactionHints, nil
   291  }
   292  
   293  // loadTablePointKeyStats calculates the point key statistics for the given
   294  // table. The provided manifest.TableStats are updated.
   295  func (d *DB) loadTablePointKeyStats(
   296  	props *sstable.CommonProperties,
   297  	v *version,
   298  	level int,
   299  	meta *fileMetadata,
   300  	stats *manifest.TableStats,
   301  ) error {
   302  	// TODO(jackson): If the file has a wide keyspace, the average
   303  	// value size beneath the entire file might not be representative
   304  	// of the size of the keys beneath the point tombstones.
   305  	// We could write the ranges of 'clusters' of point tombstones to
   306  	// a sstable property and call averageValueSizeBeneath for each of
   307  	// these narrower ranges to improve the estimate.
   308  	avgValLogicalSize, compressionRatio, err := d.estimateSizesBeneath(v, level, meta, props)
   309  	if err != nil {
   310  		return err
   311  	}
   312  	stats.PointDeletionsBytesEstimate =
   313  		pointDeletionsBytesEstimate(meta.Size, props, avgValLogicalSize, compressionRatio)
   314  	return nil
   315  }
   316  
   317  // loadTableRangeDelStats calculates the range deletion and range key deletion
   318  // statistics for the given table.
   319  func (d *DB) loadTableRangeDelStats(
   320  	r sstable.CommonReader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
   321  ) ([]deleteCompactionHint, error) {
   322  	iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
   323  	if err != nil {
   324  		return nil, err
   325  	}
   326  	defer iter.Close()
   327  	var compactionHints []deleteCompactionHint
   328  	// We iterate over the defragmented range tombstones and range key deletions,
   329  	// which ensures we don't double count ranges deleted at different sequence
   330  	// numbers. Also, merging abutting tombstones reduces the number of calls to
   331  	// estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
   332  	// our overall estimate.
   333  	for s := iter.First(); s != nil; s = iter.Next() {
   334  		start, end := s.Start, s.End
   335  		// We only need to consider deletion size estimates for tables that contain
   336  		// RANGEDELs.
   337  		var maxRangeDeleteSeqNum uint64
   338  		for _, k := range s.Keys {
   339  			if k.Kind() == base.InternalKeyKindRangeDelete && maxRangeDeleteSeqNum < k.SeqNum() {
   340  				maxRangeDeleteSeqNum = k.SeqNum()
   341  				break
   342  			}
   343  		}
   344  
   345  		// If the file is in the last level of the LSM, there is no data beneath
   346  		// it. The fact that there is still a range tombstone in a bottommost file
   347  		// indicates two possibilities:
   348  		//   1. an open snapshot kept the tombstone around, and the data the
   349  		//      tombstone deletes is contained within the file itself.
   350  		//   2. the file was ingested.
   351  		// In the first case, we'd like to estimate disk usage within the file
   352  		// itself since compacting the file will drop that covered data. In the
   353  		// second case, we expect that compacting the file will NOT drop any
   354  		// data and rewriting the file is a waste of write bandwidth. We can
   355  		// distinguish these cases by looking at the file metadata's sequence
   356  		// numbers. A file's range deletions can only delete data within the
   357  		// file at lower sequence numbers. All keys in an ingested sstable adopt
   358  		// the same sequence number, preventing tombstones from deleting keys
   359  		// within the same file. We check here if the largest RANGEDEL sequence
   360  		// number is greater than the file's smallest sequence number. If it is,
   361  		// the RANGEDEL could conceivably (although inconclusively) delete data
   362  		// within the same file.
   363  		//
   364  		// Note that this heuristic is imperfect. If a table containing a range
   365  		// deletion is ingested into L5 and subsequently compacted into L6 but
   366  		// an open snapshot prevents elision of covered keys in L6, the
   367  		// resulting RangeDeletionsBytesEstimate will incorrectly include all
   368  		// covered keys.
   369  		//
   370  		// TODO(jackson): We could prevent the above error in the heuristic by
   371  		// computing the file's RangeDeletionsBytesEstimate during the
   372  		// compaction itself. It's unclear how common this is.
   373  		//
   374  		// NOTE: If the span `s` wholly contains a table containing range keys,
   375  		// the returned size estimate will be slightly inflated by the range key
   376  		// block. However, in practice, range keys are expected to be rare, and
   377  		// the size of the range key block relative to the overall size of the
   378  		// table is expected to be small.
   379  		if level == numLevels-1 && meta.SmallestSeqNum < maxRangeDeleteSeqNum {
   380  			size, err := r.EstimateDiskUsage(start, end)
   381  			if err != nil {
   382  				return nil, err
   383  			}
   384  			stats.RangeDeletionsBytesEstimate += size
   385  
   386  			// As the file is in the bottommost level, there is no need to collect a
   387  			// deletion hint.
   388  			continue
   389  		}
   390  
   391  		// While the size estimates for point keys should only be updated if this
   392  		// span contains a range del, the sequence numbers are required for the
   393  		// hint. Unconditionally descend, but conditionally update the estimates.
   394  		hintType := compactionHintFromKeys(s.Keys)
   395  		estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
   396  		if err != nil {
   397  			return nil, err
   398  		}
   399  		stats.RangeDeletionsBytesEstimate += estimate
   400  
   401  		// If any files were completely contained within the range,
   402  		// hintSeqNum is the smallest sequence number contained in any
   403  		// such file.
   404  		if hintSeqNum == math.MaxUint64 {
   405  			continue
   406  		}
   407  		hint := deleteCompactionHint{
   408  			hintType:                hintType,
   409  			start:                   make([]byte, len(start)),
   410  			end:                     make([]byte, len(end)),
   411  			tombstoneFile:           meta,
   412  			tombstoneLevel:          level,
   413  			tombstoneLargestSeqNum:  s.LargestSeqNum(),
   414  			tombstoneSmallestSeqNum: s.SmallestSeqNum(),
   415  			fileSmallestSeqNum:      hintSeqNum,
   416  		}
   417  		copy(hint.start, start)
   418  		copy(hint.end, end)
   419  		compactionHints = append(compactionHints, hint)
   420  	}
   421  	return compactionHints, err
   422  }
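
// An illustrative sketch (not part of the original file) of the
// bottommost-level heuristic above, with made-up sequence numbers. An ingested
// sstable assigns a single sequence number to all of its keys, so its
// RANGEDELs can never be newer than the file's smallest sequence number; a
// flushed file's RANGEDEL can be newer than point keys it covers.
func exampleBottommostRangeDelHeuristic() {
	// Ingested file: every key, including the RANGEDEL, carries seqnum 100.
	ingestedSmallest, ingestedRangeDel := uint64(100), uint64(100)
	// Flushed file: the RANGEDEL (seqnum 250) is newer than earlier point keys.
	flushedSmallest, flushedRangeDel := uint64(200), uint64(250)

	// Mirrors the `meta.SmallestSeqNum < maxRangeDeleteSeqNum` check above.
	fmt.Println(ingestedSmallest < ingestedRangeDel) // false: skip the in-file estimate
	fmt.Println(flushedSmallest < flushedRangeDel)   // true: estimate usage within the file
}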
   423  
   424  func (d *DB) estimateSizesBeneath(
   425  	v *version, level int, meta *fileMetadata, fileProps *sstable.CommonProperties,
   426  ) (avgValueLogicalSize, compressionRatio float64, err error) {
   427  	// Find all files in lower levels that overlap with meta,
   428  	// summing their value sizes and entry counts.
   429  	file := meta
   430  	var fileSum, keySum, valSum, entryCount uint64
   431  	// Include the file itself. This is important because in some instances, the
   432  	// computed compression ratio is applied to the tombstones contained within
   433  	// `meta` itself. If there are no files beneath `meta` in the LSM, we would
   434  	// calculate a compression ratio of 0 which is not accurate for the file's
   435  	// own tombstones.
   436  	fileSum += file.Size
   437  	entryCount += fileProps.NumEntries
   438  	keySum += fileProps.RawKeySize
   439  	valSum += fileProps.RawValueSize
   440  
   441  	addPhysicalTableStats := func(r *sstable.Reader) (err error) {
   442  		fileSum += file.Size
   443  		entryCount += r.Properties.NumEntries
   444  		keySum += r.Properties.RawKeySize
   445  		valSum += r.Properties.RawValueSize
   446  		return nil
   447  	}
   448  	addVirtualTableStats := func(v sstable.VirtualReader) (err error) {
   449  		fileSum += file.Size
   450  		entryCount += file.Stats.NumEntries
   451  		keySum += v.Properties.RawKeySize
   452  		valSum += v.Properties.RawValueSize
   453  		return nil
   454  	}
   455  
   456  	for l := level + 1; l < numLevels; l++ {
   457  		overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
   458  			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
   459  		iter := overlaps.Iter()
   460  		for file = iter.First(); file != nil; file = iter.Next() {
   461  			var err error
   462  			if file.Virtual {
   463  				err = d.tableCache.withVirtualReader(file.VirtualMeta(), addVirtualTableStats)
   464  			} else {
   465  				err = d.tableCache.withReader(file.PhysicalMeta(), addPhysicalTableStats)
   466  			}
   467  			if err != nil {
   468  				return 0, 0, err
   469  			}
   470  		}
   471  	}
   472  	if entryCount == 0 {
   473  		return 0, 0, nil
   474  	}
   475  	// RawKeySize and RawValueSize are uncompressed totals. We'll need to scale
   476  	// the value sum according to the data size to account for compression,
   477  	// index blocks and metadata overhead. Eg:
   478  	//
   479  	//    Compression rate        ×  Average uncompressed value size
   480  	//
   481  	//                            ↓
   482  	//
   483  	//         FileSize              RawValueSize
   484  	//   -----------------------  ×  ------------
   485  	//   RawKeySize+RawValueSize     NumEntries
   486  	//
   487  	// We return the average logical value size plus the compression ratio,
   488  	// leaving the scaling to the caller. This allows the caller to perform
   489  	// additional compression ratio scaling if necessary.
   490  	uncompressedSum := float64(keySum + valSum)
   491  	compressionRatio = float64(fileSum) / uncompressedSum
   492  	avgValueLogicalSize = (float64(valSum) / float64(entryCount))
   493  	return avgValueLogicalSize, compressionRatio, nil
   494  }
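
// A worked example (illustrative only, not part of the original file) of the
// scaling above, using made-up totals: 50 MB of physical file size over 100 MB
// of raw key+value bytes yields a compression ratio of 0.5, and 80 MB of raw
// value bytes over one million entries yields an average logical value size of
// roughly 84 bytes.
func exampleSizeScaling() {
	const (
		fileSum    = 50 << 20 // physical bytes on disk
		keySum     = 20 << 20 // uncompressed key bytes
		valSum     = 80 << 20 // uncompressed value bytes
		entryCount = 1_000_000
	)
	compressionRatio := float64(fileSum) / float64(keySum+valSum) // 0.5
	avgValueLogicalSize := float64(valSum) / float64(entryCount)  // ≈83.9 bytes
	fmt.Println(compressionRatio, avgValueLogicalSize)
}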
   495  
   496  func (d *DB) estimateReclaimedSizeBeneath(
   497  	v *version, level int, start, end []byte, hintType deleteCompactionHintType,
   498  ) (estimate uint64, hintSeqNum uint64, err error) {
   499  	// Find all files in lower levels that overlap with the deleted range
   500  	// [start, end).
   501  	//
   502  	// An overlapping file might be completely contained by the range
   503  	// tombstone, in which case we can count the entire file size in
   504  	// our estimate without doing any additional I/O.
   505  	//
   506  	// Otherwise, estimating the range for the file requires
   507  	// additional I/O to read the file's index blocks.
   508  	hintSeqNum = math.MaxUint64
   509  	for l := level + 1; l < numLevels; l++ {
   510  		overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
   511  		iter := overlaps.Iter()
   512  		for file := iter.First(); file != nil; file = iter.Next() {
   513  			startCmp := d.cmp(start, file.Smallest.UserKey)
   514  			endCmp := d.cmp(file.Largest.UserKey, end)
   515  			if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
   516  				// The range fully contains the file, so skip looking it up in table
   517  				// cache/looking at its indexes and add the full file size. Whether the
   518  				// disk estimate and hint seqnums are updated depends on a) the type of
   519  				// hint that requested the estimate and b) the keys contained in this
   520  				// current file.
   521  				var updateEstimates, updateHints bool
   522  				switch hintType {
   523  				case deleteCompactionHintTypePointKeyOnly:
   524  					// The range deletion byte estimates should only be updated if this
   525  					// table contains point keys. This ends up being an overestimate in
   526  					// the case that the table also has range keys, but such keys are expected
   527  					// to contribute a negligible amount of the table's overall size,
   528  					// relative to point keys.
   529  					if file.HasPointKeys {
   530  						updateEstimates = true
   531  					}
   532  					// As the initiating span contained only range dels, hints can only be
   533  					// updated if this table does _not_ contain range keys.
   534  					if !file.HasRangeKeys {
   535  						updateHints = true
   536  					}
   537  				case deleteCompactionHintTypeRangeKeyOnly:
   538  					// The initiating span contained only range key dels. The estimates
   539  					// apply only to point keys, and are therefore not updated.
   540  					updateEstimates = false
   541  					// As the initiating span contained only range key dels, hints can
   542  					// only be updated if this table does _not_ contain point keys.
   543  					if !file.HasPointKeys {
   544  						updateHints = true
   545  					}
   546  				case deleteCompactionHintTypePointAndRangeKey:
   547  					// Always update the estimates and hints, as this hint type can drop a
   548  					// file, irrespective of the mixture of keys. Similar to above, the
   549  					// range del bytes estimate is an overestimate.
   550  					updateEstimates, updateHints = true, true
   551  				default:
   552  					panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
   553  				}
   554  				if updateEstimates {
   555  					estimate += file.Size
   556  				}
   557  				if updateHints && hintSeqNum > file.SmallestSeqNum {
   558  					hintSeqNum = file.SmallestSeqNum
   559  				}
   560  			} else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
   561  				// Partial overlap.
   562  				if hintType == deleteCompactionHintTypeRangeKeyOnly {
   563  					// If the hint that generated this overlap contains only range keys,
   564  					// there is no need to calculate disk usage, as the reclaimable space
   565  					// is expected to be minimal relative to point keys.
   566  					continue
   567  				}
   568  				var size uint64
   569  				var err error
   570  				if file.Virtual {
   571  					err = d.tableCache.withVirtualReader(
   572  						file.VirtualMeta(), func(r sstable.VirtualReader) (err error) {
   573  							size, err = r.EstimateDiskUsage(start, end)
   574  							return err
   575  						})
   576  				} else {
   577  					err = d.tableCache.withReader(
   578  						file.PhysicalMeta(), func(r *sstable.Reader) (err error) {
   579  							size, err = r.EstimateDiskUsage(start, end)
   580  							return err
   581  						})
   582  				}
   583  
   584  				if err != nil {
   585  					return 0, hintSeqNum, err
   586  				}
   587  				estimate += size
   588  			}
   589  		}
   590  	}
   591  	return estimate, hintSeqNum, nil
   592  }
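
// A compact restatement (illustrative only, not part of the original file) of
// the full-containment switch above: given the hint type that requested the
// estimate and the key kinds present in a wholly contained file, decide which
// accumulators to update. The helper name is hypothetical.
func exampleFullContainmentUpdates(
	hintType deleteCompactionHintType, hasPointKeys, hasRangeKeys bool,
) (updateEstimates, updateHints bool) {
	switch hintType {
	case deleteCompactionHintTypePointKeyOnly:
		// Estimates need point keys; hints require the absence of range keys.
		return hasPointKeys, !hasRangeKeys
	case deleteCompactionHintTypeRangeKeyOnly:
		// Estimates track point-key bytes only; hints require no point keys.
		return false, !hasPointKeys
	case deleteCompactionHintTypePointAndRangeKey:
		// This hint type can drop the file regardless of its key mixture.
		return true, true
	default:
		panic(fmt.Sprintf("pebble: unknown hint type %s", hintType))
	}
}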
   593  
   594  func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) bool {
   595  	// If a table contains range deletions or range key deletions, we defer the
   596  	// stats collection. There are two main reasons for this:
   597  	//
   598  	//  1. Estimating the potential for reclaimed space due to a range deletion
   599  	//     tombstone requires scanning the LSM - a potentially expensive operation
   600  	//     that should be deferred.
   601  	//  2. Range deletions and / or range key deletions present an opportunity to
   602  	//     compute "deletion hints", which also requires a scan of the LSM to
   603  	//     compute tables that would be eligible for deletion.
   604  	//
   605  	// These two tasks are deferred to the table stats collector goroutine.
   606  	if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
   607  		return false
   608  	}
   609  
   610  // If more than 10% of a table's entries are point deletions that lack
   611  // user-provided size estimates, don't calculate the PointDeletionsBytesEstimate
   612  // statistic using our limited knowledge. The table stats collector can populate
   613  // the stats and calculate an average value size across all the tables beneath
   614  	// the table in the LSM, which will be more accurate.
   615  	if unsizedDels := (props.NumDeletions - props.NumSizedDeletions); unsizedDels > props.NumEntries/10 {
   616  		return false
   617  	}
   618  
   619  	var pointEstimate uint64
   620  	if props.NumEntries > 0 {
   621  		// Use the file's own average key and value sizes as an estimate. This
   622  		// doesn't require any additional IO and since the number of point
   623  		// deletions in the file is low, the error introduced by this crude
   624  		// estimate is expected to be small.
   625  		commonProps := &props.CommonProperties
   626  		avgValSize, compressionRatio := estimatePhysicalSizes(meta.Size, commonProps)
   627  		pointEstimate = pointDeletionsBytesEstimate(meta.Size, commonProps, avgValSize, compressionRatio)
   628  	}
   629  
   630  	meta.Stats.NumEntries = props.NumEntries
   631  	meta.Stats.NumDeletions = props.NumDeletions
   632  	meta.Stats.NumRangeKeySets = props.NumRangeKeySets
   633  	meta.Stats.PointDeletionsBytesEstimate = pointEstimate
   634  	meta.Stats.RangeDeletionsBytesEstimate = 0
   635  	meta.Stats.ValueBlocksSize = props.ValueBlocksSize
   636  	meta.StatsMarkValid()
   637  	return true
   638  }
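
// A small worked example (illustrative only, not part of the original file) of
// the 10% threshold above, using made-up counts: with 1,000 entries, 130
// deletions and 40 DELSIZED deletions, the 90 unsized deletions fall within
// the 100-entry allowance, so the quick inline estimate is used rather than
// deferring to the stats collector.
func exampleInlineStatsThreshold() bool {
	const numEntries, numDeletions, numSizedDeletions uint64 = 1000, 130, 40
	unsizedDels := numDeletions - numSizedDeletions // 90
	return unsizedDels <= numEntries/10             // 90 <= 100: estimate inline
}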
   639  
   640  func pointDeletionsBytesEstimate(
   641  	fileSize uint64, props *sstable.CommonProperties, avgValLogicalSize, compressionRatio float64,
   642  ) (estimate uint64) {
   643  	if props.NumEntries == 0 {
   644  		return 0
   645  	}
   646  	numPointDels := props.NumPointDeletions()
   647  	if numPointDels == 0 {
   648  		return 0
   649  	}
   650  	// Estimate the potential space to reclaim using the table's own properties.
   651  	// There may or may not be keys covered by any individual point tombstone.
   652  	// If not, compacting the point tombstone into L6 will at least allow us to
   653  	// drop the point deletion key and will reclaim the tombstone's key bytes.
   654  	// If there are covered key(s), we also get to drop key and value bytes for
   655  	// each covered key.
   656  	//
   657  	// Some point tombstones (DELSIZEDs) carry a user-provided estimate of the
   658  	// uncompressed size of entries that will be elided by fully compacting the
   659  	// tombstone. For these tombstones, there's no guesswork—we use the
   660  	// RawPointTombstoneValueSizeHint property which is the sum of all these
   661  	// tombstones' encoded values.
   662  	//
   663  	// For un-sized point tombstones (DELs), we estimate assuming that each
   664  	// point tombstone on average covers 1 key and using average value sizes.
   665  	// This is almost certainly an overestimate, but that's probably okay
   666  	// because point tombstones can slow range iterations even when they don't
   667  	// cover a key.
   668  	//
   669  	// TODO(jackson): This logic doesn't directly incorporate fixed per-key
   670  	// overhead (8-byte trailer, plus at least 1 byte encoding the length of the
   671  	// key and 1 byte encoding the length of the value). This overhead is
   672  	// indirectly incorporated through the compression ratios, but that results
   673  	// in the overhead being smeared per key-byte and value-byte, rather than
   674  	// per-entry. This per-key fixed overhead can be nontrivial, especially for
   675  	// dense swaths of point tombstones. Give some thought as to whether we
   676  	// should directly include fixed per-key overhead in the calculations.
   677  
   678  	// Below, we calculate the tombstone contributions and the shadowed keys'
   679  	// contributions separately.
   680  	var tombstonesLogicalSize float64
   681  	var shadowedLogicalSize float64
   682  
   683  	// 1. Calculate the contribution of the tombstone keys themselves.
   684  	if props.RawPointTombstoneKeySize > 0 {
   685  		tombstonesLogicalSize += float64(props.RawPointTombstoneKeySize)
   686  	} else {
   687  		// This sstable predates the existence of the RawPointTombstoneKeySize
   688  		// property. We can use the average key size within the file itself and
   689  		// the count of point deletions to estimate the size.
   690  		tombstonesLogicalSize += float64(numPointDels * props.RawKeySize / props.NumEntries)
   691  	}
   692  
   693  	// 2. Calculate the contribution of the keys shadowed by tombstones.
   694  	//
   695  	// 2a. First account for keys shadowed by DELSIZED tombstones. The DELSIZED
   696  	// tombstones encode the size of both the key and value of the shadowed KV
   697  	// entries. These sizes are aggregated into a sstable property.
   698  	shadowedLogicalSize += float64(props.RawPointTombstoneValueSize)
   699  
   700  	// 2b. Calculate the contribution of the KV entries shadowed by ordinary DEL
   701  	// keys.
   702  	numUnsizedDels := numPointDels - props.NumSizedDeletions
   703  	{
   704  		// The shadowed keys have the same exact user keys as the tombstones
   705  		// themselves, so we can use the `tombstonesLogicalSize` we computed
   706  		// earlier as an estimate. There's a complication that
   707  		// `tombstonesLogicalSize` may include DELSIZED keys we already
   708  		// accounted for.
   709  		shadowedLogicalSize += float64(tombstonesLogicalSize) / float64(numPointDels) * float64(numUnsizedDels)
   710  
   711  		// Calculate the contribution of the deleted values. The caller has
   712  		// already computed an average logical size (possibly computed across
   713  		// many sstables).
   714  		shadowedLogicalSize += float64(numUnsizedDels) * avgValLogicalSize
   715  	}
   716  
   717  	// Scale both tombstone and shadowed totals by logical:physical ratios to
   718  	// account for compression, metadata overhead, etc.
   719  	//
   720  	//      Physical             FileSize
   721  	//     -----------  = -----------------------
   722  	//      Logical       RawKeySize+RawValueSize
   723  	//
   724  	return uint64((tombstonesLogicalSize + shadowedLogicalSize) * compressionRatio)
   725  }
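
// An end-to-end sketch (illustrative only, not part of the original file) of
// the estimate above for a table containing only ordinary DELs, using made-up
// properties. estimatePhysicalSizes derives the average value size and
// compression ratio from the same properties, mirroring the quick path in
// maybeSetStatsFromProperties.
func examplePointDeletionsBytesEstimate() uint64 {
	props := &sstable.CommonProperties{
		NumEntries:   1_000,
		NumDeletions: 100, // all unsized DELs; no DELSIZEDs, no range dels
		RawKeySize:   24_000,
		RawValueSize: 176_000,
	}
	const fileSize = 100_000 // physical size: 2:1 compression over 200,000 raw bytes
	avgValSize, compressionRatio := estimatePhysicalSizes(fileSize, props)
	// Tombstone keys contribute 100 × (24,000/1,000) = 2,400 logical bytes.
	// Shadowed keys contribute another 2,400 key bytes plus 100 × 176 = 17,600
	// value bytes. Scaled by the 0.5 compression ratio, the estimate is
	// (2,400 + 20,000) × 0.5 = 11,200 bytes.
	return pointDeletionsBytesEstimate(fileSize, props, avgValSize, compressionRatio)
}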
   726  
   727  func estimatePhysicalSizes(
   728  	fileSize uint64, props *sstable.CommonProperties,
   729  ) (avgValLogicalSize, compressionRatio float64) {
   730  	// RawKeySize and RawValueSize are uncompressed totals. Scale according to
   731  	// the data size to account for compression, index blocks and metadata
   732  	// overhead. Eg:
   733  	//
   734  	//    Compression rate        ×  Average uncompressed value size
   735  	//
   736  	//                            ↓
   737  	//
   738  	//         FileSize              RawValSize
   739  	//   -----------------------  ×  ----------
   740  	//   RawKeySize+RawValueSize     NumEntries
   741  	//
   742  	uncompressedSum := props.RawKeySize + props.RawValueSize
   743  	compressionRatio = float64(fileSize) / float64(uncompressedSum)
   744  	avgValLogicalSize = (float64(props.RawValueSize) / float64(props.NumEntries))
   745  	return avgValLogicalSize, compressionRatio
   746  }
   747  
   748  // newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
   749  // returns "ranged deletion" spans for a single table, providing a combined view
   750  // of both range deletion and range key deletion spans. The
   751  // tableRangedDeletionIter is intended for use in the specific case of computing
   752  // the statistics and deleteCompactionHints for a single table.
   753  //
   754  // As an example, consider the following set of spans from the range deletion
   755  // and range key blocks of a table:
   756  //
   757  //		      |---------|     |---------|         |-------| RANGEKEYDELs
   758  //		|-----------|-------------|           |-----|       RANGEDELs
   759  //	  __________________________________________________________
   760  //		a b c d e f g h i j k l m n o p q r s t u v w x y z
   761  //
   762  // The tableRangedDeletionIter produces the following set of output spans, where
   763  // '1' indicates a span containing only range deletions, '2' is a span
   764  // containing only range key deletions, and '3' is a span containing a mixture
   765  // of both range deletions and range key deletions.
   766  //
   767  //		   1       3       1    3    2          1  3   2
   768  //		|-----|---------|-----|---|-----|     |---|-|-----|
   769  //	  __________________________________________________________
   770  //		a b c d e f g h i j k l m n o p q r s t u v w x y z
   771  //
   772  // Algorithm.
   773  //
   774  // The iterator first defragments the range deletion and range key blocks
   775  // separately. During this defragmentation, the range key block is also filtered
   776  // so that keys other than range key deletes are ignored. The range delete and
   777  // range key delete keyspaces are then merged.
   778  //
   779  // Note that the only fragmentation introduced by merging is from where a range
   780  // del span overlaps with a range key del span. Within the bounds of any overlap
   781  // there is guaranteed to be no further fragmentation, as the constituent spans
   782  // have already been defragmented. To the left and right of any overlap, the
   783  // same reasoning applies. For example,
   784  //
   785  //		         |--------|         |-------| RANGEKEYDEL
   786  //		|---------------------------|         RANGEDEL
   787  //		|----1---|----3---|----1----|---2---| Merged, fragmented spans.
   788  //	  __________________________________________________________
   789  //		a b c d e f g h i j k l m n o p q r s t u v w x y z
   790  //
   791  // Any fragmented abutting spans produced by the merging iter will be of
   792  // differing types (i.e. a transition from a span with homogenous key kinds to a
   793  // heterogeneous span, or a transition from a span with exclusively range dels
   794  // to a span with exclusively range key dels). Therefore, further
   795  // defragmentation is not required.
   796  //
   797  // Each span returned by the tableRangedDeletionIter will have at most four keys,
   798  // corresponding to the largest and smallest sequence numbers encountered across
   799  // the range deletes and range keys deletes that comprised the merged spans.
   800  func newCombinedDeletionKeyspanIter(
   801  	comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
   802  ) (keyspan.FragmentIterator, error) {
   803  	// The range del iter and range key iter are each wrapped in their own
   804  	// defragmenting iter. For each iter, abutting spans can always be merged.
   805  	var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
   806  	// Reduce keys by maintaining a slice of at most length two, corresponding to
   807  	// the largest and smallest keys in the defragmented span. This maintains the
   808  	// contract that the emitted slice is sorted by (SeqNum, Kind) descending.
   809  	reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
   810  		if len(current) == 0 && len(incoming) == 0 {
   811  			// While this should never occur in practice, a defensive return is used
   812  			// here to preserve correctness.
   813  			return current
   814  		}
   815  		var largest, smallest keyspan.Key
   816  		var set bool
   817  		for _, keys := range [2][]keyspan.Key{current, incoming} {
   818  			if len(keys) == 0 {
   819  				continue
   820  			}
   821  			first, last := keys[0], keys[len(keys)-1]
   822  			if !set {
   823  				largest, smallest = first, last
   824  				set = true
   825  				continue
   826  			}
   827  			if first.Trailer > largest.Trailer {
   828  				largest = first
   829  			}
   830  			if last.Trailer < smallest.Trailer {
   831  				smallest = last
   832  			}
   833  		}
   834  		if largest.Equal(comparer.Equal, smallest) {
   835  			current = append(current[:0], largest)
   836  		} else {
   837  			current = append(current[:0], largest, smallest)
   838  		}
   839  		return current
   840  	}
   841  
   842  	// The separate iters for the range dels and range keys are wrapped in a
   843  	// merging iter to join the keyspaces into a single keyspace. The separate
   844  	// iters are only added if the particular key kind is present.
   845  	mIter := &keyspan.MergingIter{}
   846  	var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
   847  		if in.KeysOrder != keyspan.ByTrailerDesc {
   848  			panic("pebble: combined deletion iter encountered keys in non-trailer descending order")
   849  		}
   850  		out.Start, out.End = in.Start, in.End
   851  		out.Keys = append(out.Keys[:0], in.Keys...)
   852  		out.KeysOrder = keyspan.ByTrailerDesc
   853  		// NB: The order of by-trailer descending may have been violated,
   854  		// because we've layered rangekey and rangedel iterators from the same
   855  		// sstable into the same keyspan.MergingIter. The MergingIter will
   856  		// return the keys in the order that the child iterators were provided.
   857  		// Sort the keys to ensure they're sorted by trailer descending.
   858  		keyspan.SortKeysByTrailer(&out.Keys)
   859  		return nil
   860  	})
   861  	mIter.Init(comparer.Compare, transform, new(keyspan.MergingBuffers))
   862  
   863  	iter, err := cr.NewRawRangeDelIter()
   864  	if err != nil {
   865  		return nil, err
   866  	}
   867  	if iter != nil {
   868  		dIter := &keyspan.DefragmentingIter{}
   869  		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
   870  		iter = dIter
   871  		// Truncate tombstones to the containing file's bounds if necessary.
   872  		// See docs/range_deletions.md for why this is necessary.
   873  		iter = keyspan.Truncate(
   874  			comparer.Compare, iter, m.Smallest.UserKey, m.Largest.UserKey,
   875  			nil, nil, false, /* panicOnUpperTruncate */
   876  		)
   877  		mIter.AddLevel(iter)
   878  	}
   879  
   880  	iter, err = cr.NewRawRangeKeyIter()
   881  	if err != nil {
   882  		return nil, err
   883  	}
   884  	if iter != nil {
   885  		// Wrap the range key iterator in a filter that elides keys other than range
   886  		// key deletions.
   887  		iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
   888  			out.Start, out.End = in.Start, in.End
   889  			out.Keys = out.Keys[:0]
   890  			for _, k := range in.Keys {
   891  				if k.Kind() != base.InternalKeyKindRangeKeyDelete {
   892  					continue
   893  				}
   894  				out.Keys = append(out.Keys, k)
   895  			}
   896  			return len(out.Keys) > 0
   897  		}, comparer.Compare)
   898  		dIter := &keyspan.DefragmentingIter{}
   899  		dIter.Init(comparer, iter, equal, reducer, new(keyspan.DefragmentingBuffers))
   900  		iter = dIter
   901  		mIter.AddLevel(iter)
   902  	}
   903  
   904  	return mIter, nil
   905  }
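
// An illustrative usage sketch (not part of the original file) mirroring how
// loadTableRangeDelStats consumes the combined iterator: each defragmented
// span is classified by the key kinds it carries, matching the 1/2/3 labels in
// the diagram above. The helper name is hypothetical.
func exampleClassifyDeletionSpans(
	comparer *base.Comparer, cr sstable.CommonReader, m *fileMetadata,
) (rangeDelOnly, rangeKeyDelOnly, mixed int, err error) {
	iter, err := newCombinedDeletionKeyspanIter(comparer, cr, m)
	if err != nil {
		return 0, 0, 0, err
	}
	defer iter.Close()
	for s := iter.First(); s != nil; s = iter.Next() {
		var hasRangeDel, hasRangeKeyDel bool
		for _, k := range s.Keys {
			switch k.Kind() {
			case base.InternalKeyKindRangeDelete:
				hasRangeDel = true
			case base.InternalKeyKindRangeKeyDelete:
				hasRangeKeyDel = true
			}
		}
		switch {
		case hasRangeDel && hasRangeKeyDel:
			mixed++
		case hasRangeDel:
			rangeDelOnly++
		default:
			rangeKeyDelOnly++
		}
	}
	return rangeDelOnly, rangeKeyDelOnly, mixed, nil
}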
   906  
   907  // rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
   908  // with the sum of the files' counts of range key fragments. Its annotation type
   909  // is a *uint64. The count of range key sets may change once a table's stats are
   910  // loaded asynchronously, so its values are marked as cacheable only if a file's
   911  // stats have been loaded.
   912  type rangeKeySetsAnnotator struct{}
   913  
   914  var _ manifest.Annotator = rangeKeySetsAnnotator{}
   915  
   916  func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
   917  	if dst == nil {
   918  		return new(uint64)
   919  	}
   920  	v := dst.(*uint64)
   921  	*v = 0
   922  	return v
   923  }
   924  
   925  func (a rangeKeySetsAnnotator) Accumulate(
   926  	f *fileMetadata, dst interface{},
   927  ) (v interface{}, cacheOK bool) {
   928  	vptr := dst.(*uint64)
   929  	*vptr = *vptr + f.Stats.NumRangeKeySets
   930  	return vptr, f.StatsValid()
   931  }
   932  
   933  func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
   934  	srcV := src.(*uint64)
   935  	dstV := dst.(*uint64)
   936  	*dstV = *dstV + *srcV
   937  	return dstV
   938  }
   939  
   940  // countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
   941  // files of the LSM. It only counts keys in files for which table stats have
   942  // been loaded. It uses a b-tree annotator to cache intermediate values between
   943  // calculations when possible.
   944  func countRangeKeySetFragments(v *version) (count uint64) {
   945  	for l := 0; l < numLevels; l++ {
   946  		if v.RangeKeyLevels[l].Empty() {
   947  			continue
   948  		}
   949  		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
   950  	}
   951  	return count
   952  }
   953  
   954  // tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
   955  // with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL
   956  // keys). Its annotation type is a *uint64. The count of tombstones may change
   957  // once a table's stats are loaded asynchronously, so its values are marked as
   958  // cacheable only if a file's stats have been loaded.
   959  type tombstonesAnnotator struct{}
   960  
   961  var _ manifest.Annotator = tombstonesAnnotator{}
   962  
   963  func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
   964  	if dst == nil {
   965  		return new(uint64)
   966  	}
   967  	v := dst.(*uint64)
   968  	*v = 0
   969  	return v
   970  }
   971  
   972  func (a tombstonesAnnotator) Accumulate(
   973  	f *fileMetadata, dst interface{},
   974  ) (v interface{}, cacheOK bool) {
   975  	vptr := dst.(*uint64)
   976  	*vptr = *vptr + f.Stats.NumDeletions
   977  	return vptr, f.StatsValid()
   978  }
   979  
   980  func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
   981  	srcV := src.(*uint64)
   982  	dstV := dst.(*uint64)
   983  	*dstV = *dstV + *srcV
   984  	return dstV
   985  }
   986  
   987  // countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
   988  // internal keys across all files of the LSM. It only counts keys in files for
   989  // which table stats have been loaded. It uses a b-tree annotator to cache
   990  // intermediate values between calculations when possible.
   991  func countTombstones(v *version) (count uint64) {
   992  	for l := 0; l < numLevels; l++ {
   993  		if v.Levels[l].Empty() {
   994  			continue
   995  		}
   996  		count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
   997  	}
   998  	return count
   999  }
  1000  
  1001  // valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
  1002  // nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
  1003  // type is a *uint64. The value block size may change once a table's stats are
  1004  // loaded asynchronously, so its values are marked as cacheable only if a
  1005  // file's stats have been loaded.
  1006  type valueBlocksSizeAnnotator struct{}
  1007  
  1008  var _ manifest.Annotator = valueBlocksSizeAnnotator{}
  1009  
  1010  func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
  1011  	if dst == nil {
  1012  		return new(uint64)
  1013  	}
  1014  	v := dst.(*uint64)
  1015  	*v = 0
  1016  	return v
  1017  }
  1018  
  1019  func (a valueBlocksSizeAnnotator) Accumulate(
  1020  	f *fileMetadata, dst interface{},
  1021  ) (v interface{}, cacheOK bool) {
  1022  	vptr := dst.(*uint64)
  1023  	*vptr = *vptr + f.Stats.ValueBlocksSize
  1024  	return vptr, f.StatsValid()
  1025  }
  1026  
  1027  func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
  1028  	srcV := src.(*uint64)
  1029  	dstV := dst.(*uint64)
  1030  	*dstV = *dstV + *srcV
  1031  	return dstV
  1032  }
  1033  
  1034  // valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
  1035  // files for a level of the LSM. It only includes the size for files for which
  1036  // table stats have been loaded. It uses a b-tree annotator to cache
  1037  // intermediate values between calculations when possible. It must not be
  1038  // called concurrently.
  1039  //
  1040  // REQUIRES: 0 <= level < numLevels.
  1041  func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
  1042  	if v.Levels[level].Empty() {
  1043  		return 0
  1044  	}
  1045  	return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
  1046  }
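
// The three annotators above share one pattern: Zero resets the aggregate,
// Accumulate folds in a single file's statistic and reports whether the value
// may be cached (only once the file's stats are loaded), and Merge combines
// subtree aggregates. The sketch below is illustrative only and not part of
// the original file: a hypothetical fourth annotator following the same
// pattern, summing PointDeletionsBytesEstimate across files.
type pointDeletionsBytesAnnotator struct{}

var _ manifest.Annotator = pointDeletionsBytesAnnotator{}

func (a pointDeletionsBytesAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a pointDeletionsBytesAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.PointDeletionsBytesEstimate
	return vptr, f.StatsValid()
}

func (a pointDeletionsBytesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}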