github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/table_stats.go

// Copyright 2020 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"fmt"
	"math"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/sstable"
)

// In-memory statistics about tables help inform compaction picking, but may
// be expensive to calculate or load from disk. Every time a database is
// opened, these statistics must be reloaded or recalculated. To minimize
// impact on user activity and compactions, we load these statistics
// asynchronously in the background and store loaded statistics in each
// table's *FileMetadata.
//
// This file implements the asynchronous loading of statistics by maintaining
// a list of files that require statistics, alongside their LSM levels.
// Whenever new files are added to the LSM, the files are appended to
// d.mu.tableStats.pending. If a stats collection job is not currently
// running, one is started in a separate goroutine.
//
// The stats collection job grabs and clears the pending list, computes table
// statistics relative to the current readState and updates the tables' file
// metadata. New pending files may accumulate during a stats collection job,
// so a completing job triggers a new job if necessary. Only one job runs at a
// time.
//
// When an existing database is opened, all files lack in-memory statistics.
// These files' stats are loaded incrementally whenever the pending list is
// empty by scanning a current readState for files missing statistics. Once a
// job completes a scan without finding any remaining files without
// statistics, it flips a `loadedInitial` flag. From then on, the stats
// collection job only needs to load statistics for new files appended to the
// pending list.

func (d *DB) maybeCollectTableStatsLocked() {
	if d.shouldCollectTableStatsLocked() {
		go d.collectTableStats()
	}
}

// updateTableStatsLocked is called when new files are introduced, after the
// read state has been updated. It may trigger a new stat collection.
// DB.mu must be locked when calling.
func (d *DB) updateTableStatsLocked(newFiles []manifest.NewFileEntry) {
	var needStats bool
	for _, nf := range newFiles {
		if !nf.Meta.StatsValidLocked() {
			needStats = true
			break
		}
	}
	if !needStats {
		return
	}

	d.mu.tableStats.pending = append(d.mu.tableStats.pending, newFiles...)
	d.maybeCollectTableStatsLocked()
}

func (d *DB) shouldCollectTableStatsLocked() bool {
	return !d.mu.tableStats.loading &&
		d.closed.Load() == nil &&
		!d.opts.private.disableTableStats &&
		(len(d.mu.tableStats.pending) > 0 || !d.mu.tableStats.loadedInitial)
}

// collectTableStats runs a table stats collection job, returning true if the
// invocation did the collection work, false otherwise (e.g. if another job was
// already running).
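// It is always launched from maybeCollectTableStatsLocked in its own
// goroutine; the d.mu.tableStats.loading flag ensures that at most one
// collection job runs at a time.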
func (d *DB) collectTableStats() bool {
	const maxTableStatsPerScan = 50

	d.mu.Lock()
	if !d.shouldCollectTableStatsLocked() {
		d.mu.Unlock()
		return false
	}

	pending := d.mu.tableStats.pending
	d.mu.tableStats.pending = nil
	d.mu.tableStats.loading = true
	jobID := d.mu.nextJobID
	d.mu.nextJobID++
	loadedInitial := d.mu.tableStats.loadedInitial
	// Drop DB.mu before performing IO.
	d.mu.Unlock()

	// Every run of collectTableStats either collects stats from the pending
	// list (if non-empty) or from scanning the version (loadedInitial is
	// false). This job only runs if at least one of those conditions holds.

	// Grab a read state to scan for tables.
	rs := d.loadReadState()
	var collected []collectedStats
	var hints []deleteCompactionHint
	if len(pending) > 0 {
		collected, hints = d.loadNewFileStats(rs, pending)
	} else {
		var moreRemain bool
		var buf [maxTableStatsPerScan]collectedStats
		collected, hints, moreRemain = d.scanReadStateTableStats(rs, buf[:0])
		loadedInitial = !moreRemain
	}
	rs.unref()

	// Update the FileMetadata with the loaded stats while holding d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()
	d.mu.tableStats.loading = false
	if loadedInitial && !d.mu.tableStats.loadedInitial {
		d.mu.tableStats.loadedInitial = loadedInitial
		d.opts.EventListener.TableStatsLoaded(TableStatsInfo{
			JobID: jobID,
		})
	}

	maybeCompact := false
	for _, c := range collected {
		c.fileMetadata.Stats = c.TableStats
		maybeCompact = maybeCompact || c.TableStats.RangeDeletionsBytesEstimate > 0
		c.fileMetadata.StatsMarkValid()
	}
	d.mu.tableStats.cond.Broadcast()
	d.maybeCollectTableStatsLocked()
	if len(hints) > 0 {
		// Verify that all of the hint tombstones' files still exist in the
		// current version. Otherwise, the tombstone itself may have been
		// compacted into L6 and more recent keys may have had their sequence
		// numbers zeroed.
		//
		// Note that it's possible that the tombstone file is being compacted
		// presently. In that case, the file will be present in v. When the
		// compaction finishes compacting the tombstone file, it will detect
		// and clear the hint.
		//
		// See DB.maybeUpdateDeleteCompactionHints.
		v := d.mu.versions.currentVersion()
		keepHints := hints[:0]
		for _, h := range hints {
			if v.Contains(h.tombstoneLevel, d.cmp, h.tombstoneFile) {
				keepHints = append(keepHints, h)
			}
		}
		d.mu.compact.deletionHints = append(d.mu.compact.deletionHints, keepHints...)
	}
	if maybeCompact {
		d.maybeScheduleCompaction()
	}
	return true
}

type collectedStats struct {
	*fileMetadata
	manifest.TableStats
}

func (d *DB) loadNewFileStats(
	rs *readState, pending []manifest.NewFileEntry,
) ([]collectedStats, []deleteCompactionHint) {
	var hints []deleteCompactionHint
	collected := make([]collectedStats, 0, len(pending))
	for _, nf := range pending {
		// A file's stats might have been populated by an earlier call to
		// loadNewFileStats if the file was moved.
		// NB: We're not holding d.mu which protects f.Stats, but only
		// collectTableStats updates f.Stats for active files, and we
		// ensure only one goroutine runs it at a time through
		// d.mu.tableStats.loading.
		if nf.Meta.StatsValidLocked() {
			continue
		}

		// The file isn't guaranteed to still be live in the readState's
		// version. It may have been deleted or moved. Skip it if it's not in
		// the expected level.
		if !rs.current.Contains(nf.Level, d.cmp, nf.Meta) {
			continue
		}

		stats, newHints, err := d.loadTableStats(rs.current, nf.Level, nf.Meta)
		if err != nil {
			d.opts.EventListener.BackgroundError(err)
			continue
		}
		// NB: We don't update the FileMetadata yet, because we aren't
		// holding DB.mu. We'll copy it to the FileMetadata after we're
		// finished with IO.
		collected = append(collected, collectedStats{
			fileMetadata: nf.Meta,
			TableStats:   stats,
		})
		hints = append(hints, newHints...)
	}
	return collected, hints
}

// scanReadStateTableStats is run by an active stat collection job when there
// are no pending new files, but there might be files that existed at Open for
// which we haven't loaded table stats.
func (d *DB) scanReadStateTableStats(
	rs *readState, fill []collectedStats,
) ([]collectedStats, []deleteCompactionHint, bool) {
	moreRemain := false
	var hints []deleteCompactionHint
	for l, levelMetadata := range rs.current.Levels {
		iter := levelMetadata.Iter()
		for f := iter.First(); f != nil; f = iter.Next() {
			// NB: We're not holding d.mu which protects f.Stats, but only the
			// active stats collection job updates f.Stats for active files,
			// and we ensure only one goroutine runs it at a time through
			// d.mu.tableStats.loading. This makes it safe to read validity
			// through f.Stats.ValidLocked despite not holding d.mu.
			if f.StatsValidLocked() {
				continue
			}

			// Limit how much work we do per read state. The older the read
			// state is, the higher the likelihood files are no longer being
			// used in the current version. If we've exhausted our allowance,
			// return true for the last return value to signal there's more
			// work to do.
			if len(fill) == cap(fill) {
				moreRemain = true
				return fill, hints, moreRemain
			}

			stats, newHints, err := d.loadTableStats(rs.current, l, f)
			if err != nil {
				// Set `moreRemain` so we'll try again.
				moreRemain = true
				d.opts.EventListener.BackgroundError(err)
				continue
			}
			fill = append(fill, collectedStats{
				fileMetadata: f,
				TableStats:   stats,
			})
			hints = append(hints, newHints...)
		}
	}
	return fill, hints, moreRemain
}

func (d *DB) loadTableStats(
	v *version, level int, meta *fileMetadata,
) (manifest.TableStats, []deleteCompactionHint, error) {
	var stats manifest.TableStats
	var compactionHints []deleteCompactionHint
	err := d.tableCache.withReader(meta, func(r *sstable.Reader) (err error) {
		stats.NumEntries = r.Properties.NumEntries
		stats.NumDeletions = r.Properties.NumDeletions
		if r.Properties.NumPointDeletions() > 0 {
			if err = d.loadTablePointKeyStats(r, v, level, meta, &stats); err != nil {
				return
			}
		}
		if r.Properties.NumRangeDeletions > 0 || r.Properties.NumRangeKeyDels > 0 {
			if compactionHints, err = d.loadTableRangeDelStats(r, v, level, meta, &stats); err != nil {
				return
			}
		}
		// TODO(travers): Once we have real-world data, consider collecting
		// additional stats that may provide improved heuristics for compaction
		// picking.
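		// NumRangeKeySets is copied from the table properties unconditionally;
		// it is later aggregated across the LSM by countRangeKeySetFragments
		// via the rangeKeySetsAnnotator.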
		stats.NumRangeKeySets = r.Properties.NumRangeKeySets
		return
	})
	if err != nil {
		return stats, nil, err
	}
	return stats, compactionHints, nil
}

// loadTablePointKeyStats calculates the point key statistics for the given
// table. The provided manifest.TableStats are updated.
func (d *DB) loadTablePointKeyStats(
	r *sstable.Reader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) error {
	// TODO(jackson): If the file has a wide keyspace, the average
	// value size beneath the entire file might not be representative
	// of the size of the keys beneath the point tombstones.
	// We could write the ranges of 'clusters' of point tombstones to
	// an sstable property and call averageValueSizeBeneath for each of
	// these narrower ranges to improve the estimate.
	avgKeySize, avgValSize, err := d.averageEntrySizeBeneath(v, level, meta)
	if err != nil {
		return err
	}
	stats.PointDeletionsBytesEstimate =
		pointDeletionsBytesEstimate(&r.Properties, avgKeySize, avgValSize)
	return nil
}

// loadTableRangeDelStats calculates the range deletion and range key deletion
// statistics for the given table.
func (d *DB) loadTableRangeDelStats(
	r *sstable.Reader, v *version, level int, meta *fileMetadata, stats *manifest.TableStats,
) ([]deleteCompactionHint, error) {
	iter, err := newCombinedDeletionKeyspanIter(d.opts.Comparer, r, meta)
	if err != nil {
		return nil, err
	}
	defer iter.Close()
	var compactionHints []deleteCompactionHint
	// We iterate over the defragmented range tombstones and range key deletions,
	// which ensures we don't double count ranges deleted at different sequence
	// numbers. Also, merging abutting tombstones reduces the number of calls to
	// estimateReclaimedSizeBeneath which is costly, and improves the accuracy of
	// our overall estimate.
	for s := iter.First(); s != nil; s = iter.Next() {
		start, end := s.Start, s.End
		// We only need to consider deletion size estimates for tables that contain
		// point keys.
		var hasPoints bool
		for _, k := range s.Keys {
			if k.Kind() == base.InternalKeyKindRangeDelete {
				hasPoints = true
				break
			}
		}

		// If the file is in the last level of the LSM, there is no data beneath
		// it. The fact that there is still a range tombstone in a bottommost file
		// suggests that an open snapshot kept the tombstone around. Estimate disk
		// usage within the file itself.
		// NOTE: If the span `s` wholly contains a table containing range keys,
		// the returned size estimate will be slightly inflated by the range key
		// block. However, in practice, range keys are expected to be rare, and
		// the size of the range key block relative to the overall size of the
		// table is expected to be small.
		if hasPoints && level == numLevels-1 {
			size, err := r.EstimateDiskUsage(start, end)
			if err != nil {
				return nil, err
			}
			stats.RangeDeletionsBytesEstimate += size

			// As the file is in the bottommost level, there is no need to collect a
			// deletion hint.
			continue
		}

		// While the size estimates for point keys should only be updated if this
		// span contains a range del, the sequence numbers are required for the
		// hint. Unconditionally descend, but conditionally update the estimates.
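		// compactionHintFromKeys classifies the span by the kinds of keys it
		// contains (point deletions only, range key deletions only, or a mix).
		// estimateReclaimedSizeBeneath uses this classification to decide
		// whether an overlapping file contributes to the size estimate, the
		// hint sequence number, or both.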
		hintType := compactionHintFromKeys(s.Keys)
		estimate, hintSeqNum, err := d.estimateReclaimedSizeBeneath(v, level, start, end, hintType)
		if err != nil {
			return nil, err
		}
		stats.RangeDeletionsBytesEstimate += estimate

		// If any files were completely contained within the range,
		// hintSeqNum is the smallest sequence number contained in any
		// such file.
		if hintSeqNum == math.MaxUint64 {
			continue
		}
		hint := deleteCompactionHint{
			hintType:                hintType,
			start:                   make([]byte, len(start)),
			end:                     make([]byte, len(end)),
			tombstoneFile:           meta,
			tombstoneLevel:          level,
			tombstoneLargestSeqNum:  s.LargestSeqNum(),
			tombstoneSmallestSeqNum: s.SmallestSeqNum(),
			fileSmallestSeqNum:      hintSeqNum,
		}
		copy(hint.start, start)
		copy(hint.end, end)
		compactionHints = append(compactionHints, hint)
	}
	return compactionHints, err
}

func (d *DB) averageEntrySizeBeneath(
	v *version, level int, meta *fileMetadata,
) (avgKeySize, avgValueSize uint64, err error) {
	// Find all files in lower levels that overlap with meta,
	// summing their value sizes and entry counts.
	var fileSum, keySum, valSum, entryCount uint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, meta.Smallest.UserKey,
			meta.Largest.UserKey, meta.Largest.IsExclusiveSentinel())
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			err := d.tableCache.withReader(file, func(r *sstable.Reader) (err error) {
				fileSum += file.Size
				entryCount += r.Properties.NumEntries
				keySum += r.Properties.RawKeySize
				valSum += r.Properties.RawValueSize
				return nil
			})
			if err != nil {
				return 0, 0, err
			}
		}
	}
	if entryCount == 0 {
		return 0, 0, nil
	}
	// RawKeySize and RawValueSize are uncompressed totals. Scale them
	// according to the data size to account for compression, index blocks and
	// metadata overhead. Eg:
	//
	//      Compression rate      ×  Average uncompressed key size
	//
	//                 ↓
	//
	//           FileSize              RawKeySize
	//   -----------------------  ×  ----------
	//   RawKeySize+RawValueSize      NumEntries
	//
	// We refactor the calculation to avoid error from rounding/truncation.
	totalSizePerEntry := fileSum / entryCount
	uncompressedSum := keySum + valSum
	avgKeySize = keySum * totalSizePerEntry / uncompressedSum
	avgValueSize = valSum * totalSizePerEntry / uncompressedSum
	return avgKeySize, avgValueSize, err
}

func (d *DB) estimateReclaimedSizeBeneath(
	v *version, level int, start, end []byte, hintType deleteCompactionHintType,
) (estimate uint64, hintSeqNum uint64, err error) {
	// Find all files in lower levels that overlap with the deleted range
	// [start, end).
	//
	// An overlapping file might be completely contained by the range
	// tombstone, in which case we can count the entire file size in
	// our estimate without doing any additional I/O.
	//
	// Otherwise, estimating the range for the file requires
	// additional I/O to read the file's index blocks.
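	// hintSeqNum is initialized to math.MaxUint64 as a sentinel. It is lowered
	// to the smallest sequence number of any file wholly contained within
	// [start, end) whose key kinds allow it (see updateHints below). If it is
	// still math.MaxUint64 on return, the caller does not record a deletion
	// hint for this span.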
	hintSeqNum = math.MaxUint64
	for l := level + 1; l < numLevels; l++ {
		overlaps := v.Overlaps(l, d.cmp, start, end, true /* exclusiveEnd */)
		iter := overlaps.Iter()
		for file := iter.First(); file != nil; file = iter.Next() {
			startCmp := d.cmp(start, file.Smallest.UserKey)
			endCmp := d.cmp(file.Largest.UserKey, end)
			if startCmp <= 0 && (endCmp < 0 || endCmp == 0 && file.Largest.IsExclusiveSentinel()) {
				// The range fully contains the file, so skip looking it up in table
				// cache/looking at its indexes and add the full file size. Whether the
				// disk estimate and hint seqnums are updated depends on a) the type of
				// hint that requested the estimate and b) the keys contained in this
				// current file.
				var updateEstimates, updateHints bool
				switch hintType {
				case deleteCompactionHintTypePointKeyOnly:
					// The range deletion byte estimates should only be updated if this
					// table contains point keys. This ends up being an overestimate in
					// the case that the table also has range keys, but such keys are
					// expected to contribute a negligible amount of the table's overall
					// size, relative to point keys.
					if file.HasPointKeys {
						updateEstimates = true
					}
					// As the initiating span contained only range dels, hints can only be
					// updated if this table does _not_ contain range keys.
					if !file.HasRangeKeys {
						updateHints = true
					}
				case deleteCompactionHintTypeRangeKeyOnly:
					// The initiating span contained only range key dels. The estimates
					// apply only to point keys, and are therefore not updated.
					updateEstimates = false
					// As the initiating span contained only range key dels, hints can
					// only be updated if this table does _not_ contain point keys.
					if !file.HasPointKeys {
						updateHints = true
					}
				case deleteCompactionHintTypePointAndRangeKey:
					// Always update the estimates and hints, as this hint type can drop a
					// file, irrespective of the mixture of keys. Similar to above, the
					// range del bytes estimate is an overestimate.
					updateEstimates, updateHints = true, true
				default:
					panic(fmt.Sprintf("bitalostable: unknown hint type %s", hintType))
				}
				if updateEstimates {
					estimate += file.Size
				}
				if updateHints && hintSeqNum > file.SmallestSeqNum {
					hintSeqNum = file.SmallestSeqNum
				}
			} else if d.cmp(file.Smallest.UserKey, end) <= 0 && d.cmp(start, file.Largest.UserKey) <= 0 {
				// Partial overlap.
				if hintType == deleteCompactionHintTypeRangeKeyOnly {
					// If the hint that generated this overlap contains only range keys,
					// there is no need to calculate disk usage, as the reclaimable space
					// is expected to be minimal relative to point keys.
					continue
				}
				var size uint64
				err := d.tableCache.withReader(file, func(r *sstable.Reader) (err error) {
					size, err = r.EstimateDiskUsage(start, end)
					return err
				})
				if err != nil {
					return 0, hintSeqNum, err
				}
				estimate += size
			}
		}
	}
	return estimate, hintSeqNum, nil
}

func maybeSetStatsFromProperties(meta *fileMetadata, props *sstable.Properties) bool {
	// If a table contains range deletions or range key deletions, we defer the
	// stats collection. There are two main reasons for this:
	//
	// 1. Estimating the potential for reclaimed space due to a range deletion
	//    tombstone requires scanning the LSM - a potentially expensive operation
	//    that should be deferred.
	// 2. Range deletions and / or range key deletions present an opportunity to
	//    compute "deletion hints", which also requires a scan of the LSM to
	//    compute tables that would be eligible for deletion.
	//
	// These two tasks are deferred to the table stats collector goroutine.
	if props.NumRangeDeletions != 0 || props.NumRangeKeyDels != 0 {
		return false
	}

	// If a table is more than 10% point deletions, don't calculate the
	// PointDeletionsBytesEstimate statistic using our limited knowledge. The
	// table stats collector can populate the stats and calculate an average
	// value size across all the tables beneath the table in the LSM, which
	// will be more accurate.
	if props.NumDeletions > props.NumEntries/10 {
		return false
	}

	var pointEstimate uint64
	if props.NumEntries > 0 {
		// Use the file's own average key and value sizes as an estimate. This
		// doesn't require any additional IO and since the number of point
		// deletions in the file is low, the error introduced by this crude
		// estimate is expected to be small.
		avgKeySize, avgValSize := estimateEntrySizes(meta.Size, props)
		pointEstimate = pointDeletionsBytesEstimate(props, avgKeySize, avgValSize)
	}

	meta.Stats.NumEntries = props.NumEntries
	meta.Stats.NumDeletions = props.NumDeletions
	meta.Stats.NumRangeKeySets = props.NumRangeKeySets
	meta.Stats.PointDeletionsBytesEstimate = pointEstimate
	meta.Stats.RangeDeletionsBytesEstimate = 0
	meta.StatsMarkValid()
	return true
}

func pointDeletionsBytesEstimate(props *sstable.Properties, avgKeySize, avgValSize uint64) uint64 {
	if props.NumEntries == 0 {
		return 0
	}
	// Estimate the potential space to reclaim using the table's own
	// properties. There may or may not be keys covered by any individual
	// point tombstone. If not, compacting the point tombstone into L6 will at
	// least allow us to drop the point deletion key and will reclaim the key
	// bytes. If there are covered key(s), we also get to drop key and value
	// bytes for each covered key.
	//
	// We estimate assuming that each point tombstone on average covers 1 key.
	// This is almost certainly an overestimate, but that's probably okay
	// because point tombstones can slow range iterations even when they don't
	// cover a key. It may be beneficial in the future to more accurately
	// estimate which tombstones cover keys and which do not.
	numPointDels := props.NumPointDeletions()
	return numPointDels*avgKeySize + numPointDels*(avgKeySize+avgValSize)
}

func estimateEntrySizes(
	fileSize uint64, props *sstable.Properties,
) (avgKeySize, avgValSize uint64) {
	// RawKeySize and RawValueSize are uncompressed totals. Scale them
	// according to the data size to account for compression, index blocks and
	// metadata overhead. Eg:
	//
	//      Compression rate      ×  Average uncompressed key size
	//
	//                 ↓
	//
	//           FileSize              RawKeySize
	//   -----------------------  ×  ----------
	//   RawKeySize+RawValueSize      NumEntries
	//
	// We refactor the calculation to avoid error from rounding/truncation.
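	//
	// For example, with purely illustrative numbers: if fileSize is 1<<20,
	// RawKeySize is 2<<20, RawValueSize is 6<<20 and NumEntries is 100000,
	// then fileSizePerEntry = 10 and the scaled averages below come out to
	// avgKeySize = 2 and avgValSize = 7 bytes after integer truncation.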
	fileSizePerEntry := fileSize / props.NumEntries
	uncompressedSum := props.RawKeySize + props.RawValueSize
	avgKeySize = props.RawKeySize * fileSizePerEntry / uncompressedSum
	avgValSize = props.RawValueSize * fileSizePerEntry / uncompressedSum
	return avgKeySize, avgValSize
}

// newCombinedDeletionKeyspanIter returns a keyspan.FragmentIterator that
// returns "ranged deletion" spans for a single table, providing a combined view
// of both range deletion and range key deletion spans. The
// tableRangedDeletionIter is intended for use in the specific case of computing
// the statistics and deleteCompactionHints for a single table.
//
// As an example, consider the following set of spans from the range deletion
// and range key blocks of a table:
//
//	      |---------|     |---------|         |-------| RANGEKEYDELs
//	|-----------|-------------|           |-----|       RANGEDELs
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// The tableRangedDeletionIter produces the following set of output spans, where
// '1' indicates a span containing only range deletions, '2' is a span
// containing only range key deletions, and '3' is a span containing a mixture
// of both range deletions and range key deletions.
//
//	   1       3       1    3    2          1  3   2
//	|-----|---------|-----|---|-----|     |---|-|-----|
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Algorithm.
//
// The iterator first defragments the range deletion and range key blocks
// separately. During this defragmentation, the range key block is also filtered
// so that keys other than range key deletes are ignored. The range delete and
// range key delete keyspaces are then merged.
//
// Note that the only fragmentation introduced by merging is from where a range
// del span overlaps with a range key del span. Within the bounds of any overlap
// there is guaranteed to be no further fragmentation, as the constituent spans
// have already been defragmented. To the left and right of any overlap, the
// same reasoning applies. For example,
//
//	         |--------|         |-------| RANGEKEYDEL
//	|---------------------------|         RANGEDEL
//	|----1---|----3---|----1----|---2---| Merged, fragmented spans.
//
//	__________________________________________________________
//
//	a b c d e f g h i j k l m n o p q r s t u v w x y z
//
// Any fragmented abutting spans produced by the merging iter will be of
// differing types (i.e. a transition from a span with homogenous key kinds to a
// heterogeneous span, or a transition from a span with exclusively range dels
// to a span with exclusively range key dels). Therefore, further
// defragmentation is not required.
//
// Each span returned by the tableRangedDeletionIter will have at most four keys,
// corresponding to the largest and smallest sequence numbers encountered across
// the range deletes and range key deletes that comprised the merged spans.
func newCombinedDeletionKeyspanIter(
	comparer *base.Comparer, r *sstable.Reader, m *fileMetadata,
) (keyspan.FragmentIterator, error) {
	// The range del iter and range key iter are each wrapped in their own
	// defragmenting iter. For each iter, abutting spans can always be merged.
	var equal = keyspan.DefragmentMethodFunc(func(_ base.Equal, a, b *keyspan.Span) bool { return true })
	// Reduce keys by maintaining a slice of at most length two, corresponding to
	// the largest and smallest keys in the defragmented span. This maintains the
	// contract that the emitted slice is sorted by (SeqNum, Kind) descending.
	reducer := func(current, incoming []keyspan.Key) []keyspan.Key {
		if len(current) == 0 && len(incoming) == 0 {
			// While this should never occur in practice, a defensive return is used
			// here to preserve correctness.
			return current
		}
		var largest, smallest keyspan.Key
		var set bool
		for _, keys := range [2][]keyspan.Key{current, incoming} {
			if len(keys) == 0 {
				continue
			}
			first, last := keys[0], keys[len(keys)-1]
			if !set {
				largest, smallest = first, last
				set = true
				continue
			}
			if first.Trailer > largest.Trailer {
				largest = first
			}
			if last.Trailer < smallest.Trailer {
				smallest = last
			}
		}
		if largest.Equal(comparer.Equal, smallest) {
			current = append(current[:0], largest)
		} else {
			current = append(current[:0], largest, smallest)
		}
		return current
	}

	// The separate iters for the range dels and range keys are wrapped in a
	// merging iter to join the keyspaces into a single keyspace. The separate
	// iters are only added if the particular key kind is present.
	mIter := &keyspan.MergingIter{}
	var transform = keyspan.TransformerFunc(func(cmp base.Compare, in keyspan.Span, out *keyspan.Span) error {
		if in.KeysOrder != keyspan.ByTrailerDesc {
			panic("bitalostable: combined deletion iter encountered keys in non-trailer descending order")
		}
		out.Start, out.End = in.Start, in.End
		out.Keys = append(out.Keys[:0], in.Keys...)
		out.KeysOrder = keyspan.ByTrailerDesc
		// NB: The order of by-trailer descending may have been violated,
		// because we've layered rangekey and rangedel iterators from the same
		// sstable into the same keyspan.MergingIter. The MergingIter will
		// return the keys in the order that the child iterators were provided.
		// Sort the keys to ensure they're sorted by trailer descending.
		keyspan.SortKeysByTrailer(&out.Keys)
		return nil
	})
	mIter.Init(comparer.Compare, transform)

	iter, err := r.NewRawRangeDelIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer)
		iter = dIter
		// Truncate tombstones to the containing file's bounds if necessary.
		// See docs/range_deletions.md for why this is necessary.
		iter = keyspan.Truncate(
			comparer.Compare, iter, m.Smallest.UserKey, m.Largest.UserKey, nil, nil,
		)
		mIter.AddLevel(iter)
	}

	iter, err = r.NewRawRangeKeyIter()
	if err != nil {
		return nil, err
	}
	if iter != nil {
		// Wrap the range key iterator in a filter that elides keys other than range
		// key deletions.
		iter = keyspan.Filter(iter, func(in *keyspan.Span, out *keyspan.Span) (keep bool) {
			out.Start, out.End = in.Start, in.End
			out.Keys = out.Keys[:0]
			for _, k := range in.Keys {
				if k.Kind() != base.InternalKeyKindRangeKeyDelete {
					continue
				}
				out.Keys = append(out.Keys, k)
			}
			return len(out.Keys) > 0
		})
		dIter := &keyspan.DefragmentingIter{}
		dIter.Init(comparer, iter, equal, reducer)
		iter = dIter
		mIter.AddLevel(iter)
	}

	return mIter, nil
}

// rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes
// with the sum of the files' counts of range key fragments. Its annotation type
// is a *uint64. The count of range key sets may change once a table's stats are
// loaded asynchronously, so its values are marked as cacheable only if a file's
// stats have been loaded.
type rangeKeySetsAnnotator struct{}

var _ manifest.Annotator = rangeKeySetsAnnotator{}

func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
	if dst == nil {
		return new(uint64)
	}
	v := dst.(*uint64)
	*v = 0
	return v
}

func (a rangeKeySetsAnnotator) Accumulate(
	f *fileMetadata, dst interface{},
) (v interface{}, cacheOK bool) {
	vptr := dst.(*uint64)
	*vptr = *vptr + f.Stats.NumRangeKeySets
	return vptr, f.StatsValidLocked()
}

func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
	srcV := src.(*uint64)
	dstV := dst.(*uint64)
	*dstV = *dstV + *srcV
	return dstV
}

// countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
// files of the LSM. It only counts keys in files for which table stats have
// been loaded. It uses a b-tree annotator to cache intermediate values between
// calculations when possible.
func countRangeKeySetFragments(v *version) (count uint64) {
	for l := 0; l < numLevels; l++ {
		if v.RangeKeyLevels[l].Empty() {
			continue
		}
		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
	}
	return count
}