github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/compactor/compactor.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package compactor
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/keys"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    20  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    21  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    22  	"github.com/cockroachdb/cockroach/pkg/storage"
    23  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    29  	"github.com/cockroachdb/errors"
    30  	"github.com/cockroachdb/logtags"
    31  )
    32  
// storeCapacityFunc reports the current capacity of the store. The compactor
// calls it once per processing pass and compares the LogicalBytes and
// Available fields of the result against the fractional size thresholds.
type storeCapacityFunc func() (roachpb.StoreCapacity, error)
    34  
// doneCompactingFunc is an optional callback that the compactor invokes after
// a CompactRange call on the engine has succeeded. A nil value is allowed and
// simply means no callback is made.
type doneCompactingFunc func(ctx context.Context)
    36  
// A Compactor records suggested compactions and periodically
// makes requests to the engine to reclaim storage space.
type Compactor struct {
	// st supplies the settings values read by the accessor methods below
	// (enabled, minInterval, thresholds, maxAge) and the tracer.
	st *cluster.Settings
	// eng is the storage engine holding both the persisted suggestions and
	// the data that gets compacted.
	eng storage.Engine
	// capFn returns the store capacity used for the fractional thresholds.
	capFn storeCapacityFunc
	// doneFn, if non-nil, is called after each successful compaction.
	doneFn doneCompactingFunc
	// ch wakes up the processing loop; NewCompactor creates it with a buffer
	// of one so that poke never blocks.
	ch chan struct{}
	// Metrics holds the compactor's counters and gauges.
	Metrics Metrics
}
    47  
    48  // NewCompactor returns a compactor for the specified storage engine.
    49  func NewCompactor(
    50  	st *cluster.Settings, eng storage.Engine, capFn storeCapacityFunc, doneFn doneCompactingFunc,
    51  ) *Compactor {
    52  	return &Compactor{
    53  		st:      st,
    54  		eng:     eng,
    55  		capFn:   capFn,
    56  		doneFn:  doneFn,
    57  		ch:      make(chan struct{}, 1),
    58  		Metrics: makeMetrics(),
    59  	}
    60  }
    61  
    62  func (c *Compactor) enabled() bool {
    63  	return enabled.Get(&c.st.SV)
    64  }
    65  
    66  func (c *Compactor) minInterval() time.Duration {
    67  	return minInterval.Get(&c.st.SV)
    68  }
    69  
    70  func (c *Compactor) thresholdBytes() int64 {
    71  	return thresholdBytes.Get(&c.st.SV)
    72  }
    73  
    74  func (c *Compactor) thresholdBytesUsedFraction() float64 {
    75  	return thresholdBytesUsedFraction.Get(&c.st.SV)
    76  }
    77  
    78  func (c *Compactor) thresholdBytesAvailableFraction() float64 {
    79  	return thresholdBytesAvailableFraction.Get(&c.st.SV)
    80  }
    81  
    82  func (c *Compactor) maxAge() time.Duration {
    83  	return maxSuggestedCompactionRecordAge.Get(&c.st.SV)
    84  }
    85  
    86  // poke instructs the compactor's main loop to react to new suggestions in a
    87  // timely manner.
    88  func (c *Compactor) poke() {
    89  	select {
    90  	case c.ch <- struct{}{}:
    91  	default:
    92  	}
    93  }
    94  
// Start launches a compaction processing goroutine and exits when the
// provided stopper indicates. Suggestions are processed c.minInterval() after
// a new one has been noticed; when nothing new is pending the queue is
// revisited after c.maxAge() to clean up suggestions that were skipped.
//
// The error returned by stopper.RunTask is deliberately discarded, so Start
// does not report whether the worker was actually launched.
func (c *Compactor) Start(ctx context.Context, stopper *stop.Stopper) {
	ctx = logtags.AddTag(ctx, "compactor", "")

	// Wake up immediately to examine the queue and set the bytes queued metric.
	// Note that the compactor may have received suggestions before having been
	// started (this isn't great, but it's how it is right now).
	c.poke()

	// Run the Worker in a Task because the worker holds on to the engine and
	// may still access it even though the stopper has allowed it to close.
	_ = stopper.RunTask(ctx, "compactor", func(ctx context.Context) {
		stopper.RunWorker(ctx, func(ctx context.Context) {
			var timer timeutil.Timer
			defer timer.Stop()

			// The above timer will either be on c.minInterval() or c.maxAge(). The
			// former applies if we know there are new suggestions waiting to be
			// inspected: we want to look at them soon, but also want to make sure
			// "related" suggestions arrive before we start compacting. When no new
			// suggestions have been made since the last inspection, the expectation
			// is that all we have to do is clean up any previously skipped ones (at
			// least after sufficient time has passed), and so we wait out the max age.
			//
			// isFast records that the timer is currently armed with c.minInterval(),
			// so that further pokes do not keep pushing the deadline out.
			var isFast bool

			// NB: every `break` below leaves only the enclosing select, not this
			// loop; the loop ends solely through the ShouldStop case.
			for {
				select {
				case <-stopper.ShouldStop():
					return

				case <-c.ch:
					// A new suggestion was made. Examine the compaction queue,
					// which returns the number of bytes queued.
					if bytesQueued, err := c.examineQueue(ctx); err != nil {
						// On error we cannot tell whether work is pending, so fall
						// through and arm the timer anyway.
						log.Warningf(ctx, "failed check whether compaction suggestions exist: %+v", err)
					} else if bytesQueued > 0 {
						log.VEventf(ctx, 3, "compactor starting in %s as there are suggested compactions pending", c.minInterval())
					} else {
						// Queue is empty, don't set the timer. This can happen only at startup.
						break
					}
					// Set the wait timer if not already set.
					if !isFast {
						isFast = true
						timer.Reset(c.minInterval())
					}

				case <-timer.C:
					// Mark the timer's channel as drained before the timer is
					// reset below.
					timer.Read = true
					ok, err := c.processSuggestions(ctx)
					if err != nil {
						log.Warningf(ctx, "failed processing suggested compactions: %+v", err)
					}
					if ok {
						// The queue was processed, so either it's empty or contains suggestions
						// that were skipped for now. Revisit when they are certainly expired.
						isFast = false
						timer.Reset(c.maxAge())
						break
					}
					// More work to do, revisit after minInterval. Note that basically
					// `ok == (err == nil)` but this refactor is left for a future commit.
					isFast = true
					timer.Reset(c.minInterval())
				}
			}
		})
	})
}
   166  
// aggregatedCompaction is a utility struct that holds information
// about aggregated suggested compactions.
type aggregatedCompaction struct {
	// SuggestedCompaction describes the aggregate as a whole: it starts out as
	// the first suggestion, and aggregateCompaction extends EndKey and adds to
	// Bytes as further suggestions are merged in.
	kvserverpb.SuggestedCompaction
	// suggestions lists the individual suggestions merged into the aggregate,
	// in the order they were added.
	suggestions []kvserverpb.SuggestedCompaction
	// startIdx is the zero-based position of the first merged suggestion
	// within the processing pass; String prints it one-based.
	startIdx int
	// total is the number of suggestions considered in the processing pass.
	total int
}
   175  
   176  func initAggregatedCompaction(
   177  	startIdx, total int, sc kvserverpb.SuggestedCompaction,
   178  ) aggregatedCompaction {
   179  	return aggregatedCompaction{
   180  		SuggestedCompaction: sc,
   181  		suggestions:         []kvserverpb.SuggestedCompaction{sc},
   182  		startIdx:            startIdx,
   183  		total:               total,
   184  	}
   185  }
   186  
   187  func (aggr aggregatedCompaction) String() string {
   188  	var seqFmt string
   189  	if len(aggr.suggestions) == 1 {
   190  		seqFmt = fmt.Sprintf("#%d/%d", aggr.startIdx+1, aggr.total)
   191  	} else {
   192  		seqFmt = fmt.Sprintf("#%d-%d/%d", aggr.startIdx+1, aggr.startIdx+len(aggr.suggestions), aggr.total)
   193  	}
   194  	return fmt.Sprintf("%s (%s-%s) for %s", seqFmt, aggr.StartKey, aggr.EndKey, humanizeutil.IBytes(aggr.Bytes))
   195  }
   196  
   197  // processSuggestions considers all suggested compactions and
   198  // processes contiguous or nearly contiguous aggregations if they
   199  // exceed the absolute or fractional size thresholds. If suggested
   200  // compactions don't meet thresholds, they're discarded if they're
   201  // older than maxSuggestedCompactionRecordAge. Returns a boolean
   202  // indicating whether the queue was successfully processed.
   203  func (c *Compactor) processSuggestions(ctx context.Context) (bool, error) {
   204  	ctx, cleanup := tracing.EnsureContext(ctx, c.st.Tracer, "process suggested compactions")
   205  	defer cleanup()
   206  
   207  	suggestions, totalBytes, err := c.fetchSuggestions(ctx)
   208  	if err != nil {
   209  		return false, err
   210  	}
   211  
   212  	// Update at start of processing. Note that totalBytes is decremented and
   213  	// updated after any compactions which are processed.
   214  	c.Metrics.BytesQueued.Update(totalBytes)
   215  
   216  	if len(suggestions) == 0 {
   217  		return false, nil
   218  	}
   219  
   220  	log.Eventf(ctx, "considering %d suggested compaction(s)", len(suggestions))
   221  
   222  	// Determine whether to attempt a compaction to reclaim space during
   223  	// this processing. The decision is based on total bytes to free up
   224  	// and the time since the last processing.
   225  	capacity, err := c.capFn()
   226  	if err != nil {
   227  		return false, err
   228  	}
   229  
   230  	// Get information about SSTables in the underlying RocksDB instance.
   231  	ssti := storage.NewSSTableInfosByLevel(c.eng.GetSSTables())
   232  
   233  	// Update the bytes queued metric based, periodically querying the persisted
   234  	// suggestions so that we pick up newly added suggestions in the case where
   235  	// we're processing a large number of suggestions.
   236  	lastUpdate := timeutil.Now()
   237  	updateBytesQueued := func(delta int64) error {
   238  		totalBytes -= delta
   239  		if timeutil.Since(lastUpdate) >= 10*time.Second {
   240  			lastUpdate = timeutil.Now()
   241  			bytes, err := c.examineQueue(ctx)
   242  			if err != nil {
   243  				return err
   244  			}
   245  			totalBytes = bytes
   246  			// NB: examineQueue updates the BytesQueued metric.
   247  		} else {
   248  			c.Metrics.BytesQueued.Update(totalBytes)
   249  		}
   250  		return nil
   251  	}
   252  
   253  	// Iterate through suggestions, merging them into a running
   254  	// aggregation. Aggregates which exceed size thresholds are compacted. Small,
   255  	// isolated suggestions will be ignored until becoming too old, at which
   256  	// point they are discarded without compaction.
   257  	aggr := initAggregatedCompaction(0, len(suggestions), suggestions[0])
   258  	for i, sc := range suggestions[1:] {
   259  		// Aggregate current suggestion with running aggregate if possible. If
   260  		// the current suggestion cannot be merged with the aggregate, process
   261  		// it if it meets compaction thresholds.
   262  		if done := c.aggregateCompaction(ctx, ssti, &aggr, sc); done {
   263  			processedBytes, err := c.processCompaction(ctx, aggr, capacity)
   264  			if err != nil {
   265  				log.Errorf(ctx, "failed processing suggested compactions %+v: %+v", aggr, err)
   266  			} else if err := updateBytesQueued(processedBytes); err != nil {
   267  				log.Errorf(ctx, "failed updating bytes queued metric %+v", err)
   268  			}
   269  			// Reset aggregation to the last, un-aggregated, suggested compaction.
   270  			aggr = initAggregatedCompaction(i, len(suggestions), sc)
   271  		}
   272  	}
   273  	// Process remaining aggregated compaction.
   274  	processedBytes, err := c.processCompaction(ctx, aggr, capacity)
   275  	if err != nil {
   276  		return false, err
   277  	}
   278  	if err := updateBytesQueued(processedBytes); err != nil {
   279  		log.Errorf(ctx, "failed updating bytes queued metric %+v", err)
   280  	}
   281  
   282  	return true, nil
   283  }
   284  
   285  // fetchSuggestions loads the persisted suggested compactions from the store.
   286  func (c *Compactor) fetchSuggestions(
   287  	ctx context.Context,
   288  ) (suggestions []kvserverpb.SuggestedCompaction, totalBytes int64, err error) {
   289  	dataIter := c.eng.NewIterator(storage.IterOptions{
   290  		UpperBound: roachpb.KeyMax, // refined before every seek
   291  	})
   292  	defer dataIter.Close()
   293  
   294  	delBatch := c.eng.NewBatch()
   295  	defer delBatch.Close()
   296  
   297  	err = c.eng.Iterate(
   298  		keys.LocalStoreSuggestedCompactionsMin,
   299  		keys.LocalStoreSuggestedCompactionsMax,
   300  		func(kv storage.MVCCKeyValue) (bool, error) {
   301  			var sc kvserverpb.SuggestedCompaction
   302  			var err error
   303  			sc.StartKey, sc.EndKey, err = keys.DecodeStoreSuggestedCompactionKey(kv.Key.Key)
   304  			if err != nil {
   305  				return false, errors.Wrapf(err, "failed to decode suggested compaction key")
   306  			}
   307  			if err := protoutil.Unmarshal(kv.Value, &sc.Compaction); err != nil {
   308  				return false, err
   309  			}
   310  
   311  			dataIter.SetUpperBound(sc.EndKey)
   312  			dataIter.SeekGE(storage.MakeMVCCMetadataKey(sc.StartKey))
   313  			if ok, err := dataIter.Valid(); err != nil {
   314  				return false, err
   315  			} else if ok && dataIter.UnsafeKey().Less(storage.MakeMVCCMetadataKey(sc.EndKey)) {
   316  				// The suggested compaction span has live keys remaining. This is a
   317  				// strong indicator that compacting this range will be significantly
   318  				// more expensive than we expected when the compaction was suggested, as
   319  				// compactions are only suggested when a ClearRange request has removed
   320  				// all the keys in the span. Perhaps a replica was rebalanced away then
   321  				// back?
   322  				//
   323  				// Since we can't guarantee that this compaction will be an easy win,
   324  				// purge it to avoid bogging down the compaction queue.
   325  				log.Infof(ctx, "purging suggested compaction for range %s - %s that contains live data",
   326  					sc.StartKey, sc.EndKey)
   327  				if err := delBatch.Clear(kv.Key); err != nil {
   328  					log.Fatalf(ctx, "%v", err) // should never happen on a batch
   329  				}
   330  				c.Metrics.BytesSkipped.Inc(sc.Bytes)
   331  			} else {
   332  				suggestions = append(suggestions, sc)
   333  				totalBytes += sc.Bytes
   334  			}
   335  
   336  			return false, nil // continue iteration
   337  		},
   338  	)
   339  	if err != nil {
   340  		return nil, 0, err
   341  	}
   342  	if err := delBatch.Commit(true); err != nil {
   343  		log.Warningf(ctx, "unable to delete suggested compaction records: %+v", err)
   344  	}
   345  	return suggestions, totalBytes, nil
   346  }
   347  
   348  // processCompaction sends CompactRange requests to the storage engine if the
   349  // aggregated suggestion exceeds size threshold(s). Otherwise, it either skips
   350  // the compaction or skips the compaction *and* deletes the suggested compaction
   351  // records if they're too old (and in particular, if the compactor is disabled,
   352  // deletes any suggestions handed to it). Returns the number of bytes processed
   353  // (either compacted or skipped and deleted due to age).
   354  func (c *Compactor) processCompaction(
   355  	ctx context.Context, aggr aggregatedCompaction, capacity roachpb.StoreCapacity,
   356  ) (int64, error) {
   357  	aboveSizeThresh := aggr.Bytes >= c.thresholdBytes()
   358  	aboveUsedFracThresh := func() bool {
   359  		thresh := c.thresholdBytesUsedFraction()
   360  		return thresh > 0 && aggr.Bytes >= int64(float64(capacity.LogicalBytes)*thresh)
   361  	}()
   362  	aboveAvailFracThresh := func() bool {
   363  		thresh := c.thresholdBytesAvailableFraction()
   364  		return thresh > 0 && aggr.Bytes >= int64(float64(capacity.Available)*thresh)
   365  	}()
   366  
   367  	shouldProcess := c.enabled() && (aboveSizeThresh || aboveUsedFracThresh || aboveAvailFracThresh)
   368  	if shouldProcess {
   369  		startTime := timeutil.Now()
   370  		log.Infof(ctx,
   371  			"processing compaction %s (reasons: size=%t used=%t avail=%t)",
   372  			aggr, aboveSizeThresh, aboveUsedFracThresh, aboveAvailFracThresh,
   373  		)
   374  
   375  		if err := c.eng.CompactRange(aggr.StartKey, aggr.EndKey, false /* forceBottommost */); err != nil {
   376  			c.Metrics.CompactionFailures.Inc(1)
   377  			return 0, errors.Wrapf(err, "unable to compact range %+v", aggr)
   378  		}
   379  		c.Metrics.BytesCompacted.Inc(aggr.Bytes)
   380  		c.Metrics.CompactionSuccesses.Inc(1)
   381  		duration := timeutil.Since(startTime)
   382  		c.Metrics.CompactingNanos.Inc(int64(duration))
   383  		if c.doneFn != nil {
   384  			c.doneFn(ctx)
   385  		}
   386  		log.Infof(ctx, "processed compaction %s in %.1fs", aggr, duration.Seconds())
   387  	} else {
   388  		log.VEventf(ctx, 2, "skipping compaction(s) %s", aggr)
   389  	}
   390  
   391  	delBatch := c.eng.NewWriteOnlyBatch()
   392  
   393  	// Delete suggested compaction records if appropriate.
   394  	for _, sc := range aggr.suggestions {
   395  		age := timeutil.Since(timeutil.Unix(0, sc.SuggestedAtNanos))
   396  		tooOld := age >= c.maxAge() || !c.enabled()
   397  		// Delete unless we didn't process and the record isn't too old.
   398  		if !shouldProcess && !tooOld {
   399  			continue
   400  		}
   401  		if tooOld {
   402  			c.Metrics.BytesSkipped.Inc(sc.Bytes)
   403  		}
   404  		key := keys.StoreSuggestedCompactionKey(sc.StartKey, sc.EndKey)
   405  		if err := delBatch.Clear(storage.MVCCKey{Key: key}); err != nil {
   406  			log.Fatalf(ctx, "%v", err) // should never happen on a batch
   407  		}
   408  	}
   409  
   410  	if err := delBatch.Commit(true); err != nil {
   411  		log.Warningf(ctx, "unable to delete suggested compaction records: %+v", err)
   412  	}
   413  	delBatch.Close()
   414  
   415  	if shouldProcess {
   416  		return aggr.Bytes, nil
   417  	}
   418  	return 0, nil
   419  }
   420  
   421  // aggregateCompaction merges sc into aggr, to create a new suggested
   422  // compaction, if the key spans are overlapping or near-contiguous.  Note that
   423  // because suggested compactions are stored sorted by their start key,
   424  // sc.StartKey >= aggr.StartKey. Returns true if we couldn't add the new
   425  // suggested compaction to the aggregation and are therefore done building the
   426  // current aggregation and should process it. Returns false if we should
   427  // continue aggregating suggested compactions.
   428  func (c *Compactor) aggregateCompaction(
   429  	ctx context.Context,
   430  	ssti storage.SSTableInfosByLevel,
   431  	aggr *aggregatedCompaction,
   432  	sc kvserverpb.SuggestedCompaction,
   433  ) (done bool) {
   434  	// Don't bother aggregating more once we reach threshold bytes.
   435  	if aggr.Bytes >= c.thresholdBytes() {
   436  		return true // suggested compation could not be aggregated
   437  	}
   438  
   439  	// If the key spans don't overlap, then check whether they're
   440  	// "nearly" contiguous.
   441  	if aggr.EndKey.Compare(sc.StartKey) < 0 {
   442  		// Aggregate if the gap between current aggregate and proposed
   443  		// compaction span overlaps (at most) two contiguous SSTables at
   444  		// the bottommost level.
   445  		span := roachpb.Span{Key: aggr.EndKey, EndKey: sc.StartKey}
   446  		maxLevel := ssti.MaxLevelSpanOverlapsContiguousSSTables(span)
   447  		if maxLevel < ssti.MaxLevel() {
   448  			return true // suggested compaction could not be aggregated
   449  		}
   450  	}
   451  
   452  	// We can aggregate, so merge sc into aggr.
   453  	if aggr.EndKey.Compare(sc.EndKey) < 0 {
   454  		aggr.EndKey = sc.EndKey
   455  	}
   456  	aggr.Bytes += sc.Bytes
   457  	aggr.suggestions = append(aggr.suggestions, sc)
   458  	return false // aggregated successfully
   459  }
   460  
   461  // examineQueue returns the total number of bytes queued and updates the
   462  // BytesQueued gauge.
   463  func (c *Compactor) examineQueue(ctx context.Context) (int64, error) {
   464  	var totalBytes int64
   465  	if err := c.eng.Iterate(
   466  		keys.LocalStoreSuggestedCompactionsMin,
   467  		keys.LocalStoreSuggestedCompactionsMax,
   468  		func(kv storage.MVCCKeyValue) (bool, error) {
   469  			var c kvserverpb.Compaction
   470  			if err := protoutil.Unmarshal(kv.Value, &c); err != nil {
   471  				return false, err
   472  			}
   473  			totalBytes += c.Bytes
   474  			return false, nil // continue iteration
   475  		},
   476  	); err != nil {
   477  		return 0, err
   478  	}
   479  	c.Metrics.BytesQueued.Update(totalBytes)
   480  	return totalBytes, nil
   481  }
   482  
   483  // Suggest writes the specified compaction to persistent storage and
   484  // pings the processing goroutine.
   485  func (c *Compactor) Suggest(ctx context.Context, sc kvserverpb.SuggestedCompaction) {
   486  	log.VEventf(ctx, 2, "suggested compaction from %s - %s: %+v", sc.StartKey, sc.EndKey, sc.Compaction)
   487  
   488  	// Check whether a suggested compaction already exists for this key span.
   489  	key := keys.StoreSuggestedCompactionKey(sc.StartKey, sc.EndKey)
   490  	var existing kvserverpb.Compaction
   491  	//lint:ignore SA1019 historical usage of deprecated c.eng.GetProto is OK
   492  	ok, _, _, err := c.eng.GetProto(storage.MVCCKey{Key: key}, &existing)
   493  	if err != nil {
   494  		log.VErrEventf(ctx, 2, "unable to record suggested compaction: %s", err)
   495  		return
   496  	}
   497  
   498  	// If there's already a suggested compaction, merge them. Note that
   499  	// this method is only called after clearing keys from the underlying
   500  	// storage engine. All such actions really do result in successively
   501  	// more bytes being made available for compaction, so there is no
   502  	// double-counting if the same range were cleared twice.
   503  	if ok {
   504  		sc.Bytes += existing.Bytes
   505  	}
   506  
   507  	// Store the new compaction.
   508  	//lint:ignore SA1019 historical usage of deprecated engine.PutProto is OK
   509  	if _, _, err = storage.PutProto(c.eng, storage.MVCCKey{Key: key}, &sc.Compaction); err != nil {
   510  		log.Warningf(ctx, "unable to record suggested compaction: %+v", err)
   511  	}
   512  
   513  	// Poke the compactor goroutine to reconsider compaction in light of
   514  	// this new suggested compaction.
   515  	c.poke()
   516  }