github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/merge_queue.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"math"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

const (
	// mergeQueuePurgatoryCheckInterval is the interval at which replicas in
	// purgatory make merge attempts. Since merges are relatively untested, the
	// reasons that a range may fail to merge are unknown, so the merge queue has
	// a large purgatory interval.
	mergeQueuePurgatoryCheckInterval = 1 * time.Minute

	// The current implementation of merges requires rewriting the right-hand data
	// onto the left-hand range, even when the ranges are collocated. This is
	// expensive, so limit to one merge at a time.
	mergeQueueConcurrency = 1
)

// MergeQueueInterval is a setting that controls how long the merge queue waits
// between processing replicas.
var MergeQueueInterval = settings.RegisterNonNegativeDurationSetting(
	"kv.range_merge.queue_interval",
	"how long the merge queue waits between processing replicas",
	time.Second,
)

// mergeQueue manages a queue of ranges slated to be merged with their right-
// hand neighbor.
//
// A range will only be queued if it is beneath the minimum size threshold. Once
// queued, the size of the right-hand neighbor will additionally be checked;
// merges can only proceed if a) the right-hand neighbor is beneath the minimum
// size threshold, and b) the merged range would not need to be immediately
// split, e.g. because the new range would exceed the maximum size threshold.
//
// Note that the merge queue is not capable of initiating all possible merges.
// Consider the example below:
//
//      /Table/51/1    /Table/51/2    /Table/52
//         32MB            0MB           32MB
//
// The range beginning at /Table/51/2 is empty and would, ideally, be merged
// away. The range to its left, /Table/51/1, will not propose a merge because it
// is over the minimum size threshold. And /Table/51/2 will not propose a merge
// because the next range, /Table/52, is a new table and thus the split is
// mandatory.
//
// There are several ways to solve this. /Table/51/2 could look both left and
// right to find a merge partner, but discovering one's left neighbor is rather
// difficult and involves scanning the meta ranges. /Table/51/1 could propose a
// merge even though it's over the minimum size threshold, but this would result
// in a lot more RangeStats requests--essentially every range would send a
// RangeStats request on every scanner cycle.
//
// The current approach seems to be a nice balance of finding nearly all
// mergeable ranges without sending many RPCs. It has the additional nice
// property of not sending any RPCs to meta ranges until a merge is actually
// initiated.
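//
// For a rough sense of the thresholds involved (the numbers here are assumed
// for illustration, not taken from this file): with a 16MB minimum and a 512MB
// maximum range size, a 1MB range is queued, and its merge proceeds only if
// the right-hand neighbor is also under 16MB and the combined range would stay
// under 512MB and, when load-based splitting is enabled, under half the
// load-based split threshold (see process below).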
type mergeQueue struct {
	*baseQueue
	db       *kv.DB
	purgChan <-chan time.Time
}

func newMergeQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *mergeQueue {
	mq := &mergeQueue{
		db:       db,
		purgChan: time.NewTicker(mergeQueuePurgatoryCheckInterval).C,
	}
	mq.baseQueue = newBaseQueue(
		"merge", mq, store, gossip,
		queueConfig{
			maxSize:        defaultQueueMaxSize,
			maxConcurrency: mergeQueueConcurrency,
			// TODO(ajwerner): Sometimes the merge queue needs to send multiple
			// snapshots, but the timeout function here is configured based on the
			// duration required to send a single snapshot. That being said, this
			// timeout provides leeway for snapshots to be 10x slower than the
			// specified rate and still respects the queue processing minimum timeout.
			// While using the below function is certainly better than just using the
			// default timeout, it would be better to have a function which takes into
			// account how many snapshots processing will need to send. That might be
			// hard to determine ahead of time. An alternative would be to calculate
			// the timeout with a function that additionally considers the replication
			// factor.
			processTimeoutFunc:   makeQueueSnapshotTimeoutFunc(rebalanceSnapshotRate),
			needsLease:           true,
			needsSystemConfig:    true,
			acceptsUnsplitRanges: false,
			successes:            store.metrics.MergeQueueSuccesses,
			failures:             store.metrics.MergeQueueFailures,
			pending:              store.metrics.MergeQueuePending,
			processingNanos:      store.metrics.MergeQueueProcessingNanos,
			purgatory:            store.metrics.MergeQueuePurgatory,
		},
	)
	return mq
}

func (mq *mergeQueue) enabled() bool {
	st := mq.store.ClusterSettings()
	return kvserverbase.MergeQueueEnabled.Get(&st.SV)
}

func (mq *mergeQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !mq.enabled() {
		return false, 0
	}

	desc := repl.Desc()

	if desc.EndKey.Equal(roachpb.RKeyMax) {
		// The last range has no right-hand neighbor to merge with.
		return false, 0
	}

	if sysCfg.NeedsSplit(desc.StartKey, desc.EndKey.Next()) {
		// This range would need to be split if it extended just one key further.
		// There is thus no possible right-hand neighbor that it could be merged
		// with.
		return false, 0
	}

	sizeRatio := float64(repl.GetMVCCStats().Total()) / float64(repl.GetMinBytes())
	if math.IsNaN(sizeRatio) || sizeRatio >= 1 {
		// This range is above the minimum size threshold. It does not need to be
		// merged.
		return false, 0
	}

	// Invert sizeRatio to compute the priority so that smaller ranges are merged
	// before larger ranges.
	priority = 1 - sizeRatio
	return true, priority
}
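
// To make the priority above concrete with purely illustrative numbers: a
// range holding 2MB of data against a 16MB minimum size has
// sizeRatio = 2/16 = 0.125 and is queued with priority 1 - 0.125 = 0.875,
// while a nearly empty range scores close to 1 and is processed sooner.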

// rangeMergePurgatoryError wraps an error that occurs during merging to
// indicate that the error should send the range to purgatory.
type rangeMergePurgatoryError struct{ error }

func (rangeMergePurgatoryError) purgatoryErrorMarker() {}

var _ purgatoryError = rangeMergePurgatoryError{}

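// requestRangeStats issues a RangeStats request addressed to key. In process
// below it is called with the left-hand range's EndKey, i.e. the first key of
// the right-hand neighbor, so the descriptor, MVCC stats, and QPS it returns
// describe the RHS.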
func (mq *mergeQueue) requestRangeStats(
	ctx context.Context, key roachpb.Key,
) (*roachpb.RangeDescriptor, enginepb.MVCCStats, float64, error) {
	res, pErr := kv.SendWrappedWith(ctx, mq.db.NonTransactionalSender(), roachpb.Header{
		ReturnRangeInfo: true,
	}, &roachpb.RangeStatsRequest{
		RequestHeader: roachpb.RequestHeader{Key: key},
	})
	if pErr != nil {
		return nil, enginepb.MVCCStats{}, 0, pErr.GoError()
	}
	rangeInfos := res.Header().RangeInfos
	if len(rangeInfos) != 1 {
		return nil, enginepb.MVCCStats{}, 0, fmt.Errorf(
			"mergeQueue.requestRangeStats: response had %d range infos but exactly one was expected",
			len(rangeInfos))
	}
	return &rangeInfos[0].Desc, res.(*roachpb.RangeStatsResponse).MVCCStats,
		res.(*roachpb.RangeStatsResponse).QueriesPerSecond, nil
}

func (mq *mergeQueue) process(
	ctx context.Context, lhsRepl *Replica, sysCfg *config.SystemConfig,
) error {
	if !mq.enabled() {
		log.VEventf(ctx, 2, "skipping merge: queue has been disabled")
		return nil
	}

	lhsStats := lhsRepl.GetMVCCStats()
	minBytes := lhsRepl.GetMinBytes()
	if lhsStats.Total() >= minBytes {
		log.VEventf(ctx, 2, "skipping merge: LHS meets minimum size threshold %d with %d bytes",
			minBytes, lhsStats.Total())
		return nil
	}

	lhsDesc := lhsRepl.Desc()
	lhsQPS := lhsRepl.GetSplitQPS()
	rhsDesc, rhsStats, rhsQPS, err := mq.requestRangeStats(ctx, lhsDesc.EndKey.AsRawKey())
	if err != nil {
		return err
	}
	if rhsStats.Total() >= minBytes {
		log.VEventf(ctx, 2, "skipping merge: RHS meets minimum size threshold %d with %d bytes",
			minBytes, rhsStats.Total())
		return nil
	}

	// The range was manually split and the sticky bit has not expired, so skip merging.
	now := mq.store.Clock().Now()
	if now.Less(rhsDesc.GetStickyBit()) {
		log.VEventf(ctx, 2, "skipping merge: ranges were manually split and sticky bit has not expired")
		// TODO(jeffreyxiao): Consider returning a purgatory error to avoid
		// repeatedly processing ranges that cannot be merged.
		return nil
	}
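	// (Sticky bits are set by manual splits, e.g. SQL's ALTER TABLE ... SPLIT AT
	// ... WITH EXPIRATION; the check above leaves such ranges alone until the
	// chosen expiration has passed.)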

	mergedDesc := &roachpb.RangeDescriptor{
		StartKey: lhsDesc.StartKey,
		EndKey:   rhsDesc.EndKey,
	}
	mergedStats := lhsStats
	mergedStats.Add(rhsStats)

	var mergedQPS float64
	if lhsRepl.SplitByLoadEnabled() {
		mergedQPS = lhsQPS + rhsQPS
	}

	// Check whether the merged range would need to be split; if so, skip the
	// merge. Use a lower threshold for load-based splitting so we don't find
	// ourselves in a situation where we keep merging ranges that would be split
	// soon after by a small increase in load.
	conservativeLoadBasedSplitThreshold := 0.5 * lhsRepl.SplitByLoadQPSThreshold()
	shouldSplit, _ := shouldSplitRange(mergedDesc, mergedStats,
		lhsRepl.GetMaxBytes(), lhsRepl.shouldBackpressureWrites(), sysCfg)
	if shouldSplit || mergedQPS >= conservativeLoadBasedSplitThreshold {
		log.VEventf(ctx, 2,
			"skipping merge to avoid thrashing: merged range %s may split "+
				"(estimated size, estimated QPS: %d, %v)",
			mergedDesc, mergedStats.Total(), mergedQPS)
		return nil
	}
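	// For instance, assuming a load-based split QPS threshold of 2500 (the
	// cluster's configured value may differ), the check above skips the merge
	// once the combined range would serve 1250 QPS or more, leaving headroom
	// before a load-based split could undo the merge.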

	{
		store := lhsRepl.store
		// AdminMerge errors if there is a learner or joint config on either
		// side and AdminRelocateRange removes any on the range it operates on.
		// For the sake of obviousness, just fix this all upfront.
		var err error
		lhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, lhsDesc)
		if err != nil {
			log.VEventf(ctx, 2, `%v`, err)
			return err
		}

		rhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, rhsDesc)
		if err != nil {
			log.VEventf(ctx, 2, `%v`, err)
			return err
		}
	}
	lhsReplicas, rhsReplicas := lhsDesc.Replicas().All(), rhsDesc.Replicas().All()

	// Defensive sanity check that everything is now a voter.
	for i := range lhsReplicas {
		if lhsReplicas[i].GetType() != roachpb.VOTER_FULL {
			return errors.Errorf(`cannot merge non-voter replicas on lhs: %v`, lhsReplicas)
		}
	}
	for i := range rhsReplicas {
		if rhsReplicas[i].GetType() != roachpb.VOTER_FULL {
			return errors.Errorf(`cannot merge non-voter replicas on rhs: %v`, rhsReplicas)
		}
	}

	if !replicaSetsEqual(lhsReplicas, rhsReplicas) {
		var targets []roachpb.ReplicationTarget
		for _, lhsReplDesc := range lhsReplicas {
			targets = append(targets, roachpb.ReplicationTarget{
				NodeID: lhsReplDesc.NodeID, StoreID: lhsReplDesc.StoreID,
			})
		}
		// AdminRelocateRange moves the lease to the first target in the list, so
		// sort the existing leaseholder there to leave it unchanged.
		lease, _ := lhsRepl.GetLease()
		for i := range targets {
			if targets[i].NodeID == lease.Replica.NodeID && targets[i].StoreID == lease.Replica.StoreID {
				if i > 0 {
					targets[0], targets[i] = targets[i], targets[0]
				}
				break
			}
		}
		// TODO(benesch): RelocateRange can sometimes fail if it needs to move a replica
		// from one store to another store on the same node.
		if err := mq.store.DB().AdminRelocateRange(ctx, rhsDesc.StartKey, targets); err != nil {
			return err
		}
	}

	log.VEventf(ctx, 2, "merging to produce range: %s-%s", mergedDesc.StartKey, mergedDesc.EndKey)
	reason := fmt.Sprintf("lhs+rhs has (size=%s+%s=%s qps=%.2f+%.2f=%.2fqps) below threshold (size=%s, qps=%.2f)",
		humanizeutil.IBytes(lhsStats.Total()),
		humanizeutil.IBytes(rhsStats.Total()),
		humanizeutil.IBytes(mergedStats.Total()),
		lhsQPS,
		rhsQPS,
		mergedQPS,
		humanizeutil.IBytes(minBytes),
		conservativeLoadBasedSplitThreshold,
	)
	_, pErr := lhsRepl.AdminMerge(ctx, roachpb.AdminMergeRequest{
		RequestHeader: roachpb.RequestHeader{Key: lhsRepl.Desc().StartKey.AsRawKey()},
	}, reason)
	if err := pErr.GoError(); errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
		// ConditionFailedErrors are an expected outcome for range merge
		// attempts because merges can race with other descriptor modifications.
		// On seeing a ConditionFailedError, don't return an error and enqueue
		// this replica again in case it still needs to be merged.
		log.Infof(ctx, "merge saw concurrent descriptor modification; maybe retrying")
		mq.MaybeAddAsync(ctx, lhsRepl, now)
	} else if err != nil {
		// While range merges are unstable, be extra cautious and mark every error
		// as purgatory-worthy.
		return rangeMergePurgatoryError{err}
	}
	if testingAggressiveConsistencyChecks {
		if err := mq.store.consistencyQueue.process(ctx, lhsRepl, sysCfg); err != nil {
			log.Warningf(ctx, "%v", err)
		}
	}
	return nil
}

func (mq *mergeQueue) timer(time.Duration) time.Duration {
	return MergeQueueInterval.Get(&mq.store.ClusterSettings().SV)
}

func (mq *mergeQueue) purgatoryChan() <-chan time.Time {
	return mq.purgChan
}