github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/split_queue.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

const (
	// splitQueueTimerDuration is the duration between splits of queued ranges.
	splitQueueTimerDuration = 0 // zero duration to process splits greedily.

	// splitQueuePurgatoryCheckInterval is the interval at which replicas in
	// purgatory make split attempts. Purgatory is used by the splitQueue to
	// store ranges that are large enough to require a split but are
	// unsplittable because they do not contain a suitable split key. Purgatory
	// prevents them from repeatedly attempting to split at an unbounded rate.
	splitQueuePurgatoryCheckInterval = 1 * time.Minute

	// Splits should be relatively isolated, other than requiring expensive
	// RocksDB scans over part of the splitting range to recompute stats. We
	// allow a limited number of splits to be processed at once.
	splitQueueConcurrency = 4
)

// splitQueue manages a queue of ranges slated to be split due to size
// or along intersecting zone config boundaries.
type splitQueue struct {
	*baseQueue
	db       *kv.DB
	purgChan <-chan time.Time

	// loadBasedCount counts the load-based splits performed by the queue.
	loadBasedCount telemetry.Counter
}

// newSplitQueue returns a new instance of splitQueue.
func newSplitQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *splitQueue {
	var purgChan <-chan time.Time
	if c := store.TestingKnobs().SplitQueuePurgatoryChan; c != nil {
		purgChan = c
	} else {
		purgTicker := time.NewTicker(splitQueuePurgatoryCheckInterval)
		purgChan = purgTicker.C
	}

	sq := &splitQueue{
		db:             db,
		purgChan:       purgChan,
		loadBasedCount: telemetry.GetCounter("kv.split.load"),
	}
	sq.baseQueue = newBaseQueue(
		"split", sq, store, gossip,
		queueConfig{
			maxSize:              defaultQueueMaxSize,
			maxConcurrency:       splitQueueConcurrency,
			needsLease:           true,
			needsSystemConfig:    true,
			acceptsUnsplitRanges: true,
			successes:            store.metrics.SplitQueueSuccesses,
			failures:             store.metrics.SplitQueueFailures,
			pending:              store.metrics.SplitQueuePending,
			processingNanos:      store.metrics.SplitQueueProcessingNanos,
			purgatory:            store.metrics.SplitQueuePurgatory,
		},
	)
	return sq
}

func shouldSplitRange(
	desc *roachpb.RangeDescriptor,
	ms enginepb.MVCCStats,
	maxBytes int64,
	shouldBackpressureWrites bool,
	sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	if sysCfg.NeedsSplit(desc.StartKey, desc.EndKey) {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of the range compared to the max
	// size for the zone it's in.
	if ratio := float64(ms.Total()) / float64(maxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}

	// additionalPriorityDueToBackpressure is a mechanism to prioritize splitting
	// ranges which will actively backpressure writes.
	//
	// NB: This additional weight is totally arbitrary. The priority in the split
	// queue is usually 1 plus the ratio of the current size over the max size.
	// When a range is much larger than it is allowed to be given the
	// backpressureRangeSizeMultiplier and the zone config, backpressure is
	// not going to be applied because of the backpressureByteTolerance (see the
	// comment there for more details). However, when the range size is close to
	// the limit, we will backpressure. We strongly prefer to split over
	// backpressure.
	const additionalPriorityDueToBackpressure = 50
	if shouldQ && shouldBackpressureWrites {
		priority += additionalPriorityDueToBackpressure
	}

	return shouldQ, priority
}

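// exampleSplitPriority is a worked example (hypothetical numbers, not part
// of the original file) of the priority arithmetic in shouldSplitRange: a
// range whose stats total 96 MiB in a zone with a 64 MiB maximum has a size
// ratio of 1.5; if it also straddles a zone config boundary and would
// backpressure writes, the final priority is 1 + 1.5 + 50 = 52.5.
func exampleSplitPriority() float64 {
	const zoneConfigSplit = 1.0 // NeedsSplit returned true
	const sizeRatio = 96.0 / 64.0
	const backpressureBoost = 50.0 // additionalPriorityDueToBackpressure
	return zoneConfigSplit + sizeRatio + backpressureBoost // 52.5
}
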
// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix, if the range's size in bytes exceeds the limit for the zone,
// or if the range has too much load on it.
func (sq *splitQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	shouldQ, priority = shouldSplitRange(repl.Desc(), repl.GetMVCCStats(),
		repl.GetMaxBytes(), repl.shouldBackpressureWrites(), sysCfg)

	if !shouldQ && repl.SplitByLoadEnabled() {
		if splitKey := repl.loadBasedSplitter.MaybeSplitKey(timeutil.Now()); splitKey != nil {
			shouldQ, priority = true, 1.0 // default priority
		}
	}

	return shouldQ, priority
}

// unsplittableRangeError indicates that a split attempt failed because no
// suitable split key could be found.
type unsplittableRangeError struct{}

func (unsplittableRangeError) Error() string         { return "could not find valid split key" }
func (unsplittableRangeError) purgatoryErrorMarker() {}

var _ purgatoryError = unsplittableRangeError{}

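// sendsToPurgatory is a minimal sketch (not part of the original file,
// assuming the purgatoryError interface declared alongside the base queue
// in this package) of how the queue machinery recognizes errors like
// unsplittableRangeError: any error in the chain implementing the marker
// interface routes the replica to purgatory, where it is retried on each
// purgatoryChan tick instead of being recorded as a hard failure.
func sendsToPurgatory(err error) bool {
	var purgErr purgatoryError
	return errors.As(err, &purgErr)
}
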
// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(ctx context.Context, r *Replica, sysCfg *config.SystemConfig) error {
	err := sq.processAttempt(ctx, r, sysCfg)
	if errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
		// ConditionFailedErrors are an expected outcome for range split
		// attempts because splits can race with other descriptor modifications.
		// On seeing a ConditionFailedError, don't return an error and enqueue
		// this replica again in case it still needs to be split.
		log.Infof(ctx, "split saw concurrent descriptor modification; maybe retrying")
		sq.MaybeAddAsync(ctx, r, sq.store.Clock().Now())
		return nil
	}
	return err
}

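// exampleHasTypeThroughWrapping is a hypothetical illustration (not part of
// the original file) of why process uses errors.HasType above: it matches
// the target type anywhere in a wrapped error chain, so a
// ConditionFailedError wrapped by errors.Wrapf in processAttempt is still
// recognized here.
func exampleHasTypeThroughWrapping() bool {
	err := errors.Wrap(&roachpb.ConditionFailedError{}, "split failed")
	return errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) // true
}
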
func (sq *splitQueue) processAttempt(
	ctx context.Context, r *Replica, sysCfg *config.SystemConfig,
) error {
	desc := r.Desc()
	// First handle the case of splitting due to zone config boundaries.
	if splitKey := sysCfg.ComputeSplitKey(desc.StartKey, desc.EndKey); splitKey != nil {
		if _, err := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{
				RequestHeader: roachpb.RequestHeader{
					Key: splitKey.AsRawKey(),
				},
				SplitKey:       splitKey.AsRawKey(),
				ExpirationTime: hlc.Timestamp{},
			},
			desc,
			false, /* delayable */
			"zone config",
		); err != nil {
			return errors.Wrapf(err, "unable to split %s at key %q", r, splitKey)
		}
		return nil
	}

	// Next handle the case of splitting due to size. Note that we don't
	// perform size-based splitting if maxBytes is 0 (which happens in certain
	// test situations).
	size := r.GetMVCCStats().Total()
	maxBytes := r.GetMaxBytes()
	if maxBytes > 0 && float64(size)/float64(maxBytes) > 1 {
		_, err := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{},
			desc,
			false, /* delayable */
			fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)),
		)
		return err
	}

	// Finally handle the case of splitting due to load.
	now := timeutil.Now()
	if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(now); splitByLoadKey != nil {
		batchHandledQPS := r.QueriesPerSecond()
		raftAppliedQPS := r.WritesPerSecond()
		splitQPS := r.loadBasedSplitter.LastQPS(now)
		reason := fmt.Sprintf(
			"load at key %s (%.2f splitQPS, %.2f batches/sec, %.2f raft mutations/sec)",
			splitByLoadKey,
			splitQPS,
			batchHandledQPS,
			raftAppliedQPS,
		)
		if _, pErr := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{
				RequestHeader: roachpb.RequestHeader{
					Key: splitByLoadKey,
				},
				SplitKey: splitByLoadKey,
			},
			desc,
			false, /* delayable */
			reason,
		); pErr != nil {
			return errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey)
		}

		telemetry.Inc(sq.loadBasedCount)

		// Reset the splitter now that the bounds of the range changed.
		r.loadBasedSplitter.Reset()
		return nil
	}
	return nil
}

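// splitReasonOrder is a condensed sketch (hypothetical, not the actual
// implementation) of the decision order in processAttempt above: mandatory
// zone config boundary splits win over size splits, and size splits win
// over load-based splits, so a range is never split by load while a
// boundary or size split is still pending.
func splitReasonOrder(needsZoneSplit, oversized, hasLoadSplitKey bool) string {
	switch {
	case needsZoneSplit:
		return "zone config"
	case oversized:
		return "size above threshold"
	case hasLoadSplitKey:
		return "load"
	default:
		return "" // nothing to do; processAttempt returns nil
	}
}
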
// timer returns the interval between processing successive queued splits.
func (*splitQueue) timer(_ time.Duration) time.Duration {
	return splitQueueTimerDuration
}

// purgatoryChan returns the split queue's purgatory channel.
func (sq *splitQueue) purgatoryChan() <-chan time.Time {
	return sq.purgChan
}