github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/split_queue.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

const (
	// splitQueueTimerDuration is the duration between splits of queued ranges.
	splitQueueTimerDuration = 0 // zero duration to process splits greedily.

	// splitQueuePurgatoryCheckInterval is the interval at which replicas in
	// purgatory make split attempts. Purgatory is used by the splitQueue to
	// store ranges that are large enough to require a split but are
	// unsplittable because they do not contain a suitable split key. Purgatory
	// prevents them from repeatedly attempting to split at an unbounded rate.
	splitQueuePurgatoryCheckInterval = 1 * time.Minute

	// Splits should be relatively isolated, other than requiring expensive
	// RocksDB scans over part of the splitting range to recompute stats. We
	// allow a limited number of splits to be processed at once.
	splitQueueConcurrency = 4
)

// splitQueue manages a queue of ranges slated to be split due to size
// or along intersecting zone config boundaries.
type splitQueue struct {
	*baseQueue
	db       *kv.DB
	purgChan <-chan time.Time

	// loadBasedCount counts the load-based splits performed by the queue.
	loadBasedCount telemetry.Counter
}

// newSplitQueue returns a new instance of splitQueue.
func newSplitQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *splitQueue {
	var purgChan <-chan time.Time
	if c := store.TestingKnobs().SplitQueuePurgatoryChan; c != nil {
		purgChan = c
	} else {
		purgTicker := time.NewTicker(splitQueuePurgatoryCheckInterval)
		purgChan = purgTicker.C
	}

	sq := &splitQueue{
		db:             db,
		purgChan:       purgChan,
		loadBasedCount: telemetry.GetCounter("kv.split.load"),
	}
	sq.baseQueue = newBaseQueue(
		"split", sq, store, gossip,
		queueConfig{
			maxSize:              defaultQueueMaxSize,
			maxConcurrency:       splitQueueConcurrency,
			needsLease:           true,
			needsSystemConfig:    true,
			acceptsUnsplitRanges: true,
			successes:            store.metrics.SplitQueueSuccesses,
			failures:             store.metrics.SplitQueueFailures,
			pending:              store.metrics.SplitQueuePending,
			processingNanos:      store.metrics.SplitQueueProcessingNanos,
			purgatory:            store.metrics.SplitQueuePurgatory,
		},
	)
	return sq
}

// shouldSplitRange returns whether the given range should be queued for
// splitting and at what priority, based on whether it straddles a zone config
// boundary, whether its size exceeds the max size for its zone, and whether it
// would backpressure writes.
func shouldSplitRange(
	desc *roachpb.RangeDescriptor,
	ms enginepb.MVCCStats,
	maxBytes int64,
	shouldBackpressureWrites bool,
	sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	if sysCfg.NeedsSplit(desc.StartKey, desc.EndKey) {
		// Set priority to 1 in the event the range is split by zone configs.
		priority = 1
		shouldQ = true
	}

	// Add priority based on the size of the range compared to the max
	// size for the zone it's in.
	if ratio := float64(ms.Total()) / float64(maxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}

	// additionalPriorityDueToBackpressure is a mechanism to prioritize splitting
	// ranges which will actively backpressure writes.
	//
	// NB: This additional weight is totally arbitrary. The priority in the split
	// queue is usually 1 plus the ratio of the current size over the max size.
	// When a range is much larger than it is allowed to be given the
	// backpressureRangeSizeMultiplier and the zone config, backpressure is
	// not going to be applied because of the backpressureByteTolerance (see the
	// comment there for more details). However, when the range size is close to
	// the limit, we will backpressure. We strongly prefer to split over
	// backpressure.
	const additionalPriorityDueToBackpressure = 50
	if shouldQ && shouldBackpressureWrites {
		priority += additionalPriorityDueToBackpressure
	}

	return shouldQ, priority
}

// shouldQueue determines whether a range should be queued for
// splitting. This is true if the range is intersected by a zone config
// prefix, if the range's size in bytes exceeds the limit for the zone,
// or if the range has too much load on it.
func (sq *splitQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	shouldQ, priority = shouldSplitRange(repl.Desc(), repl.GetMVCCStats(),
		repl.GetMaxBytes(), repl.shouldBackpressureWrites(), sysCfg)

	if !shouldQ && repl.SplitByLoadEnabled() {
		if splitKey := repl.loadBasedSplitter.MaybeSplitKey(timeutil.Now()); splitKey != nil {
			shouldQ, priority = true, 1.0 // default priority
		}
	}

	return shouldQ, priority
}

// unsplittableRangeError indicates that a split attempt failed because no
// suitable split key could be found.
type unsplittableRangeError struct{}

func (unsplittableRangeError) Error() string { return "could not find valid split key" }
func (unsplittableRangeError) purgatoryErrorMarker() {}

var _ purgatoryError = unsplittableRangeError{}

// process synchronously invokes admin split for each proposed split key.
func (sq *splitQueue) process(ctx context.Context, r *Replica, sysCfg *config.SystemConfig) error {
	err := sq.processAttempt(ctx, r, sysCfg)
	if errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
		// ConditionFailedErrors are an expected outcome for range split
		// attempts because splits can race with other descriptor modifications.
		// On seeing a ConditionFailedError, don't return an error and enqueue
		// this replica again in case it still needs to be split.
		log.Infof(ctx, "split saw concurrent descriptor modification; maybe retrying")
		sq.MaybeAddAsync(ctx, r, sq.store.Clock().Now())
		return nil
	}
	return err
}

func (sq *splitQueue) processAttempt(
	ctx context.Context, r *Replica, sysCfg *config.SystemConfig,
) error {
	desc := r.Desc()
	// First handle the case of splitting due to zone config maps.
	if splitKey := sysCfg.ComputeSplitKey(desc.StartKey, desc.EndKey); splitKey != nil {
		if _, err := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{
				RequestHeader: roachpb.RequestHeader{
					Key: splitKey.AsRawKey(),
				},
				SplitKey:       splitKey.AsRawKey(),
				ExpirationTime: hlc.Timestamp{},
			},
			desc,
			false, /* delayable */
			"zone config",
		); err != nil {
			return errors.Wrapf(err, "unable to split %s at key %q", r, splitKey)
		}
		return nil
	}

	// Next handle the case of splitting due to size. Note that we don't perform
	// size-based splitting if maxBytes is 0 (happens in certain test
	// situations).
	size := r.GetMVCCStats().Total()
	maxBytes := r.GetMaxBytes()
	if maxBytes > 0 && float64(size)/float64(maxBytes) > 1 {
		_, err := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{},
			desc,
			false, /* delayable */
			fmt.Sprintf("%s above threshold size %s", humanizeutil.IBytes(size), humanizeutil.IBytes(maxBytes)),
		)
		return err
	}

	now := timeutil.Now()
	if splitByLoadKey := r.loadBasedSplitter.MaybeSplitKey(now); splitByLoadKey != nil {
		batchHandledQPS := r.QueriesPerSecond()
		raftAppliedQPS := r.WritesPerSecond()
		splitQPS := r.loadBasedSplitter.LastQPS(now)
		reason := fmt.Sprintf(
			"load at key %s (%.2f splitQPS, %.2f batches/sec, %.2f raft mutations/sec)",
			splitByLoadKey,
			splitQPS,
			batchHandledQPS,
			raftAppliedQPS,
		)
		if _, pErr := r.adminSplitWithDescriptor(
			ctx,
			roachpb.AdminSplitRequest{
				RequestHeader: roachpb.RequestHeader{
					Key: splitByLoadKey,
				},
				SplitKey: splitByLoadKey,
			},
			desc,
			false, /* delayable */
			reason,
		); pErr != nil {
			return errors.Wrapf(pErr, "unable to split %s at key %q", r, splitByLoadKey)
		}

		telemetry.Inc(sq.loadBasedCount)

		// Reset the splitter now that the bounds of the range changed.
		r.loadBasedSplitter.Reset()
		return nil
	}
	return nil
}

// timer returns the interval between processing successive queued splits.
func (*splitQueue) timer(_ time.Duration) time.Duration {
	return splitQueueTimerDuration
}

// purgatoryChan returns the split queue's purgatory channel.
func (sq *splitQueue) purgatoryChan() <-chan time.Time {
	return sq.purgChan
}
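
For reference, the sketch below is a minimal standalone illustration (not part of split_queue.go) of how the priority computed in shouldSplitRange behaves. The function name splitPriority and the byte counts in main are hypothetical; only the formula mirrors the code above.

// splitpriority_sketch.go: standalone illustration of the split-queue priority
// heuristic under assumed inputs.
package main

import "fmt"

// additionalPriorityDueToBackpressure matches the constant used in
// shouldSplitRange above.
const additionalPriorityDueToBackpressure = 50

// splitPriority mirrors the priority logic in shouldSplitRange: 1 if the range
// crosses a zone config boundary, plus the size ratio when the range exceeds
// maxBytes, plus a large constant when a queued range would also backpressure
// writes.
func splitPriority(needsZoneSplit bool, sizeBytes, maxBytes int64, backpressure bool) (bool, float64) {
	shouldQ, priority := false, 0.0
	if needsZoneSplit {
		shouldQ, priority = true, 1
	}
	if ratio := float64(sizeBytes) / float64(maxBytes); ratio > 1 {
		priority += ratio
		shouldQ = true
	}
	if shouldQ && backpressure {
		priority += additionalPriorityDueToBackpressure
	}
	return shouldQ, priority
}

func main() {
	// A 96 MiB range with a 64 MiB limit: queued with priority 1.5.
	fmt.Println(splitPriority(false, 96<<20, 64<<20, false))
	// The same range also crossing a zone config boundary: priority 2.5.
	fmt.Println(splitPriority(true, 96<<20, 64<<20, false))
	// A queued range that would backpressure writes gains another 50: 51.5.
	fmt.Println(splitPriority(false, 96<<20, 64<<20, true))
}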