github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/merge_queue.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"math"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
)

const (
	// mergeQueuePurgatoryCheckInterval is the interval at which replicas in
	// purgatory make merge attempts. Since merges are relatively untested, the
	// reasons that a range may fail to merge are unknown, so the merge queue has
	// a large purgatory interval.
	mergeQueuePurgatoryCheckInterval = 1 * time.Minute

	// The current implementation of merges requires rewriting the right-hand
	// data onto the left-hand range, even when the ranges are collocated. This
	// is expensive, so limit to one merge at a time.
	mergeQueueConcurrency = 1
)

// MergeQueueInterval is a setting that controls how long the merge queue waits
// between processing replicas.
var MergeQueueInterval = settings.RegisterNonNegativeDurationSetting(
	"kv.range_merge.queue_interval",
	"how long the merge queue waits between processing replicas",
	time.Second,
)
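
// For illustration, the interval can be adjusted at runtime with a statement
// such as `SET CLUSTER SETTING kv.range_merge.queue_interval = '5s';` (the
// value shown is hypothetical); larger values slow the cadence at which the
// queue picks up replicas for processing.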

// mergeQueue manages a queue of ranges slated to be merged with their
// right-hand neighbor.
//
// A range will only be queued if it is beneath the minimum size threshold. Once
// queued, the size of the right-hand neighbor will additionally be checked;
// merges can only proceed if a) the right-hand neighbor is beneath the minimum
// size threshold, and b) the merged range would not need to be immediately
// split, e.g. because the new range would exceed the maximum size threshold.
//
// Note that the merge queue is not capable of initiating all possible merges.
// Consider the example below:
//
//	/Table/51/1    /Table/51/2    /Table/52
//	   32MB            0MB          32MB
//
// The range beginning at /Table/51/2 is empty and would, ideally, be merged
// away. The range to its left, /Table/51/1, will not propose a merge because it
// is over the minimum size threshold. And /Table/51/2 will not propose a merge
// because the next range, /Table/52, is a new table and thus the split is
// mandatory.
//
// There are several ways to solve this. /Table/51/2 could look both left and
// right to find a merge partner, but discovering one's left neighbor is rather
// difficult and involves scanning the meta ranges. /Table/51/1 could propose a
// merge even though it's over the minimum size threshold, but this would result
// in a lot more RangeStats requests--essentially every range would send a
// RangeStats request on every scanner cycle.
//
// The current approach seems to be a nice balance of finding nearly all
// mergeable ranges without sending many RPCs. It has the additional nice
// property of not sending any RPCs to meta ranges until a merge is actually
// initiated.
type mergeQueue struct {
	*baseQueue
	db       *kv.DB
	purgChan <-chan time.Time
}

func newMergeQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *mergeQueue {
	mq := &mergeQueue{
		db:       db,
		purgChan: time.NewTicker(mergeQueuePurgatoryCheckInterval).C,
	}
	mq.baseQueue = newBaseQueue(
		"merge", mq, store, gossip,
		queueConfig{
			maxSize:        defaultQueueMaxSize,
			maxConcurrency: mergeQueueConcurrency,
			// TODO(ajwerner): Sometimes the merge queue needs to send multiple
			// snapshots, but the timeout function here is configured based on the
			// duration required to send a single snapshot. That being said, this
			// timeout provides leeway for snapshots to be 10x slower than the
			// specified rate and still respects the queue processing minimum timeout.
			// While using the below function is certainly better than just using the
			// default timeout, it would be better to have a function which takes into
			// account how many snapshots processing will need to send. That might be
			// hard to determine ahead of time. An alternative would be to calculate
			// the timeout with a function that additionally considers the replication
			// factor.
			processTimeoutFunc:   makeQueueSnapshotTimeoutFunc(rebalanceSnapshotRate),
			needsLease:           true,
			needsSystemConfig:    true,
			acceptsUnsplitRanges: false,
			successes:            store.metrics.MergeQueueSuccesses,
			failures:             store.metrics.MergeQueueFailures,
			pending:              store.metrics.MergeQueuePending,
			processingNanos:      store.metrics.MergeQueueProcessingNanos,
			purgatory:            store.metrics.MergeQueuePurgatory,
		},
	)
	return mq
}

func (mq *mergeQueue) enabled() bool {
	st := mq.store.ClusterSettings()
	return kvserverbase.MergeQueueEnabled.Get(&st.SV)
}

func (mq *mergeQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig,
) (shouldQ bool, priority float64) {
	if !mq.enabled() {
		return false, 0
	}

	desc := repl.Desc()

	if desc.EndKey.Equal(roachpb.RKeyMax) {
		// The last range has no right-hand neighbor to merge with.
		return false, 0
	}

	if sysCfg.NeedsSplit(desc.StartKey, desc.EndKey.Next()) {
		// This range would need to be split if it extended just one key further.
		// There is thus no possible right-hand neighbor that it could be merged
		// with.
		return false, 0
	}

	sizeRatio := float64(repl.GetMVCCStats().Total()) / float64(repl.GetMinBytes())
	if math.IsNaN(sizeRatio) || sizeRatio >= 1 {
		// This range is above the minimum size threshold. It does not need to be
		// merged.
		return false, 0
	}

	// Invert sizeRatio to compute the priority so that smaller ranges are merged
	// before larger ranges.
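	// For illustration (hypothetical numbers): a replica holding 4 MiB of data
	// against a 16 MiB minimum size threshold has sizeRatio 0.25 and is queued
	// with priority 0.75, ahead of a replica holding 12 MiB (priority 0.25).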
	priority = 1 - sizeRatio
	return true, priority
}

// rangeMergePurgatoryError wraps an error that occurs during merging to
// indicate that the error should send the range to purgatory.
type rangeMergePurgatoryError struct{ error }

func (rangeMergePurgatoryError) purgatoryErrorMarker() {}

var _ purgatoryError = rangeMergePurgatoryError{}

func (mq *mergeQueue) requestRangeStats(
	ctx context.Context, key roachpb.Key,
) (*roachpb.RangeDescriptor, enginepb.MVCCStats, float64, error) {
	res, pErr := kv.SendWrappedWith(ctx, mq.db.NonTransactionalSender(), roachpb.Header{
		ReturnRangeInfo: true,
	}, &roachpb.RangeStatsRequest{
		RequestHeader: roachpb.RequestHeader{Key: key},
	})
	if pErr != nil {
		return nil, enginepb.MVCCStats{}, 0, pErr.GoError()
	}
	rangeInfos := res.Header().RangeInfos
	if len(rangeInfos) != 1 {
		return nil, enginepb.MVCCStats{}, 0, fmt.Errorf(
			"mergeQueue.requestRangeStats: response had %d range infos but exactly one was expected",
			len(rangeInfos))
	}
	return &rangeInfos[0].Desc, res.(*roachpb.RangeStatsResponse).MVCCStats,
		res.(*roachpb.RangeStatsResponse).QueriesPerSecond, nil
}

func (mq *mergeQueue) process(
	ctx context.Context, lhsRepl *Replica, sysCfg *config.SystemConfig,
) error {
	if !mq.enabled() {
		log.VEventf(ctx, 2, "skipping merge: queue has been disabled")
		return nil
	}

	lhsStats := lhsRepl.GetMVCCStats()
	minBytes := lhsRepl.GetMinBytes()
	if lhsStats.Total() >= minBytes {
		log.VEventf(ctx, 2, "skipping merge: LHS meets minimum size threshold %d with %d bytes",
			minBytes, lhsStats.Total())
		return nil
	}

	lhsDesc := lhsRepl.Desc()
	lhsQPS := lhsRepl.GetSplitQPS()
	rhsDesc, rhsStats, rhsQPS, err := mq.requestRangeStats(ctx, lhsDesc.EndKey.AsRawKey())
	if err != nil {
		return err
	}
	if rhsStats.Total() >= minBytes {
		log.VEventf(ctx, 2, "skipping merge: RHS meets minimum size threshold %d with %d bytes",
			minBytes, rhsStats.Total())
		return nil
	}

	// The range was manually split and the sticky bit has not expired, so skip
	// merging.
	now := mq.store.Clock().Now()
	if now.Less(rhsDesc.GetStickyBit()) {
		log.VEventf(ctx, 2, "skipping merge: ranges were manually split and sticky bit was not expired")
		// TODO(jeffreyxiao): Consider returning a purgatory error to avoid
		// repeatedly processing ranges that cannot be merged.
		return nil
	}

	mergedDesc := &roachpb.RangeDescriptor{
		StartKey: lhsDesc.StartKey,
		EndKey:   rhsDesc.EndKey,
	}
	mergedStats := lhsStats
	mergedStats.Add(rhsStats)

	var mergedQPS float64
	if lhsRepl.SplitByLoadEnabled() {
		mergedQPS = lhsQPS + rhsQPS
	}

	// Check whether the merged range would need to be split; if so, skip the
	// merge. Use a lower threshold for load-based splitting so we don't find
	// ourselves in a situation where we keep merging ranges that would be split
	// soon after by a small increase in load.
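	// For example (hypothetical numbers): with a load-based split threshold of
	// 250 QPS, merges are skipped once the combined range is estimated to serve
	// 125 QPS or more, leaving headroom so a small increase in load does not
	// immediately split the freshly merged range.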
	conservativeLoadBasedSplitThreshold := 0.5 * lhsRepl.SplitByLoadQPSThreshold()
	shouldSplit, _ := shouldSplitRange(mergedDesc, mergedStats,
		lhsRepl.GetMaxBytes(), lhsRepl.shouldBackpressureWrites(), sysCfg)
	if shouldSplit || mergedQPS >= conservativeLoadBasedSplitThreshold {
		log.VEventf(ctx, 2,
			"skipping merge to avoid thrashing: merged range %s may split "+
				"(estimated size, estimated QPS: %d, %v)",
			mergedDesc, mergedStats.Total(), mergedQPS)
		return nil
	}

	{
		store := lhsRepl.store
		// AdminMerge errors if there is a learner or joint config on either
		// side, and AdminRelocateRange removes any learners from the range it
		// operates on. For the sake of obviousness, just fix this all upfront.
		var err error
		lhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, lhsDesc)
		if err != nil {
			log.VEventf(ctx, 2, `%v`, err)
			return err
		}

		rhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, rhsDesc)
		if err != nil {
			log.VEventf(ctx, 2, `%v`, err)
			return err
		}
	}
	lhsReplicas, rhsReplicas := lhsDesc.Replicas().All(), rhsDesc.Replicas().All()

	// Defensive sanity check that everything is now a voter.
	for i := range lhsReplicas {
		if lhsReplicas[i].GetType() != roachpb.VOTER_FULL {
			return errors.Errorf(`cannot merge non-voter replicas on lhs: %v`, lhsReplicas)
		}
	}
	for i := range rhsReplicas {
		if rhsReplicas[i].GetType() != roachpb.VOTER_FULL {
			return errors.Errorf(`cannot merge non-voter replicas on rhs: %v`, rhsReplicas)
		}
	}

	if !replicaSetsEqual(lhsReplicas, rhsReplicas) {
		var targets []roachpb.ReplicationTarget
		for _, lhsReplDesc := range lhsReplicas {
			targets = append(targets, roachpb.ReplicationTarget{
				NodeID: lhsReplDesc.NodeID, StoreID: lhsReplDesc.StoreID,
			})
		}
		// AdminRelocateRange moves the lease to the first target in the list, so
		// move the existing leaseholder to the front to leave the lease unchanged.
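		// For example (hypothetical stores): if the LHS replicas live on stores
		// s1, s2, and s3 and the leaseholder is on s3, targets is reordered to
		// [s3, s1, s2] before the relocation so that the lease does not move.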
		lease, _ := lhsRepl.GetLease()
		for i := range targets {
			if targets[i].NodeID == lease.Replica.NodeID && targets[i].StoreID == lease.Replica.StoreID {
				if i > 0 {
					targets[0], targets[i] = targets[i], targets[0]
				}
				break
			}
		}
		// TODO(benesch): RelocateRange can sometimes fail if it needs to move a
		// replica from one store to another store on the same node.
		if err := mq.store.DB().AdminRelocateRange(ctx, rhsDesc.StartKey, targets); err != nil {
			return err
		}
	}

	log.VEventf(ctx, 2, "merging to produce range: %s-%s", mergedDesc.StartKey, mergedDesc.EndKey)
	reason := fmt.Sprintf("lhs+rhs has (size=%s+%s=%s qps=%.2f+%.2f=%.2fqps) below threshold (size=%s, qps=%.2f)",
		humanizeutil.IBytes(lhsStats.Total()),
		humanizeutil.IBytes(rhsStats.Total()),
		humanizeutil.IBytes(mergedStats.Total()),
		lhsQPS,
		rhsQPS,
		mergedQPS,
		humanizeutil.IBytes(minBytes),
		conservativeLoadBasedSplitThreshold,
	)
	_, pErr := lhsRepl.AdminMerge(ctx, roachpb.AdminMergeRequest{
		RequestHeader: roachpb.RequestHeader{Key: lhsRepl.Desc().StartKey.AsRawKey()},
	}, reason)
	if err := pErr.GoError(); errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
		// ConditionFailedErrors are an expected outcome for range merge
		// attempts because merges can race with other descriptor modifications.
		// On seeing a ConditionFailedError, don't return an error and enqueue
		// this replica again in case it still needs to be merged.
		log.Infof(ctx, "merge saw concurrent descriptor modification; maybe retrying")
		mq.MaybeAddAsync(ctx, lhsRepl, now)
	} else if err != nil {
		// While range merges are unstable, be extra cautious and mark every error
		// as purgatory-worthy.
		return rangeMergePurgatoryError{err}
	}
	if testingAggressiveConsistencyChecks {
		if err := mq.store.consistencyQueue.process(ctx, lhsRepl, sysCfg); err != nil {
			log.Warningf(ctx, "%v", err)
		}
	}
	return nil
}

func (mq *mergeQueue) timer(time.Duration) time.Duration {
	return MergeQueueInterval.Get(&mq.store.ClusterSettings().SV)
}

func (mq *mergeQueue) purgatoryChan() <-chan time.Time {
	return mq.purgChan
}