github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/queue.go 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "container/heap" 15 "context" 16 "fmt" 17 "sync/atomic" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/config" 21 "github.com/cockroachdb/cockroach/pkg/gossip" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 23 "github.com/cockroachdb/cockroach/pkg/roachpb" 24 "github.com/cockroachdb/cockroach/pkg/settings" 25 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 26 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 27 "github.com/cockroachdb/cockroach/pkg/util/contextutil" 28 "github.com/cockroachdb/cockroach/pkg/util/hlc" 29 "github.com/cockroachdb/cockroach/pkg/util/log" 30 "github.com/cockroachdb/cockroach/pkg/util/metric" 31 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 32 "github.com/cockroachdb/cockroach/pkg/util/stop" 33 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 34 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 35 "github.com/cockroachdb/errors" 36 ) 37 38 const ( 39 // purgatoryReportInterval is the duration between reports on 40 // purgatory status. 41 purgatoryReportInterval = 10 * time.Minute 42 // defaultProcessTimeout is the timeout when processing a replica. 43 // The timeout prevents a queue from getting stuck on a replica. 44 // For example, a replica whose range is not reachable for quorum. 45 defaultProcessTimeout = 1 * time.Minute 46 // defaultQueueMaxSize is the default max size for a queue. 47 defaultQueueMaxSize = 10000 48 ) 49 50 // queueGuaranteedProcessingTimeBudget is the smallest amount of time before 51 // which the processing of a queue may time out. It is an escape hatch to raise 52 // the timeout for queues. 53 var queueGuaranteedProcessingTimeBudget = settings.RegisterDurationSetting( 54 "kv.queue.process.guaranteed_time_budget", 55 "the guaranteed duration before which the processing of a queue may "+ 56 "time out", 57 defaultProcessTimeout, 58 ) 59 60 func init() { 61 queueGuaranteedProcessingTimeBudget.SetVisibility(settings.Reserved) 62 } 63 64 func defaultProcessTimeoutFunc(cs *cluster.Settings, _ replicaInQueue) time.Duration { 65 return queueGuaranteedProcessingTimeBudget.Get(&cs.SV) 66 } 67 68 // The queues which send snapshots while processing should have a timeout which 69 // is a function of the size of the range and the maximum allowed rate of data 70 // transfer, subject to a minimum timeout specified in a cluster setting. 71 // 72 // The parameter controls which rate to use. 73 func makeQueueSnapshotTimeoutFunc(rateSetting *settings.ByteSizeSetting) queueProcessTimeoutFunc { 74 return func(cs *cluster.Settings, r replicaInQueue) time.Duration { 75 minimumTimeout := queueGuaranteedProcessingTimeBudget.Get(&cs.SV) 76 // NB: In production code this type assertion will always succeed. 77 // Some tests set up a fake implementation of replicaInQueue in which 78 // case we fall back to the configured minimum timeout.
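		// For illustration (hypothetical numbers, not taken from this code): with
		// roughly 512 MiB of MVCC data in the range and a configured snapshot rate
		// of 8 MiB/s, the estimated transfer takes 64s, so the timeout becomes
		// permittedSnapshotSlowdown * 64s = 640s; if that product fell below the
		// guaranteed minimum budget, the minimum would be used instead.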
79 repl, ok := r.(interface{ GetMVCCStats() enginepb.MVCCStats }) 80 if !ok { 81 return minimumTimeout 82 } 83 snapshotRate := rateSetting.Get(&cs.SV) 84 stats := repl.GetMVCCStats() 85 totalBytes := stats.KeyBytes + stats.ValBytes + stats.IntentBytes + stats.SysBytes 86 estimatedDuration := time.Duration(totalBytes/snapshotRate) * time.Second 87 timeout := estimatedDuration * permittedSnapshotSlowdown 88 if timeout < minimumTimeout { 89 timeout = minimumTimeout 90 } 91 return timeout 92 } 93 } 94 95 // permittedSnapshotSlowdown is the multiple of the estimated snapshot transfer 96 // duration (at the configured snapshot rate) that is used as the snapshot's 97 // processing timeout. 98 const permittedSnapshotSlowdown = 10 99 100 // A purgatoryError indicates a replica processing failure that allows the 101 // replica to be placed into purgatory for faster retries when the 102 // failure condition changes. 103 type purgatoryError interface { 104 error 105 purgatoryErrorMarker() // dummy method for unique interface 106 } 107 108 // processCallback is a hook that is called when a replica finishes processing. 109 // It is called with the result of the process attempt. 110 type processCallback func(error) 111 112 // A replicaItem holds a replica and metadata about its queue state and 113 // processing state. 114 type replicaItem struct { 115 rangeID roachpb.RangeID 116 replicaID roachpb.ReplicaID 117 seq int // enforce FIFO order for equal priorities 118 119 // fields used when a replicaItem is enqueued in a priority queue. 120 priority float64 121 index int // The index of the item in the heap, maintained by the heap.Interface methods 122 123 // fields used when a replicaItem is processing. 124 processing bool 125 requeue bool // enqueue again after processing? 126 callbacks []processCallback 127 } 128 129 // setProcessing moves the item from an enqueued state to a processing state. 130 func (i *replicaItem) setProcessing() { 131 i.priority = 0 132 if i.index >= 0 { 133 log.Fatalf(context.Background(), 134 "r%d marked as processing but appears in prioQ", i.rangeID, 135 ) 136 } 137 i.processing = true 138 } 139 140 // registerCallback adds a new callback to be executed when the replicaItem 141 // finishes processing. 142 func (i *replicaItem) registerCallback(cb processCallback) { 143 i.callbacks = append(i.callbacks, cb) 144 } 145 146 // A priorityQueue implements heap.Interface and holds replicaItems. 147 type priorityQueue struct { 148 seqGen int 149 sl []*replicaItem 150 } 151 152 func (pq priorityQueue) Len() int { return len(pq.sl) } 153 154 func (pq priorityQueue) Less(i, j int) bool { 155 a, b := pq.sl[i], pq.sl[j] 156 if a.priority == b.priority { 157 // When priorities are equal, we want the lower sequence number to show 158 // up first (FIFO). 159 return a.seq < b.seq 160 } 161 // We want Pop to give us the highest, not lowest, priority so we use greater than here.
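	// For example (illustration only): pushing items with (priority, seq) of
	// (3, 1), (1, 2), and (3, 3) yields the Pop order (3, 1), (3, 3), (1, 2):
	// higher priority wins, and equal priorities come out in insertion order.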
162 return a.priority > b.priority 163 } 164 165 func (pq priorityQueue) Swap(i, j int) { 166 pq.sl[i], pq.sl[j] = pq.sl[j], pq.sl[i] 167 pq.sl[i].index, pq.sl[j].index = i, j 168 } 169 170 func (pq *priorityQueue) Push(x interface{}) { 171 n := len(pq.sl) 172 item := x.(*replicaItem) 173 item.index = n 174 pq.seqGen++ 175 item.seq = pq.seqGen 176 pq.sl = append(pq.sl, item) 177 } 178 179 func (pq *priorityQueue) Pop() interface{} { 180 old := pq.sl 181 n := len(old) 182 item := old[n-1] 183 item.index = -1 // for safety 184 old[n-1] = nil // for gc 185 pq.sl = old[0 : n-1] 186 return item 187 } 188 189 // update modifies the priority of a replicaItem in the queue. 190 func (pq *priorityQueue) update(item *replicaItem, priority float64) { 191 item.priority = priority 192 if len(pq.sl) <= item.index || pq.sl[item.index] != item { 193 log.Fatalf(context.Background(), "updating item in heap that's not contained in it: %v", item) 194 } 195 heap.Fix(pq, item.index) 196 } 197 198 var ( 199 errQueueDisabled = errors.New("queue disabled") 200 errQueueStopped = errors.New("queue stopped") 201 ) 202 203 func isExpectedQueueError(err error) bool { 204 return err == nil || errors.Is(err, errQueueDisabled) 205 } 206 207 // shouldQueueAgain is a helper function to determine whether the 208 // replica should be queued according to the current time, the last 209 // time the replica was processed, and the minimum interval between 210 // successive processing. Specifying minInterval=0 queues all replicas. 211 // Returns a bool for whether to queue as well as a priority based 212 // on how long it's been since last processed. 213 func shouldQueueAgain(now, last hlc.Timestamp, minInterval time.Duration) (bool, float64) { 214 if minInterval == 0 || last == (hlc.Timestamp{}) { 215 return true, 0 216 } 217 if diff := now.GoTime().Sub(last.GoTime()); diff >= minInterval { 218 priority := float64(1) 219 // If there's a non-zero last processed timestamp, adjust the 220 // priority by a multiple of how long it's been since the last 221 // time this replica was processed. 222 if last != (hlc.Timestamp{}) { 223 priority = float64(diff.Nanoseconds()) / float64(minInterval.Nanoseconds()) 224 } 225 return true, priority 226 } 227 return false, 0 228 } 229 230 // replicaInQueue is the subset of *Replica required for interacting with queues. 231 // 232 // TODO(tbg): this interface is horrible, but this is what we do use at time of 233 // extraction. Establish a sane interface and use that. 234 type replicaInQueue interface { 235 AnnotateCtx(context.Context) context.Context 236 ReplicaID() roachpb.ReplicaID 237 StoreID() roachpb.StoreID 238 GetRangeID() roachpb.RangeID 239 IsInitialized() bool 240 IsDestroyed() (DestroyReason, error) 241 Desc() *roachpb.RangeDescriptor 242 maybeInitializeRaftGroup(context.Context) 243 redirectOnOrAcquireLease(context.Context) (kvserverpb.LeaseStatus, *roachpb.Error) 244 IsLeaseValid(roachpb.Lease, hlc.Timestamp) bool 245 GetLease() (roachpb.Lease, roachpb.Lease) 246 } 247 248 type queueImpl interface { 249 // shouldQueue accepts current time, a replica, and the system config 250 // and returns whether it should be queued and if so, at what priority. 251 // The Replica is guaranteed to be initialized. 252 shouldQueue( 253 context.Context, hlc.Timestamp, *Replica, *config.SystemConfig, 254 ) (shouldQueue bool, priority float64) 255 256 // process accepts a replica, and the system config and executes 257 // queue-specific work on it. The Replica is guaranteed to be initialized. 
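	// A hedged illustration of the error contract (all names hypothetical):
	//
	//   func (q *fooQueueImpl) process(ctx context.Context, repl *Replica, cfg *config.SystemConfig) error {
	//       if err := doFooWork(ctx, repl); errors.Is(err, errFooDependencyMissing) {
	//           return &fooPurgatoryError{cause: err} // parks the replica in purgatory
	//       } else if err != nil {
	//           return err // counted as a failure
	//       }
	//       return nil // counted as a success
	//   }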
258 process(context.Context, *Replica, *config.SystemConfig) error 259 260 // timer returns a duration to wait between processing the next item 261 // from the queue. The duration of the last processing of a replica 262 // is supplied as an argument. If no replicas have finished processing 263 // yet, this can be 0. 264 timer(time.Duration) time.Duration 265 266 // purgatoryChan returns a channel that is signaled with the current 267 // time when it's time to retry replicas which have been relegated to 268 // purgatory due to failures. If purgatoryChan returns nil, failing 269 // replicas are not sent to purgatory. 270 purgatoryChan() <-chan time.Time 271 } 272 273 // queueProcessTimeoutFunc controls the timeout for queue processing for a 274 // replicaInQueue. 275 type queueProcessTimeoutFunc func(*cluster.Settings, replicaInQueue) time.Duration 276 277 type queueConfig struct { 278 // maxSize is the maximum number of replicas to queue. 279 maxSize int 280 // maxConcurrency is the maximum number of replicas that can be processed 281 // concurrently. If not set, defaults to 1. 282 maxConcurrency int 283 addOrMaybeAddSemSize int 284 // needsLease controls whether this queue requires the range lease to operate 285 // on a replica. If so, one will be acquired if necessary. Many queues set 286 // needsLease not because they literally need a lease, but because they work 287 // on a range level and use it to ensure that only one node in the cluster 288 // processes that range. 289 needsLease bool 290 // needsRaftInitialized controls whether the Raft group will be initialized 291 // (if not already initialized) when deciding whether to process this 292 // replica. 293 needsRaftInitialized bool 294 // needsSystemConfig controls whether this queue requires a valid copy of the 295 // system config to operate on a replica. Not all queues require it, and it's 296 // unsafe for certain queues to wait on it. For example, a raft snapshot may 297 // be needed in order to make it possible for the system config to become 298 // available (as observed in #16268), so the raft snapshot queue can't 299 // require the system config to already be available. 300 needsSystemConfig bool 301 // acceptsUnsplitRanges controls whether this queue can process ranges that 302 // need to be split due to zone config settings. Ranges are checked before 303 // calling queueImpl.shouldQueue and queueImpl.process. 304 // This is to avoid giving the queue a replica that spans multiple config 305 // zones (which might make the action of the queue ambiguous - e.g. we don't 306 // want to try to replicate a range until we know which zone it is in and 307 // therefore how many replicas are required). 308 acceptsUnsplitRanges bool 309 // processDestroyedReplicas controls whether or not we want to process replicas 310 // that have been destroyed but not GCed. 311 processDestroyedReplicas bool 312 // processTimeout returns the timeout for processing a replica. 313 processTimeoutFunc queueProcessTimeoutFunc 314 // successes is a counter of replicas processed successfully. 315 successes *metric.Counter 316 // failures is a counter of replicas which failed processing. 317 failures *metric.Counter 318 // pending is a gauge measuring current replica count pending. 319 pending *metric.Gauge 320 // processingNanos is a counter measuring total nanoseconds spent processing replicas. 321 processingNanos *metric.Counter 322 // purgatory is a gauge measuring current replica count in purgatory. 
323 purgatory *metric.Gauge 324 } 325 326 // baseQueue is the base implementation of the replicaQueue interface. Queue 327 // implementations should embed a baseQueue and implement queueImpl. 328 // 329 // A queue contains replicas in one of three stages: queued, processing, and 330 // purgatory. A "queued" replica is waiting for processing with some priority 331 // that was selected when it was added. A "processing" replica is actively being 332 // worked on by the queue, which delegates to the queueImpl's `process` method. 333 // Replicas are selected from the queue for processing purely in priority order. 334 // A "purgatory" replica has been marked by the queue implementation as 335 // temporarily uninteresting and it will not be processed again until some 336 // queue-specific event occurs. Not every queue has a purgatory. 337 // 338 // Generally, replicas are added to a queue by a replicaScanner, which is a 339 // Store-level object. The scanner is configured with a set of queues (which in 340 // practice is all of the queues) and will repeatedly iterate through every 341 // replica on the store at a measured pace, handing each replica to every 342 // queueImpl's `shouldQueue` method. This method is implemented differently by 343 // each queue and decides whether the replica is currently interesting. If so, 344 // it also selects a priority. Note that queues have a bounded size controlled 345 // by the `maxSize` config option, which means the ones with lowest priority may 346 // be dropped if processing cannot keep up and the queue fills. 347 // 348 // Replicas are added asynchronously through `MaybeAddAsync` or `AddAsync`. 349 // MaybeAddAsync checks the various requirements selected by the queue config 350 // (needsSystemConfig, needsLease, acceptsUnsplitRanges) as well as the 351 // queueImpl's `shouldQueue`. AddAsync does not check any of this and accepts a 352 // priority directly instead of getting it from `shouldQueue`. These methods run 353 // with a shared maximum concurrency of `addOrMaybeAddSemSize`. If the maximum 354 // concurrency is reached, MaybeAddAsync will silently drop the replica but 355 // AddAsync will block. 356 // 357 // Synchronous replica addition is intentionally not part of the public 358 // interface. Many queue impl's "processing" work functions acquire various 359 // locks on Replica, so it would be too easy for a callsite of such a method to 360 // deadlock. See #36413 for context. Additionally, the queues themselves process 361 // asynchronously and the bounded size means what you add isn't guaranteed to be 362 // processed, so the exclusive-async contract just forces callers to realize 363 // this early. 364 // 365 // Processing is rate limited by the queueImpl's `timer` which receives the 366 // amount of time it took to process the previous replica and returns the 367 // amount of time to wait before processing the next one. A bounded amount of 368 // processing concurrency is allowed, which is controlled by the 369 // `maxConcurrency` option in the queue's configuration. If a replica is added 370 // while being processed, it's requeued after the processing finishes. 371 // 372 // Note that all sorts of things can change between when a replica is enqueued 373 // and when it is processed, so the queue makes sure to grab the latest one 374 // right before processing by looking up the current replica with the same 375 // RangeID. This replica could be gone or, in extreme cases, could have been
Implementors need to be 377 // resilient to this. 378 // 379 // A queueImpl can opt into a purgatory by returning a non-nil channel from the 380 // `purgatoryChan` method. A replica is put into purgatory when the `process` 381 // method returns an error with a `purgatoryError` as an entry somewhere in the 382 // `Cause` chain. A replica in purgatory is not processed again until the 383 // channel is signaled, at which point every replica in purgatory is immediately 384 // processed. This catchup is run without the `timer` rate limiting but shares 385 // the same `maxConcurrency` semaphore as regular processing. Note that if a 386 // purgatory replica is pushed out of a full queue, it's also removed from 387 // purgatory. Replicas in purgatory count against the max queue size. 388 // 389 // After construction a queue needs to be `Start`ed, which spawns a goroutine to 390 // continually pop the "queued" replica with the highest priority and process 391 // it. In practice, this is done by the same replicaScanner that adds replicas. 392 type baseQueue struct { 393 log.AmbientContext 394 395 name string 396 getReplica func(roachpb.RangeID) (replicaInQueue, error) 397 // The constructor of the queueImpl structure MUST return a pointer. 398 // This is because assigning queueImpl to a function-local, then 399 // passing a pointer to it to `newBaseQueue`, and then returning it 400 // from the constructor function will return a queueImpl containing 401 // a pointer to a structure which is a copy of the one within which 402 // it is contained. DANGER. 403 impl queueImpl 404 store *Store 405 gossip *gossip.Gossip 406 queueConfig 407 incoming chan struct{} // Channel signaled when a new replica is added to the queue. 408 processSem chan struct{} 409 addOrMaybeAddSem *quotapool.IntPool // for {Maybe,}AddAsync 410 addLogN log.EveryN // avoid log spam when addSem, addOrMaybeAddSemSize are maxed out 411 processDur int64 // accessed atomically 412 mu struct { 413 syncutil.Mutex // Protects all variables in the mu struct 414 replicas map[roachpb.RangeID]*replicaItem // Map from RangeID to replicaItem 415 priorityQ priorityQueue // The priority queue 416 purgatory map[roachpb.RangeID]purgatoryError // Map of replicas to processing errors 417 stopped bool 418 // Some tests in this package disable queues. 419 disabled bool 420 } 421 } 422 423 // newBaseQueue returns a new instance of baseQueue with the specified 424 // shouldQueue function to determine which replicas to queue and maxSize to 425 // limit the growth of the queue. Note that maxSize doesn't prevent new 426 // replicas from being added, it just limits the total size. Higher priority 427 // replicas can still be added; their addition simply removes the lowest 428 // priority replica. 429 func newBaseQueue( 430 name string, impl queueImpl, store *Store, gossip *gossip.Gossip, cfg queueConfig, 431 ) *baseQueue { 432 // Use the default process timeout if none specified. 433 if cfg.processTimeoutFunc == nil { 434 cfg.processTimeoutFunc = defaultProcessTimeoutFunc 435 } 436 if cfg.maxConcurrency == 0 { 437 cfg.maxConcurrency = 1 438 } 439 // NB: addOrMaybeAddSemSize coupled with tight scanner intervals in tests 440 // unfortunately bogs down the race build if they are increased too much.
441 if cfg.addOrMaybeAddSemSize == 0 { 442 cfg.addOrMaybeAddSemSize = 20 443 } 444 445 ambient := store.cfg.AmbientCtx 446 ambient.AddLogTag(name, nil) 447 448 if !cfg.acceptsUnsplitRanges && !cfg.needsSystemConfig { 449 log.Fatalf(ambient.AnnotateCtx(context.Background()), 450 "misconfigured queue: acceptsUnsplitRanges=false requires needsSystemConfig=true; got %+v", cfg) 451 } 452 453 bq := baseQueue{ 454 AmbientContext: ambient, 455 name: name, 456 impl: impl, 457 store: store, 458 gossip: gossip, 459 queueConfig: cfg, 460 incoming: make(chan struct{}, 1), 461 processSem: make(chan struct{}, cfg.maxConcurrency), 462 addOrMaybeAddSem: quotapool.NewIntPool("queue-add", uint64(cfg.addOrMaybeAddSemSize)), 463 addLogN: log.Every(5 * time.Second), 464 getReplica: func(id roachpb.RangeID) (replicaInQueue, error) { 465 repl, err := store.GetReplica(id) 466 if repl == nil || err != nil { 467 // Don't return (*Replica)(nil) as replicaInQueue or NPEs will 468 // ensue. 469 return nil, err 470 } 471 return repl, err 472 }, 473 } 474 bq.mu.replicas = map[roachpb.RangeID]*replicaItem{} 475 476 return &bq 477 } 478 479 // Name returns the name of the queue. 480 func (bq *baseQueue) Name() string { 481 return bq.name 482 } 483 484 // NeedsLease returns whether the queue requires a replica to be leaseholder. 485 func (bq *baseQueue) NeedsLease() bool { 486 return bq.needsLease 487 } 488 489 // Length returns the current size of the queue. 490 func (bq *baseQueue) Length() int { 491 bq.mu.Lock() 492 defer bq.mu.Unlock() 493 return bq.mu.priorityQ.Len() 494 } 495 496 // PurgatoryLength returns the current size of purgatory. 497 func (bq *baseQueue) PurgatoryLength() int { 498 // Lock processing while measuring the purgatory length. This ensures that 499 // no purgatory replicas are concurrently being processed, during which time 500 // they are removed from bq.mu.purgatory even though they may be re-added. 501 defer bq.lockProcessing()() 502 503 bq.mu.Lock() 504 defer bq.mu.Unlock() 505 return len(bq.mu.purgatory) 506 } 507 508 // SetDisabled turns queue processing off or on as directed. 509 func (bq *baseQueue) SetDisabled(disabled bool) { 510 bq.mu.Lock() 511 bq.mu.disabled = disabled 512 bq.mu.Unlock() 513 } 514 515 // lockProcessing locks all processing in the baseQueue. It returns 516 // a function to unlock processing. 517 func (bq *baseQueue) lockProcessing() func() { 518 semCount := cap(bq.processSem) 519 520 // Drain process semaphore. 521 for i := 0; i < semCount; i++ { 522 bq.processSem <- struct{}{} 523 } 524 525 return func() { 526 // Populate process semaphore. 527 for i := 0; i < semCount; i++ { 528 <-bq.processSem 529 } 530 } 531 } 532 533 // Start launches a goroutine to process entries in the queue. The 534 // provided stopper is used to finish processing. 
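// A hedged usage sketch (illustrative only; impl, store, g, stopper, ctx, and
// repl are assumed to be in scope, and the metrics fields of queueConfig are
// elided):
//
//	bq := newBaseQueue("example", impl, store, g, queueConfig{
//		maxSize:              defaultQueueMaxSize,
//		needsSystemConfig:    true,
//		acceptsUnsplitRanges: true,
//	})
//	bq.Start(stopper)
//	bq.MaybeAddAsync(ctx, repl, store.Clock().Now())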
535 func (bq *baseQueue) Start(stopper *stop.Stopper) { 536 bq.processLoop(stopper) 537 } 538 539 type baseQueueHelper struct { 540 bq *baseQueue 541 } 542 543 func (h baseQueueHelper) MaybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) { 544 h.bq.maybeAdd(ctx, repl, now) 545 } 546 547 func (h baseQueueHelper) Add(ctx context.Context, repl replicaInQueue, prio float64) { 548 _, err := h.bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), prio) 549 if err != nil && log.V(1) { 550 log.Infof(ctx, "during Add: %s", err) 551 } 552 } 553 554 type queueHelper interface { 555 MaybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) 556 Add(ctx context.Context, repl replicaInQueue, prio float64) 557 } 558 559 // Async is a more performant substitute for calling AddAsync or MaybeAddAsync 560 // when many operations are going to be carried out. It invokes the given helper 561 // function in a goroutine if semaphore capacity is available. If the semaphore 562 // is not available, the 'wait' parameter decides whether to wait or to return 563 // as a noop. Note that if the system is quiescing, fn may never be called, 564 // independent of the value of 'wait'. 565 // 566 // The caller is responsible for ensuring that opName does not contain PII. 567 // (Best is to pass a constant string.) 568 func (bq *baseQueue) Async( 569 ctx context.Context, opName string, wait bool, fn func(ctx context.Context, h queueHelper), 570 ) { 571 if log.V(3) { 572 log.InfofDepth(ctx, 2, "%s", log.Safe(opName)) 573 } 574 opName += " (" + bq.name + ")" 575 if err := bq.store.stopper.RunLimitedAsyncTask(context.Background(), opName, bq.addOrMaybeAddSem, wait, 576 func(ctx context.Context) { 577 fn(ctx, baseQueueHelper{bq}) 578 }); err != nil && bq.addLogN.ShouldLog() { 579 log.Infof(ctx, "rate limited in %s: %s", log.Safe(opName), err) 580 } 581 } 582 583 // MaybeAddAsync offers the replica to the queue. The queue will only process a 584 // certain number of these operations concurrently, and will drop (i.e. treat as 585 // a noop) any additional calls. 586 func (bq *baseQueue) MaybeAddAsync(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) { 587 bq.Async(ctx, "MaybeAdd", false /* wait */, func(ctx context.Context, h queueHelper) { 588 h.MaybeAdd(ctx, repl, now) 589 }) 590 } 591 592 // AddAsync adds the replica to the queue. Unlike MaybeAddAsync, it will wait 593 // for other operations to finish instead of turning into a noop (because 594 // unlike MaybeAdd, Add is not subject to being called opportunistically). 595 func (bq *baseQueue) AddAsync(ctx context.Context, repl replicaInQueue, prio float64) { 596 bq.Async(ctx, "Add", true /* wait */, func(ctx context.Context, h queueHelper) { 597 h.Add(ctx, repl, prio) 598 }) 599 } 600 601 func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) { 602 ctx = repl.AnnotateCtx(ctx) 603 // Load the system config if it's needed. 604 var cfg *config.SystemConfig 605 if bq.needsSystemConfig { 606 cfg = bq.gossip.GetSystemConfig() 607 if cfg == nil { 608 if log.V(1) { 609 log.Infof(ctx, "no system config available.
skipping") 610 } 611 return 612 } 613 } 614 615 bq.mu.Lock() 616 stopped := bq.mu.stopped || bq.mu.disabled 617 bq.mu.Unlock() 618 619 if stopped { 620 return 621 } 622 623 if !repl.IsInitialized() { 624 return 625 } 626 627 if bq.needsRaftInitialized { 628 repl.maybeInitializeRaftGroup(ctx) 629 } 630 631 if cfg != nil && bq.requiresSplit(cfg, repl) { 632 // Range needs to be split due to zone configs, but queue does 633 // not accept unsplit ranges. 634 if log.V(1) { 635 log.Infof(ctx, "split needed; not adding") 636 } 637 return 638 } 639 640 if bq.needsLease { 641 // Check to see if either we own the lease or do not know who the lease 642 // holder is. 643 if lease, _ := repl.GetLease(); repl.IsLeaseValid(lease, now) && 644 !lease.OwnedBy(repl.StoreID()) { 645 if log.V(1) { 646 log.Infof(ctx, "needs lease; not adding: %+v", lease) 647 } 648 return 649 } 650 } 651 // NB: in production code, this type assertion is always true. In tests, 652 // it may not be and shouldQueue will be passed a nil realRepl. These tests 653 // know what they're getting into so that's fine. 654 realRepl, _ := repl.(*Replica) 655 should, priority := bq.impl.shouldQueue(ctx, now, realRepl, cfg) 656 if !should { 657 return 658 } 659 if _, err := bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), priority); !isExpectedQueueError(err) { 660 log.Errorf(ctx, "unable to add: %+v", err) 661 } 662 } 663 664 func (bq *baseQueue) requiresSplit(cfg *config.SystemConfig, repl replicaInQueue) bool { 665 if bq.acceptsUnsplitRanges { 666 return false 667 } 668 desc := repl.Desc() 669 return cfg.NeedsSplit(desc.StartKey, desc.EndKey) 670 } 671 672 // addInternal adds the replica the queue with specified priority. If 673 // the replica is already queued at a lower priority, updates the existing 674 // priority. Expects the queue lock to be held by caller. 675 func (bq *baseQueue) addInternal( 676 ctx context.Context, desc *roachpb.RangeDescriptor, replicaID roachpb.ReplicaID, priority float64, 677 ) (bool, error) { 678 // NB: this is intentionally outside of bq.mu to avoid having to consider 679 // lock ordering constraints. 680 if !desc.IsInitialized() { 681 // We checked this above in MaybeAdd(), but we need to check it 682 // again for Add(). 683 return false, errors.New("replica not initialized") 684 } 685 686 bq.mu.Lock() 687 defer bq.mu.Unlock() 688 689 if bq.mu.stopped { 690 return false, errQueueStopped 691 } 692 693 if bq.mu.disabled { 694 if log.V(3) { 695 log.Infof(ctx, "queue disabled") 696 } 697 return false, errQueueDisabled 698 } 699 700 // If the replica is currently in purgatory, don't re-add it. 701 if _, ok := bq.mu.purgatory[desc.RangeID]; ok { 702 return false, nil 703 } 704 705 item, ok := bq.mu.replicas[desc.RangeID] 706 if ok { 707 // Replica is already processing. Mark to be requeued. 708 if item.processing { 709 wasRequeued := item.requeue 710 item.requeue = true 711 return !wasRequeued, nil 712 } 713 714 // Replica has already been added but at a lower priority; update priority. 715 // Don't lower it since the previous queuer may have known more than this 716 // one does. 
717 if priority > item.priority { 718 if log.V(1) { 719 log.Infof(ctx, "updating priority: %0.3f -> %0.3f", item.priority, priority) 720 } 721 bq.mu.priorityQ.update(item, priority) 722 } 723 return false, nil 724 } 725 726 if log.V(3) { 727 log.Infof(ctx, "adding: priority=%0.3f", priority) 728 } 729 item = &replicaItem{rangeID: desc.RangeID, replicaID: replicaID, priority: priority} 730 bq.addLocked(item) 731 732 // If adding this replica has pushed the queue past its maximum size, 733 // remove the lowest priority element. 734 if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize { 735 bq.removeLocked(bq.mu.priorityQ.sl[pqLen-1]) 736 } 737 // Signal the processLoop that a replica has been added. 738 select { 739 case bq.incoming <- struct{}{}: 740 default: 741 // No need to signal again. 742 } 743 return true, nil 744 } 745 746 // MaybeAddCallback adds a callback to be called when the specified range 747 // finishes processing if the range is in the queue. If the range is in 748 // purgatory, the callback is called immediately with the purgatory error. If 749 // the range is not in the queue (either waiting or processing), the method 750 // returns false. 751 // 752 // NB: If the replica this attaches to is dropped from an overfull queue, this 753 // callback is never called. This is surprising, but the single caller of this 754 // is okay with these semantics. Adding new uses is discouraged without cleaning 755 // up the contract of this method, but this code doesn't lend itself readily to 756 // upholding invariants so there may need to be some cleanup first. 757 func (bq *baseQueue) MaybeAddCallback(rangeID roachpb.RangeID, cb processCallback) bool { 758 bq.mu.Lock() 759 defer bq.mu.Unlock() 760 761 if purgatoryErr, ok := bq.mu.purgatory[rangeID]; ok { 762 cb(purgatoryErr) 763 return true 764 } 765 if item, ok := bq.mu.replicas[rangeID]; ok { 766 item.registerCallback(cb) 767 return true 768 } 769 return false 770 } 771 772 // MaybeRemove removes the specified replica from the queue if enqueued. 773 func (bq *baseQueue) MaybeRemove(rangeID roachpb.RangeID) { 774 bq.mu.Lock() 775 defer bq.mu.Unlock() 776 777 if bq.mu.stopped { 778 return 779 } 780 781 if item, ok := bq.mu.replicas[rangeID]; ok { 782 ctx := bq.AnnotateCtx(context.TODO()) 783 if log.V(3) { 784 log.Infof(ctx, "%s: removing", item.rangeID) 785 } 786 bq.removeLocked(item) 787 } 788 } 789 790 // processLoop processes the entries in the queue until the provided 791 // stopper signals exit. 792 func (bq *baseQueue) processLoop(stopper *stop.Stopper) { 793 ctx := bq.AnnotateCtx(context.Background()) 794 stopper.RunWorker(ctx, func(ctx context.Context) { 795 defer func() { 796 bq.mu.Lock() 797 bq.mu.stopped = true 798 bq.mu.Unlock() 799 }() 800 801 // nextTime is initially nil; we don't start any timers until the queue 802 // becomes non-empty. 803 var nextTime <-chan time.Time 804 805 immediately := make(chan time.Time) 806 close(immediately) 807 808 for { 809 select { 810 // Exit on stopper. 811 case <-stopper.ShouldStop(): 812 return 813 814 // Incoming signal sets the next time to process if there were previously 815 // no replicas in the queue. 816 case <-bq.incoming: 817 if nextTime == nil { 818 // When a replica is added, wake up immediately. This is mainly 819 // to facilitate testing without unnecessary sleeps. 820 nextTime = immediately 821 822 // In case we're in a test, still block on the impl. 823 bq.impl.timer(0) 824 } 825 // Process replicas as the timer expires. 
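			// For example (illustration only): a timer that returns 100ms paces each
			// worker at roughly ten replicas per second, while a return value of 0
			// makes the loop pick up the next replica immediately (nextTime is set
			// to the already-closed `immediately` channel above).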
826 case <-nextTime: 827 // Acquire from the process semaphore. 828 bq.processSem <- struct{}{} 829 830 repl := bq.pop() 831 if repl != nil { 832 annotatedCtx := repl.AnnotateCtx(ctx) 833 if stopper.RunAsyncTask( 834 annotatedCtx, fmt.Sprintf("storage.%s: processing replica", bq.name), 835 func(ctx context.Context) { 836 // Release semaphore when finished processing. 837 defer func() { <-bq.processSem }() 838 839 start := timeutil.Now() 840 err := bq.processReplica(ctx, repl) 841 842 duration := timeutil.Since(start) 843 bq.recordProcessDuration(ctx, duration) 844 845 bq.finishProcessingReplica(ctx, stopper, repl, err) 846 }) != nil { 847 // Release semaphore on task failure. 848 <-bq.processSem 849 return 850 } 851 } else { 852 // Release semaphore if no replicas were available. 853 <-bq.processSem 854 } 855 856 if bq.Length() == 0 { 857 nextTime = nil 858 } else { 859 // lastDur will be 0 after the first processing attempt. 860 lastDur := bq.lastProcessDuration() 861 switch t := bq.impl.timer(lastDur); t { 862 case 0: 863 nextTime = immediately 864 default: 865 nextTime = time.After(t) 866 } 867 } 868 } 869 } 870 }) 871 } 872 873 // lastProcessDuration returns the duration of the last processing attempt. 874 func (bq *baseQueue) lastProcessDuration() time.Duration { 875 return time.Duration(atomic.LoadInt64(&bq.processDur)) 876 } 877 878 // recordProcessDuration records the duration of a processing run. 879 func (bq *baseQueue) recordProcessDuration(ctx context.Context, dur time.Duration) { 880 if log.V(2) { 881 log.Infof(ctx, "done %s", dur) 882 } 883 bq.processingNanos.Inc(dur.Nanoseconds()) 884 atomic.StoreInt64(&bq.processDur, int64(dur)) 885 } 886 887 // processReplica processes a single replica. This should not be 888 // called externally to the queue. bq.mu.Lock must not be held 889 // while calling this method. 890 // 891 // ctx should already be annotated by repl.AnnotateCtx(). 892 func (bq *baseQueue) processReplica(ctx context.Context, repl replicaInQueue) error { 893 // Load the system config if it's needed. 894 var cfg *config.SystemConfig 895 if bq.needsSystemConfig { 896 cfg = bq.gossip.GetSystemConfig() 897 if cfg == nil { 898 log.VEventf(ctx, 1, "no system config available. skipping") 899 return nil 900 } 901 } 902 903 if cfg != nil && bq.requiresSplit(cfg, repl) { 904 // Range needs to be split due to zone configs, but queue does 905 // not accept unsplit ranges. 906 log.VEventf(ctx, 3, "split needed; skipping") 907 return nil 908 } 909 910 ctx, span := bq.AnnotateCtxWithSpan(ctx, bq.name) 911 defer span.Finish() 912 return contextutil.RunWithTimeout(ctx, fmt.Sprintf("%s queue process replica %d", bq.name, repl.GetRangeID()), 913 bq.processTimeoutFunc(bq.store.ClusterSettings(), repl), func(ctx context.Context) error { 914 log.VEventf(ctx, 1, "processing replica") 915 916 if !repl.IsInitialized() { 917 // We checked this when adding the replica, but we need to check it again 918 // in case this is a different replica with the same range ID (see #14193). 919 // This is possible in the case where the replica was enqueued while not 920 // having a replica ID, perhaps due to a pre-emptive snapshot, and has 921 // since been removed and re-added at a different replica ID. 
922 return errors.New("cannot process uninitialized replica") 923 } 924 925 if reason, err := repl.IsDestroyed(); err != nil { 926 if !bq.queueConfig.processDestroyedReplicas || reason == destroyReasonRemoved { 927 log.VEventf(ctx, 3, "replica destroyed (%s); skipping", err) 928 return nil 929 } 930 } 931 932 // If the queue requires a replica to have the range lease in 933 // order to be processed, check whether this replica has the range lease 934 // and renew or acquire it if necessary. 935 if bq.needsLease { 936 if _, pErr := repl.redirectOnOrAcquireLease(ctx); pErr != nil { 937 switch v := pErr.GetDetail().(type) { 938 case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError: 939 log.VEventf(ctx, 3, "%s; skipping", v) 940 return nil 941 default: 942 log.VErrEventf(ctx, 2, "could not obtain lease: %s", pErr) 943 return errors.Wrapf(pErr.GoError(), "%s: could not obtain lease", repl) 944 } 945 } 946 } 947 948 log.VEventf(ctx, 3, "processing...") 949 // NB: in production code, this type assertion is always true. In tests, 950 // it may not be and process will be passed a nil realRepl. These tests 951 // know what they're getting into so that's fine. 952 realRepl, _ := repl.(*Replica) 953 if err := bq.impl.process(ctx, realRepl, cfg); err != nil { 954 return err 955 } 956 log.VEventf(ctx, 3, "processing... done") 957 bq.successes.Inc(1) 958 return nil 959 }) 960 } 961 962 type benignError struct { 963 cause error 964 } 965 966 func (be *benignError) Error() string { return be.cause.Error() } 967 func (be *benignError) Cause() error { return be.cause } 968 969 func isBenign(err error) bool { 970 return errors.HasType(err, (*benignError)(nil)) 971 } 972 973 func isPurgatoryError(err error) (purgatoryError, bool) { 974 var purgErr purgatoryError 975 return purgErr, errors.As(err, &purgErr) 976 } 977 978 // assertInvariants codifies the guarantees upheld by the data structures in the 979 // base queue. In summary, a replica is one of: 980 // - "queued" and in mu.replicas and mu.priorityQ 981 // - "processing" and only in mu.replicas 982 // - "purgatory" and in mu.replicas and mu.purgatory 983 // 984 // Note that in particular, nothing is ever in both mu.priorityQ and 985 // mu.purgatory. 986 func (bq *baseQueue) assertInvariants() { 987 bq.mu.Lock() 988 defer bq.mu.Unlock() 989 990 ctx := bq.AnnotateCtx(context.Background()) 991 for _, item := range bq.mu.priorityQ.sl { 992 if item.processing { 993 log.Fatalf(ctx, "processing item found in prioQ: %v", item) 994 } 995 if _, inReplicas := bq.mu.replicas[item.rangeID]; !inReplicas { 996 log.Fatalf(ctx, "item found in prioQ but not in mu.replicas: %v", item) 997 } 998 if _, inPurg := bq.mu.purgatory[item.rangeID]; inPurg { 999 log.Fatalf(ctx, "item found in prioQ and purgatory: %v", item) 1000 } 1001 } 1002 for rangeID := range bq.mu.purgatory { 1003 item, inReplicas := bq.mu.replicas[rangeID] 1004 if !inReplicas { 1005 log.Fatalf(ctx, "item found in purg but not in mu.replicas: %v", item) 1006 } 1007 if item.processing { 1008 log.Fatalf(ctx, "processing item found in purgatory: %v", item) 1009 } 1010 // NB: we already checked above that the item is not in prioQ. 1011 } 1012 1013 // At this point we know that the purgatory and prioQ contents are distinct, and we 1014 // also know that no processing replicas are tracked in each. Let's check 1015 // that there aren't any non-processing replicas *only* in bq.mu.replicas.
1016 var nNotProcessing int 1017 for _, item := range bq.mu.replicas { 1018 if !item.processing { 1019 nNotProcessing++ 1020 } 1021 } 1022 if nNotProcessing != len(bq.mu.purgatory)+len(bq.mu.priorityQ.sl) { 1023 log.Fatalf(ctx, "have %d non-processing replicas in mu.replicas, "+ 1024 "but %d in purgatory and %d in prioQ; the latter two should add up"+ 1025 "to the former", nNotProcessing, len(bq.mu.purgatory), len(bq.mu.priorityQ.sl)) 1026 } 1027 } 1028 1029 // finishProcessingReplica handles the completion of a replica process attempt. 1030 // It removes the replica from the replica set and may re-enqueue the replica or 1031 // add it to purgatory. 1032 func (bq *baseQueue) finishProcessingReplica( 1033 ctx context.Context, stopper *stop.Stopper, repl replicaInQueue, err error, 1034 ) { 1035 bq.mu.Lock() 1036 // Remove item from replica set completely. We may add it 1037 // back in down below. 1038 item := bq.mu.replicas[repl.GetRangeID()] 1039 processing := item.processing 1040 callbacks := item.callbacks 1041 requeue := item.requeue 1042 item.callbacks = nil 1043 bq.removeFromReplicaSetLocked(repl.GetRangeID()) 1044 item = nil // prevent accidental use below 1045 bq.mu.Unlock() 1046 1047 if !processing { 1048 log.Fatalf(ctx, "%s: attempt to remove non-processing replica %v", bq.name, repl) 1049 } 1050 1051 // Call any registered callbacks. 1052 for _, cb := range callbacks { 1053 cb(err) 1054 } 1055 1056 // Handle failures. 1057 if err != nil { 1058 benign := isBenign(err) 1059 1060 // Increment failures metric. 1061 // 1062 // TODO(tschottdorf): once we start asserting zero failures in tests 1063 // (and production), move benign failures into a dedicated category. 1064 bq.failures.Inc(1) 1065 1066 // Determine whether a failure is a purgatory error. If it is, add 1067 // the failing replica to purgatory. Note that even if the item was 1068 // scheduled to be requeued, we ignore this if we add the replica to 1069 // purgatory. 1070 if purgErr, ok := isPurgatoryError(err); ok { 1071 bq.mu.Lock() 1072 bq.addToPurgatoryLocked(ctx, stopper, repl, purgErr) 1073 bq.mu.Unlock() 1074 return 1075 } 1076 1077 // If not a benign or purgatory error, log. 1078 if !benign { 1079 log.Errorf(ctx, "%v", err) 1080 } 1081 } 1082 1083 // Maybe add replica back into queue, if requested. 1084 if requeue { 1085 bq.maybeAdd(ctx, repl, bq.store.Clock().Now()) 1086 } 1087 } 1088 1089 // addToPurgatoryLocked adds the specified replica to the purgatory queue, which 1090 // holds replicas which have failed processing. 1091 func (bq *baseQueue) addToPurgatoryLocked( 1092 ctx context.Context, stopper *stop.Stopper, repl replicaInQueue, purgErr purgatoryError, 1093 ) { 1094 bq.mu.AssertHeld() 1095 1096 // Check whether the queue supports purgatory errors. If not then something 1097 // went wrong because a purgatory error should not have ended up here. 1098 if bq.impl.purgatoryChan() == nil { 1099 log.Errorf(ctx, "queue does not support purgatory errors, but saw %v", purgErr) 1100 return 1101 } 1102 1103 if log.V(1) { 1104 log.Infof(ctx, "purgatory: %v", purgErr) 1105 } 1106 1107 if _, found := bq.mu.replicas[repl.GetRangeID()]; found { 1108 // Don't add to purgatory if already in the queue (again). We need to 1109 // uphold the invariant that a replica is never both in the priority 1110 // queue and the purgatory at the same time or bad things will happen. 
1111 // See bq.assertInvariants and: 1112 // https://github.com/cockroachdb/cockroach/issues/36277#issuecomment-482659939 1113 return 1114 } 1115 1116 item := &replicaItem{rangeID: repl.GetRangeID(), replicaID: repl.ReplicaID(), index: -1} 1117 bq.mu.replicas[repl.GetRangeID()] = item 1118 1119 defer func() { 1120 bq.purgatory.Update(int64(len(bq.mu.purgatory))) 1121 }() 1122 1123 // If purgatory already exists, just add to the map and we're done. 1124 if bq.mu.purgatory != nil { 1125 bq.mu.purgatory[repl.GetRangeID()] = purgErr 1126 return 1127 } 1128 1129 // Otherwise, create purgatory and start processing. 1130 bq.mu.purgatory = map[roachpb.RangeID]purgatoryError{ 1131 repl.GetRangeID(): purgErr, 1132 } 1133 1134 workerCtx := bq.AnnotateCtx(context.Background()) 1135 stopper.RunWorker(workerCtx, func(ctx context.Context) { 1136 ticker := time.NewTicker(purgatoryReportInterval) 1137 for { 1138 select { 1139 case <-bq.impl.purgatoryChan(): 1140 func() { 1141 // Acquire from the process semaphore, release when done. 1142 bq.processSem <- struct{}{} 1143 defer func() { <-bq.processSem }() 1144 1145 // Remove all items from purgatory into a copied slice. 1146 bq.mu.Lock() 1147 ranges := make([]*replicaItem, 0, len(bq.mu.purgatory)) 1148 for rangeID := range bq.mu.purgatory { 1149 item := bq.mu.replicas[rangeID] 1150 if item == nil { 1151 log.Fatalf(ctx, "r%d is in purgatory but not in replicas", rangeID) 1152 } 1153 item.setProcessing() 1154 ranges = append(ranges, item) 1155 bq.removeFromPurgatoryLocked(item) 1156 } 1157 bq.mu.Unlock() 1158 1159 for _, item := range ranges { 1160 repl, err := bq.getReplica(item.rangeID) 1161 if err != nil || item.replicaID != repl.ReplicaID() { 1162 continue 1163 } 1164 annotatedCtx := repl.AnnotateCtx(ctx) 1165 if stopper.RunTask( 1166 annotatedCtx, fmt.Sprintf("storage.%s: purgatory processing replica", bq.name), 1167 func(ctx context.Context) { 1168 err := bq.processReplica(ctx, repl) 1169 bq.finishProcessingReplica(ctx, stopper, repl, err) 1170 }) != nil { 1171 return 1172 } 1173 } 1174 }() 1175 1176 // Clean up purgatory, if empty. 1177 bq.mu.Lock() 1178 if len(bq.mu.purgatory) == 0 { 1179 log.Infof(ctx, "purgatory is now empty") 1180 bq.mu.purgatory = nil 1181 bq.mu.Unlock() 1182 return 1183 } 1184 bq.mu.Unlock() 1185 case <-ticker.C: 1186 // Report purgatory status. 1187 bq.mu.Lock() 1188 errMap := map[string]int{} 1189 for _, err := range bq.mu.purgatory { 1190 errMap[err.Error()]++ 1191 } 1192 bq.mu.Unlock() 1193 for errStr, count := range errMap { 1194 log.Errorf(ctx, "%d replicas failing with %q", count, errStr) 1195 } 1196 case <-stopper.ShouldStop(): 1197 return 1198 } 1199 } 1200 }) 1201 } 1202 1203 // pop dequeues the highest priority replica, if any, in the queue. The 1204 // replicaItem corresponding to the returned Replica will be moved to the 1205 // "processing" state and should be cleaned up by calling 1206 // finishProcessingReplica once the Replica has finished processing. 
1207 func (bq *baseQueue) pop() replicaInQueue { 1208 bq.mu.Lock() 1209 for { 1210 if bq.mu.priorityQ.Len() == 0 { 1211 bq.mu.Unlock() 1212 return nil 1213 } 1214 item := heap.Pop(&bq.mu.priorityQ).(*replicaItem) 1215 if item.processing { 1216 log.Fatalf(bq.AnnotateCtx(context.Background()), "%s pulled processing item from heap: %v", bq.name, item) 1217 } 1218 item.setProcessing() 1219 bq.pending.Update(int64(bq.mu.priorityQ.Len())) 1220 bq.mu.Unlock() 1221 1222 repl, _ := bq.getReplica(item.rangeID) 1223 if repl != nil && item.replicaID == repl.ReplicaID() { 1224 return repl 1225 } 1226 // Replica not found or was recreated with a new replica ID, remove from 1227 // set and try again. 1228 bq.mu.Lock() 1229 bq.removeFromReplicaSetLocked(item.rangeID) 1230 } 1231 } 1232 1233 // addLocked adds an element to the priority queue. Caller must hold mutex. 1234 func (bq *baseQueue) addLocked(item *replicaItem) { 1235 heap.Push(&bq.mu.priorityQ, item) 1236 bq.pending.Update(int64(bq.mu.priorityQ.Len())) 1237 bq.mu.replicas[item.rangeID] = item 1238 } 1239 1240 // removeLocked removes an element from purgatory (if it's experienced an 1241 // error) or from the priority queue by index. Caller must hold mutex. 1242 func (bq *baseQueue) removeLocked(item *replicaItem) { 1243 if item.processing { 1244 // The item is processing. We can't interrupt the processing 1245 // or remove it from the replica set yet, but we can make sure 1246 // it doesn't get requeued. 1247 item.requeue = false 1248 } else { 1249 if _, inPurg := bq.mu.purgatory[item.rangeID]; inPurg { 1250 bq.removeFromPurgatoryLocked(item) 1251 } else if item.index >= 0 { 1252 bq.removeFromQueueLocked(item) 1253 } else { 1254 log.Fatalf(bq.AnnotateCtx(context.Background()), 1255 "item for r%d is only in replicas map, but is not processing", 1256 item.rangeID, 1257 ) 1258 } 1259 bq.removeFromReplicaSetLocked(item.rangeID) 1260 } 1261 } 1262 1263 // Caller must hold mutex. 1264 func (bq *baseQueue) removeFromPurgatoryLocked(item *replicaItem) { 1265 delete(bq.mu.purgatory, item.rangeID) 1266 bq.purgatory.Update(int64(len(bq.mu.purgatory))) 1267 } 1268 1269 // Caller must hold mutex. 1270 func (bq *baseQueue) removeFromQueueLocked(item *replicaItem) { 1271 heap.Remove(&bq.mu.priorityQ, item.index) 1272 bq.pending.Update(int64(bq.mu.priorityQ.Len())) 1273 } 1274 1275 // Caller must hold mutex. 1276 func (bq *baseQueue) removeFromReplicaSetLocked(rangeID roachpb.RangeID) { 1277 if _, found := bq.mu.replicas[rangeID]; !found { 1278 log.Fatalf(bq.AnnotateCtx(context.Background()), 1279 "attempted to remove r%d from queue, but it isn't in it", 1280 rangeID, 1281 ) 1282 } 1283 delete(bq.mu.replicas, rangeID) 1284 } 1285 1286 // DrainQueue locks the queue and processes the remaining queued replicas. It 1287 // processes the replicas in the order they're queued in, one at a time. 1288 // Exposed for testing only. 1289 func (bq *baseQueue) DrainQueue(stopper *stop.Stopper) { 1290 // Lock processing while draining. This prevents the main process 1291 // loop from racing with this method and ensures that any replicas 1292 // queued up when this method was called will be processed by the 1293 // time it returns. 1294 defer bq.lockProcessing()() 1295 1296 ctx := bq.AnnotateCtx(context.TODO()) 1297 for repl := bq.pop(); repl != nil; repl = bq.pop() { 1298 annotatedCtx := repl.AnnotateCtx(ctx) 1299 err := bq.processReplica(annotatedCtx, repl) 1300 bq.finishProcessingReplica(annotatedCtx, stopper, repl, err) 1301 } 1302 }
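
// ----------------------------------------------------------------------------
// What follows is a hedged, illustrative sketch and is not part of the
// original file: exampleQueueImpl, its fields, and the choices of interval and
// pacing are hypothetical, shown only to make the queueImpl contract
// documented above concrete. A real implementation would also populate the
// metrics fields of its queueConfig and register itself with the store's
// replicaScanner.

// exampleQueueImpl sketches the four methods required by queueImpl.
type exampleQueueImpl struct {
	// lastProcessed would be maintained by the implementation; it is shown
	// here only so shouldQueue can demonstrate the shouldQueueAgain helper.
	lastProcessed hlc.Timestamp
	// retryChan, when signaled, causes purgatory replicas to be retried.
	retryChan chan time.Time
}

func (q *exampleQueueImpl) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
) (bool, float64) {
	// Queue if the replica has never been processed or if at least an hour
	// has passed since it last was; the priority grows with how overdue the
	// replica is (see shouldQueueAgain above).
	return shouldQueueAgain(now, q.lastProcessed, time.Hour)
}

func (q *exampleQueueImpl) process(
	ctx context.Context, repl *Replica, _ *config.SystemConfig,
) error {
	// Hypothetical work would go here. Returning an error whose cause chain
	// contains a purgatoryError parks the replica in purgatory until
	// retryChan fires; returning nil counts as a success.
	return nil
}

func (q *exampleQueueImpl) timer(time.Duration) time.Duration {
	// Constant pacing: wait 100ms between replicas regardless of how long
	// the previous one took to process.
	return 100 * time.Millisecond
}

func (q *exampleQueueImpl) purgatoryChan() <-chan time.Time {
	return q.retryChan
}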