github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/queue.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"container/heap"
    15  	"context"
    16  	"fmt"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/config"
    21  	"github.com/cockroachdb/cockroach/pkg/gossip"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/settings"
    25  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    26  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    27  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    29  	"github.com/cockroachdb/cockroach/pkg/util/log"
    30  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    31  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    32  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    33  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    34  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    35  	"github.com/cockroachdb/errors"
    36  )
    37  
    38  const (
    39  	// purgatoryReportInterval is the duration between reports on
    40  	// purgatory status.
    41  	purgatoryReportInterval = 10 * time.Minute
    42  	// defaultProcessTimeout is the timeout when processing a replica.
    43  	// The timeout prevents a queue from getting stuck on a replica.
    44  	// For example, a replica whose range cannot reach quorum.
    45  	defaultProcessTimeout = 1 * time.Minute
    46  	// defaultQueueMaxSize is the default max size for a queue.
    47  	defaultQueueMaxSize = 10000
    48  )
    49  
    50  // queueGuaranteedProcessingTimeBudget is the minimum duration that queue
    51  // processing is guaranteed before it may time out. It is an escape hatch to
    52  // raise the timeout for queues.
    53  var queueGuaranteedProcessingTimeBudget = settings.RegisterDurationSetting(
    54  	"kv.queue.process.guaranteed_time_budget",
    55  	"the guaranteed duration before which the processing of a queue may "+
    56  		"time out",
    57  	defaultProcessTimeout,
    58  )
    59  
    60  func init() {
    61  	queueGuaranteedProcessingTimeBudget.SetVisibility(settings.Reserved)
    62  }
    63  
    64  func defaultProcessTimeoutFunc(cs *cluster.Settings, _ replicaInQueue) time.Duration {
    65  	return queueGuaranteedProcessingTimeBudget.Get(&cs.SV)
    66  }
    67  
    68  // Queues that send snapshots while processing should use a timeout that is a
    69  // function of the size of the range and the maximum allowed rate of data
    70  // transfer, subject to the minimum timeout specified in a cluster setting.
    71  //
    72  // The rateSetting parameter controls which rate to use.
    73  func makeQueueSnapshotTimeoutFunc(rateSetting *settings.ByteSizeSetting) queueProcessTimeoutFunc {
    74  	return func(cs *cluster.Settings, r replicaInQueue) time.Duration {
    75  		minimumTimeout := queueGuaranteedProcessingTimeBudget.Get(&cs.SV)
    76  		// NB: In production code this type assertion will always succeed.
    77  		// Some tests set up a fake implementation of replicaInQueue in which
    78  		// case we fall back to the configured minimum timeout.
    79  		repl, ok := r.(interface{ GetMVCCStats() enginepb.MVCCStats })
    80  		if !ok {
    81  			return minimumTimeout
    82  		}
    83  		snapshotRate := rateSetting.Get(&cs.SV)
    84  		stats := repl.GetMVCCStats()
    85  		totalBytes := stats.KeyBytes + stats.ValBytes + stats.IntentBytes + stats.SysBytes
    86  		estimatedDuration := time.Duration(totalBytes/snapshotRate) * time.Second
    87  		timeout := estimatedDuration * permittedSnapshotSlowdown
    88  		if timeout < minimumTimeout {
    89  			timeout = minimumTimeout
    90  		}
    91  		return timeout
    92  	}
    93  }
    94  
    95  // permittedSnapshotSlowdown is the factor applied to the estimated duration of
    96  // a snapshot (given the configured snapshot rate) to compute the snapshot's
    97  // timeout.
    98  const permittedSnapshotSlowdown = 10
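
        // To make the arithmetic in makeQueueSnapshotTimeoutFunc concrete, a
        // hedged example (the numbers are illustrative, not defaults): with
        // 512 MiB of range data and a configured snapshot rate of 32 MiB/s,
        //
        //   totalBytes := int64(512 << 20)  // 512 MiB of MVCC data
        //   rate := int64(32 << 20)         // 32 MiB/s transfer rate
        //   estimated := time.Duration(totalBytes/rate) * time.Second // 16s
        //   timeout := estimated * permittedSnapshotSlowdown          // 160s
        //
        // so the snapshot may run 10x slower than the configured rate before the
        // queue gives up, unless the configured minimum timeout is larger.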
    99  
   100  // a purgatoryError indicates a replica processing failure which allows the
   101  // replica to be placed into purgatory for faster retries when the
   102  // failure condition changes.
   103  type purgatoryError interface {
   104  	error
   105  	purgatoryErrorMarker() // dummy method for unique interface
   106  }
   107  
   108  // processCallback is a hook that is called when a replica finishes processing.
   109  // It is called with the result of the process attempt.
   110  type processCallback func(error)
   111  
   112  // A replicaItem holds a replica and metadata about its queue state and
   113  // processing state.
   114  type replicaItem struct {
   115  	rangeID   roachpb.RangeID
   116  	replicaID roachpb.ReplicaID
   117  	seq       int // enforce FIFO order for equal priorities
   118  
   119  	// fields used when a replicaItem is enqueued in a priority queue.
   120  	priority float64
   121  	index    int // The index of the item in the heap, maintained by the heap.Interface methods
   122  
   123  	// fields used when a replicaItem is processing.
   124  	processing bool
   125  	requeue    bool // enqueue again after processing?
   126  	callbacks  []processCallback
   127  }
   128  
   129  // setProcessing moves the item from an enqueued state to a processing state.
   130  func (i *replicaItem) setProcessing() {
   131  	i.priority = 0
   132  	if i.index >= 0 {
   133  		log.Fatalf(context.Background(),
   134  			"r%d marked as processing but appears in prioQ", i.rangeID,
   135  		)
   136  	}
   137  	i.processing = true
   138  }
   139  
   140  // registerCallback adds a new callback to be executed when the replicaItem
   141  // finishes processing.
   142  func (i *replicaItem) registerCallback(cb processCallback) {
   143  	i.callbacks = append(i.callbacks, cb)
   144  }
   145  
   146  // A priorityQueue implements heap.Interface and holds replicaItems.
   147  type priorityQueue struct {
   148  	seqGen int
   149  	sl     []*replicaItem
   150  }
   151  
   152  func (pq priorityQueue) Len() int { return len(pq.sl) }
   153  
   154  func (pq priorityQueue) Less(i, j int) bool {
   155  	a, b := pq.sl[i], pq.sl[j]
   156  	if a.priority == b.priority {
   157  		// When priorities are equal, we want the lower sequence number to show
   158  		// up first (FIFO).
   159  		return a.seq < b.seq
   160  	}
   161  	// We want Pop to give us the highest, not lowest, priority so we use greater than here.
   162  	return a.priority > b.priority
   163  }
   164  
   165  func (pq priorityQueue) Swap(i, j int) {
   166  	pq.sl[i], pq.sl[j] = pq.sl[j], pq.sl[i]
   167  	pq.sl[i].index, pq.sl[j].index = i, j
   168  }
   169  
   170  func (pq *priorityQueue) Push(x interface{}) {
   171  	n := len(pq.sl)
   172  	item := x.(*replicaItem)
   173  	item.index = n
   174  	pq.seqGen++
   175  	item.seq = pq.seqGen
   176  	pq.sl = append(pq.sl, item)
   177  }
   178  
   179  func (pq *priorityQueue) Pop() interface{} {
   180  	old := pq.sl
   181  	n := len(old)
   182  	item := old[n-1]
   183  	item.index = -1 // for safety
   184  	old[n-1] = nil  // for gc
   185  	pq.sl = old[0 : n-1]
   186  	return item
   187  }
   188  
   189  // update modifies the priority of a replicaItem in the queue.
   190  func (pq *priorityQueue) update(item *replicaItem, priority float64) {
   191  	item.priority = priority
   192  	if len(pq.sl) <= item.index || pq.sl[item.index] != item {
   193  		log.Fatalf(context.Background(), "updating item in heap that's not contained in it: %v", item)
   194  	}
   195  	heap.Fix(pq, item.index)
   196  }
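
        // A minimal usage sketch of priorityQueue via container/heap (variable
        // names here are hypothetical; within this file the heap is driven by
        // baseQueue.addLocked and baseQueue.pop below):
        //
        //   var pq priorityQueue
        //   heap.Push(&pq, &replicaItem{rangeID: 1, priority: 2.0})
        //   heap.Push(&pq, &replicaItem{rangeID: 2, priority: 5.0})
        //   next := heap.Pop(&pq).(*replicaItem) // rangeID 2: highest priority pops first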
   197  
   198  var (
   199  	errQueueDisabled = errors.New("queue disabled")
   200  	errQueueStopped  = errors.New("queue stopped")
   201  )
   202  
   203  func isExpectedQueueError(err error) bool {
   204  	return err == nil || errors.Is(err, errQueueDisabled)
   205  }
   206  
   207  // shouldQueueAgain is a helper function to determine whether the
   208  // replica should be queued according to the current time, the last
   209  // time the replica was processed, and the minimum interval between
   210  // successive processing. Specifying minInterval=0 queues all replicas.
   211  // Returns a bool for whether to queue as well as a priority based
   212  // on how long it's been since last processed.
   213  func shouldQueueAgain(now, last hlc.Timestamp, minInterval time.Duration) (bool, float64) {
   214  	if minInterval == 0 || last == (hlc.Timestamp{}) {
   215  		return true, 0
   216  	}
   217  	if diff := now.GoTime().Sub(last.GoTime()); diff >= minInterval {
   218  		priority := float64(1)
   219  		// If there's a non-zero last processed timestamp, adjust the
   220  		// priority by a multiple of how long it's been since the last
   221  		// time this replica was processed.
   222  		if last != (hlc.Timestamp{}) {
   223  			priority = float64(diff.Nanoseconds()) / float64(minInterval.Nanoseconds())
   224  		}
   225  		return true, priority
   226  	}
   227  	return false, 0
   228  }
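
        // For example (illustrative values): with a minInterval of 10 minutes and
        // a replica last processed 25 minutes ago,
        //
        //   ok, prio := shouldQueueAgain(now, last, 10*time.Minute)
        //   // ok == true, prio == 2.5 (25min / 10min)
        //
        // so the longer a replica has gone unprocessed, the higher its priority.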
   229  
   230  // replicaInQueue is the subset of *Replica required for interacting with queues.
   231  //
   232  // TODO(tbg): this interface is horrible, but this is what we do use at time of
   233  // extraction. Establish a sane interface and use that.
   234  type replicaInQueue interface {
   235  	AnnotateCtx(context.Context) context.Context
   236  	ReplicaID() roachpb.ReplicaID
   237  	StoreID() roachpb.StoreID
   238  	GetRangeID() roachpb.RangeID
   239  	IsInitialized() bool
   240  	IsDestroyed() (DestroyReason, error)
   241  	Desc() *roachpb.RangeDescriptor
   242  	maybeInitializeRaftGroup(context.Context)
   243  	redirectOnOrAcquireLease(context.Context) (kvserverpb.LeaseStatus, *roachpb.Error)
   244  	IsLeaseValid(roachpb.Lease, hlc.Timestamp) bool
   245  	GetLease() (roachpb.Lease, roachpb.Lease)
   246  }
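
        // *Replica is the production implementation of replicaInQueue (getReplica
        // below hands out *Replica). A compile-time assertion could make that
        // explicit; it is not present upstream and is shown here for clarity:
        //
        //   var _ replicaInQueue = (*Replica)(nil)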
   247  
   248  type queueImpl interface {
   249  	// shouldQueue accepts current time, a replica, and the system config
   250  	// and returns whether it should be queued and if so, at what priority.
   251  	// The Replica is guaranteed to be initialized.
   252  	shouldQueue(
   253  		context.Context, hlc.Timestamp, *Replica, *config.SystemConfig,
   254  	) (shouldQueue bool, priority float64)
   255  
   256  	// process accepts a replica, and the system config and executes
   257  	// queue-specific work on it. The Replica is guaranteed to be initialized.
   258  	process(context.Context, *Replica, *config.SystemConfig) error
   259  
   260  	// timer returns a duration to wait between processing the next item
   261  	// from the queue. The duration of the last processing of a replica
   262  	// is supplied as an argument. If no replicas have finished processing
   263  	// yet, this can be 0.
   264  	timer(time.Duration) time.Duration
   265  
   266  	// purgatoryChan returns a channel that is signaled with the current
   267  	// time when it's time to retry replicas which have been relegated to
   268  	// purgatory due to failures. If purgatoryChan returns nil, failing
   269  	// replicas are not sent to purgatory.
   270  	purgatoryChan() <-chan time.Time
   271  }
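
        // A minimal, hypothetical queueImpl sketch (real implementations such as
        // the replicate, GC, and raft snapshot queues live in their own files):
        //
        //   type noopQueue struct{}
        //
        //   func (noopQueue) shouldQueue(
        //   	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
        //   ) (bool, float64) {
        //   	return true, 1.0 // queue everything, at unit priority
        //   }
        //
        //   func (noopQueue) process(ctx context.Context, repl *Replica, _ *config.SystemConfig) error {
        //   	return nil // nothing to do
        //   }
        //
        //   func (noopQueue) timer(time.Duration) time.Duration { return 100 * time.Millisecond }
        //
        //   func (noopQueue) purgatoryChan() <-chan time.Time   { return nil } // opt out of purgatory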
   272  
   273  // queueProcessTimeoutFunc controls the timeout for queue processing for a
   274  // replicaInQueue.
   275  type queueProcessTimeoutFunc func(*cluster.Settings, replicaInQueue) time.Duration
   276  
   277  type queueConfig struct {
   278  	// maxSize is the maximum number of replicas to queue.
   279  	maxSize int
   280  	// maxConcurrency is the maximum number of replicas that can be processed
   281  	// concurrently. If not set, defaults to 1.
   282  	maxConcurrency       int
   283  	addOrMaybeAddSemSize int // bounds concurrent AddAsync/MaybeAddAsync calls; see Async
   284  	// needsLease controls whether this queue requires the range lease to operate
   285  	// on a replica. If so, one will be acquired if necessary. Many queues set
   286  	// needsLease not because they literally need a lease, but because they work
   287  	// on a range level and use it to ensure that only one node in the cluster
   288  	// processes that range.
   289  	needsLease bool
   290  	// needsRaftInitialized controls whether the Raft group will be initialized
   291  	// (if not already initialized) when deciding whether to process this
   292  	// replica.
   293  	needsRaftInitialized bool
   294  	// needsSystemConfig controls whether this queue requires a valid copy of the
   295  	// system config to operate on a replica. Not all queues require it, and it's
   296  	// unsafe for certain queues to wait on it. For example, a raft snapshot may
   297  	// be needed in order to make it possible for the system config to become
   298  	// available (as observed in #16268), so the raft snapshot queue can't
   299  	// require the system config to already be available.
   300  	needsSystemConfig bool
   301  	// acceptsUnsplitRanges controls whether this queue can process ranges that
   302  	// need to be split due to zone config settings. Ranges are checked before
   303  	// calling queueImpl.shouldQueue and queueImpl.process.
   304  	// This is to avoid giving the queue a replica that spans multiple config
   305  	// zones (which might make the action of the queue ambiguous - e.g. we don't
   306  	// want to try to replicate a range until we know which zone it is in and
   307  	// therefore how many replicas are required).
   308  	acceptsUnsplitRanges bool
   309  	// processDestroyedReplicas controls whether or not we want to process replicas
   310  	// that have been destroyed but not GCed.
   311  	processDestroyedReplicas bool
   312  	// processTimeoutFunc returns the timeout for processing a replica.
   313  	processTimeoutFunc queueProcessTimeoutFunc
   314  	// successes is a counter of replicas processed successfully.
   315  	successes *metric.Counter
   316  	// failures is a counter of replicas which failed processing.
   317  	failures *metric.Counter
   318  	// pending is a gauge measuring current replica count pending.
   319  	pending *metric.Gauge
   320  	// processingNanos is a counter measuring total nanoseconds spent processing replicas.
   321  	processingNanos *metric.Counter
   322  	// purgatory is a gauge measuring current replica count in purgatory.
   323  	purgatory *metric.Gauge
   324  }
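
        // A hypothetical queueConfig sketch (metric names are invented; the metric
        // fields must be non-nil in real use, since the base queue updates them
        // unconditionally):
        //
        //   cfg := queueConfig{
        //   	maxSize:              defaultQueueMaxSize,
        //   	needsLease:           true,
        //   	needsSystemConfig:    true,
        //   	acceptsUnsplitRanges: false,
        //   	processTimeoutFunc:   defaultProcessTimeoutFunc,
        //   	successes:            metric.NewCounter(metric.Metadata{Name: "queue.noop.process.success"}),
        //   	failures:             metric.NewCounter(metric.Metadata{Name: "queue.noop.process.failure"}),
        //   	pending:              metric.NewGauge(metric.Metadata{Name: "queue.noop.pending"}),
        //   	processingNanos:      metric.NewCounter(metric.Metadata{Name: "queue.noop.processingnanos"}),
        //   }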
   325  
   326  // baseQueue is the base implementation of the replicaQueue interface. Queue
   327  // implementations should embed a baseQueue and implement queueImpl.
   328  //
   329  // A queue contains replicas in one of three stages: queued, processing, and
   330  // purgatory. A "queued" replica is waiting for processing with some priority
   331  // that was selected when it was added. A "processing" replica is actively being
   332  // worked on by the queue, which delegates to the queueImpl's `process` method.
   333  // Replicas are selected from the queue for processing purely in priority order.
   334  // A "purgatory" replica has been marked by the queue implementation as
   335  // temporarily uninteresting and it will not be processed again until some
   336  // queue-specific event occurs. Not every queue has a purgatory.
   337  //
   338  // Generally, replicas are added to a queue by a replicaScanner, which is a
   339  // Store-level object. The scanner is configured with a set of queues (which in
   340  // practice is all of the queues) and will repeatedly iterate through every
   341  // replica on the store at a measured pace, handing each replica to every
   342  // queueImpl's `shouldQueue` method. This method is implemented differently by
   343  // each queue and decides whether the replica is currently interesting. If so,
   344  // it also selects a priority. Note that queues have a bounded size controlled
   345  // by the `maxSize` config option, which means the ones with lowest priority may
   346  // be dropped if processing cannot keep up and the queue fills.
   347  //
   348  // Replicas are added asynchronously through `MaybeAddAsync` or `AddAsync`.
   349  // MaybeAddAsync checks the various requirements selected by the queue config
   350  // (needsSystemConfig, needsLease, acceptsUnsplitRanges) as well as the
   351  // queueImpl's `shouldQueue`. AddAsync does not check any of this and accepts a
   352  // priority directly instead of getting it from `shouldQueue`. These methods run
   353  // with a shared maximum concurrency of `addOrMaybeAddSemSize`. If the maximum
   354  // concurrency is reached, MaybeAddAsync will silently drop the replica but
   355  // AddAsync will block.
   356  //
   357  // Synchronous replica addition is intentionally not part of the public
   358  // interface. Many queue impl's "processing" work functions acquire various
   359  // locks on Replica, so it would be too easy for a callsite of such a method to
   360  // deadlock. See #36413 for context. Additionally, the queues themselves process
   361  // asynchronously and the bounded size means what you add isn't guaranteed to be
   362  // processed, so the exclusive-async contract just forces callers to realize
   363  // this early.
   364  //
   365  // Processing is rate limited by the queueImpl's `timer` which receives the
   366  // amount of time it took to process the previous replica and returns the
   367  // amount of time to wait before processing the next one. A bounded amount of
   368  // processing concurrency is allowed, which is controlled by the
   369  // `maxConcurrency` option in the queue's configuration. If a replica is added
   370  // while being processed, it's requeued after the processing finishes.
   371  //
   372  // Note that all sorts of things can change between when a replica is enqueued
   373  // and when it is processed, so the queue makes sure to grab the latest one
   374  // right before processing by looking up the current replica with the same
   375  // RangeID. This replica could be gone or, in extreme cases, could have been
   376  // removed and re-added and now has a new ReplicaID. Implementors need to be
   377  // resilient to this.
   378  //
   379  // A queueImpl can opt into a purgatory by returning a non-nil channel from the
   380  // `purgatoryChan` method. A replica is put into purgatory when the `process`
   381  // method returns an error with a `purgatoryError` as an entry somewhere in the
   382  // `Cause` chain. A replica in purgatory is not processed again until the
   383  // channel is signaled, at which point every replica in purgatory is immediately
   384  // processed. This catchup is run without the `timer` rate limiting but shares
   385  // the same `maxConcurrency` semaphore as regular processing. Note that if a
   386  // purgatory replica is pushed out of a full queue, it's also removed from
   387  // purgatory. Replicas in purgatory count against the max queue size.
   388  //
   389  // After construction a queue needs to be `Start`ed, which spawns a goroutine to
   390  // continually pop the "queued" replica with the highest priority and process
   391  // it. In practice, this is done by the same replicaScanner that adds replicas.
   392  type baseQueue struct {
   393  	log.AmbientContext
   394  
   395  	name       string
   396  	getReplica func(roachpb.RangeID) (replicaInQueue, error)
   397  	// The constructor of the queueImpl structure MUST return a pointer.
   398  	// This is because assigning queueImpl to a function-local, then
   399  	// passing a pointer to it to `newBaseQueue`, and then returning it
   400  	// from the constructor function will return a queueImpl containing
   401  	// a pointer to a structure which is a copy of the one within which
   402  	// it is contained. DANGER.
   403  	impl   queueImpl
   404  	store  *Store
   405  	gossip *gossip.Gossip
   406  	queueConfig
   407  	incoming         chan struct{} // Channel signaled when a new replica is added to the queue.
   408  	processSem       chan struct{}      // bounds processing concurrency to maxConcurrency
   409  	addOrMaybeAddSem *quotapool.IntPool // for {Maybe,}AddAsync
   410  	addLogN          log.EveryN         // avoid log spam when addOrMaybeAddSem is maxed out
   411  	processDur       int64              // accessed atomically
   412  	mu               struct {
   413  		syncutil.Mutex                                    // Protects all variables in the mu struct
   414  		replicas       map[roachpb.RangeID]*replicaItem   // Map from RangeID to replicaItem
   415  		priorityQ      priorityQueue                      // The priority queue
   416  		purgatory      map[roachpb.RangeID]purgatoryError // Map of replicas to processing errors
   417  		stopped        bool
   418  		// Some tests in this package disable queues.
   419  		disabled bool
   420  	}
   421  }
   422  
   423  // newBaseQueue returns a new instance of baseQueue with the specified
   424  // shouldQueue function to determine which replicas to queue and maxSize to
   425  // limit the growth of the queue. Note that maxSize doesn't prevent new
   426  // replicas from being added; it just limits the total size. Higher priority
   427  // replicas can still be added; their addition simply removes the lowest
   428  // priority replica.
   429  func newBaseQueue(
   430  	name string, impl queueImpl, store *Store, gossip *gossip.Gossip, cfg queueConfig,
   431  ) *baseQueue {
   432  	// Use the default process timeout if none specified.
   433  	if cfg.processTimeoutFunc == nil {
   434  		cfg.processTimeoutFunc = defaultProcessTimeoutFunc
   435  	}
   436  	if cfg.maxConcurrency == 0 {
   437  		cfg.maxConcurrency = 1
   438  	}
   439  	// NB: addOrMaybeAddSemSize coupled with tight scanner intervals in tests
   440  	// unfortunately bog down the race build if they are increased too much.
   441  	if cfg.addOrMaybeAddSemSize == 0 {
   442  		cfg.addOrMaybeAddSemSize = 20
   443  	}
   444  
   445  	ambient := store.cfg.AmbientCtx
   446  	ambient.AddLogTag(name, nil)
   447  
   448  	if !cfg.acceptsUnsplitRanges && !cfg.needsSystemConfig {
   449  		log.Fatalf(ambient.AnnotateCtx(context.Background()),
   450  			"misconfigured queue: acceptsUnsplitRanges=false requires needsSystemConfig=true; got %+v", cfg)
   451  	}
   452  
   453  	bq := baseQueue{
   454  		AmbientContext:   ambient,
   455  		name:             name,
   456  		impl:             impl,
   457  		store:            store,
   458  		gossip:           gossip,
   459  		queueConfig:      cfg,
   460  		incoming:         make(chan struct{}, 1),
   461  		processSem:       make(chan struct{}, cfg.maxConcurrency),
   462  		addOrMaybeAddSem: quotapool.NewIntPool("queue-add", uint64(cfg.addOrMaybeAddSemSize)),
   463  		addLogN:          log.Every(5 * time.Second),
   464  		getReplica: func(id roachpb.RangeID) (replicaInQueue, error) {
   465  			repl, err := store.GetReplica(id)
   466  			if repl == nil || err != nil {
   467  				// Don't return (*Replica)(nil) as replicaInQueue or NPEs will
   468  				// ensue.
   469  				return nil, err
   470  			}
   471  			return repl, err
   472  		},
   473  	}
   474  	bq.mu.replicas = map[roachpb.RangeID]*replicaItem{}
   475  
   476  	return &bq
   477  }
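
        // Hypothetical wiring (identifiers illustrative): construct the queue with
        // a pointer-typed impl (see the DANGER note on the impl field), start it,
        // and hand it replicas:
        //
        //   q := newBaseQueue("noop", &noopQueue{}, store, g, cfg)
        //   q.Start(stopper)
        //   q.AddAsync(ctx, repl, 1.0 /* prio */)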
   478  
   479  // Name returns the name of the queue.
   480  func (bq *baseQueue) Name() string {
   481  	return bq.name
   482  }
   483  
   484  // NeedsLease returns whether the queue requires a replica to be leaseholder.
   485  func (bq *baseQueue) NeedsLease() bool {
   486  	return bq.needsLease
   487  }
   488  
   489  // Length returns the current size of the queue.
   490  func (bq *baseQueue) Length() int {
   491  	bq.mu.Lock()
   492  	defer bq.mu.Unlock()
   493  	return bq.mu.priorityQ.Len()
   494  }
   495  
   496  // PurgatoryLength returns the current size of purgatory.
   497  func (bq *baseQueue) PurgatoryLength() int {
   498  	// Lock processing while measuring the purgatory length. This ensures that
   499  	// no purgatory replicas are concurrently being processed; while a replica
   500  	// is processing it is removed from bq.mu.purgatory, though it may be re-added.
   501  	defer bq.lockProcessing()()
   502  
   503  	bq.mu.Lock()
   504  	defer bq.mu.Unlock()
   505  	return len(bq.mu.purgatory)
   506  }
   507  
   508  // SetDisabled turns queue processing off or on as directed.
   509  func (bq *baseQueue) SetDisabled(disabled bool) {
   510  	bq.mu.Lock()
   511  	bq.mu.disabled = disabled
   512  	bq.mu.Unlock()
   513  }
   514  
   515  // lockProcessing locks all processing in the baseQueue. It returns
   516  // a function to unlock processing.
   517  func (bq *baseQueue) lockProcessing() func() {
   518  	semCount := cap(bq.processSem)
   519  
   520  	// Drain process semaphore.
   521  	for i := 0; i < semCount; i++ {
   522  		bq.processSem <- struct{}{}
   523  	}
   524  
   525  	return func() {
   526  		// Populate process semaphore.
   527  		for i := 0; i < semCount; i++ {
   528  			<-bq.processSem
   529  		}
   530  	}
   531  }
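
        // Callers typically invoke the returned closure in a deferred call, so
        // that processing stays locked for the duration of the function:
        //
        //   defer bq.lockProcessing()()
        //
        // as done by PurgatoryLength and DrainQueue.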
   532  
   533  // Start launches a goroutine to process entries in the queue. The
   534  // provided stopper is used to finish processing.
   535  func (bq *baseQueue) Start(stopper *stop.Stopper) {
   536  	bq.processLoop(stopper)
   537  }
   538  
   539  type baseQueueHelper struct {
   540  	bq *baseQueue
   541  }
   542  
   543  func (h baseQueueHelper) MaybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) {
   544  	h.bq.maybeAdd(ctx, repl, now)
   545  }
   546  
   547  func (h baseQueueHelper) Add(ctx context.Context, repl replicaInQueue, prio float64) {
   548  	_, err := h.bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), prio)
   549  	if err != nil && log.V(1) {
   550  		log.Infof(ctx, "during Add: %s", err)
   551  	}
   552  }
   553  
   554  type queueHelper interface {
   555  	MaybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp)
   556  	Add(ctx context.Context, repl replicaInQueue, prio float64)
   557  }
   558  
   559  // Async is a more performant substitute for calling AddAsync or MaybeAddAsync
   560  // when many operations are going to be carried out. It invokes the given helper
   561  // function in a goroutine if semaphore capacity is available. If the semaphore
   562  // is not available, the 'wait' parameter decides whether to wait or to return
   563  // as a noop. Note that if the system is quiescing, fn may never be called,
   564  // regardless of the value of 'wait'.
   565  //
   566  // The caller is responsible for ensuring that opName does not contain PII.
   567  // (Best is to pass a constant string.)
   568  func (bq *baseQueue) Async(
   569  	ctx context.Context, opName string, wait bool, fn func(ctx context.Context, h queueHelper),
   570  ) {
   571  	if log.V(3) {
   572  		log.InfofDepth(ctx, 2, "%s", log.Safe(opName))
   573  	}
   574  	opName += " (" + bq.name + ")"
   575  	if err := bq.store.stopper.RunLimitedAsyncTask(context.Background(), opName, bq.addOrMaybeAddSem, wait,
   576  		func(ctx context.Context) {
   577  			fn(ctx, baseQueueHelper{bq})
   578  		}); err != nil && bq.addLogN.ShouldLog() {
   579  		log.Infof(ctx, "rate limited in %s: %s", log.Safe(opName), err)
   580  	}
   581  }
   582  
   583  // MaybeAddAsync offers the replica to the queue. The queue will only process a
   584  // certain number of these operations concurrently, and will drop (i.e. treat as
   585  // a noop) any additional calls.
   586  func (bq *baseQueue) MaybeAddAsync(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) {
   587  	bq.Async(ctx, "MaybeAdd", false /* wait */, func(ctx context.Context, h queueHelper) {
   588  		h.MaybeAdd(ctx, repl, now)
   589  	})
   590  }
   591  
   592  // AddAsync adds the replica to the queue. Unlike MaybeAddAsync, it will wait
   593  // for other operations to finish instead of turning into a noop (because
   594  // unlike MaybeAdd, Add is not subject to being called opportunistically).
   595  func (bq *baseQueue) AddAsync(ctx context.Context, repl replicaInQueue, prio float64) {
   596  	bq.Async(ctx, "Add", false /* wait */, func(ctx context.Context, h queueHelper) {
   597  		h.Add(ctx, repl, prio)
   598  	})
   599  }
   600  
   601  func (bq *baseQueue) maybeAdd(ctx context.Context, repl replicaInQueue, now hlc.Timestamp) {
   602  	ctx = repl.AnnotateCtx(ctx)
   603  	// Load the system config if it's needed.
   604  	var cfg *config.SystemConfig
   605  	if bq.needsSystemConfig {
   606  		cfg = bq.gossip.GetSystemConfig()
   607  		if cfg == nil {
   608  			if log.V(1) {
   609  				log.Infof(ctx, "no system config available. skipping")
   610  			}
   611  			return
   612  		}
   613  	}
   614  
   615  	bq.mu.Lock()
   616  	stopped := bq.mu.stopped || bq.mu.disabled
   617  	bq.mu.Unlock()
   618  
   619  	if stopped {
   620  		return
   621  	}
   622  
   623  	if !repl.IsInitialized() {
   624  		return
   625  	}
   626  
   627  	if bq.needsRaftInitialized {
   628  		repl.maybeInitializeRaftGroup(ctx)
   629  	}
   630  
   631  	if cfg != nil && bq.requiresSplit(cfg, repl) {
   632  		// Range needs to be split due to zone configs, but queue does
   633  		// not accept unsplit ranges.
   634  		if log.V(1) {
   635  			log.Infof(ctx, "split needed; not adding")
   636  		}
   637  		return
   638  	}
   639  
   640  	if bq.needsLease {
   641  		// Check to see if either we own the lease or do not know who the lease
   642  		// holder is.
   643  		if lease, _ := repl.GetLease(); repl.IsLeaseValid(lease, now) &&
   644  			!lease.OwnedBy(repl.StoreID()) {
   645  			if log.V(1) {
   646  				log.Infof(ctx, "needs lease; not adding: %+v", lease)
   647  			}
   648  			return
   649  		}
   650  	}
   651  	// NB: in production code, this type assertion is always true. In tests,
   652  	// it may not be and shouldQueue will be passed a nil realRepl. These tests
   653  	// know what they're getting into so that's fine.
   654  	realRepl, _ := repl.(*Replica)
   655  	should, priority := bq.impl.shouldQueue(ctx, now, realRepl, cfg)
   656  	if !should {
   657  		return
   658  	}
   659  	if _, err := bq.addInternal(ctx, repl.Desc(), repl.ReplicaID(), priority); !isExpectedQueueError(err) {
   660  		log.Errorf(ctx, "unable to add: %+v", err)
   661  	}
   662  }
   663  
   664  func (bq *baseQueue) requiresSplit(cfg *config.SystemConfig, repl replicaInQueue) bool {
   665  	if bq.acceptsUnsplitRanges {
   666  		return false
   667  	}
   668  	desc := repl.Desc()
   669  	return cfg.NeedsSplit(desc.StartKey, desc.EndKey)
   670  }
   671  
   672  // addInternal adds the replica to the queue with the specified priority. If
   673  // the replica is already queued at a lower priority, the existing priority is
   674  // updated. The queue lock is acquired internally; callers must not hold it.
   675  func (bq *baseQueue) addInternal(
   676  	ctx context.Context, desc *roachpb.RangeDescriptor, replicaID roachpb.ReplicaID, priority float64,
   677  ) (bool, error) {
   678  	// NB: this is intentionally outside of bq.mu to avoid having to consider
   679  	// lock ordering constraints.
   680  	if !desc.IsInitialized() {
   681  		// We checked this above in MaybeAdd(), but we need to check it
   682  		// again for Add().
   683  		return false, errors.New("replica not initialized")
   684  	}
   685  
   686  	bq.mu.Lock()
   687  	defer bq.mu.Unlock()
   688  
   689  	if bq.mu.stopped {
   690  		return false, errQueueStopped
   691  	}
   692  
   693  	if bq.mu.disabled {
   694  		if log.V(3) {
   695  			log.Infof(ctx, "queue disabled")
   696  		}
   697  		return false, errQueueDisabled
   698  	}
   699  
   700  	// If the replica is currently in purgatory, don't re-add it.
   701  	if _, ok := bq.mu.purgatory[desc.RangeID]; ok {
   702  		return false, nil
   703  	}
   704  
   705  	item, ok := bq.mu.replicas[desc.RangeID]
   706  	if ok {
   707  		// Replica is already processing. Mark to be requeued.
   708  		if item.processing {
   709  			wasRequeued := item.requeue
   710  			item.requeue = true
   711  			return !wasRequeued, nil
   712  		}
   713  
   714  		// Replica has already been added but at a lower priority; update priority.
   715  		// Don't lower it since the previous queuer may have known more than this
   716  		// one does.
   717  		if priority > item.priority {
   718  			if log.V(1) {
   719  				log.Infof(ctx, "updating priority: %0.3f -> %0.3f", item.priority, priority)
   720  			}
   721  			bq.mu.priorityQ.update(item, priority)
   722  		}
   723  		return false, nil
   724  	}
   725  
   726  	if log.V(3) {
   727  		log.Infof(ctx, "adding: priority=%0.3f", priority)
   728  	}
   729  	item = &replicaItem{rangeID: desc.RangeID, replicaID: replicaID, priority: priority}
   730  	bq.addLocked(item)
   731  
   732  	// If adding this replica has pushed the queue past its maximum size,
   733  	// remove the last heap element (a leaf, so low but not necessarily the lowest priority).
   734  	if pqLen := bq.mu.priorityQ.Len(); pqLen > bq.maxSize {
   735  		bq.removeLocked(bq.mu.priorityQ.sl[pqLen-1])
   736  	}
   737  	// Signal the processLoop that a replica has been added.
   738  	select {
   739  	case bq.incoming <- struct{}{}:
   740  	default:
   741  		// No need to signal again.
   742  	}
   743  	return true, nil
   744  }
   745  
   746  // MaybeAddCallback adds a callback to be called when the specified range
   747  // finishes processing if the range is in the queue. If the range is in
   748  // purgatory, the callback is called immediately with the purgatory error. If
   749  // the range is not in the queue (either waiting or processing), the method
   750  // returns false.
   751  //
   752  // NB: If the replica this attaches to is dropped from an overfull queue, this
   753  // callback is never called. This is surprising, but the single caller of this
   754  // is okay with these semantics. Adding new uses is discouraged without cleaning
   755  // up the contract of this method, but this code doesn't lend itself readily to
   756  // upholding invariants so there may need to be some cleanup first.
   757  func (bq *baseQueue) MaybeAddCallback(rangeID roachpb.RangeID, cb processCallback) bool {
   758  	bq.mu.Lock()
   759  	defer bq.mu.Unlock()
   760  
   761  	if purgatoryErr, ok := bq.mu.purgatory[rangeID]; ok {
   762  		cb(purgatoryErr)
   763  		return true
   764  	}
   765  	if item, ok := bq.mu.replicas[rangeID]; ok {
   766  		item.registerCallback(cb)
   767  		return true
   768  	}
   769  	return false
   770  }
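
        // A hypothetical caller that waits for the outcome of a queued range:
        //
        //   done := make(chan error, 1)
        //   if bq.MaybeAddCallback(rangeID, func(err error) { done <- err }) {
        //   	err := <-done // result of the process attempt (or the purgatory error)
        //   	_ = err
        //   }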
   771  
   772  // MaybeRemove removes the specified replica from the queue if enqueued.
   773  func (bq *baseQueue) MaybeRemove(rangeID roachpb.RangeID) {
   774  	bq.mu.Lock()
   775  	defer bq.mu.Unlock()
   776  
   777  	if bq.mu.stopped {
   778  		return
   779  	}
   780  
   781  	if item, ok := bq.mu.replicas[rangeID]; ok {
   782  		ctx := bq.AnnotateCtx(context.TODO())
   783  		if log.V(3) {
   784  			log.Infof(ctx, "%s: removing", item.rangeID)
   785  		}
   786  		bq.removeLocked(item)
   787  	}
   788  }
   789  
   790  // processLoop processes the entries in the queue until the provided
   791  // stopper signals exit.
   792  func (bq *baseQueue) processLoop(stopper *stop.Stopper) {
   793  	ctx := bq.AnnotateCtx(context.Background())
   794  	stopper.RunWorker(ctx, func(ctx context.Context) {
   795  		defer func() {
   796  			bq.mu.Lock()
   797  			bq.mu.stopped = true
   798  			bq.mu.Unlock()
   799  		}()
   800  
   801  		// nextTime is initially nil; we don't start any timers until the queue
   802  		// becomes non-empty.
   803  		var nextTime <-chan time.Time
   804  
   805  		immediately := make(chan time.Time)
   806  		close(immediately)
   807  
   808  		for {
   809  			select {
   810  			// Exit on stopper.
   811  			case <-stopper.ShouldStop():
   812  				return
   813  
   814  			// Incoming signal sets the next time to process if there were previously
   815  			// no replicas in the queue.
   816  			case <-bq.incoming:
   817  				if nextTime == nil {
   818  					// When a replica is added, wake up immediately. This is mainly
   819  					// to facilitate testing without unnecessary sleeps.
   820  					nextTime = immediately
   821  
   822  					// In case we're in a test, still block on the impl.
   823  					bq.impl.timer(0)
   824  				}
   825  			// Process replicas as the timer expires.
   826  			case <-nextTime:
   827  				// Acquire from the process semaphore.
   828  				bq.processSem <- struct{}{}
   829  
   830  				repl := bq.pop()
   831  				if repl != nil {
   832  					annotatedCtx := repl.AnnotateCtx(ctx)
   833  					if stopper.RunAsyncTask(
   834  						annotatedCtx, fmt.Sprintf("storage.%s: processing replica", bq.name),
   835  						func(ctx context.Context) {
   836  							// Release semaphore when finished processing.
   837  							defer func() { <-bq.processSem }()
   838  
   839  							start := timeutil.Now()
   840  							err := bq.processReplica(ctx, repl)
   841  
   842  							duration := timeutil.Since(start)
   843  							bq.recordProcessDuration(ctx, duration)
   844  
   845  							bq.finishProcessingReplica(ctx, stopper, repl, err)
   846  						}) != nil {
   847  						// Release semaphore on task failure.
   848  						<-bq.processSem
   849  						return
   850  					}
   851  				} else {
   852  					// Release semaphore if no replicas were available.
   853  					<-bq.processSem
   854  				}
   855  
   856  				if bq.Length() == 0 {
   857  					nextTime = nil
   858  				} else {
   859  					// lastDur will be 0 after the first processing attempt.
   860  					lastDur := bq.lastProcessDuration()
   861  					switch t := bq.impl.timer(lastDur); t {
   862  					case 0:
   863  						nextTime = immediately
   864  					default:
   865  						nextTime = time.After(t)
   866  					}
   867  				}
   868  			}
   869  		}
   870  	})
   871  }
   872  
   873  // lastProcessDuration returns the duration of the last processing attempt.
   874  func (bq *baseQueue) lastProcessDuration() time.Duration {
   875  	return time.Duration(atomic.LoadInt64(&bq.processDur))
   876  }
   877  
   878  // recordProcessDuration records the duration of a processing run.
   879  func (bq *baseQueue) recordProcessDuration(ctx context.Context, dur time.Duration) {
   880  	if log.V(2) {
   881  		log.Infof(ctx, "done %s", dur)
   882  	}
   883  	bq.processingNanos.Inc(dur.Nanoseconds())
   884  	atomic.StoreInt64(&bq.processDur, int64(dur))
   885  }
   886  
   887  // processReplica processes a single replica. This should not be
   888  // called externally to the queue. bq.mu.Lock must not be held
   889  // while calling this method.
   890  //
   891  // ctx should already be annotated by repl.AnnotateCtx().
   892  func (bq *baseQueue) processReplica(ctx context.Context, repl replicaInQueue) error {
   893  	// Load the system config if it's needed.
   894  	var cfg *config.SystemConfig
   895  	if bq.needsSystemConfig {
   896  		cfg = bq.gossip.GetSystemConfig()
   897  		if cfg == nil {
   898  			log.VEventf(ctx, 1, "no system config available. skipping")
   899  			return nil
   900  		}
   901  	}
   902  
   903  	if cfg != nil && bq.requiresSplit(cfg, repl) {
   904  		// Range needs to be split due to zone configs, but queue does
   905  		// not accept unsplit ranges.
   906  		log.VEventf(ctx, 3, "split needed; skipping")
   907  		return nil
   908  	}
   909  
   910  	ctx, span := bq.AnnotateCtxWithSpan(ctx, bq.name)
   911  	defer span.Finish()
   912  	return contextutil.RunWithTimeout(ctx, fmt.Sprintf("%s queue process replica %d", bq.name, repl.GetRangeID()),
   913  		bq.processTimeoutFunc(bq.store.ClusterSettings(), repl), func(ctx context.Context) error {
   914  			log.VEventf(ctx, 1, "processing replica")
   915  
   916  			if !repl.IsInitialized() {
   917  				// We checked this when adding the replica, but we need to check it again
   918  				// in case this is a different replica with the same range ID (see #14193).
   919  				// This is possible in the case where the replica was enqueued while not
   920  				// having a replica ID, perhaps due to a pre-emptive snapshot, and has
   921  				// since been removed and re-added at a different replica ID.
   922  				return errors.New("cannot process uninitialized replica")
   923  			}
   924  
   925  			if reason, err := repl.IsDestroyed(); err != nil {
   926  				if !bq.queueConfig.processDestroyedReplicas || reason == destroyReasonRemoved {
   927  					log.VEventf(ctx, 3, "replica destroyed (%s); skipping", err)
   928  					return nil
   929  				}
   930  			}
   931  
   932  			// If the queue requires a replica to have the range lease in
   933  			// order to be processed, check whether this replica has range lease
   934  			// and renew or acquire if necessary.
   935  			if bq.needsLease {
   936  				if _, pErr := repl.redirectOnOrAcquireLease(ctx); pErr != nil {
   937  					switch v := pErr.GetDetail().(type) {
   938  					case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
   939  						log.VEventf(ctx, 3, "%s; skipping", v)
   940  						return nil
   941  					default:
   942  						log.VErrEventf(ctx, 2, "could not obtain lease: %s", pErr)
   943  						return errors.Wrapf(pErr.GoError(), "%s: could not obtain lease", repl)
   944  					}
   945  				}
   946  			}
   947  
   948  			log.VEventf(ctx, 3, "processing...")
   949  			// NB: in production code, this type assertion is always true. In tests,
   950  			// it may not be and shouldQueue will be passed a nil realRepl. These tests
   951  			// know what they're getting into so that's fine.
   952  			realRepl, _ := repl.(*Replica)
   953  			if err := bq.impl.process(ctx, realRepl, cfg); err != nil {
   954  				return err
   955  			}
   956  			log.VEventf(ctx, 3, "processing... done")
   957  			bq.successes.Inc(1)
   958  			return nil
   959  		})
   960  }
   961  
   962  type benignError struct {
   963  	cause error
   964  }
   965  
   966  func (be *benignError) Error() string { return be.cause.Error() }
   967  func (be *benignError) Cause() error  { return be.cause }
   968  
   969  func isBenign(err error) bool {
   970  	return errors.HasType(err, (*benignError)(nil))
   971  }
   972  
   973  func isPurgatoryError(err error) (purgatoryError, bool) {
   974  	var purgErr purgatoryError
   975  	return purgErr, errors.As(err, &purgErr)
   976  }
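
        // A hypothetical error type opting into purgatory semantics; returning it
        // (or an error wrapping it) from queueImpl.process sends the replica to
        // purgatory:
        //
        //   type waitingForSplitError struct{ cause error }
        //
        //   func (e waitingForSplitError) Error() string       { return e.cause.Error() }
        //   func (waitingForSplitError) purgatoryErrorMarker() {}
        //
        //   var _ purgatoryError = waitingForSplitError{}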
   977  
   978  // assertInvariants codifies the guarantees upheld by the data structures in the
   979  // base queue. In summary, a replica is one of:
   980  // - "queued" and in mu.replicas and mu.priorityQ
   981  // - "processing" and only in mu.replicas
   982  // - "purgatory" and in mu.replicas and mu.purgatory
   983  //
   984  // Note that in particular, nothing is ever in both mu.priorityQ and
   985  // mu.purgatory.
   986  func (bq *baseQueue) assertInvariants() {
   987  	bq.mu.Lock()
   988  	defer bq.mu.Unlock()
   989  
   990  	ctx := bq.AnnotateCtx(context.Background())
   991  	for _, item := range bq.mu.priorityQ.sl {
   992  		if item.processing {
   993  			log.Fatalf(ctx, "processing item found in prioQ: %v", item)
   994  		}
   995  		if _, inReplicas := bq.mu.replicas[item.rangeID]; !inReplicas {
   996  			log.Fatalf(ctx, "item found in prioQ but not in mu.replicas: %v", item)
   997  		}
   998  		if _, inPurg := bq.mu.purgatory[item.rangeID]; inPurg {
   999  			log.Fatalf(ctx, "item found in prioQ and purgatory: %v", item)
  1000  		}
  1001  	}
  1002  	for rangeID := range bq.mu.purgatory {
  1003  		item, inReplicas := bq.mu.replicas[rangeID]
  1004  		if !inReplicas {
  1005  			log.Fatalf(ctx, "item found in purg but not in mu.replicas: %v", item)
  1006  		}
  1007  		if item.processing {
  1008  			log.Fatalf(ctx, "processing item found in purgatory: %v", item)
  1009  		}
  1010  		// NB: we already checked above that item not in prioQ.
  1011  	}
  1012  
  1013  	// At this point we know that purgatory and prioQ are disjoint, and we
  1014  	// also know that neither tracks any processing replicas. Let's check
  1015  	// that there aren't any non-processing replicas *only* in bq.mu.replicas.
  1016  	var nNotProcessing int
  1017  	for _, item := range bq.mu.replicas {
  1018  		if !item.processing {
  1019  			nNotProcessing++
  1020  		}
  1021  	}
  1022  	if nNotProcessing != len(bq.mu.purgatory)+len(bq.mu.priorityQ.sl) {
  1023  		log.Fatalf(ctx, "have %d non-processing replicas in mu.replicas, "+
  1024  			"but %d in purgatory and %d in prioQ; the latter two should add up "+
  1025  			"to the former", nNotProcessing, len(bq.mu.purgatory), len(bq.mu.priorityQ.sl))
  1026  	}
  1027  }
  1028  
  1029  // finishProcessingReplica handles the completion of a replica process attempt.
  1030  // It removes the replica from the replica set and may re-enqueue the replica or
  1031  // add it to purgatory.
  1032  func (bq *baseQueue) finishProcessingReplica(
  1033  	ctx context.Context, stopper *stop.Stopper, repl replicaInQueue, err error,
  1034  ) {
  1035  	bq.mu.Lock()
  1036  	// Remove item from replica set completely. We may add it
  1037  	// back in down below.
  1038  	item := bq.mu.replicas[repl.GetRangeID()]
  1039  	processing := item.processing
  1040  	callbacks := item.callbacks
  1041  	requeue := item.requeue
  1042  	item.callbacks = nil
  1043  	bq.removeFromReplicaSetLocked(repl.GetRangeID())
  1044  	item = nil // prevent accidental use below
  1045  	bq.mu.Unlock()
  1046  
  1047  	if !processing {
  1048  		log.Fatalf(ctx, "%s: attempt to remove non-processing replica %v", bq.name, repl)
  1049  	}
  1050  
  1051  	// Call any registered callbacks.
  1052  	for _, cb := range callbacks {
  1053  		cb(err)
  1054  	}
  1055  
  1056  	// Handle failures.
  1057  	if err != nil {
  1058  		benign := isBenign(err)
  1059  
  1060  		// Increment failures metric.
  1061  		//
  1062  		// TODO(tschottdorf): once we start asserting zero failures in tests
  1063  		// (and production), move benign failures into a dedicated category.
  1064  		bq.failures.Inc(1)
  1065  
  1066  		// Determine whether a failure is a purgatory error. If it is, add
  1067  		// the failing replica to purgatory. Note that even if the item was
  1068  		// scheduled to be requeued, we ignore this if we add the replica to
  1069  		// purgatory.
  1070  		if purgErr, ok := isPurgatoryError(err); ok {
  1071  			bq.mu.Lock()
  1072  			bq.addToPurgatoryLocked(ctx, stopper, repl, purgErr)
  1073  			bq.mu.Unlock()
  1074  			return
  1075  		}
  1076  
  1077  		// If not a benign or purgatory error, log.
  1078  		if !benign {
  1079  			log.Errorf(ctx, "%v", err)
  1080  		}
  1081  	}
  1082  
  1083  	// Maybe add replica back into queue, if requested.
  1084  	if requeue {
  1085  		bq.maybeAdd(ctx, repl, bq.store.Clock().Now())
  1086  	}
  1087  }
  1088  
  1089  // addToPurgatoryLocked adds the specified replica to purgatory, which holds
  1090  // replicas that have failed processing.
  1091  func (bq *baseQueue) addToPurgatoryLocked(
  1092  	ctx context.Context, stopper *stop.Stopper, repl replicaInQueue, purgErr purgatoryError,
  1093  ) {
  1094  	bq.mu.AssertHeld()
  1095  
  1096  	// Check whether the queue supports purgatory errors. If not then something
  1097  	// went wrong because a purgatory error should not have ended up here.
  1098  	if bq.impl.purgatoryChan() == nil {
  1099  		log.Errorf(ctx, "queue does not support purgatory errors, but saw %v", purgErr)
  1100  		return
  1101  	}
  1102  
  1103  	if log.V(1) {
  1104  		log.Infof(ctx, "purgatory: %v", purgErr)
  1105  	}
  1106  
  1107  	if _, found := bq.mu.replicas[repl.GetRangeID()]; found {
  1108  		// Don't add to purgatory if already in the queue (again). We need to
  1109  		// uphold the invariant that a replica is never both in the priority
  1110  		// queue and the purgatory at the same time or bad things will happen.
  1111  		// See bq.assertInvariants and:
  1112  		// https://github.com/cockroachdb/cockroach/issues/36277#issuecomment-482659939
  1113  		return
  1114  	}
  1115  
  1116  	item := &replicaItem{rangeID: repl.GetRangeID(), replicaID: repl.ReplicaID(), index: -1}
  1117  	bq.mu.replicas[repl.GetRangeID()] = item
  1118  
  1119  	defer func() {
  1120  		bq.purgatory.Update(int64(len(bq.mu.purgatory)))
  1121  	}()
  1122  
  1123  	// If purgatory already exists, just add to the map and we're done.
  1124  	if bq.mu.purgatory != nil {
  1125  		bq.mu.purgatory[repl.GetRangeID()] = purgErr
  1126  		return
  1127  	}
  1128  
  1129  	// Otherwise, create purgatory and start processing.
  1130  	bq.mu.purgatory = map[roachpb.RangeID]purgatoryError{
  1131  		repl.GetRangeID(): purgErr,
  1132  	}
  1133  
  1134  	workerCtx := bq.AnnotateCtx(context.Background())
  1135  	stopper.RunWorker(workerCtx, func(ctx context.Context) {
  1136  		ticker := time.NewTicker(purgatoryReportInterval)
        		defer ticker.Stop() // release the ticker's resources when the worker exits
  1137  		for {
  1138  			select {
  1139  			case <-bq.impl.purgatoryChan():
  1140  				func() {
  1141  					// Acquire from the process semaphore, release when done.
  1142  					bq.processSem <- struct{}{}
  1143  					defer func() { <-bq.processSem }()
  1144  
  1145  					// Remove all items from purgatory into a copied slice.
  1146  					bq.mu.Lock()
  1147  					ranges := make([]*replicaItem, 0, len(bq.mu.purgatory))
  1148  					for rangeID := range bq.mu.purgatory {
  1149  						item := bq.mu.replicas[rangeID]
  1150  						if item == nil {
  1151  							log.Fatalf(ctx, "r%d is in purgatory but not in replicas", rangeID)
  1152  						}
  1153  						item.setProcessing()
  1154  						ranges = append(ranges, item)
  1155  						bq.removeFromPurgatoryLocked(item)
  1156  					}
  1157  					bq.mu.Unlock()
  1158  
  1159  					for _, item := range ranges {
  1160  						repl, err := bq.getReplica(item.rangeID)
  1161  						if err != nil || item.replicaID != repl.ReplicaID() {
  1162  							continue
  1163  						}
  1164  						annotatedCtx := repl.AnnotateCtx(ctx)
  1165  						if stopper.RunTask(
  1166  							annotatedCtx, fmt.Sprintf("storage.%s: purgatory processing replica", bq.name),
  1167  							func(ctx context.Context) {
  1168  								err := bq.processReplica(ctx, repl)
  1169  								bq.finishProcessingReplica(ctx, stopper, repl, err)
  1170  							}) != nil {
  1171  							return
  1172  						}
  1173  					}
  1174  				}()
  1175  
  1176  				// Clean up purgatory, if empty.
  1177  				bq.mu.Lock()
  1178  				if len(bq.mu.purgatory) == 0 {
  1179  					log.Infof(ctx, "purgatory is now empty")
  1180  					bq.mu.purgatory = nil
  1181  					bq.mu.Unlock()
  1182  					return
  1183  				}
  1184  				bq.mu.Unlock()
  1185  			case <-ticker.C:
  1186  				// Report purgatory status.
  1187  				bq.mu.Lock()
  1188  				errMap := map[string]int{}
  1189  				for _, err := range bq.mu.purgatory {
  1190  					errMap[err.Error()]++
  1191  				}
  1192  				bq.mu.Unlock()
  1193  				for errStr, count := range errMap {
  1194  					log.Errorf(ctx, "%d replicas failing with %q", count, errStr)
  1195  				}
  1196  			case <-stopper.ShouldStop():
  1197  				return
  1198  			}
  1199  		}
  1200  	})
  1201  }
  1202  
  1203  // pop dequeues the highest priority replica, if any, in the queue. The
  1204  // replicaItem corresponding to the returned Replica will be moved to the
  1205  // "processing" state and should be cleaned up by calling
  1206  // finishProcessingReplica once the Replica has finished processing.
  1207  func (bq *baseQueue) pop() replicaInQueue {
  1208  	bq.mu.Lock()
  1209  	for {
  1210  		if bq.mu.priorityQ.Len() == 0 {
  1211  			bq.mu.Unlock()
  1212  			return nil
  1213  		}
  1214  		item := heap.Pop(&bq.mu.priorityQ).(*replicaItem)
  1215  		if item.processing {
  1216  			log.Fatalf(bq.AnnotateCtx(context.Background()), "%s pulled processing item from heap: %v", bq.name, item)
  1217  		}
  1218  		item.setProcessing()
  1219  		bq.pending.Update(int64(bq.mu.priorityQ.Len()))
  1220  		bq.mu.Unlock()
  1221  
  1222  		repl, _ := bq.getReplica(item.rangeID)
  1223  		if repl != nil && item.replicaID == repl.ReplicaID() {
  1224  			return repl
  1225  		}
  1226  		// Replica not found or was recreated with a new replica ID, remove from
  1227  		// set and try again.
  1228  		bq.mu.Lock()
  1229  		bq.removeFromReplicaSetLocked(item.rangeID)
  1230  	}
  1231  }
  1232  
  1233  // addLocked adds an element to the priority queue. Caller must hold mutex.
  1234  func (bq *baseQueue) addLocked(item *replicaItem) {
  1235  	heap.Push(&bq.mu.priorityQ, item)
  1236  	bq.pending.Update(int64(bq.mu.priorityQ.Len()))
  1237  	bq.mu.replicas[item.rangeID] = item
  1238  }
  1239  
  1240  // removeLocked removes an element from purgatory (if it's experienced an
  1241  // error) or from the priority queue by index. Caller must hold mutex.
  1242  func (bq *baseQueue) removeLocked(item *replicaItem) {
  1243  	if item.processing {
  1244  		// The item is processing. We can't interrupt the processing
  1245  		// or remove it from the replica set yet, but we can make sure
  1246  		// it doesn't get requeued.
  1247  		item.requeue = false
  1248  	} else {
  1249  		if _, inPurg := bq.mu.purgatory[item.rangeID]; inPurg {
  1250  			bq.removeFromPurgatoryLocked(item)
  1251  		} else if item.index >= 0 {
  1252  			bq.removeFromQueueLocked(item)
  1253  		} else {
  1254  			log.Fatalf(bq.AnnotateCtx(context.Background()),
  1255  				"item for r%d is only in replicas map, but is not processing",
  1256  				item.rangeID,
  1257  			)
  1258  		}
  1259  		bq.removeFromReplicaSetLocked(item.rangeID)
  1260  	}
  1261  }
  1262  
  1263  // Caller must hold mutex.
  1264  func (bq *baseQueue) removeFromPurgatoryLocked(item *replicaItem) {
  1265  	delete(bq.mu.purgatory, item.rangeID)
  1266  	bq.purgatory.Update(int64(len(bq.mu.purgatory)))
  1267  }
  1268  
  1269  // Caller must hold mutex.
  1270  func (bq *baseQueue) removeFromQueueLocked(item *replicaItem) {
  1271  	heap.Remove(&bq.mu.priorityQ, item.index)
  1272  	bq.pending.Update(int64(bq.mu.priorityQ.Len()))
  1273  }
  1274  
  1275  // Caller must hold mutex.
  1276  func (bq *baseQueue) removeFromReplicaSetLocked(rangeID roachpb.RangeID) {
  1277  	if _, found := bq.mu.replicas[rangeID]; !found {
  1278  		log.Fatalf(bq.AnnotateCtx(context.Background()),
  1279  			"attempted to remove r%d from queue, but it isn't in it",
  1280  			rangeID,
  1281  		)
  1282  	}
  1283  	delete(bq.mu.replicas, rangeID)
  1284  }
  1285  
  1286  // DrainQueue locks the queue and processes the remaining queued replicas. It
  1287  // processes the replicas in the order they're queued in, one at a time.
  1288  // Exposed for testing only.
  1289  func (bq *baseQueue) DrainQueue(stopper *stop.Stopper) {
  1290  	// Lock processing while draining. This prevents the main process
  1291  	// loop from racing with this method and ensures that any replicas
  1292  	// queued up when this method was called will be processed by the
  1293  	// time it returns.
  1294  	defer bq.lockProcessing()()
  1295  
  1296  	ctx := bq.AnnotateCtx(context.TODO())
  1297  	for repl := bq.pop(); repl != nil; repl = bq.pop() {
  1298  		annotatedCtx := repl.AnnotateCtx(ctx)
  1299  		err := bq.processReplica(annotatedCtx, repl)
  1300  		bq.finishProcessingReplica(annotatedCtx, stopper, repl, err)
  1301  	}
  1302  }