github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raft_log_queue.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sort"
    17  	"strings"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/config"
    21  	"github.com/cockroachdb/cockroach/pkg/gossip"
    22  	"github.com/cockroachdb/cockroach/pkg/kv"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/util"
    25  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    26  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    29  	"github.com/cockroachdb/errors"
    30  	"go.etcd.io/etcd/raft"
    31  	"go.etcd.io/etcd/raft/tracker"
    32  )
    33  
    34  const (
    35  	// raftLogQueueTimerDuration is the duration between truncations.
    36  	raftLogQueueTimerDuration = 0 // zero duration to process truncations greedily
    37  	// RaftLogQueueStaleThreshold is the minimum threshold for stale raft log
    38  	// entries. A stale entry is one which all replicas of the range have
    39  	// progressed past and thus is no longer needed and can be truncated.
    40  	RaftLogQueueStaleThreshold = 100
    41  	// RaftLogQueueStaleSize is the minimum size of the Raft log that we'll
    42  	// truncate even if there are fewer than RaftLogQueueStaleThreshold entries
    43  	// to truncate. The value of 64 KB was chosen experimentally by looking at
    44  	// when Raft log truncation usually occurs when using the number of entries
    45  	// as the sole criteria.
    46  	RaftLogQueueStaleSize = 64 << 10
    47  	// Allow a limited number of Raft log truncations to be processed
    48  	// concurrently.
    49  	raftLogQueueConcurrency = 4
    50  	// While a snapshot is in flight, we won't truncate past the snapshot's log
    51  	// index. This behavior is extended to a grace period after the snapshot is
    52  	// marked as completed, since it is applied at the receiver only a little
    53  	// later, leaving a window for a truncation that would require another snapshot.
    54  	raftLogQueuePendingSnapshotGracePeriod = 3 * time.Second
    55  )
    56  
    57  // raftLogQueue manages a queue of replicas slated to have their raft logs
    58  // truncated by removing unneeded entries.
    59  type raftLogQueue struct {
    60  	*baseQueue
    61  	db *kv.DB
    62  
    63  	logSnapshots util.EveryN
    64  }
    65  
    66  // newRaftLogQueue returns a new instance of raftLogQueue. Replicas are passed
    67  // to the queue both proactively (triggered by write load) and periodically
    68  // (via the scanner). When processing a replica, the queue decides whether the
    69  // Raft log can be truncated, which is a tradeoff between wanting to keep the
    70  // log short overall and allowing slower followers to catch up before they get
    71  // cut off by a truncation and need a snapshot. See newTruncateDecision for
    72  // details on this decision making process.
    73  func newRaftLogQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *raftLogQueue {
    74  	rlq := &raftLogQueue{
    75  		db:           db,
    76  		logSnapshots: util.Every(10 * time.Second),
    77  	}
    78  	rlq.baseQueue = newBaseQueue(
    79  		"raftlog", rlq, store, gossip,
    80  		queueConfig{
    81  			maxSize:              defaultQueueMaxSize,
    82  			maxConcurrency:       raftLogQueueConcurrency,
    83  			needsLease:           false,
    84  			needsSystemConfig:    false,
    85  			acceptsUnsplitRanges: true,
    86  			successes:            store.metrics.RaftLogQueueSuccesses,
    87  			failures:             store.metrics.RaftLogQueueFailures,
    88  			pending:              store.metrics.RaftLogQueuePending,
    89  			processingNanos:      store.metrics.RaftLogQueueProcessingNanos,
    90  		},
    91  	)
    92  	return rlq
    93  }
    94  
    95  // newTruncateDecision returns a truncateDecision for the given Replica if no
    96  // error occurs. If input data to establish a truncateDecision is missing, a
    97  // zero decision is returned.
    98  //
    99  // At a high level, a truncate decision operates based on the Raft log size, the
   100  // number of entries in the log, and the Raft status of the followers. In an
   101  // ideal world and most of the time, followers are reasonably up to date, and a
   102  // decision to truncate to the index acked on all replicas will be made whenever
   103  // there is at least a little bit of log to truncate (think a hundred records or
   104  // ~100kb of data). If followers fall behind, are offline, or are waiting for a
   105  // snapshot, a second strategy is needed to make sure that the Raft log is
   106  // eventually truncated: when the raft log size exceeds a limit (4mb at time of
   107  // writing), truncations become willing and able to cut off followers as long as
   108  // a quorum has acked the truncation index. The quota pool ensures that the delta
   109  // between "acked by quorum" and "acked by all" is bounded, while Raft limits the
   110  // size of the uncommitted, i.e. not "acked by quorum", part of the log; thus
   111  // the "quorum" truncation strategy bounds the absolute size of the log on all
   112  // followers.
   113  //
   114  // Exceptions are made for replicas for which information is missing ("probing
   115  // state") as long as they are known to have been online recently, and for
   116  // in-flight snapshots (in particular preemptive snapshots) which are not
   117  // adequately reflected in the Raft status and would otherwise be cut off with
   118  // regularity. Probing live followers should only remain in this state for a
   119  // short moment and so we deny a log truncation outright (as there's no safe
   120  // index to truncate to); for snapshots, we can still truncate, but not past
   121  // the snapshot's index.
   122  //
   123  // A challenge for log truncation is to deal with sideloaded log entries, that
   124  // is, entries which contain SSTables for direct ingestion into the storage
   125  // engine. Such log entries are very large, and failing to account for them in
   126  // the heuristics can trigger overly aggressive truncations.
   127  //
   128  // The raft log size used in the decision making process is principally updated
   129  // in the main Raft command apply loop, which adds a Replica to this queue
   130  // whenever the log size has increased by a non-negligible amount that would be
   131  // worth truncating (~100kb).
   132  //
   133  // Unfortunately, the size tracking is not very robust as it suffers from two
   134  // limitations at the time of writing:
   135  // 1. it may undercount as it is in-memory and incremented only as proposals
   136  //    are handled; that is, a freshly started node will believe its Raft log to be
   137  //    zero-sized independent of its actual size, and
   138  // 2. the addition and corresponding subtraction happen in very different places
   139  //    and are difficult to keep bug-free, meaning that there is low confidence that
   140  //    we maintain the delta in a completely accurate manner over time. One example
   141  //    of potential errors are sideloaded proposals, for which the subtraction needs
   142  //    to load the size of the file on-disk (i.e. supplied by the fs), whereas
   143  //    the addition uses the in-memory representation of the file.
   144  //
   145  // Ideally, a Raft log that grows large for whichever reason (for instance the
   146  // queue being stuck on another replica) wouldn't be more than a nuisance on
   147  // nodes with sufficient disk space. Unfortunately, at the time of writing, the
   148  // Raft log is included in Raft snapshots. On the other hand, IMPORT/RESTORE's
   149  // split/scatter phase interacts poorly with overly aggressive truncations and
   150  // can DDOS the Raft snapshot queue.
   151  func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, error) {
   152  	rangeID := r.RangeID
   153  	now := timeutil.Now()
   154  
   155  	// NB: we need an exclusive lock due to grabbing the first index.
   156  	r.mu.Lock()
   157  	raftLogSize := r.mu.raftLogSize
   158  	// A "cooperative" truncation (i.e. one that does not cut off followers from
   159  	// the log) takes place whenever there are more than
   160  	// RaftLogQueueStaleThreshold entries or the log's estimated size is above
   161  	// RaftLogQueueStaleSize bytes. This is fairly aggressive, so under normal
   162  	// conditions, the log is very small.
   163  	//
   164  	// If followers start falling behind, at some point the logs still need to
   165  	// be truncated. We do this when the size of the log exceeds
   166  	// RaftLogTruncationThreshold (or, in eccentric configurations, the zone's
   167  	// RangeMaxBytes). This captures the heuristic that at some point, it's more
   168  	// efficient to catch up via a snapshot than via applying a long tail of log
   169  	// entries.
   170  	targetSize := r.store.cfg.RaftLogTruncationThreshold
   171  	if targetSize > *r.mu.zone.RangeMaxBytes {
   172  		targetSize = *r.mu.zone.RangeMaxBytes
   173  	}
   174  	raftStatus := r.raftStatusRLocked()
   175  
   176  	firstIndex, err := r.raftFirstIndexLocked()
   177  	const anyRecipientStore roachpb.StoreID = 0
   178  	pendingSnapshotIndex := r.getAndGCSnapshotLogTruncationConstraintsLocked(now, anyRecipientStore)
   179  	lastIndex := r.mu.lastIndex
   180  	logSizeTrusted := r.mu.raftLogSizeTrusted
   181  	r.mu.Unlock()
   182  
   183  	if err != nil {
   184  		return truncateDecision{}, errors.Errorf("error retrieving first index for r%d: %s", rangeID, err)
   185  	}
   186  
   187  	if raftStatus == nil {
   188  		if log.V(6) {
   189  			log.Infof(ctx, "the raft group doesn't exist for r%d", rangeID)
   190  		}
   191  		return truncateDecision{}, nil
   192  	}
   193  
   194  	// Is this the raft leader? We only perform log truncation on the raft leader
   195  	// which has up-to-date info on its followers.
   196  	if raftStatus.RaftState != raft.StateLeader {
   197  		return truncateDecision{}, nil
   198  	}
   199  
   200  	// For all our followers, overwrite the RecentActive field (which is always
   201  	// true since we don't use CheckQuorum) with our own activity check.
   202  	r.mu.RLock()
   203  	log.Eventf(ctx, "raft status before lastUpdateTimes check: %+v", raftStatus.Progress)
   204  	log.Eventf(ctx, "lastUpdateTimes: %+v", r.mu.lastUpdateTimes)
   205  	updateRaftProgressFromActivity(
   206  		ctx, raftStatus.Progress, r.descRLocked().Replicas().All(),
   207  		func(replicaID roachpb.ReplicaID) bool {
   208  			return r.mu.lastUpdateTimes.isFollowerActiveSince(
   209  				ctx, replicaID, now, r.store.cfg.RangeLeaseActiveDuration())
   210  		},
   211  	)
   212  	log.Eventf(ctx, "raft status after lastUpdateTimes check: %+v", raftStatus.Progress)
   213  	r.mu.RUnlock()
   214  
   215  	if pr, ok := raftStatus.Progress[raftStatus.Lead]; ok {
   216  		// TODO(tschottdorf): remove this line once we have picked up
   217  		// https://github.com/etcd-io/etcd/pull/10279
   218  		pr.State = tracker.StateReplicate
   219  		raftStatus.Progress[raftStatus.Lead] = pr
   220  	}
   221  
   222  	input := truncateDecisionInput{
   223  		RaftStatus:           *raftStatus,
   224  		LogSize:              raftLogSize,
   225  		MaxLogSize:           targetSize,
   226  		LogSizeTrusted:       logSizeTrusted,
   227  		FirstIndex:           firstIndex,
   228  		LastIndex:            lastIndex,
   229  		PendingSnapshotIndex: pendingSnapshotIndex,
   230  	}
   231  
   232  	decision := computeTruncateDecision(input)
   233  	return decision, nil
   234  }
   235  
   236  func updateRaftProgressFromActivity(
   237  	ctx context.Context,
   238  	prs map[uint64]tracker.Progress,
   239  	replicas []roachpb.ReplicaDescriptor,
   240  	replicaActive func(roachpb.ReplicaID) bool,
   241  ) {
   242  	for _, replDesc := range replicas {
   243  		replicaID := replDesc.ReplicaID
   244  		pr, ok := prs[uint64(replicaID)]
   245  		if !ok {
   246  			continue
   247  		}
   248  		pr.RecentActive = replicaActive(replicaID)
   249  		// Override this field for safety since we don't use it. Instead, we use
   250  		// pendingSnapshotIndex from above which is also populated for preemptive
   251  		// snapshots.
   252  		//
   253  		// NOTE: We don't rely on PendingSnapshot because PendingSnapshot is
   254  		// initialized by the leader when it realizes the follower needs a snapshot,
   255  		// and it isn't initialized with the index of the snapshot that is actually
   256  		// sent by us (out of band), which likely is lower.
   257  		pr.PendingSnapshot = 0
   258  		prs[uint64(replicaID)] = pr
   259  	}
   260  }
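
// exampleUpdateRaftProgressFromActivity is an illustrative sketch (an editorial
// addition, not part of the upstream file). It shows how the activity callback
// overrides RecentActive and how PendingSnapshot is zeroed; the replica IDs,
// Progress values, and the function name are hypothetical.
func exampleUpdateRaftProgressFromActivity(ctx context.Context) {
	prs := map[uint64]tracker.Progress{
		1: {State: tracker.StateReplicate, Match: 10, RecentActive: true, PendingSnapshot: 7},
		2: {State: tracker.StateReplicate, Match: 4, RecentActive: true},
	}
	replicas := []roachpb.ReplicaDescriptor{{ReplicaID: 1}, {ReplicaID: 2}}
	updateRaftProgressFromActivity(ctx, prs, replicas, func(id roachpb.ReplicaID) bool {
		// Pretend only replica 1 has been heard from recently.
		return id == 1
	})
	// prs[2].RecentActive is now false, and PendingSnapshot is zeroed for both.
	fmt.Println(prs[1].RecentActive, prs[2].RecentActive, prs[1].PendingSnapshot) // true false 0
}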
   261  
   262  const (
   263  	truncatableIndexChosenViaCommitIndex     = "commit"
   264  	truncatableIndexChosenViaFollowers       = "followers"
   265  	truncatableIndexChosenViaProbingFollower = "probing follower"
   266  	truncatableIndexChosenViaPendingSnap     = "pending snapshot"
   267  	truncatableIndexChosenViaFirstIndex      = "first index"
   268  	truncatableIndexChosenViaLastIndex       = "last index"
   269  )
   270  
   271  type truncateDecisionInput struct {
   272  	RaftStatus            raft.Status
   273  	LogSize, MaxLogSize   int64
   274  	LogSizeTrusted        bool // false when LogSize might be off
   275  	FirstIndex, LastIndex uint64
   276  	PendingSnapshotIndex  uint64
   277  }
   278  
   279  func (input truncateDecisionInput) LogTooLarge() bool {
   280  	return input.LogSize > input.MaxLogSize
   281  }
   282  
   283  // truncateDecision describes a truncation decision.
   284  // Beware: when extending this struct, be sure to adjust .String()
   285  // so that it is guaranteed to not contain any PII or confidential
   286  // cluster data.
   287  type truncateDecision struct {
   288  	Input       truncateDecisionInput
   289  	CommitIndex uint64
   290  
   291  	NewFirstIndex uint64 // first index of the resulting log after truncation
   292  	ChosenVia     string
   293  }
   294  
   295  func (td *truncateDecision) raftSnapshotsForIndex(index uint64) int {
   296  	var n int
   297  	for _, p := range td.Input.RaftStatus.Progress {
   298  		if p.State != tracker.StateReplicate {
   299  			// If the follower isn't replicating, we can't trust its Match in
   300  			// the first place. But note that this shouldn't matter in practice
   301  			// as we already take care to not cut off these followers when
   302  			// computing the truncate decision. See:
   303  			_ = truncatableIndexChosenViaProbingFollower // guru ref
   304  			continue
   305  		}
   306  
   307  		// When a log truncation happens at the "current log index" (i.e. the
   308  		// most recently committed index), it is often still in flight to the
   309  		// followers not required for quorum, and it is likely that they won't
   310  		// need a truncation to catch up. A follower in that state will have a
   311  		// Match equaling committed-1, but a Next of committed+1 (indicating that
   312  		// an append at 'committed' is already ongoing).
   313  		if p.Match < index && p.Next <= index {
   314  			n++
   315  		}
   316  	}
   317  	if td.Input.PendingSnapshotIndex != 0 && td.Input.PendingSnapshotIndex < index {
   318  		n++
   319  	}
   320  
   321  	return n
   322  }
   323  
   324  func (td *truncateDecision) NumNewRaftSnapshots() int {
   325  	return td.raftSnapshotsForIndex(td.NewFirstIndex) - td.raftSnapshotsForIndex(td.Input.FirstIndex)
   326  }
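
// exampleNumNewRaftSnapshots is an illustrative sketch (an editorial addition,
// not part of the upstream file): a replicating follower at Match=90/Next=91 is
// cut off by a truncation to index 100, while a caught-up follower is not. All
// indexes and the function name are hypothetical.
func exampleNumNewRaftSnapshots() {
	td := truncateDecision{
		Input: truncateDecisionInput{
			FirstIndex: 80,
			RaftStatus: raft.Status{
				Progress: map[uint64]tracker.Progress{
					1: {State: tracker.StateReplicate, Match: 150, Next: 151},
					2: {State: tracker.StateReplicate, Match: 90, Next: 91},
				},
			},
		},
		NewFirstIndex: 100,
	}
	// Only the follower at Match=90 would need a new Raft snapshot.
	fmt.Println(td.NumNewRaftSnapshots()) // 1
}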
   327  
   328  // String returns a representation for the decision.
   329  // It is guaranteed to not return PII or confidential
   330  // information from the cluster.
   331  func (td *truncateDecision) String() string {
   332  	var buf strings.Builder
   333  	_, _ = fmt.Fprintf(&buf, "should truncate: %t [", td.ShouldTruncate())
   334  	_, _ = fmt.Fprintf(
   335  		&buf,
   336  		"truncate %d entries to first index %d (chosen via: %s)",
   337  		td.NumTruncatableIndexes(), td.NewFirstIndex, td.ChosenVia,
   338  	)
   339  	if td.Input.LogTooLarge() {
   340  		_, _ = fmt.Fprintf(
   341  			&buf,
   342  			"; log too large (%s > %s)",
   343  			humanizeutil.IBytes(td.Input.LogSize),
   344  			humanizeutil.IBytes(td.Input.MaxLogSize),
   345  		)
   346  	}
   347  	if n := td.NumNewRaftSnapshots(); n > 0 {
   348  		_, _ = fmt.Fprintf(&buf, "; implies %d Raft snapshot%s", n, util.Pluralize(int64(n)))
   349  	}
   350  	if !td.Input.LogSizeTrusted {
   351  		_, _ = fmt.Fprintf(&buf, "; log size untrusted")
   352  	}
   353  	buf.WriteRune(']')
   354  
   355  	return buf.String()
   356  }
   357  
   358  func (td *truncateDecision) NumTruncatableIndexes() int {
   359  	if td.NewFirstIndex < td.Input.FirstIndex {
   360  		return 0
   361  	}
   362  	return int(td.NewFirstIndex - td.Input.FirstIndex)
   363  }
   364  
   365  func (td *truncateDecision) ShouldTruncate() bool {
   366  	n := td.NumTruncatableIndexes()
   367  	return n >= RaftLogQueueStaleThreshold ||
   368  		(n > 0 && td.Input.LogSize >= RaftLogQueueStaleSize)
   369  }
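
// exampleShouldTruncate is an illustrative sketch (an editorial addition, not
// part of the upstream file) of the two cooperative-truncation triggers: a
// decision freeing only a few entries still qualifies once the tracked log
// size reaches RaftLogQueueStaleSize. The indexes, sizes, and function name
// are hypothetical.
func exampleShouldTruncate() {
	td := truncateDecision{
		Input:         truncateDecisionInput{FirstIndex: 100, LogSize: 128 << 10},
		NewFirstIndex: 110, // only 10 truncatable entries, below RaftLogQueueStaleThreshold
	}
	fmt.Println(td.ShouldTruncate()) // true: the 128 KiB log exceeds RaftLogQueueStaleSize
}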
   370  
   371  // ProtectIndex attempts to "protect" a position in the log by making sure it's
   372  // not truncated away. Specifically it lowers the proposed truncation point
   373  // (which will be the new first index after the truncation) to the given index
   374  // if it would be truncating at a point past it. If a change is made, the
   375  // ChosenVia is updated with the one given. This protection is not guaranteed if
   376  // the protected index is outside of the existing [FirstIndex,LastIndex] bounds.
   377  func (td *truncateDecision) ProtectIndex(index uint64, chosenVia string) {
   378  	if td.NewFirstIndex > index {
   379  		td.NewFirstIndex = index
   380  		td.ChosenVia = chosenVia
   381  	}
   382  }
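
// exampleProtectIndex is an illustrative sketch (an editorial addition, not
// part of the upstream file): ProtectIndex only ever lowers the proposed
// truncation point, so protecting an index above the current proposal is a
// no-op. The indexes and function name are hypothetical.
func exampleProtectIndex() {
	td := truncateDecision{NewFirstIndex: 120, ChosenVia: truncatableIndexChosenViaLastIndex}
	td.ProtectIndex(100, truncatableIndexChosenViaFollowers)   // lowers the truncation point
	td.ProtectIndex(110, truncatableIndexChosenViaPendingSnap) // no-op: 110 is above 100
	fmt.Println(td.NewFirstIndex, td.ChosenVia)                // 100 followers
}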
   383  
   384  // computeTruncateDecision returns the oldest index that cannot be
   385  // truncated. If there is a behind node, we want to keep old raft logs so it
   386  // can catch up without having to send a full snapshot. However, if a node is
   387  // down long enough, sending a snapshot is more efficient and we should
   388  // truncate the log to the next behind node or the quorum committed index. We
   389  // currently truncate when the raft log size is bigger than the range
   390  // size.
   391  //
   392  // Note that when a node is behind we continue to let the raft log build up
   393  // instead of truncating to the commit index. Consider what would happen if we
   394  // truncated to the commit index whenever a node is behind and thus needs to be
   395  // caught up via a snapshot. While we're generating the snapshot, sending it to
   396  // the behind node and waiting for it to be applied we would continue to
   397  // truncate the log. If the snapshot generation and application takes too long
   398  // the behind node will be caught up to a point behind the current first index
   399  // and thus require another snapshot, likely entering a never ending loop of
   400  // snapshots. See #8629.
   401  func computeTruncateDecision(input truncateDecisionInput) truncateDecision {
   402  	decision := truncateDecision{Input: input}
   403  	decision.CommitIndex = input.RaftStatus.Commit
   404  
   405  	// The last index is the most aggressive possible truncation that we could do.
   406  	// Everything else in this method makes the truncation less aggressive.
   407  	decision.NewFirstIndex = input.LastIndex
   408  	decision.ChosenVia = truncatableIndexChosenViaLastIndex
   409  
   410  	// Start by trying to truncate at the commit index. Naively, you would expect
   411  	// LastIndex to never be smaller than the commit index, but
   412  	// RaftStatus.Progress.Match is updated on the leader when a command is
   413  	// proposed and in a single replica Raft group this also means that
   414  	// RaftStatus.Commit is updated at propose time.
   415  	decision.ProtectIndex(decision.CommitIndex, truncatableIndexChosenViaCommitIndex)
   416  
   417  	for _, progress := range input.RaftStatus.Progress {
   418  		// Snapshots are expensive, so we try our best to avoid truncating past
   419  		// where a follower is.
   420  
   421  		// First, we never truncate off a recently active follower, no matter how
   422  		// large the log gets. Recently active shares the (currently 10s) constant
   423  		// as the quota pool, so the quota pool should put a bound on how much the
   424  		// raft log can grow due to this.
   425  		//
   426  		// For live followers which are being probed (i.e. the leader doesn't know
   427  		// how far they've caught up), the Match index is too large, and so the
   428  		// quorum index can be, too. We don't want these followers to require a
   429  		// snapshot since they are most likely going to be caught up very soon (they
   430  		// respond with the "right index" to the first probe or don't respond, in
   431  		// which case they should end up as not recently active). But we also don't
   432  		// know their index, so we can't possibly make a truncation decision that
   433  		// avoids that at this point, and instead make the truncation a no-op.
   434  		//
   435  		// The scenario in which this is most relevant is during restores, where we
   436  		// split off new ranges that rapidly receive very large log entries while
   437  		// the Raft group is still in a state of discovery (a new leader starts
   438  		// probing followers at its own last index). Additionally, these ranges will
   439  		// be split many times over, resulting in a flurry of snapshots with
   440  		// overlapping bounds that put significant stress on the Raft snapshot
   441  		// queue.
   442  		if progress.RecentActive {
   443  			if progress.State == tracker.StateProbe {
   444  				decision.ProtectIndex(input.FirstIndex, truncatableIndexChosenViaProbingFollower)
   445  			} else {
   446  				decision.ProtectIndex(progress.Match, truncatableIndexChosenViaFollowers)
   447  			}
   448  			continue
   449  		}
   450  
   451  		// Second, if the follower has not been recently active, we don't
   452  		// truncate it off as long as the raft log is not too large.
   453  		if !input.LogTooLarge() {
   454  			decision.ProtectIndex(progress.Match, truncatableIndexChosenViaFollowers)
   455  		}
   456  
   457  		// Otherwise, we let it truncate to the committed index.
   458  	}
   459  
   460  	// The pending snapshot index acts as a placeholder for a replica that is
   461  	// about to be added to the range (or is in Raft recovery). We don't want to
   462  	// truncate the log in a way that will require that new replica to be caught
   463  	// up via yet another Raft snapshot.
   464  	if input.PendingSnapshotIndex > 0 {
   465  		decision.ProtectIndex(input.PendingSnapshotIndex, truncatableIndexChosenViaPendingSnap)
   466  	}
   467  
   468  	// If the new first index dropped below the first index, make them equal (resulting
   469  	// in a no-op).
   470  	if decision.NewFirstIndex < input.FirstIndex {
   471  		decision.NewFirstIndex = input.FirstIndex
   472  		decision.ChosenVia = truncatableIndexChosenViaFirstIndex
   473  	}
   474  
   475  	// We've inherited the unfortunate semantics for {First,Last}Index from
   476  	// raft.Storage. Specifically, both {First,Last}Index are inclusive, so
   477  	// there's no way to represent an empty log. The way we've initialized
   478  	// repl.FirstIndex is to set it to the first index in the possibly-empty log
   479  	// (TruncatedState.Index + 1), and allowing LastIndex to fall behind it when
   480  	// the log is empty (TruncatedState.Index). The initialization is done when
   481  	// minting a new replica from either the truncated state of incoming
   482  	// snapshot, or using the default initial log index. This makes for the
   483  	// confusing situation where FirstIndex > LastIndex. We can detect this
   484  	// special empty log case by comparing checking if
   485  	// `FirstIndex == LastIndex + 1` (`logEmpty` below). Similar to this, we can
   486  	// have the case that `FirstIndex = CommitIndex + 1` when there are no
   487  	// committed entries (which we check for in `noCommittedEntries` below).
   488  	// Having done that (i.e. if the raft log is not empty, and there are
   489  	// committed entries), we can assert on the following invariants:
   490  	//
   491  	//         FirstIndex    <= LastIndex                                    (0)
   492  	//         NewFirstIndex >= FirstIndex                                   (1)
   493  	//         NewFirstIndex <= LastIndex                                    (2)
   494  	//         NewFirstIndex <= CommitIndex                                  (3)
   495  	//
   496  	// (1) asserts that we're not regressing our FirstIndex
   497  	// (2) asserts that we don't truncate past the last index we can
   498  	//     truncate away, and
   499  	// (3) is similar to (2) in that we assert that we're not truncating past
   500  	//     the last known CommitIndex.
   501  	//
   502  	// TODO(irfansharif): We should consider cleaning up this mess around
   503  	// {First,Last,Commit}Index by using a sentinel value to represent an empty
   504  	// log (like we do with `invalidLastTerm`). It'd be extra nice if we could
   505  	// safeguard access by relying on the type system to force callers to
   506  	// consider the empty case. Something like
   507  	// https://github.com/nvanbenschoten/optional could help us emulate an
   508  	// `option<uint64>` type if we care enough.
   509  	logEmpty := input.FirstIndex == input.LastIndex+1
   510  	noCommittedEntries := input.FirstIndex == input.RaftStatus.Commit+1
   511  
   512  	logIndexValid := logEmpty ||
   513  		(decision.NewFirstIndex >= input.FirstIndex) && (decision.NewFirstIndex <= input.LastIndex)
   514  	commitIndexValid := noCommittedEntries ||
   515  		(decision.NewFirstIndex <= decision.CommitIndex)
   516  	valid := logIndexValid && commitIndexValid
   517  	if !valid {
   518  		err := fmt.Sprintf("invalid truncation decision: output = %d, input: [%d, %d], commit idx = %d",
   519  			decision.NewFirstIndex, input.FirstIndex, input.LastIndex, decision.CommitIndex)
   520  		panic(err)
   521  	}
   522  
   523  	return decision
   524  }
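
// exampleComputeTruncateDecision is an illustrative sketch (an editorial
// addition, not part of the upstream file) of the common "cooperative" case:
// all followers are recently active and replicating, so the decision never
// cuts anyone off and truncates up to the slowest follower's Match index.
// The replica IDs, indexes, sizes, and function name are hypothetical.
func exampleComputeTruncateDecision() {
	status := raft.Status{
		Progress: map[uint64]tracker.Progress{
			1: {State: tracker.StateReplicate, Match: 105, Next: 106, RecentActive: true},
			2: {State: tracker.StateReplicate, Match: 105, Next: 106, RecentActive: true},
			3: {State: tracker.StateReplicate, Match: 103, Next: 104, RecentActive: true},
		},
	}
	status.Commit = 105 // promoted from the embedded HardState

	decision := computeTruncateDecision(truncateDecisionInput{
		RaftStatus:     status,
		LogSize:        128 << 10, // above RaftLogQueueStaleSize
		MaxLogSize:     4 << 20,   // the ~4mb limit mentioned in the comments above
		LogSizeTrusted: true,
		FirstIndex:     1,
		LastIndex:      105,
	})
	// The active follower at Match=103 protects index 103, so the new first
	// index is 103 (chosen via "followers") and 102 entries are truncatable.
	fmt.Println(decision.ShouldTruncate(), decision.NewFirstIndex, decision.ChosenVia) // true 103 followers
}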
   525  
   526  // shouldQueue determines whether a range should be queued for truncating. This
   527  // is true only if the replica is the raft leader and if the total number of
   528  // the range's raft log's stale entries exceeds RaftLogQueueStaleThreshold.
   529  func (rlq *raftLogQueue) shouldQueue(
   530  	ctx context.Context, now hlc.Timestamp, r *Replica, _ *config.SystemConfig,
   531  ) (shouldQ bool, priority float64) {
   532  	decision, err := newTruncateDecision(ctx, r)
   533  	if err != nil {
   534  		log.Warningf(ctx, "%v", err)
   535  		return false, 0
   536  	}
   537  
   538  	shouldQ, _, prio := rlq.shouldQueueImpl(ctx, decision)
   539  	return shouldQ, prio
   540  }
   541  
   542  // shouldQueueImpl returns whether the given truncate decision should lead to
   543  // a log truncation. This is either the case if the decision says so or if
   544  // we want to recompute the log size (in which case `recomputeRaftLogSize` and
   545  // `shouldQ` are both true and a reasonable priority is returned).
   546  func (rlq *raftLogQueue) shouldQueueImpl(
   547  	ctx context.Context, decision truncateDecision,
   548  ) (shouldQ bool, recomputeRaftLogSize bool, priority float64) {
   549  	if decision.ShouldTruncate() {
   550  		return true, !decision.Input.LogSizeTrusted, float64(decision.Input.LogSize)
   551  	}
   552  	if decision.Input.LogSizeTrusted ||
   553  		decision.Input.LastIndex == decision.Input.FirstIndex {
   554  
   555  		return false, false, 0
   556  	}
   557  	// We have a nonempty log (first index != last index) and can't vouch that
   558  	// the tracked log size is accurate. Queue the replica; processing it will
   559  	// force a recomputation. For the priority, we have to pick one as we
   560  	// usually use the log size which is not available here. Going half-way
   561  	// between zero and the MaxLogSize should give a good tradeoff between
   562  	// processing the recomputation quickly, and not starving replicas which see
   563  	// a significant amount of write traffic until they run over and truncate
   564  	// more aggressively than they need to.
   565  	return true, true, 1.0 + float64(decision.Input.MaxLogSize)/2.0
   566  }
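
// exampleShouldQueueImpl is an illustrative sketch (an editorial addition, not
// part of the upstream file) of the recomputation path described above: a
// nonempty log with an untrusted size is queued even though the decision
// itself would not truncate anything. The values and function name are
// hypothetical.
func exampleShouldQueueImpl(ctx context.Context) {
	var rlq raftLogQueue
	decision := truncateDecision{
		Input: truncateDecisionInput{
			LogSizeTrusted: false,
			FirstIndex:     10,
			LastIndex:      50,
			MaxLogSize:     4 << 20,
		},
	}
	shouldQ, recompute, prio := rlq.shouldQueueImpl(ctx, decision)
	fmt.Println(shouldQ, recompute, prio) // true true 2.097153e+06 (1 + MaxLogSize/2)
}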
   567  
   568  // process truncates the raft log of the range if the replica is the raft
   569  // leader and if the total number of the range's raft log's stale entries
   570  // exceeds RaftLogQueueStaleThreshold.
   571  func (rlq *raftLogQueue) process(ctx context.Context, r *Replica, _ *config.SystemConfig) error {
   572  	decision, err := newTruncateDecision(ctx, r)
   573  	if err != nil {
   574  		return err
   575  	}
   576  
   577  	if _, recompute, _ := rlq.shouldQueueImpl(ctx, decision); recompute {
   578  		log.VEventf(ctx, 2, "recomputing raft log based on decision %+v", decision)
   579  
   580  		// We need to hold raftMu both to access the sideloaded storage and to
   581  		// make sure concurrent Raft activity doesn't foul up our update to the
   582  		// cached in-memory values.
   583  		r.raftMu.Lock()
   584  		n, err := ComputeRaftLogSize(ctx, r.RangeID, r.Engine(), r.raftMu.sideloaded)
   585  		if err == nil {
   586  			r.mu.Lock()
   587  			r.mu.raftLogSize = n
   588  			r.mu.raftLogLastCheckSize = n
   589  			r.mu.raftLogSizeTrusted = true
   590  			r.mu.Unlock()
   591  		}
   592  		r.raftMu.Unlock()
   593  
   594  		if err != nil {
   595  			return errors.Wrap(err, "recomputing raft log size")
   596  		}
   597  
   598  		log.VEventf(ctx, 2, "recomputed raft log size to %s", humanizeutil.IBytes(n))
   599  
   600  		// Override the decision, now that an accurate log size is available.
   601  		decision, err = newTruncateDecision(ctx, r)
   602  		if err != nil {
   603  			return err
   604  		}
   605  	}
   606  
   607  	// Can and should the raft logs be truncated?
   608  	if decision.ShouldTruncate() {
   609  		if n := decision.NumNewRaftSnapshots(); log.V(1) || n > 0 && rlq.logSnapshots.ShouldProcess(timeutil.Now()) {
   610  			log.Infof(ctx, "%v", log.Safe(decision.String()))
   611  		} else {
   612  			log.VEventf(ctx, 1, "%v", log.Safe(decision.String()))
   613  		}
   614  		b := &kv.Batch{}
   615  		b.AddRawRequest(&roachpb.TruncateLogRequest{
   616  			RequestHeader: roachpb.RequestHeader{Key: r.Desc().StartKey.AsRawKey()},
   617  			Index:         decision.NewFirstIndex,
   618  			RangeID:       r.RangeID,
   619  		})
   620  		if err := rlq.db.Run(ctx, b); err != nil {
   621  			return err
   622  		}
   623  		r.store.metrics.RaftLogTruncated.Inc(int64(decision.NumTruncatableIndexes()))
   624  	} else {
   625  		log.VEventf(ctx, 3, "%s", log.Safe(decision.String()))
   626  	}
   627  	return nil
   628  }
   629  
   630  // timer returns the interval between processing successive queued truncations.
   631  func (*raftLogQueue) timer(_ time.Duration) time.Duration {
   632  	return raftLogQueueTimerDuration
   633  }
   634  
   635  // purgatoryChan returns nil.
   636  func (*raftLogQueue) purgatoryChan() <-chan time.Time {
   637  	return nil
   638  }
   639  
   640  var _ sort.Interface = uint64Slice(nil)
   641  
   642  // uint64Slice implements sort.Interface
   643  type uint64Slice []uint64
   644  
   645  // Len implements sort.Interface
   646  func (a uint64Slice) Len() int { return len(a) }
   647  
   648  // Swap implements sort.Interface
   649  func (a uint64Slice) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
   650  
   651  // Less implements sort.Interface
   652  func (a uint64Slice) Less(i, j int) bool { return a[i] < a[j] }