github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_proposal_quota.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/tracker"
)

func (r *Replica) maybeAcquireProposalQuota(
	ctx context.Context, quota uint64,
) (*quotapool.IntAlloc, error) {
	r.mu.RLock()
	quotaPool := r.mu.proposalQuota
	desc := *r.mu.state.Desc
	r.mu.RUnlock()

	// Quota acquisition only takes place on the leader replica;
	// r.mu.proposalQuota is set to nil if a node is a follower (see
	// updateProposalQuotaRaftMuLocked). For the cases where the range lease
	// holder is not the same as the range leader, i.e. the lease holder is a
	// follower, r.mu.proposalQuota == nil. This means all quota acquisitions
	// go through without any throttling whatsoever, but given how short-lived
	// these scenarios are, we don't try to remedy this any further.
	//
	// NB: It is necessary to allow proposals with a nil quota pool to go
	// through, for otherwise a follower could never request the lease.

	if quotaPool == nil {
		return nil, nil
	}

	if !quotaPoolEnabledForRange(desc) {
		return nil, nil
	}

	// Trace if we're running low on available proposal quota; it might explain
	// why we're taking so long.
	if log.HasSpanOrEvent(ctx) {
		if q := quotaPool.ApproximateQuota(); q < quotaPool.Capacity()/10 {
			log.Eventf(ctx, "quota running low, currently available ~%d", q)
		}
	}
	alloc, err := quotaPool.Acquire(ctx, quota)
	// Let quotapool errors due to being closed pass through.
	if errors.HasType(err, (*quotapool.ErrClosed)(nil)) {
		err = nil
	}
	return alloc, err
}
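
// The sketch below is an illustrative addition (not part of the original
// file) showing the expected lifecycle of a quota allocation on the caller
// side: acquire quota before proposing, tolerate a nil allocation, and hand
// the allocation back once the entry has applied by appending it to
// r.mu.quotaReleaseQueue, which updateProposalQuotaRaftMuLocked drains back
// into the pool. exampleProposal and exampleAcquireQuota are hypothetical
// names used only for this sketch.
type exampleProposal struct {
	// quotaAlloc, if non-nil, is expected to be appended to
	// r.mu.quotaReleaseQueue once the corresponding entry has been applied.
	quotaAlloc *quotapool.IntAlloc
}

func (r *Replica) exampleAcquireQuota(
	ctx context.Context, p *exampleProposal, size uint64,
) error {
	alloc, err := r.maybeAcquireProposalQuota(ctx, size)
	if err != nil {
		return err
	}
	// alloc is nil when no throttling applies (follower, NodeLiveness range,
	// or a pool closed due to a leadership change); callers must handle that.
	p.quotaAlloc = alloc
	return nil
}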

func quotaPoolEnabledForRange(desc roachpb.RangeDescriptor) bool {
	// The NodeLiveness range does not use a quota pool. We don't want to
	// throttle updates to the NodeLiveness range even if a follower is falling
	// behind because this could result in cascading failures.
	return !bytes.HasPrefix(desc.StartKey, keys.NodeLivenessPrefix)
}

func (r *Replica) updateProposalQuotaRaftMuLocked(
	ctx context.Context, lastLeaderID roachpb.ReplicaID,
) {
	r.mu.Lock()
	defer r.mu.Unlock()

	status := r.mu.internalRaftGroup.BasicStatus()
	if r.mu.leaderID != lastLeaderID {
		if r.mu.replicaID == r.mu.leaderID {
			// We're becoming the leader.
			// Initialize the proposalQuotaBaseIndex at the applied index.
			// After the proposal quota is enabled all entries applied by this replica
			// will be appended to the quotaReleaseQueue. The proposalQuotaBaseIndex
			// and the quotaReleaseQueue together track status.Applied exactly.
			r.mu.proposalQuotaBaseIndex = status.Applied
			if r.mu.proposalQuota != nil {
				log.Fatal(ctx, "proposalQuota was not nil before becoming the leader")
			}
			if releaseQueueLen := len(r.mu.quotaReleaseQueue); releaseQueueLen != 0 {
				log.Fatalf(ctx, "len(r.mu.quotaReleaseQueue) = %d, expected 0", releaseQueueLen)
			}

			// Raft may propose commands itself (specifically the empty
			// commands when leadership changes), and these commands don't go
			// through the code paths where we acquire quota from the pool. To
			// offset this we reset the quota pool whenever leadership changes
			// hands.
			r.mu.proposalQuota = quotapool.NewIntPool(r.rangeStr.String(), uint64(r.store.cfg.RaftProposalQuota))
			r.mu.lastUpdateTimes = make(map[roachpb.ReplicaID]time.Time)
			r.mu.lastUpdateTimes.updateOnBecomeLeader(r.mu.state.Desc.Replicas().All(), timeutil.Now())
		} else if r.mu.proposalQuota != nil {
			// We're becoming a follower.
			// We unblock all ongoing and subsequent quota acquisition goroutines
			// (if any) and release the quotaReleaseQueue so its allocs are pooled.
			r.mu.proposalQuota.Close("leader change")
			r.mu.proposalQuota.Release(r.mu.quotaReleaseQueue...)
			r.mu.quotaReleaseQueue = nil
			r.mu.proposalQuota = nil
			r.mu.lastUpdateTimes = nil
		}
		return
	} else if r.mu.proposalQuota == nil {
		if r.mu.replicaID == r.mu.leaderID {
			log.Fatal(ctx, "leader has uninitialized proposalQuota pool")
		}
		// We're a follower.
		return
	}

	// We're still the leader.

	// Find the minimum index that active followers have acknowledged.
	now := timeutil.Now()
	// commitIndex is used to determine whether a newly added replica has fully
	// caught up.
	commitIndex := status.Commit
	// Initialize minIndex to the currently applied index. The below progress
	// checks will only decrease the minIndex. Given that the quotaReleaseQueue
	// cannot correspond to values beyond the applied index there's no reason
	// to consider progress beyond it as meaningful.
	minIndex := status.Applied
	r.mu.internalRaftGroup.WithProgress(func(id uint64, _ raft.ProgressType, progress tracker.Progress) {
		rep, ok := r.mu.state.Desc.GetReplicaDescriptorByID(roachpb.ReplicaID(id))
		if !ok {
			return
		}

		// Only consider followers that are active. Inactive ones don't decrease
		// minIndex - i.e. they don't hold up releasing quota.
		//
		// The policy for determining who's active is more strict than the one used
		// for purposes of quiescing. Failure to consider a dead/stuck node as such
		// for the purposes of releasing quota can have bad consequences (writes
		// will stall), whereas for quiescing the downside is lower.

		if !r.mu.lastUpdateTimes.isFollowerActiveSince(
			ctx, rep.ReplicaID, now, r.store.cfg.RangeLeaseActiveDuration(),
		) {
			return
		}

		// Only consider followers that have "healthy" RPC connections.
		if err := r.store.cfg.NodeDialer.ConnHealth(rep.NodeID, r.connectionClass.get()); err != nil {
			return
		}

		// Note that the Match field has different semantics depending on
		// the State.
		//
		// In state ProgressStateReplicate, the Match index is optimistically
		// updated whenever a message is *sent* (not received). Due to Raft
		// flow control, only a reasonably small amount of data can be en
		// route to a given follower at any point in time.
		//
		// In state ProgressStateProbe, the Match index equals Next-1, and
		// it tells us the leader's optimistic best guess for the right log
		// index (and will try once per heartbeat interval to update its
		// estimate). In the usual case, the follower responds with a hint
		// when it rejects the first probe and the leader replicates or
		// sends a snapshot. In the case in which the follower does not
		// respond, the leader reduces Match by one each heartbeat interval.
		// But if the follower does not respond, we've already filtered it
		// out above. We use the Match index as is, even though the follower
		// likely isn't there yet because that index won't go up unless the
		// follower is actually catching up, so it won't cause it to fall
		// behind arbitrarily.
		//
		// Another interesting tidbit about this state is that the Paused
		// field is usually true as it is used to limit the number of probes
		// (i.e. appends) sent to this follower to one per heartbeat
		// interval.
		//
		// In state ProgressStateSnapshot, the Match index is the last known
		// (possibly optimistic, depending on previous state) index before
		// the snapshot went out. Once the snapshot applies, the follower
		// will enter ProgressStateReplicate again. So here the Match index
		// works as advertised too.
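		//
		// In short (an added summary of the above): in each of these states
		// Match is a usable estimate of the follower's log position - it is
		// either only modestly optimistic or it does not advance without
		// actual progress - which is what the comparisons against
		// proposalQuotaBaseIndex and minIndex below rely on.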

		// Only consider followers who are in advance of the quota base
		// index. This prevents a follower from coming back online and
		// preventing throughput to the range until it has caught up.
		if progress.Match < r.mu.proposalQuotaBaseIndex {
			return
		}
		if progress.Match > 0 && progress.Match < minIndex {
			minIndex = progress.Match
		}
		// If this is the most recently added replica and it has caught up, clear
		// our state that was tracking it. This is unrelated to managing proposal
		// quota, but this is a convenient place to do so.
		if rep.ReplicaID == r.mu.lastReplicaAdded && progress.Match >= commitIndex {
			r.mu.lastReplicaAdded = 0
			r.mu.lastReplicaAddedTime = time.Time{}
		}
	})

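	// Worked example (added for illustration; the numbers are hypothetical):
	// suppose proposalQuotaBaseIndex = 100, the quotaReleaseQueue holds 5
	// allocs covering applied indexes 101..105 (so status.Applied = 105), and
	// the slowest active follower has Match = 103. Then minIndex = 103,
	// numReleases = 103 - 100 = 3, the first 3 allocs are released, the base
	// index advances to 103, and the 2 remaining allocs keep the invariant
	// proposalQuotaBaseIndex + len(quotaReleaseQueue) == status.Applied.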
	if r.mu.proposalQuotaBaseIndex < minIndex {
		// We've persisted at least minIndex-r.mu.proposalQuotaBaseIndex entries
		// to the raft log on all 'active' replicas and applied at least minIndex
		// entries locally since last we checked, so we are able to release the
		// difference back to the quota pool.
		numReleases := minIndex - r.mu.proposalQuotaBaseIndex

		// NB: Release deals with cases where allocs being released do not originate
		// from this incarnation of quotaReleaseQueue, which can happen if a
		// proposal acquires quota while this replica is the raft leader in some
		// term and then commits while at a different term.
		r.mu.proposalQuota.Release(r.mu.quotaReleaseQueue[:numReleases]...)
		r.mu.quotaReleaseQueue = r.mu.quotaReleaseQueue[numReleases:]
		r.mu.proposalQuotaBaseIndex += numReleases
	}
	// Assert the sanity of the base index and the queue. Queue entries
	// correspond to applied entries, so the base index plus the number of
	// not-yet-released entries must always equal the applied index.
	releasableIndex := r.mu.proposalQuotaBaseIndex + uint64(len(r.mu.quotaReleaseQueue))
	if releasableIndex != status.Applied {
		log.Fatalf(ctx, "proposalQuotaBaseIndex (%d) + quotaReleaseQueueLen (%d) = %d"+
			" must equal the applied index (%d)",
			r.mu.proposalQuotaBaseIndex, len(r.mu.quotaReleaseQueue), releasableIndex,
			status.Applied)
	}
}
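
// The following is an added, illustrative sketch (not part of the original
// file) of the basic IntPool acquire/release flow used above, shown in
// isolation. The pool name and capacity are arbitrary; the variadic
// IntPool.Release call mirrors how updateProposalQuotaRaftMuLocked drains
// the quotaReleaseQueue back into the pool.
func exampleQuotaPoolFlow(ctx context.Context) error {
	pool := quotapool.NewIntPool("example", 1<<20 /* arbitrary capacity */)
	// Block (respecting ctx cancellation) until 512 units are available.
	alloc, err := pool.Acquire(ctx, 512)
	if err != nil {
		return err
	}
	// Hand the allocation back to the pool once the associated work is done.
	pool.Release(alloc)
	return nil
}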