github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_proposal_quota.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/tracker"
)

func (r *Replica) maybeAcquireProposalQuota(
	ctx context.Context, quota uint64,
) (*quotapool.IntAlloc, error) {
	r.mu.RLock()
	quotaPool := r.mu.proposalQuota
	desc := *r.mu.state.Desc
	r.mu.RUnlock()

	// Quota acquisition only takes place on the leader replica;
	// r.mu.proposalQuota is set to nil if a node is a follower (see
	// updateProposalQuotaRaftMuLocked). For the cases where the range lease
	// holder is not the same as the range leader, i.e. the lease holder is a
	// follower, r.mu.proposalQuota == nil. This means all quota acquisitions
	// go through without any throttling whatsoever, but given how short-lived
	// these scenarios are we don't try to remedy them any further.
	//
	// NB: It is necessary to allow proposals with a nil quota pool to go
	// through, for otherwise a follower could never request the lease.

	if quotaPool == nil {
		return nil, nil
	}

	if !quotaPoolEnabledForRange(desc) {
		return nil, nil
	}

	// Trace if we're running low on available proposal quota; it might explain
	// why we're taking so long.
	if log.HasSpanOrEvent(ctx) {
		if q := quotaPool.ApproximateQuota(); q < quotaPool.Capacity()/10 {
			log.Eventf(ctx, "quota running low, currently available ~%d", q)
		}
	}
	alloc, err := quotaPool.Acquire(ctx, quota)
	// Let quotapool errors due to being closed pass through.
	if errors.HasType(err, (*quotapool.ErrClosed)(nil)) {
		err = nil
	}
	return alloc, err
}

func quotaPoolEnabledForRange(desc roachpb.RangeDescriptor) bool {
	// The NodeLiveness range does not use a quota pool. We don't want to
	// throttle updates to the NodeLiveness range even if a follower is falling
	// behind because this could result in cascading failures.
	return !bytes.HasPrefix(desc.StartKey, keys.NodeLivenessPrefix)
}

func (r *Replica) updateProposalQuotaRaftMuLocked(
	ctx context.Context, lastLeaderID roachpb.ReplicaID,
) {
	r.mu.Lock()
	defer r.mu.Unlock()

	status := r.mu.internalRaftGroup.BasicStatus()
	if r.mu.leaderID != lastLeaderID {
		if r.mu.replicaID == r.mu.leaderID {
			// We're becoming the leader.
			// Initialize the proposalQuotaBaseIndex at the applied index.
			// After the proposal quota is enabled all entries applied by this replica
			// will be appended to the quotaReleaseQueue. The proposalQuotaBaseIndex
			// and the quotaReleaseQueue together track status.Applied exactly.
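			//
			// Illustrative example (hypothetical numbers, not taken from the code):
			// if status.Applied is 100 at the leadership change, the base index
			// starts at 100 with an empty queue; after this replica applies entries
			// 101 and 102, the queue holds their two allocs and 100 + 2 still equals
			// the applied index.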
			r.mu.proposalQuotaBaseIndex = status.Applied
			if r.mu.proposalQuota != nil {
				log.Fatal(ctx, "proposalQuota was not nil before becoming the leader")
			}
			if releaseQueueLen := len(r.mu.quotaReleaseQueue); releaseQueueLen != 0 {
				log.Fatalf(ctx, "len(r.mu.quotaReleaseQueue) = %d, expected 0", releaseQueueLen)
			}

			// Raft may propose commands itself (specifically the empty
			// commands when leadership changes), and these commands don't go
			// through the code paths where we acquire quota from the pool. To
			// offset this we reset the quota pool whenever leadership changes
			// hands.
			r.mu.proposalQuota = quotapool.NewIntPool(r.rangeStr.String(), uint64(r.store.cfg.RaftProposalQuota))
			r.mu.lastUpdateTimes = make(map[roachpb.ReplicaID]time.Time)
			r.mu.lastUpdateTimes.updateOnBecomeLeader(r.mu.state.Desc.Replicas().All(), timeutil.Now())
		} else if r.mu.proposalQuota != nil {
			// We're becoming a follower.
			// We unblock all ongoing and subsequent quota acquisition goroutines
			// (if any) and release the quotaReleaseQueue so its allocs are pooled.
			r.mu.proposalQuota.Close("leader change")
			r.mu.proposalQuota.Release(r.mu.quotaReleaseQueue...)
			r.mu.quotaReleaseQueue = nil
			r.mu.proposalQuota = nil
			r.mu.lastUpdateTimes = nil
		}
		return
	} else if r.mu.proposalQuota == nil {
		if r.mu.replicaID == r.mu.leaderID {
			log.Fatal(ctx, "leader has uninitialized proposalQuota pool")
		}
		// We're a follower.
		return
	}

	// We're still the leader.

	// Find the minimum index that active followers have acknowledged.
	now := timeutil.Now()
	// commitIndex is used to determine whether a newly added replica has fully
	// caught up.
	commitIndex := status.Commit
	// Initialize minIndex to the currently applied index. The below progress
	// checks will only decrease the minIndex. Given that the quotaReleaseQueue
	// cannot correspond to values beyond the applied index, there's no reason
	// to consider progress beyond it as meaningful.
	minIndex := status.Applied
	r.mu.internalRaftGroup.WithProgress(func(id uint64, _ raft.ProgressType, progress tracker.Progress) {
		rep, ok := r.mu.state.Desc.GetReplicaDescriptorByID(roachpb.ReplicaID(id))
		if !ok {
			return
		}

		// Only consider followers that are active. Inactive ones don't decrease
		// minIndex - i.e. they don't hold up releasing quota.
		//
		// The policy for determining who's active is more strict than the one used
		// for purposes of quiescing. Failure to consider a dead/stuck node as such
		// for the purposes of releasing quota can have bad consequences (writes
		// will stall), whereas for quiescing the downside is lower.

		if !r.mu.lastUpdateTimes.isFollowerActiveSince(
			ctx, rep.ReplicaID, now, r.store.cfg.RangeLeaseActiveDuration(),
		) {
			return
		}

		// Only consider followers that have "healthy" RPC connections.
		if err := r.store.cfg.NodeDialer.ConnHealth(rep.NodeID, r.connectionClass.get()); err != nil {
			return
		}

		// Note that the Match field has different semantics depending on
		// the State.
		//
		// In state ProgressStateReplicate, the Match index is optimistically
		// updated whenever a message is *sent* (not received). Due to Raft
		// flow control, only a reasonably small amount of data can be en
		// route to a given follower at any point in time.
		//
		// In state ProgressStateProbe, the Match index equals Next-1, and
		// it tells us the leader's optimistic best guess for the right log
		// index (and will try once per heartbeat interval to update its
		// estimate). In the usual case, the follower responds with a hint
		// when it rejects the first probe and the leader replicates or
		// sends a snapshot. In the case in which the follower does not
		// respond, the leader reduces Match by one each heartbeat interval.
		// But if the follower does not respond, we've already filtered it
		// out above. We use the Match index as is, even though the follower
		// likely isn't there yet, because that index won't go up unless the
		// follower is actually catching up, so it won't cause the follower
		// to fall behind arbitrarily.
		//
		// Another interesting tidbit about this state is that the Paused
		// field is usually true as it is used to limit the number of probes
		// (i.e. appends) sent to this follower to one per heartbeat
		// interval.
		//
		// In state ProgressStateSnapshot, the Match index is the last known
		// (possibly optimistic, depending on previous state) index before
		// the snapshot went out. Once the snapshot applies, the follower
		// will enter ProgressStateReplicate again. So here the Match index
		// works as advertised too.

		// Only consider followers that are in advance of the quota base
		// index. This prevents a follower that comes back online from
		// stalling throughput to the range until it has caught up.
		if progress.Match < r.mu.proposalQuotaBaseIndex {
			return
		}
		if progress.Match > 0 && progress.Match < minIndex {
			minIndex = progress.Match
		}
		// If this is the most recently added replica and it has caught up, clear
		// our state that was tracking it. This is unrelated to managing proposal
		// quota, but this is a convenient place to do so.
		if rep.ReplicaID == r.mu.lastReplicaAdded && progress.Match >= commitIndex {
			r.mu.lastReplicaAdded = 0
			r.mu.lastReplicaAddedTime = time.Time{}
		}
	})

	if r.mu.proposalQuotaBaseIndex < minIndex {
		// We've persisted at least minIndex-r.mu.proposalQuotaBaseIndex entries
		// to the raft log on all 'active' replicas and applied at least minIndex
		// entries locally since we last checked, so we are able to release the
		// difference back to the quota pool.
		numReleases := minIndex - r.mu.proposalQuotaBaseIndex

		// NB: Release deals with cases where allocs being released do not originate
		// from this incarnation of quotaReleaseQueue, which can happen if a
		// proposal acquires quota while this replica is the raft leader in some
		// term and then commits while at a different term.
		r.mu.proposalQuota.Release(r.mu.quotaReleaseQueue[:numReleases]...)
		r.mu.quotaReleaseQueue = r.mu.quotaReleaseQueue[numReleases:]
		r.mu.proposalQuotaBaseIndex += numReleases
	}
	// Assert the sanity of the base index and the queue. Queue entries should
	// correspond to applied entries. The base index plus the number of not yet
	// released applied entries must always equal the applied index.
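	//
	// Illustrative example (hypothetical numbers): with a base index of 100 and
	// five allocs queued (applied index 105), a minIndex of 103 releases the
	// first three allocs, advances the base index to 103, and leaves two allocs
	// queued, so 103 + 2 still equals the applied index asserted below.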
	releasableIndex := r.mu.proposalQuotaBaseIndex + uint64(len(r.mu.quotaReleaseQueue))
	if releasableIndex != status.Applied {
		log.Fatalf(ctx, "proposalQuotaBaseIndex (%d) + quotaReleaseQueueLen (%d) = %d"+
			" must equal the applied index (%d)",
			r.mu.proposalQuotaBaseIndex, len(r.mu.quotaReleaseQueue), releasableIndex,
			status.Applied)
	}
}
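
// exampleProposalQuotaLifecycle is an illustrative sketch, not part of the
// original file and not called by the replica code above; the pool name,
// capacity, and acquisition sizes are made-up numbers. It shows the
// quotapool.IntPool acquire/queue/release pattern that
// maybeAcquireProposalQuota and updateProposalQuotaRaftMuLocked build on:
// proposals acquire allocs up front, applied entries queue their allocs, and
// the leader releases a prefix of the queue once active followers have
// caught up.
func exampleProposalQuotaLifecycle(ctx context.Context) error {
	// A pool with a hypothetical 1 MB of proposal quota.
	pool := quotapool.NewIntPool("example-range", 1<<20)
	defer pool.Close("example done")

	// Each proposal acquires quota before being proposed; Acquire blocks when
	// the pool is exhausted, which is what throttles writers while a follower
	// lags.
	var releaseQueue []*quotapool.IntAlloc
	for i := 0; i < 3; i++ {
		alloc, err := pool.Acquire(ctx, 1<<10) // a made-up 1 KiB proposal
		if err != nil {
			return err
		}
		// Once the corresponding entry has been applied, its alloc sits in the
		// queue until all active followers have acknowledged it.
		releaseQueue = append(releaseQueue, alloc)
	}

	// Followers caught up through the first two entries: release that prefix
	// back to the pool, mirroring the numReleases slicing above.
	pool.Release(releaseQueue[:2]...)
	releaseQueue = releaseQueue[2:]

	// On stepping down, remaining queued allocs are released in one shot, as
	// in the "becoming a follower" branch above.
	pool.Release(releaseQueue...)
	return nil
}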