github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/metadata_replicas.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package roachpb 12 13 import ( 14 "fmt" 15 "strings" 16 17 "go.etcd.io/etcd/raft/raftpb" 18 ) 19 20 // ReplicaTypeVoterFull returns a VOTER_FULL pointer suitable for use in a 21 // nullable proto field. 22 func ReplicaTypeVoterFull() *ReplicaType { 23 t := VOTER_FULL 24 return &t 25 } 26 27 // ReplicaTypeVoterIncoming returns a VOTER_INCOMING pointer suitable 28 // for use in a nullable proto field. 29 func ReplicaTypeVoterIncoming() *ReplicaType { 30 t := VOTER_INCOMING 31 return &t 32 } 33 34 // ReplicaTypeVoterOutgoing returns a VOTER_OUTGOING pointer suitable 35 // for use in a nullable proto field. 36 func ReplicaTypeVoterOutgoing() *ReplicaType { 37 t := VOTER_OUTGOING 38 return &t 39 } 40 41 // ReplicaTypeVoterDemoting returns a VOTER_DEMOTING pointer suitable 42 // for use in a nullable proto field. 43 func ReplicaTypeVoterDemoting() *ReplicaType { 44 t := VOTER_DEMOTING 45 return &t 46 } 47 48 // ReplicaTypeLearner returns a LEARNER pointer suitable for use in 49 // a nullable proto field. 50 func ReplicaTypeLearner() *ReplicaType { 51 t := LEARNER 52 return &t 53 } 54 55 // ReplicaDescriptors is a set of replicas, usually the nodes/stores on which 56 // replicas of a range are stored. 57 type ReplicaDescriptors struct { 58 wrapped []ReplicaDescriptor 59 } 60 61 // MakeReplicaDescriptors creates a ReplicaDescriptors wrapper from a raw slice 62 // of individual descriptors. 
63 // 64 // All construction of ReplicaDescriptors is required to go through this method 65 // so we can guarantee sortedness, which is used to speed up accessor 66 // operations. 67 // 68 // The function accepts a pointer to a slice instead of a slice directly to 69 // avoid an allocation when boxing the argument as a sort.Interface. This may 70 // cause the argument to escape to the heap for some callers, at which point 71 // we're trading one allocation for another. However, if the caller already has 72 // the slice header on the heap (which is the common case for *RangeDescriptors) 73 // then this is a net win. 74 func MakeReplicaDescriptors(replicas []ReplicaDescriptor) ReplicaDescriptors { 75 return ReplicaDescriptors{wrapped: replicas} 76 } 77 78 func (d ReplicaDescriptors) String() string { 79 var buf strings.Builder 80 for i, desc := range d.wrapped { 81 if i > 0 { 82 buf.WriteByte(',') 83 } 84 fmt.Fprint(&buf, desc) 85 } 86 return buf.String() 87 } 88 89 // All returns every replica in the set, including both voter replicas and 90 // learner replicas. Voter replicas are ordered first in the returned slice. 91 func (d ReplicaDescriptors) All() []ReplicaDescriptor { 92 return d.wrapped 93 } 94 95 func predVoterFullOrIncoming(rDesc ReplicaDescriptor) bool { 96 switch rDesc.GetType() { 97 case VOTER_FULL, VOTER_INCOMING: 98 return true 99 default: 100 } 101 return false 102 } 103 104 func predLearner(rDesc ReplicaDescriptor) bool { 105 return rDesc.GetType() == LEARNER 106 } 107 108 // Voters returns the current and future voter replicas in the set. This means 109 // that during an atomic replication change, only the replicas that will be 110 // voters once the change completes will be returned; "outgoing" voters will not 111 // be returned even though they do in the current state retain their voting 112 // rights. When no atomic membership change is ongoing, this is simply the set 113 // of all non-learners. 
114 // 115 // This may allocate, but it also may return the underlying slice as a 116 // performance optimization, so it's not safe to modify the returned value. 117 // 118 // TODO(tbg): go through the callers and figure out the few which want a 119 // different subset of voters. Consider renaming this method so that it's 120 // more descriptive. 121 func (d ReplicaDescriptors) Voters() []ReplicaDescriptor { 122 return d.Filter(predVoterFullOrIncoming) 123 } 124 125 // Learners returns the learner replicas in the set. This may allocate, but it 126 // also may return the underlying slice as a performance optimization, so it's 127 // not safe to modify the returned value. 128 // 129 // A learner is a participant in a raft group that accepts messages but doesn't 130 // vote. This means it doesn't affect raft quorum and thus doesn't affect the 131 // fragility of the range, even if it's very far behind or many learners are 132 // down. 133 // 134 // At the time of writing, learners are used in CockroachDB as an interim state 135 // while adding a replica. A learner replica is added to the range via raft 136 // ConfChange, a raft snapshot (of type LEARNER) is sent to catch it up, and 137 // then a second ConfChange promotes it to a full replica. 138 // 139 // This means that learners are currently always expected to have a short 140 // lifetime, approximately the time it takes to send a snapshot. Ideas have been 141 // kicked around to use learners with follower reads, which could be a cheap way 142 // to allow many geographies to have local reads without affecting write 143 // latencies. If implemented, these learners would have long lifetimes. 
//
// For simplicity, CockroachDB treats learner replicas the same as voter
// replicas as much as possible, but there are a few exceptions:
//
// - Learner replicas are not considered when calculating quorum size, and thus
//   do not affect the computation of which ranges are under-replicated for
//   upreplication/alerting/debug/etc purposes. Ditto for over-replicated.
// - Learner replicas cannot become raft leaders, so we also don't allow them to
//   become leaseholders. As a result, DistSender and the various oracles don't
//   try to send them traffic.
// - The raft snapshot queue tries to avoid sending snapshots to learners for
//   reasons described below.
// - Merges won't run while a learner replica is present.
//
// Replicas are now added in two ConfChange transactions. The first creates the
// learner and the second promotes it to a voter. If the node that is
// coordinating this dies in the middle, we're left with an orphaned learner.
// For this reason, the replicate queue always first removes any learners it
// sees before doing anything else. We could instead try to finish off the
// learner snapshot and promotion, but this is more complicated and it's not yet
// clear the efficiency win is worth it.
//
// This introduces some rare races between the replicate queue and
// AdminChangeReplicas or if a range's lease is moved to a new owner while the
// old leaseholder is still processing it in the replicate queue. These races
// are handled by retrying if a learner disappears during the
// snapshot/promotion.
//
// If the coordinator otherwise encounters an error while sending the learner
// snapshot or promoting it (which can happen for a number of reasons, including
// the node getting the learner going away), it tries to clean up after itself
// by rolling back the addition of the learner.
//
// There is another race between the learner snapshot being sent and the raft
// snapshot queue happening to check the replica at the same time, also sending
// it a snapshot. This is safe but wasteful, so the raft snapshot queue won't
// try to send snapshots to learners if there is already a snapshot to that
// range in flight.
//
// *However*, raft is currently pickier than it needs to be about the snapshots
// it requests and it can get stuck in StateSnapshot if it doesn't receive
// exactly the index it wants. As a result, for now, the raft snapshot queue
// will send one if it's still needed after the learner snapshot finishes (or
// times out). To make this work in a timely manner (i.e. without relying on the
// replica scanner) but without blocking the raft snapshot queue, when a
// snapshot is skipped, this is reported to raft as an error sending the
// snapshot. This causes raft to eventually re-enqueue it in the raft snapshot
// queue. All of this is quite hard to reason about, so it'd be nice to make
// this go away at some point.
//
// Merges are blocked if either side has a learner (to avoid working out the
// edge cases) but it's historically turned out to be a bad idea to get in the
// way of splits, so we allow them even when some of the replicas are learners.
// This orphans a learner on each side of the split (the original coordinator
// will not be able to finish either of them), but the replication queue will
// eventually clean them up.
//
// Learner replicas don't affect quorum but they do affect the system in other
// ways. The most obvious way is that the leader sends them the raft traffic it
// would send to any follower, consuming resources.
More surprising is that once 204 // the learner has received a snapshot, it's considered by the quota pool that 205 // prevents the raft leader from getting too far ahead of the followers. This is 206 // because a learner (especially one that already has a snapshot) is expected to 207 // very soon be a voter, so we treat it like one. However, it means a slow 208 // learner can slow down regular traffic, which is possibly counterintuitive. 209 // 210 // For some related mega-comments, see Replica.sendSnapshot. 211 func (d ReplicaDescriptors) Learners() []ReplicaDescriptor { 212 return d.Filter(predLearner) 213 } 214 215 // Filter returns only the replica descriptors for which the supplied method 216 // returns true. The memory returned may be shared with the receiver. 217 func (d ReplicaDescriptors) Filter(pred func(rDesc ReplicaDescriptor) bool) []ReplicaDescriptor { 218 // Fast path when all or none match to avoid allocations. 219 fastpath := true 220 out := d.wrapped 221 for i := range d.wrapped { 222 if pred(d.wrapped[i]) { 223 if !fastpath { 224 out = append(out, d.wrapped[i]) 225 } 226 } else { 227 if fastpath { 228 out = nil 229 out = append(out, d.wrapped[:i]...) 230 fastpath = false 231 } 232 } 233 } 234 return out 235 } 236 237 // AsProto returns the protobuf representation of these replicas, suitable for 238 // setting the InternalReplicas field of a RangeDescriptor. When possible the 239 // SetReplicas method of RangeDescriptor should be used instead, this is only 240 // here for the convenience of tests. 241 func (d ReplicaDescriptors) AsProto() []ReplicaDescriptor { 242 return d.wrapped 243 } 244 245 // DeepCopy returns a copy of this set of replicas. Modifications to the 246 // returned set will not affect this one and vice-versa. 
247 func (d ReplicaDescriptors) DeepCopy() ReplicaDescriptors { 248 return ReplicaDescriptors{ 249 wrapped: append([]ReplicaDescriptor(nil), d.wrapped...), 250 } 251 } 252 253 // AddReplica adds the given replica to this set. 254 func (d *ReplicaDescriptors) AddReplica(r ReplicaDescriptor) { 255 d.wrapped = append(d.wrapped, r) 256 } 257 258 // RemoveReplica removes the matching replica from this set. If it wasn't found 259 // to remove, false is returned. 260 func (d *ReplicaDescriptors) RemoveReplica( 261 nodeID NodeID, storeID StoreID, 262 ) (ReplicaDescriptor, bool) { 263 idx := -1 264 for i := range d.wrapped { 265 if d.wrapped[i].NodeID == nodeID && d.wrapped[i].StoreID == storeID { 266 idx = i 267 break 268 } 269 } 270 if idx == -1 { 271 return ReplicaDescriptor{}, false 272 } 273 // Swap with the last element so we can simply truncate the slice. 274 d.wrapped[idx], d.wrapped[len(d.wrapped)-1] = d.wrapped[len(d.wrapped)-1], d.wrapped[idx] 275 removed := d.wrapped[len(d.wrapped)-1] 276 d.wrapped = d.wrapped[:len(d.wrapped)-1] 277 return removed, true 278 } 279 280 // InAtomicReplicationChange returns true if the descriptor is in the middle of 281 // an atomic replication change. 282 func (d ReplicaDescriptors) InAtomicReplicationChange() bool { 283 for _, rDesc := range d.wrapped { 284 switch rDesc.GetType() { 285 case VOTER_INCOMING, VOTER_OUTGOING, VOTER_DEMOTING: 286 return true 287 case VOTER_FULL, LEARNER: 288 default: 289 panic(fmt.Sprintf("unknown replica type %d", rDesc.GetType())) 290 } 291 } 292 return false 293 } 294 295 // ConfState returns the Raft configuration described by the set of replicas. 296 func (d ReplicaDescriptors) ConfState() raftpb.ConfState { 297 var cs raftpb.ConfState 298 joint := d.InAtomicReplicationChange() 299 // The incoming config is taken verbatim from the full voters when the 300 // config is not joint. If it is joint, slot the voters into the right 301 // category. 
302 for _, rep := range d.wrapped { 303 id := uint64(rep.ReplicaID) 304 typ := rep.GetType() 305 switch typ { 306 case VOTER_FULL: 307 cs.Voters = append(cs.Voters, id) 308 if joint { 309 cs.VotersOutgoing = append(cs.VotersOutgoing, id) 310 } 311 case VOTER_INCOMING: 312 cs.Voters = append(cs.Voters, id) 313 case VOTER_OUTGOING: 314 cs.VotersOutgoing = append(cs.VotersOutgoing, id) 315 case VOTER_DEMOTING: 316 cs.VotersOutgoing = append(cs.VotersOutgoing, id) 317 cs.LearnersNext = append(cs.LearnersNext, id) 318 case LEARNER: 319 cs.Learners = append(cs.Learners, id) 320 default: 321 panic(fmt.Sprintf("unknown ReplicaType %d", typ)) 322 } 323 } 324 return cs 325 } 326 327 // CanMakeProgress reports whether the given descriptors can make progress at the 328 // replication layer. This is more complicated than just counting the number 329 // of replicas due to the existence of joint quorums. 330 func (d ReplicaDescriptors) CanMakeProgress(liveFunc func(descriptor ReplicaDescriptor) bool) bool { 331 isVoterOldConfig := func(rDesc ReplicaDescriptor) bool { 332 switch rDesc.GetType() { 333 case VOTER_FULL, VOTER_OUTGOING, VOTER_DEMOTING: 334 return true 335 default: 336 return false 337 } 338 } 339 isVoterNewConfig := func(rDesc ReplicaDescriptor) bool { 340 switch rDesc.GetType() { 341 case VOTER_FULL, VOTER_INCOMING: 342 return true 343 default: 344 return false 345 } 346 } 347 // isBoth takes two replica predicates and returns their conjunction. 348 isBoth := func( 349 pred1 func(rDesc ReplicaDescriptor) bool, 350 pred2 func(rDesc ReplicaDescriptor) bool) func(ReplicaDescriptor) bool { 351 return func(rDesc ReplicaDescriptor) bool { 352 return pred1(rDesc) && pred2(rDesc) 353 } 354 } 355 356 votersOldGroup := d.Filter(isVoterOldConfig) 357 liveVotersOldGroup := d.Filter(isBoth(isVoterOldConfig, liveFunc)) 358 359 n := len(votersOldGroup) 360 // Empty groups succeed by default, to match the Raft implementation. 
361 if n > 0 && len(liveVotersOldGroup) < n/2+1 { 362 return false 363 } 364 365 votersNewGroup := d.Filter(isVoterNewConfig) 366 liveVotersNewGroup := d.Filter(isBoth(isVoterNewConfig, liveFunc)) 367 368 n = len(votersNewGroup) 369 return len(liveVotersNewGroup) >= n/2+1 370 }