github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/roachpb/metadata_replicas.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package roachpb
    12  
    13  import (
    14  	"fmt"
    15  	"strings"
    16  
    17  	"go.etcd.io/etcd/raft/raftpb"
    18  )
    19  
    20  // ReplicaTypeVoterFull returns a VOTER_FULL pointer suitable for use in a
    21  // nullable proto field.
    22  func ReplicaTypeVoterFull() *ReplicaType {
    23  	t := VOTER_FULL
    24  	return &t
    25  }
    26  
    27  // ReplicaTypeVoterIncoming returns a VOTER_INCOMING pointer suitable
    28  // for use in a nullable proto field.
    29  func ReplicaTypeVoterIncoming() *ReplicaType {
    30  	t := VOTER_INCOMING
    31  	return &t
    32  }
    33  
    34  // ReplicaTypeVoterOutgoing returns a VOTER_OUTGOING pointer suitable
    35  // for use in a nullable proto field.
    36  func ReplicaTypeVoterOutgoing() *ReplicaType {
    37  	t := VOTER_OUTGOING
    38  	return &t
    39  }
    40  
    41  // ReplicaTypeVoterDemoting returns a VOTER_DEMOTING pointer suitable
    42  // for use in a nullable proto field.
    43  func ReplicaTypeVoterDemoting() *ReplicaType {
    44  	t := VOTER_DEMOTING
    45  	return &t
    46  }
    47  
    48  // ReplicaTypeLearner returns a LEARNER pointer suitable for use in
    49  // a nullable proto field.
    50  func ReplicaTypeLearner() *ReplicaType {
    51  	t := LEARNER
    52  	return &t
    53  }
    54  
// ReplicaDescriptors is a set of replicas, usually the nodes/stores on which
// replicas of a range are stored.
type ReplicaDescriptors struct {
	// wrapped is the underlying slice of descriptors. Accessors such as All
	// and AsProto return it directly, without copying.
	wrapped []ReplicaDescriptor
}
    60  
    61  // MakeReplicaDescriptors creates a ReplicaDescriptors wrapper from a raw slice
    62  // of individual descriptors.
    63  //
    64  // All construction of ReplicaDescriptors is required to go through this method
    65  // so we can guarantee sortedness, which is used to speed up accessor
    66  // operations.
    67  //
    68  // The function accepts a pointer to a slice instead of a slice directly to
    69  // avoid an allocation when boxing the argument as a sort.Interface. This may
    70  // cause the argument to escape to the heap for some callers, at which point
    71  // we're trading one allocation for another. However, if the caller already has
    72  // the slice header on the heap (which is the common case for *RangeDescriptors)
    73  // then this is a net win.
    74  func MakeReplicaDescriptors(replicas []ReplicaDescriptor) ReplicaDescriptors {
    75  	return ReplicaDescriptors{wrapped: replicas}
    76  }
    77  
    78  func (d ReplicaDescriptors) String() string {
    79  	var buf strings.Builder
    80  	for i, desc := range d.wrapped {
    81  		if i > 0 {
    82  			buf.WriteByte(',')
    83  		}
    84  		fmt.Fprint(&buf, desc)
    85  	}
    86  	return buf.String()
    87  }
    88  
    89  // All returns every replica in the set, including both voter replicas and
    90  // learner replicas. Voter replicas are ordered first in the returned slice.
    91  func (d ReplicaDescriptors) All() []ReplicaDescriptor {
    92  	return d.wrapped
    93  }
    94  
    95  func predVoterFullOrIncoming(rDesc ReplicaDescriptor) bool {
    96  	switch rDesc.GetType() {
    97  	case VOTER_FULL, VOTER_INCOMING:
    98  		return true
    99  	default:
   100  	}
   101  	return false
   102  }
   103  
   104  func predLearner(rDesc ReplicaDescriptor) bool {
   105  	return rDesc.GetType() == LEARNER
   106  }
   107  
   108  // Voters returns the current and future voter replicas in the set. This means
   109  // that during an atomic replication change, only the replicas that will be
   110  // voters once the change completes will be returned; "outgoing" voters will not
   111  // be returned even though they do in the current state retain their voting
   112  // rights. When no atomic membership change is ongoing, this is simply the set
   113  // of all non-learners.
   114  //
   115  // This may allocate, but it also may return the underlying slice as a
   116  // performance optimization, so it's not safe to modify the returned value.
   117  //
   118  // TODO(tbg): go through the callers and figure out the few which want a
   119  // different subset of voters. Consider renaming this method so that it's
   120  // more descriptive.
   121  func (d ReplicaDescriptors) Voters() []ReplicaDescriptor {
   122  	return d.Filter(predVoterFullOrIncoming)
   123  }
   124  
   125  // Learners returns the learner replicas in the set. This may allocate, but it
   126  // also may return the underlying slice as a performance optimization, so it's
   127  // not safe to modify the returned value.
   128  //
   129  // A learner is a participant in a raft group that accepts messages but doesn't
   130  // vote. This means it doesn't affect raft quorum and thus doesn't affect the
   131  // fragility of the range, even if it's very far behind or many learners are
   132  // down.
   133  //
   134  // At the time of writing, learners are used in CockroachDB as an interim state
   135  // while adding a replica. A learner replica is added to the range via raft
   136  // ConfChange, a raft snapshot (of type LEARNER) is sent to catch it up, and
   137  // then a second ConfChange promotes it to a full replica.
   138  //
   139  // This means that learners are currently always expected to have a short
   140  // lifetime, approximately the time it takes to send a snapshot. Ideas have been
   141  // kicked around to use learners with follower reads, which could be a cheap way
   142  // to allow many geographies to have local reads without affecting write
   143  // latencies. If implemented, these learners would have long lifetimes.
   144  //
   145  // For simplicity, CockroachDB treats learner replicas the same as voter
   146  // replicas as much as possible, but there are a few exceptions:
   147  //
   148  // - Learner replicas are not considered when calculating quorum size, and thus
   149  //   do not affect the computation of which ranges are under-replicated for
   150  //   upreplication/alerting/debug/etc purposes. Ditto for over-replicated.
   151  // - Learner replicas cannot become raft leaders, so we also don't allow them to
   152  //   become leaseholders. As a result, DistSender and the various oracles don't
   153  //   try to send them traffic.
   154  // - The raft snapshot queue tries to avoid sending snapshots to learners for
   155  //   reasons described below.
   156  // - Merges won't run while a learner replica is present.
   157  //
   158  // Replicas are now added in two ConfChange transactions. The first creates the
   159  // learner and the second promotes it to a voter. If the node that is
   160  // coordinating this dies in the middle, we're left with an orphaned learner.
   161  // For this reason, the replicate queue always first removes any learners it
   162  // sees before doing anything else. We could instead try to finish off the
   163  // learner snapshot and promotion, but this is more complicated and it's not yet
   164  // clear the efficiency win is worth it.
   165  //
   166  // This introduces some rare races between the replicate queue and
   167  // AdminChangeReplicas or if a range's lease is moved to a new owner while the
   168  // old leaseholder is still processing it in the replicate queue. These races
   169  // are handled by retrying if a learner disappears during the
   170  // snapshot/promotion.
   171  //
   172  // If the coordinator otherwise encounters an error while sending the learner
   173  // snapshot or promoting it (which can happen for a number of reasons, including
   174  // the node getting the learner going away), it tries to clean up after itself
   175  // by rolling back the addition of the learner.
   176  //
   177  // There is another race between the learner snapshot being sent and the raft
   178  // snapshot queue happening to check the replica at the same time, also sending
   179  // it a snapshot. This is safe but wasteful, so the raft snapshot queue won't
   180  // try to send snapshots to learners if there is already a snapshot to that
   181  // range in flight.
   182  //
// *However*, raft is currently pickier than it needs to be about the snapshots
   184  // it requests and it can get stuck in StateSnapshot if it doesn't receive
   185  // exactly the index it wants. As a result, for now, the raft snapshot queue
   186  // will send one if it's still needed after the learner snapshot finishes (or
   187  // times out). To make this work in a timely manner (i.e. without relying on the
   188  // replica scanner) but without blocking the raft snapshot queue, when a
   189  // snapshot is skipped, this is reported to raft as an error sending the
   190  // snapshot. This causes raft to eventually re-enqueue it in the raft snapshot
   191  // queue. All of this is quite hard to reason about, so it'd be nice to make
   192  // this go away at some point.
   193  //
   194  // Merges are blocked if either side has a learner (to avoid working out the
   195  // edge cases) but it's historically turned out to be a bad idea to get in the
   196  // way of splits, so we allow them even when some of the replicas are learners.
   197  // This orphans a learner on each side of the split (the original coordinator
   198  // will not be able to finish either of them), but the replication queue will
   199  // eventually clean them up.
   200  //
   201  // Learner replicas don't affect quorum but they do affect the system in other
   202  // ways. The most obvious way is that the leader sends them the raft traffic it
   203  // would send to any follower, consuming resources. More surprising is that once
   204  // the learner has received a snapshot, it's considered by the quota pool that
   205  // prevents the raft leader from getting too far ahead of the followers. This is
   206  // because a learner (especially one that already has a snapshot) is expected to
   207  // very soon be a voter, so we treat it like one. However, it means a slow
   208  // learner can slow down regular traffic, which is possibly counterintuitive.
   209  //
   210  // For some related mega-comments, see Replica.sendSnapshot.
   211  func (d ReplicaDescriptors) Learners() []ReplicaDescriptor {
   212  	return d.Filter(predLearner)
   213  }
   214  
   215  // Filter returns only the replica descriptors for which the supplied method
   216  // returns true. The memory returned may be shared with the receiver.
   217  func (d ReplicaDescriptors) Filter(pred func(rDesc ReplicaDescriptor) bool) []ReplicaDescriptor {
   218  	// Fast path when all or none match to avoid allocations.
   219  	fastpath := true
   220  	out := d.wrapped
   221  	for i := range d.wrapped {
   222  		if pred(d.wrapped[i]) {
   223  			if !fastpath {
   224  				out = append(out, d.wrapped[i])
   225  			}
   226  		} else {
   227  			if fastpath {
   228  				out = nil
   229  				out = append(out, d.wrapped[:i]...)
   230  				fastpath = false
   231  			}
   232  		}
   233  	}
   234  	return out
   235  }
   236  
   237  // AsProto returns the protobuf representation of these replicas, suitable for
   238  // setting the InternalReplicas field of a RangeDescriptor. When possible the
   239  // SetReplicas method of RangeDescriptor should be used instead, this is only
   240  // here for the convenience of tests.
   241  func (d ReplicaDescriptors) AsProto() []ReplicaDescriptor {
   242  	return d.wrapped
   243  }
   244  
   245  // DeepCopy returns a copy of this set of replicas. Modifications to the
   246  // returned set will not affect this one and vice-versa.
   247  func (d ReplicaDescriptors) DeepCopy() ReplicaDescriptors {
   248  	return ReplicaDescriptors{
   249  		wrapped: append([]ReplicaDescriptor(nil), d.wrapped...),
   250  	}
   251  }
   252  
   253  // AddReplica adds the given replica to this set.
   254  func (d *ReplicaDescriptors) AddReplica(r ReplicaDescriptor) {
   255  	d.wrapped = append(d.wrapped, r)
   256  }
   257  
   258  // RemoveReplica removes the matching replica from this set. If it wasn't found
   259  // to remove, false is returned.
   260  func (d *ReplicaDescriptors) RemoveReplica(
   261  	nodeID NodeID, storeID StoreID,
   262  ) (ReplicaDescriptor, bool) {
   263  	idx := -1
   264  	for i := range d.wrapped {
   265  		if d.wrapped[i].NodeID == nodeID && d.wrapped[i].StoreID == storeID {
   266  			idx = i
   267  			break
   268  		}
   269  	}
   270  	if idx == -1 {
   271  		return ReplicaDescriptor{}, false
   272  	}
   273  	// Swap with the last element so we can simply truncate the slice.
   274  	d.wrapped[idx], d.wrapped[len(d.wrapped)-1] = d.wrapped[len(d.wrapped)-1], d.wrapped[idx]
   275  	removed := d.wrapped[len(d.wrapped)-1]
   276  	d.wrapped = d.wrapped[:len(d.wrapped)-1]
   277  	return removed, true
   278  }
   279  
   280  // InAtomicReplicationChange returns true if the descriptor is in the middle of
   281  // an atomic replication change.
   282  func (d ReplicaDescriptors) InAtomicReplicationChange() bool {
   283  	for _, rDesc := range d.wrapped {
   284  		switch rDesc.GetType() {
   285  		case VOTER_INCOMING, VOTER_OUTGOING, VOTER_DEMOTING:
   286  			return true
   287  		case VOTER_FULL, LEARNER:
   288  		default:
   289  			panic(fmt.Sprintf("unknown replica type %d", rDesc.GetType()))
   290  		}
   291  	}
   292  	return false
   293  }
   294  
   295  // ConfState returns the Raft configuration described by the set of replicas.
   296  func (d ReplicaDescriptors) ConfState() raftpb.ConfState {
   297  	var cs raftpb.ConfState
   298  	joint := d.InAtomicReplicationChange()
   299  	// The incoming config is taken verbatim from the full voters when the
   300  	// config is not joint. If it is joint, slot the voters into the right
   301  	// category.
   302  	for _, rep := range d.wrapped {
   303  		id := uint64(rep.ReplicaID)
   304  		typ := rep.GetType()
   305  		switch typ {
   306  		case VOTER_FULL:
   307  			cs.Voters = append(cs.Voters, id)
   308  			if joint {
   309  				cs.VotersOutgoing = append(cs.VotersOutgoing, id)
   310  			}
   311  		case VOTER_INCOMING:
   312  			cs.Voters = append(cs.Voters, id)
   313  		case VOTER_OUTGOING:
   314  			cs.VotersOutgoing = append(cs.VotersOutgoing, id)
   315  		case VOTER_DEMOTING:
   316  			cs.VotersOutgoing = append(cs.VotersOutgoing, id)
   317  			cs.LearnersNext = append(cs.LearnersNext, id)
   318  		case LEARNER:
   319  			cs.Learners = append(cs.Learners, id)
   320  		default:
   321  			panic(fmt.Sprintf("unknown ReplicaType %d", typ))
   322  		}
   323  	}
   324  	return cs
   325  }
   326  
   327  // CanMakeProgress reports whether the given descriptors can make progress at the
   328  // replication layer. This is more complicated than just counting the number
   329  // of replicas due to the existence of joint quorums.
   330  func (d ReplicaDescriptors) CanMakeProgress(liveFunc func(descriptor ReplicaDescriptor) bool) bool {
   331  	isVoterOldConfig := func(rDesc ReplicaDescriptor) bool {
   332  		switch rDesc.GetType() {
   333  		case VOTER_FULL, VOTER_OUTGOING, VOTER_DEMOTING:
   334  			return true
   335  		default:
   336  			return false
   337  		}
   338  	}
   339  	isVoterNewConfig := func(rDesc ReplicaDescriptor) bool {
   340  		switch rDesc.GetType() {
   341  		case VOTER_FULL, VOTER_INCOMING:
   342  			return true
   343  		default:
   344  			return false
   345  		}
   346  	}
   347  	// isBoth takes two replica predicates and returns their conjunction.
   348  	isBoth := func(
   349  		pred1 func(rDesc ReplicaDescriptor) bool,
   350  		pred2 func(rDesc ReplicaDescriptor) bool) func(ReplicaDescriptor) bool {
   351  		return func(rDesc ReplicaDescriptor) bool {
   352  			return pred1(rDesc) && pred2(rDesc)
   353  		}
   354  	}
   355  
   356  	votersOldGroup := d.Filter(isVoterOldConfig)
   357  	liveVotersOldGroup := d.Filter(isBoth(isVoterOldConfig, liveFunc))
   358  
   359  	n := len(votersOldGroup)
   360  	// Empty groups succeed by default, to match the Raft implementation.
   361  	if n > 0 && len(liveVotersOldGroup) < n/2+1 {
   362  		return false
   363  	}
   364  
   365  	votersNewGroup := d.Filter(isVoterNewConfig)
   366  	liveVotersNewGroup := d.Filter(isBoth(isVoterNewConfig, liveFunc))
   367  
   368  	n = len(votersNewGroup)
   369  	return len(liveVotersNewGroup) >= n/2+1
   370  }