github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_split.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
)

// splitPreApply is called when the raft command is applied. Any
// changes to the given ReadWriter will be written atomically with the
// split commit.
func splitPreApply(
	ctx context.Context, readWriter storage.ReadWriter, split roachpb.SplitTrigger, r *Replica,
) {
	// Sanity check that the store is in the split.
	//
	// The exception to that is if the DisableEagerReplicaRemoval testing flag is
	// enabled.
	//
	// TODO(ajwerner): rethink DisableEagerReplicaRemoval and remove this in
	// 20.1 after there are no more preemptive snapshots.
	_, hasRightDesc := split.RightDesc.GetReplicaDescriptor(r.StoreID())
	_, hasLeftDesc := split.LeftDesc.GetReplicaDescriptor(r.StoreID())
	if !hasRightDesc || !hasLeftDesc {
		log.Fatalf(ctx, "cannot process split on s%s which does not exist in the split: %+v",
			r.StoreID(), split)
	}

	// Check on the RHS: we need to ensure that it exists and has a minReplicaID
	// less than or equal to the replica we're about to initialize.
	//
	// The right hand side of the split was already created (and its raftMu
	// acquired) in Replica.acquireSplitLock. It must be present here if it hasn't
	// been removed in the meantime (handled below).
	rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID)
	if roachpb.IsRangeNotFoundError(err) {
		// The right hand side we were planning to populate has already been removed.
		// We handle this below.
		rightRepl = nil
	} else if err != nil {
		log.Fatalf(ctx, "failed to get RHS replica: %v", err)
	}
	// Check to see if we know that the RHS has already been removed from this
	// store at the replica ID implied by the split.
	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
		// We're in the rare case where we know that the RHS has been removed
		// and re-added with a higher replica ID (and then maybe removed again).
		//
		// To apply the split, we need to "throw away" the data that would belong to
		// the RHS, i.e. we clear the user data the RHS would have inherited from the
		// LHS due to the split and additionally clear all of the range ID local state
		// that the split trigger writes into the RHS.
		//
		// We know we've never processed a snapshot for the right range because the
		// LHS prevents any incoming snapshots until the split has executed (i.e. now).
		// It is important to preserve the HardState, however, because we might have
		// already voted at a higher term. In general this shouldn't happen because
		// we add learners and then promote them only after we snapshot, but we're
		// going to be extra careful in case future versions of cockroach somehow
		// promote replicas without ensuring that a snapshot has been received.
		//
		// Rather than specifically deleting around the data we want to preserve,
		// we read the HardState to preserve it, clear everything, and write back
		// the HardState and tombstone. Note that we only do this if rightRepl
		// exists; if it doesn't, there's no Raft state to massage (when rightRepl
		// was removed, a tombstone was written instead).
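		//
		// To make the "removed and re-added" case concrete (illustrative,
		// made-up IDs): suppose the split's RightDesc assigns this store
		// replica ID 3 for the RHS. Before this store applies the split, the
		// RHS range (already live on stores that have applied the split) is
		// moved off this store and then back, leaving behind a tombstone and
		// possibly an uninitialized replica with ID 4. isNewerThanSplit
		// detects that the store's view of the RHS is newer than the split's
		// descriptor, and the code below discards the state that would have
		// belonged to the stale replica ID 3 instead of installing it.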
		var hs raftpb.HardState
		if rightRepl != nil {
			// Assert that the rightRepl is not initialized. We're about to clear out
			// the data of the RHS of the split; we cannot have already accepted a
			// snapshot to initialize this newer RHS.
			if rightRepl.IsInitialized() {
				log.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc())
			}
			hs, err = rightRepl.raftMu.stateLoader.LoadHardState(ctx, readWriter)
			if err != nil {
				log.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err)
			}
		}
		const rangeIDLocalOnly = false
		const mustUseClearRange = false
		if err := clearRangeData(&split.RightDesc, readWriter, readWriter, rangeIDLocalOnly, mustUseClearRange); err != nil {
			log.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
		}
		if rightRepl != nil {
			if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, readWriter, hs); err != nil {
				log.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err)
			}
		}
		return
	}

	// Update the raft HardState with the new Commit value now that the
	// replica is initialized (combining it with existing or default
	// Term and Vote). This is the common case.
	rsl := stateloader.Make(split.RightDesc.RangeID)
	if err := rsl.SynthesizeRaftState(ctx, readWriter); err != nil {
		log.Fatalf(ctx, "%v", err)
	}

	// The initialMaxClosed is assigned to the RHS replica to ensure that
	// follower reads do not regress following the split. After the split occurs
	// there will be no information in the closedts subsystem about the newly
	// minted RHS range from its leaseholder's store. Furthermore, the RHS will
	// have a lease start time equal to that of the LHS, which might be quite
	// old. This means that timestamps which follow the lease start time for the
	// LHS but are below the current closed timestamp for the LHS would no
	// longer be readable on the RHS after the split.
	//
	// It is necessary for correctness that the call to maxClosed used to
	// determine the current closed timestamp happens during the splitPreApply
	// so that it uses a LAI that is _before_ the index at which this split is
	// applied. If it were to refer to a LAI equal to or after the split then
	// the value of initialMaxClosed might be unsafe.
	//
	// Concretely, any closed timestamp based on an LAI that is equal to or
	// above the split index might be larger than the initial closed timestamp
	// assigned to the RHS range's initial leaseholder. This is because the LHS
	// range's leaseholder could continue closing out timestamps at the split's
	// LAI after applying the split. Slow followers in that range could hear
	// about these closed timestamp notifications before applying the split
	// themselves. If these slow followers were allowed to pass these closed
	// timestamps created after the split to the RHS replicas they create during
	// the application of the split, then these RHS replicas might end up with
	// initialMaxClosed values above their current range's official closed
	// timestamp. The leaseholder of the RHS range could then propose a write at
	// a timestamp below this initialMaxClosed, violating the closed timestamp
	// system's most important property.
	//
	// Using an LAI from before the index at which this split is applied avoids
	// the hazard and ensures that no replica on the RHS is created with an
	// initialMaxClosed that could be violated by a proposal on the RHS's
	// initial leaseholder. See #44878.
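	//
	// A sketch of the hazard with illustrative numbers: say the split applies
	// at LAI 10 on the LHS. After applying the split, the LHS leaseholder keeps
	// closing timestamps, e.g. it closes ts=100 at LAI 12. A slow LHS follower
	// can learn about (LAI 12, ts=100) before it has applied the split. If it
	// were allowed to seed the RHS replica it creates with
	// initialMaxClosed=100, the RHS leaseholder, whose own closed timestamp
	// tracking for the new range starts fresh, could still evaluate a write
	// below ts=100, and a follower read at ts=100 served from that seeded RHS
	// replica would miss it. Reading maxClosed here, before the split's LAI,
	// avoids that.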
	initialMaxClosed, _ := r.maxClosed(ctx)
	rightRepl.mu.Lock()
	rightRepl.mu.initialMaxClosed = initialMaxClosed
	rightRepl.mu.Unlock()
}

// splitPostApply is the part of the split trigger which coordinates the actual
// split with the Store. Requires that Replica.raftMu is held.
func splitPostApply(
	ctx context.Context, deltaMS enginepb.MVCCStats, split *roachpb.SplitTrigger, r *Replica,
) {
	// rightReplOrNil will be nil if the RHS replica at the ID of the split is
	// already known to be removed, generally because we know that the replica on
	// this store has been re-added at a higher replica ID.
	rightReplOrNil := prepareRightReplicaForSplit(ctx, split, r)
	// Add the RHS replica to the store. This step atomically updates
	// the EndKey of the LHS replica and also adds the RHS replica
	// to the store's replica map.
	if err := r.store.SplitRange(ctx, r, rightReplOrNil, split); err != nil {
		// Our in-memory state has diverged from the on-disk state.
		log.Fatalf(ctx, "%s: failed to update Store after split: %+v", r, err)
	}

	// Update store stats with difference in stats before and after split.
	r.store.metrics.addMVCCStats(deltaMS)

	now := r.store.Clock().Now()

	// While performing the split, zone config changes or a newly created table
	// might require the range to be split again. Enqueue both the left and right
	// ranges to speed up such splits. See #10160.
	r.store.splitQueue.MaybeAddAsync(ctx, r, now)
	// If the range was not properly replicated before the split, the replicate
	// queue may not have picked it up (due to the need for a split). Enqueue
	// both the left and right ranges to speed up a potentially necessary
	// replication. See #7022 and #7800.
	r.store.replicateQueue.MaybeAddAsync(ctx, r, now)

	if rightReplOrNil != nil {
		r.store.splitQueue.MaybeAddAsync(ctx, rightReplOrNil, now)
		r.store.replicateQueue.MaybeAddAsync(ctx, rightReplOrNil, now)
		if len(split.RightDesc.Replicas().All()) == 1 {
			// TODO(peter): In single-node clusters, we enqueue the right-hand side of
			// the split (the new range) for Raft processing so that the corresponding
			// Raft group is created. This shouldn't be necessary for correctness, but
			// some tests rely on this (e.g. server.TestNodeStatusWritten).
			r.store.enqueueRaftUpdateCheck(rightReplOrNil.RangeID)
		}
	}
}
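
// NB: roughly, a split is applied in two stages on each store: splitPreApply
// runs while the split's Raft command is being applied and writes the RHS's
// on-disk Raft state in the same batch that commits the split; splitPostApply
// runs once that batch has committed, finishes the in-memory initialization of
// the RHS via prepareRightReplicaForSplit, and installs it in the Store via
// SplitRange.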

// prepareRightReplicaForSplit is a helper for splitPostApply.
// Requires that r.raftMu is held.
func prepareRightReplicaForSplit(
	ctx context.Context, split *roachpb.SplitTrigger, r *Replica,
) (rightReplicaOrNil *Replica) {
	// The right hand side of the split was already created (and its raftMu
	// acquired) in Replica.acquireSplitLock. It must be present here.
	rightRepl, err := r.store.GetReplica(split.RightDesc.RangeID)
	// If the RHS replica at the point of the split was known to be removed
	// during the application of the split then we may not find it here. That's
	// fine, carry on. See also:
	_, _ = r.acquireSplitLock, splitPostApply
	if roachpb.IsRangeNotFoundError(err) {
		return nil
	}
	if err != nil {
		log.Fatalf(ctx, "unable to find RHS replica: %+v", err)
	}
	// Already holding raftMu, see above.
	rightRepl.mu.Lock()

	// If we know that the RHS has already been removed at this replica ID
	// then we also know that its data has already been removed by the preApply
	// so we skip initializing it as the RHS of the split.
	if rightRepl.isNewerThanSplitRLocked(split) {
		rightRepl.mu.Unlock()
		return nil
	}

	// Finish initialization of the RHS.
	err = rightRepl.loadRaftMuLockedReplicaMuLocked(&split.RightDesc)
	rightRepl.mu.Unlock()
	if err != nil {
		log.Fatalf(ctx, "%v", err)
	}

	// Copy the minLeaseProposedTS from the LHS and grab the RHS's lease.
	r.mu.RLock()
	rightRepl.mu.Lock()
	rightRepl.mu.minLeaseProposedTS = r.mu.minLeaseProposedTS
	rightLease := *rightRepl.mu.state.Lease
	rightRepl.mu.Unlock()
	r.mu.RUnlock()

	// We need to explicitly wake up the Raft group on the right-hand range or
	// else the range could be underreplicated for an indefinite period of time.
	//
	// Specifically, suppose one of the replicas of the left-hand range never
	// applies this split trigger, e.g., because it catches up via a snapshot that
	// advances it past this split. That store won't create the right-hand replica
	// until it receives a Raft message addressed to the right-hand range. But
	// since new replicas start out quiesced, unless we explicitly awaken the
	// Raft group, there might not be any Raft traffic for quite a while.
	err = rightRepl.withRaftGroup(true, func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error) {
		return true, nil
	})
	if err != nil {
		log.Fatalf(ctx, "unable to create raft group for right-hand range in split: %+v", err)
	}

	// Invoke the leasePostApply method to ensure we properly initialize
	// the replica according to whether it holds the lease. This enables
	// the txnWaitQueue.
	rightRepl.leasePostApply(ctx, rightLease, false /* permitJump */)
	return rightRepl
}
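
// As an illustration of the descriptor sanity check at the top of SplitRange
// below (hypothetical keys): if the pre-split LHS covers [a, z) and the split
// key is m, the trigger carries a LeftDesc of [a, m) and a RightDesc of [m, z).
// The old LHS EndKey (z) must equal the RHS EndKey, and the old LHS StartKey
// (a) must sort strictly before the RHS StartKey (m); anything else means the
// descriptors do not describe a split of this replica's keyspace.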

// SplitRange shortens the original range to accommodate the new range. The new
// range is added to the ranges map and the replicasByKey btree. leftRepl.raftMu
// and, if non-nil, rightReplOrNil.raftMu must be held.
//
// This is only called from the split trigger in the context of the execution
// of a Raft command. Note that rightReplOrNil will be nil if the replica
// described by the split's RightDesc is known to have been removed.
func (s *Store) SplitRange(
	ctx context.Context, leftRepl, rightReplOrNil *Replica, split *roachpb.SplitTrigger,
) error {
	rightDesc := &split.RightDesc
	newLeftDesc := &split.LeftDesc
	oldLeftDesc := leftRepl.Desc()
	if !bytes.Equal(oldLeftDesc.EndKey, rightDesc.EndKey) ||
		bytes.Compare(oldLeftDesc.StartKey, rightDesc.StartKey) >= 0 {
		return errors.Errorf("left range is not splittable by right range: %+v, %+v", oldLeftDesc, rightDesc)
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	if exRng, ok := s.mu.uninitReplicas[rightDesc.RangeID]; rightReplOrNil != nil && ok {
		// If we have an uninitialized replica of the new range we require pointer
		// equivalence with rightReplOrNil. See Store.splitTriggerPostApply().
		if exRng != rightReplOrNil {
			log.Fatalf(ctx, "found unexpected uninitialized replica: %s vs %s", exRng, rightReplOrNil)
		}
		// NB: We only remove from uninitReplicas and the replicaQueues maps here
		// so that we don't leave open a window where a replica is temporarily not
		// present in Store.mu.replicas.
		delete(s.mu.uninitReplicas, rightDesc.RangeID)
		s.replicaQueues.Delete(int64(rightDesc.RangeID))
	}

	leftRepl.setDescRaftMuLocked(ctx, newLeftDesc)

	// Clear the LHS lock and txn wait-queues, to redirect to the RHS if
	// appropriate. We do this after setDescRaftMuLocked to ensure
	// that no pre-split commands are inserted into the wait-queues after we
	// clear them.
	leftRepl.concMgr.OnRangeSplit()

	// Clear the original range's request stats, since they include requests for
	// spans that are now owned by the new range.
	leftRepl.leaseholderStats.resetRequestCounts()

	if rightReplOrNil == nil {
		throwawayRightWriteStats := new(replicaStats)
		leftRepl.writeStats.splitRequestCounts(throwawayRightWriteStats)
	} else {
		rightRepl := rightReplOrNil
		leftRepl.writeStats.splitRequestCounts(rightRepl.writeStats)
		if err := s.addReplicaInternalLocked(rightRepl); err != nil {
			return errors.Errorf("unable to add replica %v: %s", rightRepl, err)
		}

		// Update the replica's cached byte thresholds. This is a no-op if the system
		// config is not available, in which case we rely on the next gossip update
		// to perform the update.
		if err := rightRepl.updateRangeInfo(rightRepl.Desc()); err != nil {
			return err
		}
		// Add the range to metrics and maybe gossip on capacity change.
		s.metrics.ReplicaCount.Inc(1)
		s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)
	}

	return nil
}
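
// NB: even when the RHS is already known to have been removed
// (rightReplOrNil == nil), SplitRange above still splits the LHS's write stats
// into a throwaway replicaStats object, presumably so that the LHS's
// request-rate accounting is reduced just as it would be had the RHS survived,
// keeping load-based decisions for the shrunken LHS consistent.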