github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_create_replica.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"sync"
	"time"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/errors"
)

var errRetry = errors.New("retry: orphaned replica")

// getOrCreateReplica returns a replica for the given RangeID, creating an
// uninitialized replica if necessary. The caller must not hold the store's
// lock. The returned replica has Replica.raftMu locked and it is the caller's
// responsibility to unlock it.
func (s *Store) getOrCreateReplica(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicaID roachpb.ReplicaID,
	creatingReplica *roachpb.ReplicaDescriptor,
	isLearner bool,
) (_ *Replica, created bool, _ error) {
	if replicaID == 0 {
		log.Fatalf(ctx, "cannot construct a Replica for range %d with 0 id", rangeID)
	}
	// We need a retry loop as the replica we find in the map may be in the
	// process of being removed or may need to be removed. Retries in the loop
	// imply that a removal is actually being carried out, not that we're
	// waiting on a queue.
	r := retry.Start(retry.Options{
		InitialBackoff: time.Microsecond,
		// Cap the backoff at a small amount; we are only waiting for data
		// that might need to be cleared.
		MaxBackoff: 10 * time.Millisecond,
	})
	for {
		r.Next()
		repl, created, err := s.tryGetOrCreateReplica(
			ctx,
			rangeID,
			replicaID,
			creatingReplica,
			isLearner,
		)
		if errors.Is(err, errRetry) {
			continue
		}
		if err != nil {
			return nil, false, err
		}
		return repl, created, err
	}
}
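
// What follows is an illustrative sketch, not part of the original file: it
// shows how a caller of getOrCreateReplica is expected to honor the locking
// contract above. exampleDeliverToReplica is a hypothetical helper; only the
// getOrCreateReplica call itself is real.
func (s *Store) exampleDeliverToReplica(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicaID roachpb.ReplicaID,
	from *roachpb.ReplicaDescriptor,
) error {
	repl, created, err := s.getOrCreateReplica(ctx, rangeID, replicaID, from, false /* isLearner */)
	if err != nil {
		return err
	}
	// The replica comes back with raftMu held; releasing it is the caller's
	// responsibility, whether or not the replica was freshly created.
	defer repl.raftMu.Unlock()
	if created {
		// A freshly created replica is uninitialized: it has a RangeID but no
		// key bounds until a snapshot arrives.
		log.Infof(ctx, "created uninitialized replica %v", repl)
	}
	// ... hand the incoming raft message to repl here ...
	return nil
}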

// tryGetOrCreateReplica performs a single attempt at looking up or creating a
// replica. It will fail with errRetry if it finds a Replica that has been
// destroyed (and is no longer in Store.mu.replicas) or if during creation
// another goroutine gets there first. In either case, a subsequent call to
// tryGetOrCreateReplica will likely succeed, hence the loop in
// getOrCreateReplica.
func (s *Store) tryGetOrCreateReplica(
	ctx context.Context,
	rangeID roachpb.RangeID,
	replicaID roachpb.ReplicaID,
	creatingReplica *roachpb.ReplicaDescriptor,
	isLearner bool,
) (_ *Replica, created bool, _ error) {
	// The common case: look up an existing (initialized) replica.
	if value, ok := s.mu.replicas.Load(int64(rangeID)); ok {
		repl := (*Replica)(value)
		repl.raftMu.Lock() // not unlocked on success
		repl.mu.Lock()

		// The current replica has been removed; go back around.
		if repl.mu.destroyStatus.Removed() {
			repl.mu.Unlock()
			repl.raftMu.Unlock()
			return nil, false, errRetry
		}

		// Drop messages from replicas we know to be too old.
		if fromReplicaIsTooOld(repl, creatingReplica) {
			repl.mu.Unlock()
			repl.raftMu.Unlock()
			return nil, false, roachpb.NewReplicaTooOldError(creatingReplica.ReplicaID)
		}

		// The current replica needs to be removed; remove it and go back around.
		if repl.mu.replicaID < replicaID {
			if log.V(1) {
				log.Infof(ctx, "found message for replica ID %d which is newer than %v",
					replicaID, repl)
			}

			repl.mu.Unlock()
			if err := s.removeReplicaRaftMuLocked(ctx, repl, replicaID, RemoveOptions{
				DestroyData: true,
			}); err != nil {
				log.Fatalf(ctx, "failed to remove replica: %v", err)
			}
			repl.raftMu.Unlock()
			return nil, false, errRetry
		}
		defer repl.mu.Unlock()

		if repl.mu.replicaID > replicaID {
			// The sender is behind and is sending to an old replica.
			// We could silently drop this message but this way we'll inform the
			// sender that it may no longer exist.
			repl.raftMu.Unlock()
			return nil, false, &roachpb.RaftGroupDeletedError{}
		}
		if repl.mu.replicaID != replicaID {
			// This case should have been handled by the removal branch above.
			log.Fatalf(ctx, "intended replica id %d unexpectedly does not match the current replica %v",
				replicaID, repl)
		}
		return repl, false, nil
	}

	// No replica currently exists, so we'll try to create one. Before creating
	// the replica, see if there is a tombstone which would indicate that this
	// is a stale message.
	// NB: we check this before creating a new Replica and adding it to the
	// Store's Range map even though we must check it again afterwards to avoid
	// races. This double-checked locking is an optimization that lets us skip
	// the work of creating a Replica when we can already tell ahead of time
	// that it should not be created.
	tombstoneKey := keys.RangeTombstoneKey(rangeID)
	var tombstone roachpb.RangeTombstone
	if ok, err := storage.MVCCGetProto(
		ctx, s.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
	); err != nil {
		return nil, false, err
	} else if ok && replicaID != 0 && replicaID < tombstone.NextReplicaID {
		return nil, false, &roachpb.RaftGroupDeletedError{}
	}

	// Create a new replica and lock it for raft processing.
	uninitializedDesc := &roachpb.RangeDescriptor{
		RangeID: rangeID,
		// NB: other fields are unknown; they will be populated from a
		// snapshot.
	}
	repl := newUnloadedReplica(ctx, uninitializedDesc, s, replicaID)
	repl.creatingReplica = creatingReplica
	repl.raftMu.Lock() // not unlocked

	// Install the replica in the store's replica map. The replica is in an
	// inconsistent state, but nobody will be accessing it while we hold its
	// locks.
	s.mu.Lock()
	// Grab the internal Replica state lock to ensure nobody mucks with our
	// replica even outside of raft processing. We have to do this after
	// grabbing Store.mu to maintain the lock ordering invariant.
	repl.mu.Lock()
	repl.mu.tombstoneMinReplicaID = tombstone.NextReplicaID

	// NB: A Replica should never be in the store's replicas map with a nil
	// descriptor. Assign it directly here. In the case that the Replica should
	// exist (which we confirm with another check of the Tombstone below), we'll
	// re-initialize the replica with the same uninitializedDesc.
	//
	// During the short window between here and the call to
	// s.unlinkReplicaByRangeIDLocked() in the failure branch below, the Replica
	// would otherwise have a nil descriptor while present in the map. And while
	// the destroy status would have been set by then, not every code path that
	// inspects the descriptor also checks the destroy status.
	repl.mu.state.Desc = uninitializedDesc
	// Add the range to the range map, but not to replicasByKey, since the
	// range's start key is unknown. The range will be added to replicasByKey
	// later when a snapshot is applied. Since the initial lookup above was
	// performed without holding Store.mu, another goroutine might have snuck
	// in and created the replica in the meantime, so we retry on error.
	if err := s.addReplicaToRangeMapLocked(repl); err != nil {
		repl.mu.Unlock()
		s.mu.Unlock()
		repl.raftMu.Unlock()
		return nil, false, errRetry
	}
	s.mu.uninitReplicas[repl.RangeID] = repl
	s.mu.Unlock() // NB: unlocking out of order

	// Initialize the Replica with the replicaID.
	if err := func() error {
		// Check for a tombstone again now that we've inserted into the Range
		// map. This double-checked locking ensures that we avoid a race where a
		// replica is created and destroyed between the initial unsynchronized
		// tombstone check and the Range map linearization point. By checking
		// again now, we make sure to synchronize with any goroutine that wrote
		// a tombstone and then removed an old replica from the Range map.
		if ok, err := storage.MVCCGetProto(
			ctx, s.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
		); err != nil {
			return err
		} else if ok && replicaID < tombstone.NextReplicaID {
			return &roachpb.RaftGroupDeletedError{}
		}

		// An uninitialized replica should have an empty HardState.Commit at
		// all times. Failure to maintain this invariant indicates corruption.
		// And yet, we have observed this in the wild. See #40213.
		if hs, err := repl.mu.stateLoader.LoadHardState(ctx, s.Engine()); err != nil {
			return err
		} else if hs.Commit != 0 {
			log.Fatalf(ctx, "found non-zero HardState.Commit on uninitialized replica %s. HS=%+v", repl, hs)
		}
		return repl.loadRaftMuLockedReplicaMuLocked(uninitializedDesc)
	}(); err != nil {
		// Mark the replica as destroyed and remove it from the replicas maps to
		// ensure nobody tries to use it.
		repl.mu.destroyStatus.Set(errors.Wrapf(err, "%s: failed to initialize", repl), destroyReasonRemoved)
		repl.mu.Unlock()
		s.mu.Lock()
		s.unlinkReplicaByRangeIDLocked(rangeID)
		s.mu.Unlock()
		repl.raftMu.Unlock()
		return nil, false, err
	}
	repl.mu.Unlock()
	return repl, true, nil
}
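
// The two tombstone checks in tryGetOrCreateReplica form a double-checked
// locking pattern. Below is a minimal, self-contained sketch of that pattern
// under assumed names (exampleRegistry, exampleEntry, and the stale callback
// are hypothetical, not part of this package): check a condition
// optimistically before doing any work, publish the new entry under the lock
// (the linearization point), then re-check the condition while the entry is
// visible, undoing the publish on failure.
type exampleRegistry struct {
	mu      sync.Mutex
	entries map[int64]*exampleEntry
}

type exampleEntry struct{ id int64 }

func (r *exampleRegistry) getOrCreate(id int64, stale func() bool) (*exampleEntry, error) {
	// Optimistic pre-check: cheap rejection of stale requests, but racy on
	// its own since the answer can change before we publish.
	if stale() {
		return nil, errors.New("stale")
	}
	r.mu.Lock()
	if e, ok := r.entries[id]; ok {
		r.mu.Unlock()
		return e, nil
	}
	e := &exampleEntry{id: id}
	r.entries[id] = e // linearization point: e is now visible to other goroutines
	r.mu.Unlock()
	// Re-check now that e is published. This synchronizes with any goroutine
	// that marked id stale and removed an old entry before our publish.
	if stale() {
		r.mu.Lock()
		delete(r.entries, id)
		r.mu.Unlock()
		return nil, errors.New("stale")
	}
	return e, nil
}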

// fromReplicaIsTooOld returns true if the creatingReplica is deemed to have
// been a member of the range and has since been removed.
// Assumes toReplica.mu is held.
func fromReplicaIsTooOld(toReplica *Replica, fromReplica *roachpb.ReplicaDescriptor) bool {
	toReplica.mu.AssertHeld()
	if fromReplica == nil {
		return false
	}
	desc := toReplica.mu.state.Desc
	_, found := desc.GetReplicaDescriptorByID(fromReplica.ReplicaID)
	return !found && fromReplica.ReplicaID < desc.NextReplicaID
}
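
// A worked example of the check above, with illustrative numbers: suppose
// toReplica's descriptor lists replica IDs {5, 6, 7} and has NextReplicaID 8.
// A message from replica ID 3 is too old: 3 is absent from the descriptor and
// 3 < 8, so replica 3 must have been removed. A message from replica ID 9 is
// not too old, since 9 >= 8 may simply be a newer member that this replica
// hasn't heard about yet.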

// addReplicaInternalLocked adds the replica to the replicas map and the
// replicasByKey btree. It returns an error if a replica with the same Range ID
// or an overlapping KeyRange has already been added to this store.
// addReplicaInternalLocked requires that the store lock is held.
func (s *Store) addReplicaInternalLocked(repl *Replica) error {
	if !repl.IsInitialized() {
		return errors.Errorf("attempted to add uninitialized replica %s", repl)
	}

	if err := s.addReplicaToRangeMapLocked(repl); err != nil {
		return err
	}

	if exRange := s.getOverlappingKeyRangeLocked(repl.Desc()); exRange != nil {
		return errors.Errorf("%s: cannot addReplicaInternalLocked; range %s has overlapping range %s", s, repl, exRange.Desc())
	}

	if exRngItem := s.mu.replicasByKey.ReplaceOrInsert(repl); exRngItem != nil {
		return errors.Errorf("%s: cannot addReplicaInternalLocked; range for key %v already exists in replicasByKey btree", s,
			exRngItem.(KeyRange).startKey())
	}

	return nil
}

// addPlaceholder adds the specified placeholder. Requires that the raftMu of
// the replica whose place is being held is locked.
func (s *Store) addPlaceholder(placeholder *ReplicaPlaceholder) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.addPlaceholderLocked(placeholder)
}

// addPlaceholderLocked adds the specified placeholder. Requires that Store.mu
// and the raftMu of the replica whose place is being held are locked.
func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
	rangeID := placeholder.Desc().RangeID
	if exRng := s.mu.replicasByKey.ReplaceOrInsert(placeholder); exRng != nil {
		return errors.Errorf("%s overlaps with existing KeyRange %s in replicasByKey btree", placeholder, exRng)
	}
	if exRng, ok := s.mu.replicaPlaceholders[rangeID]; ok {
		return errors.Errorf("%s has ID collision with existing KeyRange %s", placeholder, exRng)
	}
	s.mu.replicaPlaceholders[rangeID] = placeholder
	return nil
}

// addReplicaToRangeMapLocked adds the replica to the replicas map.
func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
	// It's ok for the replica to exist in the replicas map as long as it is the
	// same replica object. This occurs during splits where the right-hand side
	// is added to the replicas map before it is initialized.
	if existing, loaded := s.mu.replicas.LoadOrStore(
		int64(repl.RangeID), unsafe.Pointer(repl)); loaded && (*Replica)(existing) != repl {
		return errors.Errorf("%s: replica already exists", repl)
	}
	// Check whether the replica is unquiesced but not in the map. This
	// can happen during splits and merges, where the uninitialized (but
	// also unquiesced) replica is removed from the unquiesced replica
	// map in advance of this method being called.
	s.unquiescedReplicas.Lock()
	if _, ok := s.unquiescedReplicas.m[repl.RangeID]; !repl.mu.quiescent && !ok {
		s.unquiescedReplicas.m[repl.RangeID] = struct{}{}
	}
	s.unquiescedReplicas.Unlock()
	return nil
}
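
// A minimal sketch of the idempotency relied on above, with sync.Map standing
// in for the store's replica map (exampleLoadOrStoreIdempotent is hypothetical
// and not part of this package): storing the same pointer twice is a no-op,
// while a different object under the same key is rejected.
func exampleLoadOrStoreIdempotent(m *sync.Map, rangeID int64, repl unsafe.Pointer) error {
	existing, loaded := m.LoadOrStore(rangeID, repl)
	if loaded && existing.(unsafe.Pointer) != repl {
		// Some other replica object already occupies this RangeID.
		return errors.Errorf("replica for range %d already exists", rangeID)
	}
	// Either we stored repl, or the very same replica was already present.
	return nil
}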

// maybeMarkReplicaInitializedLocked should be called whenever a previously
// uninitialized replica has become initialized so that the store can update
// its internal bookkeeping. It requires that Store.mu and Replica.raftMu
// are locked.
func (s *Store) maybeMarkReplicaInitializedLocked(ctx context.Context, repl *Replica) error {
	if !repl.IsInitialized() {
		return errors.Errorf("attempted to process uninitialized range %s", repl)
	}

	rangeID := repl.RangeID

	if _, ok := s.mu.uninitReplicas[rangeID]; !ok {
		// Do nothing if the range has already been initialized.
		return nil
	}
	delete(s.mu.uninitReplicas, rangeID)

	if exRange := s.getOverlappingKeyRangeLocked(repl.Desc()); exRange != nil {
		return errors.Errorf("%s: cannot initialize replica; range %s has overlapping range %s",
			s, repl, exRange.Desc())
	}
	if exRngItem := s.mu.replicasByKey.ReplaceOrInsert(repl); exRngItem != nil {
		return errors.Errorf("range for key %v already exists in replicasByKey btree",
			(exRngItem.(*Replica)).startKey())
	}

	// Add the range to metrics and maybe gossip on capacity change.
	s.metrics.ReplicaCount.Inc(1)
	s.maybeGossipOnCapacityChange(ctx, rangeAddEvent)

	return nil
}