github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_init.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"math/rand"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/split"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/rpc"
    25  	"github.com/cockroachdb/cockroach/pkg/util"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    29  	"github.com/cockroachdb/errors"
    30  	"go.etcd.io/etcd/raft"
    31  )
    32  
    33  const (
    34  	splitQueueThrottleDuration = 5 * time.Second
    35  	mergeQueueThrottleDuration = 5 * time.Second
    36  )
    37  
    38  // newReplica constructs a new Replica. If the desc is initialized, the store
    39  // must be present in it and the corresponding replica descriptor must have
    40  // replicaID as its ReplicaID.
    41  func newReplica(
    42  	ctx context.Context, desc *roachpb.RangeDescriptor, store *Store, replicaID roachpb.ReplicaID,
    43  ) (*Replica, error) {
    44  	repl := newUnloadedReplica(ctx, desc, store, replicaID)
    45  	repl.raftMu.Lock()
    46  	defer repl.raftMu.Unlock()
    47  	repl.mu.Lock()
    48  	defer repl.mu.Unlock()
    49  	if err := repl.loadRaftMuLockedReplicaMuLocked(desc); err != nil {
    50  		return nil, err
    51  	}
    52  	return repl, nil
    53  }
    54  
    55  // newUnloadedReplica partially constructs a replica. The primary reason this
    56  // function exists separately from Replica.loadRaftMuLockedReplicaMuLocked() is
    57  // to avoid attempting to fully constructing a Replica prior to proving that it
    58  // can exist during the delicate synchronization dance that occurs in
    59  // Store.tryGetOrCreateReplica(). A Replica returned from this function must not
    60  // be used in any way until it's load() method has been called.
    61  func newUnloadedReplica(
    62  	ctx context.Context, desc *roachpb.RangeDescriptor, store *Store, replicaID roachpb.ReplicaID,
    63  ) *Replica {
    64  	if replicaID == 0 {
    65  		log.Fatalf(context.TODO(), "cannot construct a replica for range %d with a 0 replica ID", desc.RangeID)
    66  	}
    67  	r := &Replica{
    68  		AmbientContext: store.cfg.AmbientCtx,
    69  		RangeID:        desc.RangeID,
    70  		store:          store,
    71  		abortSpan:      abortspan.New(desc.RangeID),
    72  		concMgr: concurrency.NewManager(concurrency.Config{
    73  			NodeDesc:          store.nodeDesc,
    74  			RangeDesc:         desc,
    75  			Settings:          store.ClusterSettings(),
    76  			DB:                store.DB(),
    77  			Clock:             store.Clock(),
    78  			Stopper:           store.Stopper(),
    79  			IntentResolver:    store.intentResolver,
    80  			TxnWaitMetrics:    store.txnWaitMetrics,
    81  			SlowLatchGauge:    store.metrics.SlowLatchRequests,
    82  			DisableTxnPushing: store.TestingKnobs().DontPushOnWriteIntentError,
    83  			TxnWaitKnobs:      store.TestingKnobs().TxnWaitKnobs,
    84  		}),
    85  	}
    86  	r.mu.pendingLeaseRequest = makePendingLeaseRequest(r)
    87  	r.mu.stateLoader = stateloader.Make(desc.RangeID)
    88  	r.mu.quiescent = true
    89  	r.mu.zone = store.cfg.DefaultZoneConfig
    90  	r.mu.replicaID = replicaID
    91  	split.Init(&r.loadBasedSplitter, rand.Intn, func() float64 {
    92  		return float64(SplitByLoadQPSThreshold.Get(&store.cfg.Settings.SV))
    93  	})
    94  	r.mu.proposals = map[kvserverbase.CmdIDKey]*ProposalData{}
    95  	r.mu.checksums = map[uuid.UUID]ReplicaChecksum{}
    96  	r.mu.proposalBuf.Init((*replicaProposer)(r))
    97  
    98  	if leaseHistoryMaxEntries > 0 {
    99  		r.leaseHistory = newLeaseHistory()
   100  	}
   101  	if store.cfg.StorePool != nil {
   102  		r.leaseholderStats = newReplicaStats(store.Clock(), store.cfg.StorePool.getNodeLocalityString)
   103  	}
   104  	// Pass nil for the localityOracle because we intentionally don't track the
   105  	// origin locality of write load.
   106  	r.writeStats = newReplicaStats(store.Clock(), nil)
   107  
   108  	// Init rangeStr with the range ID.
   109  	r.rangeStr.store(replicaID, &roachpb.RangeDescriptor{RangeID: desc.RangeID})
   110  	// Add replica log tag - the value is rangeStr.String().
   111  	r.AmbientContext.AddLogTag("r", &r.rangeStr)
   112  	// Add replica pointer value. NB: this was historically useful for debugging
   113  	// replica GC issues, but is a distraction at the moment.
   114  	// r.AmbientContext.AddLogTag("@", fmt.Sprintf("%x", unsafe.Pointer(r)))
   115  	r.raftMu.stateLoader = stateloader.Make(desc.RangeID)
   116  
   117  	r.splitQueueThrottle = util.Every(splitQueueThrottleDuration)
   118  	r.mergeQueueThrottle = util.Every(mergeQueueThrottleDuration)
   119  	return r
   120  }
   121  
   122  // loadRaftMuLockedReplicaMuLocked will load the state of the replica from disk.
   123  // If desc is initialized, the Replica will be initialized when this method
   124  // returns. An initialized Replica may not be reloaded. If this method is called
   125  // with an uninitialized desc it may be called again later with an initialized
   126  // desc.
   127  //
   128  // This method is called in three places:
   129  //
   130  //  1) newReplica - used when the store is initializing and during testing
   131  //  2) tryGetOrCreateReplica - see newUnloadedReplica
   132  //  3) splitPostApply - this call initializes a previously uninitialized Replica.
   133  //
   134  func (r *Replica) loadRaftMuLockedReplicaMuLocked(desc *roachpb.RangeDescriptor) error {
   135  	ctx := r.AnnotateCtx(context.TODO())
   136  	if r.mu.state.Desc != nil && r.isInitializedRLocked() {
   137  		log.Fatalf(ctx, "r%d: cannot reinitialize an initialized replica", desc.RangeID)
   138  	} else if r.mu.replicaID == 0 {
   139  		// NB: This is just a defensive check as r.mu.replicaID should never be 0.
   140  		log.Fatalf(ctx, "r%d: cannot initialize replica without a replicaID", desc.RangeID)
   141  	}
   142  
   143  	// Clear the internal raft group in case we're being reset. Since we're
   144  	// reloading the raft state below, it isn't safe to use the existing raft
   145  	// group.
   146  	r.mu.internalRaftGroup = nil
   147  
   148  	var err error
   149  	if r.mu.state, err = r.mu.stateLoader.Load(ctx, r.Engine(), desc); err != nil {
   150  		return err
   151  	}
   152  	r.mu.lastIndex, err = r.mu.stateLoader.LoadLastIndex(ctx, r.Engine())
   153  	if err != nil {
   154  		return err
   155  	}
   156  	r.mu.lastTerm = invalidLastTerm
   157  
   158  	// Ensure that we're not trying to load a replica with a different ID than
   159  	// was used to construct this Replica.
   160  	replicaID := r.mu.replicaID
   161  	if replicaDesc, found := r.mu.state.Desc.GetReplicaDescriptor(r.StoreID()); found {
   162  		replicaID = replicaDesc.ReplicaID
   163  	} else if desc.IsInitialized() {
   164  		log.Fatalf(ctx, "r%d: cannot initialize replica which is not in descriptor %v", desc.RangeID, desc)
   165  	}
   166  	if r.mu.replicaID != replicaID {
   167  		log.Fatalf(ctx, "attempting to initialize a replica which has ID %d with ID %d",
   168  			r.mu.replicaID, replicaID)
   169  	}
   170  
   171  	r.setDescLockedRaftMuLocked(ctx, desc)
   172  
   173  	// Init the minLeaseProposedTS such that we won't use an existing lease (if
   174  	// any). This is so that, after a restart, we don't propose under old leases.
   175  	// If the replica is being created through a split, this value will be
   176  	// overridden.
   177  	if !r.store.cfg.TestingKnobs.DontPreventUseOfOldLeaseOnStart {
   178  		// Only do this if there was a previous lease. This shouldn't be important
   179  		// to do but consider that the first lease which is obtained is back-dated
   180  		// to a zero start timestamp (and this de-flakes some tests). If we set the
   181  		// min proposed TS here, this lease could not be renewed (by the semantics
   182  		// of minLeaseProposedTS); and since minLeaseProposedTS is copied on splits,
   183  		// this problem would multiply to a number of replicas at cluster bootstrap.
   184  		// Instead, we make the first lease special (which is OK) and the problem
   185  		// disappears.
   186  		if r.mu.state.Lease.Sequence > 0 {
   187  			r.mu.minLeaseProposedTS = r.Clock().Now()
   188  		}
   189  	}
   190  
   191  	ssBase := r.Engine().GetAuxiliaryDir()
   192  	if r.raftMu.sideloaded, err = newDiskSideloadStorage(
   193  		r.store.cfg.Settings,
   194  		desc.RangeID,
   195  		replicaID,
   196  		ssBase,
   197  		r.store.limiters.BulkIOWriteRate,
   198  		r.store.engine,
   199  	); err != nil {
   200  		return errors.Wrap(err, "while initializing sideloaded storage")
   201  	}
   202  	r.assertStateLocked(ctx, r.store.Engine())
   203  	return nil
   204  }
   205  
   206  // IsInitialized is true if we know the metadata of this range, either
   207  // because we created it or we have received an initial snapshot from
   208  // another node. It is false when a range has been created in response
   209  // to an incoming message but we are waiting for our initial snapshot.
   210  func (r *Replica) IsInitialized() bool {
   211  	r.mu.RLock()
   212  	defer r.mu.RUnlock()
   213  	return r.isInitializedRLocked()
   214  }
   215  
   216  // isInitializedRLocked is true if we know the metadata of this range, either
   217  // because we created it or we have received an initial snapshot from
   218  // another node. It is false when a range has been created in response
   219  // to an incoming message but we are waiting for our initial snapshot.
   220  // isInitializedLocked requires that the replica lock is held.
   221  func (r *Replica) isInitializedRLocked() bool {
   222  	return r.mu.state.Desc.IsInitialized()
   223  }
   224  
   225  // maybeInitializeRaftGroup check whether the internal Raft group has
   226  // not yet been initialized. If not, it is created and set to campaign
   227  // if this replica is the most recent owner of the range lease.
   228  func (r *Replica) maybeInitializeRaftGroup(ctx context.Context) {
   229  	r.mu.RLock()
   230  	// If this replica hasn't initialized the Raft group, create it and
   231  	// unquiesce and wake the leader to ensure the replica comes up to date.
   232  	initialized := r.mu.internalRaftGroup != nil
   233  	// If this replica has been removed or is in the process of being removed
   234  	// then it'll never handle any raft events so there's no reason to initialize
   235  	// it now.
   236  	removed := !r.mu.destroyStatus.IsAlive()
   237  	r.mu.RUnlock()
   238  	if initialized || removed {
   239  		return
   240  	}
   241  
   242  	// Acquire raftMu, but need to maintain lock ordering (raftMu then mu).
   243  	r.raftMu.Lock()
   244  	defer r.raftMu.Unlock()
   245  	r.mu.Lock()
   246  	defer r.mu.Unlock()
   247  
   248  	// If we raced on checking the destroyStatus above that's fine as
   249  	// the below withRaftGroupLocked will no-op.
   250  	if err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) {
   251  		return true, nil
   252  	}); err != nil && !errors.Is(err, errRemoved) {
   253  		log.VErrEventf(ctx, 1, "unable to initialize raft group: %s", err)
   254  	}
   255  }
   256  
   257  // setDescRaftMuLocked atomically sets the replica's descriptor. It requires raftMu to be
   258  // locked.
   259  func (r *Replica) setDescRaftMuLocked(ctx context.Context, desc *roachpb.RangeDescriptor) {
   260  	r.mu.Lock()
   261  	defer r.mu.Unlock()
   262  	r.setDescLockedRaftMuLocked(ctx, desc)
   263  }
   264  
   265  func (r *Replica) setDescLockedRaftMuLocked(ctx context.Context, desc *roachpb.RangeDescriptor) {
   266  	if desc.RangeID != r.RangeID {
   267  		log.Fatalf(ctx, "range descriptor ID (%d) does not match replica's range ID (%d)",
   268  			desc.RangeID, r.RangeID)
   269  	}
   270  	if r.mu.state.Desc.IsInitialized() &&
   271  		(desc == nil || !desc.IsInitialized()) {
   272  		log.Fatalf(ctx, "cannot replace initialized descriptor with uninitialized one: %+v -> %+v",
   273  			r.mu.state.Desc, desc)
   274  	}
   275  	if r.mu.state.Desc.IsInitialized() &&
   276  		!r.mu.state.Desc.StartKey.Equal(desc.StartKey) {
   277  		log.Fatalf(ctx, "attempted to change replica's start key from %s to %s",
   278  			r.mu.state.Desc.StartKey, desc.StartKey)
   279  	}
   280  
   281  	// NB: It might be nice to assert that the current replica exists in desc
   282  	// however we allow it to not be present for three reasons:
   283  	//
   284  	//   1) When removing the current replica we update the descriptor to the point
   285  	//      of removal even though we will delete the Replica's data in the same
   286  	//      batch. We could avoid setting the local descriptor in this case.
   287  	//   2) When the DisableEagerReplicaRemoval testing knob is enabled. We
   288  	//      could remove all tests which utilize this behavior now that there's
   289  	//      no other mechanism for range state which does not contain the current
   290  	//      store to exist on disk.
   291  	//   3) Various unit tests do not provide a valid descriptor.
   292  	replDesc, found := desc.GetReplicaDescriptor(r.StoreID())
   293  	if found && replDesc.ReplicaID != r.mu.replicaID {
   294  		log.Fatalf(ctx, "attempted to change replica's ID from %d to %d",
   295  			r.mu.replicaID, replDesc.ReplicaID)
   296  	}
   297  
   298  	// Determine if a new replica was added. This is true if the new max replica
   299  	// ID is greater than the old max replica ID.
   300  	oldMaxID := maxReplicaIDOfAny(r.mu.state.Desc)
   301  	newMaxID := maxReplicaIDOfAny(desc)
   302  	if newMaxID > oldMaxID {
   303  		r.mu.lastReplicaAdded = newMaxID
   304  		r.mu.lastReplicaAddedTime = timeutil.Now()
   305  	} else if r.mu.lastReplicaAdded > newMaxID {
   306  		// The last replica added was removed.
   307  		r.mu.lastReplicaAdded = 0
   308  		r.mu.lastReplicaAddedTime = time.Time{}
   309  	}
   310  
   311  	r.rangeStr.store(r.mu.replicaID, desc)
   312  	r.connectionClass.set(rpc.ConnectionClassForKey(desc.StartKey))
   313  	r.concMgr.OnRangeDescUpdated(desc)
   314  	r.mu.state.Desc = desc
   315  }