github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_raft_test.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver_test
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"math/rand"
    19  	"reflect"
    20  	"runtime"
    21  	"strconv"
    22  	"sync"
    23  	"sync/atomic"
    24  	"testing"
    25  	"time"
    26  
    27  	"github.com/cockroachdb/cockroach/pkg/base"
    28  	"github.com/cockroachdb/cockroach/pkg/gossip"
    29  	"github.com/cockroachdb/cockroach/pkg/keys"
    30  	"github.com/cockroachdb/cockroach/pkg/kv"
    31  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    32  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    33  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    34  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    35  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    36  	"github.com/cockroachdb/cockroach/pkg/rpc"
    37  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    38  	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
    39  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    40  	"github.com/cockroachdb/cockroach/pkg/storage"
    41  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    42  	"github.com/cockroachdb/cockroach/pkg/testutils"
    43  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    44  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    45  	"github.com/cockroachdb/cockroach/pkg/util"
    46  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    47  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    48  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    49  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    50  	"github.com/cockroachdb/cockroach/pkg/util/log"
    51  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    52  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    53  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    54  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    55  	"github.com/cockroachdb/errors"
    56  	"github.com/stretchr/testify/assert"
    57  	"github.com/stretchr/testify/require"
    58  	"go.etcd.io/etcd/raft"
    59  	"go.etcd.io/etcd/raft/raftpb"
    60  	"google.golang.org/grpc"
    61  )
    62  
    63  // mustGetInt decodes an int64 value from the bytes field of its argument (0
    64  // for a nil value) and panics if the bytes field is not 0 or 8 bytes in length.
    65  func mustGetInt(v *roachpb.Value) int64 {
    66  	if v == nil {
    67  		return 0
    68  	}
    69  	i, err := v.GetInt()
    70  	if err != nil {
    71  		panic(err)
    72  	}
    73  	return i
    74  }
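
        // For illustration, a sketch of a typical call site (hypothetical; the tests
        // below wrap this same pattern in small helpers such as get() and validate()):
        //
        //	resp, pErr := kv.SendWrapped(ctx, store.TestSender(), getArgs(key))
        //	if pErr != nil {
        //		t.Fatal(pErr)
        //	}
        //	val := mustGetInt(resp.(*roachpb.GetResponse).Value)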
    75  
    76  // TestStoreRecoverFromEngine verifies that the store recovers all ranges and their contents
    77  // after being stopped and recreated.
    78  func TestStoreRecoverFromEngine(t *testing.T) {
    79  	defer leaktest.AfterTest(t)()
    80  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
    81  	storeCfg.TestingKnobs.DisableSplitQueue = true
    82  	storeCfg.TestingKnobs.DisableMergeQueue = true
    83  
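        	// The keys are chosen so that the split below at splitKey ("m") leaves
        	// key1 ("a") on the original range and key2 ("z") on the new right-hand
        	// range.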
    84  	const rangeID = roachpb.RangeID(1)
    85  	splitKey := roachpb.Key("m")
    86  	key1 := roachpb.Key("a")
    87  	key2 := roachpb.Key("z")
    88  
    89  	engineStopper := stop.NewStopper()
    90  	defer engineStopper.Stop(context.Background())
    91  	eng := storage.NewDefaultInMem()
    92  	engineStopper.AddCloser(eng)
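        	// rangeID2 is assigned once the split below creates the right-hand range;
        	// the post-restart validation uses it to address that range directly.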
    93  	var rangeID2 roachpb.RangeID
    94  
    95  	get := func(store *kvserver.Store, rangeID roachpb.RangeID, key roachpb.Key) int64 {
    96  		args := getArgs(key)
    97  		resp, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
    98  			RangeID: rangeID,
    99  		}, args)
   100  		if err != nil {
   101  			t.Fatal(err)
   102  		}
   103  		return mustGetInt(resp.(*roachpb.GetResponse).Value)
   104  	}
   105  	validate := func(store *kvserver.Store) {
   106  		if val := get(store, rangeID, key1); val != 13 {
   107  			t.Errorf("key %q: expected 13 but got %v", key1, val)
   108  		}
   109  		if val := get(store, rangeID2, key2); val != 28 {
   110  			t.Errorf("key %q: expected 28 but got %v", key2, val)
   111  		}
   112  	}
   113  
   114  	// First, populate the store with data across two ranges. Each range contains commands
   115  	// that both predate and postdate the split.
   116  	func() {
   117  		stopper := stop.NewStopper()
   118  		defer stopper.Stop(context.Background())
   119  		store := createTestStoreWithOpts(t,
   120  			testStoreOpts{
   121  				eng: eng,
   122  				cfg: &storeCfg,
   123  				// This test was written before the test stores were able to start with
   124  				// more than one range and is not prepared to handle many ranges.
   125  				dontCreateSystemRanges: true,
   126  			},
   127  			stopper)
   128  
   129  		increment := func(rangeID roachpb.RangeID, key roachpb.Key, value int64) (*roachpb.IncrementResponse, *roachpb.Error) {
   130  			args := incrementArgs(key, value)
   131  			resp, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   132  				RangeID: rangeID,
   133  			}, args)
   134  			incResp, _ := resp.(*roachpb.IncrementResponse)
   135  			return incResp, err
   136  		}
   137  
   138  		if _, err := increment(rangeID, key1, 2); err != nil {
   139  			t.Fatal(err)
   140  		}
   141  		if _, err := increment(rangeID, key2, 5); err != nil {
   142  			t.Fatal(err)
   143  		}
   144  		splitArgs := adminSplitArgs(splitKey)
   145  		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), splitArgs); err != nil {
   146  			t.Fatal(err)
   147  		}
   148  		rangeID2 = store.LookupReplica(roachpb.RKey(key2)).RangeID
   149  		if rangeID2 == rangeID {
   150  			t.Fatal("got same range id after split")
   151  		}
   152  		if _, err := increment(rangeID, key1, 11); err != nil {
   153  			t.Fatal(err)
   154  		}
   155  		if _, err := increment(rangeID2, key2, 23); err != nil {
   156  			t.Fatal(err)
   157  		}
   158  		validate(store)
   159  	}()
   160  
   161  	// Now create a new store with the same engine and make sure the expected data is present.
   162  	// We must use the same clock because a newly-created manual clock will be behind the one
   163  	// we wrote with and so will see stale MVCC data.
   164  	store := createTestStoreWithOpts(t,
   165  		testStoreOpts{
   166  			dontBootstrap: true,
   167  			eng:           eng,
   168  			cfg:           &storeCfg,
   169  		},
   170  		engineStopper)
   171  
   172  	// Raft processing is initialized lazily; issue a no-op write request on each key to
   173  	// ensure that it has been started.
   174  	incArgs := incrementArgs(key1, 0)
   175  	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err != nil {
   176  		t.Fatal(err)
   177  	}
   178  	incArgs = incrementArgs(key2, 0)
   179  	if _, err := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   180  		RangeID: rangeID2,
   181  	}, incArgs); err != nil {
   182  		t.Fatal(err)
   183  	}
   184  
   185  	validate(store)
   186  }
   187  
   188  // TestStoreRecoverWithErrors verifies that even commands that fail are marked as
   189  // applied so they are not retried after recovery.
   190  func TestStoreRecoverWithErrors(t *testing.T) {
   191  	defer leaktest.AfterTest(t)()
   192  	storeCfg := kvserver.TestStoreConfig(nil)
   193  	// Splits can cause our chosen keys to end up on ranges other than range 1,
   194  	// and trying to handle that complicates the test without providing any
   195  	// added benefit.
   196  	storeCfg.TestingKnobs.DisableSplitQueue = true
   197  	eng := storage.NewDefaultInMem()
   198  	defer eng.Close()
   199  
   200  	numIncrements := 0
   201  
   202  	func() {
   203  		stopper := stop.NewStopper()
   204  		defer stopper.Stop(context.Background())
   205  		keyA := roachpb.Key("a")
   206  		storeCfg := storeCfg // copy
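        		// Count increments on keyA at evaluation time so that the test can
        		// assert below that the failing increment is evaluated exactly once
        		// and is not re-evaluated (replayed) after recovery.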
   207  		storeCfg.TestingKnobs.EvalKnobs.TestingEvalFilter =
   208  			func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
   209  				_, ok := filterArgs.Req.(*roachpb.IncrementRequest)
   210  				if ok && filterArgs.Req.Header().Key.Equal(keyA) {
   211  					numIncrements++
   212  				}
   213  				return nil
   214  			}
   215  		store := createTestStoreWithOpts(
   216  			t,
   217  			testStoreOpts{eng: eng, cfg: &storeCfg},
   218  			stopper)
   219  
   220  		// Write a bytes value so the increment will fail.
   221  		putArgs := putArgs(keyA, []byte("asdf"))
   222  		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), putArgs); err != nil {
   223  			t.Fatal(err)
   224  		}
   225  
   226  		// Try and fail to increment the key. It is important for this test that the
   227  		// failure be the last thing in the raft log when the store is stopped.
   228  		incArgs := incrementArgs(keyA, 42)
   229  		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err == nil {
   230  			t.Fatal("did not get expected error")
   231  		}
   232  	}()
   233  
   234  	if numIncrements != 1 {
   235  		t.Fatalf("expected 1 increment; was %d", numIncrements)
   236  	}
   237  
   238  	stopper := stop.NewStopper()
   239  	defer stopper.Stop(context.Background())
   240  
   241  	// Recover from the engine.
   242  	store := createTestStoreWithOpts(t,
   243  		testStoreOpts{
   244  			dontBootstrap: true,
   245  			eng:           eng,
   246  			cfg:           &storeCfg,
   247  		},
   248  		stopper)
   249  
   250  	// Issue a no-op write to lazily initialize raft on the range.
   251  	keyB := roachpb.Key("b")
   252  	incArgs := incrementArgs(keyB, 0)
   253  	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), incArgs); err != nil {
   254  		t.Fatal(err)
   255  	}
   256  
   257  	// No additional increments were performed on key A during recovery.
   258  	if numIncrements != 1 {
   259  		t.Fatalf("expected 1 increment; was %d", numIncrements)
   260  	}
   261  }
   262  
   263  // TestReplicateRange verifies basic replication functionality by creating two stores
   264  // and a range, replicating the range to the second store, and reading its data there.
   265  func TestReplicateRange(t *testing.T) {
   266  	defer leaktest.AfterTest(t)()
   267  	mtc := &multiTestContext{
   268  		// This test was written before the multiTestContext started creating many
   269  		// system ranges at startup, and hasn't been updated to take that into
   270  		// account.
   271  		startWithSingleRange: true,
   272  	}
   273  	defer mtc.Stop()
   274  	mtc.Start(t, 2)
   275  
   276  	// Issue a command on the first node before replicating.
   277  	incArgs := incrementArgs([]byte("a"), 5)
   278  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   279  		t.Fatal(err)
   280  	}
   281  
   282  	repl, err := mtc.stores[0].GetReplica(1)
   283  	if err != nil {
   284  		t.Fatal(err)
   285  	}
   286  
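        	// Replicate range 1 to the second store by issuing an ADD_REPLICA
        	// configuration change directly through the first store's replica.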
   287  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
   288  		NodeID:  mtc.stores[1].Ident.NodeID,
   289  		StoreID: mtc.stores[1].Ident.StoreID,
   290  	})
   291  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
   292  		t.Fatal(err)
   293  	}
   294  	// Verify no intent remains on range descriptor key.
   295  	key := keys.RangeDescriptorKey(repl.Desc().StartKey)
   296  	desc := roachpb.RangeDescriptor{}
   297  	if ok, err := storage.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), key,
   298  		mtc.stores[0].Clock().Now(), &desc, storage.MVCCGetOptions{}); err != nil {
   299  		t.Fatal(err)
   300  	} else if !ok {
   301  		t.Fatalf("range descriptor key %s was not found", key)
   302  	}
   303  	// Verify that in time, no intents remain on meta addressing
   304  	// keys, and that the range descriptor on the meta records is correct.
   305  	testutils.SucceedsSoon(t, func() error {
   306  		meta2 := keys.RangeMetaKey(roachpb.RKeyMax)
   307  		meta1 := keys.RangeMetaKey(meta2)
   308  		for _, key := range []roachpb.RKey{meta2, meta1} {
   309  			metaDesc := roachpb.RangeDescriptor{}
   310  			if ok, err := storage.MVCCGetProto(context.Background(), mtc.stores[0].Engine(), key.AsRawKey(),
   311  				mtc.stores[0].Clock().Now(), &metaDesc, storage.MVCCGetOptions{}); err != nil {
   312  				return err
   313  			} else if !ok {
   314  				return errors.Errorf("failed to resolve %s", key.AsRawKey())
   315  			}
   316  			if !reflect.DeepEqual(metaDesc, desc) {
   317  				return errors.Errorf("descs not equal: %+v != %+v", metaDesc, desc)
   318  			}
   319  		}
   320  		return nil
   321  	})
   322  
   323  	// Verify that the same data is available on the replica.
   324  	testutils.SucceedsSoon(t, func() error {
   325  		getArgs := getArgs([]byte("a"))
   326  		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
   327  			ReadConsistency: roachpb.INCONSISTENT,
   328  		}, getArgs); err != nil {
   329  			return errors.Errorf("failed to read data: %s", err)
   330  		} else if e, v := int64(5), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
   331  			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
   332  		}
   333  		return nil
   334  	})
   335  }
   336  
   337  // TestRestoreReplicas ensures that consensus group membership is properly
   338  // persisted to disk and restored when a node is stopped and restarted.
   339  func TestRestoreReplicas(t *testing.T) {
   340  	defer leaktest.AfterTest(t)()
   341  
   342  	t.Skip("https://github.com/cockroachdb/cockroach/issues/40351")
   343  
   344  	sc := kvserver.TestStoreConfig(nil)
   345  	// Disable periodic gossip activities. The periodic gossiping of the first
   346  	// range can cause spurious lease transfers which cause this test to fail.
   347  	sc.TestingKnobs.DisablePeriodicGossips = true
   348  	// Allow a replica to use the lease it had before a restart; we don't want
   349  	// this test to deal with needing to acquire new leases after the restart.
   350  	sc.TestingKnobs.DontPreventUseOfOldLeaseOnStart = true
   351  	mtc := &multiTestContext{
   352  		storeConfig: &sc,
   353  		// This test was written before the multiTestContext started creating many
   354  		// system ranges at startup, and hasn't been updated to take that into
   355  		// account.
   356  		startWithSingleRange: true,
   357  	}
   358  	defer mtc.Stop()
   359  	mtc.Start(t, 2)
   360  
   361  	firstRng, err := mtc.stores[0].GetReplica(1)
   362  	if err != nil {
   363  		t.Fatal(err)
   364  	}
   365  
   366  	// Perform an increment before replication to ensure that commands are not
   367  	// repeated on restarts.
   368  	incArgs := incrementArgs([]byte("a"), 23)
   369  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   370  		t.Fatal(err)
   371  	}
   372  
   373  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
   374  		NodeID:  mtc.stores[1].Ident.NodeID,
   375  		StoreID: mtc.stores[1].Ident.StoreID,
   376  	})
   377  	if _, err := firstRng.ChangeReplicas(context.Background(), firstRng.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
   378  		t.Fatal(err)
   379  	}
   380  
   381  	mtc.restart()
   382  
   383  	// Send a command on each store. The original store (still the lease holder)
   384  	// will succeed.
   385  	incArgs = incrementArgs([]byte("a"), 5)
   386  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   387  		t.Fatal(err)
   388  	}
   389  	// The follower will return a not lease holder error, indicating the command
   390  	// should be forwarded to the lease holder.
   391  	incArgs = incrementArgs([]byte("a"), 11)
   392  	{
   393  		_, pErr := kv.SendWrapped(context.Background(), mtc.stores[1].TestSender(), incArgs)
   394  		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok {
   395  			t.Fatalf("expected not lease holder error; got %s", pErr)
   396  		}
   397  	}
   398  	// Send again, this time to the first store.
   399  	if _, pErr := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); pErr != nil {
   400  		t.Fatal(pErr)
   401  	}
   402  
   403  	testutils.SucceedsSoon(t, func() error {
   404  		getArgs := getArgs([]byte("a"))
   405  		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
   406  			ReadConsistency: roachpb.INCONSISTENT,
   407  		}, getArgs); err != nil {
   408  			return errors.Errorf("failed to read data: %s", err)
   409  		} else if e, v := int64(39), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
   410  			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
   411  		}
   412  		return nil
   413  	})
   414  
   415  	// Both replicas should have a complete list in Desc.Replicas.
   416  	for i, store := range mtc.stores {
   417  		repl, err := store.GetReplica(1)
   418  		if err != nil {
   419  			t.Fatal(err)
   420  		}
   421  		desc := repl.Desc()
   422  		if len(desc.InternalReplicas) != 2 {
   423  			t.Fatalf("store %d: expected 2 replicas, found %d", i, len(desc.InternalReplicas))
   424  		}
   425  		if desc.InternalReplicas[0].NodeID != mtc.stores[0].Ident.NodeID {
   426  			t.Errorf("store %d: expected replica[0].NodeID == %d, was %d",
   427  				i, mtc.stores[0].Ident.NodeID, desc.InternalReplicas[0].NodeID)
   428  		}
   429  	}
   430  }
   431  
   432  // TODO(bdarnell): more aggressive testing here; especially with
   433  // proposer-evaluated KV this test does much less, as it doesn't exercise
   434  // the path in which the replica change fails at *apply* time (we only
   435  // test the fail-fast path), in which case the replica change isn't even
   436  // proposed.
   437  func TestFailedReplicaChange(t *testing.T) {
   438  	defer leaktest.AfterTest(t)()
   439  
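        	// runFilter gates the error-injecting eval filter below. It is stored in
        	// an atomic.Value so the test can later disable the injection without
        	// racing concurrent command evaluation.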
   440  	var runFilter atomic.Value
   441  	runFilter.Store(true)
   442  
   443  	sc := kvserver.TestStoreConfig(nil)
   444  	sc.Clock = nil // manual clock
   445  	sc.TestingKnobs.EvalKnobs.TestingEvalFilter = func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
   446  		if runFilter.Load().(bool) {
   447  			if et, ok := filterArgs.Req.(*roachpb.EndTxnRequest); ok && et.Commit {
   448  				return roachpb.NewErrorWithTxn(errors.Errorf("boom"), filterArgs.Hdr.Txn)
   449  			}
   450  		}
   451  		return nil
   452  	}
   453  	mtc := &multiTestContext{storeConfig: &sc}
   454  	defer mtc.Stop()
   455  	mtc.Start(t, 2)
   456  
   457  	repl, err := mtc.stores[0].GetReplica(1)
   458  	if err != nil {
   459  		t.Fatal(err)
   460  	}
   461  
   462  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
   463  		NodeID:  mtc.stores[1].Ident.NodeID,
   464  		StoreID: mtc.stores[1].Ident.StoreID,
   465  	})
   466  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); !testutils.IsError(err, "boom") {
   467  		t.Fatalf("did not get expected error: %+v", err)
   468  	}
   469  
   470  	// After the aborted transaction, r.Desc was not updated.
   471  	// TODO(bdarnell): expose and inspect raft's internal state.
   472  	if replicas := repl.Desc().InternalReplicas; len(replicas) != 1 {
   473  		t.Fatalf("expected 1 replica, found %v", replicas)
   474  	}
   475  
   476  	// The pending config change flag was cleared, so a subsequent attempt
   477  	// can succeed.
   478  	runFilter.Store(false)
   479  
   480  	// The first failed replica change has laid down intents. Make sure those
   481  	// are pushable by advancing the clock so the transaction looks abandoned.
   482  	mtc.manualClock.Increment(10 * base.DefaultTxnHeartbeatInterval.Nanoseconds())
   483  
   484  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
   485  		t.Fatal(err)
   486  	}
   487  
   488  	// Wait for the range to sync to both replicas (mainly so leaktest doesn't
   489  	// complain about goroutines involved in the process).
   490  	testutils.SucceedsSoon(t, func() error {
   491  		for _, store := range mtc.stores {
   492  			rang, err := store.GetReplica(1)
   493  			if err != nil {
   494  				return err
   495  			}
   496  			if replicas := rang.Desc().InternalReplicas; len(replicas) <= 1 {
   497  				return errors.Errorf("expected > 1 replicas; got %v", replicas)
   498  			}
   499  		}
   500  		return nil
   501  	})
   502  }
   503  
   504  // TestReplicateAfterTruncation verifies that a new replica can be brought up from a snapshot after the old log entries have been truncated.
   505  func TestReplicateAfterTruncation(t *testing.T) {
   506  	defer leaktest.AfterTest(t)()
   507  	mtc := &multiTestContext{
   508  		// This test was written before the multiTestContext started creating many
   509  		// system ranges at startup, and hasn't been updated to take that into
   510  		// account.
   511  		startWithSingleRange: true,
   512  	}
   513  	defer mtc.Stop()
   514  	mtc.Start(t, 2)
   515  
   516  	repl, err := mtc.stores[0].GetReplica(1)
   517  	if err != nil {
   518  		t.Fatal(err)
   519  	}
   520  
   521  	// Issue a command on the first node before replicating.
   522  	incArgs := incrementArgs([]byte("a"), 5)
   523  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   524  		t.Fatal(err)
   525  	}
   526  
   527  	// Get that command's log index.
   528  	index, err := repl.GetLastIndex()
   529  	if err != nil {
   530  		t.Fatal(err)
   531  	}
   532  
   533  	// Truncate the log at index+1 (log entries < index+1 are removed, so this includes
   534  	// the increment).
   535  	truncArgs := truncateLogArgs(index+1, 1)
   536  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
   537  		t.Fatal(err)
   538  	}
   539  
   540  	// Issue a second command post-truncation.
   541  	incArgs = incrementArgs([]byte("a"), 11)
   542  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   543  		t.Fatal(err)
   544  	}
   545  
   546  	// Now add the second replica.
   547  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
   548  		NodeID:  mtc.stores[1].Ident.NodeID,
   549  		StoreID: mtc.stores[1].Ident.StoreID,
   550  	})
   551  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
   552  		t.Fatal(err)
   553  	}
   554  
   555  	// Once it catches up, the effects of both commands can be seen.
   556  	testutils.SucceedsSoon(t, func() error {
   557  		getArgs := getArgs([]byte("a"))
   558  		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
   559  			ReadConsistency: roachpb.INCONSISTENT,
   560  		}, getArgs); err != nil {
   561  			return errors.Errorf("failed to read data: %s", err)
   562  		} else if e, v := int64(16), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
   563  			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
   564  		}
   565  		return nil
   566  	})
   567  
   568  	repl2, err := mtc.stores[1].GetReplica(1)
   569  	if err != nil {
   570  		t.Fatal(err)
   571  	}
   572  
   573  	testutils.SucceedsSoon(t, func() error {
   574  		if mvcc, mvcc2 := repl.GetMVCCStats(), repl2.GetMVCCStats(); mvcc2 != mvcc {
   575  			return errors.Errorf("expected stats on new range:\n%+v\nnot equal old:\n%+v", mvcc2, mvcc)
   576  		}
   577  		return nil
   578  	})
   579  
   580  	// Send a third command to verify that the log states are synced up so the
   581  	// new node can accept new commands.
   582  	incArgs = incrementArgs([]byte("a"), 23)
   583  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   584  		t.Fatal(err)
   585  	}
   586  
   587  	testutils.SucceedsSoon(t, func() error {
   588  		getArgs := getArgs([]byte("a"))
   589  		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
   590  			ReadConsistency: roachpb.INCONSISTENT,
   591  		}, getArgs); err != nil {
   592  			return errors.Errorf("failed to read data: %s", err)
   593  		} else if e, v := int64(39), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
   594  			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
   595  		}
   596  		return nil
   597  	})
   598  }
   599  
   600  func TestRaftLogSizeAfterTruncation(t *testing.T) {
   601  	defer leaktest.AfterTest(t)()
   602  	mtc := &multiTestContext{
   603  		// This test was written before the multiTestContext started creating many
   604  		// system ranges at startup, and hasn't been updated to take that into
   605  		// account.
   606  		startWithSingleRange: true,
   607  	}
   608  	defer mtc.Stop()
   609  	mtc.Start(t, 1)
   610  
   611  	const rangeID = 1
   612  
   613  	repl, err := mtc.stores[0].GetReplica(rangeID)
   614  	if err != nil {
   615  		t.Fatal(err)
   616  	}
   617  
   618  	key := []byte("a")
   619  	incArgs := incrementArgs(key, 5)
   620  	if _, err := kv.SendWrapped(
   621  		context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   622  		t.Fatal(err)
   623  	}
   624  
   625  	index, err := repl.GetLastIndex()
   626  	if err != nil {
   627  		t.Fatal(err)
   628  	}
   629  
   630  	// Verifies the recomputed log size against what we track in `r.mu.raftLogSize`.
   631  	assertCorrectRaftLogSize := func() error {
   632  		// Recompute under raft lock so that the log doesn't change while we
   633  		// compute its size.
   634  		repl.RaftLock()
   635  		realSize, err := kvserver.ComputeRaftLogSize(
   636  			context.Background(), repl.RangeID, repl.Engine(), repl.SideloadedRaftMuLocked(),
   637  		)
   638  		size, _ := repl.GetRaftLogSize()
   639  		repl.RaftUnlock()
   640  
   641  		if err != nil {
   642  			t.Fatal(err)
   643  		}
   644  
   645  		// If the size isn't trusted, it won't have to match (and in fact
   646  		// likely won't). In this test, this is because the upreplication
   647  		// elides old Raft log entries in the snapshot it uses.
   648  		if size != realSize {
   649  			return fmt.Errorf("%s: raft log claims size %d, but is in fact %d", repl, size, realSize)
   650  		}
   651  		return nil
   652  	}
   653  
   654  	assert.NoError(t, assertCorrectRaftLogSize())
   655  
   656  	truncArgs := truncateLogArgs(index+1, 1)
   657  	if _, err := kv.SendWrapped(
   658  		context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
   659  		t.Fatal(err)
   660  	}
   661  
   662  	// Note that if there were multiple nodes, the Raft log sizes would not
   663  	// be correct for the followers as they would have received a shorter
   664  	// Raft log than the leader.
   665  	assert.NoError(t, assertCorrectRaftLogSize())
   666  }
   667  
   668  // TestSnapshotAfterTruncation tests that Raft will properly send a
   669  // non-preemptive snapshot when a node is brought up and the log has been
   670  // truncated.
   671  func TestSnapshotAfterTruncation(t *testing.T) {
   672  	defer leaktest.AfterTest(t)()
   673  	for _, changeTerm := range []bool{false, true} {
   674  		name := "sameTerm"
   675  		if changeTerm {
   676  			name = "differentTerm"
   677  		}
   678  		t.Run(name, func(t *testing.T) {
   679  			mtc := &multiTestContext{
   680  				// This test was written before the multiTestContext started creating many
   681  				// system ranges at startup, and hasn't been updated to take that into
   682  				// account.
   683  				startWithSingleRange: true,
   684  			}
   685  			defer mtc.Stop()
   686  			mtc.Start(t, 3)
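        			// stoppedStore is the store that is stopped and later restarted;
        			// after the log truncation below it can only catch up via a Raft
        			// snapshot.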
   687  			const stoppedStore = 1
   688  			repl0, err := mtc.stores[0].GetReplica(1)
   689  			if err != nil {
   690  				t.Fatal(err)
   691  			}
   692  
   693  			key := roachpb.Key("a")
   694  			incA := int64(5)
   695  			incB := int64(7)
   696  			incAB := incA + incB
   697  
   698  			// Set up a key to replicate across the cluster. We're going to modify this
   699  			// key and truncate the raft logs from that command after killing one of the
   700  			// nodes to check that it gets the new value after it comes up.
   701  			incArgs := incrementArgs(key, incA)
   702  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   703  				t.Fatal(err)
   704  			}
   705  
   706  			mtc.replicateRange(1, 1, 2)
   707  			mtc.waitForValues(key, []int64{incA, incA, incA})
   708  
   709  			// Now kill one store, increment the key on the other stores and truncate
   710  			// their logs to make sure that when store 1 comes back up it will require a
   711  			// non-preemptive snapshot from Raft.
   712  			mtc.stopStore(stoppedStore)
   713  
   714  			incArgs = incrementArgs(key, incB)
   715  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
   716  				t.Fatal(err)
   717  			}
   718  
   719  			mtc.waitForValues(key, []int64{incAB, incA, incAB})
   720  
   721  			index, err := repl0.GetLastIndex()
   722  			if err != nil {
   723  				t.Fatal(err)
   724  			}
   725  
   726  			// Truncate the log at index+1 (log entries < index+1 are removed, so this
   727  			// includes the increment).
   728  			truncArgs := truncateLogArgs(index+1, 1)
   729  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
   730  				t.Fatal(err)
   731  			}
   732  
   733  			if changeTerm {
   734  				for i := range mtc.stores {
   735  					if i != stoppedStore {
   736  						// Stop and restart all the live stores, which guarantees that
   737  						// we won't be in the same term we started with.
   738  						mtc.stopStore(i)
   739  						mtc.restartStore(i)
   740  						// Disable the snapshot queue on the live stores so that
   741  						// stoppedStore won't get a snapshot as soon as it starts
   742  						// back up.
   743  						mtc.stores[i].SetRaftSnapshotQueueActive(false)
   744  					}
   745  				}
   746  
   747  				// Restart the stopped store and wait for raft
   748  				// election/heartbeat traffic to settle down. Specifically, we
   749  				// need stoppedStore to know about the new term number before
   750  				// the snapshot is sent to reproduce #13506. If the snapshot
   751  				// happened before it learned the term, it would accept the
   752  				// snapshot no matter what term it contained.
   753  				//
   754  				// We do not wait for the store to successfully heartbeat
   755  				// because it is not expected to succeed in cases where the
   756  				// other two stores have already completed their leader
   757  				// election. In this case, a successful heartbeat won't be
   758  				// possible until we re-enable snapshots.
   759  				mtc.restartStoreWithoutHeartbeat(stoppedStore)
   760  				testutils.SucceedsSoon(t, func() error {
   761  					hasLeader := false
   762  					term := uint64(0)
   763  					for i := range mtc.stores {
   764  						repl, err := mtc.stores[i].GetReplica(1)
   765  						if err != nil {
   766  							return err
   767  						}
   768  						status := repl.RaftStatus()
   769  						if status == nil {
   770  							return errors.New("raft status not initialized")
   771  						}
   772  						if status.RaftState == raft.StateLeader {
   773  							hasLeader = true
   774  						}
   775  						if term == 0 {
   776  							term = status.Term
   777  						} else if status.Term != term {
   778  							return errors.Errorf("terms do not agree: %d vs %d", status.Term, term)
   779  						}
   780  					}
   781  					if !hasLeader {
   782  						return errors.New("no leader")
   783  					}
   784  					return nil
   785  				})
   786  
   787  				// Turn the queues back on and wait for the snapshot to be sent and processed.
   788  				for i, store := range mtc.stores {
   789  					if i != stoppedStore {
   790  						store.SetRaftSnapshotQueueActive(true)
   791  						if err := store.ForceRaftSnapshotQueueProcess(); err != nil {
   792  							t.Fatal(err)
   793  						}
   794  					}
   795  				}
   796  			} else { // !changeTerm
   797  				mtc.restartStore(stoppedStore)
   798  			}
   799  			mtc.waitForValues(key, []int64{incAB, incAB, incAB})
   800  
   801  			testutils.SucceedsSoon(t, func() error {
   802  				// Verify that the cached index and term (Replica.mu.last{Index,Term})
   803  				// on all of the replicas is the same. #18327 fixed an issue where the
   804  				// cached term was left unchanged after applying a snapshot leading to a
   805  				// persistently unavailable range.
   806  				repl0, err = mtc.stores[0].GetReplica(1)
   807  				if err != nil {
   808  					t.Fatal(err)
   809  				}
   810  				expectedLastIndex, _ := repl0.GetLastIndex()
   811  				expectedLastTerm := repl0.GetCachedLastTerm()
   812  
   813  				for i := 1; i < len(mtc.stores); i++ {
   814  					repl1, err := mtc.stores[i].GetReplica(1)
   815  					if err != nil {
   816  						return err
   817  					}
   818  					if lastIndex, _ := repl1.GetLastIndex(); expectedLastIndex != lastIndex {
   819  						return fmt.Errorf("%d: expected last index %d, but found %d", i, expectedLastIndex, lastIndex)
   820  					}
   821  					if lastTerm := repl1.GetCachedLastTerm(); expectedLastTerm != lastTerm {
   822  						return fmt.Errorf("%d: expected last term %d, but found %d", i, expectedLastTerm, lastTerm)
   823  					}
   824  				}
   825  				return nil
   826  			})
   827  		})
   828  	}
   829  }
   830  
   831  // TestSnapshotAfterTruncationWithUncommittedTail is similar in spirit to
   832  // TestSnapshotAfterTruncation/differentTerm. However, it differs in that we
   833  // take care to ensure that the partitioned Replica has a long uncommitted tail
   834  // of Raft entries that is not entirely overwritten by the snapshot it receives
   835  // after the partition heals. If the recipient of the snapshot did not purge its
   836  // Raft entry cache when receiving the snapshot, it could get stuck repeatedly
   837  // rejecting attempts to catch it up. This serves as a regression test for the
   838  // bug seen in #37056.
   839  func TestSnapshotAfterTruncationWithUncommittedTail(t *testing.T) {
   840  	defer leaktest.AfterTest(t)()
   841  	ctx := context.Background()
   842  	mtc := &multiTestContext{
   843  		// This test was written before the multiTestContext started creating many
   844  		// system ranges at startup, and hasn't been updated to take that into
   845  		// account.
   846  		startWithSingleRange: true,
   847  	}
   848  	defer mtc.Stop()
   849  	mtc.Start(t, 3)
   850  
   851  	key := roachpb.Key("a")
   852  	incA := int64(5)
   853  	incB := int64(7)
   854  	incC := int64(9)
   855  	incAB := incA + incB
   856  	incABC := incAB + incC
   857  
   858  	// Set up a key to replicate across the cluster. We're going to modify this
   859  	// key and truncate the raft logs from that command after partitioning one
   860  	// of the nodes to check that it gets the new value after it reconnects.
   861  	// We're then going to continue modifying this key to make sure that the
   862  	// temporarily partitioned node can continue to receive updates.
   863  	incArgs := incrementArgs(key, incA)
   864  	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), incArgs); pErr != nil {
   865  		t.Fatal(pErr)
   866  	}
   867  
   868  	mtc.replicateRange(1, 1, 2)
   869  	mtc.waitForValues(key, []int64{incA, incA, incA})
   870  
   871  	// We partition the original leader from the other two replicas. This allows
   872  	// us to build up a large uncommitted Raft log on the partitioned node.
   873  	const partStore = 0
   874  	partRepl, err := mtc.stores[partStore].GetReplica(1)
   875  	if err != nil {
   876  		t.Fatal(err)
   877  	}
   878  	partReplDesc, err := partRepl.GetReplicaDescriptor()
   879  	if err != nil {
   880  		t.Fatal(err)
   881  	}
   882  	partReplSender := mtc.stores[partStore].TestSender()
   883  
   884  	// Partition the original leader from its followers. We do this by installing
   885  	// unreliableRaftHandler listeners on all three Stores. The handler on the
   886  	// partitioned store filters out all messages while the handler on the other
   887  	// two stores only filters out messages from the partitioned store. The
   888  	// configuration looks like:
   889  	//
   890  	//           [0]
   891  	//          x  x
   892  	//         /    \
   893  	//        x      x
   894  	//      [1]<---->[2]
   895  	//
   896  	for _, s := range []int{0, 1, 2} {
   897  		h := &unreliableRaftHandler{rangeID: 1, RaftMessageHandler: mtc.stores[s]}
   898  		if s != partStore {
   899  			// Only filter messages from the partitioned store on the other
   900  			// two stores.
   901  			h.dropReq = func(req *kvserver.RaftMessageRequest) bool {
   902  				return req.FromReplica.StoreID == partRepl.StoreID()
   903  			}
   904  			h.dropHB = func(hb *kvserver.RaftHeartbeat) bool {
   905  				return hb.FromReplicaID == partReplDesc.ReplicaID
   906  			}
   907  		}
   908  		mtc.transport.Listen(mtc.stores[s].Ident.StoreID, h)
   909  	}
   910  
   911  	// Perform a series of writes on the partitioned replica. The writes will
   912  	// not succeed before their context is canceled, but they will be appended
   913  	// to the partitioned replica's Raft log because it is currently the Raft
   914  	// leader.
   915  	g := ctxgroup.WithContext(ctx)
   916  	for i := 0; i < 32; i++ {
   917  		otherKey := roachpb.Key(fmt.Sprintf("other-%d", i))
   918  		g.GoCtx(func(ctx context.Context) error {
   919  			cCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond)
   920  			defer cancel()
   921  			incArgsOther := incrementArgs(otherKey, 1)
   922  			if _, pErr := kv.SendWrapped(cCtx, partReplSender, incArgsOther); pErr == nil {
   923  				return errors.New("unexpected success")
   924  			} else if !testutils.IsPError(pErr, "context deadline exceeded") {
   925  				return pErr.GoError()
   926  			}
   927  			return nil
   928  		})
   929  	}
   930  	if err := g.Wait(); err != nil {
   931  		t.Fatal(err)
   932  	}
   933  
   934  	// Transfer the lease to one of the followers and perform a write. The
   935  	// partition ensures that this will require a Raft leadership change.
   936  	const newLeaderStore = partStore + 1
   937  	newLeaderRepl, err := mtc.stores[newLeaderStore].GetReplica(1)
   938  	if err != nil {
   939  		t.Fatal(err)
   940  	}
   941  	newLeaderReplSender := mtc.stores[newLeaderStore].TestSender()
   942  
   943  	incArgs = incrementArgs(key, incB)
   944  	testutils.SucceedsSoon(t, func() error {
   945  		mtc.advanceClock(ctx)
   946  		_, pErr := kv.SendWrapped(ctx, newLeaderReplSender, incArgs)
   947  		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); ok {
   948  			return pErr.GoError()
   949  		} else if pErr != nil {
   950  			t.Fatal(pErr)
   951  		}
   952  		return nil
   953  	})
   954  	mtc.waitForValues(key, []int64{incA, incAB, incAB})
   955  
   956  	index, err := newLeaderRepl.GetLastIndex()
   957  	if err != nil {
   958  		t.Fatal(err)
   959  	}
   960  
   961  	// Truncate the log at index+1 (log entries < index+1 are removed, so this
   962  	// includes the increment).
   963  	truncArgs := truncateLogArgs(index+1, 1)
   964  	testutils.SucceedsSoon(t, func() error {
   965  		mtc.advanceClock(ctx)
   966  		_, pErr := kv.SendWrapped(ctx, newLeaderReplSender, truncArgs)
   967  		if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); ok {
   968  			return pErr.GoError()
   969  		} else if pErr != nil {
   970  			t.Fatal(pErr)
   971  		}
   972  		return nil
   973  	})
   974  
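        	// Capture the applied-snapshot count before healing the partition so that
        	// the check below can detect that at least one Raft snapshot was applied.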
   975  	snapsMetric := mtc.stores[partStore].Metrics().RangeSnapshotsNormalApplied
   976  	snapsBefore := snapsMetric.Count()
   977  
   978  	// Remove the partition. Snapshot should follow.
   979  	for _, s := range []int{0, 1, 2} {
   980  		mtc.transport.Listen(mtc.stores[s].Ident.StoreID, &unreliableRaftHandler{
   981  			rangeID:            1,
   982  			RaftMessageHandler: mtc.stores[s],
   983  			unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{
   984  				dropReq: func(req *kvserver.RaftMessageRequest) bool {
   985  					// Make sure that even going forward no MsgApp for what we just truncated can
   986  					// make it through. The Raft transport is asynchronous so this is necessary
   987  					// to make the test pass reliably.
   988  					// NB: the Index on the message is the log index that _precedes_ any of the
   989  					// entries in the MsgApp, so filter where msg.Index < index, not <= index.
   990  					return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
   991  				},
   992  				dropHB:   func(*kvserver.RaftHeartbeat) bool { return false },
   993  				dropResp: func(*kvserver.RaftMessageResponse) bool { return false },
   994  			},
   995  		})
   996  	}
   997  
   998  	// The partitioned replica should catch up after a snapshot.
   999  	testutils.SucceedsSoon(t, func() error {
  1000  		snapsAfter := snapsMetric.Count()
  1001  		if !(snapsAfter > snapsBefore) {
  1002  			return errors.New("expected at least 1 snapshot to catch the partitioned replica up")
  1003  		}
  1004  		return nil
  1005  	})
  1006  	mtc.waitForValues(key, []int64{incAB, incAB, incAB})
  1007  
  1008  	// Perform another write. The partitioned replica should be able to receive
  1009  	// replicated updates.
  1010  	incArgs = incrementArgs(key, incC)
  1011  	if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], incArgs); pErr != nil {
  1012  		t.Fatal(pErr)
  1013  	}
  1014  	mtc.waitForValues(key, []int64{incABC, incABC, incABC})
  1015  }
  1016  
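        // fakeSnapshotStream is a minimal test double for the snapshot response
        // stream: Recv returns a canned request/error pair, while Send and Context
        // are no-ops. It lets a test inject a failure at a precise point in the
        // snapshot handshake, as TestFailedSnapshotFillsReservation does below.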
  1017  type fakeSnapshotStream struct {
  1018  	nextReq *kvserver.SnapshotRequest
  1019  	nextErr error
  1020  }
  1021  
  1022  // Recv implements the SnapshotResponseStream interface.
  1023  func (c fakeSnapshotStream) Recv() (*kvserver.SnapshotRequest, error) {
  1024  	return c.nextReq, c.nextErr
  1025  }
  1026  
  1027  // Send implements the SnapshotResponseStream interface.
  1028  func (c fakeSnapshotStream) Send(request *kvserver.SnapshotResponse) error {
  1029  	return nil
  1030  }
  1031  
  1032  // Context implements the SnapshotResponseStream interface.
  1033  func (c fakeSnapshotStream) Context() context.Context {
  1034  	return context.Background()
  1035  }
  1036  
  1037  // TestFailedSnapshotFillsReservation tests that failing to finish applying an
  1038  // incoming snapshot still cleans up the outstanding reservation that was made.
  1039  func TestFailedSnapshotFillsReservation(t *testing.T) {
  1040  	defer leaktest.AfterTest(t)()
  1041  	mtc := &multiTestContext{}
  1042  	defer mtc.Stop()
  1043  	mtc.Start(t, 3)
  1044  
  1045  	rep, err := mtc.stores[0].GetReplica(1)
  1046  	require.NoError(t, err)
  1047  	repDesc, err := rep.GetReplicaDescriptor()
  1048  	require.NoError(t, err)
  1049  	desc := protoutil.Clone(rep.Desc()).(*roachpb.RangeDescriptor)
  1050  	desc.AddReplica(2, 2, roachpb.LEARNER)
  1051  	rep2Desc, found := desc.GetReplicaDescriptor(2)
  1052  	require.True(t, found)
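        	// Build a snapshot header addressed from the existing replica to the
        	// learner replica just added to the cloned descriptor. CanDecline and
        	// RangeSize feed the receiving store's reservation bookkeeping, which
        	// the test expects to be released when the stream fails.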
  1053  	header := kvserver.SnapshotRequest_Header{
  1054  		CanDecline: true,
  1055  		RangeSize:  100,
  1056  		State:      kvserverpb.ReplicaState{Desc: desc},
  1057  		RaftMessageRequest: kvserver.RaftMessageRequest{
  1058  			RangeID:     rep.RangeID,
  1059  			FromReplica: repDesc,
  1060  			ToReplica:   rep2Desc,
  1061  		},
  1062  	}
  1063  	header.RaftMessageRequest.Message.Snapshot.Data = uuid.UUID{}.GetBytes()
  1064  	// Cause this stream to return an error as soon as we ask it for something.
  1065  	// This injects an error into HandleSnapshotStream when we try to send the
  1066  	// "snapshot accepted" message.
  1067  	expectedErr := errors.Errorf("")
  1068  	stream := fakeSnapshotStream{nil, expectedErr}
  1069  	if err := mtc.stores[1].HandleSnapshot(&header, stream); !errors.Is(err, expectedErr) {
  1070  		t.Fatalf("expected error %s, but found %v", expectedErr, err)
  1071  	}
  1072  	if n := mtc.stores[1].ReservationCount(); n != 0 {
  1073  		t.Fatalf("expected 0 reservations, but found %d", n)
  1074  	}
  1075  }
  1076  
  1077  // TestConcurrentRaftSnapshots tests that snapshots still work correctly when
  1078  // Raft requests multiple non-preemptive snapshots concurrently, as happens
  1079  // when two replicas need snapshots at the same time.
  1080  func TestConcurrentRaftSnapshots(t *testing.T) {
  1081  	defer leaktest.AfterTest(t)()
  1082  	// This test relies on concurrently waiting for a value to change in the
  1083  	// underlying engine(s). Since the teeing engine does not respond well to
  1084  	// value mismatches, whether transient or permanent, skip this test if the
  1085  	// teeing engine is being used. See
  1086  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  1087  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  1088  		t.Skip("disabled on teeing engine")
  1089  	}
  1090  
  1091  	mtc := &multiTestContext{
  1092  		// This test was written before the multiTestContext started creating many
  1093  		// system ranges at startup, and hasn't been updated to take that into
  1094  		// account.
  1095  		startWithSingleRange: true,
  1096  	}
  1097  	defer mtc.Stop()
  1098  	mtc.Start(t, 5)
  1099  	repl, err := mtc.stores[0].GetReplica(1)
  1100  	if err != nil {
  1101  		t.Fatal(err)
  1102  	}
  1103  
  1104  	key := roachpb.Key("a")
  1105  	incA := int64(5)
  1106  	incB := int64(7)
  1107  	incAB := incA + incB
  1108  
  1109  	// Set up a key to replicate across the cluster. We're going to modify this
  1110  	// key and truncate the raft logs from that command after killing one of the
  1111  	// nodes to check that it gets the new value after it comes up.
  1112  	incArgs := incrementArgs(key, incA)
  1113  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1114  		t.Fatal(err)
  1115  	}
  1116  
  1117  	mtc.replicateRange(1, 1, 2, 3, 4)
  1118  	mtc.waitForValues(key, []int64{incA, incA, incA, incA, incA})
  1119  
  1120  	// Now kill stores 1 and 2, increment the key on the other stores, and
  1121  	// truncate their logs to make sure that when stores 1 and 2 come back up
  1122  	// they will each require a non-preemptive snapshot from Raft.
  1123  	mtc.stopStore(1)
  1124  	mtc.stopStore(2)
  1125  
  1126  	incArgs = incrementArgs(key, incB)
  1127  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1128  		t.Fatal(err)
  1129  	}
  1130  
  1131  	mtc.waitForValues(key, []int64{incAB, incA, incA, incAB, incAB})
  1132  
  1133  	index, err := repl.GetLastIndex()
  1134  	if err != nil {
  1135  		t.Fatal(err)
  1136  	}
  1137  
  1138  	// Truncate the log at index+1 (log entries < index+1 are removed, so this
  1139  	// includes the increment).
  1140  	truncArgs := truncateLogArgs(index+1, 1)
  1141  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
  1142  		t.Fatal(err)
  1143  	}
  1144  	mtc.restartStore(1)
  1145  	mtc.restartStore(2)
  1146  
  1147  	mtc.waitForValues(key, []int64{incAB, incAB, incAB, incAB, incAB})
  1148  }
  1149  
  1150  // TestReplicateAfterRemoveAndSplit tests a scenario where a replica is removed
  1151  // from a down node, the associated range is split, the node restarts, and we
  1152  // try to replicate the RHS of the split range back to the restarted node.
  1153  func TestReplicateAfterRemoveAndSplit(t *testing.T) {
  1154  	defer leaktest.AfterTest(t)()
  1155  
  1156  	sc := kvserver.TestStoreConfig(nil)
  1157  	sc.TestingKnobs.DisableMergeQueue = true
  1158  	sc.TestingKnobs.DisableReplicateQueue = true
  1159  	// Disable the replica GC queue so that it doesn't accidentally pick up the
  1160  	// removed replica and GC it. We'll explicitly enable it later in the test.
  1161  	sc.TestingKnobs.DisableReplicaGCQueue = true
  1162  	// Disable eager replica removal so we can manually remove the replica.
  1163  	sc.TestingKnobs.DisableEagerReplicaRemoval = true
  1164  	sc.Clock = nil // manual clock
  1165  	mtc := &multiTestContext{
  1166  		storeConfig: &sc,
  1167  		// This test was written before the multiTestContext started creating many
  1168  		// system ranges at startup, and hasn't been updated to take that into
  1169  		// account.
  1170  		startWithSingleRange: true,
  1171  	}
  1172  	defer mtc.Stop()
  1173  	mtc.Start(t, 3)
  1174  	rep1, err := mtc.stores[0].GetReplica(1)
  1175  	if err != nil {
  1176  		t.Fatal(err)
  1177  	}
  1178  
  1179  	const rangeID = roachpb.RangeID(1)
  1180  	mtc.replicateRange(rangeID, 1, 2)
  1181  
  1182  	// Kill store 2.
  1183  	mtc.stopStore(2)
  1184  
  1185  	// Remove store 2 from the range to simulate removal of a dead node.
  1186  	mtc.unreplicateRange(rangeID, 2)
  1187  
  1188  	// Split the range.
  1189  	splitKey := roachpb.Key("m")
  1190  	splitArgs := adminSplitArgs(splitKey)
  1191  	if _, err := rep1.AdminSplit(context.Background(), *splitArgs, "test"); err != nil {
  1192  		t.Fatal(err)
  1193  	}
  1194  
  1195  	mtc.advanceClock(context.Background())
  1196  
  1197  	// Restart store 2.
  1198  	mtc.restartStore(2)
  1199  
  1200  	replicateRHS := func() error {
  1201  		// Try to up-replicate the RHS of the split to store 2. We can't use
  1202  		// replicateRange because this should fail on the first attempt and then
  1203  		// eventually succeed.
  1204  		startKey := roachpb.RKey(splitKey)
  1205  
  1206  		var desc roachpb.RangeDescriptor
  1207  		if err := mtc.dbs[0].GetProto(context.Background(), keys.RangeDescriptorKey(startKey), &desc); err != nil {
  1208  			t.Fatal(err)
  1209  		}
  1210  
  1211  		rep2, err := mtc.findMemberStoreLocked(desc).GetReplica(desc.RangeID)
  1212  		if err != nil {
  1213  			t.Fatal(err)
  1214  		}
  1215  
  1216  		chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
  1217  			NodeID:  mtc.stores[2].Ident.NodeID,
  1218  			StoreID: mtc.stores[2].Ident.StoreID,
  1219  		})
  1220  		_, err = rep2.ChangeReplicas(context.Background(), &desc, kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs)
  1221  		return err
  1222  	}
  1223  
  1224  	if err := replicateRHS(); !testutils.IsError(err, kvserver.IntersectingSnapshotMsg) {
  1225  		t.Fatalf("unexpected error %v", err)
  1226  	}
  1227  
  1228  	// Enable the replica GC queue so that the next attempt to replicate the RHS
  1229  	// to store 2 will cause the obsolete replica to be GC'd, allowing a
  1230  	// subsequent replication to succeed.
  1231  	mtc.stores[2].SetReplicaGCQueueActive(true)

        	testutils.SucceedsSoon(t, replicateRHS)
  1232  }
  1233  
  1234  // TestRefreshPendingCommands tests various mechanisms for refreshing pending commands.
  1235  func TestRefreshPendingCommands(t *testing.T) {
  1236  	defer leaktest.AfterTest(t)()
  1237  
  1238  	// In this scenario, three different mechanisms detect the need to repropose
  1239  	// commands. Test that each one is sufficient individually. We have this
  1240  	// redundancy because some mechanisms respond with lower latency than others,
  1241  	// but each has some scenarios (not currently tested) in which it is
  1242  	// insufficient on its own. In addition, there is a fourth reproposal
  1243  	// mechanism (reasonNewLeaderOrConfigChange) which is not relevant to this
  1244  	// scenario.
  1245  	//
  1246  	// We don't test with only reasonNewLeader because that mechanism is less
  1247  	// robust than refreshing due to snapshot or ticks. In particular, it is
  1248  	// possible for node 3 to propose the RequestLease command and have that
  1249  	// command executed by the other nodes but to never see the execution locally
  1250  	// because it is caught up by applying a snapshot.
  1251  	testCases := map[string]kvserver.StoreTestingKnobs{
  1252  		"reasonSnapshotApplied": {
  1253  			DisableRefreshReasonNewLeader: true,
  1254  			DisableRefreshReasonTicks:     true,
  1255  		},
  1256  		"reasonTicks": {
  1257  			DisableRefreshReasonNewLeader:       true,
  1258  			DisableRefreshReasonSnapshotApplied: true,
  1259  		},
  1260  	}
  1261  	for name, c := range testCases {
  1262  		t.Run(name, func(t *testing.T) {
  1263  			sc := kvserver.TestStoreConfig(nil)
  1264  			sc.TestingKnobs = c
  1265  			// Disable periodic gossip tasks which can move the range 1 lease
  1266  			// unexpectedly.
  1267  			sc.TestingKnobs.DisablePeriodicGossips = true
  1268  			sc.Clock = nil // manual clock
  1269  			mtc := &multiTestContext{
  1270  				storeConfig: &sc,
  1271  				// This test was written before the multiTestContext started creating
  1272  				// many system ranges at startup, and hasn't been updated to take that
  1273  				// into account.
  1274  				startWithSingleRange: true,
  1275  			}
  1276  			defer mtc.Stop()
  1277  			mtc.Start(t, 3)
  1278  
  1279  			const rangeID = roachpb.RangeID(1)
  1280  			mtc.replicateRange(rangeID, 1, 2)
  1281  
  1282  			// Put some data in the range so we'll have something to test for.
  1283  			incArgs := incrementArgs([]byte("a"), 5)
  1284  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1285  				t.Fatal(err)
  1286  			}
  1287  
  1288  			// Wait for all nodes to catch up.
  1289  			mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})
  1290  
  1291  			// Stop node 2; while it is down write some more data.
  1292  			mtc.stopStore(2)
  1293  
  1294  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1295  				t.Fatal(err)
  1296  			}
  1297  
  1298  			// Get the last increment's log index.
  1299  			repl, err := mtc.stores[0].GetReplica(1)
  1300  			if err != nil {
  1301  				t.Fatal(err)
  1302  			}
  1303  			index, err := repl.GetLastIndex()
  1304  			if err != nil {
  1305  				t.Fatal(err)
  1306  			}
  1307  
  1308  			// Truncate the log at index+1 (log entries < index+1 are removed, so this includes
  1309  			// the increment).
  1310  			truncArgs := truncateLogArgs(index+1, rangeID)
  1311  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
  1312  				t.Fatal(err)
  1313  			}
  1314  
  1315  			// Stop and restart node 0 in order to make sure that any in-flight Raft
  1316  			// messages have been sent.
  1317  			mtc.stopStore(0)
  1318  			mtc.restartStore(0)
  1319  
  1320  			////////////////////////////////////////////////////////////////////
  1321  			// We want store 2 to take the lease later, so we'll drain the other
  1322  			// stores and expire the lease.
  1323  			////////////////////////////////////////////////////////////////////
  1324  
  1325  			// Disable node liveness heartbeats which can reacquire leases when we're
  1326  			// trying to expire them. We pause liveness heartbeats here after node 0
  1327  			// was restarted (which creates a new NodeLiveness).
  1328  			pauseNodeLivenessHeartbeats(mtc, true)
  1329  
  1330  			// Start draining stores 0 and 1 to prevent them from grabbing any new
  1331  			// leases.
  1332  			mtc.advanceClock(context.Background())
  1333  			var wg sync.WaitGroup
  1334  			for i := 0; i < 2; i++ {
  1335  				wg.Add(1)
  1336  				go func(i int) {
  1337  					mtc.stores[i].SetDraining(true, nil /* reporter */)
  1338  					wg.Done()
  1339  				}(i)
  1340  			}
  1341  
  1342  			// Wait for stores 0 and 1 to have entered draining mode, and then
  1343  			// advance the clock. Advancing the clock will leave the liveness records
  1344  			// of draining nodes in an expired state, so the SetDraining() call above
  1345  			// will be able to terminate.
  1346  			draining := false
  1347  			for !draining {
  1348  				draining = true
  1349  				for i := 0; i < 2; i++ {
  1350  					draining = draining && mtc.stores[i].IsDraining()
  1351  				}
  1352  				// Allow this loop to be preempted. Failure to do so can cause a
  1353  				// deadlock because a non-preemptible loop will prevent GC from
  1354  				// starting which in turn will cause all other goroutines to be stuck
  1355  				// as soon as they are called on to assist the GC (this shows up as
  1356  				// goroutines stuck in "GC assist wait"). With all of the other
  1357  				// goroutines stuck, nothing will be able to set mtc.stores[i].draining
  1358  				// to true.
  1359  				//
  1360  				// See #18554.
  1361  				runtime.Gosched()
  1362  			}
  1363  			mtc.advanceClock(context.Background())
  1364  
  1365  			wg.Wait()
  1366  
  1367  			// Restart node 2 and wait for the snapshot to be applied. Note that
  1368  			// waitForValues reads directly from the engine and thus isn't executing
  1369  			// a Raft command.
  1370  			mtc.restartStore(2)
  1371  			mtc.waitForValues(roachpb.Key("a"), []int64{10, 10, 10})
  1372  
  1373  			// Send an increment to the restarted node. If we don't refresh pending
  1374  			// commands appropriately, the range lease command will not get
  1375  			// re-proposed when we discover the new leader.
  1376  			if _, err := kv.SendWrapped(context.Background(), mtc.stores[2].TestSender(), incArgs); err != nil {
  1377  				t.Fatal(err)
  1378  			}
  1379  
  1380  			mtc.waitForValues(roachpb.Key("a"), []int64{15, 15, 15})
  1381  		})
  1382  	}
  1383  }
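
        // The drain-wait loop above must stay preemptible. Here is the pattern in
        // isolation, as an editor's sketch (waitUntilDraining is a hypothetical
        // helper, not part of this package):
        func waitUntilDraining(cond func() bool) {
        	for !cond() {
        		// Yield so that a stop-the-world phase (e.g. GC) can begin; without
        		// this, the loop can wedge every other goroutine in "GC assist
        		// wait". See #18554.
        		runtime.Gosched()
        	}
        }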
  1384  
  1385  // Test that when a Raft group is not able to establish a quorum, its Raft log
  1386  // does not grow without bound. It tests two different scenarios where this used
  1387  // to be possible (see #27772):
  1388  // 1. The leader proposes a command and cannot establish a quorum. The leader
  1389  //    continually re-proposes the command.
  1390  // 2. The follower proposes a command and forwards it to the leader, who cannot
  1391  //    establish a quorum. The follower continually re-proposes and forwards the
  1392  //    command to the leader.
  1393  func TestLogGrowthWhenRefreshingPendingCommands(t *testing.T) {
  1394  	defer leaktest.AfterTest(t)()
  1395  
  1396  	sc := kvserver.TestStoreConfig(nil)
  1397  	// Drop the raft tick interval so the Raft group is ticked more frequently.
  1398  	sc.RaftTickInterval = 10 * time.Millisecond
  1399  	// Don't time out the raft leader. We don't want leadership moving.
  1400  	sc.RaftElectionTimeoutTicks = 1000000
  1401  	// Reduce the max uncommitted entry size.
  1402  	sc.RaftMaxUncommittedEntriesSize = 64 << 10 // 64 KB
  1403  	// RaftProposalQuota cannot exceed RaftMaxUncommittedEntriesSize.
  1404  	sc.RaftProposalQuota = int64(sc.RaftMaxUncommittedEntriesSize)
  1405  	// RaftMaxInflightMsgs * RaftMaxSizePerMsg cannot exceed RaftProposalQuota.
  1406  	sc.RaftMaxInflightMsgs = 16
  1407  	sc.RaftMaxSizePerMsg = 1 << 10 // 1 KB
  1408  	// Disable leader transfers during leaseholder changes so that we
  1409  	// can easily create leader-not-leaseholder scenarios.
  1410  	sc.TestingKnobs.DisableLeaderFollowsLeaseholder = true
  1411  	// Refresh pending commands on every Raft group tick instead of
  1412  	// every RaftElectionTimeoutTicks.
  1413  	sc.TestingKnobs.RefreshReasonTicksPeriod = 1
  1414  	// Disable periodic gossip tasks which can move the range 1 lease
  1415  	// unexpectedly.
  1416  	sc.TestingKnobs.DisablePeriodicGossips = true
  1417  	mtc := &multiTestContext{
  1418  		storeConfig: &sc,
  1419  		// This test was written before the multiTestContext started creating many
  1420  		// system ranges at startup, and hasn't been updated to take that into
  1421  		// account.
  1422  		startWithSingleRange: true,
  1423  	}
  1424  	defer mtc.Stop()
  1425  	mtc.Start(t, 5)
  1426  
  1427  	const rangeID = roachpb.RangeID(1)
  1428  	mtc.replicateRange(rangeID, 1, 2, 3, 4)
  1429  
  1430  	// Raft leadership is kept on node 0.
  1431  	leaderRepl, err := mtc.Store(0).GetReplica(rangeID)
  1432  	if err != nil {
  1433  		t.Fatal(err)
  1434  	}
  1435  
  1436  	// Put some data in the range so we'll have something to test for.
  1437  	incArgs := incrementArgs([]byte("a"), 5)
  1438  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1439  		t.Fatal(err)
  1440  	}
  1441  
  1442  	// Wait for all nodes to catch up.
  1443  	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5, 5, 5})
  1444  
  1445  	// Test proposing on the leader and proposing on a follower. Neither
  1446  	// should result in unbounded raft log growth.
  1447  	testutils.RunTrueAndFalse(t, "proposeOnFollower", func(t *testing.T, proposeOnFollower bool) {
  1448  		// Restart any nodes that are down.
  1449  		for _, s := range []int{2, 3, 4} {
  1450  			if mtc.Store(s) == nil {
  1451  				mtc.restartStore(s)
  1452  			}
  1453  		}
  1454  
  1455  		// Determine which node to propose on. Transfer lease to that node.
  1456  		var propIdx, otherIdx int
  1457  		if !proposeOnFollower {
  1458  			propIdx, otherIdx = 0, 1
  1459  		} else {
  1460  			propIdx, otherIdx = 1, 0
  1461  		}
  1462  		propNode := mtc.stores[propIdx].TestSender()
  1463  		mtc.transferLease(context.Background(), rangeID, otherIdx, propIdx)
  1464  		testutils.SucceedsSoon(t, func() error {
  1465  			// Lease transfers may not be immediately observed by the new
  1466  			// leaseholder. Wait until the new leaseholder is aware.
  1467  			repl, err := mtc.Store(propIdx).GetReplica(rangeID)
  1468  			if err != nil {
  1469  				t.Fatal(err)
  1470  			}
  1471  			repDesc, err := repl.GetReplicaDescriptor()
  1472  			if err != nil {
  1473  				t.Fatal(err)
  1474  			}
  1475  			if lease, _ := repl.GetLease(); !lease.Replica.Equal(repDesc) {
  1476  				return errors.Errorf("lease not transferred yet; found %v", lease)
  1477  			}
  1478  			return nil
  1479  		})
  1480  
  1481  		// Stop enough nodes to prevent a quorum.
  1482  		for _, s := range []int{2, 3, 4} {
  1483  			mtc.stopStore(s)
  1484  		}
  1485  
  1486  		// Determine the current raft log size.
  1487  		initLogSize, _ := leaderRepl.GetRaftLogSize()
  1488  
  1489  		// While a majority of nodes are down, write some data.
  1490  		putRes := make(chan *roachpb.Error)
  1491  		go func() {
  1492  			putArgs := putArgs([]byte("b"), make([]byte, sc.RaftMaxUncommittedEntriesSize/8))
  1493  			_, err := kv.SendWrapped(context.Background(), propNode, putArgs)
  1494  			putRes <- err
  1495  		}()
  1496  
  1497  		// Wait for a bit and watch for Raft log growth.
  1498  		wait := time.After(500 * time.Millisecond)
  1499  		ticker := time.Tick(50 * time.Millisecond)
  1500  	Loop:
  1501  		for {
  1502  			select {
  1503  			case <-wait:
  1504  				break Loop
  1505  			case <-ticker:
  1506  				// Verify that the leader is node 0.
  1507  				status := leaderRepl.RaftStatus()
  1508  				if status == nil || status.RaftState != raft.StateLeader {
  1509  					t.Fatalf("raft leader should be node 0, but got status %+v", status)
  1510  				}
  1511  
  1512  				// Check the raft log size. We allow the raft log to grow up
  1513  				// to twice RaftMaxUncommittedEntriesSize because the total
  1514  				// reported by GetRaftLogSize includes a little more state (the
  1515  				// roachpb.Value checksum, etc.). The important thing here is
  1516  				// that the log doesn't grow forever.
  1517  				logSizeLimit := int64(2 * sc.RaftMaxUncommittedEntriesSize)
  1518  				curLogSize, _ := leaderRepl.GetRaftLogSize()
  1519  				logSize := curLogSize - initLogSize
  1520  				logSizeStr := humanizeutil.IBytes(logSize)
  1521  				// Note that logSize could be negative if something got truncated.
  1522  				if logSize > logSizeLimit {
  1523  					t.Fatalf("raft log size grew to %s", logSizeStr)
  1524  				}
  1525  				t.Logf("raft log size grew to %s", logSizeStr)
  1526  			case err := <-putRes:
  1527  				t.Fatalf("write finished with quorum unavailable; err=%v", err)
  1528  			}
  1529  		}
  1530  
  1531  		// Start enough nodes to establish a quorum.
  1532  		mtc.restartStore(2)
  1533  
  1534  		// The write should now succeed.
  1535  		if err := <-putRes; err != nil {
  1536  			t.Fatal(err)
  1537  		}
  1538  	})
  1539  }
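
        // A sketch of the sizing invariants that the configuration above relies
        // on (checkLogGrowthConfig is a hypothetical helper; the fields are real
        // StoreConfig fields):
        func checkLogGrowthConfig(sc *kvserver.StoreConfig) error {
        	if int64(sc.RaftMaxInflightMsgs)*int64(sc.RaftMaxSizePerMsg) > sc.RaftProposalQuota {
        		return errors.New("in-flight message volume may exceed RaftProposalQuota")
        	}
        	if sc.RaftProposalQuota > int64(sc.RaftMaxUncommittedEntriesSize) {
        		return errors.New("RaftProposalQuota may exceed RaftMaxUncommittedEntriesSize")
        	}
        	return nil
        }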
  1540  
  1541  // TestStoreRangeUpReplicate verifies that the replication queue will notice
  1542  // under-replicated ranges and replicate them.
  1543  func TestStoreRangeUpReplicate(t *testing.T) {
  1544  	defer leaktest.AfterTest(t)()
  1545  	defer kvserver.SetMockAddSSTable()()
  1546  	sc := kvserver.TestStoreConfig(nil)
  1547  	// Prevent the split queue from creating additional ranges while we're
  1548  	// waiting for replication.
  1549  	sc.TestingKnobs.DisableSplitQueue = true
  1550  	mtc := &multiTestContext{
  1551  		storeConfig: &sc,
  1552  	}
  1553  	defer mtc.Stop()
  1554  	mtc.Start(t, 3)
  1555  	mtc.initGossipNetwork()
  1556  
  1557  	// Once we know our peers, trigger a scan.
  1558  	if err := mtc.stores[0].ForceReplicationScanAndProcess(); err != nil {
  1559  		t.Fatal(err)
  1560  	}
  1561  
  1562  	// Wait until all ranges are upreplicated to all nodes.
  1563  	var replicaCount int64
  1564  	testutils.SucceedsSoon(t, func() error {
  1565  		var replicaCounts [3]int64
  1566  		for i, s := range mtc.stores {
  1567  			var err error
  1568  			mtc.stores[i].VisitReplicas(func(r *kvserver.Replica) bool {
  1569  				replicaCounts[i]++
  1570  				// Synchronize with the replica's raft processing goroutine.
  1571  				r.RaftLock()
  1572  				defer r.RaftUnlock()
  1573  				if len(r.Desc().InternalReplicas) != 3 {
  1574  					// This fails even after the snapshot has arrived and only
  1575  					// goes through once the replica has applied the conf change.
  1576  					err = errors.Errorf("not fully initialized")
  1577  					return false
  1578  				}
  1579  				return true
  1580  			})
  1581  			if err != nil {
  1582  				return err
  1583  			}
  1584  			if replicaCounts[i] != replicaCounts[0] {
  1585  				return errors.Errorf("not fully upreplicated")
  1586  			}
  1587  			if n := s.ReservationCount(); n != 0 {
  1588  				return errors.Errorf("expected 0 reservations, but found %d", n)
  1589  			}
  1590  		}
  1591  		replicaCount = replicaCounts[0]
  1592  		return nil
  1593  	})
  1594  
  1595  	var generated int64
  1596  	var learnerApplied, raftApplied int64
  1597  	for _, s := range mtc.stores {
  1598  		m := s.Metrics()
  1599  		generated += m.RangeSnapshotsGenerated.Count()
  1600  		learnerApplied += m.RangeSnapshotsLearnerApplied.Count()
  1601  		raftApplied += m.RangeSnapshotsNormalApplied.Count()
  1602  	}
  1603  	if generated == 0 {
  1604  		t.Fatalf("expected at least 1 snapshot, but found 0")
  1605  	}
  1606  	// We upreplicate each range (once each for n2 and n3), so there should be
  1607  	// exactly 2*replicaCount learner snapshots, one per upreplication.
  1608  	require.Equal(t, 2*replicaCount, learnerApplied)
  1609  	// Ideally there would be zero raft snaps, but etcd/raft is picky about
  1610  	// getting a snapshot at exactly the index it asked for.
  1611  	if raftApplied > learnerApplied {
  1612  		t.Fatalf("expected no more raft snaps (%d) than learner snaps (%d)", raftApplied, learnerApplied)
  1613  	}
  1614  }
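
        // The learner snapshot accounting above, restated as a standalone sketch
        // (expectedLearnerSnaps is a hypothetical helper): each of replicaCount
        // ranges is upreplicated to two new stores, and each upreplication
        // applies exactly one learner snapshot.
        func expectedLearnerSnaps(replicaCount int64) int64 {
        	const newStores = 2 // n2 and n3
        	return newStores * replicaCount
        }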
  1615  
  1616  // TestUnreplicateFirstRange verifies that multiTestContext still functions in
  1617  // the case where the first range (which contains range metadata) is
  1618  // unreplicated from the first store. This situation can arise occasionally in
  1619  // tests, as can a similar situation where the first store is no longer the lease holder of
  1620  // the first range; this verifies that those tests will not be affected.
  1621  func TestUnreplicateFirstRange(t *testing.T) {
  1622  	defer leaktest.AfterTest(t)()
  1623  
  1624  	mtc := &multiTestContext{}
  1625  	defer mtc.Stop()
  1626  	mtc.Start(t, 3)
  1627  
  1628  	const rangeID = roachpb.RangeID(1)
  1629  	// Replicate the range to store 1.
  1630  	mtc.replicateRange(rangeID, 1)
  1631  	// Move the lease away from store 0 before removing its replica.
  1632  	mtc.transferLease(context.Background(), rangeID, 0, 1)
  1633  	// Unreplicate the range from store 0.
  1634  	mtc.unreplicateRange(rangeID, 0)
  1635  	// Replicate the range to store 2. The first range is no longer available on
  1636  	// store 1, and this command will fail if that situation is not properly
  1637  	// supported.
  1638  	mtc.replicateRange(rangeID, 2)
  1639  }
  1640  
  1641  // TestChangeReplicasDescriptorInvariant tests that a replica change aborts if
  1642  // another change has been made to the RangeDescriptor since it was initiated.
  1643  func TestChangeReplicasDescriptorInvariant(t *testing.T) {
  1644  	defer leaktest.AfterTest(t)()
  1645  	mtc := &multiTestContext{
  1646  		// This test was written before the multiTestContext started creating many
  1647  		// system ranges at startup, and hasn't been updated to take that into
  1648  		// account.
  1649  		startWithSingleRange: true,
  1650  	}
  1651  	defer mtc.Stop()
  1652  	mtc.Start(t, 3)
  1653  
  1654  	repl, err := mtc.stores[0].GetReplica(1)
  1655  	if err != nil {
  1656  		t.Fatal(err)
  1657  	}
  1658  
  1659  	addReplica := func(storeNum int, desc *roachpb.RangeDescriptor) error {
  1660  		chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
  1661  			NodeID:  mtc.stores[storeNum].Ident.NodeID,
  1662  			StoreID: mtc.stores[storeNum].Ident.StoreID,
  1663  		})
  1664  		_, err := repl.ChangeReplicas(context.Background(), desc, kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs)
  1665  		return err
  1666  	}
  1667  
  1668  	// Retain the descriptor for the range at this point.
  1669  	origDesc := repl.Desc()
  1670  
  1671  	// Add replica to the second store, which should succeed.
  1672  	if err := addReplica(1, origDesc); err != nil {
  1673  		t.Fatal(err)
  1674  	}
  1675  	testutils.SucceedsSoon(t, func() error {
  1676  		r := mtc.stores[1].LookupReplica(roachpb.RKey("a"))
  1677  		if r == nil {
  1678  			return errors.Errorf(`expected replica for key "a"`)
  1679  		}
  1680  		return nil
  1681  	})
  1682  
  1683  	before := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count()
  1684  	// Attempt to add replica to the third store with the original descriptor.
  1685  	// This should fail because the descriptor is stale.
  1686  	expectedErr := `change replicas of r1 failed: descriptor changed: \[expected\]`
  1687  	if err := addReplica(2, origDesc); !testutils.IsError(err, expectedErr) {
  1688  		t.Fatalf("got unexpected error: %+v", err)
  1689  	}
  1690  
  1691  	after := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count()
  1692  	// The failed ChangeReplicas call should NOT have applied a learner snapshot.
  1693  	if after != before {
  1694  		t.Fatalf(
  1695  			"ChangeReplicas call should not have applied a learner snapshot, before %d after %d",
  1696  			before, after)
  1697  	}
  1698  
  1699  	before = mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count()
  1700  	// Add to third store with fresh descriptor.
  1701  	if err := addReplica(2, repl.Desc()); err != nil {
  1702  		t.Fatal(err)
  1703  	}
  1704  
  1705  	testutils.SucceedsSoon(t, func() error {
  1706  		after := mtc.stores[2].Metrics().RangeSnapshotsLearnerApplied.Count()
  1707  		// The successful ChangeReplicas call should have applied a learner snapshot.
  1708  		if after != before+1 {
  1709  			return errors.Errorf(
  1710  				"ChangeReplicas call should have applied a learner snapshot, before %d after %d",
  1711  				before, after)
  1712  		}
  1713  		r := mtc.stores[2].LookupReplica(roachpb.RKey("a"))
  1714  		if r == nil {
  1715  			return errors.Errorf(`expected replica for key "a"`)
  1716  		}
  1717  		return nil
  1718  	})
  1719  }
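
        // The invariant exercised above is compare-and-swap-like. An editor's
        // sketch of the check (not the real implementation): a replication
        // change may proceed only if the caller's descriptor still matches the
        // range's current descriptor.
        func descriptorUnchanged(current, expected *roachpb.RangeDescriptor) error {
        	if !current.Equal(expected) {
        		return errors.New("descriptor changed")
        	}
        	return nil
        }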
  1720  
  1721  // TestProgressWithDownNode verifies that a surviving quorum can make progress
  1722  // with a downed node.
  1723  func TestProgressWithDownNode(t *testing.T) {
  1724  	defer leaktest.AfterTest(t)()
  1725  	// This test relies on concurrently waiting for a value to change in the
  1726  	// underlying engine(s). Since the teeing engine does not respond well to
  1727  	// value mismatches, whether transient or permanent, skip this test if the
  1728  	// teeing engine is being used. See
  1729  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  1730  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  1731  		t.Skip("disabled on teeing engine")
  1732  	}
  1733  	mtc := &multiTestContext{
  1734  		// This test was written before the multiTestContext started creating many
  1735  		// system ranges at startup, and hasn't been updated to take that into
  1736  		// account.
  1737  		startWithSingleRange: true,
  1738  	}
  1739  	defer mtc.Stop()
  1740  	mtc.Start(t, 3)
  1741  
  1742  	const rangeID = roachpb.RangeID(1)
  1743  	mtc.replicateRange(rangeID, 1, 2)
  1744  
  1745  	incArgs := incrementArgs([]byte("a"), 5)
  1746  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1747  		t.Fatal(err)
  1748  	}
  1749  
  1750  	// Verify that the first increment propagates to all the engines.
  1751  	verify := func(expected []int64) {
  1752  		testutils.SucceedsSoon(t, func() error {
  1753  			values := []int64{}
  1754  			for _, eng := range mtc.engines {
  1755  				val, _, err := storage.MVCCGet(context.Background(), eng, roachpb.Key("a"), mtc.clock().Now(),
  1756  					storage.MVCCGetOptions{})
  1757  				if err != nil {
  1758  					return err
  1759  				}
  1760  				values = append(values, mustGetInt(val))
  1761  			}
  1762  			if !reflect.DeepEqual(expected, values) {
  1763  				return errors.Errorf("expected %v, got %v", expected, values)
  1764  			}
  1765  			return nil
  1766  		})
  1767  	}
  1768  	verify([]int64{5, 5, 5})
  1769  
  1770  	// Stop one of the replicas and issue a new increment.
  1771  	mtc.stopStore(1)
  1772  	incArgs = incrementArgs([]byte("a"), 11)
  1773  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1774  		t.Fatal(err)
  1775  	}
  1776  
  1777  	// The new increment can be seen on both live replicas.
  1778  	verify([]int64{16, 5, 16})
  1779  
  1780  	// Once the downed node is restarted, it will catch up.
  1781  	mtc.restartStore(1)
  1782  	verify([]int64{16, 16, 16})
  1783  }
  1784  
  1785  // TestReplicateRestartAfterTruncationWithRemoveAndReAdd is motivated by issue
  1786  // #8111, which suggests the following test (which verifies the ability of a
  1787  // snapshot with a new replica ID to overwrite existing data):
  1788  //   - replicate a range to three stores
  1789  //   - stop a store
  1790  //   - remove the stopped store from the range
  1791  //   - truncate the logs
  1792  //   - re-add the store and restart it
  1793  //   - ensure that store can catch up with the rest of the group
  1794  func TestReplicateRestartAfterTruncationWithRemoveAndReAdd(t *testing.T) {
  1795  	defer leaktest.AfterTest(t)()
  1796  	runReplicateRestartAfterTruncation(t, true /* removeBeforeTruncateAndReAdd */)
  1797  }
  1798  
  1799  // TestReplicateRestartAfterTruncation is a variant of
  1800  // TestReplicateRestartAfterTruncationWithRemoveAndReAdd without the remove and
  1801  // re-add. Just stop, truncate, and restart. This verifies that a snapshot
  1802  // without a new replica ID works correctly.
  1803  func TestReplicateRestartAfterTruncation(t *testing.T) {
  1804  	defer leaktest.AfterTest(t)()
  1805  	runReplicateRestartAfterTruncation(t, false /* removeBeforeTruncateAndReAdd */)
  1806  }
  1807  
  1808  func runReplicateRestartAfterTruncation(t *testing.T, removeBeforeTruncateAndReAdd bool) {
  1809  	sc := kvserver.TestStoreConfig(nil)
  1810  	// Don't time out raft leaders or range leases (see the relation between
  1811  	// RaftElectionTimeoutTicks and RangeLeaseActiveDuration). This test expects
  1812  	// mtc.stores[0] to hold the range lease for range 1.
  1813  	sc.RaftElectionTimeoutTicks = 1000000
  1814  	sc.Clock = nil // manual clock
  1815  	mtc := &multiTestContext{
  1816  		storeConfig: &sc,
  1817  		// This test was written before the multiTestContext started creating many
  1818  		// system ranges at startup, and hasn't been updated to take that into
  1819  		// account.
  1820  		startWithSingleRange: true,
  1821  	}
  1822  	defer mtc.Stop()
  1823  	mtc.Start(t, 3)
  1824  
  1825  	key := roachpb.Key("a")
  1826  
  1827  	// Replicate the initial range to all three nodes.
  1828  	const rangeID = roachpb.RangeID(1)
  1829  	mtc.replicateRange(rangeID, 1, 2)
  1830  
  1831  	// Verify that the first increment propagates to all the engines.
  1832  	incArgs := incrementArgs(key, 2)
  1833  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1834  		t.Fatal(err)
  1835  	}
  1836  	mtc.waitForValues(key, []int64{2, 2, 2})
  1837  
  1838  	// Stop a store.
  1839  	mtc.stopStore(1)
  1840  	if removeBeforeTruncateAndReAdd {
  1841  		// Remove the stopped store from the range.
  1842  		mtc.unreplicateRange(rangeID, 1)
  1843  	}
  1844  
  1845  	// Truncate the logs.
  1846  	{
  1847  		// Get the last increment's log index.
  1848  		repl, err := mtc.stores[0].GetReplica(rangeID)
  1849  		if err != nil {
  1850  			t.Fatal(err)
  1851  		}
  1852  		index, err := repl.GetLastIndex()
  1853  		if err != nil {
  1854  			t.Fatal(err)
  1855  		}
  1856  		// Truncate the log at index+1 (log entries < index+1 are removed, so
  1857  		// this includes the increment).
  1858  		truncArgs := truncateLogArgs(index+1, rangeID)
  1859  		if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), truncArgs); err != nil {
  1860  			t.Fatal(err)
  1861  		}
  1862  	}
  1863  
  1864  	// Ensure that store can catch up with the rest of the group.
  1865  	incArgs = incrementArgs(key, 3)
  1866  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1867  		t.Fatal(err)
  1868  	}
  1869  
  1870  	mtc.waitForValues(key, []int64{5, 2, 5})
  1871  
  1872  	// Re-add the store and restart it.
  1873  	// TODO(dt): ben originally suggested we also attempt this in the other order.
  1874  	// This currently hits an NPE in mtc.replicateRange though when it tries to
  1875  	// read the Ident.NodeID field in the specified store, and will become
  1876  	// impossible after streaming snapshots.
  1877  	mtc.restartStore(1)
  1878  	if removeBeforeTruncateAndReAdd {
  1879  		// Verify old replica is GC'd. Wait out the replica gc queue
  1880  		// inactivity threshold and force a gc scan.
  1881  		mtc.manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1))
  1882  		testutils.SucceedsSoon(t, func() error {
  1883  			mtc.stores[1].MustForceReplicaGCScanAndProcess()
  1884  			_, err := mtc.stores[1].GetReplica(rangeID)
  1885  			if !errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) {
  1886  				return errors.Errorf("expected replica to be garbage collected, got %v %T", err, err)
  1887  			}
  1888  			return nil
  1889  		})
  1890  
  1891  		mtc.replicateRange(rangeID, 1)
  1892  	}
  1893  
  1894  	mtc.waitForValues(key, []int64{5, 5, 5})
  1895  }
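
        // The truncation arithmetic used above, in isolation (truncationIndexFor
        // is a hypothetical helper): truncating at index k removes entries with
        // index < k, so passing lastIndex+1 removes everything up to and
        // including the last entry.
        func truncationIndexFor(lastIndex uint64) uint64 {
        	return lastIndex + 1 // entries < lastIndex+1, i.e. <= lastIndex, are removed
        }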
  1896  
  1897  func testReplicaAddRemove(t *testing.T, addFirst bool) {
  1898  	// This test relies on concurrently waiting for a value to change in the
  1899  	// underlying engine(s). Since the teeing engine does not respond well to
  1900  	// value mismatches, whether transient or permanent, skip this test if the
  1901  	// teeing engine is being used. See
  1902  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  1903  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  1904  		t.Skip("disabled on teeing engine")
  1905  	}
  1906  	sc := kvserver.TestStoreConfig(nil)
  1907  	// We're going to want to validate the state of the store before and after the
  1908  	// replica GC queue does its work, so we disable the replica gc queue here
  1909  	// and run it manually when we're ready.
  1910  	sc.TestingKnobs.DisableReplicaGCQueue = true
  1911  	sc.TestingKnobs.DisableEagerReplicaRemoval = true
  1912  	sc.Clock = nil // manual clock
  1913  	mtc := &multiTestContext{
  1914  		storeConfig: &sc,
  1915  		// This test was written before the multiTestContext started creating many
  1916  		// system ranges at startup, and hasn't been updated to take that into
  1917  		// account.
  1918  		startWithSingleRange: true,
  1919  	}
  1920  	defer mtc.Stop()
  1921  	mtc.Start(t, 4)
  1922  
  1923  	key := roachpb.Key("a")
  1924  	verifyFn := func(expected []int64) func() error {
  1925  		return func() error {
  1926  			values := make([]int64, len(mtc.engines))
  1927  			for i, eng := range mtc.engines {
  1928  				val, _, err := storage.MVCCGet(context.Background(), eng, key, mtc.clock().Now(),
  1929  					storage.MVCCGetOptions{})
  1930  				if err != nil {
  1931  					return err
  1932  				}
  1933  				values[i] = mustGetInt(val)
  1934  			}
  1935  			if reflect.DeepEqual(expected, values) {
  1936  				return nil
  1937  			}
  1938  			return errors.Errorf("expected %+v, got %+v", expected, values)
  1939  		}
  1940  	}
  1941  
  1942  	// Replicate the initial range to three of the four nodes.
  1943  	const rangeID = roachpb.RangeID(1)
  1944  	mtc.replicateRange(rangeID, 3, 1)
  1945  
  1946  	inc1 := int64(5)
  1947  	{
  1948  		incArgs := incrementArgs(key, inc1)
  1949  		if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1950  			t.Fatal(err)
  1951  		}
  1952  	}
  1953  
  1954  	// The first increment is visible on all three replicas.
  1955  	testutils.SucceedsSoon(t, verifyFn([]int64{
  1956  		inc1,
  1957  		inc1,
  1958  		0,
  1959  		inc1,
  1960  	}))
  1961  
  1962  	// Stop a store and replace it.
  1963  	mtc.stopStore(1)
  1964  	if addFirst {
  1965  		mtc.replicateRange(rangeID, 2)
  1966  		mtc.unreplicateRange(rangeID, 1)
  1967  	} else {
  1968  		mtc.unreplicateRange(rangeID, 1)
  1969  		mtc.replicateRange(rangeID, 2)
  1970  	}
  1971  	// The first increment is visible on the new replica.
  1972  	testutils.SucceedsSoon(t, verifyFn([]int64{
  1973  		inc1,
  1974  		inc1,
  1975  		inc1,
  1976  		inc1,
  1977  	}))
  1978  
  1979  	// Ensure that the rest of the group can make progress.
  1980  	inc2 := int64(11)
  1981  	{
  1982  		incArgs := incrementArgs(key, inc2)
  1983  		if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  1984  			t.Fatal(err)
  1985  		}
  1986  	}
  1987  	testutils.SucceedsSoon(t, verifyFn([]int64{
  1988  		inc1 + inc2,
  1989  		inc1,
  1990  		inc1 + inc2,
  1991  		inc1 + inc2,
  1992  	}))
  1993  
  1994  	// Bring the downed store back up (required for a clean shutdown).
  1995  	mtc.restartStore(1)
  1996  
  1997  	// The downed store never sees the increment that was added while it was
  1998  	// down. Perform another increment now that it is back up to verify that it
  1999  	// doesn't see future activity.
  2000  	inc3 := int64(23)
  2001  	{
  2002  		incArgs := incrementArgs(key, inc3)
  2003  		if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  2004  			t.Fatal(err)
  2005  		}
  2006  	}
  2007  	testutils.SucceedsSoon(t, verifyFn([]int64{
  2008  		inc1 + inc2 + inc3,
  2009  		inc1,
  2010  		inc1 + inc2 + inc3,
  2011  		inc1 + inc2 + inc3,
  2012  	}))
  2013  
  2014  	// Wait out the range lease and the unleased duration to make the replica GC'able.
  2015  	mtc.advanceClock(context.Background())
  2016  	mtc.manualClock.Increment(int64(kvserver.ReplicaGCQueueInactivityThreshold + 1))
  2017  	mtc.stores[1].SetReplicaGCQueueActive(true)
  2018  	mtc.stores[1].MustForceReplicaGCScanAndProcess()
  2019  
  2020  	// The removed store no longer has any of the data from the range.
  2021  	testutils.SucceedsSoon(t, verifyFn([]int64{
  2022  		inc1 + inc2 + inc3,
  2023  		0,
  2024  		inc1 + inc2 + inc3,
  2025  		inc1 + inc2 + inc3,
  2026  	}))
  2027  
  2028  	desc := mtc.stores[0].LookupReplica(roachpb.RKeyMin).Desc()
  2029  	replicaIDsByStore := map[roachpb.StoreID]roachpb.ReplicaID{}
  2030  	for _, rep := range desc.InternalReplicas {
  2031  		replicaIDsByStore[rep.StoreID] = rep.ReplicaID
  2032  	}
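        	// How the IDs land, derived from the steps above: s1 keeps its original
        	// ReplicaID 1; s4 and s2 were added next (ReplicaIDs 2 and 3); s2's
        	// replica was then removed and s3 added in its place, receiving
        	// ReplicaID 4.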
  2033  	expected := map[roachpb.StoreID]roachpb.ReplicaID{1: 1, 4: 2, 3: 4}
  2034  	if !reflect.DeepEqual(expected, replicaIDsByStore) {
  2035  		t.Fatalf("expected replica IDs to be %v but got %v", expected, replicaIDsByStore)
  2036  	}
  2037  }
  2038  
  2039  func TestReplicateAddAndRemove(t *testing.T) {
  2040  	defer leaktest.AfterTest(t)()
  2041  
  2042  	testReplicaAddRemove(t, true /* addFirst */)
  2043  }
  2044  
  2045  func TestReplicateRemoveAndAdd(t *testing.T) {
  2046  	defer leaktest.AfterTest(t)()
  2047  
  2048  	testReplicaAddRemove(t, false /* addFirst */)
  2049  }
  2050  
  2051  // TestQuotaPool verifies that writes get throttled in the case where we have
  2052  // two fast-moving replicas with sufficiently fast-growing raft logs and a
  2053  // slower replica catching up. By throttling write throughput we avoid having
  2054  // to constantly catch up the slower node via snapshots. See #8659.
  2055  func TestQuotaPool(t *testing.T) {
  2056  	defer leaktest.AfterTest(t)()
  2057  
  2058  	const quota = 10000
  2059  	const numReplicas = 3
  2060  	const rangeID = 1
  2061  	ctx := context.Background()
  2062  	sc := kvserver.TestStoreConfig(nil)
  2063  	// Suppress timeout-based elections to avoid leadership changes in ways
  2064  	// this test doesn't expect.
  2065  	sc.RaftElectionTimeoutTicks = 100000
  2066  	mtc := &multiTestContext{
  2067  		storeConfig: &sc,
  2068  		// This test was written before the multiTestContext started creating many
  2069  		// system ranges at startup, and hasn't been updated to take that into
  2070  		// account.
  2071  		startWithSingleRange: true,
  2072  	}
  2073  	mtc.Start(t, numReplicas)
  2074  	defer mtc.Stop()
  2075  
  2076  	mtc.replicateRange(rangeID, 1, 2)
  2077  
  2078  	assertEqualLastIndex := func() error {
  2079  		var expectedIndex uint64
  2080  
  2081  		for i, s := range mtc.stores {
  2082  			repl, err := s.GetReplica(rangeID)
  2083  			if err != nil {
  2084  				t.Fatal(err)
  2085  			}
  2086  
  2087  			index, err := repl.GetLastIndex()
  2088  			if err != nil {
  2089  				t.Fatal(err)
  2090  			}
  2091  			if i == 0 {
  2092  				expectedIndex = index
  2093  			} else if expectedIndex != index {
  2094  				return fmt.Errorf("%s: expected lastIndex %d, but found %d", repl, expectedIndex, index)
  2095  			}
  2096  		}
  2097  		return nil
  2098  	}
  2099  	testutils.SucceedsSoon(t, assertEqualLastIndex)
  2100  
  2101  	// NB: See TestRaftBlockedReplica/#9914 for why we use a separate goroutine.
  2102  	raftLockReplica := func(repl *kvserver.Replica) {
  2103  		ch := make(chan struct{})
  2104  		go func() { repl.RaftLock(); close(ch) }()
  2105  		<-ch
  2106  	}
  2107  
  2108  	leaderRepl := mtc.getRaftLeader(rangeID)
  2109  	// Grab the raftMu to re-initialize the QuotaPool to ensure that we don't
  2110  	// race with ongoing applications.
  2111  	raftLockReplica(leaderRepl)
  2112  	if err := leaderRepl.InitQuotaPool(quota); err != nil {
  2113  		t.Fatalf("failed to initialize quota pool: %v", err)
  2114  	}
  2115  	leaderRepl.RaftUnlock()
  2116  	followerRepl := func() *kvserver.Replica {
  2117  		for _, store := range mtc.stores {
  2118  			repl, err := store.GetReplica(rangeID)
  2119  			if err != nil {
  2120  				t.Fatal(err)
  2121  			}
  2122  			if repl == leaderRepl {
  2123  				continue
  2124  			}
  2125  			return repl
  2126  		}
  2127  		return nil
  2128  	}()
  2129  	if followerRepl == nil {
  2130  		t.Fatal("could not get a handle on a follower replica")
  2131  	}
  2132  
  2133  	// We block the third replica, effectively causing acquisition of quota
  2134  	// without subsequent release.
  2135  	raftLockReplica(followerRepl)
  2136  	ch := make(chan *roachpb.Error, 1)
  2137  
  2138  	func() {
  2139  		defer followerRepl.RaftUnlock()
  2140  
  2141  		// In order to verify write throttling we insert a value 3/4 the size of
  2142  		// the total quota available in the system. This should go through and
  2143  		// cause the subsequent insert of the same size to block. Once this
  2144  		// write has gone through, we verify that the total quota available has
  2145  		// decreased as expected.
  2146  		//
  2147  		// Following this we unblock the 'slow' replica allowing it to catch up to
  2148  		// the first write. This in turn releases quota back to the pool and the
  2149  		// second write, previously blocked by virtue of there not being enough
  2150  		// quota, is now free to proceed. We expect the final quota in the system
  2151  		// to be the same as what we started with.
  2152  		key := roachpb.Key("k")
  2153  		value := bytes.Repeat([]byte("v"), (3*quota)/4)
  2154  		var ba roachpb.BatchRequest
  2155  		ba.Add(putArgs(key, value))
  2156  		if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil {
  2157  			t.Fatal(err)
  2158  		}
  2159  		if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil {
  2160  			t.Fatal(pErr)
  2161  		}
  2162  
  2163  		if curQuota := leaderRepl.QuotaAvailable(); curQuota > quota/4 {
  2164  			t.Fatalf("didn't observe the expected quota acquisition, available: %d", curQuota)
  2165  		}
  2166  
  2167  		testutils.SucceedsSoon(t, func() error {
  2168  			if qLen := leaderRepl.QuotaReleaseQueueLen(); qLen < 1 {
  2169  				return errors.Errorf("expected at least 1 queued quota release, found: %d", qLen)
  2170  			}
  2171  			return nil
  2172  		})
  2173  
  2174  		go func() {
  2175  			var ba roachpb.BatchRequest
  2176  			ba.Add(putArgs(key, value))
  2177  			if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil {
  2178  				ch <- roachpb.NewError(err)
  2179  				return
  2180  			}
  2181  			_, pErr := leaderRepl.Send(ctx, ba)
  2182  			ch <- pErr
  2183  		}()
  2184  	}()
  2185  
  2186  	testutils.SucceedsSoon(t, func() error {
  2187  		if curQuota := leaderRepl.QuotaAvailable(); curQuota != quota {
  2188  			return errors.Errorf("expected available quota %d, got %d", quota, curQuota)
  2189  		}
  2190  		if qLen := leaderRepl.QuotaReleaseQueueLen(); qLen != 0 {
  2191  			return errors.Errorf("expected no queued quota releases, found: %d", qLen)
  2192  		}
  2193  		return nil
  2194  	})
  2195  
  2196  	if pErr := <-ch; pErr != nil {
  2197  		t.Fatal(pErr)
  2198  	}
  2199  }
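
        // The quota arithmetic above as a standalone sketch
        // (quotaAfterBlockedWrite is a hypothetical helper): a write of size
        // 3*quota/4 leaves at most quota/4 available while the follower is
        // blocked, and the full quota returns once the follower catches up and
        // the release queue drains.
        func quotaAfterBlockedWrite(quota int64) int64 {
        	writeSize := (3 * quota) / 4
        	return quota - writeSize // at most quota/4 remains available
        }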
  2200  
  2201  // TestWedgedReplicaDetection verifies that a leader replica correctly
  2202  // detects a wedged follower replica and no longer considers it active
  2203  // for the purpose of proposal throttling.
  2204  func TestWedgedReplicaDetection(t *testing.T) {
  2205  	defer leaktest.AfterTest(t)()
  2206  
  2207  	const numReplicas = 3
  2208  	const rangeID = 1
  2209  
  2210  	sc := kvserver.TestStoreConfig(nil)
  2211  	// Suppress timeout-based elections to avoid leadership changes in ways
  2212  	// this test doesn't expect.
  2213  	sc.RaftElectionTimeoutTicks = 100000
  2214  	mtc := &multiTestContext{
  2215  		storeConfig: &sc,
  2216  		// This test was written before the multiTestContext started creating many
  2217  		// system ranges at startup, and hasn't been updated to take that into
  2218  		// account.
  2219  		startWithSingleRange: true,
  2220  	}
  2221  	mtc.Start(t, numReplicas)
  2222  	defer mtc.Stop()
  2223  	mtc.replicateRange(rangeID, 1, 2)
  2224  
  2225  	leaderRepl := mtc.getRaftLeader(rangeID)
  2226  	followerRepl := func() *kvserver.Replica {
  2227  		for _, store := range mtc.stores {
  2228  			repl, err := store.GetReplica(rangeID)
  2229  			if err != nil {
  2230  				t.Fatal(err)
  2231  			}
  2232  			if repl == leaderRepl {
  2233  				continue
  2234  			}
  2235  			return repl
  2236  		}
  2237  		return nil
  2238  	}()
  2239  	if followerRepl == nil {
  2240  		t.Fatal("could not get a handle on a follower replica")
  2241  	}
  2242  
  2243  	// Lock the follower replica to prevent it from making progress from now
  2244  	// on. NB: See TestRaftBlockedReplica/#9914 for why we use a separate
  2245  	// goroutine.
  2246  	var wg sync.WaitGroup
  2247  	wg.Add(1)
  2248  	go func() {
  2249  		followerRepl.RaftLock()
  2250  		wg.Done()
  2251  	}()
  2252  	wg.Wait()
  2253  	defer followerRepl.RaftUnlock()
  2254  
  2255  	// TODO(andrei): The test becomes flaky with a lower threshold because the
  2256  	// follower is considered inactive just below. Figure out how to switch the
  2257  	// test to a manual clock. The activity tracking for followers uses the
  2258  	// physical clock.
  2259  	inactivityThreshold := time.Second
  2260  
  2261  	// Send a request to the leader replica. followerRepl is locked so it will
  2262  	// not respond.
  2263  	ctx := context.Background()
  2264  	key := roachpb.Key("k")
  2265  	value := []byte("value")
  2266  	var ba roachpb.BatchRequest
  2267  	ba.Add(putArgs(key, value))
  2268  	if err := ba.SetActiveTimestamp(mtc.clock().Now); err != nil {
  2269  		t.Fatal(err)
  2270  	}
  2271  	if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil {
  2272  		t.Fatal(pErr)
  2273  	}
  2274  
  2275  	// The follower should still be active.
  2276  	followerID := followerRepl.ReplicaID()
  2277  	if !leaderRepl.IsFollowerActiveSince(ctx, followerID, inactivityThreshold) {
  2278  		t.Fatalf("expected follower to still be considered active")
  2279  	}
  2280  
  2281  	// It is possible that there are in-flight heartbeat responses from
  2282  	// followerRepl from before it was locked. The receipt of one of these
  2283  	// would bump the last active timestamp on the leader. Because of this,
  2284  	// we check whether the follower is eventually considered inactive.
  2285  	testutils.SucceedsSoon(t, func() error {
  2286  		// Send another request to the leader replica. followerRepl is locked
  2287  		// so it will not respond.
  2288  		if _, pErr := leaderRepl.Send(ctx, ba); pErr != nil {
  2289  			t.Fatal(pErr)
  2290  		}
  2291  
  2292  		// The follower should no longer be considered active.
  2293  		if leaderRepl.IsFollowerActiveSince(ctx, followerID, inactivityThreshold) {
  2294  			return errors.New("expected follower to be considered inactive")
  2295  		}
  2296  		return nil
  2297  	})
  2298  }
  2299  
  2300  // TestRaftHeartbeats verifies that coalesced heartbeats correctly
  2301  // suppress elections in an idle cluster.
  2302  func TestRaftHeartbeats(t *testing.T) {
  2303  	defer leaktest.AfterTest(t)()
  2304  
  2305  	mtc := &multiTestContext{}
  2306  	defer mtc.Stop()
  2307  	mtc.Start(t, 3)
  2308  
  2309  	const rangeID = roachpb.RangeID(1)
  2310  	mtc.replicateRange(rangeID, 1, 2)
  2311  
  2312  	// Capture the initial term and state.
  2313  	leaderIdx := -1
  2314  	for i, store := range mtc.stores {
  2315  		if store.RaftStatus(rangeID).SoftState.RaftState == raft.StateLeader {
  2316  			leaderIdx = i
  2317  			break
  2318  		}
  2319  	}
  2320  	initialTerm := mtc.stores[leaderIdx].RaftStatus(rangeID).Term
  2321  
  2322  	// Wait for several ticks to elapse.
  2323  	ticksToWait := 2 * mtc.makeStoreConfig(leaderIdx).RaftElectionTimeoutTicks
  2324  	ticks := mtc.stores[leaderIdx].Metrics().RaftTicks.Count
  2325  	for targetTicks := ticks() + int64(ticksToWait); ticks() < targetTicks; {
  2326  		time.Sleep(time.Millisecond)
  2327  	}
  2328  
  2329  	status := mtc.stores[leaderIdx].RaftStatus(rangeID)
  2330  	if status.SoftState.RaftState != raft.StateLeader {
  2331  		t.Errorf("expected node %d to be leader after sleeping but was %s", leaderIdx, status.SoftState.RaftState)
  2332  	}
  2333  	if status.Term != initialTerm {
  2334  		t.Errorf("while sleeping, term changed from %d to %d", initialTerm, status.Term)
  2335  	}
  2336  }
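
        // The tick-waiting pattern above, factored out as an editor's sketch
        // (waitForRaftTicks is a hypothetical helper; ticks is any monotonically
        // increasing counter, e.g. a store's RaftTicks metric Count method):
        func waitForRaftTicks(ticks func() int64, n int64) {
        	for target := ticks() + n; ticks() < target; {
        		time.Sleep(time.Millisecond)
        	}
        }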
  2337  
  2338  // TestReportUnreachableHeartbeats tests that if a single transport fails,
  2339  // coalesced heartbeats are not stalled out entirely.
  2340  func TestReportUnreachableHeartbeats(t *testing.T) {
  2341  	defer leaktest.AfterTest(t)()
  2342  
  2343  	mtc := &multiTestContext{
  2344  		// This test was written before the multiTestContext started creating many
  2345  		// system ranges at startup, and hasn't been updated to take that into
  2346  		// account.
  2347  		startWithSingleRange: true,
  2348  	}
  2349  	defer mtc.Stop()
  2350  	mtc.Start(t, 3)
  2351  
  2352  	const rangeID = roachpb.RangeID(1)
  2353  	mtc.replicateRange(rangeID, 1, 2)
  2354  
  2355  	leaderIdx := -1
  2356  	// Loop until a leader is elected.
  2357  	for {
  2358  		for i, store := range mtc.stores {
  2359  			if store.RaftStatus(rangeID).SoftState.RaftState == raft.StateLeader {
  2360  				leaderIdx = i
  2361  				break
  2362  			}
  2363  		}
  2364  		if leaderIdx == -1 {
  2365  			runtime.Gosched()
  2366  		} else {
  2367  			break
  2368  		}
  2369  	}
  2370  	initialTerm := mtc.stores[leaderIdx].RaftStatus(rangeID).Term
  2371  	// Choose a follower index that is guaranteed to not be the leader.
  2372  	followerIdx := (leaderIdx + 1) % len(mtc.stores)
  2373  
  2374  	// Shut down a raft transport via the circuit breaker, and wait for two
  2375  	// election timeouts to trigger an election if reportUnreachable broke
  2376  	// heartbeat transmission to the other store.
  2377  	cb := mtc.transport.GetCircuitBreaker(mtc.stores[followerIdx].Ident.NodeID,
  2378  		rpc.DefaultClass)
  2379  	cb.Break()
  2380  
  2381  	// Send a command to ensure Raft is aware of the lost follower so that it
  2382  	// won't quiesce (which would prevent heartbeats).
  2383  	if _, err := kv.SendWrappedWith(
  2384  		context.Background(), mtc.stores[0].TestSender(), roachpb.Header{RangeID: rangeID},
  2385  		incrementArgs(roachpb.Key("a"), 1)); err != nil {
  2386  		t.Fatal(err)
  2387  	}
  2388  
  2389  	ticksToWait := 2 * mtc.makeStoreConfig(leaderIdx).RaftElectionTimeoutTicks
  2390  	ticks := mtc.stores[leaderIdx].Metrics().RaftTicks.Count
  2391  	for targetTicks := ticks() + int64(ticksToWait); ticks() < targetTicks; {
  2392  		time.Sleep(time.Millisecond)
  2393  	}
  2394  
  2395  	// Ensure that the leadership has not changed, to confirm that heartbeats
  2396  	// are sent to the store with a functioning transport.
  2397  	status := mtc.stores[leaderIdx].RaftStatus(rangeID)
  2398  	if status.SoftState.RaftState != raft.StateLeader {
  2399  		t.Errorf("expected node %d to be leader after sleeping but was %s", leaderIdx, status.SoftState.RaftState)
  2400  	}
  2401  	if status.Term != initialTerm {
  2402  		t.Errorf("while sleeping, term changed from %d to %d", initialTerm, status.Term)
  2403  	}
  2404  }
  2405  
  2406  // TestReportUnreachableRemoveRace adds and removes the raft leader replica
  2407  // repeatedly while one of its peers is unreachable in an attempt to expose
  2408  // races (primarily in asynchronous coalesced heartbeats).
  2409  func TestReportUnreachableRemoveRace(t *testing.T) {
  2410  	defer leaktest.AfterTest(t)()
  2411  
  2412  	mtc := &multiTestContext{}
  2413  	defer mtc.Stop()
  2414  	mtc.Start(t, 3)
  2415  
  2416  	const rangeID = roachpb.RangeID(1)
  2417  	mtc.replicateRange(rangeID, 1, 2)
  2418  
  2419  outer:
  2420  	for i := 0; i < 5; i++ {
  2421  		for leaderIdx, store := range mtc.stores {
  2422  			repl, err := store.GetReplica(rangeID)
  2423  			if err != nil {
  2424  				t.Fatal(err)
  2425  			}
  2426  			if repl.RaftStatus().SoftState.RaftState == raft.StateLeader {
  2427  				for replicaIdx, toStore := range mtc.stores {
  2428  					if toStore == store {
  2429  						continue
  2430  					}
  2431  					repDesc, err := repl.GetReplicaDescriptor()
  2432  					if err != nil {
  2433  						t.Fatal(err)
  2434  					}
  2435  					if lease, _ := repl.GetLease(); lease.Replica.Equal(repDesc) {
  2436  						mtc.transferLease(context.Background(), rangeID, leaderIdx, replicaIdx)
  2437  					}
  2438  					mtc.unreplicateRange(rangeID, leaderIdx)
  2439  					cb := mtc.transport.GetCircuitBreaker(toStore.Ident.NodeID, rpc.DefaultClass)
  2440  					cb.Break()
  2441  					time.Sleep(mtc.storeConfig.CoalescedHeartbeatsInterval)
  2442  					cb.Reset()
  2443  					mtc.replicateRange(rangeID, leaderIdx)
  2444  					continue outer
  2445  				}
  2446  				t.Fatal("could not find raft replica")
  2447  			}
  2448  		}
  2449  		i-- // try again
  2450  	}
  2451  }
  2452  
  2453  // TestReplicateAfterSplit verifies that a new replica whose start key is not
  2454  // KeyMin can replicate to a fresh store and apply snapshots correctly.
  2455  func TestReplicateAfterSplit(t *testing.T) {
  2456  	defer leaktest.AfterTest(t)()
  2457  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  2458  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2459  	mtc := &multiTestContext{
  2460  		storeConfig: &storeCfg,
  2461  	}
  2462  	defer mtc.Stop()
  2463  	mtc.Start(t, 2)
  2464  
  2465  	const rangeID = roachpb.RangeID(1)
  2466  	splitKey := roachpb.Key("m")
  2467  	key := roachpb.Key("z")
  2468  
  2469  	store0 := mtc.stores[0]
  2470  	// Make the split.
  2471  	splitArgs := adminSplitArgs(splitKey)
  2472  	if _, err := kv.SendWrapped(context.Background(), store0.TestSender(), splitArgs); err != nil {
  2473  		t.Fatal(err)
  2474  	}
  2475  
  2476  	rangeID2 := store0.LookupReplica(roachpb.RKey(key)).RangeID
  2477  	if rangeID2 == rangeID {
  2478  		t.Fatal("got same range id after split")
  2479  	}
  2480  	// Issue an increment for later check.
  2481  	incArgs := incrementArgs(key, 11)
  2482  	if _, err := kv.SendWrappedWith(context.Background(), store0.TestSender(), roachpb.Header{
  2483  		RangeID: rangeID2,
  2484  	}, incArgs); err != nil {
  2485  		t.Fatal(err)
  2486  	}
  2487  	// Now add the second replica.
  2488  	mtc.replicateRange(rangeID2, 1)
  2489  
  2490  	if mtc.stores[1].LookupReplica(roachpb.RKey(key)).GetMaxBytes() == 0 {
  2491  		t.Error("Range MaxBytes is not set after snapshot applied")
  2492  	}
  2493  	// Once it catches up, the effects of increment commands can be seen.
  2494  	testutils.SucceedsSoon(t, func() error {
  2495  		getArgs := getArgs(key)
  2496  		// Reading on a non-lease-holder replica requires an inconsistent read.
  2497  		if reply, err := kv.SendWrappedWith(context.Background(), mtc.stores[1].TestSender(), roachpb.Header{
  2498  			RangeID:         rangeID2,
  2499  			ReadConsistency: roachpb.INCONSISTENT,
  2500  		}, getArgs); err != nil {
  2501  			return errors.Errorf("failed to read data: %s", err)
  2502  		} else if e, v := int64(11), mustGetInt(reply.(*roachpb.GetResponse).Value); v != e {
  2503  			return errors.Errorf("failed to read correct data: expected %d, got %d", e, v)
  2504  		}
  2505  		return nil
  2506  	})
  2507  }
  2508  
  2509  // TestReplicaRemovalCampaign verifies that a new replica after a split can be
  2510  // transferred away/replaced without campaigning the old one.
  2511  func TestReplicaRemovalCampaign(t *testing.T) {
  2512  	defer leaktest.AfterTest(t)()
  2513  
  2514  	testData := []struct {
  2515  		remove        bool
  2516  		expectAdvance bool
  2517  	}{
  2518  		{ // Replica removed
  2519  			remove:        true,
  2520  			expectAdvance: false,
  2521  		},
  2522  		{ // Default behavior
  2523  			remove:        false,
  2524  			expectAdvance: true,
  2525  		},
  2526  	}
  2527  
  2528  	const rangeID = roachpb.RangeID(1)
  2529  	splitKey := roachpb.Key("m")
  2530  	key2 := roachpb.Key("z")
  2531  
  2532  	for i, td := range testData {
  2533  		func() {
  2534  			storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  2535  			storeCfg.TestingKnobs.DisableMergeQueue = true
  2536  			mtc := &multiTestContext{
  2537  				storeConfig: &storeCfg,
  2538  			}
  2539  			defer mtc.Stop()
  2540  			mtc.Start(t, 2)
  2541  
  2542  			// Replicate range to enable raft campaigning.
  2543  			mtc.replicateRange(rangeID, 1)
  2544  			store0 := mtc.stores[0]
  2545  
  2546  			// Make the split.
  2547  			splitArgs := adminSplitArgs(splitKey)
  2548  			if _, err := kv.SendWrapped(context.Background(), store0.TestSender(), splitArgs); err != nil {
  2549  				t.Fatal(err)
  2550  			}
  2551  
  2552  			replica2 := store0.LookupReplica(roachpb.RKey(key2))
  2553  
  2554  			rg2 := func(s *kvserver.Store) kv.Sender {
  2555  				return kv.Wrap(s, func(ba roachpb.BatchRequest) roachpb.BatchRequest {
  2556  					if ba.RangeID == 0 {
  2557  						ba.RangeID = replica2.RangeID
  2558  					}
  2559  					return ba
  2560  				})
  2561  			}
  2562  
  2563  			// Raft processing is initialized lazily; issue a no-op write request to
  2564  			// ensure that the Raft group has been started.
  2565  			incArgs := incrementArgs(key2, 0)
  2566  			if _, err := kv.SendWrapped(context.Background(), rg2(store0), incArgs); err != nil {
  2567  				t.Fatal(err)
  2568  			}
  2569  
  2570  			if td.remove {
  2571  				// Simulate second replica being transferred by removing it.
  2572  				if err := store0.RemoveReplica(context.Background(), replica2, replica2.Desc().NextReplicaID, kvserver.RemoveOptions{
  2573  					DestroyData: true,
  2574  				}); err != nil {
  2575  					t.Fatal(err)
  2576  				}
  2577  			}
  2578  
  2579  			var latestTerm uint64
  2580  			if td.expectAdvance {
  2581  				testutils.SucceedsSoon(t, func() error {
  2582  					if raftStatus := replica2.RaftStatus(); raftStatus != nil {
  2583  						if term := raftStatus.Term; term <= latestTerm {
  2584  							return errors.Errorf("%d: raft term has not yet advanced: %d", i, term)
  2585  						} else if latestTerm == 0 {
  2586  							latestTerm = term
  2587  						}
  2588  					} else {
  2589  						return errors.Errorf("%d: raft group is not yet initialized", i)
  2590  					}
  2591  					return nil
  2592  				})
  2593  			} else {
  2594  				for start := timeutil.Now(); timeutil.Since(start) < time.Second; time.Sleep(10 * time.Millisecond) {
  2595  					if raftStatus := replica2.RaftStatus(); raftStatus != nil {
  2596  						if term := raftStatus.Term; term > latestTerm {
  2597  							if latestTerm == 0 {
  2598  								latestTerm = term
  2599  							} else {
  2600  								t.Errorf("%d: raft term unexpectedly advanced: %d", i, term)
  2601  								break
  2602  							}
  2603  						}
  2604  					}
  2605  				}
  2606  			}
  2607  		}()
  2608  	}
  2609  }
  2610  
  2611  // TestRaftAfterRemoveRange verifies that raft correctly removes a remote
  2612  // node after the Replica has been removed from the Store.
  2613  func TestRaftAfterRemoveRange(t *testing.T) {
  2614  	defer leaktest.AfterTest(t)()
  2615  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  2616  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2617  	storeCfg.Clock = nil // manual clock
  2618  	mtc := &multiTestContext{
  2619  		storeConfig: &storeCfg,
  2620  	}
  2621  	defer mtc.Stop()
  2622  	mtc.Start(t, 3)
  2623  
  2624  	// Make the split.
  2625  	splitArgs := adminSplitArgs(roachpb.Key("b"))
  2626  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
  2627  		t.Fatal(err)
  2628  	}
  2629  
  2630  	const rangeID = roachpb.RangeID(2)
  2631  	mtc.replicateRange(rangeID, 1, 2)
  2632  
  2633  	mtc.unreplicateRange(rangeID, 2)
  2634  	mtc.unreplicateRange(rangeID, 1)
  2635  
  2636  	// Wait for the removal to be processed.
  2637  	testutils.SucceedsSoon(t, func() error {
  2638  		for _, s := range mtc.stores[1:] {
  2639  			_, err := s.GetReplica(rangeID)
  2640  			if !errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) {
  2641  				return errors.Wrapf(err, "range %d not yet removed from %s", rangeID, s)
  2642  			}
  2643  		}
  2644  		return nil
  2645  	})
  2646  
  2647  	// Test that a coalesced heartbeat is ingested correctly.
  2648  	replica1 := roachpb.ReplicaDescriptor{
  2649  		ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()),
  2650  		NodeID:    roachpb.NodeID(mtc.stores[1].StoreID()),
  2651  		StoreID:   mtc.stores[1].StoreID(),
  2652  	}
  2653  	replica2 := roachpb.ReplicaDescriptor{
  2654  		ReplicaID: roachpb.ReplicaID(mtc.stores[2].StoreID()),
  2655  		NodeID:    roachpb.NodeID(mtc.stores[2].StoreID()),
  2656  		StoreID:   mtc.stores[2].StoreID(),
  2657  	}
  2658  	mtc.transport.SendAsync(&kvserver.RaftMessageRequest{
  2659  		ToReplica:   replica1,
  2660  		FromReplica: replica2,
  2661  		Heartbeats: []kvserver.RaftHeartbeat{
  2662  			{
  2663  				RangeID:       rangeID,
  2664  				FromReplicaID: replica2.ReplicaID,
  2665  				ToReplicaID:   replica1.ReplicaID,
  2666  			},
  2667  		},
  2668  	}, rpc.DefaultClass)
  2669  	// Execute another replica change to ensure that raft has processed
  2670  	// the heartbeat just sent.
  2671  	mtc.replicateRange(roachpb.RangeID(1), 1)
  2672  
  2673  	// Expire leases to ensure any remaining intent resolutions can complete.
  2674  	// TODO(bdarnell): understand why some tests need this.
  2675  	mtc.advanceClock(context.Background())
  2676  }
  2677  
  2678  // TestRaftRemoveRace adds and removes a replica repeatedly in an attempt to
  2679  // reproduce a race (see #1911 and #9037).
  2680  func TestRaftRemoveRace(t *testing.T) {
  2681  	defer leaktest.AfterTest(t)()
  2682  	mtc := &multiTestContext{}
  2683  	defer mtc.Stop()
  2684  	const rangeID = roachpb.RangeID(1)
  2685  
  2686  	if !util.RaceEnabled {
  2687  		mtc.Start(t, 10)
  2688  		// Up-replicate to a bunch of nodes which stresses a condition where a
  2689  		// replica created via a preemptive snapshot receives a message for a
  2690  		// previous incarnation of the replica (i.e. has a smaller replica ID) that
  2691  		// existed on the same store.
  2692  		mtc.replicateRange(rangeID, 1, 2, 3, 4, 5, 6, 7, 8, 9)
  2693  	} else {
  2694  		// In race builds, running 10 nodes needs more than 1 full CPU
  2695  		// (due to background gossip and heartbeat overhead), so it can't
  2696  		// keep up when run under stress with one process per CPU. Run a
  2697  		// reduced version of this test in race builds. This isn't as
  2698  		// likely to reproduce the preemptive-snapshot race described in
  2699  		// the previous comment, but will still have a chance to do so, or
  2700  		// to find other races.
  2701  		mtc.Start(t, 3)
  2702  		mtc.replicateRange(rangeID, 1, 2)
  2703  	}
  2704  
  2705  	for i := 0; i < 10; i++ {
  2706  		mtc.unreplicateRange(rangeID, 2)
  2707  		mtc.replicateRange(rangeID, 2)
  2708  
  2709  		// Verify the tombstone key does not exist. See #12130.
  2710  		tombstoneKey := keys.RangeTombstoneKey(rangeID)
  2711  		var tombstone roachpb.RangeTombstone
  2712  		if ok, err := storage.MVCCGetProto(
  2713  			context.Background(), mtc.stores[2].Engine(), tombstoneKey,
  2714  			hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
  2715  		); err != nil {
  2716  			t.Fatal(err)
  2717  		} else if ok {
  2718  			t.Fatal("tombstone should not exist")
  2719  		}
  2720  	}
  2721  }
  2722  
  2723  // TestRemovePlaceholderRace adds and removes a replica repeatedly (similar to
  2724  // TestRaftRemoveRace) in an attempt to stress the locking around replica
  2725  // placeholders.
  2726  func TestRemovePlaceholderRace(t *testing.T) {
  2727  	defer leaktest.AfterTest(t)()
  2728  	mtc := &multiTestContext{}
  2729  	defer mtc.Stop()
  2730  	mtc.Start(t, 3)
  2731  
  2732  	const rangeID = roachpb.RangeID(1)
  2733  	mtc.replicateRange(rangeID, 1, 2)
  2734  
  2735  	repl, err := mtc.stores[0].GetReplica(rangeID)
  2736  	if err != nil {
  2737  		t.Fatal(err)
  2738  	}
  2739  	ctx := repl.AnnotateCtx(context.Background())
  2740  
  2741  	for i := 0; i < 100; i++ {
  2742  		for _, action := range []roachpb.ReplicaChangeType{roachpb.REMOVE_REPLICA, roachpb.ADD_REPLICA} {
  2743  			for {
  2744  				chgs := roachpb.MakeReplicationChanges(action, roachpb.ReplicationTarget{
  2745  					NodeID:  mtc.stores[1].Ident.NodeID,
  2746  					StoreID: mtc.stores[1].Ident.StoreID,
  2747  				})
  2748  				if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonUnknown, "", chgs); err != nil {
  2749  					if kvserver.IsSnapshotError(err) {
  2750  						continue
  2751  					} else {
  2752  						t.Fatal(err)
  2753  					}
  2754  				}
  2755  				break
  2756  			}
  2757  		}
  2758  	}
  2759  }
  2760  
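        // noConfChangeTestHandler wraps a RaftMessageHandler, dropping any raft
        // entries that would apply a replication change to the given range and
        // ignoring the ReplicaTooOldErrors that arise once the test manually
        // GC's the victim replica.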
  2761  type noConfChangeTestHandler struct {
  2762  	rangeID roachpb.RangeID
  2763  	kvserver.RaftMessageHandler
  2764  }
  2765  
  2766  func (ncc *noConfChangeTestHandler) HandleRaftRequest(
  2767  	ctx context.Context,
  2768  	req *kvserver.RaftMessageRequest,
  2769  	respStream kvserver.RaftMessageResponseStream,
  2770  ) *roachpb.Error {
  2771  	for i, e := range req.Message.Entries {
  2772  		if e.Type == raftpb.EntryConfChange {
  2773  			var cc raftpb.ConfChange
  2774  			if err := protoutil.Unmarshal(e.Data, &cc); err != nil {
  2775  				panic(err)
  2776  			}
  2777  			var ccCtx kvserver.ConfChangeContext
  2778  			if err := protoutil.Unmarshal(cc.Context, &ccCtx); err != nil {
  2779  				panic(err)
  2780  			}
  2781  			var command kvserverpb.RaftCommand
  2782  			if err := protoutil.Unmarshal(ccCtx.Payload, &command); err != nil {
  2783  				panic(err)
  2784  			}
  2785  			if req.RangeID == ncc.rangeID {
  2786  				if command.ReplicatedEvalResult.ChangeReplicas != nil {
  2787  					// We found a configuration change headed for our victim range;
  2788  					// sink it.
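        				// Truncating at i drops the conf change along with everything
        				// queued behind it; raft will retransmit the later entries, and
        				// we'll keep sinking the change on every retry.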
  2789  					req.Message.Entries = req.Message.Entries[:i]
  2790  				}
  2791  			}
  2792  		}
  2793  	}
  2794  	return ncc.RaftMessageHandler.HandleRaftRequest(ctx, req, respStream)
  2795  }
  2796  
  2797  func (ncc *noConfChangeTestHandler) HandleRaftResponse(
  2798  	ctx context.Context, resp *kvserver.RaftMessageResponse,
  2799  ) error {
  2800  	switch val := resp.Union.GetValue().(type) {
  2801  	case *roachpb.Error:
  2802  		switch val.GetDetail().(type) {
  2803  		case *roachpb.ReplicaTooOldError:
  2804  			// We're going to manually GC the replica, so ignore these errors.
  2805  			return nil
  2806  		}
  2807  	}
  2808  	return ncc.RaftMessageHandler.HandleRaftResponse(ctx, resp)
  2809  }
  2810  
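        // TestReplicaGCRace verifies that a heartbeat arriving for a replica that
        // has just been garbage collected yields a RaftGroupDeletedError rather
        // than recreating the replica. See #11591.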
  2811  func TestReplicaGCRace(t *testing.T) {
  2812  	defer leaktest.AfterTest(t)()
  2813  
  2814  	mtc := &multiTestContext{}
  2815  	defer mtc.Stop()
  2816  	mtc.Start(t, 3)
  2817  
  2818  	const rangeID = roachpb.RangeID(1)
  2819  	mtc.replicateRange(rangeID, 1)
  2820  
  2821  	leaderStore := mtc.stores[0]
  2822  	fromStore := mtc.stores[1]
  2823  	toStore := mtc.stores[2]
  2824  
  2825  	// Prevent the victim replica from processing configuration changes.
  2826  	mtc.transport.Stop(toStore.Ident.StoreID)
  2827  	mtc.transport.Listen(toStore.Ident.StoreID, &noConfChangeTestHandler{
  2828  		rangeID:            rangeID,
  2829  		RaftMessageHandler: toStore,
  2830  	})
  2831  
  2832  	repl, err := leaderStore.GetReplica(rangeID)
  2833  	if err != nil {
  2834  		t.Fatal(err)
  2835  	}
  2836  	ctx := repl.AnnotateCtx(context.Background())
  2837  
  2838  	// Add the victim replica. Note that it will receive a snapshot and raft log
  2839  	// replays, but will not process the configuration change containing the new
  2840  	// range descriptor, preventing it from learning of the new NextReplicaID.
  2841  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
  2842  		NodeID:  toStore.Ident.NodeID,
  2843  		StoreID: toStore.Ident.StoreID,
  2844  	})
  2845  	if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
  2846  		t.Fatal(err)
  2847  	}
  2848  
  2849  	// Craft a heartbeat addressed to the victim replica. Note that this
  2850  	// heartbeat will be sent after the replica has been GC'ed.
  2851  	rangeDesc := repl.Desc()
  2852  	fromReplicaDesc, ok := rangeDesc.GetReplicaDescriptor(fromStore.Ident.StoreID)
  2853  	if !ok {
  2854  		t.Fatalf("expected %s to have a replica on %s", rangeDesc, fromStore)
  2855  	}
  2856  	toReplicaDesc, ok := rangeDesc.GetReplicaDescriptor(toStore.Ident.StoreID)
  2857  	if !ok {
  2858  		t.Fatalf("expected %s to have a replica on %s", rangeDesc, toStore)
  2859  	}
  2860  
  2861  	hbReq := kvserver.RaftMessageRequest{
  2862  		FromReplica: fromReplicaDesc,
  2863  		ToReplica:   toReplicaDesc,
  2864  		Heartbeats: []kvserver.RaftHeartbeat{
  2865  			{
  2866  				RangeID:       rangeID,
  2867  				FromReplicaID: fromReplicaDesc.ReplicaID,
  2868  				ToReplicaID:   toReplicaDesc.ReplicaID,
  2869  			},
  2870  		},
  2871  	}
  2872  
  2873  	// Wait for the victim's raft log to be non-empty, then configure the heartbeat
  2874  	// with the raft state.
  2875  	testutils.SucceedsSoon(t, func() error {
  2876  		status := repl.RaftStatus()
  2877  		progressByID := status.Progress
  2878  		progress, ok := progressByID[uint64(toReplicaDesc.ReplicaID)]
  2879  		if !ok {
  2880  			return errors.Errorf("%+v does not yet contain %s", progressByID, toReplicaDesc)
  2881  		}
  2882  		if progress.Match == 0 {
  2883  			return errors.Errorf("%+v has not yet advanced", progress)
  2884  		}
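        		// Stamp the heartbeat with the leader's term and the follower's match
        		// index so that, when sent after the GC below, it looks like a
        		// plausible message from the current leader.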
  2885  		for i := range hbReq.Heartbeats {
  2886  			hbReq.Heartbeats[i].Term = status.Term
  2887  			hbReq.Heartbeats[i].Commit = progress.Match
  2888  		}
  2889  		return nil
  2890  	})
  2891  
  2892  	// Remove the victim replica and manually GC it.
  2893  	chgs[0].ChangeType = roachpb.REMOVE_REPLICA
  2894  	if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeOverReplicated, "", chgs); err != nil {
  2895  		t.Fatal(err)
  2896  	}
  2897  
  2898  	{
  2899  		removedReplica, err := toStore.GetReplica(rangeID)
  2900  		if err != nil {
  2901  			t.Fatal(err)
  2902  		}
  2903  		if err := toStore.ManualReplicaGC(removedReplica); err != nil {
  2904  			t.Fatal(err)
  2905  		}
  2906  	}
  2907  
  2908  	// Create a new transport for store 0. Error responses are passed
  2909  	// back along the same grpc stream as the request so it's ok that
  2910  	// there are two (this one and the one actually used by the store).
  2911  	fromTransport := kvserver.NewRaftTransport(log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer},
  2912  		cluster.MakeTestingClusterSettings(),
  2913  		nodedialer.New(mtc.rpcContext, gossip.AddressResolver(fromStore.Gossip())),
  2914  		nil, /* grpcServer */
  2915  		mtc.transportStopper,
  2916  	)
  2917  	errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1))
  2918  	fromTransport.Listen(fromStore.StoreID(), errChan)
  2919  
  2920  	// Send the heartbeat. Boom. See #11591.
  2921  	// We have to send this multiple times to protect against
  2922  	// dropped messages (see #18355).
  2923  	sendHeartbeat := func() (sent bool) {
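        		// Send a copy rather than hbReq itself: SendAsync hands the request
        		// off to another goroutine, so (presumably) the same object must not
        		// be shared across sends.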
  2924  		r := hbReq
  2925  		return fromTransport.SendAsync(&r, rpc.DefaultClass)
  2926  	}
  2927  	if sent := sendHeartbeat(); !sent {
  2928  		t.Fatal("failed to send heartbeat")
  2929  	}
  2930  	heartbeatsSent := 1
  2931  
  2932  	// The receiver of this message should return an error. If we don't get a
  2933  	// quick response, assume that the message got dropped and try sending it
  2934  	// again.
  2935  	select {
  2936  	case pErr := <-errChan:
  2937  		switch pErr.GetDetail().(type) {
  2938  		case *roachpb.RaftGroupDeletedError:
  2939  		default:
  2940  			t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr)
  2941  		}
  2942  	case <-time.After(time.Second):
  2943  		if heartbeatsSent >= 5 {
  2944  			t.Fatal("did not get expected error")
  2945  		}
  2946  		heartbeatsSent++
  2947  		if sent := sendHeartbeat(); !sent {
  2948  			t.Fatal("failed to send heartbeat")
  2949  		}
  2950  	}
  2951  }
  2952  
  2953  func requireOnlyAtomicChanges(
  2954  	t *testing.T, db *sqlutils.SQLRunner, rangeID roachpb.RangeID, repFactor int, start time.Time,
  2955  ) {
  2956  	// From all events pertaining to the given rangeID and post-dating the start time,
  2957  	// select those events whose (full plus incoming) voter count exceeds the
  2958  	// replication factor. Any rows returned carry the full info JSON strings,
  2959  	// so a failure prints the offending events.
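        	// NB: a replica's 'type' field serializes as its enum value: '0' is
        	// VOTER_FULL (the proto3 default, hence the COALESCE for the missing
        	// field) and '2' is VOTER_INCOMING.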
  2960  	const q = `
  2961  SELECT
  2962  	"uniqueID",
  2963  	count(t) AS repfactor,
  2964  	string_agg(info, e'\\n') AS infos
  2965  FROM
  2966  	[
  2967  		SELECT
  2968  			"uniqueID",
  2969  			replicas->'node_id' AS n,
  2970  			COALESCE(replicas->'type', '0') AS t,
  2971  			info
  2972  		FROM
  2973  			system.rangelog,
  2974  			ROWS FROM (
  2975  				jsonb_array_elements(
  2976  					info::JSONB->'UpdatedDesc'->'internal_replicas'
  2977  				)
  2978  			)
  2979  				AS replicas
  2980  		WHERE
  2981  			info::JSONB->'UpdatedDesc'->'range_id' = $1::JSONB AND timestamp >= $2
  2982  		ORDER BY
  2983  			"timestamp" ASC
  2984  	]
  2985  WHERE
  2986  	t IN ('0', '2')
  2987  GROUP BY
  2988  	"uniqueID"
  2989  HAVING
  2990  	count(t) > $3;
  2991  `
  2992  	matrix := db.QueryStr(t, q, rangeID, start, repFactor)
  2993  	if len(matrix) > 0 {
  2994  		t.Fatalf("more than %d voting replicas: %s", repFactor, sqlutils.MatrixToStr(matrix))
  2995  	}
  2996  }
  2997  
  2998  func TestDecommission(t *testing.T) {
  2999  	defer leaktest.AfterTest(t)()
  3000  
  3001  	if util.RaceEnabled {
  3002  		// Five nodes is too much to reliably run under testrace with our aggressive
  3003  		// liveness timings.
  3004  		t.Skip("skipping under testrace: #39807 and #37811")
  3005  	}
  3006  
  3007  	// This test relies on concurrently waiting for a value to change in the
  3008  	// underlying engine(s). Since the teeing engine does not respond well to
  3009  	// value mismatches, whether transient or permanent, skip this test if the
  3010  	// teeing engine is being used. See
  3011  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  3012  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  3013  		t.Skip("disabled on teeing engine")
  3014  	}
  3015  
  3016  	ctx := context.Background()
  3017  	tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{
  3018  		ReplicationMode: base.ReplicationAuto,
  3019  	})
  3020  	defer tc.Stopper().Stop(ctx)
  3021  
  3022  	k := tc.ScratchRange(t)
  3023  	cc, err := tc.Server(0).RPCContext().GRPCDialNode(tc.Server(0).RPCAddr(), 1, rpc.DefaultClass).Connect(ctx)
  3024  	require.NoError(t, err)
  3025  	admin := serverpb.NewAdminClient(cc)
  3026  	// Decommission the first node, which holds most of the leases.
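        	// NB: a DecommissionRequest with an empty NodeIDs slice targets the node
        	// serving the request, i.e. the first node, which we dialed above.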
  3027  	_, err = admin.Decommission(
  3028  		ctx, &serverpb.DecommissionRequest{Decommissioning: true},
  3029  	)
  3030  	require.NoError(t, err)
  3031  
  3032  	requireNoReplicas := func(storeID roachpb.StoreID, repFactor int) {
  3033  		testutils.SucceedsSoon(t, func() error {
  3034  			desc := tc.LookupRangeOrFatal(t, k)
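        			// Kick the replicate queue on every store that still holds a voter
        			// so that the decommissioning replicas are moved along promptly
        			// instead of waiting for the background scanner.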
  3035  			for _, rDesc := range desc.Replicas().Voters() {
  3036  				store, err := tc.Servers[int(rDesc.NodeID-1)].Stores().GetStore(rDesc.StoreID)
  3037  				require.NoError(t, err)
  3038  				if err := store.ForceReplicationScanAndProcess(); err != nil {
  3039  					return err
  3040  				}
  3041  			}
  3042  			if sl := desc.Replicas().Filter(func(rDesc roachpb.ReplicaDescriptor) bool {
  3043  				return rDesc.StoreID == storeID
  3044  			}); len(sl) > 0 {
  3045  				return errors.Errorf("still a replica on s%d: %s", storeID, &desc)
  3046  			}
  3047  			if len(desc.Replicas().Voters()) != repFactor {
  3048  				return errors.Errorf("expected %d replicas: %s", repFactor, &desc)
  3049  			}
  3050  			return nil
  3051  		})
  3052  	}
  3053  
  3054  	const triplicated = 3
  3055  
  3056  	requireNoReplicas(1, triplicated)
  3057  
  3058  	runner := sqlutils.MakeSQLRunner(tc.ServerConn(0))
  3059  	ts := timeutil.Now()
  3060  
  3061  	_, err = admin.Decommission(
  3062  		ctx, &serverpb.DecommissionRequest{NodeIDs: []roachpb.NodeID{2}, Decommissioning: true},
  3063  	)
  3064  	require.NoError(t, err)
  3065  
  3066  	// Both s1 and s2 are out, so neither ought to have replicas.
  3067  	requireNoReplicas(1, triplicated)
  3068  	requireNoReplicas(2, triplicated)
  3069  
  3070  	// Going from three replicas to three replicas should have used atomic swaps
  3071  	// only. We didn't verify this before the first decommissioning op because
  3072  	// lots of ranges were over-replicated due to ranges recently having split
  3073  	// off from the five-fold replicated system ranges.
  3074  	requireOnlyAtomicChanges(t, runner, tc.LookupRangeOrFatal(t, k).RangeID, triplicated, ts)
  3075  
  3076  	sqlutils.SetZoneConfig(t, runner, "RANGE default", "num_replicas: 1")
  3077  
  3078  	const single = 1
  3079  
  3080  	// The range should drop down to one replica on a non-decommissioning store.
  3081  	requireNoReplicas(1, single)
  3082  	requireNoReplicas(2, single)
  3083  
  3084  	// Decommission two more nodes. Only n5 is left; getting the replicas there
  3085  	// can't use atomic replica swaps because the leaseholder can't be removed.
  3086  	_, err = admin.Decommission(
  3087  		ctx, &serverpb.DecommissionRequest{NodeIDs: []roachpb.NodeID{3, 4}, Decommissioning: true},
  3088  	)
  3089  	require.NoError(t, err)
  3090  
  3091  	requireNoReplicas(1, single)
  3092  	requireNoReplicas(2, single)
  3093  	requireNoReplicas(3, single)
  3094  	requireNoReplicas(4, single)
  3095  }
  3096  
  3097  // TestReplicateRogueRemovedNode ensures that a rogue removed node
  3098  // (i.e. a node that has been removed from the range but doesn't know
  3099  // it yet because it was down or partitioned away when it happened)
  3100  // cannot cause other removed nodes to recreate their ranges.
  3101  func TestReplicateRogueRemovedNode(t *testing.T) {
  3102  	defer leaktest.AfterTest(t)()
  3103  
  3104  	sc := kvserver.TestStoreConfig(nil)
  3105  	// Newly-started stores (including the "rogue" one) should not GC
  3106  	// their replicas. We'll turn this back on when needed.
  3107  	sc.TestingKnobs.DisableReplicaGCQueue = true
  3108  	sc.Clock = nil // manual clock
  3109  	mtc := &multiTestContext{
  3110  		storeConfig: &sc,
  3111  		// This test was written before the multiTestContext started creating many
  3112  		// system ranges at startup, and hasn't been updated to take that into
  3113  		// account.
  3114  		startWithSingleRange: true,
  3115  	}
  3116  	defer mtc.Stop()
  3117  	mtc.Start(t, 3)
  3118  
  3119  	// We're going to set up the cluster with partitioning so that we can
  3120  	// partition node 0 from the others. The partition is not initially active.
  3121  	partRange, err := setupPartitionedRange(mtc, 1, 0, 0, false /* activated */, unreliableRaftHandlerFuncs{})
  3122  	require.NoError(t, err)
  3123  	// First put the range on all three nodes.
  3124  	raftID := roachpb.RangeID(1)
  3125  	mtc.replicateRange(raftID, 1, 2)
  3126  
  3127  	// Put some data in the range so we'll have something to test for.
  3128  	incArgs := incrementArgs([]byte("a"), 5)
  3129  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3130  		t.Fatal(err)
  3131  	}
  3132  
  3133  	// Wait for all nodes to catch up.
  3134  	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})
  3135  
  3136  	// Stop node 2; while it is down remove the range from nodes 2 and 1.
  3137  	mtc.stopStore(2)
  3138  	mtc.unreplicateRange(raftID, 2)
  3139  	mtc.unreplicateRange(raftID, 1)
  3140  
  3141  	// Make a write on node 0; this will not be replicated because 0 is the only node left.
  3142  	incArgs = incrementArgs([]byte("a"), 11)
  3143  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3144  		t.Fatal(err)
  3145  	}
  3146  
  3147  	// Wait for the replica to be GC'd on node 1.
  3148  	// Store 0 has two writes, 1 has erased everything, and 2 still has the first write.
  3149  	// A single pass of ForceReplicaGCScanAndProcess is not enough, since the replica
  3150  	// may be recreated by a stray raft message, so we run the GC scan inside the loop.
  3151  	// TODO(bdarnell): if the call to RemoveReplica in replicaGCQueue.process can be
  3152  	// moved under the lock, then the GC scan can be moved out of this loop.
  3153  	mtc.stores[1].SetReplicaGCQueueActive(true)
  3154  	testutils.SucceedsSoon(t, func() error {
  3155  		mtc.advanceClock(context.Background())
  3156  		mtc.manualClock.Increment(int64(
  3157  			kvserver.ReplicaGCQueueInactivityThreshold) + 1)
  3158  		mtc.stores[1].MustForceReplicaGCScanAndProcess()
  3159  
  3160  		actual := mtc.readIntFromEngines(roachpb.Key("a"))
  3161  		expected := []int64{16, 0, 5}
  3162  		if !reflect.DeepEqual(expected, actual) {
  3163  			return errors.Errorf("expected %v, got %v", expected, actual)
  3164  		}
  3165  		return nil
  3166  	})
  3167  	// Partition nodes 1 and 2 from node 0. Otherwise they'd get a
  3168  	// ReplicaTooOldError from node 0 and proceed to remove themselves.
  3169  	partRange.activate()
  3170  	// Bring node 2 back up.
  3171  	mtc.restartStore(2)
  3172  
  3173  	// Try to issue a command on node 2. It should not be able to commit
  3174  	// (so we add it asynchronously).
  3175  	var startWG sync.WaitGroup
  3176  	startWG.Add(1)
  3177  	var finishWG sync.WaitGroup
  3178  	finishWG.Add(1)
  3179  
  3180  	rep, err := mtc.stores[2].GetReplica(raftID)
  3181  	if err != nil {
  3182  		t.Fatal(err)
  3183  	}
  3184  	replicaDesc, ok := rep.Desc().GetReplicaDescriptor(mtc.stores[2].StoreID())
  3185  	if !ok {
  3186  		t.Fatalf("replica of range %d not found on store %d", raftID, mtc.stores[2].StoreID())
  3187  	}
  3188  	go func() {
  3189  		incArgs := incrementArgs([]byte("a"), 23)
  3190  		startWG.Done()
  3191  		defer finishWG.Done()
  3192  		_, pErr := kv.SendWrappedWith(
  3193  			context.Background(),
  3194  			mtc.stores[2],
  3195  			roachpb.Header{
  3196  				Replica:   replicaDesc,
  3197  				Timestamp: mtc.stores[2].Clock().Now(),
  3198  			}, incArgs,
  3199  		)
  3200  		if _, ok := pErr.GetDetail().(*roachpb.RangeNotFoundError); !ok {
  3201  			// We're on a goroutine and passing the error out is awkward since
  3202  			// it would only surface at shutdown time. A panic ought to be good
  3203  			// enough to get visibility.
  3204  			panic(fmt.Sprintf("unexpected error: %v", pErr))
  3205  		}
  3206  	}()
  3207  	startWG.Wait()
  3208  
  3209  	// Sleep a bit to let the command proposed on node 2 proceed if it's
  3210  	// going to. Prior to the introduction of replica tombstones, this
  3211  	// would lead to split-brain: Node 2 would wake up node 1 and they
  3212  	// would form a quorum, even though node 0 had removed them both.
  3213  	// Now the tombstone on node 1 prevents it from rejoining the rogue
  3214  	// copy of the group.
  3215  	time.Sleep(100 * time.Millisecond)
  3216  	testutils.SucceedsSoon(t, func() error {
  3217  		actual := mtc.readIntFromEngines(roachpb.Key("a"))
  3218  		// Normally, replica GC has not happened yet on store 2, so we
  3219  		// expect {16, 0, 5}. However, it is possible (on a
  3220  		// slow/overloaded machine) for the end of the ChangeReplicas
  3221  		// transaction to be queued up inside the raft transport for long
  3222  		// enough that it doesn't arrive until after store 2 has been
  3223  		// restarted, so it is able to trigger an early GC on the
  3224  		// restarted node, resulting in {16, 0, 0}.
  3225  		// TODO(bdarnell): When #5789 is fixed, the probabilities flip and
  3226  		// {16, 0, 0} becomes the expected case. When this happens
  3227  		// we should just combine this check with the following one.
  3228  		expected1 := []int64{16, 0, 5}
  3229  		expected2 := []int64{16, 0, 0}
  3230  		if !reflect.DeepEqual(expected1, actual) && !reflect.DeepEqual(expected2, actual) {
  3231  			return errors.Errorf("expected %v or %v, got %v", expected1, expected2, actual)
  3232  		}
  3233  		return nil
  3234  	})
  3235  
  3236  	// Run garbage collection on node 2. The lack of an active lease
  3237  	// will cause GC to do a consistent range lookup, where it
  3238  	// will see that the range has been moved and delete the old
  3239  	// replica.
  3240  	mtc.stores[2].SetReplicaGCQueueActive(true)
  3241  	mtc.advanceClock(context.Background())
  3242  	mtc.manualClock.Increment(int64(
  3243  		kvserver.ReplicaGCQueueInactivityThreshold) + 1)
  3244  	mtc.stores[2].MustForceReplicaGCScanAndProcess()
  3245  	mtc.waitForValues(roachpb.Key("a"), []int64{16, 0, 0})
  3246  
  3247  	// Now that the group has been GC'd, the goroutine that was
  3248  	// attempting to write has finished (with an error).
  3249  	finishWG.Wait()
  3250  }
  3251  
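        // errorChannelTestHandler is a RaftMessageHandler that funnels the error
        // responses it receives into the underlying channel; the tests below use
        // it only to observe responses, so the request and snapshot methods are
        // left unimplemented.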
  3252  type errorChannelTestHandler chan *roachpb.Error
  3253  
  3254  func (errorChannelTestHandler) HandleRaftRequest(
  3255  	_ context.Context, _ *kvserver.RaftMessageRequest, _ kvserver.RaftMessageResponseStream,
  3256  ) *roachpb.Error {
  3257  	panic("unimplemented")
  3258  }
  3259  
  3260  func (d errorChannelTestHandler) HandleRaftResponse(
  3261  	ctx context.Context, resp *kvserver.RaftMessageResponse,
  3262  ) error {
  3263  	switch val := resp.Union.GetValue().(type) {
  3264  	case *roachpb.Error:
  3265  		d <- val
  3266  	default:
  3267  		log.Fatalf(ctx, "unexpected response type %T", val)
  3268  	}
  3269  	return nil
  3270  }
  3271  
  3272  func (errorChannelTestHandler) HandleSnapshot(
  3273  	_ *kvserver.SnapshotRequest_Header, _ kvserver.SnapshotResponseStream,
  3274  ) error {
  3275  	panic("unimplemented")
  3276  }
  3277  
  3278  // This test simulates a scenario where one replica has been removed from the
  3279  // range's Raft group but it is unaware of the fact. We check that this replica
  3280  // coming back from the dead cannot cause elections.
  3281  func TestReplicateRemovedNodeDisruptiveElection(t *testing.T) {
  3282  	defer leaktest.AfterTest(t)()
  3283  
  3284  	mtc := &multiTestContext{
  3285  		// This test was written before the multiTestContext started creating many
  3286  		// system ranges at startup, and hasn't been updated to take that into
  3287  		// account.
  3288  		startWithSingleRange: true,
  3289  	}
  3290  	defer mtc.Stop()
  3291  	mtc.Start(t, 4)
  3292  
  3293  	// Move the first range from the first node to the other three.
  3294  	const rangeID = roachpb.RangeID(1)
  3295  	mtc.replicateRange(rangeID, 1, 2, 3)
  3296  	mtc.transferLease(context.Background(), rangeID, 0, 1)
  3297  	mtc.unreplicateRange(rangeID, 0)
  3298  
  3299  	// Ensure that we have a stable lease and raft leader so we can tell if the
  3300  	// removed node causes a disruption. This is a three-step process.
  3301  
  3302  	// 1. Write on the second node, to ensure that a lease has been
  3303  	// established after the first node's removal.
  3304  	key := roachpb.Key("a")
  3305  	value := int64(5)
  3306  	incArgs := incrementArgs(key, value)
  3307  	if _, err := kv.SendWrapped(context.Background(), mtc.distSenders[1], incArgs); err != nil {
  3308  		t.Fatal(err)
  3309  	}
  3310  
  3311  	// 2. Wait for all nodes to process the increment (and therefore the
  3312  	// new lease).
  3313  	mtc.waitForValues(key, []int64{0, value, value, value})
  3314  
  3315  	// 3. Wait for the lease holder to obtain raft leadership too.
  3316  	testutils.SucceedsSoon(t, func() error {
  3317  		req := &roachpb.LeaseInfoRequest{
  3318  			RequestHeader: roachpb.RequestHeader{
  3319  				Key: roachpb.KeyMin,
  3320  			},
  3321  		}
  3322  		reply, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[1], req)
  3323  		if pErr != nil {
  3324  			return pErr.GoError()
  3325  		}
  3326  		leaseReplica := reply.(*roachpb.LeaseInfoResponse).Lease.Replica.ReplicaID
  3327  		leadReplica := roachpb.ReplicaID(mtc.stores[1].RaftStatus(rangeID).Lead)
  3328  		if leaseReplica != leadReplica {
  3329  			return errors.Errorf("leaseReplica %s does not match leadReplica %s",
  3330  				leaseReplica, leadReplica)
  3331  		}
  3332  
  3333  		return nil
  3334  	})
  3335  
  3336  	// Save the current term, which is the latest among the live stores.
  3337  	findTerm := func() uint64 {
  3338  		var term uint64
  3339  		for i := 1; i < 4; i++ {
  3340  			s := mtc.stores[i].RaftStatus(rangeID)
  3341  			if s.Term > term {
  3342  				term = s.Term
  3343  			}
  3344  		}
  3345  		return term
  3346  	}
  3347  	term := findTerm()
  3348  	if term == 0 {
  3349  		t.Fatalf("expected non-zero term")
  3350  	}
  3351  
  3352  	// replica0 is the one that has been removed; replica1 is a current
  3353  	// member of the group.
  3354  	replica0 := roachpb.ReplicaDescriptor{
  3355  		ReplicaID: roachpb.ReplicaID(mtc.stores[0].StoreID()),
  3356  		NodeID:    roachpb.NodeID(mtc.stores[0].StoreID()),
  3357  		StoreID:   mtc.stores[0].StoreID(),
  3358  	}
  3359  	replica1 := roachpb.ReplicaDescriptor{
  3360  		ReplicaID: roachpb.ReplicaID(mtc.stores[1].StoreID()),
  3361  		NodeID:    roachpb.NodeID(mtc.stores[1].StoreID()),
  3362  		StoreID:   mtc.stores[1].StoreID(),
  3363  	}
  3364  
  3365  	// Create a new transport for store 0 so that we can intercept the responses.
  3366  	// Error responses are passed back along the same grpc stream as the request
  3367  	// so it's ok that there are two (this one and the one actually used by the
  3368  	// store).
  3369  	transport0 := kvserver.NewRaftTransport(log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer},
  3370  		cluster.MakeTestingClusterSettings(),
  3371  		nodedialer.New(mtc.rpcContext, gossip.AddressResolver(mtc.gossips[0])),
  3372  		nil, /* grpcServer */
  3373  		mtc.transportStopper,
  3374  	)
  3375  	errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1))
  3376  	transport0.Listen(mtc.stores[0].StoreID(), errChan)
  3377  
  3378  	// Simulate the removed node asking to trigger an election. Try and try again
  3379  	// until we're reasonably sure the message was sent.
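        	// NB: the vote request carries term+1, i.e. the removed replica
        	// campaigns at a term higher than the leader's; absent protection this
        	// would force an election and bump the term, which is exactly what we
        	// check against below.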
  3380  	for !transport0.SendAsync(&kvserver.RaftMessageRequest{
  3381  		RangeID:     rangeID,
  3382  		ToReplica:   replica1,
  3383  		FromReplica: replica0,
  3384  		Message: raftpb.Message{
  3385  			From: uint64(replica0.ReplicaID),
  3386  			To:   uint64(replica1.ReplicaID),
  3387  			Type: raftpb.MsgVote,
  3388  			Term: term + 1,
  3389  		},
  3390  	}, rpc.DefaultClass) {
  3391  	}
  3392  
  3393  	// The receiver of this message (i.e. replica1) should return an error telling
  3394  	// the sender that it's no longer part of the group.
  3395  	select {
  3396  	case pErr := <-errChan:
  3397  		switch pErr.GetDetail().(type) {
  3398  		case *roachpb.ReplicaTooOldError:
  3399  		default:
  3400  			t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr)
  3401  		}
  3402  	case <-time.After(45 * time.Second):
  3403  		t.Fatal("did not get expected ReplicaTooOldError error")
  3404  	}
  3405  
  3406  	// The message should have been discarded without triggering an
  3407  	// election or changing the term.
  3408  	newTerm := findTerm()
  3409  	if term != newTerm {
  3410  		t.Errorf("expected term to be constant, but changed from %v to %v", term, newTerm)
  3411  	}
  3412  }
  3413  
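        // TestReplicaTooOldGC verifies that a replica removed while its node was
        // down is garbage collected after the node restarts: once it contacts the
        // other replicas, it learns via ReplicaTooOldError that it is no longer a
        // member and destroys itself.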
  3414  func TestReplicaTooOldGC(t *testing.T) {
  3415  	defer leaktest.AfterTest(t)()
  3416  
  3417  	sc := kvserver.TestStoreConfig(nil)
  3418  	sc.TestingKnobs.DisableScanner = true
  3419  	mtc := &multiTestContext{
  3420  		storeConfig: &sc,
  3421  		// This test was written before the multiTestContext started creating many
  3422  		// system ranges at startup, and hasn't been updated to take that into
  3423  		// account.
  3424  		startWithSingleRange: true,
  3425  	}
  3426  	defer mtc.Stop()
  3427  	mtc.Start(t, 4)
  3428  
  3429  	// Replicate the first range onto all of the nodes.
  3430  	const rangeID = 1
  3431  	mtc.replicateRange(rangeID, 1, 2, 3)
  3432  
  3433  	// Put some data in the range so we'll have something to test for.
  3434  	incArgs := incrementArgs([]byte("a"), 5)
  3435  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3436  		t.Fatal(err)
  3437  	}
  3438  	// Wait for all nodes to catch up.
  3439  	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5, 5})
  3440  
  3441  	// Verify store 3 has the replica.
  3442  	if _, err := mtc.stores[3].GetReplica(rangeID); err != nil {
  3443  		t.Fatal(err)
  3444  	}
  3445  
  3446  	// Stop node 3; while it is down remove the range from it. Since the node is
  3447  	// down it won't see the removal and won't clean up its replica.
  3448  	mtc.stopStore(3)
  3449  	mtc.unreplicateRange(rangeID, 3)
  3450  
  3451  	// Perform another write.
  3452  	incArgs = incrementArgs([]byte("a"), 11)
  3453  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3454  		t.Fatal(err)
  3455  	}
  3456  	mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 16, 5})
  3457  
  3458  	// Wait for a bunch of raft ticks in order to flush any heartbeats through
  3459  	// the system. In particular, a coalesced heartbeat containing a quiesce
  3460  	// message could have been sent before the node was removed from the range
  3461  	// but arrive only after the node restarted.
  3462  	ticks := mtc.stores[0].Metrics().RaftTicks.Count
  3463  	for targetTicks := ticks() + 5; ticks() < targetTicks; {
  3464  		time.Sleep(time.Millisecond)
  3465  	}
  3466  
  3467  	// Restart node 3. The removed replica will start talking to the other
  3468  	// replicas and determine it needs to be GC'd.
  3469  	mtc.restartStore(3)
  3470  
  3471  	// Because we lazily initialize Raft groups, we have to force the Raft group
  3472  	// to get created in order to get the replica talking to the other replicas.
  3473  	mtc.stores[3].EnqueueRaftUpdateCheck(rangeID)
  3474  
  3475  	testutils.SucceedsSoon(t, func() error {
  3476  		replica, err := mtc.stores[3].GetReplica(rangeID)
  3477  		if err != nil {
  3478  			if errors.HasType(err, (*roachpb.RangeNotFoundError)(nil)) {
  3479  				return nil
  3480  			}
  3481  			return err
  3482  		} else if replica != nil {
  3483  			// Make sure the replica is unquiesced so that it will tick and
  3484  			// contact the leader to discover it's no longer part of the range.
  3485  			replica.UnquiesceAndWakeLeader()
  3486  		}
  3487  		return errors.Errorf("found %s, waiting for it to be GC'd", replica)
  3488  	})
  3489  }
  3490  
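        // TestReplicaLazyLoad verifies that a restarted store does not eagerly
        // instantiate the raft group of a quiescent replica.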
  3491  func TestReplicaLazyLoad(t *testing.T) {
  3492  	defer leaktest.AfterTest(t)()
  3493  
  3494  	sc := kvserver.TestStoreConfig(nil)
  3495  	sc.RaftTickInterval = 10 * time.Millisecond // safe because there is only a single node
  3496  	sc.TestingKnobs.DisableScanner = true
  3497  	sc.TestingKnobs.DisablePeriodicGossips = true
  3498  	sc.TestingKnobs.DisableMergeQueue = true
  3499  	mtc := &multiTestContext{
  3500  		storeConfig: &sc,
  3501  		// This test was written before the multiTestContext started creating many
  3502  		// system ranges at startup, and hasn't been updated to take that into
  3503  		// account.
  3504  		startWithSingleRange: true,
  3505  	}
  3506  	defer mtc.Stop()
  3507  	mtc.Start(t, 1)
  3508  
  3509  	// Split so we can rely on RHS range being quiescent after a restart.
  3510  	// We use UserTableDataMin to avoid having the range activated to
  3511  	// gossip system table data.
  3512  	splitKey := keys.UserTableDataMin
  3513  	splitArgs := adminSplitArgs(splitKey)
  3514  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
  3515  		t.Fatal(err)
  3516  	}
  3517  
  3518  	mtc.stopStore(0)
  3519  	mtc.restartStore(0)
  3520  
  3521  	// Wait for a bunch of raft ticks.
  3522  	ticks := mtc.stores[0].Metrics().RaftTicks.Count
  3523  	for targetTicks := ticks() + 3; ticks() < targetTicks; {
  3524  		time.Sleep(time.Millisecond)
  3525  	}
  3526  
  3527  	splitKeyAddr, err := keys.Addr(splitKey)
  3528  	if err != nil {
  3529  		t.Fatal(err)
  3530  	}
  3531  
  3532  	replica := mtc.stores[0].LookupReplica(splitKeyAddr)
  3533  	if replica == nil {
  3534  		t.Fatalf("lookup replica at key %q returned nil", splitKey)
  3535  	}
  3536  	if replica.RaftStatus() != nil {
  3537  		t.Fatalf("expected replica Raft group to be uninitialized")
  3538  	}
  3539  }
  3540  
  3541  func TestReplicateReAddAfterDown(t *testing.T) {
  3542  	defer leaktest.AfterTest(t)()
  3543  
  3544  	mtc := &multiTestContext{
  3545  		// This test was written before the multiTestContext started creating many
  3546  		// system ranges at startup, and hasn't been updated to take that into
  3547  		// account.
  3548  		startWithSingleRange: true,
  3549  	}
  3550  	defer mtc.Stop()
  3551  	mtc.Start(t, 3)
  3552  
  3553  	downedStoreIdx := 2
  3554  
  3555  	// First put the range on all three nodes.
  3556  	raftID := roachpb.RangeID(1)
  3557  	mtc.replicateRange(raftID, 1, 2)
  3558  
  3559  	// Put some data in the range so we'll have something to test for.
  3560  	incArgs := incrementArgs([]byte("a"), 5)
  3561  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3562  		t.Fatal(err)
  3563  	}
  3564  
  3565  	// Wait for all nodes to catch up.
  3566  	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})
  3567  
  3568  	// Stop node 2; while it is down remove the range from it. Since the node is
  3569  	// down it won't see the removal and won't clean up its replica.
  3570  	mtc.stopStore(downedStoreIdx)
  3571  	mtc.unreplicateRange(raftID, 2)
  3572  
  3573  	// Perform another write.
  3574  	incArgs = incrementArgs([]byte("a"), 11)
  3575  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3576  		t.Fatal(err)
  3577  	}
  3578  	mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 5})
  3579  
  3580  	// Bring it back up and re-add the range. There is a race when the
  3581  	// store applies its removal and re-addition back to back: the
  3582  	// replica may or may not have (asynchronously) garbage collected
  3583  	// its data in between. Whether the existing data is reused or the
  3584  	// replica gets recreated, the replica ID is changed by this
  3585  	// process. An ill-timed GC has been known to cause bugs including
  3586  	// https://github.com/cockroachdb/cockroach/issues/2873.
  3587  	mtc.restartStore(downedStoreIdx)
  3588  	mtc.replicateRange(raftID, downedStoreIdx)
  3589  
  3590  	// The range should be synced back up.
  3591  	mtc.waitForValues(roachpb.Key("a"), []int64{16, 16, 16})
  3592  }
  3593  
  3594  // TestLeaseHolderRemoveSelf verifies that a lease holder cannot remove itself
  3595  // without encountering an error.
  3596  func TestLeaseHolderRemoveSelf(t *testing.T) {
  3597  	defer leaktest.AfterTest(t)()
  3598  
  3599  	mtc := &multiTestContext{}
  3600  	defer mtc.Stop()
  3601  	mtc.Start(t, 2)
  3602  
  3603  	leaseHolder := mtc.stores[0]
  3604  
  3605  	raftID := roachpb.RangeID(1)
  3606  	mtc.replicateRange(raftID, 1)
  3607  
  3608  	// Attempt to remove the replica from first store.
  3609  	expectedErr := "invalid ChangeReplicasTrigger"
  3610  	if err := mtc.unreplicateRangeNonFatal(raftID, 0); !testutils.IsError(err, expectedErr) {
  3611  		t.Fatalf("expected %q error trying to remove leaseholder replica; got %v", expectedErr, err)
  3612  	}
  3613  
  3614  	// Expect that we can still successfully do a get on the range.
  3615  	getArgs := getArgs([]byte("a"))
  3616  	_, pErr := kv.SendWrappedWith(context.Background(), leaseHolder.TestSender(), roachpb.Header{}, getArgs)
  3617  	if pErr != nil {
  3618  		t.Fatal(pErr)
  3619  	}
  3620  }
  3621  
  3622  // TestRemovedReplicaError verifies that a replica that has been removed from a
  3623  // range returns a RangeNotFoundError if it receives a request for that range
  3624  // (not RaftGroupDeletedError, and even before the ReplicaGCQueue has run).
  3625  func TestRemovedReplicaError(t *testing.T) {
  3626  	defer leaktest.AfterTest(t)()
  3627  
  3628  	mtc := &multiTestContext{
  3629  		// This test was written before the multiTestContext started creating many
  3630  		// system ranges at startup, and hasn't been updated to take that into
  3631  		// account.
  3632  		startWithSingleRange: true,
  3633  	}
  3634  	defer mtc.Stop()
  3635  	mtc.Start(t, 2)
  3636  
  3637  	// Disable the replica GC queues. This verifies that the replica is
  3638  	// considered removed even before the gc queue has run, and also
  3639  	// helps avoid a deadlock at shutdown.
  3640  	mtc.stores[0].SetReplicaGCQueueActive(false)
  3641  
  3642  	raftID := roachpb.RangeID(1)
  3643  	mtc.replicateRange(raftID, 1)
  3644  	mtc.transferLease(context.Background(), raftID, 0, 1)
  3645  	mtc.unreplicateRange(raftID, 0)
  3646  
  3647  	mtc.manualClock.Increment(mtc.storeConfig.LeaseExpiration())
  3648  
  3649  	// Expect to get a RangeNotFoundError. We have to allow for ambiguous result
  3650  	// errors to avoid the occasional test flake. Since we use demotions to remove
  3651  	// voters, the actual removal sees a learner, and so the learner is not in
  3652  	// the commit quorum for the removal itself. That is to say, we will only
  3653  	// start seeing the RangeNotFoundError after a little bit of time has passed.
  3654  	getArgs := getArgs([]byte("a"))
  3655  	testutils.SucceedsSoon(t, func() error {
  3656  		_, pErr := kv.SendWrappedWith(context.Background(), mtc.stores[0].TestSender(), roachpb.Header{}, getArgs)
  3657  		switch pErr.GetDetail().(type) {
  3658  		case *roachpb.AmbiguousResultError:
  3659  			return pErr.GoError()
  3660  		case *roachpb.NotLeaseHolderError:
  3661  			return pErr.GoError()
  3662  		case *roachpb.RangeNotFoundError:
  3663  			return nil
  3664  		default:
  3665  		}
  3666  		t.Fatal(pErr)
  3667  		return errors.New("unreachable")
  3668  	})
  3669  }
  3670  
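        // TestTransferRaftLeadership verifies that when the range lease moves to
        // a different store, raft leadership is transferred to follow it.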
  3671  func TestTransferRaftLeadership(t *testing.T) {
  3672  	defer leaktest.AfterTest(t)()
  3673  
  3674  	const numStores = 3
  3675  	sc := kvserver.TestStoreConfig(nil)
  3676  	sc.TestingKnobs.DisableMergeQueue = true
  3677  	// Suppress timeout-based elections (which also includes a previous
  3678  	// leader stepping down due to a quorum check). Running tests on a
  3679  	// heavily loaded CPU is enough to reach the raft election timeout
  3680  	// and cause leadership to change hands in ways this test doesn't
  3681  	// expect.
  3682  	sc.RaftElectionTimeoutTicks = 100000
  3683  	// This test can rapidly advance the clock via mtc.advanceClock(),
  3684  	// which could lead the replication queue to consider a store dead
  3685  	// and remove a replica in the middle of the test. Disable the
  3686  	// replication queue; we'll control replication manually.
  3687  	sc.TestingKnobs.DisableReplicateQueue = true
  3688  	sc.Clock = nil // manual clock
  3689  	mtc := &multiTestContext{
  3690  		storeConfig: &sc,
  3691  		// This test was written before the multiTestContext started creating many
  3692  		// system ranges at startup, and hasn't been updated to take that into
  3693  		// account.
  3694  		startWithSingleRange: true,
  3695  	}
  3696  	defer mtc.Stop()
  3697  	mtc.Start(t, numStores)
  3698  	store0 := mtc.Store(0)
  3699  	store1 := mtc.Store(1)
  3700  
  3701  	key := roachpb.Key("a")
  3702  
  3703  	{
  3704  		// Split off a range to avoid interacting with the initial splits.
  3705  		splitArgs := adminSplitArgs(key)
  3706  		if _, err := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); err != nil {
  3707  			t.Fatal(err)
  3708  		}
  3709  	}
  3710  
  3711  	repl0 := store0.LookupReplica(keys.MustAddr(key))
  3712  	if repl0 == nil {
  3713  		t.Fatalf("no replica found for key '%s'", key)
  3714  	}
  3715  	rd0, err := repl0.GetReplicaDescriptor()
  3716  	if err != nil {
  3717  		t.Fatal(err)
  3718  	}
  3719  	mtc.replicateRange(repl0.RangeID, 1, 2)
  3720  
  3721  	repl1 := store1.LookupReplica(keys.MustAddr(key))
  3722  	if repl1 == nil {
  3723  		t.Fatalf("no replica found for key '%s'", key)
  3724  	}
  3725  	rd1, err := repl1.GetReplicaDescriptor()
  3726  	if err != nil {
  3727  		t.Fatal(err)
  3728  	}
  3729  
  3730  	getArgs := getArgs([]byte("a"))
  3731  	if _, pErr := kv.SendWrappedWith(
  3732  		context.Background(), store0, roachpb.Header{RangeID: repl0.RangeID}, getArgs,
  3733  	); pErr != nil {
  3734  		t.Fatalf("expected success, got %v", pErr)
  3735  	}
  3736  
  3737  	status := repl0.RaftStatus()
  3738  	if status == nil || status.Lead != uint64(rd0.ReplicaID) {
  3739  		t.Fatalf("raft leader should be %d, but got status %+v", rd0.ReplicaID, status)
  3740  	}
  3741  
  3742  	// Force a read on Store 2 to request a new lease. Other moving parts in
  3743  	// the system could have requested another lease as well, so we
  3744  	// expire-request in a loop until we get our foot in the door.
  3745  	origCount0 := store0.Metrics().RangeRaftLeaderTransfers.Count()
  3746  	for {
  3747  		mtc.advanceClock(context.Background())
  3748  		if _, pErr := kv.SendWrappedWith(
  3749  			context.Background(), store1, roachpb.Header{RangeID: repl0.RangeID}, getArgs,
  3750  		); pErr == nil {
  3751  			break
  3752  		} else {
  3753  			switch pErr.GetDetail().(type) {
  3754  			case *roachpb.NotLeaseHolderError, *roachpb.RangeNotFoundError:
  3755  			default:
  3756  				t.Fatal(pErr)
  3757  			}
  3758  		}
  3759  	}
  3760  	// Verify lease is transferred.
  3761  	testutils.SucceedsSoon(t, func() error {
  3762  		if a, e := repl0.RaftStatus().Lead, uint64(rd1.ReplicaID); a != e {
  3763  			return errors.Errorf("expected raft leader be %d; got %d", e, a)
  3764  		}
  3765  		if a, e := store0.Metrics().RangeRaftLeaderTransfers.Count()-origCount0, int64(1); a < e {
  3766  			return errors.Errorf("expected raft leader transfer count >= %d; got %d", e, a)
  3767  		}
  3768  		return nil
  3769  	})
  3770  }
  3771  
  3772  // Test that a single blocked replica does not block other replicas.
  3773  func TestRaftBlockedReplica(t *testing.T) {
  3774  	defer leaktest.AfterTest(t)()
  3775  
  3776  	sc := kvserver.TestStoreConfig(nil)
  3777  	sc.TestingKnobs.DisableMergeQueue = true
  3778  	sc.TestingKnobs.DisableScanner = true
  3779  	mtc := &multiTestContext{
  3780  		storeConfig: &sc,
  3781  		// This test was written before the multiTestContext started creating many
  3782  		// system ranges at startup, and hasn't been updated to take that into
  3783  		// account.
  3784  		startWithSingleRange: true,
  3785  	}
  3786  	defer mtc.Stop()
  3787  	mtc.Start(t, 3)
  3788  
  3789  	// Create 2 ranges by splitting range 1.
  3790  	splitArgs := adminSplitArgs(roachpb.Key("b"))
  3791  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
  3792  		t.Fatal(err)
  3793  	}
  3794  
  3795  	// Replicate range 1 to all 3 nodes. This ensures that raft traffic crosses the network.
  3796  	mtc.replicateRange(1, 1, 2)
  3797  
  3798  	// Lock range 2 for raft processing.
  3799  	rep, err := mtc.stores[0].GetReplica(2)
  3800  	if err != nil {
  3801  		t.Fatal(err)
  3802  	}
  3803  
  3804  	// NB: We perform the actual locking on a different goroutine in order to
  3805  	// workaround a spurious inconsistent lock order warning when running with
  3806  	// TAGS=deadlock. The issue is that we're grabbing Replica 2's raftMu and
  3807  	// then later Replica 1's from the same goroutine due to the direct calling
  3808  	// of client.SendWrapped down the callstack into the Replica code (via the
  3809  	// local RPC optimization).
  3810  	var wg sync.WaitGroup
  3811  	wg.Add(1)
  3812  	go func() {
  3813  		rep.RaftLock()
  3814  		wg.Done()
  3815  	}()
  3816  	wg.Wait()
  3817  	defer rep.RaftUnlock()
  3818  
  3819  	// Verify that we're still ticking the non-blocked replica.
  3820  	ticks := mtc.stores[0].Metrics().RaftTicks.Count
  3821  	for targetTicks := ticks() + 3; ticks() < targetTicks; {
  3822  		time.Sleep(time.Millisecond)
  3823  	}
  3824  
  3825  	// Verify we can still perform operations on the non-blocked replica.
  3826  	incArgs := incrementArgs([]byte("a"), 5)
  3827  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); err != nil {
  3828  		t.Fatal(err)
  3829  	}
  3830  	mtc.waitForValues(roachpb.Key("a"), []int64{5, 5, 5})
  3831  }
  3832  
  3833  // Test that ranges quiesce and if a follower unquiesces the leader is woken
  3834  // up.
  3835  func TestRangeQuiescence(t *testing.T) {
  3836  	defer leaktest.AfterTest(t)()
  3837  
  3838  	sc := kvserver.TestStoreConfig(nil)
  3839  	sc.TestingKnobs.DisableScanner = true
  3840  	sc.TestingKnobs.DisablePeriodicGossips = true
  3841  	mtc := &multiTestContext{
  3842  		storeConfig: &sc,
  3843  		// This test was written before the multiTestContext started creating many
  3844  		// system ranges at startup, and hasn't been updated to take that into
  3845  		// account.
  3846  		startWithSingleRange: true,
  3847  	}
  3848  	defer mtc.Stop()
  3849  	mtc.Start(t, 3)
  3850  
  3851  	pauseNodeLivenessHeartbeats(mtc, true)
  3852  
  3853  	// Replicate range 1 to all 3 nodes.
  3854  	const rangeID = roachpb.RangeID(1)
  3855  	mtc.replicateRange(rangeID, 1, 2)
  3856  
  3857  	waitForQuiescence := func(rangeID roachpb.RangeID) {
  3858  		testutils.SucceedsSoon(t, func() error {
  3859  			for _, s := range mtc.stores {
  3860  				rep, err := s.GetReplica(rangeID)
  3861  				if err != nil {
  3862  					t.Fatal(err)
  3863  				}
  3864  				if !rep.IsQuiescent() {
  3865  					return errors.Errorf("%s not quiescent", rep)
  3866  				}
  3867  			}
  3868  			return nil
  3869  		})
  3870  	}
  3871  
  3872  	// Wait for the range to quiesce.
  3873  	waitForQuiescence(rangeID)
  3874  
  3875  	// Find the leader replica.
  3876  	var rep *kvserver.Replica
  3877  	var leaderIdx int
  3878  	for leaderIdx = range mtc.stores {
  3879  		var err error
  3880  		if rep, err = mtc.stores[leaderIdx].GetReplica(1); err != nil {
  3881  			t.Fatal(err)
  3882  		}
  3883  		if rep.RaftStatus().SoftState.RaftState == raft.StateLeader {
  3884  			break
  3885  		}
  3886  	}
  3887  
  3888  	// Unquiesce a follower range; this should "wake the leader" and not result
  3889  	// in an election.
  3890  	followerIdx := (leaderIdx + 1) % len(mtc.stores)
  3891  	mtc.stores[followerIdx].EnqueueRaftUpdateCheck(rangeID)
  3892  
  3893  	// Wait for a bunch of ticks to occur which will allow the follower time to
  3894  	// campaign.
  3895  	ticks := mtc.stores[followerIdx].Metrics().RaftTicks.Count
  3896  	for targetTicks := ticks() + int64(2*sc.RaftElectionTimeoutTicks); ticks() < targetTicks; {
  3897  		time.Sleep(time.Millisecond)
  3898  	}
  3899  
  3900  	// Wait for the range to quiesce again.
  3901  	waitForQuiescence(rangeID)
  3902  
  3903  	// The leadership should not have changed.
  3904  	if state := rep.RaftStatus().SoftState.RaftState; state != raft.StateLeader {
  3905  		t.Fatalf("%s should be the leader: %s", rep, state)
  3906  	}
  3907  }
  3908  
  3909  // TestInitRaftGroupOnRequest verifies that an uninitialized Raft group
  3910  // is initialized if a request is received, even if the current range
  3911  // lease points to a different replica.
  3912  func TestInitRaftGroupOnRequest(t *testing.T) {
  3913  	defer leaktest.AfterTest(t)()
  3914  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  3915  	storeCfg.TestingKnobs.DisableMergeQueue = true
  3916  	// Don't timeout range leases (see the relation between
  3917  	// RaftElectionTimeoutTicks and RangeLeaseActiveDuration). This test expects
  3918  	// the replica that holds the lease before the cluster is restarted to
  3919  	// continue holding it after the restart, regardless of how long the restart
  3920  	// takes.
  3921  	storeCfg.RaftElectionTimeoutTicks = 1000000
  3922  	// Disable async intent resolution. This can lead to flakiness in the test
  3923  	// because it allows for the intents written by the split transaction to be
  3924  	// resolved at any time, including after the nodes are restarted. The intent
  3925  	// resolution on the RHS's local range descriptor can both wake up the RHS
  3926  	// range's Raft group and result in the wrong replica acquiring the lease.
  3927  	storeCfg.TestingKnobs.IntentResolverKnobs.DisableAsyncIntentResolution = true
  3928  	mtc := &multiTestContext{
  3929  		storeConfig: &storeCfg,
  3930  		// TODO(andrei): This test was written before multiTestContexts started with
  3931  		// multiple ranges, and for some unknown reason is flaky if we're not
  3932  		// forcing it to start with a single range, although it doesn't look like it
  3933  		// should be.
  3934  		startWithSingleRange: true,
  3935  	}
  3936  	defer mtc.Stop()
  3937  	mtc.Start(t, 2)
  3938  
  3939  	// Split so we can rely on RHS range being quiescent after a restart.
  3940  	// We use UserTableDataMin to avoid having the range activated to
  3941  	// gossip system table data.
  3942  	splitKey := keys.UserTableDataMin
  3943  	splitArgs := adminSplitArgs(splitKey)
  3944  	if _, err := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); err != nil {
  3945  		t.Fatal(err)
  3946  	}
  3947  
  3948  	repl := mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
  3949  	if repl == nil {
  3950  		t.Fatal("replica should not be nil for RHS range")
  3951  	}
  3952  	mtc.replicateRange(repl.RangeID, 1)
  3953  
  3954  	// Find the leaseholder and then restart the test context.
  3955  	lease, _ := repl.GetLease()
  3956  	mtc.restart()
  3957  
  3958  	// Get replica from the store which isn't the leaseholder.
  3959  	// NOTE: StoreID is 1-indexed and storeIdx is 0-indexed, so despite what
  3960  	// this might look like, this is grabbing the replica without the lease.
  3961  	storeIdx := int(lease.Replica.StoreID) % len(mtc.stores)
  3962  	if repl = mtc.stores[storeIdx].LookupReplica(roachpb.RKey(splitKey)); repl == nil {
  3963  		t.Fatal("replica should not be nil for RHS range")
  3964  	}
  3965  
  3966  	// TODO(spencer): Raft messages seem to turn up
  3967  	// occasionally on restart, which initialize the replica, so
  3968  	// this is not a test failure. Not sure how to work around this
  3969  	// problem.
  3970  	// Verify the raft group isn't initialized yet.
  3971  	if repl.IsRaftGroupInitialized() {
  3972  		log.Errorf(context.Background(), "expected raft group to be uninitialized")
  3973  	}
  3974  
  3975  	// Send an increment and verify that initializes the Raft group.
  3976  	incArgs := incrementArgs(splitKey, 1)
  3977  	_, pErr := kv.SendWrappedWith(
  3978  		context.Background(), mtc.stores[storeIdx], roachpb.Header{RangeID: repl.RangeID}, incArgs,
  3979  	)
  3980  	if _, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError); !ok {
  3981  		t.Fatalf("expected NotLeaseHolderError; got %s", pErr)
  3982  	}
  3983  	if !repl.IsRaftGroupInitialized() {
  3984  		t.Fatal("expected raft group to be initialized")
  3985  	}
  3986  }
  3987  
  3988  // TestFailedConfChange verifies correct behavior after a configuration change
  3989  // experiences an error when applying EndTxn. Specifically, it verifies that
  3990  // https://github.com/cockroachdb/cockroach/issues/13506 has been fixed.
  3991  func TestFailedConfChange(t *testing.T) {
  3992  	defer leaktest.AfterTest(t)()
  3993  
  3994  	// Trigger errors at apply time so they happen on both leaders and
  3995  	// followers.
  3996  	var filterActive int32
  3997  	sc := kvserver.TestStoreConfig(nil)
  3998  	sc.TestingKnobs.TestingApplyFilter = func(filterArgs kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) {
  3999  		if atomic.LoadInt32(&filterActive) == 1 && filterArgs.ChangeReplicas != nil {
  4000  			return 0, roachpb.NewErrorf("boom")
  4001  		}
  4002  		return 0, nil
  4003  	}
  4004  	mtc := &multiTestContext{
  4005  		storeConfig: &sc,
  4006  	}
  4007  	defer mtc.Stop()
  4008  	mtc.Start(t, 3)
  4009  	ctx := context.Background()
  4010  
  4011  	// Replicate the range (successfully) to the second node.
  4012  	const rangeID = roachpb.RangeID(1)
  4013  	mtc.replicateRange(rangeID, 1)
  4014  
  4015  	// Try and fail to replicate it to the third node.
  4016  	atomic.StoreInt32(&filterActive, 1)
  4017  	if err := mtc.replicateRangeNonFatal(rangeID, 2); !testutils.IsError(err, "boom") {
  4018  		t.Fatal(err)
  4019  	}
  4020  
  4021  	// Raft state is only exposed on the leader, so we must transfer
  4022  	// leadership and check the stores one at a time.
  4023  	checkLeaderStore := func(i int) error {
  4024  		store := mtc.stores[i]
  4025  		repl, err := store.GetReplica(rangeID)
  4026  		if err != nil {
  4027  			t.Fatal(err)
  4028  		}
  4029  		if l := len(repl.Desc().InternalReplicas); l != 2 {
  4030  			return errors.Errorf("store %d: expected 2 replicas in descriptor, found %d in %s",
  4031  				i, l, repl.Desc())
  4032  		}
  4033  		status := repl.RaftStatus()
  4034  		if status.RaftState != raft.StateLeader {
  4035  			return errors.Errorf("store %d: expected StateLeader, was %s", i, status.RaftState)
  4036  		}
  4037  		// In issue #13506, the Progress map would be updated as if the
  4038  		// change had succeeded.
  4039  		if l := len(status.Progress); l != 2 {
  4040  			return errors.Errorf("store %d: expected 2 replicas in raft, found %d in %s", i, l, status)
  4041  		}
  4042  		return nil
  4043  	}
  4044  
  4045  	if err := checkLeaderStore(0); err != nil {
  4046  		t.Fatal(err)
  4047  	}
  4048  
  4049  	// Transfer leadership to the second node and wait for it to become leader.
  4050  	mtc.transferLease(ctx, rangeID, 0, 1)
  4051  	testutils.SucceedsSoon(t, func() error {
  4052  		repl, err := mtc.stores[1].GetReplica(rangeID)
  4053  		if err != nil {
  4054  			return err
  4055  		}
  4056  		status := repl.RaftStatus()
  4057  		if status.RaftState != raft.StateLeader {
  4058  			return errors.Errorf("store %d: expected StateLeader, was %s", 1, status.RaftState)
  4059  		}
  4060  		return nil
  4061  	})
  4062  
  4063  	if err := checkLeaderStore(1); err != nil {
  4064  		t.Fatal(err)
  4065  	}
  4066  }
  4067  
  4068  // TestStoreRangeRemovalCompactionSuggestion verifies that if a replica
  4069  // is removed from a store, a compaction suggestion is made to the
  4070  // compactor queue.
  4071  func TestStoreRangeRemovalCompactionSuggestion(t *testing.T) {
  4072  	defer leaktest.AfterTest(t)()
  4073  	sc := kvserver.TestStoreConfig(nil)
  4074  	mtc := &multiTestContext{storeConfig: &sc}
  4075  	defer mtc.Stop()
  4076  	mtc.Start(t, 3)
  4077  
  4078  	const rangeID = roachpb.RangeID(1)
  4079  	mtc.replicateRange(rangeID, 1, 2)
  4080  
  4081  	repl, err := mtc.stores[0].GetReplica(rangeID)
  4082  	if err != nil {
  4083  		t.Fatal(err)
  4084  	}
  4085  	ctx := repl.AnnotateCtx(context.Background())
  4086  
  4087  	deleteStore := mtc.stores[2]
  4088  	chgs := roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, roachpb.ReplicationTarget{
  4089  		NodeID:  deleteStore.Ident.NodeID,
  4090  		StoreID: deleteStore.Ident.StoreID,
  4091  	})
  4092  	if _, err := repl.ChangeReplicas(ctx, repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRebalance, "", chgs); err != nil {
  4093  		t.Fatal(err)
  4094  	}
  4095  
  4096  	testutils.SucceedsSoon(t, func() error {
  4097  		// Function to check compaction metrics indicating a suggestion
  4098  		// was queued or a compaction was processed or skipped.
  4099  		haveCompaction := func(s *kvserver.Store, exp bool) error {
  4100  			queued := s.Compactor().Metrics.BytesQueued.Value()
  4101  			comps := s.Compactor().Metrics.BytesCompacted.Count()
  4102  			skipped := s.Compactor().Metrics.BytesSkipped.Count()
  4103  			if exp != (queued > 0 || comps > 0 || skipped > 0) {
  4104  				return errors.Errorf("%s: expected non-zero compaction metrics? %t; got queued=%d, compactions=%d, skipped=%d",
  4105  					s, exp, queued, comps, skipped)
  4106  			}
  4107  			return nil
  4108  		}
  4109  		// Verify that compaction metrics show non-zero bytes only on the
  4110  		// store from which the replica was removed.
  4111  		for _, s := range mtc.stores {
  4112  			if err := haveCompaction(s, s == deleteStore); err != nil {
  4113  				return err
  4114  			}
  4115  		}
  4116  		return nil
  4117  	})
  4118  }
  4119  
  4120  func TestStoreRangeWaitForApplication(t *testing.T) {
  4121  	defer leaktest.AfterTest(t)()
  4122  
  4123  	var filterRangeIDAtomic int64
  4124  
  4125  	ctx := context.Background()
  4126  	sc := kvserver.TestStoreConfig(nil)
  4127  	sc.TestingKnobs.DisableReplicateQueue = true
  4128  	sc.TestingKnobs.DisableReplicaGCQueue = true
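        	// The filter below blocks every batch addressed to the filtered range
        	// except single-request batches containing a Put. Those Puts are the
        	// only writes this test issues itself, so while the filter is armed
        	// nothing else can advance the range's lease applied index.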
  4129  	sc.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) (retErr *roachpb.Error) {
  4130  		if rangeID := roachpb.RangeID(atomic.LoadInt64(&filterRangeIDAtomic)); rangeID != ba.RangeID {
  4131  			return nil
  4132  		}
  4133  		pErr := roachpb.NewErrorf("blocking %s in this test", ba.Summary())
  4134  		if len(ba.Requests) != 1 {
  4135  			return pErr
  4136  		}
  4137  		_, ok := ba.Requests[0].GetInner().(*roachpb.PutRequest)
  4138  		if !ok {
  4139  			return pErr
  4140  		}
  4141  		return nil
  4142  	}
  4143  	mtc := &multiTestContext{storeConfig: &sc}
  4144  	mtc.Start(t, 3)
  4145  	defer mtc.Stop()
  4146  	store0, store2 := mtc.Store(0), mtc.Store(2)
  4147  	distSender := mtc.distSenders[0]
  4148  
  4149  	// Split off a non-system range so we don't have to account for node liveness
  4150  	// traffic.
  4151  	splitArgs := adminSplitArgs(roachpb.Key("a"))
  4152  	if _, pErr := kv.SendWrapped(ctx, distSender, splitArgs); pErr != nil {
  4153  		t.Fatal(pErr)
  4154  	}
  4155  	rangeID := store0.LookupReplica(roachpb.RKey("a")).RangeID
  4156  	mtc.replicateRange(rangeID, 1, 2)
  4157  
  4158  	repl0, err := store0.GetReplica(rangeID)
  4159  	if err != nil {
  4160  		t.Fatal(err)
  4161  	}
  4162  
  4163  	atomic.StoreInt64(&filterRangeIDAtomic, int64(rangeID))
  4164  
  4165  	leaseIndex0 := repl0.LastAssignedLeaseIndex()
  4166  
  4167  	type target struct {
  4168  		client kvserver.PerReplicaClient
  4169  		header kvserver.StoreRequestHeader
  4170  	}
  4171  
  4172  	var targets []target
  4173  	for _, s := range mtc.stores {
  4174  		conn, err := mtc.nodeDialer.Dial(ctx, s.Ident.NodeID, rpc.DefaultClass)
  4175  		if err != nil {
  4176  			t.Fatal(err)
  4177  		}
  4178  		targets = append(targets, target{
  4179  			client: kvserver.NewPerReplicaClient(conn),
  4180  			header: kvserver.StoreRequestHeader{NodeID: s.Ident.NodeID, StoreID: s.Ident.StoreID},
  4181  		})
  4182  	}
  4183  
  4184  	// Wait for a command that is already applied. The request should return
  4185  	// immediately.
  4186  	for i, target := range targets {
  4187  		_, err := target.client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{
  4188  			StoreRequestHeader: target.header,
  4189  			RangeID:            rangeID,
  4190  			LeaseIndex:         leaseIndex0,
  4191  		})
  4192  		if err != nil {
  4193  			t.Fatalf("%d: %+v", i, err)
  4194  		}
  4195  	}
  4196  
  4197  	const count = 5
  4198  
  4199  	// Wait for a command that is `count` indexes later.
  4200  	var errChs []chan error
  4201  	for _, target := range targets {
  4202  		errCh := make(chan error)
  4203  		errChs = append(errChs, errCh)
  4204  		target := target
  4205  		go func() {
  4206  			_, err := target.client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{
  4207  				StoreRequestHeader: target.header,
  4208  				RangeID:            rangeID,
  4209  				LeaseIndex:         leaseIndex0 + count,
  4210  			})
  4211  			errCh <- err
  4212  		}()
  4213  	}
  4214  
  4215  	// The request should not return while fewer than `count` commands have
  4216  	// been issued.
  4217  	putArgs := putArgs(roachpb.Key("foo"), []byte("bar"))
  4218  	for i := 0; i < count-1; i++ {
  4219  		if _, pErr := kv.SendWrapped(ctx, distSender, putArgs); pErr != nil {
  4220  			t.Fatal(pErr)
  4221  		}
  4222  		// Wait a little bit to increase the likelihood that we observe an invalid
  4223  		// ordering. This is not intended to be foolproof.
  4224  		time.Sleep(10 * time.Millisecond)
  4225  		for j, errCh := range errChs {
  4226  			select {
  4227  			case err := <-errCh:
  4228  				t.Fatalf("%d: WaitForApplication returned early (request: %d, err: %v)", j, i, err)
  4229  			default:
  4230  			}
  4231  		}
  4232  	}
  4233  
  4234  	// Once the `count`th command has been issued, the request should return.
  4235  	if _, pErr := kv.SendWrapped(ctx, distSender, putArgs); pErr != nil {
  4236  		t.Fatal(pErr)
  4237  	}
  4238  	for i, errCh := range errChs {
  4239  		if err := <-errCh; err != nil {
  4240  			t.Fatalf("%d: %+v", i, err)
  4241  		}
  4242  	}
  4243  
  4244  	atomic.StoreInt64(&filterRangeIDAtomic, 0)
  4245  
  4246  	// GC the replica while a request is in progress. The request should return
  4247  	// an error.
  4248  	go func() {
  4249  		_, err := targets[2].client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{
  4250  			StoreRequestHeader: targets[2].header,
  4251  			RangeID:            rangeID,
  4252  			LeaseIndex:         math.MaxUint64,
  4253  		})
  4254  		errChs[2] <- err
  4255  	}()
  4256  	repl2, err := store2.GetReplica(rangeID)
  4257  	if err != nil {
  4258  		t.Fatal(err)
  4259  	}
  4260  	mtc.unreplicateRange(repl2.RangeID, 2)
  4261  	if err := store2.ManualReplicaGC(repl2); err != nil {
  4262  		t.Fatal(err)
  4263  	}
  4264  	if _, err := repl2.IsDestroyed(); err == nil {
  4265  		t.Fatalf("replica was not destroyed after gc on store2")
  4266  	}
  4267  	err = <-errChs[2]
  4268  	if exp := fmt.Sprintf("r%d was not found", rangeID); !testutils.IsError(err, exp) {
  4269  		t.Fatalf("expected %q error, but got %v", exp, err)
  4270  	}
  4271  
  4272  	// Allow the client context to time out while a request is in progress. The
  4273  	// request should return an error.
  4274  	{
  4275  		var cancel context.CancelFunc
  4276  		ctx, cancel = context.WithTimeout(ctx, 50*time.Millisecond)
  4277  		defer cancel()
  4278  		_, err := targets[0].client.WaitForApplication(ctx, &kvserver.WaitForApplicationRequest{
  4279  			StoreRequestHeader: targets[0].header,
  4280  			RangeID:            rangeID,
  4281  			LeaseIndex:         math.MaxUint64,
  4282  		})
  4283  		if exp := "context deadline exceeded"; !testutils.IsError(err, exp) {
  4284  			t.Fatalf("expected %q error, but got %v", exp, err)
  4285  		}
  4286  	}
  4287  }
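        // The fan-out above -- one goroutine per store, each parking a blocking
        // RPC on its own error channel -- is a reusable shape for asserting that
        // a call has not returned yet. A minimal sketch, where blockingCall is a
        // hypothetical stand-in for WaitForApplication:
        //
        //   errCh := make(chan error, 1)
        //   go func() { errCh <- blockingCall(ctx) }()
        //   select {
        //   case err := <-errCh:
        //       t.Fatalf("returned early: %v", err)
        //   default: // still blocked, as expected
        //   }
        //   // ...unblock the call, then:
        //   if err := <-errCh; err != nil {
        //       t.Fatal(err)
        //   }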
  4288  
  4289  func TestStoreWaitForReplicaInit(t *testing.T) {
  4290  	defer leaktest.AfterTest(t)()
  4291  
  4292  	ctx := context.Background()
  4293  	sc := kvserver.TestStoreConfig(nil)
  4294  	mtc := &multiTestContext{
  4295  		storeConfig: &sc,
  4296  		// This test was written before the multiTestContext started creating many
  4297  		// system ranges at startup, and hasn't been updated to take that into
  4298  		// account.
  4299  		startWithSingleRange: true,
  4300  	}
  4301  	mtc.Start(t, 1)
  4302  	defer mtc.Stop()
  4303  	store := mtc.Store(0)
  4304  
  4305  	conn, err := mtc.nodeDialer.Dial(ctx, store.Ident.NodeID, rpc.DefaultClass)
  4306  	if err != nil {
  4307  		t.Fatal(err)
  4308  	}
  4309  	client := kvserver.NewPerReplicaClient(conn)
  4310  	storeHeader := kvserver.StoreRequestHeader{NodeID: store.Ident.NodeID, StoreID: store.Ident.StoreID}
  4311  
  4312  	// Test that WaitForReplicaInit returns successfully if the replica exists.
  4313  	_, err = client.WaitForReplicaInit(ctx, &kvserver.WaitForReplicaInitRequest{
  4314  		StoreRequestHeader: storeHeader,
  4315  		RangeID:            roachpb.RangeID(1),
  4316  	})
  4317  	if err != nil {
  4318  		t.Fatal(err)
  4319  	}
  4320  
  4321  	// Test that WaitForReplicaInit times out if the replica does not exist.
  4322  	{
  4323  		timeoutCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond)
  4324  		defer cancel()
  4325  		_, err = client.WaitForReplicaInit(timeoutCtx, &kvserver.WaitForReplicaInitRequest{
  4326  			StoreRequestHeader: storeHeader,
  4327  			RangeID:            roachpb.RangeID(2),
  4328  		})
  4329  		if exp := "context deadline exceeded"; !testutils.IsError(err, exp) {
  4330  			t.Fatalf("expected %q error, but got %v", exp, err)
  4331  		}
  4332  	}
  4333  
  4334  	// Test that WaitForReplicaInit times out if the replica exists but is not
  4335  	// initialized.
  4336  	{
  4337  		// Constructing a permanently-uninitialized replica is somewhat difficult.
  4338  		// Sending a fake Raft heartbeat for a range ID that the store hasn't seen
  4339  		// before does the trick.
  4340  		var repl42 *kvserver.Replica
  4341  		testutils.SucceedsSoon(t, func() (err error) {
  4342  			// Try several times, as the message may be dropped (see #18355).
  4343  			mtc.transport.SendAsync(&kvserver.RaftMessageRequest{
  4344  				ToReplica: roachpb.ReplicaDescriptor{
  4345  					NodeID:  store.Ident.NodeID,
  4346  					StoreID: store.Ident.StoreID,
  4347  				},
  4348  				Heartbeats: []kvserver.RaftHeartbeat{{RangeID: 42, ToReplicaID: 1}},
  4349  			}, rpc.DefaultClass)
  4350  			repl42, err = store.GetReplica(42)
  4351  			return err
  4352  		})
  4353  		if repl42.IsInitialized() {
  4354  			t.Fatalf("test bug: repl42 is initialized")
  4355  		}
  4356  
  4357  		timeoutCtx, cancel := context.WithTimeout(ctx, 50*time.Millisecond)
  4358  		defer cancel()
  4359  		_, err = client.WaitForReplicaInit(timeoutCtx, &kvserver.WaitForReplicaInitRequest{
  4360  			StoreRequestHeader: storeHeader,
  4361  			RangeID:            roachpb.RangeID(42),
  4362  		})
  4363  		if exp := "context deadline exceeded"; !testutils.IsError(err, exp) {
  4364  			t.Fatalf("expected %q error, but got %v", exp, err)
  4365  		}
  4366  	}
  4367  }
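        // An uninitialized replica, as constructed above, is presumably one that
        // knows its range ID (from the raft heartbeat) but has not yet received
        // a snapshot carrying the range's descriptor and data, which is why
        // WaitForReplicaInit keeps waiting on it.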
  4368  
  4369  // TestTracingDoesNotRaceWithCancelation ensures that the tracing underneath
  4370  // raft does not race with tracing operations which might occur concurrently
  4371  // due to a request cancelation. While this bug existed, this test only
  4372  // uncovered it when run under stress.
  4373  func TestTracingDoesNotRaceWithCancelation(t *testing.T) {
  4374  	defer leaktest.AfterTest(t)()
  4375  
  4376  	sc := kvserver.TestStoreConfig(nil)
  4377  	sc.TestingKnobs.TraceAllRaftEvents = true
  4378  	sc.TestingKnobs.DisableSplitQueue = true
  4379  	sc.TestingKnobs.DisableMergeQueue = true
  4380  	mtc := &multiTestContext{
  4381  		storeConfig: &sc,
  4382  	}
  4383  	mtc.Start(t, 3)
  4384  	defer mtc.Stop()
  4385  
  4386  	db := mtc.Store(0).DB()
  4387  	ctx := context.Background()
  4388  	// Make the transport flaky for the range in question to encourage proposals
  4389  	// to be sent more times and ultimately traced more.
  4390  	ri, err := getRangeInfo(ctx, db, roachpb.Key("foo"))
  4391  	require.Nil(t, err)
  4392  
  4393  	for i := 0; i < 3; i++ {
  4394  		mtc.transport.Listen(mtc.stores[i].Ident.StoreID, &unreliableRaftHandler{
  4395  			rangeID:            ri.Desc.RangeID,
  4396  			RaftMessageHandler: mtc.stores[i],
  4397  			unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{
  4398  				dropReq: func(req *kvserver.RaftMessageRequest) bool {
  4399  					return rand.Intn(2) == 0
  4400  				},
  4401  			},
  4402  		})
  4403  	}
  4404  	val := []byte("asdf")
  4405  	var wg sync.WaitGroup
  4406  	put := func(i int) func() {
  4407  		wg.Add(1)
  4408  		return func() {
  4409  			defer wg.Done()
  4410  			totalDelay := 1 * time.Millisecond
  4411  			delay := time.Duration(rand.Intn(int(totalDelay)))
  4412  			startDelay := totalDelay - delay
  4413  			time.Sleep(startDelay)
  4414  			ctx, cancel := context.WithTimeout(context.Background(), delay)
  4415  			defer cancel()
  4416  			_ = db.Put(ctx, roachpb.Key(fmt.Sprintf("foo%d", i)), val)
  4417  		}
  4418  	}
  4419  	const N = 256
  4420  	for i := 0; i < N; i++ {
  4421  		go put(i)()
  4422  	}
  4423  	wg.Wait()
  4424  }
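        // Each put above sleeps away a random slice of a 1ms budget and spends
        // the remainder as its context deadline, so cancelation fires at an
        // effectively random point inside the request. A minimal sketch of that
        // racing-deadline pattern, with doWork as a hypothetical stand-in for
        // db.Put:
        //
        //   budget := time.Millisecond
        //   delay := time.Duration(rand.Intn(int(budget)))
        //   time.Sleep(budget - delay)
        //   ctx, cancel := context.WithTimeout(context.Background(), delay)
        //   defer cancel()
        //   _ = doWork(ctx) // cancelation races with the traced work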
  4425  
  4426  type disablingClientStream struct {
  4427  	grpc.ClientStream
  4428  	disabled *atomic.Value
  4429  }
  4430  
  4431  func (cs *disablingClientStream) SendMsg(m interface{}) error {
  4432  	if cs.disabled.Load().(bool) {
  4433  		return nil
  4434  	}
  4435  	return cs.ClientStream.SendMsg(m)
  4436  }
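        // Dropping the message while reporting success simulates a silent,
        // one-way partition: the sender believes its raft traffic was delivered
        // even though nothing arrived. Because the interceptor in the test below
        // is installed only for rpc.DefaultClass streams, SystemClass traffic is
        // left intact.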
  4437  
  4438  // TestDefaultConnectionDisruptionDoesNotInterfereWithSystemTraffic tests that
  4439  // disconnections on connections of the rpc.DefaultClass do not interfere with
  4440  // traffic on the SystemClass connection.
  4441  func TestDefaultConnectionDisruptionDoesNotInterfereWithSystemTraffic(t *testing.T) {
  4442  	defer leaktest.AfterTest(t)()
  4443  	// This test relies on concurrently waiting for a value to change in the
  4444  	// underlying engine(s). Since the teeing engine does not respond well to
  4445  	// value mismatches, whether transient or permanent, skip this test if the
  4446  	// teeing engine is being used. See
  4447  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  4448  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  4449  		t.Skip("disabled on teeing engine")
  4450  	}
  4451  
  4452  	stopper := stop.NewStopper()
  4453  	ctx := context.Background()
  4454  	defer stopper.Stop(ctx)
  4455  	// disabled controls whether to disrupt DefaultClass streams.
  4456  	var disabled atomic.Value
  4457  	disabled.Store(false)
  4458  	knobs := rpc.ContextTestingKnobs{
  4459  		StreamClientInterceptor: func(target string, class rpc.ConnectionClass) grpc.StreamClientInterceptor {
  4460  			if class == rpc.SystemClass {
  4461  				return nil
  4462  			}
  4463  			return func(
  4464  				ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn,
  4465  				method string, streamer grpc.Streamer, opts ...grpc.CallOption,
  4466  			) (grpc.ClientStream, error) {
  4467  				cs, err := streamer(ctx, desc, cc, method, opts...)
  4468  				if err != nil {
  4469  					return nil, err
  4470  				}
  4471  				return &disablingClientStream{
  4472  					disabled:     &disabled,
  4473  					ClientStream: cs,
  4474  				}, nil
  4475  			}
  4476  		},
  4477  	}
  4478  	// Prevent the split queue from creating additional ranges while we're
  4479  	// waiting for replication.
  4480  	sc := kvserver.TestStoreConfig(nil)
  4481  	mtc := &multiTestContext{
  4482  		storeConfig:     &sc,
  4483  		rpcTestingKnobs: knobs,
  4484  	}
  4485  
  4486  	const numReplicas = 3
  4487  	mtc.Start(t, numReplicas)
  4488  	defer mtc.Stop()
  4489  	for _, s := range mtc.stores {
  4490  		s.SetReplicateQueueActive(true)
  4491  	}
  4492  	mtc.replicateRange(1, 1, 2)
  4493  	// Make a key that's in the user data space.
  4494  	keyA := append(keys.SystemSQLCodec.TablePrefix(100), 'a')
  4495  	replica1 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
  4496  	mtc.replicateRange(replica1.RangeID, 1, 2)
  4497  	// Create a test function so that we can run the test both immediately after
  4498  	// up-replicating and after a restart.
  4499  	runTest := func(t *testing.T) {
  4500  		// Look up the replica again because we may have restarted the store.
  4501  		replica1 = mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
  4502  		// Put some data in the range so we'll have something to test for.
  4503  		db := mtc.Store(0).DB()
  4504  		require.NoError(t, db.Put(ctx, keyA, 1))
  4505  
  4506  		// Wait for all nodes to catch up.
  4507  		mtc.waitForValues(keyA, []int64{1, 1, 1})
  4508  		disabled.Store(true)
  4509  		repl1, err := mtc.stores[0].GetReplica(1)
  4510  		require.Nil(t, err)
  4511  		// Transfer the lease on range 1. Make sure there's no pending transfer.
  4512  		var lease roachpb.Lease
  4513  		testutils.SucceedsSoon(t, func() error {
  4514  			var next roachpb.Lease
  4515  			lease, next = repl1.GetLease()
  4516  			if next != (roachpb.Lease{}) {
  4517  				return fmt.Errorf("lease transfer in process, next = %v", next)
  4518  			}
  4519  			return nil
  4520  		})
  4521  
  4522  		var target int
  4523  		for i := roachpb.StoreID(1); i <= numReplicas; i++ {
  4524  			if lease.Replica.StoreID != i {
  4525  				target = int(i - 1)
  4526  				break
  4527  			}
  4528  		}
  4529  		// Use SucceedsSoon to deal with rare stress cases where the lease
  4530  		// transfer may fail.
  4531  		testutils.SucceedsSoon(t, func() error {
  4532  			return mtc.transferLeaseNonFatal(ctx, 1, target, int(lease.Replica.StoreID-1))
  4533  		})
  4534  		// Set a relatively short timeout so that this test doesn't take too long.
  4535  		// We should always hit it.
  4536  		withTimeout, cancel := context.WithTimeout(ctx, 20*time.Millisecond)
  4537  		defer cancel()
  4538  		err = db.Put(withTimeout, keyA, 2)
  4539  		require.True(t, testutils.IsError(err, "deadline exceeded"), err)
  4540  		// Transfer the lease back to demonstrate that the system range is still live.
  4541  		testutils.SucceedsSoon(t, func() error {
  4542  			return mtc.transferLeaseNonFatal(ctx, 1, target, int(lease.Replica.StoreID-1))
  4543  		})
  4544  
  4545  		// Heal the partition. The previous proposal may now succeed, but it may
  4546  		// have been canceled.
  4547  		disabled.Store(false)
  4548  		// Overwrite with a new value and ensure that it propagates.
  4549  		require.NoError(t, db.Put(ctx, keyA, 3))
  4550  		mtc.waitForValuesT(t, keyA, []int64{3, 3, 3})
  4551  	}
  4552  	t.Run("initial_run", runTest)
  4553  	mtc.restart()
  4554  	t.Run("after_restart", runTest)
  4555  }
  4556  
  4557  // TestAckWriteBeforeApplication tests that the success of transactional writes
  4558  // is acknowledged after those writes have been committed to a Range's Raft log
  4559  // but before those writes have been applied to its replicated state machine.
  4560  func TestAckWriteBeforeApplication(t *testing.T) {
  4561  	defer leaktest.AfterTest(t)()
  4562  	for _, tc := range []struct {
  4563  		repls            int
  4564  		expAckBeforeAppl bool
  4565  	}{
  4566  		// In a single-replica Range, each handleRaftReady iteration will append
  4567  		// new entries to the Raft log and immediately apply them. This prevents
  4568  		// "early acknowledgement" from being possible or useful. See the comment
  4569  		// on apply.Task.AckCommittedEntriesBeforeApplication.
  4570  		{1, false},
  4571  		// In a three-replica Range, each handleRaftReady iteration will append
  4572  		// a set of entries to the Raft log and then apply the previous set of
  4573  		// entries. This makes "early acknowledgement" a major optimization, as
  4574  		// it pulls the entire latency required to append the next set of entries
  4575  		// to the Raft log out of the client-perceived latency of the previous
  4576  		// set of entries.
  4577  		{3, true},
  4578  	} {
  4579  		t.Run(fmt.Sprintf("numRepls=%d", tc.repls), func(t *testing.T) {
  4580  			var filterActive int32
  4581  			var magicTS hlc.Timestamp
  4582  			blockPreApplication, blockPostApplication := make(chan struct{}), make(chan struct{})
  4583  			applyFilterFn := func(ch chan struct{}) kvserverbase.ReplicaApplyFilter {
  4584  				return func(filterArgs kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) {
  4585  					if atomic.LoadInt32(&filterActive) == 1 && filterArgs.Timestamp == magicTS {
  4586  						<-ch
  4587  					}
  4588  					return 0, nil
  4589  				}
  4590  			}
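        			// A receive from a closed channel returns immediately, so closing
        			// blockPreApplication or blockPostApplication below releases every
        			// current and future waiter in the corresponding filter at once.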
  4591  
  4592  			tsc := kvserver.TestStoreConfig(nil)
  4593  			tsc.TestingKnobs.TestingApplyFilter = applyFilterFn(blockPreApplication)
  4594  			tsc.TestingKnobs.TestingPostApplyFilter = applyFilterFn(blockPostApplication)
  4595  
  4596  			mtc := &multiTestContext{storeConfig: &tsc}
  4597  			defer mtc.Stop()
  4598  			mtc.Start(t, tc.repls)
  4599  
  4600  			// Replicate the Range, if necessary.
  4601  			key := roachpb.Key("a")
  4602  			rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(key)).RangeID
  4603  			for i := 1; i < tc.repls; i++ {
  4604  				mtc.replicateRange(rangeID, i)
  4605  			}
  4606  
  4607  			// Begin performing a write on the Range.
  4608  			magicTS = mtc.stores[0].Clock().Now()
  4609  			atomic.StoreInt32(&filterActive, 1)
  4610  			ch := make(chan *roachpb.Error, 1)
  4611  			go func() {
  4612  				ctx := context.Background()
  4613  				put := putArgs(key, []byte("val"))
  4614  				_, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), roachpb.Header{
  4615  					Timestamp: magicTS,
  4616  				}, put)
  4617  				ch <- pErr
  4618  			}()
  4619  
  4620  			expResult := func() {
  4621  				t.Helper()
  4622  				if pErr := <-ch; pErr != nil {
  4623  					t.Fatalf("unexpected proposal result error: %v", pErr)
  4624  				}
  4625  			}
  4626  			dontExpResult := func() {
  4627  				t.Helper()
  4628  				select {
  4629  				case <-time.After(10 * time.Millisecond):
  4630  					// Expected.
  4631  				case pErr := <-ch:
  4632  					t.Fatalf("unexpected proposal acknowledged before TestingApplyFilter: %v", pErr)
  4633  				}
  4634  			}
  4635  
  4636  			// The result should be blocked on the pre-apply filter.
  4637  			dontExpResult()
  4638  
  4639  			// Release the pre-apply filter.
  4640  			close(blockPreApplication)
  4641  			// Depending on the cluster configuration, the result may not be blocked
  4642  			// on the post-apply filter because the replica may be able to acknowledge
  4643  			// the client before applying.
  4644  			if tc.expAckBeforeAppl {
  4645  				expResult()
  4646  			} else {
  4647  				dontExpResult()
  4648  			}
  4649  
  4650  			// Stop blocking Raft application to allow everything to shut down cleanly.
  4651  			// This also confirms that the proposal does eventually apply.
  4652  			close(blockPostApplication)
  4653  			// If we didn't expect an acknowledgement before, we do now.
  4654  			if !tc.expAckBeforeAppl {
  4655  				expResult()
  4656  			}
  4657  		})
  4658  	}
  4659  }
  4660  
  4661  // TestProcessSplitAfterRightHandSideHasBeenRemoved tests cases where we have
  4662  // a follower replica which has received information about the RHS of a split
  4663  // before it has processed that split. The replica can't both have an
  4664  // initialized RHS and process the split but it can have (1) an uninitialized
  4665  // RHS with a higher replica ID than in the split and (2) an RHS with an unknown
  4666  // replica ID and a tombstone with a higher replica ID than in the split.
  4667  // It may learn about a newer replica ID than the split without ever hearing
  4668  // about the split replica. If it does not crash (3) it will know that the
  4669  // split replica is too old and will not initialize it. If the node does
  4670  // crash (4) it will forget it had learned about the higher replica ID and
  4671  // will initialize the RHS as the split replica.
  4672  //
  4673  // Starting in 19.2 if a replica discovers from a raft message that it is an
  4674  // old replica then it knows that it has been removed and re-added to the range.
  4675  // In this case the Replica eagerly destroys itself and its data.
  4676  //
  4677  // Given this behavior there are 4 troubling cases with regards to splits.
  4678  //
  4679  //   *  In all cases we begin with s1 processing a presplit snapshot for
  4680  //      r20. After the split the store should have r21/3.
  4681  //
  4682  // In the first two cases the following occurs:
  4683  //
  4684  //   *  s1 receives a message for r21/3 prior to acquiring the split lock
  4685  //      in r21. This will create an uninitialized r21/3 which may write
  4686  //      HardState.
  4687  //
  4688  //   *  Before r20 processes the split, r21 is removed and re-added to
  4689  //      s1 as r21/4. s1 receives a raft message destined for r21/4 and proceeds
  4690  //      to destroy its uninitialized r21/3, laying down a tombstone at 4 in the
  4691  //      process.
  4692  //
  4693  //  (1) s1 processes the split and finds the RHS to be an uninitialized replica
  4694  //      with a higher replica ID.
  4695  //
  4696  //  (2) s1 crashes before processing the split, forgetting the replica ID of the
  4697  //      RHS but retaining its tombstone.
  4698  //
  4699  // In both cases we know that the RHS could not have committed anything because
  4700  // it cannot have gotten a snapshot but we want to be sure to not synthesize a
  4701  // HardState for the RHS that contains a non-zero commit index if we know that
  4702  // the RHS will need another snapshot later.
  4703  //
  4704  // In the third and fourth cases:
  4705  //
  4706  //   *  s1 never receives a message for r21/3.
  4707  //
  4708  //   *  Before r20 processes the split, r21 is removed and re-added to
  4709  //      s1 as r21/4. s1 receives a raft message destined for r21/4 and has never
  4710  //      heard about r21/3.
  4711  //
  4712  //  (3) s1 processes the split and finds the RHS to be an uninitialized replica
  4713  //      with a higher replica ID (but without a tombstone). This case is very
  4714  //      similar to (1).
  4715  //
  4716  //  (4) s1 crashes still before processing the split, forgetting that it had
  4717  //      known about r21/4. When it reboots r21/4 is totally partitioned and
  4718  //      r20 becomes unpartitioned.
  4719  //
  4720  //   *  r20 processes the split successfully and initializes r21/3.
  4721  //
  4722  // In the 4th case we find that, until we unpartition r21/4 (the RHS) and let
  4723  // it learn about its removal via a ReplicaTooOldError, it will be initialized
  4724  // as r21/3 with a CommitIndex of 10, the split's value. After r21/4 becomes
  4725  // unpartitioned it will learn it is removed by either catching up on its log
  4726  // or receiving a ReplicaTooOldError, which will lead to a tombstone.
  4727  //
  4728  func TestProcessSplitAfterRightHandSideHasBeenRemoved(t *testing.T) {
  4729  	defer leaktest.AfterTest(t)()
  4730  	// This test relies on concurrently waiting for a value to change in the
  4731  	// underlying engine(s). Since the teeing engine does not respond well to
  4732  	// value mismatches, whether transient or permanent, skip this test if the
  4733  	// teeing engine is being used. See
  4734  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  4735  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  4736  		t.Skip("disabled on teeing engine")
  4737  	}
  4738  	sc := kvserver.TestStoreConfig(nil)
  4739  	// Newly-started stores (including the "rogue" one) should not GC
  4740  	// their replicas. We'll turn this back on when needed.
  4741  	sc.TestingKnobs.DisableReplicaGCQueue = true
  4742  	sc.RaftDelaySplitToSuppressSnapshotTicks = 0
  4743  	// Make the tick interval short so we don't need to wait too long for the
  4744  	// partitioned leader to time out. Also make the
  4745  	// RangeLeaseRaftElectionTimeout multiplier high so that system ranges
  4746  	// like node liveness can actually get leases.
  4747  	sc.RaftTickInterval = 10 * time.Millisecond
  4748  	sc.RangeLeaseRaftElectionTimeoutMultiplier = 1000
  4749  	noopProposalFilter := kvserverbase.ReplicaProposalFilter(func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  4750  		return nil
  4751  	})
  4752  	var proposalFilter atomic.Value
  4753  	proposalFilter.Store(noopProposalFilter)
  4754  	sc.TestingKnobs.TestingProposalFilter = func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  4755  		return proposalFilter.Load().(kvserverbase.ReplicaProposalFilter)(args)
  4756  	}
  4757  
  4758  	ctx := context.Background()
  4759  	increment := func(t *testing.T, db *kv.DB, key roachpb.Key, by int64) {
  4760  		b := &kv.Batch{}
  4761  		b.AddRawRequest(incrementArgs(key, by))
  4762  		require.NoError(t, db.Run(ctx, b))
  4763  	}
  4764  	changeReplicas := func(
  4765  		t *testing.T, db *kv.DB, typ roachpb.ReplicaChangeType, key roachpb.Key, idx int,
  4766  	) error {
  4767  		ri, err := getRangeInfo(ctx, db, key)
  4768  		require.NoError(t, err)
  4769  		_, err = db.AdminChangeReplicas(ctx, ri.Desc.StartKey.AsRawKey(), ri.Desc,
  4770  			roachpb.MakeReplicationChanges(typ, makeReplicationTargets(idx+1)...))
  4771  		return err
  4772  	}
  4773  	split := func(t *testing.T, db *kv.DB, key roachpb.Key) {
  4774  		b := &kv.Batch{}
  4775  		b.AddRawRequest(adminSplitArgs(key))
  4776  		require.NoError(t, db.Run(ctx, b))
  4777  	}
  4778  	ensureNoTombstone := func(t *testing.T, store *kvserver.Store, rangeID roachpb.RangeID) {
  4779  		var tombstone roachpb.RangeTombstone
  4780  		tombstoneKey := keys.RangeTombstoneKey(rangeID)
  4781  		ok, err := storage.MVCCGetProto(
  4782  			ctx, store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
  4783  		)
  4784  		require.NoError(t, err)
  4785  		require.False(t, ok)
  4786  	}
  4787  	getHardState := func(
  4788  		t *testing.T, store *kvserver.Store, rangeID roachpb.RangeID,
  4789  	) raftpb.HardState {
  4790  		hs, err := stateloader.Make(rangeID).LoadHardState(ctx, store.Engine())
  4791  		require.NoError(t, err)
  4792  		return hs
  4793  	}
  4794  	partitionReplicaOnSplit := func(t *testing.T, mtc *multiTestContext, key roachpb.Key, basePartition *mtcPartitionedRange, partRange **mtcPartitionedRange) {
  4795  		// Set up a hook to partition the RHS range at its initial range ID
  4796  		// before proposing the split trigger.
  4797  		var setupOnce sync.Once
  4798  		f := kvserverbase.ReplicaProposalFilter(func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  4799  			req, ok := args.Req.GetArg(roachpb.EndTxn)
  4800  			if !ok {
  4801  				return nil
  4802  			}
  4803  			endTxn := req.(*roachpb.EndTxnRequest)
  4804  			if endTxn.InternalCommitTrigger == nil || endTxn.InternalCommitTrigger.SplitTrigger == nil {
  4805  				return nil
  4806  			}
  4807  			split := endTxn.InternalCommitTrigger.SplitTrigger
  4808  
  4809  			if !split.RightDesc.StartKey.Equal(key) {
  4810  				return nil
  4811  			}
  4812  			setupOnce.Do(func() {
  4813  				replDesc, ok := split.RightDesc.GetReplicaDescriptor(1)
  4814  				require.True(t, ok)
  4815  				var err error
  4816  				*partRange, err = basePartition.extend(mtc, split.RightDesc.RangeID, replDesc.ReplicaID,
  4817  					0 /* partitionedNode */, true /* activated */, unreliableRaftHandlerFuncs{})
  4818  				require.NoError(t, err)
  4819  				proposalFilter.Store(noopProposalFilter)
  4820  			})
  4821  			return nil
  4822  		})
  4823  		proposalFilter.Store(f)
  4824  	}
  4825  
  4826  	// The basic setup for all of these tests is that we have an LHS range on 3
  4827  	// nodes and we've partitioned store 0 for the LHS range. The tests will now
  4828  	// perform a split, remove the RHS, add it back and validate assumptions.
  4829  	//
  4830  	// Different outcomes will occur depending on whether and how the RHS is
  4831  	// partitioned and whether the server is killed. In all cases we want the
  4832  	// split to succeed and the RHS to eventually also be on all 3 nodes.
  4833  	setup := func(t *testing.T) (
  4834  		mtc *multiTestContext,
  4835  		db *kv.DB,
  4836  		keyA, keyB roachpb.Key,
  4837  		lhsID roachpb.RangeID,
  4838  		lhsPartition *mtcPartitionedRange,
  4839  	) {
  4840  		mtc = &multiTestContext{
  4841  			storeConfig: &sc,
  4842  		}
  4843  		mtc.Start(t, 3)
  4844  
  4845  		db = mtc.Store(1).DB()
  4846  
  4847  		// Split off a non-system range so we don't have to account for node liveness
  4848  		// traffic.
  4849  		scratchTableKey := keys.SystemSQLCodec.TablePrefix(math.MaxUint32)
  4850  		// Put some data in the range so we'll have something to test for.
  4851  		keyA = append(append(roachpb.Key{}, scratchTableKey...), 'a')
  4852  		keyB = append(append(roachpb.Key{}, scratchTableKey...), 'b')
  4853  
  4854  		split(t, db, scratchTableKey)
  4855  		ri, err := getRangeInfo(ctx, db, scratchTableKey)
  4856  		require.Nil(t, err)
  4857  		lhsID = ri.Desc.RangeID
  4858  		// First put the range on all three nodes.
  4859  		mtc.replicateRange(lhsID, 1, 2)
  4860  
  4861  		// Set up a partition for the LHS range only. Initially it is not active.
  4862  		lhsPartition, err = setupPartitionedRange(mtc, lhsID,
  4863  			0 /* replicaID */, 0 /* partitionedNode */, false /* activated */, unreliableRaftHandlerFuncs{})
  4864  		require.NoError(t, err)
  4865  		// Wait for all nodes to catch up.
  4866  		increment(t, db, keyA, 5)
  4867  		mtc.waitForValues(keyA, []int64{5, 5, 5})
  4868  
  4869  		// Transfer the lease off of node 0.
  4870  		mtc.transferLease(ctx, lhsID, 0, 2)
  4871  
  4872  		// Make sure everybody knows about that transfer.
  4873  		increment(t, db, keyA, 1)
  4874  		mtc.waitForValues(keyA, []int64{6, 6, 6})
  4875  		lhsPartition.activate()
  4876  
  4877  		increment(t, db, keyA, 1)
  4878  		mtc.waitForValues(keyA, []int64{6, 7, 7})
  4879  		return mtc, db, keyA, keyB, lhsID, lhsPartition
  4880  	}
  4881  
  4882  	// In this case we only have the LHS partitioned. The RHS will learn about its
  4883  	// identity as the replica in the split and after being re-added will learn
  4884  	// about the new replica ID and will lay down a tombstone. At this point we'll
  4885  	// partition the RHS and ensure that the split does not clobber the RHS's hard
  4886  	// state.
  4887  	t.Run("(1) no RHS partition", func(t *testing.T) {
  4888  		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
  4889  		defer mtc.Stop()
  4890  
  4891  		split(t, db, keyB)
  4892  
  4893  		// Write a value which we can observe to know when the split has been
  4894  		// applied by the LHS.
  4895  		increment(t, db, keyA, 1)
  4896  		mtc.waitForValues(keyA, []int64{6, 8, 8})
  4897  
  4898  		increment(t, db, keyB, 6)
  4899  		// Wait for all non-partitioned nodes to catch up.
  4900  		mtc.waitForValues(keyB, []int64{0, 6, 6})
  4901  
  4902  		rhsInfo, err := getRangeInfo(ctx, db, keyB)
  4903  		require.NoError(t, err)
  4904  		rhsID := rhsInfo.Desc.RangeID
  4905  		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
  4906  		require.True(t, store0Exists)
  4907  
  4908  		// Remove and re-add the RHS to create a new uninitialized replica at
  4909  		// a higher replica ID. This will lead to a tombstone being written.
  4910  		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
  4911  		// Unsuccessful because the RHS will not accept the learner snapshot
  4912  		// and will be rolled back. Nevertheless it will have learned that it
  4913  		// has been removed at the old replica ID.
  4914  		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  4915  		require.True(t,
  4916  			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
  4917  
  4918  		// Without a partitioned RHS we'll always end up writing a tombstone here:
  4919  		// the RHS will be created at the initial replica ID when it receives raft
  4920  		// messages after the other nodes split, and then the call above makes it
  4921  		// find out about its new replica ID, at which point it writes a tombstone
  4922  		// for the old one.
  4923  		waitForTombstone(t, mtc.Store(0).Engine(), rhsID)
  4924  		lhsPartition.deactivate()
  4925  		mtc.waitForValues(keyA, []int64{8, 8, 8})
  4926  		hs := getHardState(t, mtc.Store(0), rhsID)
  4927  		require.Equal(t, uint64(0), hs.Commit)
  4928  		testutils.SucceedsSoon(t, func() error {
  4929  			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  4930  		})
  4931  		mtc.waitForValues(keyB, []int64{6, 6, 6})
  4932  	})
  4933  
  4934  	// This case is like the previous case except the store crashes after
  4935  	// laying down a tombstone.
  4936  	t.Run("(2) no RHS partition, with restart", func(t *testing.T) {
  4937  		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
  4938  		defer mtc.Stop()
  4939  
  4940  		split(t, db, keyB)
  4941  
  4942  		// Write a value which we can observe to know when the split has been
  4943  		// applied by the LHS.
  4944  		increment(t, db, keyA, 1)
  4945  		mtc.waitForValues(keyA, []int64{6, 8, 8})
  4946  
  4947  		increment(t, db, keyB, 6)
  4948  		// Wait for all non-partitioned nodes to catch up.
  4949  		mtc.waitForValues(keyB, []int64{0, 6, 6})
  4950  
  4951  		rhsInfo, err := getRangeInfo(ctx, db, keyB)
  4952  		require.NoError(t, err)
  4953  		rhsID := rhsInfo.Desc.RangeID
  4954  		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
  4955  		require.True(t, store0Exists)
  4956  
  4957  		// Remove and re-add the RHS to create a new uninitialized replica at
  4958  		// a higher replica ID. This will lead to a tombstone being written.
  4959  		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
  4960  		// Unsuccessful because the RHS will not accept the learner snapshot
  4961  		// and will be rolled back. Nevertheless it will have learned that it
  4962  		// has been removed at the old replica ID.
  4963  		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  4964  		require.True(t,
  4965  			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
  4966  
  4967  		// Without a partitioned RHS we'll always end up writing a tombstone here:
  4968  		// the RHS will be created at the initial replica ID when it receives raft
  4969  		// messages after the other nodes split, and then the call above makes it
  4970  		// find out about its new replica ID, at which point it writes a tombstone
  4971  		// for the old one.
  4972  		waitForTombstone(t, mtc.Store(0).Engine(), rhsID)
  4973  
  4974  		// We do all of this incrementing to ensure that nobody will ever
  4975  		// succeed in sending a message to the new RHS replica after we restart
  4976  		// the store. Previously there were races which could happen if we
  4977  		// stopped the store immediately. Sleeps worked but this feels somehow
  4978  		// more principled.
  4979  		curB := int64(6)
  4980  		for curB < 100 {
  4981  			curB++
  4982  			increment(t, db, keyB, 1)
  4983  			mtc.waitForValues(keyB, []int64{0, curB, curB})
  4984  		}
  4985  
  4986  		// Restart store 0 so that it forgets about the newer replicaID.
  4987  		mtc.stopStore(0)
  4988  		mtc.restartStore(0)
  4989  
  4990  		lhsPartition.deactivate()
  4991  		mtc.waitForValues(keyA, []int64{8, 8, 8})
  4992  		hs := getHardState(t, mtc.Store(0), rhsID)
  4993  		require.Equal(t, uint64(0), hs.Commit)
  4994  		testutils.SucceedsSoon(t, func() error {
  4995  			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  4996  		})
  4997  		mtc.waitForValues(keyB, []int64{curB, curB, curB})
  4998  	})
  4999  
  5000  	// In this case the RHS will be partitioned from hearing anything about
  5001  	// the initial replica ID of the RHS after the split. It will learn about
  5002  	// the higher replica ID and have that higher replica ID in memory when
  5003  	// the split is processed. We partition the RHS's new replica ID before
  5004  	// processing the split to ensure that the RHS doesn't get initialized.
  5005  	t.Run("(3) initial replica RHS partition, no restart", func(t *testing.T) {
  5006  		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
  5007  		defer mtc.Stop()
  5008  		var rhsPartition *mtcPartitionedRange
  5009  		partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition)
  5010  		split(t, db, keyB)
  5011  
  5012  		// Write a value which we can observe to know when the split has been
  5013  		// applied by the LHS.
  5014  		increment(t, db, keyA, 1)
  5015  		mtc.waitForValues(keyA, []int64{6, 8, 8})
  5016  
  5017  		increment(t, db, keyB, 6)
  5018  		// Wait for all non-partitioned nodes to catch up.
  5019  		mtc.waitForValues(keyB, []int64{0, 6, 6})
  5020  
  5021  		rhsInfo, err := getRangeInfo(ctx, db, keyB)
  5022  		require.NoError(t, err)
  5023  		rhsID := rhsInfo.Desc.RangeID
  5024  		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
  5025  		require.True(t, store0Exists)
  5026  
  5027  		// Remove and re-add the RHS to create a new uninitialized replica at
  5028  		// a higher replica ID. This will lead to a tombstone being written.
  5029  		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
  5030  		// Unsuccessful because the RHS will not accept the learner snapshot
  5031  		// and will be rolled back. Nevertheless it will have learned that it
  5032  		// has been removed at the old replica ID.
  5033  		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  5034  		require.True(t,
  5035  			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
  5036  		// Ensure that the replica exists with the higher replica ID.
  5037  		repl, err := mtc.Store(0).GetReplica(rhsInfo.Desc.RangeID)
  5038  		require.NoError(t, err)
  5039  		require.Equal(t, repl.ReplicaID(), rhsInfo.Desc.NextReplicaID)
  5040  		rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID)
  5041  
  5042  		// Ensure that there's no tombstone.
  5043  		// The RHS on store 0 never should have heard about its original ID.
  5044  		ensureNoTombstone(t, mtc.Store(0), rhsID)
  5045  		lhsPartition.deactivate()
  5046  		mtc.waitForValues(keyA, []int64{8, 8, 8})
  5047  		hs := getHardState(t, mtc.Store(0), rhsID)
  5048  		require.Equal(t, uint64(0), hs.Commit)
  5049  		// Now succeed in adding the RHS. Use SucceedsSoon because in rare cases
  5050  		// the learner snapshot can fail due to a race with a raft snapshot from
  5051  		// a raft leader on a different node.
  5052  		testutils.SucceedsSoon(t, func() error {
  5053  			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  5054  		})
  5055  		mtc.waitForValues(keyB, []int64{6, 6, 6})
  5056  	})
  5057  
  5058  	// This case is set up like the previous one, except that after the RHS
  5059  	// learns about its higher replica ID the store crashes and forgets. The RHS
  5060  	// replica gets initialized by the split.
  5061  	t.Run("(4) initial replica RHS partition, with restart", func(t *testing.T) {
  5062  		mtc, db, keyA, keyB, _, lhsPartition := setup(t)
  5063  		defer mtc.Stop()
  5064  		var rhsPartition *mtcPartitionedRange
  5065  
  5066  		partitionReplicaOnSplit(t, mtc, keyB, lhsPartition, &rhsPartition)
  5067  		split(t, db, keyB)
  5068  
  5069  		// Write a value which we can observe to know when the split has been
  5070  		// applied by the LHS.
  5071  		increment(t, db, keyA, 1)
  5072  		mtc.waitForValues(keyA, []int64{6, 8, 8})
  5073  
  5074  		increment(t, db, keyB, 6)
  5075  		// Wait for all non-partitioned nodes to catch up.
  5076  		mtc.waitForValues(keyB, []int64{0, 6, 6})
  5077  
  5078  		rhsInfo, err := getRangeInfo(ctx, db, keyB)
  5079  		require.NoError(t, err)
  5080  		rhsID := rhsInfo.Desc.RangeID
  5081  		_, store0Exists := rhsInfo.Desc.GetReplicaDescriptor(1)
  5082  		require.True(t, store0Exists)
  5083  
  5084  		// Remove and re-add the RHS to create a new uninitialized replica at
  5085  		// a higher replica ID. This will lead to a tombstone being written.
  5086  		require.NoError(t, changeReplicas(t, db, roachpb.REMOVE_REPLICA, keyB, 0))
  5087  		// Unsuccessful because the RHS will not accept the learner snapshot
  5088  		// and will be rolled back. Nevertheless it will have learned that it
  5089  		// has been removed at the old replica ID.
  5090  		err = changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  5091  		require.True(t,
  5092  			testutils.IsError(err, "snapshot failed.*cannot apply snapshot: snapshot intersects"), err)
  5093  		// Ensure that there's no tombstone.
  5094  		// The RHS on store 0 never should have heard about its original ID.
  5095  		ensureNoTombstone(t, mtc.Store(0), rhsID)
  5096  
  5097  		// Now, before we deactivate the LHS partition, partition the newer replica
  5098  		// on the RHS too.
  5099  		rhsPartition.addReplica(rhsInfo.Desc.NextReplicaID)
  5100  
  5101  		// We do all of this incrementing to ensure that nobody will ever
  5102  		// succeed in sending a message to the new RHS replica after we restart
  5103  		// the store. Previously there were races which could happen if we
  5104  		// stopped the store immediately. Sleeps worked but this feels somehow
  5105  		// more principled.
  5106  		curB := int64(6)
  5107  		for curB < 100 {
  5108  			curB++
  5109  			increment(t, db, keyB, 1)
  5110  			mtc.waitForValues(keyB, []int64{0, curB, curB})
  5111  		}
  5112  
  5113  		mtc.stopStore(0)
  5114  		mtc.restartStore(0)
  5115  
  5116  		lhsPartition.deactivate()
  5117  		mtc.waitForValues(keyA, []int64{8, 8, 8})
  5118  		// In this case the store has forgotten that it knew the RHS of the split
  5119  		// could not exist. We ensure that it has been initialized to the initial
  5120  		// commit value, which is 10.
  5121  		testutils.SucceedsSoon(t, func() error {
  5122  			hs := getHardState(t, mtc.Store(0), rhsID)
  5123  			if hs.Commit != uint64(10) {
  5124  				return errors.Errorf("hard state not yet initialized: got %v, expected %v",
  5125  					hs.Commit, uint64(10))
  5126  			}
  5127  			return nil
  5128  		})
  5129  		rhsPartition.deactivate()
  5130  		testutils.SucceedsSoon(t, func() error {
  5131  			return changeReplicas(t, db, roachpb.ADD_REPLICA, keyB, 0)
  5132  		})
  5133  		mtc.waitForValues(keyB, []int64{curB, curB, curB})
  5134  	})
  5135  }
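        // To summarize the four subtests: in (1) through (3) the RHS on store 0
        // retains evidence (an uninitialized replica or a tombstone) that the
        // split's replica ID is stale, so its HardState.Commit stays 0 until it
        // is re-added via snapshot; only in (4), where the restart erases that
        // knowledge, does the split initialize the RHS with Commit == 10.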
  5136  
  5137  // TestReplicaRemovalClosesProposalQuota is a somewhat contrived test to ensure
  5138  // that when a replica is removed it closes its proposal quota if it has
  5139  // one. This used to not be the case, though it wasn't very consequential.
  5140  // Firstly, it's rare that a removed replica has a proposal quota to begin with.
  5141  // Replicas which believe they are the leaseholder can only be removed if they
  5142  // have lost the lease and are behind. This requires a network partition.
  5143  // Regardless, there was never actually a problem because once the replica has
  5144  // been removed, all commands will eventually fail and remove themselves from
  5145  // the quota pool. This potentially adds latency as every pending request will
  5146  // need to acquire and release their quota. This is almost always very fast as
  5147  // it is rarely the case that there are more outstanding requests than there is
  5148  // quota. Nevertheless, we have this test to ensure that the pool does get
  5149  // closed if only to avoid asking the question and to ensure that that case is
  5150  // tested.
  5151  func TestReplicaRemovalClosesProposalQuota(t *testing.T) {
  5152  	defer leaktest.AfterTest(t)()
  5153  	ctx := context.Background()
  5154  	// These variables track the request count to make sure that all of the
  5155  	// requests have made it to the Replica.
  5156  	var (
  5157  		rangeID         int64
  5158  		putRequestCount int64
  5159  	)
  5160  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  5161  		ServerArgs: base.TestServerArgs{
  5162  			Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  5163  				DisableReplicaGCQueue: true,
  5164  				TestingRequestFilter: kvserverbase.ReplicaRequestFilter(func(_ context.Context, r roachpb.BatchRequest) *roachpb.Error {
  5165  					if r.RangeID == roachpb.RangeID(atomic.LoadInt64(&rangeID)) {
  5166  						if _, isPut := r.GetArg(roachpb.Put); isPut {
  5167  							atomic.AddInt64(&putRequestCount, 1)
  5168  						}
  5169  					}
  5170  					return nil
  5171  				}),
  5172  			}},
  5173  			RaftConfig: base.RaftConfig{
  5174  				// Set the proposal quota to a tiny amount so that each write will
  5175  				// exceed it.
  5176  				RaftProposalQuota: 512,
  5177  				// RaftMaxInflightMsgs * RaftMaxSizePerMsg cannot exceed RaftProposalQuota.
  5178  				RaftMaxInflightMsgs: 2,
  5179  				RaftMaxSizePerMsg:   256,
  5180  			},
  5181  		},
  5182  		ReplicationMode: base.ReplicationManual,
  5183  	})
  5184  	defer tc.Stopper().Stop(ctx)
  5185  
  5186  	key := tc.ScratchRange(t)
  5187  	require.NoError(t, tc.WaitForSplitAndInitialization(key))
  5188  	desc, err := tc.LookupRange(key)
  5189  	require.NoError(t, err)
  5190  	atomic.StoreInt64(&rangeID, int64(desc.RangeID))
  5191  	tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
  5192  	// Partition node 1 from receiving any requests or responses.
  5193  	// This will prevent it from successfully replicating anything.
  5194  	require.NoError(t, tc.WaitForSplitAndInitialization(key))
  5195  	require.NoError(t, tc.TransferRangeLease(desc, tc.Target(0)))
  5196  	store, repl := getFirstStoreReplica(t, tc.Server(0), key)
  5197  	funcs := unreliableRaftHandlerFuncs{}
  5198  	tc.Servers[0].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  5199  		rangeID:                    desc.RangeID,
  5200  		RaftMessageHandler:         store,
  5201  		unreliableRaftHandlerFuncs: funcs,
  5202  	})
  5203  	// NB: We need to be sure that our Replica is the leaseholder for this
  5204  	// test to make sense. It usually is.
  5205  	lease, pendingLease := repl.GetLease()
  5206  	if pendingLease != (roachpb.Lease{}) || !lease.OwnedBy(store.StoreID()) {
  5207  		t.Skip("the replica is not the leaseholder, this happens rarely under stressrace")
  5208  	}
  5209  	var wg sync.WaitGroup
  5210  	const N = 100
  5211  	for i := 0; i < N; i++ {
  5212  		wg.Add(1)
  5213  		go func(i int) {
  5214  			defer wg.Done()
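        			// The full slice expression below pins the capacity so that append
        			// must copy, keeping the goroutines from racing on key's shared
        			// backing array.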
  5215  			k := append(key[0:len(key):len(key)], strconv.Itoa(i)...)
  5216  			_, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
  5217  				RangeID: desc.RangeID,
  5218  			}, putArgs(k, bytes.Repeat([]byte{'a'}, 1000)))
  5219  			require.Regexp(t,
  5220  				`result is ambiguous \(removing replica\)|`+
  5221  					`r`+strconv.Itoa(int(desc.RangeID))+" was not found on s1", pErr.GoError())
  5222  		}(i)
  5223  	}
  5224  	testutils.SucceedsSoon(t, func() error {
  5225  		if seen := atomic.LoadInt64(&putRequestCount); seen < N {
  5226  			return fmt.Errorf("saw %d, waiting for %d", seen, N)
  5227  		}
  5228  		return nil
  5229  	})
  5230  	desc = *repl.Desc()
  5231  	fromReplDesc, found := desc.GetReplicaDescriptor(3)
  5232  	require.True(t, found)
  5233  	replDesc, found := desc.GetReplicaDescriptor(store.StoreID())
  5234  	require.True(t, found)
  5235  	newReplDesc := replDesc
  5236  	newReplDesc.ReplicaID = desc.NextReplicaID
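        	// A raft vote addressed to the range's NextReplicaID convinces the
        	// store that its current replica has been removed; destroying that
        	// replica lays down the tombstone checked below and closes the
        	// proposal quota under test.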
  5237  	require.Nil(t, store.HandleRaftRequest(ctx, &kvserver.RaftMessageRequest{
  5238  		RangeID:       desc.RangeID,
  5239  		RangeStartKey: desc.StartKey,
  5240  		FromReplica:   fromReplDesc,
  5241  		ToReplica:     newReplDesc,
  5242  		Message:       raftpb.Message{Type: raftpb.MsgVote, Term: 2},
  5243  	}, noopRaftMessageResponseStream{}))
  5244  	ts := waitForTombstone(t, store.Engine(), desc.RangeID)
  5245  	require.Equal(t, ts.NextReplicaID, desc.NextReplicaID)
  5246  	wg.Wait()
  5247  	_, err = repl.GetProposalQuota().Acquire(ctx, 1)
  5248  	require.Regexp(t, "closed.*destroyed", err)
  5249  }
  5250  
  5251  type noopRaftMessageResponseStream struct{}
  5252  
  5253  func (n noopRaftMessageResponseStream) Context() context.Context {
  5254  	return context.Background()
  5255  }
  5256  
  5257  func (n noopRaftMessageResponseStream) Send(*kvserver.RaftMessageResponse) error {
  5258  	return nil
  5259  }
  5260  
  5261  var _ kvserver.RaftMessageResponseStream = noopRaftMessageResponseStream{}