github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_split_test.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver_test
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"math/rand"
    19  	"reflect"
    20  	"sort"
    21  	"strconv"
    22  	"sync/atomic"
    23  	"testing"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/base"
    27  	"github.com/cockroachdb/cockroach/pkg/config"
    28  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    29  	"github.com/cockroachdb/cockroach/pkg/gossip"
    30  	"github.com/cockroachdb/cockroach/pkg/keys"
    31  	"github.com/cockroachdb/cockroach/pkg/kv"
    32  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    33  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    34  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
    35  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    36  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    37  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    38  	"github.com/cockroachdb/cockroach/pkg/rpc"
    39  	"github.com/cockroachdb/cockroach/pkg/server"
    40  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    41  	"github.com/cockroachdb/cockroach/pkg/storage"
    42  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    43  	"github.com/cockroachdb/cockroach/pkg/testutils"
    44  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    45  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    46  	"github.com/cockroachdb/cockroach/pkg/ts"
    47  	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
    48  	"github.com/cockroachdb/cockroach/pkg/util"
    49  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    50  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    51  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    52  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    53  	"github.com/cockroachdb/cockroach/pkg/util/log"
    54  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    55  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    56  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    57  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    58  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    59  	"github.com/cockroachdb/errors"
    60  	"github.com/gogo/protobuf/proto"
    61  	"github.com/stretchr/testify/require"
    62  	"go.etcd.io/etcd/raft/raftpb"
    63  )
    64  
    65  // adminSplitArgs creates an AdminSplitRequest for the provided split key.
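        // The request header's Key addresses the request to the range containing
        // splitKey, while SplitKey is the key at which that range is actually split.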
    66  func adminSplitArgs(splitKey roachpb.Key) *roachpb.AdminSplitRequest {
    67  	return &roachpb.AdminSplitRequest{
    68  		RequestHeader: roachpb.RequestHeader{
    69  			Key: splitKey,
    70  		},
    71  		SplitKey: splitKey,
    72  	}
    73  }
    74  
    75  // TestStoreRangeSplitAtIllegalKeys verifies a range cannot be split
    76  // at illegal keys.
    77  func TestStoreRangeSplitAtIllegalKeys(t *testing.T) {
    78  	defer leaktest.AfterTest(t)()
    79  	stopper := stop.NewStopper()
    80  	defer stopper.Stop(context.Background())
    81  
    82  	cfg := kvserver.TestStoreConfig(nil)
    83  	cfg.TestingKnobs.DisableSplitQueue = true
    84  	cfg.TestingKnobs.DisableMergeQueue = true
    85  	store := createTestStoreWithConfig(t, stopper, cfg)
    86  
    87  	for _, key := range []roachpb.Key{
    88  		keys.Meta1Prefix,
    89  		testutils.MakeKey(keys.Meta1Prefix, []byte("a")),
    90  		testutils.MakeKey(keys.Meta1Prefix, roachpb.RKeyMax),
    91  		keys.Meta2KeyMax,
    92  		testutils.MakeKey(keys.Meta2KeyMax, []byte("a")),
    93  		keys.SystemSQLCodec.TablePrefix(10 /* system descriptor ID */),
    94  	} {
    95  		args := adminSplitArgs(key)
    96  		_, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args)
    97  		if !testutils.IsPError(pErr, "cannot split") {
    98  			t.Errorf("%q: unexpected split error %s", key, pErr)
    99  		}
   100  	}
   101  }
   102  
   103  // Verify that on a split, only the non-expired abort span records are copied
   104  // into the right hand side of the split.
   105  func TestStoreSplitAbortSpan(t *testing.T) {
   106  	defer leaktest.AfterTest(t)()
   107  
   108  	manualClock := hlc.NewManualClock(2400 * time.Hour.Nanoseconds())
   109  	clock := hlc.NewClock(manualClock.UnixNano, time.Millisecond)
   110  	storeCfg := kvserver.TestStoreConfig(clock)
   111  	storeCfg.TestingKnobs.DisableSplitQueue = true
   112  	storeCfg.TestingKnobs.DisableMergeQueue = true
   113  
   114  	stopper := stop.NewStopper()
   115  	defer stopper.Stop(context.Background())
   116  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   117  	ctx := context.Background()
   118  
   119  	left, middle, right := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")
   120  
   121  	txn := func(key roachpb.Key, ts hlc.Timestamp) *roachpb.Transaction {
   122  		txn := roachpb.MakeTransaction("test", key, 0, ts, 0)
   123  		return &txn
   124  	}
   125  
   126  	var expAll []roachpb.AbortSpanEntry
   127  
   128  	populateAbortSpan := func(key roachpb.Key, ts hlc.Timestamp) *roachpb.ResolveIntentRequest {
   129  		pushee := txn(key, ts)
   130  
   131  		// First write an intent on the key...
   132  		incArgs := incrementArgs(key, 1)
   133  		_, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{Txn: pushee}, incArgs)
   134  		if pErr != nil {
   135  			t.Fatalf("while sending +%v: %s", incArgs, pErr)
   136  		}
   137  
   138  		// Then resolve the intent and poison. Without the intent write, the
   139  		// intent resolution would be a no-op and wouldn't leave an AbortSpan
   140  		// entry.
   141  		expAll = append(expAll, roachpb.AbortSpanEntry{
   142  			Key:       key,
   143  			Timestamp: ts,
   144  		})
   145  		return &roachpb.ResolveIntentRequest{
   146  			RequestHeader: roachpb.RequestHeader{
   147  				Key: key,
   148  			},
   149  			IntentTxn: pushee.TxnMeta,
   150  			Status:    roachpb.ABORTED,
   151  			Poison:    true,
   152  		}
   153  	}
   154  
   155  	key := func(k roachpb.Key, i int) roachpb.Key {
   156  		var r []byte
   157  		r = append(r, k...)
   158  		r = append(r, []byte(strconv.Itoa(i))...)
   159  		return r
   160  	}
   161  
   162  	thresh := kvserverbase.TxnCleanupThreshold.Nanoseconds()
   163  	// Pick a non-gcable and gcable timestamp, respectively. Avoid the clock's
   164  	// exact timestamp because of unpredictable logical ticks.
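        	// tsStale falls outside the transaction cleanup threshold and is thus
        	// GC-able; only entries at tsFresh are expected to be copied to the RHS
        	// of the split.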
   165  	tsFresh := hlc.Timestamp{WallTime: manualClock.UnixNano() - thresh + 1}
   166  	tsStale := hlc.Timestamp{WallTime: manualClock.UnixNano() - thresh - 1}
   167  
   168  	args := []roachpb.Request{
   169  		populateAbortSpan(key(left, 1), tsFresh),
   170  		populateAbortSpan(key(left, 2), tsStale),
   171  		populateAbortSpan(key(middle, 1), tsFresh),
   172  		populateAbortSpan(key(middle, 2), tsStale),
   173  		populateAbortSpan(key(right, 1), tsFresh),
   174  		populateAbortSpan(key(right, 2), tsStale),
   175  		adminSplitArgs(middle),
   176  	}
   177  
   178  	// Nothing gets removed from the LHS during the split. This could
   179  	// be done but has to be done carefully to avoid large Raft proposals,
   180  	// and the stats computation needs to be checked carefully.
   181  	expL := []roachpb.AbortSpanEntry{
   182  		{Key: key(left, 1), Timestamp: tsFresh},
   183  		{Key: key(left, 2), Timestamp: tsStale},
   184  		{Key: key(middle, 1), Timestamp: tsFresh},
   185  		{Key: key(middle, 2), Timestamp: tsStale},
   186  		{Key: key(right, 1), Timestamp: tsFresh},
   187  		{Key: key(right, 2), Timestamp: tsStale},
   188  	}
   189  
   190  	// But we don't blindly copy everything over to the RHS. Only entries with
   191  	// a recent timestamp are duplicated. This is important because otherwise the
   192  	// Raft command size can blow up and splits fail.
   193  	expR := []roachpb.AbortSpanEntry{
   194  		{Key: key(left, 1), Timestamp: tsFresh},
   195  		{Key: key(middle, 1), Timestamp: tsFresh},
   196  		{Key: key(right, 1), Timestamp: tsFresh},
   197  	}
   198  
   199  	for _, arg := range args {
   200  		_, pErr := kv.SendWrapped(ctx, store.TestSender(), arg)
   201  		if pErr != nil {
   202  			t.Fatalf("while sending +%v: %s", arg, pErr)
   203  		}
   204  	}
   205  
   206  	collect := func(as *abortspan.AbortSpan) []roachpb.AbortSpanEntry {
   207  		var results []roachpb.AbortSpanEntry
   208  		if err := as.Iterate(ctx, store.Engine(), func(_ roachpb.Key, entry roachpb.AbortSpanEntry) error {
   209  			entry.Priority = 0 // don't care about that
   210  			results = append(results, entry)
   211  			return nil
   212  		}); err != nil {
   213  			t.Fatal(err)
   214  		}
   215  		sort.Slice(results, func(i, j int) bool {
   216  			c := bytes.Compare(results[i].Key, results[j].Key)
   217  			if c == 0 {
   218  				return results[i].Timestamp.Less(results[j].Timestamp)
   219  			}
   220  			return c < 0
   221  		})
   222  		return results
   223  	}
   224  
   225  	l := collect(store.LookupReplica(keys.MustAddr(left)).AbortSpan())
   226  	r := collect(store.LookupReplica(keys.MustAddr(right)).AbortSpan())
   227  
   228  	if !reflect.DeepEqual(expL, l) {
   229  		t.Fatalf("left hand side: expected %+v, got %+v", expL, l)
   230  	}
   231  	if !reflect.DeepEqual(expR, r) {
   232  		t.Fatalf("right hand side: expected %+v, got %+v", expR, r)
   233  	}
   234  }
   235  
   236  // TestStoreRangeSplitAtTablePrefix verifies a range can be split at
   237  // UserTableDataMin and still gossip the SystemConfig properly.
   238  func TestStoreRangeSplitAtTablePrefix(t *testing.T) {
   239  	defer leaktest.AfterTest(t)()
   240  	storeCfg := kvserver.TestStoreConfig(nil)
   241  	storeCfg.TestingKnobs.DisableSplitQueue = true
   242  	storeCfg.TestingKnobs.DisableMergeQueue = true
   243  	stopper := stop.NewStopper()
   244  	defer stopper.Stop(context.Background())
   245  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   246  
   247  	key := keys.UserTableDataMin
   248  	args := adminSplitArgs(key)
   249  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
   250  		t.Fatalf("%q: split unexpected error: %s", key, pErr)
   251  	}
   252  
   253  	var desc sqlbase.TableDescriptor
   254  	descBytes, err := protoutil.Marshal(&desc)
   255  	if err != nil {
   256  		t.Fatal(err)
   257  	}
   258  
   259  	// Update SystemConfig to trigger gossip.
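        	// SetSystemConfigTrigger marks the transaction so that committing it
        	// gossips the updated SystemConfig; the callback registered below waits
        	// for that gossip to include the marshaled descriptor.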
   260  	if err := store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
   261  		if err := txn.SetSystemConfigTrigger(); err != nil {
   262  			return err
   263  		}
   264  		// We don't care about the values, just the keys.
   265  		k := sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, sqlbase.ID(keys.MinUserDescID))
   266  		return txn.Put(ctx, k, &desc)
   267  	}); err != nil {
   268  		t.Fatal(err)
   269  	}
   270  
   271  	successChan := make(chan struct{}, 1)
   272  	store.Gossip().RegisterCallback(gossip.KeySystemConfig, func(_ string, content roachpb.Value) {
   273  		contentBytes, err := content.GetBytes()
   274  		if err != nil {
   275  			t.Fatal(err)
   276  		}
   277  		if bytes.Contains(contentBytes, descBytes) {
   278  			select {
   279  			case successChan <- struct{}{}:
   280  			default:
   281  			}
   282  		}
   283  	})
   284  
   285  	select {
   286  	case <-time.After(time.Second):
   287  		t.Errorf("expected a schema gossip containing %q, but did not see one", descBytes)
   288  	case <-successChan:
   289  	}
   290  }
   291  
   292  // TestStoreRangeSplitInsideRow verifies an attempt to split a range inside of
   293  // a table row will cause a split at a boundary between rows.
   294  func TestStoreRangeSplitInsideRow(t *testing.T) {
   295  	defer leaktest.AfterTest(t)()
   296  	storeCfg := kvserver.TestStoreConfig(nil)
   297  	storeCfg.TestingKnobs.DisableSplitQueue = true
   298  	storeCfg.TestingKnobs.DisableMergeQueue = true
   299  	stopper := stop.NewStopper()
   300  	defer stopper.Stop(context.Background())
   301  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   302  
   303  	// Manually create some of the column keys corresponding to the table:
   304  	//
   305  	//   CREATE TABLE t (id STRING PRIMARY KEY, col1 INT, col2 INT)
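        	//
        	// The keys built below follow the SQL key encoding
        	// /Table/<tableID>/<indexID>/<primary key columns>/<family>. A raw
        	// column-family key is not a valid split point (it would land in the
        	// middle of a row), so EnsureSafeSplitKey strips the family suffix and
        	// the split requested below is expected to land on the row boundary.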
   306  	tableKey := roachpb.RKey(keys.SystemSQLCodec.TablePrefix(keys.MinUserDescID))
   307  	rowKey := roachpb.Key(encoding.EncodeVarintAscending(append([]byte(nil), tableKey...), 1))
   308  	rowKey = encoding.EncodeStringAscending(encoding.EncodeVarintAscending(rowKey, 1), "a")
   309  	col1Key, err := keys.EnsureSafeSplitKey(keys.MakeFamilyKey(append([]byte(nil), rowKey...), 1))
   310  	if err != nil {
   311  		t.Fatal(err)
   312  	}
   313  	col2Key, err := keys.EnsureSafeSplitKey(keys.MakeFamilyKey(append([]byte(nil), rowKey...), 2))
   314  	if err != nil {
   315  		t.Fatal(err)
   316  	}
   317  
   318  	// We don't care about the value, so just store any old thing.
   319  	if err := store.DB().Put(context.Background(), col1Key, "column 1"); err != nil {
   320  		t.Fatal(err)
   321  	}
   322  	if err := store.DB().Put(context.Background(), col2Key, "column 2"); err != nil {
   323  		t.Fatal(err)
   324  	}
   325  
   326  	// Split between col1Key and col2Key by splitting before col2Key.
   327  	args := adminSplitArgs(col2Key)
   328  	_, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args)
   329  	if pErr != nil {
   330  		t.Fatalf("%s: split unexpected error: %s", col1Key, pErr)
   331  	}
   332  
   333  	repl1 := store.LookupReplica(roachpb.RKey(col1Key))
   334  	repl2 := store.LookupReplica(roachpb.RKey(col2Key))
   335  
   336  	// Verify the two columns are still on the same range.
   337  	if !reflect.DeepEqual(repl1, repl2) {
   338  		t.Fatalf("%s: ranges differ: %+v vs %+v", col1Key, repl1, repl2)
   339  	}
   340  	// Verify we split on a row key.
   341  	if startKey := repl1.Desc().StartKey; !startKey.Equal(rowKey) {
   342  		t.Fatalf("%s: expected split on %s, but found %s", col1Key, rowKey, startKey)
   343  	}
   344  
   345  	// Verify the previous range was split on a row key.
   346  	repl3 := store.LookupReplica(tableKey)
   347  	if endKey := repl3.Desc().EndKey; !endKey.Equal(rowKey) {
   348  		t.Fatalf("%s: expected split on %s, but found %s", col1Key, rowKey, endKey)
   349  	}
   350  }
   351  
   352  // TestStoreRangeSplitIntents executes a split of a range and verifies
   353  // that all intents are cleared and the transaction record cleaned up.
   354  func TestStoreRangeSplitIntents(t *testing.T) {
   355  	defer leaktest.AfterTest(t)()
   356  	storeCfg := kvserver.TestStoreConfig(nil)
   357  	storeCfg.TestingKnobs.DisableSplitQueue = true
   358  	storeCfg.TestingKnobs.DisableMergeQueue = true
   359  	stopper := stop.NewStopper()
   360  	defer stopper.Stop(context.Background())
   361  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   362  
   363  	// First, write some values left and right of the proposed split key.
   364  	pArgs := putArgs([]byte("c"), []byte("foo"))
   365  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
   366  		t.Fatal(pErr)
   367  	}
   368  	pArgs = putArgs([]byte("x"), []byte("bar"))
   369  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
   370  		t.Fatal(pErr)
   371  	}
   372  
   373  	// Split the range.
   374  	splitKey := roachpb.Key("m")
   375  	args := adminSplitArgs(splitKey)
   376  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
   377  		t.Fatal(pErr)
   378  	}
   379  
   380  	// Verify no intents remain on range descriptor keys.
   381  	splitKeyAddr, err := keys.Addr(splitKey)
   382  	if err != nil {
   383  		t.Fatal(err)
   384  	}
   385  	for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(splitKeyAddr)} {
   386  		if _, _, err := storage.MVCCGet(
   387  			context.Background(), store.Engine(), key, store.Clock().Now(), storage.MVCCGetOptions{},
   388  		); err != nil {
   389  			t.Errorf("failed to read consistent range descriptor for key %s: %+v", key, err)
   390  		}
   391  	}
   392  
   393  	txnPrefix := func(key roachpb.Key) roachpb.Key {
   394  		rk, err := keys.Addr(key)
   395  		if err != nil {
   396  			t.Fatal(err)
   397  		}
   398  		return keys.MakeRangeKey(rk, keys.LocalTransactionSuffix, nil)
   399  	}
   400  	// Verify the transaction record is gone.
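        	// Transaction records live in the range-local keyspace under the address
        	// of the transaction's anchor key, so the scan below checks for leftovers
        	// under both the old range's start key and the split key.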
   401  	start := storage.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(roachpb.RKeyMin))
   402  	end := storage.MakeMVCCMetadataKey(keys.MakeRangeKeyPrefix(roachpb.RKeyMax))
   403  	iter := store.Engine().NewIterator(storage.IterOptions{UpperBound: roachpb.KeyMax})
   404  
   405  	defer iter.Close()
   406  	for iter.SeekGE(start); ; iter.Next() {
   407  		if ok, err := iter.Valid(); err != nil {
   408  			t.Fatal(err)
   409  		} else if !ok || !iter.UnsafeKey().Less(end) {
   410  			break
   411  		}
   412  
   413  		if bytes.HasPrefix([]byte(iter.Key().Key), txnPrefix(roachpb.KeyMin)) ||
   414  			bytes.HasPrefix([]byte(iter.Key().Key), txnPrefix(splitKey)) {
   415  			t.Errorf("unexpected system key: %s; txn record should have been cleaned up", iter.Key())
   416  		}
   417  	}
   418  }
   419  
   420  // TestStoreRangeSplitAtRangeBounds verifies that attempting to
   421  // split a range at its start key is a no-op and does not actually
   422  // perform a split (it would create a zero-length range!). This sort
   423  // of thing might happen in the wild if two split requests arrived for the
   424  // same key. The first one succeeds and the second would try to split
   425  // at the start of the newly split range.
   426  func TestStoreRangeSplitAtRangeBounds(t *testing.T) {
   427  	defer leaktest.AfterTest(t)()
   428  	storeCfg := kvserver.TestStoreConfig(nil)
   429  	storeCfg.TestingKnobs.DisableSplitQueue = true
   430  	storeCfg.TestingKnobs.DisableMergeQueue = true
   431  	stopper := stop.NewStopper()
   432  	defer stopper.Stop(context.Background())
   433  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   434  
   435  	// Split range 1 at an arbitrary key.
   436  	key := roachpb.Key("a")
   437  	rngID := store.LookupReplica(roachpb.RKey(key)).RangeID
   438  	h := roachpb.Header{RangeID: rngID}
   439  	args := adminSplitArgs(key)
   440  	if _, pErr := kv.SendWrappedWith(context.Background(), store, h, args); pErr != nil {
   441  		t.Fatal(pErr)
   442  	}
   443  	replCount := store.ReplicaCount()
   444  
   445  	// An AdminSplit request sent to the end of the old range
   446  	// should fail with a RangeKeyMismatchError.
   447  	_, pErr := kv.SendWrappedWith(context.Background(), store, h, args)
   448  	if _, ok := pErr.GetDetail().(*roachpb.RangeKeyMismatchError); !ok {
   449  		t.Fatalf("expected RangeKeyMismatchError, found: %v", pErr)
   450  	}
   451  
   452  	// An AdminSplit request sent to the start of the new range
   453  	// should succeed but no new ranges should be created.
   454  	newRng := store.LookupReplica(roachpb.RKey(key))
   455  	h.RangeID = newRng.RangeID
   456  	if _, pErr := kv.SendWrappedWith(context.Background(), store, h, args); pErr != nil {
   457  		t.Fatal(pErr)
   458  	}
   459  
   460  	newReplCount := store.ReplicaCount()
   461  	if replCount != newReplCount {
   462  		t.Fatalf("splitting at a range boundary should not create a new range; before second split "+
   463  			"found %d ranges, after second split found %d ranges", replCount, newReplCount)
   464  	}
   465  }
   466  
   467  // TestSplitTriggerRaftSnapshotRace verifies that when an uninitialized Replica
   468  // resulting from a split hasn't been initialized via the split trigger yet, a
   469  // grace period prevents the replica from requesting an errant Raft snapshot.
   470  // This is verified by running a number of splits and asserting that no Raft
   471  // snapshots are observed. As a nice side effect, this also verifies that log
   472  // truncations don't cause any Raft snapshots in this test.
   473  func TestSplitTriggerRaftSnapshotRace(t *testing.T) {
   474  	defer leaktest.AfterTest(t)()
   475  
   476  	ctx := context.Background()
   477  	const numNodes = 3
   478  	var args base.TestClusterArgs
   479  	// NB: the merge queue is enabled for additional "chaos". Note that the test
   480  	// uses three nodes and so there is no replica movement, which would
   481  	// otherwise tickle Raft snapshots for unrelated reasons.
   482  	tc := testcluster.StartTestCluster(t, numNodes, args)
   483  	defer tc.Stopper().Stop(ctx)
   484  
   485  	numSplits := 100
   486  	if util.RaceEnabled {
   487  		// Running 100 splits is overkill in race builds.
   488  		numSplits = 10
   489  	}
   490  	perm := rand.Perm(numSplits)
   491  	idx := int32(-1) // accessed atomically
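        	// Each worker claims a distinct split key by atomically advancing idx
        	// into the shared permutation, so no two workers try to split at the
        	// same key.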
   492  
   493  	numRaftSnaps := func(when string) int {
   494  		var totalSnaps int
   495  		for i := 0; i < numNodes; i++ {
   496  			var n int // num rows (sanity check against test rotting)
   497  			var c int // num Raft snapshots
   498  			if err := tc.ServerConn(i).QueryRow(`
   499  SELECT count(*), sum(value) FROM crdb_internal.node_metrics WHERE
   500  	name = 'range.snapshots.normal-applied'
   501  `).Scan(&n, &c); err != nil {
   502  				t.Fatal(err)
   503  			}
   504  			if expRows := 1; n != expRows {
   505  				t.Fatalf("%s: expected %d rows, got %d", when, expRows, n)
   506  			}
   507  			totalSnaps += c
   508  		}
   509  		return totalSnaps
   510  	}
   511  
   512  	// There are usually no raft snaps before, but there is a race condition where
   513  	// they can occasionally happen during upreplication.
   514  	numSnapsBefore := numRaftSnaps("before")
   515  
   516  	doSplit := func(ctx context.Context, _ int) error {
   517  		_, _, err := tc.SplitRange(
   518  			[]byte(fmt.Sprintf("key-%d", perm[atomic.AddInt32(&idx, 1)])))
   519  		return err
   520  	}
   521  
   522  	if err := ctxgroup.GroupWorkers(ctx, numSplits, doSplit); err != nil {
   523  		t.Fatal(err)
   524  	}
   525  
   526  	// Check that no snaps happened during the splits.
   527  	require.Equal(t, numSnapsBefore, numRaftSnaps("after"))
   528  }
   529  
   530  // TestStoreRangeSplitIdempotency executes a split of a range and
   531  // verifies that the resulting ranges respond to the right key ranges
   532  // and that their stats have been properly accounted for and requests
   533  // can't be replayed.
   534  func TestStoreRangeSplitIdempotency(t *testing.T) {
   535  	defer leaktest.AfterTest(t)()
   536  	storeCfg := kvserver.TestStoreConfig(nil)
   537  	storeCfg.TestingKnobs.DisableSplitQueue = true
   538  	storeCfg.TestingKnobs.DisableMergeQueue = true
   539  	stopper := stop.NewStopper()
   540  	defer stopper.Stop(context.Background())
   541  	store := createTestStoreWithOpts(t,
   542  		testStoreOpts{
   543  			// This test was written before the test stores were able to start with
   544  			// more than one range and is not prepared to handle many ranges.
   545  			dontCreateSystemRanges: true,
   546  			cfg:                    &storeCfg},
   547  		stopper)
   548  	rangeID := roachpb.RangeID(1)
   549  	splitKey := roachpb.Key("m")
   550  	content := roachpb.Key("asdvb")
   551  
   552  	// First, write some values left and right of the proposed split key.
   553  	pArgs := putArgs([]byte("c"), content)
   554  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
   555  		t.Fatal(pErr)
   556  	}
   557  	pArgs = putArgs([]byte("x"), content)
   558  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
   559  		t.Fatal(pErr)
   560  	}
   561  
   562  	// Increments are a good way of testing idempotency. Up here, we
   563  	// address them to the original range, then later to the one that
   564  	// contains the key.
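        	// Because the increments below are later replayed with the same
        	// transaction and sequence number, the replica should detect the replay
        	// and return the original result instead of applying them twice.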
   565  	txn := roachpb.MakeTransaction("test", []byte("c"), 10, store.Clock().Now(), 0)
   566  	lIncArgs := incrementArgs([]byte("apoptosis"), 100)
   567  	lTxn := txn
   568  	lTxn.Sequence++
   569  	lIncArgs.Sequence = lTxn.Sequence
   570  	if _, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   571  		Txn: &lTxn,
   572  	}, lIncArgs); pErr != nil {
   573  		t.Fatal(pErr)
   574  	}
   575  	rIncArgs := incrementArgs([]byte("wobble"), 10)
   576  	rTxn := txn
   577  	rTxn.Sequence++
   578  	rIncArgs.Sequence = rTxn.Sequence
   579  	if _, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   580  		Txn: &rTxn,
   581  	}, rIncArgs); pErr != nil {
   582  		t.Fatal(pErr)
   583  	}
   584  
   585  	// Get the original stats for key and value bytes.
   586  	ms, err := stateloader.Make(rangeID).LoadMVCCStats(context.Background(), store.Engine())
   587  	if err != nil {
   588  		t.Fatal(err)
   589  	}
   590  	keyBytes, valBytes := ms.KeyBytes, ms.ValBytes
   591  
   592  	// Split the range.
   593  	args := adminSplitArgs(splitKey)
   594  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
   595  		t.Fatal(pErr)
   596  	}
   597  
   598  	// Verify no intents remain on range descriptor keys.
   599  	splitKeyAddr, err := keys.Addr(splitKey)
   600  	if err != nil {
   601  		t.Fatal(err)
   602  	}
   603  	for _, key := range []roachpb.Key{keys.RangeDescriptorKey(roachpb.RKeyMin), keys.RangeDescriptorKey(splitKeyAddr)} {
   604  		if _, _, err := storage.MVCCGet(
   605  			context.Background(), store.Engine(), key, store.Clock().Now(), storage.MVCCGetOptions{},
   606  		); err != nil {
   607  			t.Fatal(err)
   608  		}
   609  	}
   610  
   611  	repl := store.LookupReplica(roachpb.RKeyMin)
   612  	rngDesc := repl.Desc()
   613  	newRng := store.LookupReplica([]byte("m"))
   614  	newRngDesc := newRng.Desc()
   615  	if !bytes.Equal(newRngDesc.StartKey, splitKey) || !bytes.Equal(splitKey, rngDesc.EndKey) {
   616  		t.Errorf("ranges mismatched, wanted %q=%q=%q", newRngDesc.StartKey, splitKey, rngDesc.EndKey)
   617  	}
   618  	if !bytes.Equal(newRngDesc.EndKey, roachpb.RKeyMax) || !bytes.Equal(rngDesc.StartKey, roachpb.RKeyMin) {
   619  		t.Errorf("new ranges do not cover KeyMin-KeyMax, but only %q-%q", rngDesc.StartKey, newRngDesc.EndKey)
   620  	}
   621  
   622  	// Try to get values from both left and right of where the split happened.
   623  	gArgs := getArgs([]byte("c"))
   624  	if reply, pErr := kv.SendWrapped(context.Background(), store.TestSender(), gArgs); pErr != nil {
   625  		t.Fatal(pErr)
   626  	} else if replyBytes, pErr := reply.(*roachpb.GetResponse).Value.GetBytes(); pErr != nil {
   627  		t.Fatal(pErr)
   628  	} else if !bytes.Equal(replyBytes, content) {
   629  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   630  	}
   631  	gArgs = getArgs([]byte("x"))
   632  	if reply, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   633  		RangeID: newRng.RangeID,
   634  	}, gArgs); pErr != nil {
   635  		t.Fatal(pErr)
   636  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   637  		t.Fatal(err)
   638  	} else if !bytes.Equal(replyBytes, content) {
   639  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   640  	}
   641  
   642  	// Send out an increment request copied from above (same txn/sequence)
   643  	// which remains in the old range.
   644  	_, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   645  		Txn: &lTxn,
   646  	}, lIncArgs)
   647  	if pErr != nil {
   648  		t.Fatal(pErr)
   649  	}
   650  
   651  	// Send out the same increment copied from above (same txn/sequence), but
   652  	// now to the newly created range (which should hold that key).
   653  	_, pErr = kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
   654  		RangeID: newRng.RangeID,
   655  		Txn:     &rTxn,
   656  	}, rIncArgs)
   657  	if pErr != nil {
   658  		t.Fatal(pErr)
   659  	}
   660  
   661  	// Compare stats of split ranges to ensure they are non-zero and
   662  	// exceed the original range when summed.
   663  	left, err := stateloader.Make(rangeID).LoadMVCCStats(context.Background(), store.Engine())
   664  	if err != nil {
   665  		t.Fatal(err)
   666  	}
   667  	lKeyBytes, lValBytes := left.KeyBytes, left.ValBytes
   668  	right, err := stateloader.Make(newRng.RangeID).LoadMVCCStats(context.Background(), store.Engine())
   669  	if err != nil {
   670  		t.Fatal(err)
   671  	}
   672  	rKeyBytes, rValBytes := right.KeyBytes, right.ValBytes
   673  
   674  	if lKeyBytes == 0 || rKeyBytes == 0 {
   675  		t.Errorf("expected non-zero key bytes; got %d, %d", lKeyBytes, rKeyBytes)
   676  	}
   677  	if lValBytes == 0 || rValBytes == 0 {
   678  		t.Errorf("expected non-zero val bytes; got %d, %d", lValBytes, rValBytes)
   679  	}
   680  	if lKeyBytes+rKeyBytes <= keyBytes {
   681  		t.Errorf("left + right key bytes don't match; %d + %d <= %d", lKeyBytes, rKeyBytes, keyBytes)
   682  	}
   683  	if lValBytes+rValBytes <= valBytes {
   684  		t.Errorf("left + right val bytes don't match; %d + %d <= %d", lValBytes, rValBytes, valBytes)
   685  	}
   686  }
   687  
   688  // TestStoreRangeSplitStats starts by splitting the system keys from user-space
   689  // keys and verifying that the user space side of the split (which is empty)
   690  // has all zeros for stats. It then writes random data to the user space side,
   691  // splits it halfway, and verifies that the stats of the two halves sum exactly
   692  // to the pre-split stats.
   693  func TestStoreRangeSplitStats(t *testing.T) {
   694  	defer leaktest.AfterTest(t)()
   695  	manual := hlc.NewManualClock(123)
   696  	storeCfg := kvserver.TestStoreConfig(hlc.NewClock(manual.UnixNano, time.Nanosecond))
   697  	storeCfg.TestingKnobs.DisableSplitQueue = true
   698  	storeCfg.TestingKnobs.DisableMergeQueue = true
   699  	stopper := stop.NewStopper()
   700  	defer stopper.Stop(context.Background())
   701  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   702  	ctx := context.Background()
   703  
   704  	// Split the range after the last table data key.
   705  	keyPrefix := keys.SystemSQLCodec.TablePrefix(keys.MinUserDescID)
   706  	args := adminSplitArgs(keyPrefix)
   707  	if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
   708  		t.Fatal(pErr)
   709  	}
   710  	// Verify empty range has empty stats.
   711  	repl := store.LookupReplica(roachpb.RKey(keyPrefix))
   712  	// NOTE that this value is expected to change over time, depending on what
   713  	// we store in the sys-local keyspace. Update it accordingly for this test.
   714  	empty := enginepb.MVCCStats{LastUpdateNanos: manual.UnixNano()}
   715  	if err := verifyRangeStats(store.Engine(), repl.RangeID, empty); err != nil {
   716  		t.Fatal(err)
   717  	}
   718  
   719  	// Write random data.
   720  	midKey := kvserver.WriteRandomDataToRange(t, store, repl.RangeID, keyPrefix)
   721  
   722  	// Get the range stats now that we have data.
   723  	snap := store.Engine().NewSnapshot()
   724  	defer snap.Close()
   725  	ms, err := stateloader.Make(repl.RangeID).LoadMVCCStats(ctx, snap)
   726  	if err != nil {
   727  		t.Fatal(err)
   728  	}
   729  	if err := verifyRecomputedStats(snap, repl.Desc(), ms, manual.UnixNano()); err != nil {
   730  		t.Fatalf("failed to verify range's stats before split: %+v", err)
   731  	}
   732  	if inMemMS := repl.GetMVCCStats(); inMemMS != ms {
   733  		t.Fatalf("in-memory and on-disk diverged:\n%+v\n!=\n%+v", inMemMS, ms)
   734  	}
   735  
   736  	manual.Increment(100)
   737  
   738  	// Split the range at the approximate halfway point.
   739  	args = adminSplitArgs(midKey)
   740  	if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
   741  		RangeID: repl.RangeID,
   742  	}, args); pErr != nil {
   743  		t.Fatal(pErr)
   744  	}
   745  
   746  	snap = store.Engine().NewSnapshot()
   747  	defer snap.Close()
   748  	msLeft, err := stateloader.Make(repl.RangeID).LoadMVCCStats(ctx, snap)
   749  	if err != nil {
   750  		t.Fatal(err)
   751  	}
   752  	replRight := store.LookupReplica(midKey)
   753  	msRight, err := stateloader.Make(replRight.RangeID).LoadMVCCStats(ctx, snap)
   754  	if err != nil {
   755  		t.Fatal(err)
   756  	}
   757  
   758  	// The stats should be exactly equal when added.
   759  	expMS := enginepb.MVCCStats{
   760  		LiveBytes:   msLeft.LiveBytes + msRight.LiveBytes,
   761  		KeyBytes:    msLeft.KeyBytes + msRight.KeyBytes,
   762  		ValBytes:    msLeft.ValBytes + msRight.ValBytes,
   763  		IntentBytes: msLeft.IntentBytes + msRight.IntentBytes,
   764  		LiveCount:   msLeft.LiveCount + msRight.LiveCount,
   765  		KeyCount:    msLeft.KeyCount + msRight.KeyCount,
   766  		ValCount:    msLeft.ValCount + msRight.ValCount,
   767  		IntentCount: msLeft.IntentCount + msRight.IntentCount,
   768  	}
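        	// Only the user-level stats are expected to add up exactly: the split
        	// itself rewrites sys-local data (range descriptor, abort span copies)
        	// and bumps the stats timestamp, so those fields are zeroed out before
        	// the comparison.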
   769  	ms.SysBytes, ms.SysCount = 0, 0
   770  	ms.LastUpdateNanos = 0
   771  	if expMS != ms {
   772  		t.Errorf("expected left plus right ranges to equal original, but\n %+v\n+\n %+v\n!=\n %+v", msLeft, msRight, ms)
   773  	}
   774  
   775  	// Stats should both have the new timestamp.
   776  	now := manual.UnixNano()
   777  	if lTs := msLeft.LastUpdateNanos; lTs != now {
   778  		t.Errorf("expected left range stats to have new timestamp, want %d, got %d", now, lTs)
   779  	}
   780  	if rTs := msRight.LastUpdateNanos; rTs != now {
   781  		t.Errorf("expected right range stats to have new timestamp, want %d, got %d", now, rTs)
   782  	}
   783  
   784  	// Stats should agree with recomputation.
   785  	if err := verifyRecomputedStats(snap, repl.Desc(), msLeft, now); err != nil {
   786  		t.Fatalf("failed to verify left range's stats after split: %+v", err)
   787  	}
   788  	if err := verifyRecomputedStats(snap, replRight.Desc(), msRight, now); err != nil {
   789  		t.Fatalf("failed to verify right range's stats after split: %+v", err)
   790  	}
   791  }
   792  
   793  // RaftMessageHandlerInterceptor wraps a kvserver.RaftMessageHandler. It
   794  // delegates all methods to the underlying kvserver.RaftMessageHandler, except
   795  // that HandleSnapshot calls handleSnapshotFilter with the snapshot request
   796  // header before delegating to the underlying HandleSnapshot method.
   797  type RaftMessageHandlerInterceptor struct {
   798  	kvserver.RaftMessageHandler
   799  	handleSnapshotFilter func(header *kvserver.SnapshotRequest_Header)
   800  }
   801  
   802  func (mh RaftMessageHandlerInterceptor) HandleSnapshot(
   803  	header *kvserver.SnapshotRequest_Header, respStream kvserver.SnapshotResponseStream,
   804  ) error {
   805  	mh.handleSnapshotFilter(header)
   806  	return mh.RaftMessageHandler.HandleSnapshot(header, respStream)
   807  }
   808  
   809  // TestStoreEmptyRangeSnapshotSize tests that the snapshot request header for a
   810  // range that contains no user data (an "empty" range) has RangeSize == 0. This
   811  // is arguably a bug, because system data like the range descriptor and raft log
   812  // should also count towards the size of the snapshot. Currently, though, this
   813  // property conveniently allows us to optimize the rebalancing of empty ranges
   814  // by throttling snapshots of empty ranges separately from non-empty snapshots.
   815  //
   816  // If you change the accounting of RangeSize such that this test breaks, please
   817  // preserve the optimization by introducing an alternative means of identifying
   818  // snapshot requests for empty or near-empty ranges, and then adjust this test
   819  // accordingly.
   820  func TestStoreEmptyRangeSnapshotSize(t *testing.T) {
   821  	defer leaktest.AfterTest(t)()
   822  
   823  	ctx := context.Background()
   824  
   825  	// Disable the replicate queue, the split queue, and the merge queue as we
   826  	// want to control rebalancing, splits, and merges ourselves.
   827  	sc := kvserver.TestStoreConfig(nil)
   828  	sc.TestingKnobs.DisableReplicateQueue = true
   829  	sc.TestingKnobs.DisableSplitQueue = true
   830  	sc.TestingKnobs.DisableMergeQueue = true
   831  
   832  	mtc := &multiTestContext{storeConfig: &sc}
   833  	defer mtc.Stop()
   834  	mtc.Start(t, 2)
   835  
   836  	// Split the range after the last table data key to get a range that contains
   837  	// no user data.
   838  	splitKey := keys.SystemSQLCodec.TablePrefix(keys.MinUserDescID)
   839  	splitArgs := adminSplitArgs(splitKey)
   840  	if _, err := kv.SendWrapped(ctx, mtc.distSenders[0], splitArgs); err != nil {
   841  		t.Fatal(err)
   842  	}
   843  
   844  	// Wrap store 1's message handler to intercept and record all incoming
   845  	// snapshot request headers.
   846  	messageRecorder := struct {
   847  		syncutil.Mutex
   848  		headers []*kvserver.SnapshotRequest_Header
   849  	}{}
   850  	messageHandler := RaftMessageHandlerInterceptor{
   851  		RaftMessageHandler: mtc.stores[1],
   852  		handleSnapshotFilter: func(header *kvserver.SnapshotRequest_Header) {
   853  			// Each snapshot request is handled in a new goroutine, so we need
   854  			// synchronization.
   855  			messageRecorder.Lock()
   856  			defer messageRecorder.Unlock()
   857  			messageRecorder.headers = append(messageRecorder.headers, header)
   858  		},
   859  	}
   860  	mtc.transport.Listen(mtc.stores[1].StoreID(), messageHandler)
   861  
   862  	// Replicate the newly-split range to trigger a snapshot request from store 0
   863  	// to store 1.
   864  	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(splitKey)).RangeID
   865  	mtc.replicateRange(rangeID, 1)
   866  
   867  	// Verify that we saw at least one snapshot request.
   868  	messageRecorder.Lock()
   869  	defer messageRecorder.Unlock()
   870  	if a := len(messageRecorder.headers); a < 1 {
   871  		t.Fatalf("expected at least one snapshot header, but got %d", a)
   872  	}
   873  	for i, header := range messageRecorder.headers {
   874  		if e, a := header.State.Desc.RangeID, rangeID; e != a {
   875  			t.Errorf("%d: expected RangeID to be %d, but got %d", i, e, a)
   876  		}
   877  		if header.RangeSize != 0 {
   878  			t.Errorf("%d: expected RangeSize to be 0, but got %d", i, header.RangeSize)
   879  		}
   880  	}
   881  }
   882  
   883  // TestStoreRangeSplitStatsWithMerges starts by splitting the system keys from
   884  // user-space keys and verifying that the user space side of the split (which is empty)
   885  // has all zeros for stats. It then issues a number of Merge requests to the user
   886  // space side, simulating TimeSeries data. Finally, the test splits the user space
   887  // side halfway and verifies the stats on either side of the split are equal to a
   888  // recomputation.
   889  //
   890  // Note that unlike TestStoreRangeSplitStats, we do not check if the two halves of the
   891  // split's stats are equal to the pre-split stats when added, because this will not be
   892  // true of ranges populated with Merge requests. The reason for this is that Merge
   893  // requests' impact on MVCCStats are only estimated. See updateStatsOnMerge.
   894  func TestStoreRangeSplitStatsWithMerges(t *testing.T) {
   895  	defer leaktest.AfterTest(t)()
   896  	manual := hlc.NewManualClock(123)
   897  	storeCfg := kvserver.TestStoreConfig(hlc.NewClock(manual.UnixNano, time.Nanosecond))
   898  	storeCfg.TestingKnobs.DisableSplitQueue = true
   899  	stopper := stop.NewStopper()
   900  	defer stopper.Stop(context.Background())
   901  	store := createTestStoreWithConfig(t, stopper, storeCfg)
   902  	ctx := context.Background()
   903  
   904  	// Split the range after the last table data key.
   905  	keyPrefix := keys.SystemSQLCodec.TablePrefix(keys.MinUserDescID)
   906  	args := adminSplitArgs(keyPrefix)
   907  	if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
   908  		t.Fatal(pErr)
   909  	}
   910  	// Verify empty range has empty stats.
   911  	repl := store.LookupReplica(roachpb.RKey(keyPrefix))
   912  	// NOTE that this value is expected to change over time, depending on what
   913  	// we store in the sys-local keyspace. Update it accordingly for this test.
   914  	empty := enginepb.MVCCStats{LastUpdateNanos: manual.UnixNano()}
   915  	if err := verifyRangeStats(store.Engine(), repl.RangeID, empty); err != nil {
   916  		t.Fatal(err)
   917  	}
   918  
   919  	// Write random TimeSeries data.
   920  	midKey := writeRandomTimeSeriesDataToRange(t, store, repl.RangeID, keyPrefix)
   921  	manual.Increment(100)
   922  
   923  	// Split the range at the approximate halfway point.
   924  	args = adminSplitArgs(midKey)
   925  	if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
   926  		RangeID: repl.RangeID,
   927  	}, args); pErr != nil {
   928  		t.Fatal(pErr)
   929  	}
   930  
   931  	snap := store.Engine().NewSnapshot()
   932  	defer snap.Close()
   933  	msLeft, err := stateloader.Make(repl.RangeID).LoadMVCCStats(ctx, snap)
   934  	if err != nil {
   935  		t.Fatal(err)
   936  	}
   937  	replRight := store.LookupReplica(midKey)
   938  	msRight, err := stateloader.Make(replRight.RangeID).LoadMVCCStats(ctx, snap)
   939  	if err != nil {
   940  		t.Fatal(err)
   941  	}
   942  
   943  	// Stats should both have the new timestamp.
   944  	now := manual.UnixNano()
   945  	if lTs := msLeft.LastUpdateNanos; lTs != now {
   946  		t.Errorf("expected left range stats to have new timestamp, want %d, got %d", now, lTs)
   947  	}
   948  	if rTs := msRight.LastUpdateNanos; rTs != now {
   949  		t.Errorf("expected right range stats to have new timestamp, want %d, got %d", now, rTs)
   950  	}
   951  
   952  	// Stats should agree with recomputation.
   953  	if err := verifyRecomputedStats(snap, repl.Desc(), msLeft, now); err != nil {
   954  		t.Fatalf("failed to verify left range's stats after split: %+v", err)
   955  	}
   956  	if err := verifyRecomputedStats(snap, replRight.Desc(), msRight, now); err != nil {
   957  		t.Fatalf("failed to verify right range's stats after split: %+v", err)
   958  	}
   959  }
   960  
   961  // fillRange writes keys with the given prefix and associated values
   962  // until bytes bytes have been written or the given range has split.
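        // If singleKey is true, every write targets the same key, which yields a
        // range that cannot be split (no valid split key exists within a single
        // row); tests use this to drive a replica into the split queue's purgatory.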
   963  func fillRange(
   964  	t *testing.T,
   965  	store *kvserver.Store,
   966  	rangeID roachpb.RangeID,
   967  	prefix roachpb.Key,
   968  	bytes int64,
   969  	singleKey bool,
   970  ) {
   971  	src := rand.New(rand.NewSource(0))
   972  	var key []byte
   973  	for {
   974  		ms, err := stateloader.Make(rangeID).LoadMVCCStats(context.Background(), store.Engine())
   975  		if err != nil {
   976  			t.Fatal(err)
   977  		}
   978  		keyBytes, valBytes := ms.KeyBytes, ms.ValBytes
   979  		if keyBytes+valBytes >= bytes {
   980  			return
   981  		}
   982  		if key == nil || !singleKey {
   983  			key = append(append([]byte(nil), prefix...), randutil.RandBytes(src, 100)...)
   984  			key = keys.MakeFamilyKey(key, src.Uint32())
   985  		}
   986  		val := randutil.RandBytes(src, int(src.Int31n(1<<8)))
   987  		pArgs := putArgs(key, val)
   988  		_, pErr := kv.SendWrappedWith(context.Background(), store, roachpb.Header{
   989  			RangeID: rangeID,
   990  		}, pArgs)
   991  		// When the split occurs in the background, our writes may start failing.
   992  		// We know we can stop writing when this happens.
   993  		if _, ok := pErr.GetDetail().(*roachpb.RangeKeyMismatchError); ok {
   994  			return
   995  		} else if pErr != nil {
   996  			t.Fatal(pErr)
   997  		}
   998  	}
   999  }
  1000  
  1001  // TestStoreZoneUpdateAndRangeSplit verifies that modifying the zone
  1002  // configuration changes range max bytes and Range.maybeSplit() takes
  1003  // max bytes into account when deciding whether to enqueue a range for
  1004  // splitting. It further verifies that the range is in fact split on
  1005  // exceeding zone's RangeMaxBytes.
  1006  func TestStoreZoneUpdateAndRangeSplit(t *testing.T) {
  1007  	defer leaktest.AfterTest(t)()
  1008  	stopper := stop.NewStopper()
  1009  	defer stopper.Stop(context.Background())
  1010  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  1011  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1012  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  1013  	config.TestingSetupZoneConfigHook(stopper)
  1014  
  1015  	const maxBytes = 1 << 16
  1016  	// Set max bytes.
  1017  	descID := uint32(keys.MinUserDescID)
  1018  	zoneConfig := zonepb.DefaultZoneConfig()
  1019  	zoneConfig.RangeMaxBytes = proto.Int64(maxBytes)
  1020  	config.TestingSetZoneConfig(descID, zoneConfig)
  1021  
  1022  	// Trigger gossip callback.
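        	// The store only picks up zone config changes through the gossiped
        	// system config, so gossiping an (empty) SystemConfigEntries is what
        	// makes the new RangeMaxBytes take effect.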
  1023  	if err := store.Gossip().AddInfoProto(gossip.KeySystemConfig, &config.SystemConfigEntries{}, 0); err != nil {
  1024  		t.Fatal(err)
  1025  	}
  1026  
  1027  	tableBoundary := keys.SystemSQLCodec.TablePrefix(descID)
  1028  
  1029  	{
  1030  		var repl *kvserver.Replica
  1031  
  1032  		// Wait for the range to be split along table boundaries.
  1033  		expectedRSpan := roachpb.RSpan{Key: roachpb.RKey(tableBoundary), EndKey: roachpb.RKeyMax}
  1034  		testutils.SucceedsSoon(t, func() error {
  1035  			repl = store.LookupReplica(roachpb.RKey(tableBoundary))
  1036  			if actualRSpan := repl.Desc().RSpan(); !actualRSpan.Equal(expectedRSpan) {
  1037  				return errors.Errorf("expected range %s to span %s", repl, expectedRSpan)
  1038  			}
  1039  			return nil
  1040  		})
  1041  
  1042  		// Check range's max bytes settings.
  1043  		if actualMaxBytes := repl.GetMaxBytes(); actualMaxBytes != maxBytes {
  1044  			t.Fatalf("range %s max bytes mismatch, got: %d, expected: %d", repl, actualMaxBytes, maxBytes)
  1045  		}
  1046  
  1047  		// Look in the range after the prefix we're writing to.
  1048  		fillRange(t, store, repl.RangeID, tableBoundary, maxBytes, false /* singleKey */)
  1049  	}
  1050  
  1051  	// Verify that the range is in fact split.
  1052  	testutils.SucceedsSoon(t, func() error {
  1053  		repl := store.LookupReplica(roachpb.RKey(keys.SystemSQLCodec.TablePrefix(descID + 1)))
  1054  		rngDesc := repl.Desc()
  1055  		rngStart, rngEnd := rngDesc.StartKey, rngDesc.EndKey
  1056  		if rngStart.Equal(tableBoundary) || !rngEnd.Equal(roachpb.RKeyMax) {
  1057  			return errors.Errorf("range %s has not yet split", repl)
  1058  		}
  1059  		return nil
  1060  	})
  1061  }
  1062  
  1063  // TestStoreRangeSplitWithMaxBytesUpdate tests a scenario where a new
  1064  // zone config that updates the max bytes is set and triggers a range
  1065  // split.
  1066  func TestStoreRangeSplitWithMaxBytesUpdate(t *testing.T) {
  1067  	defer leaktest.AfterTest(t)()
  1068  	stopper := stop.NewStopper()
  1069  	defer stopper.Stop(context.Background())
  1070  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  1071  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1072  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  1073  	config.TestingSetupZoneConfigHook(stopper)
  1074  
  1075  	origRng := store.LookupReplica(roachpb.RKeyMin)
  1076  
  1077  	// Set max bytes.
  1078  	const maxBytes = 1 << 16
  1079  	descID := uint32(keys.MinUserDescID)
  1080  	zoneConfig := zonepb.DefaultZoneConfig()
  1081  	zoneConfig.RangeMaxBytes = proto.Int64(maxBytes)
  1082  	config.TestingSetZoneConfig(descID, zoneConfig)
  1083  
  1084  	// Trigger gossip callback.
  1085  	if err := store.Gossip().AddInfoProto(gossip.KeySystemConfig, &config.SystemConfigEntries{}, 0); err != nil {
  1086  		t.Fatal(err)
  1087  	}
  1088  
  1089  	// Verify that the range is split and the new range has the correct max bytes.
  1090  	testutils.SucceedsSoon(t, func() error {
  1091  		newRng := store.LookupReplica(roachpb.RKey(keys.SystemSQLCodec.TablePrefix(descID)))
  1092  		if newRng.RangeID == origRng.RangeID {
  1093  			return errors.Errorf("expected new range created by split")
  1094  		}
  1095  		if newRng.GetMaxBytes() != maxBytes {
  1096  			return errors.Errorf("expected %d max bytes for the new range, but got %d",
  1097  				maxBytes, newRng.GetMaxBytes())
  1098  		}
  1099  		return nil
  1100  	})
  1101  }
  1102  
  1103  // TestStoreRangeSplitBackpressureWrites tests that ranges that grow too large
  1104  // begin enforcing backpressure on writes until the range is able to split. In
  1105  // the test, a range is filled past the point where it will begin applying
  1106  // backpressure. Splits are then blocked in-flight and we test that any future
  1107  // writes wait until the split succeeds and reduces the range size beneath the
  1108  // backpressure threshold.
  1109  func TestStoreRangeSplitBackpressureWrites(t *testing.T) {
  1110  	defer leaktest.AfterTest(t)()
  1111  
  1112  	// Backpressured writes react differently depending on whether there is an
  1113  	// ongoing split or not. If there is an ongoing split, the writes wait on
  1114  	// the split and are only allowed to proceed if the split succeeds. If there
  1115  	// is not an ongoing split or if the range is unsplittable and in the split
  1116  	// queue's purgatory, the write is rejected immediately.
  1117  	testCases := []struct {
  1118  		splitOngoing    bool
  1119  		splitErr        bool
  1120  		splitImpossible bool
  1121  		expErr          string
  1122  	}{
  1123  		{splitOngoing: true, splitErr: false, expErr: ""},
  1124  		{splitOngoing: true, splitErr: true, expErr: "split failed while applying backpressure.* boom"},
  1125  		{splitOngoing: false, expErr: ""},
  1126  		{splitImpossible: true, expErr: "split failed while applying backpressure.* could not find valid split key"},
  1127  	}
  1128  	for _, tc := range testCases {
  1129  		var name string
  1130  		if tc.splitImpossible {
  1131  			name = fmt.Sprintf("splitImpossible=%t", tc.splitImpossible)
  1132  		} else {
  1133  			name = fmt.Sprintf("splitOngoing=%t,splitErr=%t", tc.splitOngoing, tc.splitErr)
  1134  		}
  1135  		t.Run(name, func(t *testing.T) {
  1136  			var activateSplitFilter int32
  1137  			splitKey := roachpb.RKey(keys.UserTableDataMin)
  1138  			splitPending, blockSplits := make(chan struct{}), make(chan struct{})
  1139  			storeCfg := kvserver.TestStoreConfig(nil)
  1140  			// Set maxBytes to something small so we can exceed the maximum split
  1141  			// size without adding 2x64MB of data.
  1142  			const maxBytes = 1 << 16
  1143  			storeCfg.DefaultZoneConfig.RangeMaxBytes = proto.Int64(maxBytes)
  1144  			storeCfg.TestingKnobs.DisableGCQueue = true
  1145  			storeCfg.TestingKnobs.DisableMergeQueue = true
  1146  			storeCfg.TestingKnobs.DisableSplitQueue = true
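        			// A split transaction updates the range descriptors via ConditionalPut
        			// requests; intercepting the CPut on splitKey's range descriptor key
        			// lets the test pause a split mid-flight and, optionally, make it fail.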
  1147  			storeCfg.TestingKnobs.TestingRequestFilter =
  1148  				func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  1149  					for _, req := range ba.Requests {
  1150  						if cPut, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok {
  1151  							if cPut.Key.Equal(keys.RangeDescriptorKey(splitKey)) {
  1152  								if atomic.CompareAndSwapInt32(&activateSplitFilter, 1, 0) {
  1153  									splitPending <- struct{}{}
  1154  									<-blockSplits
  1155  									if tc.splitErr {
  1156  										return roachpb.NewErrorf("boom")
  1157  									}
  1158  								}
  1159  							}
  1160  						}
  1161  					}
  1162  					return nil
  1163  				}
  1164  
  1165  			ctx := context.Background()
  1166  			stopper := stop.NewStopper()
  1167  			defer stopper.Stop(ctx)
  1168  			store := createTestStoreWithConfig(t, stopper, storeCfg)
  1169  
  1170  			// Split at the split key.
  1171  			sArgs := adminSplitArgs(splitKey.AsRawKey())
  1172  			repl := store.LookupReplica(splitKey)
  1173  			if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
  1174  				RangeID: repl.RangeID,
  1175  			}, sArgs); pErr != nil {
  1176  				t.Fatal(pErr)
  1177  			}
  1178  
  1179  			// Fill the new range past the point where writes should backpressure.
  1180  			repl = store.LookupReplica(splitKey)
  1181  			singleKey := tc.splitImpossible
  1182  			fillRange(t, store, repl.RangeID, splitKey.AsRawKey(), 2*maxBytes+1, singleKey)
  1183  
  1184  			if !repl.ShouldBackpressureWrites() {
  1185  				t.Fatal("expected ShouldBackpressureWrites=true, found false")
  1186  			}
  1187  
  1188  			// If necessary, allow the range to begin splitting and wait until
  1189  			// it gets blocked in the request filter.
  1190  			if tc.splitOngoing {
  1191  				atomic.StoreInt32(&activateSplitFilter, 1)
  1192  				if err := stopper.RunAsyncTask(ctx, "force split", func(_ context.Context) {
  1193  					store.SetSplitQueueActive(true)
  1194  					if err := store.ForceSplitScanAndProcess(); err != nil {
  1195  						log.Fatalf(ctx, "%v", err)
  1196  					}
  1197  				}); err != nil {
  1198  					t.Fatal(err)
  1199  				}
  1200  				<-splitPending
  1201  			} else if tc.splitImpossible {
  1202  				store.SetSplitQueueActive(true)
  1203  				if err := store.ForceSplitScanAndProcess(); err != nil {
  1204  					t.Fatal(err)
  1205  				}
  1206  				if l := store.SplitQueuePurgatoryLength(); l != 1 {
  1207  					t.Fatalf("expected split queue purgatory to contain 1 replica, found %d", l)
  1208  				}
  1209  			}
  1210  
  1211  			// Send a Put request. This should be backpressured on the split, so it should
  1212  			// not be able to succeed until we allow the split to continue.
  1213  			putRes := make(chan error)
  1214  			go func() {
  1215  				// Write to the first key of the range to make sure that
  1216  				// we don't end up on the wrong side of the split.
  1217  				putRes <- store.DB().Put(ctx, splitKey, "test")
  1218  			}()
  1219  
  1220  			// Send a Delete request in a transaction. Should also be backpressured on the split,
  1221  			// so it should not be able to succeed until we allow the split to continue.
  1222  			delRes := make(chan error)
  1223  			go func() {
  1224  				// Write to the first key of the range to make sure that
  1225  				// we don't end up on the wrong side of the split.
  1226  				delRes <- store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1227  					b := txn.NewBatch()
  1228  					b.Del(splitKey)
  1229  					return txn.CommitInBatch(ctx, b)
  1230  				})
  1231  			}()
  1232  
  1233  			// Make sure the write doesn't return while a split is ongoing. If no
  1234  			// split is ongoing, the write will return an error immediately.
  1235  			if tc.splitOngoing {
  1236  				select {
  1237  				case err := <-putRes:
  1238  					close(blockSplits)
  1239  					t.Fatalf("put was not blocked on split, returned err %v", err)
  1240  				case err := <-delRes:
  1241  					close(blockSplits)
  1242  					t.Fatalf("delete was not blocked on split, returned err %v", err)
  1243  				case <-time.After(100 * time.Millisecond):
  1244  				}
  1245  
  1246  				// Let split through. Write should follow.
  1247  				close(blockSplits)
  1248  			}
  1249  
  1250  			for op, resCh := range map[string]chan error{
  1251  				"put":    putRes,
  1252  				"delete": delRes,
  1253  			} {
  1254  				if err := <-resCh; tc.expErr == "" {
  1255  					if err != nil {
  1256  						t.Fatalf("%s returned err %v, expected success", op, err)
  1257  					}
  1258  				} else {
  1259  					if !testutils.IsError(err, tc.expErr) {
  1260  						t.Fatalf("%s returned err %s, expected pattern %q", op, err, tc.expErr)
  1261  					}
  1262  				}
  1263  			}
  1264  
  1265  		})
  1266  	}
  1267  }
  1268  
  1269  // TestStoreRangeSystemSplits verifies that splits are based on the contents of
  1270  // the system.descriptor table.
  1271  func TestStoreRangeSystemSplits(t *testing.T) {
  1272  	defer leaktest.AfterTest(t)()
  1273  	stopper := stop.NewStopper()
  1274  	defer stopper.Stop(context.Background())
  1275  	// Intentionally leave the merge queue enabled. This indirectly tests that the
  1276  	// merge queue respects these split points.
  1277  	store, _ := createTestStore(t, stopper)
  1278  
  1279  	userTableMax := keys.MinUserDescID + 4
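        	// exceptions holds user table IDs for which no descriptor is written and
        	// hence no split is expected at their table prefix.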
  1280  	var exceptions map[int]struct{}
  1281  	schema := sqlbase.MakeMetadataSchema(
  1282  		keys.SystemSQLCodec, zonepb.DefaultZoneConfigRef(), zonepb.DefaultSystemZoneConfigRef(),
  1283  	)
  1284  	// Write table descriptors for the tables in the metadata schema as well as
  1285  	// five dummy user tables. This does two things:
  1286  	//   - descriptor IDs are used to determine split keys
  1287  	//   - the write triggers a SystemConfig update and gossip
  1288  	// We should end up with splits at each user table prefix.
  1289  	if err := store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
  1290  		if err := txn.SetSystemConfigTrigger(); err != nil {
  1291  			return err
  1292  		}
  1293  		descTablePrefix := keys.SystemSQLCodec.TablePrefix(keys.DescriptorTableID)
  1294  		kvs, _ /* splits */ := schema.GetInitialValues()
  1295  		for _, kv := range kvs {
  1296  			if !bytes.HasPrefix(kv.Key, descTablePrefix) {
  1297  				continue
  1298  			}
  1299  			if err := txn.Put(ctx, kv.Key, &kv.Value); err != nil {
  1300  				return err
  1301  			}
  1302  		}
  1303  		for i := keys.MinUserDescID; i <= userTableMax; i++ {
  1304  			// We don't care about the value, just the key.
  1305  			key := sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, sqlbase.ID(i))
  1306  			if err := txn.Put(ctx, key, sqlbase.WrapDescriptor(&sqlbase.TableDescriptor{})); err != nil {
  1307  				return err
  1308  			}
  1309  		}
  1310  		return nil
  1311  	}); err != nil {
  1312  		t.Fatal(err)
  1313  	}
  1314  
  1315  	verifySplitsAtTablePrefixes := func() {
  1316  		t.Helper()
  1317  		// We expect splits at each of the user tables and at a few fixed system
  1318  		// range boundaries, but not at system config table boundaries.
  1319  		expKeys := []roachpb.Key{
  1320  			testutils.MakeKey(keys.Meta2Prefix, keys.NodeLivenessPrefix),
  1321  			testutils.MakeKey(keys.Meta2Prefix, keys.NodeLivenessKeyMax),
  1322  			testutils.MakeKey(keys.Meta2Prefix, keys.TimeseriesPrefix),
  1323  			testutils.MakeKey(keys.Meta2Prefix, keys.TimeseriesPrefix.PrefixEnd()),
  1324  			testutils.MakeKey(keys.Meta2Prefix, keys.TableDataMin),
  1325  		}
  1326  		ids := schema.DescriptorIDs()
  1327  		maxID := uint32(ids[len(ids)-1])
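        		// System tables with IDs above MaxSystemConfigDescID live outside the
        		// system config span and each get their own split point.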
  1328  		for i := uint32(keys.MaxSystemConfigDescID + 1); i <= maxID; i++ {
  1329  			expKeys = append(expKeys,
  1330  				testutils.MakeKey(keys.Meta2Prefix, keys.SystemSQLCodec.TablePrefix(i)),
  1331  			)
  1332  		}
  1333  		for i := keys.MinUserDescID; i <= userTableMax; i++ {
  1334  			if _, ok := exceptions[i]; !ok {
  1335  				expKeys = append(expKeys,
  1336  					testutils.MakeKey(keys.Meta2Prefix, keys.SystemSQLCodec.TablePrefix(uint32(i))),
  1337  				)
  1338  			}
  1339  		}
  1340  		expKeys = append(expKeys, testutils.MakeKey(keys.Meta2Prefix, roachpb.RKeyMax))
  1341  
  1342  		testutils.SucceedsSoon(t, func() error {
  1343  			rows, err := store.DB().Scan(context.Background(), keys.Meta2Prefix, keys.MetaMax, 0)
  1344  			if err != nil {
  1345  				return err
  1346  			}
  1347  			keys := make([]roachpb.Key, 0, len(expKeys))
  1348  			for _, r := range rows {
  1349  				keys = append(keys, r.Key)
  1350  			}
  1351  			if !reflect.DeepEqual(keys, expKeys) {
  1352  				return errors.Errorf("expected split keys:\n%v\nbut found:\n%v", expKeys, keys)
  1353  			}
  1354  			return nil
  1355  		})
  1356  	}
  1357  
  1358  	verifySplitsAtTablePrefixes()
  1359  
  1360  	// Write another user table descriptor, skipping ahead by 3 IDs.
  1361  	userTableMax += 3
  1362  	exceptions = map[int]struct{}{userTableMax - 1: {}, userTableMax - 2: {}}
  1363  	if err := store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
  1364  		if err := txn.SetSystemConfigTrigger(); err != nil {
  1365  			return err
  1366  		}
  1367  		// This time, only write the last table descriptor. Splits only occur for
  1368  		// the descriptor we add. We don't care about the value, just the key.
  1369  		k := sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, sqlbase.ID(userTableMax))
  1370  		return txn.Put(ctx, k, sqlbase.WrapDescriptor(&sqlbase.TableDescriptor{}))
  1371  	}); err != nil {
  1372  		t.Fatal(err)
  1373  	}
  1374  
  1375  	verifySplitsAtTablePrefixes()
  1376  }
  1377  
  1378  // runSetupSplitSnapshotRace engineers a situation in which a range has
  1379  // been split but node 3 hasn't processed it yet. There is a race
  1380  // depending on whether node 3 learns of the split from its left or
  1381  // right side. When this function returns, most of the nodes will be
  1382  // stopped, and depending on the order in which they are restarted, we
  1383  // can arrange for both possible outcomes of the race.
  1384  //
  1385  // Range 1 is the system keyspace, located on node 0.
  1386  //
  1387  // The range containing leftKey is the left side of the split, located
  1388  // on nodes 1, 2, and 3.
  1389  //
  1390  // The range containing rightKey is the right side of the split,
  1391  // located on nodes 3, 4, and 5.
  1392  //
  1393  // Nodes 1-5 are stopped; only node 0 is running.
  1394  //
  1395  // See https://github.com/cockroachdb/cockroach/issues/1644.
  1396  func runSetupSplitSnapshotRace(
  1397  	t *testing.T, testFn func(*multiTestContext, roachpb.Key, roachpb.Key),
  1398  ) {
  1399  	sc := kvserver.TestStoreConfig(nil)
  1400  	// We'll control replication by hand.
  1401  	sc.TestingKnobs.DisableReplicateQueue = true
  1402  	// Async intent resolution can sometimes lead to hangs when we stop
  1403  	// most of the stores at the end of this function.
  1404  	sc.TestingKnobs.IntentResolverKnobs.DisableAsyncIntentResolution = true
  1405  	// Avoid fighting with the merge queue while trying to reproduce this race.
  1406  	sc.TestingKnobs.DisableMergeQueue = true
  1407  	sc.TestingKnobs.DisableGCQueue = true
  1408  	// Disable the split delay mechanism, or it'll spend 10s going in circles.
  1409  	// (We can't set it to zero as otherwise the default overrides us).
  1410  	sc.RaftDelaySplitToSuppressSnapshotTicks = -1
  1411  	sc.Clock = nil // manual clock
  1412  	mtc := &multiTestContext{storeConfig: &sc}
  1413  	defer mtc.Stop()
  1414  	mtc.Start(t, 6)
  1415  
  1416  	leftKey := roachpb.Key("a")
  1417  	rightKey := roachpb.Key("z")
  1418  
  1419  	// First, do a couple of writes; we'll use these to determine when
  1420  	// the dust has settled.
  1421  	incArgs := incrementArgs(leftKey, 1)
  1422  	if _, pErr := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); pErr != nil {
  1423  		t.Fatal(pErr)
  1424  	}
  1425  	incArgs = incrementArgs(rightKey, 2)
  1426  	if _, pErr := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), incArgs); pErr != nil {
  1427  		t.Fatal(pErr)
  1428  	}
  1429  
  1430  	// Split the system range from the rest of the keyspace.
  1431  	splitArgs := adminSplitArgs(keys.SystemMax)
  1432  	if _, pErr := kv.SendWrapped(context.Background(), mtc.stores[0].TestSender(), splitArgs); pErr != nil {
  1433  		t.Fatal(pErr)
  1434  	}
  1435  
  1436  	// Get the left range's ID. This is currently 2, but using
  1437  	// LookupReplica is more future-proof (and see below for
  1438  	// rightRangeID).
  1439  	leftRangeID := mtc.stores[0].LookupReplica(roachpb.RKey("a")).RangeID
  1440  
  1441  	// Replicate the left range onto nodes 1-3 and remove it from node 0. We have
  1442  	// to transfer the lease before unreplicating from node 0 because it isn't
  1443  	// safe (or allowed) for a leaseholder to remove itself from a range
  1444  	// without first giving up its lease.
  1445  	mtc.replicateRange(leftRangeID, 1, 2, 3)
  1446  	mtc.transferLease(context.Background(), leftRangeID, 0, 1)
  1447  	mtc.unreplicateRange(leftRangeID, 0)
  1448  
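        	// The expected values are indexed by store; after the replication changes
        	// above, only stores 1-3 hold the data range and see the written values.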
  1449  	mtc.waitForValues(leftKey, []int64{0, 1, 1, 1, 0, 0})
  1450  	mtc.waitForValues(rightKey, []int64{0, 2, 2, 2, 0, 0})
  1451  
  1452  	// Stop node 3 so it doesn't hear about the split.
  1453  	mtc.stopStore(3)
  1454  	mtc.advanceClock(context.Background())
  1455  
  1456  	// Split the data range.
  1457  	splitArgs = adminSplitArgs(roachpb.Key("m"))
  1458  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
  1459  		t.Fatal(pErr)
  1460  	}
  1461  
  1462  	// Get the right range's ID. Since the split was performed on node
  1463  	// 1, it is currently 11 and not 3 as might be expected.
  1464  	var rightRangeID roachpb.RangeID
  1465  	testutils.SucceedsSoon(t, func() error {
  1466  		rightRangeID = mtc.stores[1].LookupReplica(roachpb.RKey("z")).RangeID
  1467  		if rightRangeID == leftRangeID {
  1468  			return errors.Errorf("store 1 hasn't processed split yet")
  1469  		}
  1470  		return nil
  1471  	})
  1472  
  1473  	// Relocate the right range onto nodes 3-5.
  1474  	mtc.replicateRange(rightRangeID, 4, 5)
  1475  	mtc.unreplicateRange(rightRangeID, 2)
  1476  	mtc.transferLease(context.Background(), rightRangeID, 1, 4)
  1477  	mtc.unreplicateRange(rightRangeID, 1)
  1478  
  1479  	// Perform another increment after all the replication changes. This
  1480  	// lets us ensure that all the replication changes have been
  1481  	// processed and applied on all replicas. This is necessary because
  1482  	// the range is in an unstable state at the time of the last
  1483  	// unreplicateRange call above. It has four members which means it
  1484  	// can only tolerate one failure without losing quorum. That failure
  1485  	// is store 3 which we stopped earlier. Stopping store 1 too soon
  1486  	// (before it has committed the final config change *and* propagated
  1487  	// that commit to the followers 4 and 5) would constitute a second
  1488  	// failure and render the range unable to achieve quorum after
  1489  	// restart (in the SnapshotWins branch).
  1490  	incArgs = incrementArgs(rightKey, 3)
  1491  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1492  		t.Fatal(pErr)
  1493  	}
  1494  
  1495  	// Store 3 still has the old value, but 4 and 5 are up to date.
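        	// The up-to-date value is 5: 2 from the initial increment plus 3 from the
        	// increment just above.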
  1496  	mtc.waitForValues(rightKey, []int64{0, 0, 0, 2, 5, 5})
  1497  
  1498  	// Scan the meta ranges to resolve all intents.
  1499  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0],
  1500  		&roachpb.ScanRequest{
  1501  			RequestHeader: roachpb.RequestHeader{
  1502  				Key:    keys.MetaMin,
  1503  				EndKey: keys.MetaMax,
  1504  			},
  1505  		}); pErr != nil {
  1506  		t.Fatal(pErr)
  1507  	}
  1508  
  1509  	// Stop the remaining data stores.
  1510  	mtc.stopStore(1)
  1511  	mtc.stopStore(2)
  1512  	// 3 is already stopped.
  1513  	mtc.stopStore(4)
  1514  	mtc.stopStore(5)
  1515  
  1516  	testFn(mtc, leftKey, rightKey)
  1517  }
  1518  
  1519  // TestSplitSnapshotRace_SplitWins exercises one outcome of the
  1520  // split/snapshot race: The left side of the split propagates first,
  1521  // so the split completes before it sees a competing snapshot. This is
  1522  // the more common outcome in practice.
  1523  func TestSplitSnapshotRace_SplitWins(t *testing.T) {
  1524  	defer leaktest.AfterTest(t)()
  1525  	runSetupSplitSnapshotRace(t, func(mtc *multiTestContext, leftKey, rightKey roachpb.Key) {
  1526  		// Bring the left range up first so that the split happens before it sees a snapshot.
  1527  		for i := 1; i <= 3; i++ {
  1528  			mtc.restartStore(i)
  1529  		}
  1530  
  1531  		// Perform a write on the left range and wait for it to propagate.
  1532  		incArgs := incrementArgs(leftKey, 10)
  1533  		if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1534  			t.Fatal(pErr)
  1535  		}
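        		// leftKey held 1 from the setup increment, so stores 1-3 should now read 11.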
  1536  		mtc.waitForValues(leftKey, []int64{0, 11, 11, 11, 0, 0})
  1537  
  1538  		// Now wake the other stores up.
  1539  		mtc.restartStore(4)
  1540  		mtc.restartStore(5)
  1541  
  1542  		// Write to the right range.
  1543  		incArgs = incrementArgs(rightKey, 20)
  1544  		if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1545  			t.Fatal(pErr)
  1546  		}
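        		// rightKey held 5 (2+3) from the setup increments, so stores 3-5 should now read 25.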
  1547  		mtc.waitForValues(rightKey, []int64{0, 0, 0, 25, 25, 25})
  1548  	})
  1549  }
  1550  
  1551  // TestSplitSnapshotRace_SnapshotWins exercises one outcome of the
  1552  // split/snapshot race: The right side of the split replicates first,
  1553  // so the target node sees a raft snapshot before it has processed the
  1554  // split, i.e. while it still holds a conflicting copy of the pre-split range.
  1555  func TestSplitSnapshotRace_SnapshotWins(t *testing.T) {
  1556  	defer leaktest.AfterTest(t)()
  1557  	runSetupSplitSnapshotRace(t, func(mtc *multiTestContext, leftKey, rightKey roachpb.Key) {
  1558  		// Bring the right range up first.
  1559  		for i := 3; i <= 5; i++ {
  1560  			mtc.restartStore(i)
  1561  		}
  1562  
  1563  		// Perform a write on the right range.
  1564  		incArgs := incrementArgs(rightKey, 20)
  1565  		if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1566  			t.Fatal(pErr)
  1567  		}
  1568  
  1569  		// It immediately propagates between nodes 4 and 5, but node 3
  1570  		// remains at its old value. It can't accept the right-hand range
  1571  		// because it conflicts with its not-yet-split copy of the left-hand
  1572  		// range. This test is not completely deterministic: we want to make
  1573  		// sure that node 3 doesn't panic when it receives the snapshot, but
  1574  		// since it silently drops the message there is nothing we can wait
  1575  		// for. There is a high probability that the message will have been
  1576  		// received by the time that nodes 4 and 5 have processed their
  1577  		// update.
  1578  		mtc.waitForValues(rightKey, []int64{0, 0, 0, 2, 25, 25})
  1579  
  1580  		// Wake up the left-hand range. This will allow the left-hand
  1581  		// range's split to complete and unblock the right-hand range.
  1582  		mtc.restartStore(1)
  1583  		mtc.restartStore(2)
  1584  
  1585  		// Perform writes on both sides. This is not strictly necessary but
  1586  		// it helps wake up dormant ranges that would otherwise have to wait
  1587  		// for retry timeouts.
  1588  		incArgs = incrementArgs(leftKey, 10)
  1589  		if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1590  			t.Fatal(pErr)
  1591  		}
  1592  		mtc.waitForValues(leftKey, []int64{0, 11, 11, 11, 0, 0})
  1593  
  1594  		incArgs = incrementArgs(rightKey, 200)
  1595  		if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  1596  			t.Fatal(pErr)
  1597  		}
  1598  		mtc.waitForValues(rightKey, []int64{0, 0, 0, 225, 225, 225})
  1599  	})
  1600  }
  1601  
  1602  // TestStoreSplitTimestampCacheDifferentLeaseHolder prevents regression of
  1603  // #7899. When the first lease holder of the right-hand side of a Split was
  1604  // not equal to the left-hand side lease holder (at the time of the split),
  1605  // its timestamp cache would not be properly initialized, which would allow
  1606  // writes that invalidated reads previously served by the pre-split lease.
  1607  func TestStoreSplitTimestampCacheDifferentLeaseHolder(t *testing.T) {
  1608  	defer leaktest.AfterTest(t)()
  1609  
  1610  	ctx := context.Background()
  1611  
  1612  	leftKey := roachpb.Key("a")
  1613  	splitKey := roachpb.Key("b")
  1614  	rightKey := roachpb.Key("c")
  1615  
  1616  	// This filter is better understood when reading the meat of the test
  1617  	// below first.
  1618  	var noLeaseForDesc atomic.Value
  1619  	filter := func(args kvserverbase.FilterArgs) *roachpb.Error {
  1620  		leaseReq, argOK := args.Req.(*roachpb.RequestLeaseRequest)
  1621  		forbiddenDesc, descOK := noLeaseForDesc.Load().(*roachpb.ReplicaDescriptor)
  1622  		if !argOK || !descOK || !bytes.Equal(leaseReq.Key, splitKey) {
  1623  			return nil
  1624  		}
  1625  		log.Infof(ctx, "received lease request (%s, %s)",
  1626  			leaseReq.Span(), leaseReq.Lease)
  1627  		if !reflect.DeepEqual(*forbiddenDesc, leaseReq.Lease.Replica) {
  1628  			return nil
  1629  		}
  1630  		log.Infof(ctx,
  1631  			"refusing lease request (%s, %s) because %+v held lease for LHS of split",
  1632  			leaseReq.Span(), leaseReq.Lease, forbiddenDesc)
  1633  		return roachpb.NewError(&roachpb.NotLeaseHolderError{RangeID: args.Hdr.RangeID})
  1634  	}
  1635  
  1636  	var args base.TestClusterArgs
  1637  	args.ReplicationMode = base.ReplicationManual
  1638  	args.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
  1639  		EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1640  			TestingEvalFilter: filter,
  1641  		},
  1642  	}
  1643  
  1644  	tc := testcluster.StartTestCluster(t, 2, args)
  1645  	defer tc.Stopper().Stop(context.Background())
  1646  
  1647  	// Split the data range, mainly to avoid other splits getting in our way.
  1648  	for _, k := range []roachpb.Key{leftKey, rightKey} {
  1649  		if _, _, err := tc.SplitRange(k); err != nil {
  1650  			t.Fatal(errors.Wrapf(err, "split at %s", k))
  1651  		}
  1652  	}
  1653  	if _, err := tc.AddReplicas(leftKey, tc.Target(1)); err != nil {
  1654  		t.Fatal(err)
  1655  	}
  1656  
  1657  	db := tc.Servers[0].DB() // irrelevant which one we use
  1658  
  1659  	// Make a context tied to the Stopper. The test works without it, but this
  1660  	// is cleaner since we won't properly terminate the transaction below.
  1661  	ctx, cancel := tc.Server(0).Stopper().WithCancelOnQuiesce(ctx)
  1662  	defer cancel()
  1663  
  1664  	// This transaction will try to write "under" a served read.
  1665  	txnOld := kv.NewTxn(ctx, db, 0 /* gatewayNodeID */)
  1666  
  1667  	// Do something with txnOld so that its timestamp gets set.
  1668  	if _, err := txnOld.Scan(ctx, "a", "b", 0); err != nil {
  1669  		t.Fatal(err)
  1670  	}
  1671  
  1672  	// Another client comes along at a higher timestamp, touching everything on
  1673  	// the right of the (soon-to-be) split key.
  1674  	if _, err := db.Scan(ctx, splitKey, rightKey, 0); err != nil {
  1675  		t.Fatal(err)
  1676  	}
  1677  
  1678  	// This block makes sure that from now on, we don't allow the current
  1679  	// lease holder of our range to extend. Any attempt of doing so will
  1680  	// catch a NotLeaseHolderError, which means a retry by DistSender (until
  1681  	// the other node gets to be the lease holder instead).
  1682  	//
  1683  	// This makes sure that once we split, we'll be in the situation described
  1684  	// in #7899 (before the fix): The first lease holder of the right hand side
  1685  	// of the Split will not be that of the pre-split Range.
  1686  	// With the fix, the right-hand lease is initialized from the left-hand
  1687  	// lease, so the lease holders are the same, and there will never be a
  1688  	// lease request for the right-hand side in this test.
  1689  	leaseHolder := func(k roachpb.Key) roachpb.ReplicaDescriptor {
  1690  		desc, err := tc.LookupRange(k)
  1691  		if err != nil {
  1692  			t.Fatal(err)
  1693  		}
  1694  		lease, _, err := tc.FindRangeLease(desc, nil)
  1695  		if err != nil {
  1696  			t.Fatal(err)
  1697  		}
  1698  		leaseHolder := lease.Replica
  1699  		replica, found := desc.GetReplicaDescriptor(leaseHolder.StoreID)
  1700  		if !found {
  1701  			t.Fatalf("no replica on store %d found in %+v", leaseHolder.StoreID, desc)
  1702  		}
  1703  		return replica
  1704  	}
  1705  	blacklistedLeaseHolder := leaseHolder(leftKey)
  1706  	log.Infof(ctx, "blacklisting replica %+v for leases", blacklistedLeaseHolder)
  1707  	noLeaseForDesc.Store(&blacklistedLeaseHolder)
  1708  
  1709  	// Pull the trigger. This actually also reads the RHS descriptor after the
  1710  	// split, so when this returns, we've got the leases set up already.
  1711  	//
  1712  	// There's a slight race here: Just above, we've settled on who must not
  1713  	// be the future lease holder. But between then and now, that lease could
  1714  	// have expired and the other Replica could have obtained it. This would
  1715  	// have given it a proper initialization of the timestamp cache, and so
  1716  	// the split trigger would populate the right hand side with a timestamp
  1717  	// cache which does not exhibit the anomaly.
  1718  	//
  1719  	// In practice, this should only be possible if second-long delays occur
  1720  	// just above this comment, and we assert against it below.
  1721  	log.Infof(ctx, "splitting at %s", splitKey)
  1722  	if _, _, err := tc.SplitRange(splitKey); err != nil {
  1723  		t.Fatal(err)
  1724  	}
  1725  
  1726  	if currentLHSLeaseHolder := leaseHolder(leftKey); !reflect.DeepEqual(
  1727  		currentLHSLeaseHolder, blacklistedLeaseHolder) {
  1728  		t.Fatalf("lease holder changed from %+v to %+v, should de-flake this test",
  1729  			blacklistedLeaseHolder, currentLHSLeaseHolder)
  1730  	}
  1731  
  1732  	// This write (to the right-hand side of the split) should hit the
  1733  	// timestamp cache and flag the txn for a restart when we try to commit it
  1734  	// below. With the bug in #7899, the RHS of the split had an empty
  1735  	// timestamp cache and would simply let us write behind the previous read.
  1736  	if err := txnOld.Put(ctx, "bb", "bump"); err != nil {
  1737  		t.Fatal(err)
  1738  	}
  1739  
  1740  	if err := txnOld.Commit(ctx); err != nil {
  1741  		t.Fatalf("unexpected txn commit err: %+v", err)
  1742  	}
  1743  
  1744  	// Verify that the txn's safe timestamp was set.
  1745  	if txnOld.TestingCloneTxn().ReadTimestamp == (hlc.Timestamp{}) {
  1746  		t.Fatal("expected non-zero refreshed timestamp")
  1747  	}
  1748  
  1749  	// As outlined above, the anomaly was fixed by giving the right-hand side
  1750  	// of the split the same lease as the left-hand side of the Split. Check
  1751  	// that that's what's happened (we actually test a little more, namely
  1752  	// that it's the same ReplicaID, which is not required but should always
  1753  	// hold).
  1754  	if rhsLease := leaseHolder(rightKey); !reflect.DeepEqual(
  1755  		rhsLease, blacklistedLeaseHolder,
  1756  	) {
  1757  		t.Errorf("expected LHS and RHS to have same lease holder")
  1758  	}
  1759  }
  1760  
  1761  // TestStoreSplitOnRemovedReplica prevents regression of #23673. In that issue,
  1762  // it was observed that the retry loop in AdminSplit could go into an infinite
  1763  // loop if the replica it was being run on had been removed from the range. The
  1764  // loop now checks that the replica performing the split is the leaseholder
  1765  // before each iteration.
  1766  func TestStoreSplitOnRemovedReplica(t *testing.T) {
  1767  	defer leaktest.AfterTest(t)()
  1768  
  1769  	leftKey := roachpb.Key("a")
  1770  	splitKey := roachpb.Key("b")
  1771  	rightKey := roachpb.Key("c")
  1772  
  1773  	var newDesc roachpb.RangeDescriptor
  1774  	inFilter := make(chan struct{}, 1)
  1775  	beginBlockingSplit := make(chan struct{})
  1776  	finishBlockingSplit := make(chan struct{})
  1777  	filter := func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  1778  		// Block replica 1's attempt to perform the AdminSplit. We detect the
  1779  		// split's range descriptor update and block until the rest of the test
  1780  		// is ready. We then return a ConditionFailedError, simulating a
  1781  		// descriptor update race.
  1782  		if ba.Replica.NodeID == 1 {
  1783  			for _, req := range ba.Requests {
  1784  				if cput, ok := req.GetInner().(*roachpb.ConditionalPutRequest); ok {
  1785  					leftDescKey := keys.RangeDescriptorKey(roachpb.RKey(leftKey))
  1786  					if cput.Key.Equal(leftDescKey) {
  1787  						var desc roachpb.RangeDescriptor
  1788  						if err := cput.Value.GetProto(&desc); err != nil {
  1789  							panic(err)
  1790  						}
  1791  
  1792  						if desc.EndKey.Equal(splitKey) {
  1793  							select {
  1794  							case <-beginBlockingSplit:
  1795  								select {
  1796  								case inFilter <- struct{}{}:
  1797  									// Let the test know we're in the filter.
  1798  								default:
  1799  								}
  1800  								<-finishBlockingSplit
  1801  
  1802  								var val roachpb.Value
  1803  								if err := val.SetProto(&newDesc); err != nil {
  1804  									panic(err)
  1805  								}
  1806  								return roachpb.NewError(&roachpb.ConditionFailedError{
  1807  									ActualValue: &val,
  1808  								})
  1809  							default:
  1810  							}
  1811  						}
  1812  					}
  1813  				}
  1814  			}
  1815  		}
  1816  		return nil
  1817  	}
  1818  
  1819  	var args base.TestClusterArgs
  1820  	args.ReplicationMode = base.ReplicationManual
  1821  	args.ServerArgs.Knobs.Store = &kvserver.StoreTestingKnobs{
  1822  		TestingRequestFilter: filter,
  1823  	}
  1824  
  1825  	tc := testcluster.StartTestCluster(t, 3, args)
  1826  	defer tc.Stopper().Stop(context.Background())
  1827  
  1828  	// Split the data range, mainly to avoid other splits getting in our way.
  1829  	for _, k := range []roachpb.Key{leftKey, rightKey} {
  1830  		if _, _, err := tc.SplitRange(k); err != nil {
  1831  			t.Fatal(errors.Wrapf(err, "split at %s", k))
  1832  		}
  1833  	}
  1834  
  1835  	// Send an AdminSplit request to the replica. In the filter above we'll
  1836  	// block the first cput in this split until we're ready to let it loose
  1837  	// again, which will be after we remove the replica from the range.
  1838  	splitRes := make(chan error)
  1839  	close(beginBlockingSplit)
  1840  	go func() {
  1841  		_, _, err := tc.SplitRange(splitKey)
  1842  		splitRes <- err
  1843  	}()
  1844  	<-inFilter
  1845  
  1846  	// Move the range from node 0 to node 1. Then add node 2 to the range.
  1847  	// Node 0 will never hear about this range descriptor update.
  1848  	var err error
  1849  	if newDesc, err = tc.AddReplicas(leftKey, tc.Target(1)); err != nil {
  1850  		t.Fatal(err)
  1851  	}
  1852  	if err := tc.TransferRangeLease(newDesc, tc.Target(1)); err != nil {
  1853  		t.Fatal(err)
  1854  	}
  1855  	if _, err := tc.RemoveReplicas(leftKey, tc.Target(0)); err != nil {
  1856  		t.Fatal(err)
  1857  	}
  1858  	if newDesc, err = tc.AddReplicas(leftKey, tc.Target(2)); err != nil {
  1859  		t.Fatal(err)
  1860  	}
  1861  
  1862  	// Stop blocking the split request's cput. This will cause the cput to fail
  1863  	// with a ConditionFailedError. The error will warrant a retry in
  1864  	// AdminSplit's retry loop, but when the removed replica notices that it is
  1865  	// no longer the leaseholder, it will return a NotLeaseholderError. This in
  1866  	// turn will allow the AdminSplit to be re-routed to the new leaseholder,
  1867  	// where it will succeed.
  1868  	close(finishBlockingSplit)
  1869  	if err = <-splitRes; err != nil {
  1870  		t.Errorf("AdminSplit returned error: %+v", err)
  1871  	}
  1872  }
  1873  
  1874  func TestStoreSplitGCThreshold(t *testing.T) {
  1875  	defer leaktest.AfterTest(t)()
  1876  	storeCfg := kvserver.TestStoreConfig(nil)
  1877  	storeCfg.TestingKnobs.DisableSplitQueue = true
  1878  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1879  	stopper := stop.NewStopper()
  1880  	defer stopper.Stop(context.Background())
  1881  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  1882  
  1883  	leftKey := roachpb.Key("a")
  1884  	splitKey := roachpb.Key("b")
  1885  	rightKey := roachpb.Key("c")
  1886  	content := []byte("test")
  1887  
  1888  	pArgs := putArgs(leftKey, content)
  1889  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
  1890  		t.Fatal(pErr)
  1891  	}
  1892  	pArgs = putArgs(rightKey, content)
  1893  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), pArgs); pErr != nil {
  1894  		t.Fatal(pErr)
  1895  	}
  1896  
  1897  	specifiedGCThreshold := hlc.Timestamp{
  1898  		WallTime: 2e9,
  1899  	}
  1900  	gcArgs := &roachpb.GCRequest{
  1901  		RequestHeader: roachpb.RequestHeader{
  1902  			Key:    leftKey,
  1903  			EndKey: rightKey,
  1904  		},
  1905  		Threshold: specifiedGCThreshold,
  1906  	}
  1907  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), gcArgs); pErr != nil {
  1908  		t.Fatal(pErr)
  1909  	}
  1910  
  1911  	args := adminSplitArgs(splitKey)
  1912  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
  1913  		t.Fatal(pErr)
  1914  	}
  1915  
  1916  	repl := store.LookupReplica(roachpb.RKey(splitKey))
  1917  	gcThreshold := repl.GetGCThreshold()
  1918  
  1919  	if !reflect.DeepEqual(gcThreshold, specifiedGCThreshold) {
  1920  		t.Fatalf("expected RHS's GCThreshold to equal %v, but got %v", specifiedGCThreshold, gcThreshold)
  1921  	}
  1922  
  1923  	repl.AssertState(context.Background(), store.Engine())
  1924  }
  1925  
  1926  // TestStoreRangeSplitRaceUninitializedRHS reproduces #7600 (before it was
  1927  // fixed). While splits are happening, we simulate incoming messages for the
  1928  // right-hand side to trigger a race between the creation of the proper replica
  1929  // and the uninitialized replica reacting to messages.
  1930  func TestStoreRangeSplitRaceUninitializedRHS(t *testing.T) {
  1931  	defer leaktest.AfterTest(t)()
  1932  	mtc := &multiTestContext{}
  1933  	storeCfg := kvserver.TestStoreConfig(nil)
  1934  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1935  	// An aggressive tick interval lets groups communicate more and thus
  1936  	// triggers test failures much more reliably. We can't go too aggressive
  1937  	// or race tests never make any progress.
  1938  	storeCfg.RaftTickInterval = 50 * time.Millisecond
  1939  	storeCfg.RaftElectionTimeoutTicks = 2
  1940  	currentTrigger := make(chan *roachpb.SplitTrigger, 1)
  1941  	var seen struct {
  1942  		syncutil.Mutex
  1943  		sids map[kvserverbase.CmdIDKey][2]bool
  1944  	}
  1945  	seen.sids = make(map[kvserverbase.CmdIDKey][2]bool)
  1946  
  1947  	storeCfg.TestingKnobs.EvalKnobs.TestingEvalFilter = func(args kvserverbase.FilterArgs) *roachpb.Error {
  1948  		et, ok := args.Req.(*roachpb.EndTxnRequest)
  1949  		if !ok || et.InternalCommitTrigger == nil {
  1950  			return nil
  1951  		}
  1952  		trigger := protoutil.Clone(et.InternalCommitTrigger.GetSplitTrigger()).(*roachpb.SplitTrigger)
  1953  		// The first time the trigger arrives (on each of the two stores),
  1954  		// return a transaction retry. This allows us to pass the trigger to
  1955  		// the goroutine creating faux incoming messages for the yet
  1956  		// nonexistent right-hand-side, giving it a head start. This code looks
  1957  		// fairly complicated since it wants to ensure that the two replicas
  1958  		// don't diverge.
  1959  		if trigger != nil && len(trigger.RightDesc.InternalReplicas) == 2 && args.Hdr.Txn.Epoch == 0 {
  1960  			seen.Lock()
  1961  			defer seen.Unlock()
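        			// args.Sid is the 1-based StoreID of the evaluating store; use it as a
        			// 0-based index into the per-command pair of "seen" flags.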
  1962  			sid, sl := int(args.Sid)-1, seen.sids[args.CmdID]
  1963  			if !sl[sid] {
  1964  				sl[sid] = true
  1965  				seen.sids[args.CmdID] = sl
  1966  			} else {
  1967  				return nil
  1968  			}
  1969  			select {
  1970  			case currentTrigger <- trigger:
  1971  			default:
  1972  			}
  1973  			return roachpb.NewError(
  1974  				roachpb.NewReadWithinUncertaintyIntervalError(
  1975  					args.Hdr.Timestamp, args.Hdr.Timestamp, nil,
  1976  				))
  1977  		}
  1978  		return nil
  1979  	}
  1980  
  1981  	mtc.storeConfig = &storeCfg
  1982  	defer mtc.Stop()
  1983  	mtc.Start(t, 2)
  1984  
  1985  	leftRange := mtc.stores[0].LookupReplica(roachpb.RKey("a"))
  1986  
  1987  	// Replicate the left range onto the second node. We don't wait since we
  1988  	// don't actually care what the second node does. All we want is that the
  1989  	// first node isn't surprised by messages from that node.
  1990  	mtc.replicateRange(leftRange.RangeID, 1)
  1991  
  1992  	for i := 0; i < 10; i++ {
  1993  		errChan := make(chan *roachpb.Error)
  1994  
  1995  		// Closed when the split goroutine is done.
  1996  		splitDone := make(chan struct{})
  1997  
  1998  		go func() {
  1999  			defer close(splitDone)
  2000  
  2001  			// Split the data range. The split keys are chosen so that they move
  2002  			// towards "a" (so that the range being split is always the first
  2003  			// range).
  2004  			splitKey := roachpb.Key(encoding.EncodeVarintDescending([]byte("a"), int64(i)))
  2005  			splitArgs := adminSplitArgs(splitKey)
  2006  			_, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs)
  2007  			errChan <- pErr
  2008  		}()
  2009  		go func() {
  2010  			defer func() { errChan <- nil }()
  2011  
  2012  			trigger := <-currentTrigger // our own copy
  2013  			// Make sure the first node is first for convenience.
  2014  			replicas := trigger.RightDesc.InternalReplicas
  2015  			if replicas[0].NodeID > replicas[1].NodeID {
  2016  				tmp := replicas[1]
  2017  				replicas[1] = replicas[0]
  2018  				replicas[0] = tmp
  2019  			}
  2020  
  2021  			// Send a few vote requests which look like they're from the other
  2022  			// node's right hand side of the split. This triggers a race which
  2023  			// is discussed in #7600 (briefly, the creation of the right hand
  2024  			// side in the split trigger was racing with the uninitialized
  2025  			// version for the same group, resulting in clobbered HardState).
  2026  			for term := uint64(1); ; term++ {
  2027  				if sent := mtc.transport.SendAsync(&kvserver.RaftMessageRequest{
  2028  					RangeID:     trigger.RightDesc.RangeID,
  2029  					ToReplica:   replicas[0],
  2030  					FromReplica: replicas[1],
  2031  					Message: raftpb.Message{
  2032  						Type: raftpb.MsgVote,
  2033  						To:   uint64(replicas[0].ReplicaID),
  2034  						From: uint64(replicas[1].ReplicaID),
  2035  						Term: term,
  2036  					},
  2037  				}, rpc.DefaultClass); !sent {
  2038  					t.Error("transport failed to send vote request")
  2039  				}
  2040  				select {
  2041  				case <-splitDone:
  2042  					return
  2043  				case <-time.After(time.Microsecond):
  2044  					// If we busy-loop here, we monopolize processRaftMu and the
  2045  					// split takes a long time to complete. Sleeping reduces the
  2046  					// chance that we hit the race, but it still shows up under
  2047  					// stress.
  2048  				}
  2049  			}
  2050  		}()
  2051  		for i := 0; i < 2; i++ {
  2052  			if pErr := <-errChan; pErr != nil {
  2053  				t.Fatal(pErr)
  2054  			}
  2055  		}
  2056  	}
  2057  }
  2058  
  2059  // TestLeaderAfterSplit verifies that a raft group created by a split
  2060  // elects a leader without waiting for an election timeout.
  2061  func TestLeaderAfterSplit(t *testing.T) {
  2062  	defer leaktest.AfterTest(t)()
  2063  	storeConfig := kvserver.TestStoreConfig(nil)
  2064  	storeConfig.TestingKnobs.DisableMergeQueue = true
  2065  	storeConfig.RaftElectionTimeoutTicks = 1000000
  2066  	mtc := &multiTestContext{
  2067  		storeConfig: &storeConfig,
  2068  	}
  2069  	defer mtc.Stop()
  2070  	mtc.Start(t, 3)
  2071  
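        	// Replicate range 1 onto stores 1 and 2. With the huge election timeout
        	// configured above, the writes below would hang if the group created by
        	// the split had to wait for an election timeout to elect a leader.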
  2072  	mtc.replicateRange(1, 1, 2)
  2073  
  2074  	leftKey := roachpb.Key("a")
  2075  	splitKey := roachpb.Key("m")
  2076  	rightKey := roachpb.Key("z")
  2077  
  2078  	splitArgs := adminSplitArgs(splitKey)
  2079  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
  2080  		t.Fatal(pErr)
  2081  	}
  2082  
  2083  	incArgs := incrementArgs(leftKey, 1)
  2084  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  2085  		t.Fatal(pErr)
  2086  	}
  2087  
  2088  	incArgs = incrementArgs(rightKey, 2)
  2089  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], incArgs); pErr != nil {
  2090  		t.Fatal(pErr)
  2091  	}
  2092  }
  2093  
  2094  func BenchmarkStoreRangeSplit(b *testing.B) {
  2095  	var mtc multiTestContext
  2096  	mtc.Start(b, 1)
  2097  	defer mtc.Stop()
  2098  	store := mtc.Store(0)
  2099  
  2100  	// Perform initial split of ranges.
  2101  	sArgs := adminSplitArgs(roachpb.Key("b"))
  2102  	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), sArgs); err != nil {
  2103  		b.Fatal(err)
  2104  	}
  2105  
  2106  	// Write some values left and right of the split key.
  2107  	aDesc := store.LookupReplica([]byte("a")).Desc()
  2108  	bDesc := store.LookupReplica([]byte("c")).Desc()
  2109  	kvserver.WriteRandomDataToRange(b, store, aDesc.RangeID, []byte("aaa"))
  2110  	kvserver.WriteRandomDataToRange(b, store, bDesc.RangeID, []byte("ccc"))
  2111  
  2112  	// Merge the b range back into the a range.
  2113  	mArgs := adminMergeArgs(roachpb.KeyMin)
  2114  	if _, err := kv.SendWrapped(context.Background(), store.TestSender(), mArgs); err != nil {
  2115  		b.Fatal(err)
  2116  	}
  2117  
  2118  	b.ResetTimer()
  2119  	for i := 0; i < b.N; i++ {
  2120  		// Split the range.
  2121  		b.StartTimer()
  2122  		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), sArgs); err != nil {
  2123  			b.Fatal(err)
  2124  		}
  2125  
  2126  		// Merge the ranges.
  2127  		b.StopTimer()
  2128  		if _, err := kv.SendWrapped(context.Background(), store.TestSender(), mArgs); err != nil {
  2129  			b.Fatal(err)
  2130  		}
  2131  	}
  2132  }
  2133  
  2134  func writeRandomTimeSeriesDataToRange(
  2135  	t testing.TB, store *kvserver.Store, rangeID roachpb.RangeID, keyPrefix []byte,
  2136  ) (midpoint []byte) {
  2137  	src := rand.New(rand.NewSource(0))
  2138  	r := ts.Resolution10s
  2139  	for i := 0; i < 20; i++ {
  2140  		var data []tspb.TimeSeriesData
  2141  		for j := int64(0); j <= src.Int63n(5); j++ {
  2142  			d := tspb.TimeSeriesData{
  2143  				Name:   "test.random.metric",
  2144  				Source: "cpu01",
  2145  			}
  2146  			for k := int64(0); k <= src.Int63n(10); k++ {
  2147  				d.Datapoints = append(d.Datapoints, tspb.TimeSeriesDatapoint{
  2148  					TimestampNanos: src.Int63n(200) * r.SlabDuration(),
  2149  					Value:          src.Float64(),
  2150  				})
  2151  			}
  2152  			data = append(data, d)
  2153  		}
  2154  		for _, d := range data {
  2155  			idatas, err := d.ToInternal(r.SlabDuration(), r.SampleDuration(), false)
  2156  			if err != nil {
  2157  				t.Fatal(err)
  2158  			}
  2159  			for _, idata := range idatas {
  2160  				var value roachpb.Value
  2161  				if err := value.SetProto(&idata); err != nil {
  2162  					t.Fatal(err)
  2163  				}
  2164  				mArgs := roachpb.MergeRequest{
  2165  					RequestHeader: roachpb.RequestHeader{
  2166  						Key: encoding.EncodeVarintAscending(keyPrefix, idata.StartTimestampNanos),
  2167  					},
  2168  					Value: value,
  2169  				}
  2170  				if _, pErr := kv.SendWrappedWith(context.Background(), store.TestSender(), roachpb.Header{
  2171  					RangeID: rangeID,
  2172  				}, &mArgs); pErr != nil {
  2173  					t.Fatal(pErr)
  2174  				}
  2175  			}
  2176  		}
  2177  	}
  2178  	// Return approximate midway point (100 is midway between random timestamps in range [0,200)).
  2179  	midKey := append([]byte(nil), keyPrefix...)
  2180  	midKey = encoding.EncodeVarintAscending(midKey, 100*r.SlabDuration())
  2181  	return midKey
  2182  }
  2183  
  2184  // TestStoreRangeGossipOnSplits verifies that the store descriptor
  2185  // is re-gossiped on each split until an additional range no longer changes
  2186  // the store's range count by more than GossipWhenCapacityDeltaExceedsFraction.
  2187  func TestStoreRangeGossipOnSplits(t *testing.T) {
  2188  	defer leaktest.AfterTest(t)()
  2189  	storeCfg := kvserver.TestStoreConfig(nil)
  2190  	storeCfg.GossipWhenCapacityDeltaExceedsFraction = 0.5 // 50% for testing
  2191  	// We can't properly test how frequently changes in the number of ranges
  2192  	// trigger the store to gossip its capacities if we have to worry about
  2193  	// changes in the number of leases also triggering store gossip.
  2194  	storeCfg.TestingKnobs.DisableLeaseCapacityGossip = true
  2195  	storeCfg.TestingKnobs.DisableSplitQueue = true
  2196  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2197  	storeCfg.TestingKnobs.DisableScanner = true
  2198  	stopper := stop.NewStopper()
  2199  	defer stopper.Stop(context.Background())
  2200  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  2201  	storeKey := gossip.MakeStoreKey(store.StoreID())
  2202  
  2203  	// Avoid excessive logging on under-replicated ranges due to our many splits.
  2204  	config.TestingSetupZoneConfigHook(stopper)
  2205  	zoneConfig := zonepb.DefaultZoneConfig()
  2206  	zoneConfig.NumReplicas = proto.Int32(1)
  2207  	config.TestingSetZoneConfig(0, zoneConfig)
  2208  
  2209  	var lastSD roachpb.StoreDescriptor
  2210  	rangeCountCh := make(chan int32)
  2211  	unregister := store.Gossip().RegisterCallback(storeKey, func(_ string, val roachpb.Value) {
  2212  		var sd roachpb.StoreDescriptor
  2213  		if err := val.GetProto(&sd); err != nil {
  2214  			panic(err)
  2215  		}
  2216  		// Wait for range count to change as this callback is invoked
  2217  		// for lease count changes as well.
  2218  		if sd.Capacity.RangeCount == lastSD.Capacity.RangeCount {
  2219  			return
  2220  		}
  2221  		lastSD = sd
  2222  		rangeCountCh <- sd.Capacity.RangeCount
  2223  	})
  2224  	defer unregister()
  2225  
  2226  	// Pull the first gossiped range count.
  2227  	lastRangeCount := <-rangeCountCh
  2228  
  2229  	splitFunc := func(i int) *roachpb.Error {
  2230  		splitKey := roachpb.Key(fmt.Sprintf("%02d", i))
  2231  		_, pErr := store.LookupReplica(roachpb.RKey(splitKey)).AdminSplit(
  2232  			context.Background(),
  2233  			roachpb.AdminSplitRequest{
  2234  				RequestHeader: roachpb.RequestHeader{
  2235  					Key: splitKey,
  2236  				},
  2237  				SplitKey: splitKey,
  2238  			},
  2239  			"test",
  2240  		)
  2241  		return pErr
  2242  	}
  2243  
  2244  	// Split until we split at least 20 ranges.
  2245  	var rangeCount int32
  2246  	for i := 0; rangeCount < 20; i++ {
  2247  		if pErr := splitFunc(i); pErr != nil {
  2248  			// Avoid flakes caused by bad clocks.
  2249  			if testutils.IsPError(pErr, "rejecting command with timestamp in the future") {
  2250  				log.Warningf(context.Background(), "ignoring split error: %s", pErr)
  2251  				continue
  2252  			}
  2253  			t.Fatal(pErr)
  2254  		}
  2255  		select {
  2256  		case rangeCount = <-rangeCountCh:
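        			// The test expects a re-gossip once the range count has grown by
        			// min(50% of the previous count, 3), allowing a slack of one range.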
  2257  			changeCount := int32(math.Ceil(math.Min(float64(lastRangeCount)*0.5, 3)))
  2258  			diff := rangeCount - (lastRangeCount + changeCount)
  2259  			if diff < -1 || diff > 1 {
  2260  				t.Errorf("gossiped range count %d more than 1 away from expected %d", rangeCount, lastRangeCount+changeCount)
  2261  			}
  2262  			lastRangeCount = rangeCount
  2263  		case <-time.After(10 * time.Millisecond):
  2264  		}
  2265  	}
  2266  }
  2267  
  2268  // TestStoreTxnWaitQueueEnabledOnSplit verifies that the TxnWaitQueue for
  2269  // the right hand side of the split range is enabled after a split.
  2270  func TestStoreTxnWaitQueueEnabledOnSplit(t *testing.T) {
  2271  	defer leaktest.AfterTest(t)()
  2272  	storeCfg := kvserver.TestStoreConfig(nil)
  2273  	storeCfg.TestingKnobs.DisableSplitQueue = true
  2274  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2275  	stopper := stop.NewStopper()
  2276  	defer stopper.Stop(context.Background())
  2277  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  2278  
  2279  	key := keys.UserTableDataMin
  2280  	args := adminSplitArgs(key)
  2281  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
  2282  		t.Fatalf("%q: split unexpected error: %s", key, pErr)
  2283  	}
  2284  
  2285  	rhsRepl := store.LookupReplica(roachpb.RKey(keys.UserTableDataMin))
  2286  	if !rhsRepl.GetConcurrencyManager().TxnWaitQueue().IsEnabled() {
  2287  		t.Errorf("expected RHS replica's push txn queue to be enabled post-split")
  2288  	}
  2289  }
  2290  
  2291  // TestDistributedTxnCleanup verifies that distributed transactions
  2292  // clean up their txn records after commit or abort.
  2293  func TestDistributedTxnCleanup(t *testing.T) {
  2294  	defer leaktest.AfterTest(t)()
  2295  	storeCfg := kvserver.TestStoreConfig(nil)
  2296  	storeCfg.TestingKnobs.DisableSplitQueue = true
  2297  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2298  	stopper := stop.NewStopper()
  2299  	defer stopper.Stop(context.Background())
  2300  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  2301  
  2302  	// Split at "a".
  2303  	lhsKey := roachpb.Key("a")
  2304  	args := adminSplitArgs(lhsKey)
  2305  	if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
  2306  		t.Fatalf("split at %q: %s", lhsKey, pErr)
  2307  	}
  2308  	lhs := store.LookupReplica(roachpb.RKey("a"))
  2309  
  2310  	// Split at "b".
  2311  	rhsKey := roachpb.Key("b")
  2312  	args = adminSplitArgs(rhsKey)
  2313  	if _, pErr := kv.SendWrappedWith(context.Background(), store, roachpb.Header{
  2314  		RangeID: lhs.RangeID,
  2315  	}, args); pErr != nil {
  2316  		t.Fatalf("split at %q: %s", rhsKey, pErr)
  2317  	}
  2318  	rhs := store.LookupReplica(roachpb.RKey("b"))
  2319  
  2320  	if lhs == rhs {
  2321  		t.Errorf("LHS == RHS after split: %s == %s", lhs, rhs)
  2322  	}
  2323  
  2324  	// Test both commit and abort cases.
  2325  	testutils.RunTrueAndFalse(t, "force", func(t *testing.T, force bool) {
  2326  		testutils.RunTrueAndFalse(t, "commit", func(t *testing.T, commit bool) {
  2327  			// Run a distributed transaction involving the lhsKey and rhsKey.
  2328  			var txnKey roachpb.Key
  2329  			ctx := context.Background()
  2330  			txn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  2331  			txnFn := func(ctx context.Context, txn *kv.Txn) error {
  2332  				b := txn.NewBatch()
  2333  				b.Put(fmt.Sprintf("%s.force=%t,commit=%t", string(lhsKey), force, commit), "lhsValue")
  2334  				b.Put(fmt.Sprintf("%s.force=%t,commit=%t", string(rhsKey), force, commit), "rhsValue")
  2335  				if err := txn.Run(ctx, b); err != nil {
  2336  					return err
  2337  				}
  2338  				proto := txn.TestingCloneTxn()
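        				// Remember where the transaction record lives so we can verify below
        				// that it is eventually cleaned up.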
  2339  				txnKey = keys.TransactionKey(proto.Key, proto.ID)
  2340  				// If force=true, we force-abort the txn out from under the client.
  2341  				// This simulates txn deadlock or a max priority txn aborting a
  2342  				// normal or min priority txn.
  2343  				if force {
  2344  					ba := roachpb.BatchRequest{}
  2345  					ba.Timestamp = store.Clock().Now()
  2346  					ba.RangeID = lhs.RangeID
  2347  					ba.Add(&roachpb.PushTxnRequest{
  2348  						RequestHeader: roachpb.RequestHeader{
  2349  							Key: proto.Key,
  2350  						},
  2351  						PusheeTxn: proto.TxnMeta,
  2352  						PushType:  roachpb.PUSH_ABORT,
  2353  						Force:     true,
  2354  					})
  2355  					_, pErr := store.Send(ctx, ba)
  2356  					if pErr != nil {
  2357  						t.Fatalf("failed to abort the txn: %s", pErr)
  2358  					}
  2359  				}
  2360  				if commit {
  2361  					return txn.Commit(ctx)
  2362  				}
  2363  				return errors.New("forced abort")
  2364  			}
  2365  			if err := txnFn(ctx, txn); err != nil {
  2366  				txn.CleanupOnError(ctx, err)
  2367  				if !force && commit {
  2368  					t.Fatalf("expected success with commit == true; got %v", err)
  2369  				}
  2370  			}
  2371  
  2372  			// Verify that the transaction record is cleaned up.
  2373  			testutils.SucceedsSoon(t, func() error {
  2374  				kv, err := store.DB().Get(ctx, txnKey)
  2375  				if err != nil {
  2376  					return err
  2377  				}
  2378  				if kv.Value != nil {
  2379  					return errors.Errorf("expected txn record %s to have been cleaned", txnKey)
  2380  				}
  2381  				return nil
  2382  			})
  2383  		})
  2384  	})
  2385  }
  2386  
  2387  // TestUnsplittableRange creates an unsplittable range and tests that
  2388  // it is handled correctly by the split queue's purgatory. The test:
  2389  // 1. creates an unsplittable range that needs to be split
  2390  // 2. makes sure that range enters purgatory
  2391  // 3. makes sure a purgatory run still fails
  2392  // 4. GCs part of the range so that it no longer needs to be split
  2393  // 5. makes sure a purgatory run succeeds and the range leaves purgatory
  2394  func TestUnsplittableRange(t *testing.T) {
  2395  	defer leaktest.AfterTest(t)()
  2396  
  2397  	ctx := context.Background()
  2398  	ttl := 1 * time.Hour
  2399  	const maxBytes = 1 << 16
  2400  
  2401  	stopper := stop.NewStopper()
  2402  	defer stopper.Stop(ctx)
  2403  
  2404  	manual := hlc.NewManualClock(123)
  2405  	splitQueuePurgatoryChan := make(chan time.Time, 1)
  2406  	cfg := kvserver.TestStoreConfig(hlc.NewClock(manual.UnixNano, time.Nanosecond))
  2407  	cfg.DefaultZoneConfig.RangeMaxBytes = proto.Int64(maxBytes)
  2408  	cfg.DefaultZoneConfig.GC = &zonepb.GCPolicy{
  2409  		TTLSeconds: int32(ttl.Seconds()),
  2410  	}
  2411  	cfg.DefaultSystemZoneConfig.RangeMaxBytes = proto.Int64(maxBytes)
  2412  	cfg.DefaultSystemZoneConfig.GC = &zonepb.GCPolicy{
  2413  		TTLSeconds: int32(ttl.Seconds()),
  2414  	}
  2415  	cfg.TestingKnobs.SplitQueuePurgatoryChan = splitQueuePurgatoryChan
  2416  	cfg.TestingKnobs.DisableMergeQueue = true
  2417  	store := createTestStoreWithConfig(t, stopper, cfg)
  2418  
  2419  	// Add a single large row to /Table/14.
  2420  	tableKey := roachpb.RKey(keys.SystemSQLCodec.TablePrefix(keys.UITableID))
  2421  	row1Key := roachpb.Key(encoding.EncodeVarintAscending(append([]byte(nil), tableKey...), 1))
  2422  	col1Key := keys.MakeFamilyKey(append([]byte(nil), row1Key...), 0)
  2423  	valueLen := 0.9 * maxBytes
  2424  	value := bytes.Repeat([]byte("x"), int(valueLen))
  2425  	if err := store.DB().Put(ctx, col1Key, value); err != nil {
  2426  		t.Fatal(err)
  2427  	}
  2428  
  2429  	// Wait for half of the ttl and add another large value in the same row.
  2430  	// Together, these two values bump the range over the max range size.
  2431  	manual.Increment(ttl.Nanoseconds() / 2)
  2432  	value2Len := 0.2 * maxBytes
  2433  	value2 := bytes.Repeat([]byte("y"), int(value2Len))
  2434  	if err := store.DB().Put(ctx, col1Key, value2); err != nil {
  2435  		t.Fatal(err)
  2436  	}
  2437  
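        	// Because both values were written to the same column key, the range holds
        	// a single row and the split queue cannot find a valid split key.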
  2438  	// Ensure that an attempt to split the range will hit an
  2439  	// unsplittableRangeError and place the range in purgatory.
  2440  	if err := store.ForceSplitScanAndProcess(); err != nil {
  2441  		t.Fatal(err)
  2442  	}
  2443  	if purgLen := store.SplitQueuePurgatoryLength(); purgLen != 1 {
  2444  		t.Fatalf("expected split queue purgatory to contain 1 replica, found %d", purgLen)
  2445  	}
  2446  
  2447  	// Signal the split queue's purgatory channel and ensure that the purgatory
  2448  	// remains occupied because the range still needs to split but can't.
  2449  	splitQueuePurgatoryChan <- timeutil.Now()
  2450  	if purgLen := store.SplitQueuePurgatoryLength(); purgLen != 1 {
  2451  		t.Fatalf("expected split queue purgatory to contain 1 replica, found %d", purgLen)
  2452  	}
  2453  
  2454  	// Wait for much longer than the ttl to accumulate GCByteAge.
  2455  	manual.Increment(10 * ttl.Nanoseconds())
  2456  	// Trigger the GC queue, which should clean up the earlier version of the
  2457  	// row. Once the first version of the row is cleaned up, the range should
  2458  	// exit the split queue purgatory.
  2459  	repl := store.LookupReplica(tableKey)
  2460  	if err := store.ManualGC(repl); err != nil {
  2461  		t.Fatal(err)
  2462  	}
  2463  
  2464  	// Signal the split queue's purgatory channel and ensure that the purgatory
  2465  	// removes its now well-sized replica.
  2466  	splitQueuePurgatoryChan <- timeutil.Now()
  2467  	testutils.SucceedsSoon(t, func() error {
  2468  		purgLen := store.SplitQueuePurgatoryLength()
  2469  		if purgLen == 0 {
  2470  			return nil
  2471  		}
  2472  		return errors.Errorf("expected split queue purgatory to be empty, found %d", purgLen)
  2473  	})
  2474  }
  2475  
  2476  // TestTxnWaitQueueDependencyCycleWithRangeSplit verifies that a range
  2477  // split which occurs while a dependency cycle is partially underway
  2478  // will cause the pending push txns to be retried such that they
  2479  // relocate to the appropriate new range.
  2480  func TestTxnWaitQueueDependencyCycleWithRangeSplit(t *testing.T) {
  2481  	defer leaktest.AfterTest(t)()
  2482  
  2483  	testutils.RunTrueAndFalse(t, "read2ndPass", func(t *testing.T, read2ndPass bool) {
  2484  		var pushCount int32
  2485  		firstPush := make(chan struct{})
  2486  
  2487  		storeCfg := kvserver.TestStoreConfig(nil)
  2488  		storeCfg.TestingKnobs.DisableSplitQueue = true
  2489  		storeCfg.TestingKnobs.DisableMergeQueue = true
  2490  		storeCfg.TestingKnobs.EvalKnobs.TestingEvalFilter =
  2491  			func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
  2492  				if _, ok := filterArgs.Req.(*roachpb.PushTxnRequest); ok {
  2493  					if atomic.AddInt32(&pushCount, 1) == 1 {
  2494  						close(firstPush)
  2495  					}
  2496  				}
  2497  				return nil
  2498  			}
  2499  		stopper := stop.NewStopper()
  2500  		defer stopper.Stop(context.Background())
  2501  		store := createTestStoreWithConfig(t, stopper, storeCfg)
  2502  
  2503  		lhsKey := roachpb.Key("a")
  2504  		rhsKey := roachpb.Key("b")
  2505  
  2506  		// Split at "a".
  2507  		args := adminSplitArgs(lhsKey)
  2508  		if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
  2509  			t.Fatalf("split at %q: %s", lhsKey, pErr)
  2510  		}
  2511  		lhs := store.LookupReplica(roachpb.RKey("a"))
  2512  
  2513  		var txnACount, txnBCount int32
  2514  
  2515  		txnAWritesA := make(chan struct{})
  2516  		txnAProceeds := make(chan struct{})
  2517  		txnBWritesB := make(chan struct{})
  2518  		txnBProceeds := make(chan struct{})
  2519  
  2520  		// Start txn to write key a.
  2521  		txnACh := make(chan error)
  2522  		go func() {
  2523  			txnACh <- store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
  2524  				if err := txn.Put(ctx, lhsKey, "value"); err != nil {
  2525  					return err
  2526  				}
  2527  				if atomic.LoadInt32(&txnACount) == 0 {
  2528  					close(txnAWritesA)
  2529  					<-txnAProceeds
  2530  				}
  2531  				atomic.AddInt32(&txnACount, 1)
  2532  				return txn.Put(ctx, rhsKey, "value-from-A")
  2533  			})
  2534  		}()
  2535  		<-txnAWritesA
  2536  
  2537  		// Start txn to write key b.
  2538  		txnBCh := make(chan error)
  2539  		go func() {
  2540  			txnBCh <- store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
  2541  				if err := txn.Put(ctx, rhsKey, "value"); err != nil {
  2542  					return err
  2543  				}
  2544  				if atomic.LoadInt32(&txnBCount) == 0 {
  2545  					close(txnBWritesB)
  2546  					<-txnBProceeds
  2547  				}
  2548  				atomic.AddInt32(&txnBCount, 1)
  2549  				// Read key "a" instead of writing it if directed. This causes a
  2550  				// PUSH_TIMESTAMP to be issued from txn B instead of a PUSH_ABORT.
  2551  				if read2ndPass {
  2552  					if _, err := txn.Get(ctx, lhsKey); err != nil {
  2553  						return err
  2554  					}
  2555  				} else {
  2556  					if err := txn.Put(ctx, lhsKey, "value-from-B"); err != nil {
  2557  						return err
  2558  					}
  2559  				}
  2560  				return nil
  2561  			})
  2562  		}()
  2563  		<-txnBWritesB
  2564  
  2565  		// Now, let txnA proceed before splitting.
  2566  		close(txnAProceeds)
  2567  		// Wait for the push to occur.
  2568  		<-firstPush
  2569  
  2570  		// Split at "b".
  2571  		args = adminSplitArgs(rhsKey)
  2572  		if _, pErr := kv.SendWrappedWith(context.Background(), store, roachpb.Header{
  2573  			RangeID: lhs.RangeID,
  2574  		}, args); pErr != nil {
  2575  			t.Fatalf("split at %q: %s", rhsKey, pErr)
  2576  		}
  2577  
  2578  		// Now that we've split, allow txnB to proceed.
  2579  		close(txnBProceeds)
  2580  
  2581  		// Verify that both complete.
  2582  		for i, ch := range []chan error{txnACh, txnBCh} {
  2583  			if err := <-ch; err != nil {
  2584  				t.Fatalf("%d: txn failure: %+v", i, err)
  2585  			}
  2586  		}
  2587  	})
  2588  }
  2589  
  2590  func TestStoreCapacityAfterSplit(t *testing.T) {
  2591  	defer leaktest.AfterTest(t)()
  2592  	stopper := stop.NewStopper()
  2593  	defer stopper.Stop(context.Background())
  2594  	manualClock := hlc.NewManualClock(123)
  2595  	cfg := kvserver.TestStoreConfig(hlc.NewClock(manualClock.UnixNano, time.Nanosecond))
  2596  	cfg.TestingKnobs.DisableSplitQueue = true
  2597  	cfg.TestingKnobs.DisableMergeQueue = true
  2598  	s := createTestStoreWithOpts(
  2599  		t,
  2600  		testStoreOpts{
  2601  			// This test was written before the test stores were able to start with
  2602  			// more than one range and is not prepared to handle many ranges.
  2603  			dontCreateSystemRanges: true,
  2604  			cfg:                    &cfg},
  2605  		stopper)
  2606  
  2607  	cap, err := s.Capacity(false /* useCached */)
  2608  	if err != nil {
  2609  		t.Fatal(err)
  2610  	}
  2611  	if e, a := int32(1), cap.RangeCount; e != a {
  2612  		t.Errorf("expected cap.RangeCount=%d, got %d", e, a)
  2613  	}
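        	// With only one range on the store, every bytes-per-replica and
        	// writes-per-replica percentile reflects the same single replica, so the
        	// percentiles within each set should be identical.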
  2614  	bpr1 := cap.BytesPerReplica
  2615  	if bpr1.P10 <= 0 {
  2616  		t.Errorf("expected all bytes-per-replica to be positive, got %+v", bpr1)
  2617  	}
  2618  	if bpr1.P10 != bpr1.P25 || bpr1.P10 != bpr1.P50 || bpr1.P10 != bpr1.P75 || bpr1.P10 != bpr1.P90 {
  2619  		t.Errorf("expected all bytes-per-replica percentiles to be identical, got %+v", bpr1)
  2620  	}
  2621  	wpr1 := cap.WritesPerReplica
  2622  	if wpr1.P10 != wpr1.P25 || wpr1.P10 != wpr1.P50 || wpr1.P10 != wpr1.P75 || wpr1.P10 != wpr1.P90 {
  2623  		t.Errorf("expected all writes-per-replica percentiles to be identical, got %+v", wpr1)
  2624  	}
  2625  
  2626  	// Increment the manual clock and do a write to raise writes-per-second above zero.
  2627  	manualClock.Increment(int64(kvserver.MinStatsDuration))
  2628  	key := roachpb.Key("a")
  2629  	pArgs := putArgs(key, []byte("aaa"))
  2630  	if _, pErr := kv.SendWrapped(context.Background(), s.TestSender(), pArgs); pErr != nil {
  2631  		t.Fatal(pErr)
  2632  	}
  2633  
  2634  	cap, err = s.Capacity(false /* useCached */)
  2635  	if err != nil {
  2636  		t.Fatal(err)
  2637  	}
  2638  	if e, a := int32(1), cap.RangeCount; e != a {
  2639  		t.Errorf("expected cap.RangeCount=%d, got %d", e, a)
  2640  	}
  2641  	if e, a := int32(1), cap.LeaseCount; e != a {
  2642  		t.Errorf("expected cap.LeaseCount=%d, got %d", e, a)
  2643  	}
  2644  	if minExpected, a := 1/float64(kvserver.MinStatsDuration/time.Second), cap.WritesPerSecond; minExpected > a {
  2645  		t.Errorf("expected cap.WritesPerSecond >= %f, got %f", minExpected, a)
  2646  	}
  2647  	bpr2 := cap.BytesPerReplica
  2648  	if bpr2.P10 <= bpr1.P10 {
  2649  		t.Errorf("expected BytesPerReplica to have increased from %+v, but got %+v", bpr1, bpr2)
  2650  	}
  2651  	if bpr2.P10 != bpr2.P25 || bpr2.P10 != bpr2.P50 || bpr2.P10 != bpr2.P75 || bpr2.P10 != bpr2.P90 {
  2652  		t.Errorf("expected all bytes-per-replica percentiles to be identical, got %+v", bpr2)
  2653  	}
  2654  	wpr2 := cap.WritesPerReplica
  2655  	if wpr2.P10 <= wpr1.P10 {
  2656  		t.Errorf("expected WritesPerReplica to have increased from %+v, but got %+v", wpr1, wpr2)
  2657  	}
  2658  	if wpr2.P10 != wpr2.P25 || wpr2.P10 != wpr2.P50 || wpr2.P10 != wpr2.P75 || wpr2.P10 != wpr2.P90 {
  2659  		t.Errorf("expected all writes-per-replica percentiles to be identical, got %+v", wpr2)
  2660  	}
  2661  	if wpr2.P10 != cap.WritesPerSecond {
  2662  		t.Errorf("expected WritesPerReplica.percentiles to equal cap.WritesPerSecond, but got %f and %f",
  2663  			wpr2.P10, cap.WritesPerSecond)
  2664  	}
  2665  
  2666  	// Split the range to verify stats work properly with more than one range.
  2667  	sArgs := adminSplitArgs(key)
  2668  	if _, pErr := kv.SendWrapped(context.Background(), s.TestSender(), sArgs); pErr != nil {
  2669  		t.Fatal(pErr)
  2670  	}
  2671  
  2672  	cap, err = s.Capacity(false /* useCached */)
  2673  	if err != nil {
  2674  		t.Fatal(err)
  2675  	}
  2676  	if e, a := int32(2), cap.RangeCount; e != a {
  2677  		t.Errorf("expected cap.RangeCount=%d, got %d", e, a)
  2678  	}
  2679  	if e, a := int32(2), cap.LeaseCount; e != a {
  2680  		t.Errorf("expected cap.LeaseCount=%d, got %d", e, a)
  2681  	}
  2682  	{
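        		// With two replicas of different sizes, the lower percentiles should
        		// track the smaller replica and the upper percentiles the larger one:
        		// p10 == p25, p50 == p75 == p90, and p10 != p90.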
  2683  		bpr := cap.BytesPerReplica
  2684  		if bpr.P10 != bpr.P25 {
  2685  			t.Errorf("expected BytesPerReplica p10 and p25 to be equal with 2 replicas, got %+v", bpr)
  2686  		}
  2687  		if bpr.P50 != bpr.P75 || bpr.P50 != bpr.P90 {
  2688  			t.Errorf("expected BytesPerReplica p50, p75, and p90 to be equal with 2 replicas, got %+v", bpr)
  2689  		}
  2690  		if bpr.P10 == bpr.P90 {
  2691  			t.Errorf("expected BytesPerReplica p10 and p90 to be different with 2 replicas, got %+v", bpr)
  2692  		}
  2693  	}
  2694  }
  2695  
  2696  // TestRangeLookupAfterMeta2Split verifies that RangeLookup scans succeed even
  2697  // when user ranges span the boundary of two split meta2 ranges. We test this
  2698  // with forward and reverse ScanRequests so that we test both forward and
  2699  // reverse RangeLookups. For both RangeLookup scan directions, the forward
  2700  // part of the scan will need to continue onto a second meta2 range to
  2701  // find the desired RangeDescriptor (remember that a reverse RangeLookup
  2702  // includes an initial forward scan).
  2703  func TestRangeLookupAfterMeta2Split(t *testing.T) {
  2704  	defer leaktest.AfterTest(t)()
  2705  
  2706  	ctx := context.Background()
  2707  	srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{
  2708  		Knobs: base.TestingKnobs{
  2709  			Store: &kvserver.StoreTestingKnobs{
  2710  				DisableMergeQueue: true,
  2711  			},
  2712  		},
  2713  	})
  2714  	s := srv.(*server.TestServer)
  2715  	defer s.Stopper().Stop(ctx)
  2716  
  2717  	// Create splits at /Table/48 and /Meta2/Table/51. This creates:
  2718  	//   meta ranges [/Min-/Meta2/Table/51) and [/Meta2/Table/51-/System)
  2719  	//   user ranges [/Table/19-/Table/48)  and [/Table/48-/Max)
  2720  	//
  2721  	// Note that the two boundaries are offset such that a lookup for key /Table/49
  2722  	// will first search for meta(/Table/49) which is on the left meta2 range. However,
  2723  	// the user range [/Table/48-/Max) is stored on the right meta2 range, so the lookup
  2724  	// will require a scan that continues into the next meta2 range.
  2725  	const tableID = keys.MinUserDescID + 1 // 51
  2726  	splitReq := adminSplitArgs(keys.SystemSQLCodec.TablePrefix(tableID - 3 /* 48 */))
  2727  	if _, pErr := kv.SendWrapped(ctx, s.DB().NonTransactionalSender(), splitReq); pErr != nil {
  2728  		t.Fatal(pErr)
  2729  	}
  2730  
  2731  	metaKey := keys.RangeMetaKey(roachpb.RKey(keys.SystemSQLCodec.TablePrefix(tableID))).AsRawKey()
  2732  	splitReq = adminSplitArgs(metaKey)
  2733  	if _, pErr := kv.SendWrapped(ctx, s.DB().NonTransactionalSender(), splitReq); pErr != nil {
  2734  		t.Fatal(pErr)
  2735  	}
  2736  
  2737  	testutils.RunTrueAndFalse(t, "reverse", func(t *testing.T, rev bool) {
  2738  		// Clear the RangeDescriptorCache so that no cached descriptors are
  2739  		// available from previous lookups.
  2740  		s.DistSender().RangeDescriptorCache().Clear()
  2741  
  2742  		// Scan from [/Table/49-/Table/50) both forwards and backwards.
  2743  		// Either way, the resulting RangeLookup scan will be forced to
  2744  		// perform a continuation lookup.
  2745  		scanStart := keys.SystemSQLCodec.TablePrefix(tableID - 2) // 49
  2746  		scanEnd := scanStart.PrefixEnd()                          // 50
  2747  		header := roachpb.RequestHeader{
  2748  			Key:    scanStart,
  2749  			EndKey: scanEnd,
  2750  		}
  2751  
  2752  		var lookupReq roachpb.Request
  2753  		if rev {
  2754  			// A ReverseScanRequest will trigger a reverse RangeLookup scan.
  2755  			lookupReq = &roachpb.ReverseScanRequest{RequestHeader: header}
  2756  		} else {
  2757  			lookupReq = &roachpb.ScanRequest{RequestHeader: header}
  2758  		}
  2759  		if _, err := kv.SendWrapped(ctx, s.DB().NonTransactionalSender(), lookupReq); err != nil {
  2760  			t.Fatalf("%T %v", err.GoError(), err)
  2761  		}
  2762  	})
  2763  }
  2764  
  2765  // TestStoreSplitRangeLookupRace verifies that a RangeLookup scanning across
  2766  // multiple meta2 ranges that races with a split and misses all matching
  2767  // descriptors will retry its scan until it succeeds.
  2768  //
  2769  // This test creates a series of events that result in the injected range
  2770  // lookup scan response we see in TestRangeLookupRaceSplits/MissingDescriptor.
  2771  // It demonstrates how it is possible for an inconsistent range lookup scan
  2772  // that spans multiple ranges to completely miss its desired descriptor.
  2773  func TestStoreSplitRangeLookupRace(t *testing.T) {
  2774  	defer leaktest.AfterTest(t)()
  2775  
  2776  	// The scenario is modeled after:
  2777  	// https://github.com/cockroachdb/cockroach/issues/19147#issuecomment-336741791
  2778  	// See that comment for a description of why a non-transactional scan
  2779  	// starting at "/meta2/k" may only see non-matching descriptors when racing
  2780  	// with a split.
  2781  	//
  2782  	// To simulate this situation, we first perform splits at "/meta2/n", "j",
  2783  	// and "p". This creates the following structure, where the descriptor for
  2784  	// range [j, p) is stored on the second meta2 range:
  2785  	//
  2786  	//   [/meta2/a,/meta2/n), [/meta2/n,/meta2/z)
  2787  	//                     -----^
  2788  	//       ...      [j, p)      ...
  2789  	//
  2790  	// We then initiate a range lookup for key "k". This lookup will begin
  2791  	// scanning on the first meta2 range but won't find its desired descriptor. Normally,
  2792  	// it would continue scanning onto the second meta2 range and find the descriptor
  2793  	// for range [j, p) at "/meta2/p" (see TestRangeLookupAfterMeta2Split). However,
  2794  	// because RangeLookup scans are non-transactional, this can race with a split.
  2795  	// Here, we split at key "m", which creates the structure:
  2796  	//
  2797  	//   [/meta2/a,/meta2/n), [/meta2/n,/meta2/z)
  2798  	//             ^--        ---^
  2799  	//       ...   [j,m), [m,p)      ...
  2800  	//
  2801  	// If the second half of the RangeLookup scan sees the second meta2 range after
  2802  	// this split, it will miss the old descriptor for [j, p) and the new descriptor
  2803  	// for [j, m). In this case, the RangeLookup should retry.
  2804  	lookupKey := roachpb.Key("k")
  2805  	bounds, err := keys.MetaScanBounds(keys.RangeMetaKey(roachpb.RKey(lookupKey)))
  2806  	if err != nil {
  2807  		t.Fatal(err)
  2808  	}
  2809  
  2810  	// The following filter and set of channels is used to block the RangeLookup
  2811  	// scan for key "k" after it has scanned over the first meta2 range but not
  2812  	// the second.
  2813  	blockRangeLookups := make(chan struct{})
  2814  	blockedRangeLookups := int32(0)
  2815  	rangeLookupIsBlocked := make(chan struct{}, 1)
  2816  	unblockRangeLookups := make(chan struct{})
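        	// The response filter does nothing until blockRangeLookups is closed.
        	// After that, it catches any RangeLookup scan that starts at the meta2
        	// scan bounds for lookupKey, signals rangeLookupIsBlocked, and then holds
        	// the response until unblockRangeLookups is closed.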
  2817  	respFilter := func(ctx context.Context, ba roachpb.BatchRequest, _ *roachpb.BatchResponse) *roachpb.Error {
  2818  		select {
  2819  		case <-blockRangeLookups:
  2820  			if kv.TestingIsRangeLookup(ba) &&
  2821  				ba.Requests[0].GetInner().(*roachpb.ScanRequest).Key.Equal(bounds.Key.AsRawKey()) {
  2822  
  2823  				select {
  2824  				case rangeLookupIsBlocked <- struct{}{}:
  2825  					atomic.AddInt32(&blockedRangeLookups, 1)
  2826  				default:
  2827  				}
  2828  				<-unblockRangeLookups
  2829  			}
  2830  		default:
  2831  		}
  2832  		return nil
  2833  	}
  2834  
  2835  	srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{
  2836  		Knobs: base.TestingKnobs{
  2837  			Store: &kvserver.StoreTestingKnobs{
  2838  				DisableSplitQueue:     true,
  2839  				DisableMergeQueue:     true,
  2840  				TestingResponseFilter: respFilter,
  2841  				IntentResolverKnobs: kvserverbase.IntentResolverTestingKnobs{
  2842  					ForceSyncIntentResolution: true,
  2843  				},
  2844  			},
  2845  		},
  2846  	})
  2847  	s := srv.(*server.TestServer)
  2848  	defer s.Stopper().Stop(context.Background())
  2849  	store, err := s.Stores().GetStore(s.GetFirstStoreID())
  2850  	if err != nil {
  2851  		t.Fatal(err)
  2852  	}
  2853  
  2854  	mustSplit := func(splitKey roachpb.Key) {
  2855  		args := adminSplitArgs(splitKey)
  2856  
  2857  		// Don't use s.DistSender() so that we don't disturb the RangeDescriptorCache.
  2858  		rangeID := store.LookupReplica(roachpb.RKey(splitKey)).RangeID
  2859  		_, pErr := kv.SendWrappedWith(context.Background(), store, roachpb.Header{
  2860  			RangeID: rangeID,
  2861  		}, args)
  2862  		if pErr != nil {
  2863  			t.Fatal(pErr)
  2864  		}
  2865  	}
  2866  
  2867  	// Perform the initial splits. See above.
  2868  	mustSplit(keys.SystemPrefix)
  2869  	mustSplit(keys.RangeMetaKey(roachpb.RKey("n")).AsRawKey())
  2870  	mustSplit(roachpb.Key("j"))
  2871  	mustSplit(roachpb.Key("p"))
  2872  
  2873  	// Launch a goroutine to perform a range lookup for key "k" that will race
  2874  	// with a split at key "m".
  2875  	rangeLookupErr := make(chan error)
  2876  	go func() {
  2877  		close(blockRangeLookups)
  2878  
  2879  		// Loop until at least one range lookup is triggered and blocked.
  2880  		// This accommodates races with in-flight range lookups.
  2881  		var err error
  2882  		for atomic.LoadInt32(&blockedRangeLookups) == 0 && err == nil {
  2883  			// Clear the RangeDescriptorCache to trigger a range lookup when the
  2884  			// lookupKey is next accessed. Then immediately access lookupKey.
  2885  			s.DistSender().RangeDescriptorCache().Clear()
  2886  			_, err = s.DB().Get(context.Background(), lookupKey)
  2887  		}
  2888  		rangeLookupErr <- err
  2889  	}()
  2890  
  2891  	// Wait until the range lookup is blocked after performing a scan of the
  2892  	// first range [/meta2/a,/meta2/n) but before performing a scan of the
  2893  	// second range [/meta2/n,/meta2/z). Then split at key "m". Finally, let the
  2894  	// range lookup finish. The lookup will fail because it won't get consistent
  2895  	// results but will eventually succeed after retrying.
  2896  	select {
  2897  	case <-rangeLookupIsBlocked:
  2898  	case err := <-rangeLookupErr:
  2899  		// Unexpected early return.
  2900  		t.Fatalf("unexpected range lookup error %v", err)
  2901  	}
  2902  	mustSplit(roachpb.Key("m"))
  2903  	close(unblockRangeLookups)
  2904  
  2905  	if err := <-rangeLookupErr; err != nil {
  2906  		t.Fatalf("unexpected range lookup error %v", err)
  2907  	}
  2908  }
  2909  
  2910  // Verify that range lookup operations do not synchronously perform intent
  2911  // resolution as doing so can deadlock with the RangeDescriptorCache. See
  2912  // #17760.
  2913  func TestRangeLookupAsyncResolveIntent(t *testing.T) {
  2914  	defer leaktest.AfterTest(t)()
  2915  
  2916  	blockPushTxn := make(chan struct{})
  2917  	defer close(blockPushTxn)
  2918  
  2919  	// Disable async tasks in the intent resolver. All tasks will be synchronous.
  2920  	cfg := kvserver.TestStoreConfig(nil)
  2921  	cfg.TestingKnobs.IntentResolverKnobs.ForceSyncIntentResolution = true
  2922  	cfg.TestingKnobs.DisableSplitQueue = true
  2923  	cfg.TestingKnobs.DisableMergeQueue = true
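        	// Hold up any proposal containing a PushTxn request until the test ends
        	// (blockPushTxn is closed by the deferred close above). If the RangeLookup
        	// below tried to resolve the intent synchronously, the resulting push
        	// would block forever (see the comment near the end of the test).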
  2924  	cfg.TestingKnobs.TestingProposalFilter =
  2925  		func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  2926  			for _, union := range args.Req.Requests {
  2927  				if union.GetInner().Method() == roachpb.PushTxn {
  2928  					<-blockPushTxn
  2929  					break
  2930  				}
  2931  			}
  2932  			return nil
  2933  		}
  2934  	ctx := context.Background()
  2935  	stopper := stop.NewStopper()
  2936  	defer stopper.Stop(ctx)
  2937  	store := createTestStoreWithConfig(t, stopper, cfg)
  2938  
  2939  	// Split range 1 at an arbitrary key so that we're not dealing with the
  2940  	// first range for the rest of this test. The first range is handled
  2941  	// specially by the range descriptor cache.
  2942  	key := roachpb.Key("a")
  2943  	args := adminSplitArgs(key)
  2944  	if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
  2945  		t.Fatal(pErr)
  2946  	}
  2947  
  2948  	// Get original meta2 descriptor.
  2949  	rs, _, err := kv.RangeLookup(ctx, store.TestSender(), key, roachpb.READ_UNCOMMITTED, 0, false)
  2950  	if err != nil {
  2951  		t.Fatal(err)
  2952  	}
  2953  	origDesc := rs[0]
  2954  
  2955  	key2 := roachpb.Key("e")
  2956  	newDesc := origDesc
  2957  	newDesc.EndKey, err = keys.Addr(key2)
  2958  	if err != nil {
  2959  		t.Fatal(err)
  2960  	}
  2961  
  2962  	// Write the new descriptor as an intent.
  2963  	data, err := protoutil.Marshal(&newDesc)
  2964  	if err != nil {
  2965  		t.Fatal(err)
  2966  	}
  2967  	txn := roachpb.MakeTransaction("test", key2, 1,
  2968  		store.Clock().Now(), store.Clock().MaxOffset().Nanoseconds())
  2969  	// Officially begin the transaction. If not for this, the intent resolution
  2970  	// machinery would simply remove the intent we write below, see #3020.
  2971  	// We send directly to Replica throughout this test, so there's no danger
  2972  	// of the Store aborting this transaction (i.e. we don't have to set a high
  2973  	// priority).
  2974  	pArgs := putArgs(keys.RangeMetaKey(roachpb.RKey(key2)).AsRawKey(), data)
  2975  	txn.Sequence++
  2976  	pArgs.Sequence = txn.Sequence
  2977  	if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{Txn: &txn}, pArgs); pErr != nil {
  2978  		t.Fatal(pErr)
  2979  	}
  2980  
  2981  	// Clear the range descriptor cache so that any future requests will first
  2982  	// need to perform a RangeLookup.
  2983  	store.DB().NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender).Wrapped().(*kvcoord.DistSender).RangeDescriptorCache().Clear()
  2984  
  2985  	// Now send a request, forcing the RangeLookup. Since the lookup is
  2986  	// inconsistent, there's no WriteIntentError, but we'll try to resolve any
  2987  	// intents that are found. If the RangeLookup op attempts to resolve the
  2988  	// intents synchronously, the operation will block forever.
  2989  	//
  2990  	// Note that 'a' < 'e'.
  2991  	if _, err := store.DB().Get(ctx, key); err != nil {
  2992  		t.Fatal(err)
  2993  	}
  2994  }
  2995  
  2996  // Verify that replicas don't temporarily disappear from the replicas map during
  2997  // the splits. See #29144.
  2998  func TestStoreSplitDisappearingReplicas(t *testing.T) {
  2999  	defer leaktest.AfterTest(t)()
  3000  	stopper := stop.NewStopper()
  3001  	defer stopper.Stop(context.Background())
  3002  	store, _ := createTestStore(t, stopper)
  3003  	go kvserver.WatchForDisappearingReplicas(t, store)
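        	// While the splits below run, the watcher verifies that no replica ever
        	// momentarily vanishes from the store's replicas map (see #29144).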
  3004  	for i := 0; i < 100; i++ {
  3005  		key := roachpb.Key(fmt.Sprintf("a%d", i))
  3006  		args := adminSplitArgs(key)
  3007  		if _, pErr := kv.SendWrapped(context.Background(), store.TestSender(), args); pErr != nil {
  3008  			t.Fatalf("%q: split unexpected error: %s", key, pErr)
  3009  		}
  3010  	}
  3011  }
  3012  
  3013  // Regression test for #21146. This verifies the behavior when the application
  3014  // of a split command (part of the LHS's log) is delayed on some store while the
  3015  // RHS has meanwhile rebalanced away and back, ending up with a larger ReplicaID
  3016  // than the split thinks it will have. Additionally, we remove the LHS replica
  3017  // on the same store before the split and re-add it afterwards, so that when
  3018  // connectivity is restored the LHS applies a split trigger while it is not
  3019  // part of the descriptor.
  3020  //
  3021  // Or, in pictures (s3 looks like s1 throughout and is omitted):
  3022  //
  3023  //     s1:  [----r1@all-------------]
  3024  //     s2:  [----r1@all-------------]
  3025  // Remove s2:
  3026  //     s1:  [----r1@s1s3------------]
  3027  //     s2:  [----r1@all-------------] (outdated)
  3028  // Split r1:
  3029  //     s1:  [-r1@s1s3-|--r2@s1s3----]
  3030  //     s2:  [----r1@all-------------] (outdated)
  3031  // Add s2:
  3032  //     s1:  [-r1@all-|--r2@s1s3-----]
  3033  //     s2:  [----r1@all-------------] (outdated)
  3034  // Add learner to s2 on r2 (remains uninitialized due to LHS state blocking it):
  3035  //     s1:  [-r1@s1s3-|--r2@all-----]
  3036  //     s2:  [----r1@all-------------] (outdated), uninitialized replica r2/3
  3037  // Remove and re-add learner multiple times: r2/3 becomes r2/100
  3038  //     (diagram looks the same except for replacing r2/3)
  3039  //
  3040  // When connectivity is restored, r1@s2 will start to catch up on the raft log
  3041  // after it learns of its new replicaID. It first processes the replication
  3042  // change that removes it and switches to a desc that doesn't contain itself as
  3043  // a replica. Next it sees the split trigger that once caused a crash because
  3044  // the store tried to look up itself and failed. With that handled correctly,
  3045  // the split trigger next has to look up the right-hand side, which surprisingly
  3046  // has a higher replicaID than the one recorded in the split trigger. This too
  3047  // needs to be tolerated.
  3048  func TestSplitTriggerMeetsUnexpectedReplicaID(t *testing.T) {
  3049  	defer leaktest.AfterTest(t)()
  3050  	ctx := context.Background()
  3051  
  3052  	blockPromoteCh := make(chan struct{})
  3053  	var skipLearnerSnaps int32
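        	// withoutLearnerSnap runs fn with learner snapshots skipped via the
        	// ReplicaSkipLearnerSnapshot testing knob below.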
  3054  	withoutLearnerSnap := func(fn func()) {
  3055  		atomic.StoreInt32(&skipLearnerSnaps, 1)
  3056  		fn()
  3057  		atomic.StoreInt32(&skipLearnerSnaps, 0)
  3058  	}
  3059  	knobs := base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  3060  		ReplicaSkipLearnerSnapshot: func() bool {
  3061  			return atomic.LoadInt32(&skipLearnerSnaps) != 0
  3062  		},
  3063  		ReplicaAddStopAfterLearnerSnapshot: func(targets []roachpb.ReplicationTarget) bool {
  3064  			if atomic.LoadInt32(&skipLearnerSnaps) != 0 {
  3065  				return false
  3066  			}
  3067  			if len(targets) > 0 && targets[0].StoreID == 2 {
  3068  				<-blockPromoteCh
  3069  			}
  3070  			return false
  3071  		},
  3072  		ReplicaAddSkipLearnerRollback: func() bool {
  3073  			return true
  3074  		},
  3075  		// We rely on replicas remaining where they are even when they are removed
  3076  		// from the range as this lets us set up a split trigger that will apply
  3077  		// on a replica that is (at the time of the split trigger) not a member.
  3078  		DisableReplicaGCQueue: true,
  3079  	}}
  3080  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  3081  		ServerArgs:      base.TestServerArgs{Knobs: knobs},
  3082  		ReplicationMode: base.ReplicationManual,
  3083  	})
  3084  	defer tc.Stopper().Stop(ctx)
  3085  
  3086  	k := tc.ScratchRange(t)
  3087  	desc := tc.LookupRangeOrFatal(t, k)
  3088  
  3089  	// Add a replica on n3 which we'll need to achieve quorum while we cut off n2 below.
  3090  	tc.AddReplicasOrFatal(t, k, tc.Target(2))
  3091  
  3092  	// First construct a range with a learner replica on the second node (index 1)
  3093  	// and split it, ending up with an orphaned learner on each side of the split.
  3094  	// After the learner is created, but before the split, block all incoming raft
  3095  	// traffic to the learner on the lhs of the split (which is still on the
  3096  	// second node).
  3097  	g := ctxgroup.WithContext(ctx)
  3098  	g.GoCtx(func(ctx context.Context) error {
  3099  		_, err := tc.AddReplicas(k, tc.Target(1))
  3100  		return err
  3101  	})
  3102  
  3103  	store, _ := getFirstStoreReplica(t, tc.Server(1), k)
  3104  	tc.Servers[1].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  3105  		rangeID:            desc.RangeID,
  3106  		RaftMessageHandler: store,
  3107  	})
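        	// All incoming raft traffic for the original range is now dropped at the
        	// store on the second node, so its replica cannot learn about the split
        	// until the handler is restored near the end of the test.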
  3108  
  3109  	_, kRHS := k, k.Next()
  3110  	// Remove the LHS on the isolated store, split the range, and re-add it.
  3111  	tc.RemoveReplicasOrFatal(t, k, tc.Target(1))
  3112  	descLHS, descRHS := tc.SplitRangeOrFatal(t, kRHS)
  3113  	withoutLearnerSnap(func() {
  3114  		// NB: can't use AddReplicas since that waits for the target to be up
  3115  		// to date, which it won't in this case.
  3116  		//
  3117  		// We avoid sending a snapshot because that snapshot would include the
  3118  		// split trigger and we want that to be processed via the log.
  3119  		d, err := tc.Servers[0].DB().AdminChangeReplicas(
  3120  			ctx, descLHS.StartKey.AsRawKey(), descLHS, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1)),
  3121  		)
  3122  		require.NoError(t, err)
  3123  		descLHS = *d
  3124  	})
  3125  
  3126  	close(blockPromoteCh)
  3127  	if err := g.Wait(); !testutils.IsError(err, `descriptor changed`) {
  3128  		t.Fatalf(`expected "descriptor changed" error got: %+v`, err)
  3129  	}
  3130  
  3131  	// Now repeatedly re-add the learner on the rhs, so it has a
  3132  	// different replicaID than the split trigger expects.
  3133  	add := func() {
  3134  		_, err := tc.AddReplicas(kRHS, tc.Target(1))
  3135  		// The "snapshot intersects existing range" error is expected if the store
  3136  		// has not heard a raft message addressed to a later replica ID while the
  3137  		// "was not found on" error is expected if the store has heard that it has
  3138  		// a newer replica ID before receiving the snapshot.
  3139  		if !testutils.IsError(err, `snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+`) {
  3140  			t.Fatalf(`expected "snapshot intersects existing range|r[0-9]+ was not found on s[0-9]+" error got: %+v`, err)
  3141  		}
  3142  	}
  3143  	for i := 0; i < 5; i++ {
  3144  		add()
  3145  		tc.RemoveReplicasOrFatal(t, kRHS, tc.Target(1))
  3146  	}
  3147  	add()
  3148  
  3149  	// Normally AddReplicas will return the latest version of the RangeDescriptor,
  3150  	// but because we're getting snapshot errors and using the
  3151  	// ReplicaAddSkipLearnerRollback hook, we have to look it up again ourselves
  3152  	// to find the current replicaID for the RHS learner.
  3153  	descRHS = tc.LookupRangeOrFatal(t, kRHS)
  3154  	learnerDescRHS, ok := descRHS.GetReplicaDescriptor(store.StoreID())
  3155  	require.True(t, ok)
  3156  
  3157  	// Wait for there to be an in-memory, uninitialized learner replica with the
  3158  	// latest ReplicaID. Note: it cannot become initialized at this point because
  3159  	// it needs a snapshot to do that and (as can be seen in the error check
  3160  	// above) snapshots will intersect the lhs replica (which doesn't know about
  3161  	// the split because we've blocked its raft traffic, and so it still covers
  3162  	// the pre-split keyspace).
  3163  	testutils.SucceedsSoon(t, func() error {
  3164  		repl, err := store.GetReplica(descRHS.RangeID)
  3165  		if err != nil {
  3166  			return err
  3167  		}
  3168  		status := repl.RaftStatus()
  3169  		if status == nil {
  3170  			return errors.New("raft group not initialized")
  3171  		}
  3172  		if replicaID := roachpb.ReplicaID(status.ID); replicaID != learnerDescRHS.ReplicaID {
  3173  			return errors.Errorf("expected %d got %d", learnerDescRHS.ReplicaID, replicaID)
  3174  		}
  3175  		return nil
  3176  	})
  3177  
  3178  	// Re-enable raft and wait for the lhs to catch up to the post-split
  3179  	// descriptor. This used to panic with "raft group deleted".
  3180  	tc.Servers[1].RaftTransport().Listen(store.StoreID(), store)
  3181  	testutils.SucceedsSoon(t, func() error {
  3182  		repl, err := store.GetReplica(descLHS.RangeID)
  3183  		if err != nil {
  3184  			return err
  3185  		}
  3186  		if desc := repl.Desc(); desc.IsInitialized() && !descLHS.Equal(desc) {
  3187  			require.NoError(t, store.ManualReplicaGC(repl))
  3188  			return errors.Errorf("expected %s got %s", &descLHS, desc)
  3189  		}
  3190  		return nil
  3191  	})
  3192  }
  3193  
  3194  // TestSplitBlocksReadsToRHS tests that an ongoing range split does not
  3195  // interrupt reads to the LHS of the split but does interrupt reads for the RHS
  3196  // of the split. The test relies on the fact that EndTxn(SplitTrigger) declares
  3197  // read access to the LHS of the split but declares write access to the RHS of
  3198  // the split.
  3199  func TestSplitBlocksReadsToRHS(t *testing.T) {
  3200  	defer leaktest.AfterTest(t)()
  3201  
  3202  	keyLHS, keySplit, keyRHS := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")
  3203  	splitBlocked := make(chan struct{})
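        	// The proposal filter traps the EndTxn proposal carrying the split trigger
        	// for keySplit: it signals on splitBlocked and then waits for the test to
        	// send on the same channel before letting the split proceed.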
  3204  	propFilter := func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  3205  		if req, ok := args.Req.GetArg(roachpb.EndTxn); ok {
  3206  			et := req.(*roachpb.EndTxnRequest)
  3207  			if tr := et.InternalCommitTrigger.GetSplitTrigger(); tr != nil {
  3208  				if tr.RightDesc.StartKey.Equal(keySplit) {
  3209  					// Signal that the split is blocked.
  3210  					splitBlocked <- struct{}{}
  3211  					// Wait for split to be unblocked.
  3212  					<-splitBlocked
  3213  				}
  3214  			}
  3215  		}
  3216  		return nil
  3217  	}
  3218  
  3219  	storeCfg := kvserver.TestStoreConfig(nil)
  3220  	storeCfg.TestingKnobs.DisableSplitQueue = true
  3221  	storeCfg.TestingKnobs.DisableMergeQueue = true
  3222  	storeCfg.TestingKnobs.TestingProposalFilter = propFilter
  3223  	ctx := context.Background()
  3224  	stopper := stop.NewStopper()
  3225  	defer stopper.Stop(ctx)
  3226  	store := createTestStoreWithConfig(t, stopper, storeCfg)
  3227  	repl := store.LookupReplica(roachpb.RKey(keySplit))
  3228  	tsBefore := store.Clock().Now()
  3229  
  3230  	// Begin splitting the range.
  3231  	g := ctxgroup.WithContext(ctx)
  3232  	g.GoCtx(func(ctx context.Context) error {
  3233  		args := adminSplitArgs(keySplit)
  3234  		_, pErr := kv.SendWrapped(ctx, store.TestSender(), args)
  3235  		return pErr.GoError()
  3236  	})
  3237  
  3238  	// Wait until split is underway.
  3239  	<-splitBlocked
  3240  	tsAfter := store.Clock().Now()
  3241  
  3242  	// Read from the LHS and RHS, both below and above the split timestamp.
  3243  	lhsDone, rhsDone := make(chan error, 2), make(chan error, 2)
  3244  	for _, keyAndChan := range []struct {
  3245  		key   roachpb.Key
  3246  		errCh chan error
  3247  	}{
  3248  		{keyLHS, lhsDone},
  3249  		{keyRHS, rhsDone},
  3250  	} {
  3251  		for _, ts := range []hlc.Timestamp{tsBefore, tsAfter} {
  3252  			h := roachpb.Header{Timestamp: ts, RangeID: repl.RangeID}
  3253  			args := getArgs(keyAndChan.key)
  3254  			errCh := keyAndChan.errCh
  3255  			g.GoCtx(func(ctx context.Context) error {
  3256  				// Send directly to repl to avoid racing with the
  3257  				// split and routing requests to the post-split RHS.
  3258  				_, pErr := kv.SendWrappedWith(ctx, repl, h, args)
  3259  				errCh <- pErr.GoError()
  3260  				return nil
  3261  			})
  3262  		}
  3263  	}
  3264  
  3265  	// Only the LHS reads should succeed. The RHS reads should get
  3266  	// blocked waiting to acquire latches.
  3267  	for i := 0; i < cap(lhsDone); i++ {
  3268  		require.NoError(t, <-lhsDone)
  3269  	}
  3270  	select {
  3271  	case err := <-rhsDone:
  3272  		require.NoError(t, err)
  3273  		t.Fatal("unexpected read on RHS during split")
  3274  	case <-time.After(2 * time.Millisecond):
  3275  	}
  3276  
  3277  	// Unblock the split.
  3278  	splitBlocked <- struct{}{}
  3279  
  3280  	// The RHS reads should now both hit a RangeKeyMismatchError.
  3281  	for i := 0; i < cap(rhsDone); i++ {
  3282  		require.Regexp(t, "outside of bounds of range", <-rhsDone)
  3283  	}
  3284  	require.Nil(t, g.Wait())
  3285  }