github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_replica_test.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver_test
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"math/rand"
    19  	"reflect"
    20  	"strconv"
    21  	"sync"
    22  	"sync/atomic"
    23  	"testing"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/base"
    27  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    28  	"github.com/cockroachdb/cockroach/pkg/keys"
    29  	"github.com/cockroachdb/cockroach/pkg/kv"
    30  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    31  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
    32  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    33  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    34  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
    35  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    36  	"github.com/cockroachdb/cockroach/pkg/server"
    37  	"github.com/cockroachdb/cockroach/pkg/sql"
    38  	"github.com/cockroachdb/cockroach/pkg/storage"
    39  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    40  	"github.com/cockroachdb/cockroach/pkg/testutils"
    41  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    42  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    43  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    44  	"github.com/cockroachdb/cockroach/pkg/util"
    45  	"github.com/cockroachdb/cockroach/pkg/util/caller"
    46  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    47  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    48  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    49  	"github.com/cockroachdb/cockroach/pkg/util/log"
    50  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    51  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    52  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    53  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    54  	"github.com/cockroachdb/errors"
    55  	"github.com/kr/pretty"
    56  	"github.com/stretchr/testify/assert"
    57  	"github.com/stretchr/testify/require"
    58  	"go.etcd.io/etcd/raft/raftpb"
    59  )
    60  
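        // strToValue wraps the given string's bytes in a *roachpb.Value.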
    61  func strToValue(s string) *roachpb.Value {
    62  	v := roachpb.MakeValueFromBytes([]byte(s))
    63  	return &v
    64  }
    65  
    66  // TestRangeCommandClockUpdate verifies that followers update their
    67  // clocks when executing a command, even if the lease holder's clock is far
    68  // in the future.
    69  func TestRangeCommandClockUpdate(t *testing.T) {
    70  	defer leaktest.AfterTest(t)()
    71  
    72  	const numNodes = 3
    73  	var manuals []*hlc.ManualClock
    74  	var clocks []*hlc.Clock
    75  	for i := 0; i < numNodes; i++ {
    76  		manuals = append(manuals, hlc.NewManualClock(1))
    77  		clocks = append(clocks, hlc.NewClock(manuals[i].UnixNano, 100*time.Millisecond))
    78  	}
    79  	mtc := &multiTestContext{
    80  		clocks: clocks,
    81  		// This test was written before the multiTestContext started creating many
    82  		// system ranges at startup, and hasn't been updated to take that into
    83  		// account.
    84  		startWithSingleRange: true,
    85  	}
    86  	defer mtc.Stop()
    87  	mtc.Start(t, numNodes)
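        	// Replicate range 1 onto stores 1 and 2 so that all three nodes hold a replica.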
    88  	mtc.replicateRange(1, 1, 2)
    89  
    90  	// Advance the lease holder's clock ahead of the followers (by more than
    91  	// MaxOffset but less than the range lease) and execute a command.
    92  	manuals[0].Increment(int64(500 * time.Millisecond))
    93  	incArgs := incrementArgs([]byte("a"), 5)
    94  	ts := clocks[0].Now()
    95  	if _, err := kv.SendWrappedWith(context.Background(), mtc.stores[0].TestSender(), roachpb.Header{Timestamp: ts}, incArgs); err != nil {
    96  		t.Fatal(err)
    97  	}
    98  
    99  	// Wait for that command to execute on all the followers.
   100  	testutils.SucceedsSoon(t, func() error {
   101  		values := []int64{}
   102  		for _, eng := range mtc.engines {
   103  			val, _, err := storage.MVCCGet(context.Background(), eng, roachpb.Key("a"), clocks[0].Now(),
   104  				storage.MVCCGetOptions{})
   105  			if err != nil {
   106  				return err
   107  			}
   108  			values = append(values, mustGetInt(val))
   109  		}
   110  		if !reflect.DeepEqual(values, []int64{5, 5, 5}) {
   111  			return errors.Errorf("expected (5, 5, 5), got %v", values)
   112  		}
   113  		return nil
   114  	})
   115  
   116  	// Verify that all the followers have accepted the clock update from
   117  	// node 0 even though it comes from outside the usual max offset.
   118  	now := clocks[0].Now()
   119  	for i, clock := range clocks {
   120  		// Only compare the WallTimes: it's normal for clock 0 to be a few logical ticks ahead.
   121  		if clock.Now().WallTime < now.WallTime {
   122  			t.Errorf("clock %d is behind clock 0: %s vs %s", i, clock.Now(), now)
   123  		}
   124  	}
   125  }
   126  
   127  // TestRejectFutureCommand verifies that lease holders reject commands that
   128  // would cause a large time jump.
   129  func TestRejectFutureCommand(t *testing.T) {
   130  	defer leaktest.AfterTest(t)()
   131  
   132  	manual := hlc.NewManualClock(123)
   133  	clock := hlc.NewClock(manual.UnixNano, 100*time.Millisecond)
   134  	sc := kvserver.TestStoreConfig(clock)
   135  	mtc := &multiTestContext{storeConfig: &sc}
   136  	defer mtc.Stop()
   137  	mtc.Start(t, 1)
   138  
   139  	ts1 := clock.Now()
   140  
   141  	key := roachpb.Key("a")
   142  	incArgs := incrementArgs(key, 5)
   143  
   144  	// Commands with a future timestamp that is within the MaxOffset
   145  	// bound will be accepted and will cause the clock to advance.
   146  	const numCmds = 3
   147  	clockOffset := clock.MaxOffset() / numCmds
   148  	for i := int64(1); i <= numCmds; i++ {
   149  		ts := ts1.Add(i*clockOffset.Nanoseconds(), 0)
   150  		if _, err := kv.SendWrappedWith(context.Background(), mtc.stores[0].TestSender(), roachpb.Header{Timestamp: ts}, incArgs); err != nil {
   151  			t.Fatal(err)
   152  		}
   153  	}
   154  
   155  	ts2 := clock.Now()
   156  	if expAdvance, advance := ts2.GoTime().Sub(ts1.GoTime()), numCmds*clockOffset; advance != expAdvance {
   157  		t.Fatalf("expected clock to advance %s; got %s", expAdvance, advance)
   158  	}
   159  
   160  	// Once the accumulated offset reaches MaxOffset, commands will be rejected.
   161  	_, pErr := kv.SendWrappedWith(context.Background(), mtc.stores[0].TestSender(), roachpb.Header{Timestamp: ts1.Add(clock.MaxOffset().Nanoseconds()+1, 0)}, incArgs)
   162  	if !testutils.IsPError(pErr, "remote wall time is too far ahead") {
   163  		t.Fatalf("unexpected error %v", pErr)
   164  	}
   165  
   166  	// The clock did not advance and the final command was not executed.
   167  	ts3 := clock.Now()
   168  	if advance := ts3.GoTime().Sub(ts2.GoTime()); advance != 0 {
   169  		t.Fatalf("expected clock not to advance, but it advanced by %s", advance)
   170  	}
   171  	val, _, err := storage.MVCCGet(context.Background(), mtc.engines[0], key, ts3,
   172  		storage.MVCCGetOptions{})
   173  	if err != nil {
   174  		t.Fatal(err)
   175  	}
   176  	if a, e := mustGetInt(val), incArgs.Increment*numCmds; a != e {
   177  		t.Errorf("expected %d, got %d", e, a)
   178  	}
   179  }
   180  
   181  // TestTxnPutOutOfOrder tests a case where a put operation of an older
   182  // timestamp comes after a put operation of a newer timestamp in a
   183  // txn. The test ensures such an out-of-order put succeeds and
   184  // overrides an old value. The test uses a "Writer" and a "Reader"
   185  // to reproduce an out-of-order put.
   186  //
   187  // 1) The Writer executes a cput operation and writes a write intent with
   188  //    time T in a txn.
   189  // 2) Before the Writer's txn is committed, the Reader sends a high priority
   190  //    get operation with time T+100. This pushes the Writer txn timestamp to
   191  //    T+100. The Reader also writes to the same key the Writer did a cput to
   192  //    in order to trigger the restart of the Writer's txn. The original
   193  //    write intent timestamp is also updated to T+100.
   194  // 3) The Writer starts a new epoch of the txn, but before it writes, the
   195  //    Reader sends another high priority get operation with time T+200. This
   196  //    pushes the Writer txn timestamp to T+200 to trigger a restart of the
   197  //    Writer txn. The Writer will not actually restart until it tries to commit
   198  //    the current epoch of the transaction. The Reader updates the timestamp of
   199  //    the write intent to T+200. The test deliberately fails the Reader get
   200  //    operation, and cockroach doesn't update its timestamp cache.
   201  // 4) The Writer executes the put operation again. This put operation comes
   202  //    out-of-order since its timestamp is T+100, while the intent timestamp
   203  //    updated at Step 3 is T+200.
   204  // 5) The put operation overrides the old value using timestamp T+100.
   205  // 6) When the Writer attempts to commit its txn, the txn will be restarted
   206  //    again at a new epoch timestamp T+200, which will finally succeed.
   207  func TestTxnPutOutOfOrder(t *testing.T) {
   208  	defer leaktest.AfterTest(t)()
   209  
   210  	// key is selected to fall within the meta range in order for the later
   211  	// routing of requests to range 1 to work properly. Removing the routing
   212  	// of all requests to range 1 would allow us to make the key more normal.
   213  	const (
   214  		key        = "key"
   215  		restartKey = "restart"
   216  	)
   217  	// Set up a filter so that the get operation at Step 3 will return an error.
   218  	var numGets int32
   219  
   220  	stopper := stop.NewStopper()
   221  	defer stopper.Stop(context.Background())
   222  	manual := hlc.NewManualClock(123)
   223  	cfg := kvserver.TestStoreConfig(hlc.NewClock(manual.UnixNano, time.Nanosecond))
   224  	// Splits can cause our chosen key to end up on a range other than range 1,
   225  	// and trying to handle that complicates the test without providing any
   226  	// added benefit.
   227  	cfg.TestingKnobs.DisableSplitQueue = true
   228  	cfg.TestingKnobs.EvalKnobs.TestingEvalFilter =
   229  		func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
   230  			if _, ok := filterArgs.Req.(*roachpb.GetRequest); ok &&
   231  				filterArgs.Req.Header().Key.Equal(roachpb.Key(key)) &&
   232  				filterArgs.Hdr.Txn == nil {
   233  				// The Reader executes two get operations, each of which triggers two get requests
   234  				// (the first request fails and triggers txn push, and then the second request
   235  				// succeeds). Return an error for the fourth get request to avoid a timestamp cache
   236  				// update after the third get operation pushes the txn timestamp.
   237  				if atomic.AddInt32(&numGets, 1) == 4 {
   238  					return roachpb.NewErrorWithTxn(errors.Errorf("Test"), filterArgs.Hdr.Txn)
   239  				}
   240  			}
   241  			return nil
   242  		}
   243  	eng := storage.NewDefaultInMem()
   244  	stopper.AddCloser(eng)
   245  	store := createTestStoreWithOpts(t,
   246  		testStoreOpts{eng: eng, cfg: &cfg},
   247  		stopper,
   248  	)
   249  
   250  	// Put an initial value.
   251  	initVal := []byte("initVal")
   252  	err := store.DB().Put(context.Background(), key, initVal)
   253  	if err != nil {
   254  		t.Fatalf("failed to put: %+v", err)
   255  	}
   256  
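        	// Channels used to interleave the Writer goroutine below with the Reader
        	// requests issued from the main test goroutine.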
   257  	waitPut := make(chan struct{})
   258  	waitFirstGet := make(chan struct{})
   259  	waitTxnRestart := make(chan struct{})
   260  	waitSecondGet := make(chan struct{})
   261  	errChan := make(chan error)
   262  
   263  	// Start the Writer.
   264  	go func() {
   265  		epoch := -1
   266  		// Start a txn that does read-after-write.
   267  		// The txn will be restarted twice, and the out-of-order put
   268  		// will happen in the second epoch.
   269  		errChan <- store.DB().Txn(context.Background(), func(ctx context.Context, txn *kv.Txn) error {
   270  			epoch++
   271  
   272  			if epoch == 1 {
   273  				// Wait until the second get operation is issued.
   274  				close(waitTxnRestart)
   275  				<-waitSecondGet
   276  			}
   277  
   278  			// Read a key that the Reader will later write to in order to force a restart.
   279  			if _, err := txn.Get(ctx, restartKey); err != nil {
   280  				return err
   281  			}
   282  
   283  			updatedVal := []byte("updatedVal")
   284  			if err := txn.CPut(ctx, key, updatedVal, strToValue("initVal")); err != nil {
   285  				log.Errorf(context.Background(), "failed put value: %+v", err)
   286  				return err
   287  			}
   288  
   289  			// Make sure a get will return the value that was just written.
   290  			actual, err := txn.Get(ctx, key)
   291  			if err != nil {
   292  				return err
   293  			}
   294  			if !bytes.Equal(actual.ValueBytes(), updatedVal) {
   295  				return errors.Errorf("unexpected get result: %s", actual)
   296  			}
   297  
   298  			if epoch == 0 {
   299  				// Wait until the first get operation pushes the txn timestamp.
   300  				close(waitPut)
   301  				<-waitFirstGet
   302  			}
   303  
   304  			b := txn.NewBatch()
   305  			return txn.CommitInBatch(ctx, b)
   306  		})
   307  
   308  		if epoch != 2 {
   309  			file, line, _ := caller.Lookup(0)
   310  			errChan <- errors.Errorf("%s:%d unexpected number of txn retries. "+
   311  				"Expected epoch 2, got: %d.", file, line, epoch)
   312  		} else {
   313  			errChan <- nil
   314  		}
   315  	}()
   316  
   317  	<-waitPut
   318  
   319  	// Start the Reader.
   320  
   321  	// Advance the clock and send a get operation with higher
   322  	// priority to trigger the txn restart.
   323  	manual.Increment(100)
   324  
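        	// A negative user priority is treated by the server as an exact internal
        	// priority (here the maximum), so the Reader can push the Writer's txn.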
   325  	priority := roachpb.UserPriority(-math.MaxInt32)
   326  	requestHeader := roachpb.RequestHeader{
   327  		Key: roachpb.Key(key),
   328  	}
   329  	h := roachpb.Header{
   330  		Timestamp:    cfg.Clock.Now(),
   331  		UserPriority: priority,
   332  	}
   333  	if _, err := kv.SendWrappedWith(
   334  		context.Background(), store.TestSender(), h, &roachpb.GetRequest{RequestHeader: requestHeader},
   335  	); err != nil {
   336  		t.Fatalf("failed to get: %+v", err)
   337  	}
   338  	// Write to the restart key so that the Writer's txn must restart.
   339  	putReq := &roachpb.PutRequest{
   340  		RequestHeader: roachpb.RequestHeader{Key: roachpb.Key(restartKey)},
   341  		Value:         roachpb.MakeValueFromBytes([]byte("restart-value")),
   342  	}
   343  	if _, err := kv.SendWrappedWith(context.Background(), store.TestSender(), h, putReq); err != nil {
   344  		t.Fatalf("failed to put: %+v", err)
   345  	}
   346  
   347  	// Wait until the writer restarts the txn.
   348  	close(waitFirstGet)
   349  	<-waitTxnRestart
   350  
   351  	// Advance the clock and send a get operation again. This time
   352  	// we use the TestingEvalFilter so that the get operation is not
   353  	// processed after the write intent is resolved (to prevent the
   354  	// timestamp cache from being updated).
   355  	manual.Increment(100)
   356  
   357  	h.Timestamp = cfg.Clock.Now()
   358  	if _, err := kv.SendWrappedWith(
   359  		context.Background(), store.TestSender(), h, &roachpb.GetRequest{RequestHeader: requestHeader},
   360  	); err == nil {
   361  		t.Fatal("unexpected success of get")
   362  	}
   363  	if _, err := kv.SendWrappedWith(context.Background(), store.TestSender(), h, putReq); err != nil {
   364  		t.Fatalf("failed to put: %+v", err)
   365  	}
   366  
   367  	close(waitSecondGet)
   368  	for i := 0; i < 2; i++ {
   369  		if err := <-errChan; err != nil {
   370  			t.Fatal(err)
   371  		}
   372  	}
   373  }
   374  
   375  // TestRangeLookupUseReverse tests whether the results and the results count
   376  // are correct when scanning in reverse order.
   377  func TestRangeLookupUseReverse(t *testing.T) {
   378  	defer leaktest.AfterTest(t)()
   379  	storeCfg := kvserver.TestStoreConfig(nil)
   380  	storeCfg.TestingKnobs.DisableSplitQueue = true
   381  	storeCfg.TestingKnobs.DisableMergeQueue = true
   382  	stopper := stop.NewStopper()
   383  	defer stopper.Stop(context.Background())
   384  	store := createTestStoreWithOpts(
   385  		t,
   386  		testStoreOpts{
   387  			// This test was written before the test stores were able to start with
   388  			// more than one range and is not prepared to handle many ranges.
   389  			dontCreateSystemRanges: true,
   390  			cfg:                    &storeCfg,
   391  		},
   392  		stopper)
   393  
   394  	// Init test ranges:
   395  	// ["","a"), ["a","c"), ["c","e"), ["e","g") and ["g","\xff\xff").
   396  	splits := []*roachpb.AdminSplitRequest{
   397  		adminSplitArgs(roachpb.Key("g")),
   398  		adminSplitArgs(roachpb.Key("e")),
   399  		adminSplitArgs(roachpb.Key("c")),
   400  		adminSplitArgs(roachpb.Key("a")),
   401  	}
   402  
   403  	for _, split := range splits {
   404  		_, pErr := kv.SendWrapped(context.Background(), store.TestSender(), split)
   405  		if pErr != nil {
   406  			t.Fatalf("%q: split unexpected error: %s", split.SplitKey, pErr)
   407  		}
   408  	}
   409  
   410  	// Resolve the intents.
   411  	scanArgs := roachpb.ScanRequest{
   412  		RequestHeader: roachpb.RequestHeader{
   413  			Key:    keys.RangeMetaKey(roachpb.RKeyMin.Next()).AsRawKey(),
   414  			EndKey: keys.RangeMetaKey(roachpb.RKeyMax).AsRawKey(),
   415  		},
   416  	}
   417  	testutils.SucceedsSoon(t, func() error {
   418  		_, pErr := kv.SendWrapped(context.Background(), store.TestSender(), &scanArgs)
   419  		return pErr.GoError()
   420  	})
   421  
   422  	testCases := []struct {
   423  		key         roachpb.RKey
   424  		maxResults  int64
   425  		expected    []roachpb.RangeDescriptor
   426  		expectedPre []roachpb.RangeDescriptor
   427  	}{
   428  		// Test key in the middle of the range.
   429  		{
   430  			key:        roachpb.RKey("f"),
   431  			maxResults: 2,
   432  			// ["e","g") and ["c","e").
   433  			expected: []roachpb.RangeDescriptor{
   434  				{StartKey: roachpb.RKey("e"), EndKey: roachpb.RKey("g")},
   435  			},
   436  			expectedPre: []roachpb.RangeDescriptor{
   437  				{StartKey: roachpb.RKey("c"), EndKey: roachpb.RKey("e")},
   438  			},
   439  		},
   440  		// Test a key that equals the end key of a range.
   441  		{
   442  			key:        roachpb.RKey("g"),
   443  			maxResults: 3,
   444  			// ["e","g"), ["c","e") and ["a","c").
   445  			expected: []roachpb.RangeDescriptor{
   446  				{StartKey: roachpb.RKey("e"), EndKey: roachpb.RKey("g")},
   447  			},
   448  			expectedPre: []roachpb.RangeDescriptor{
   449  				{StartKey: roachpb.RKey("c"), EndKey: roachpb.RKey("e")},
   450  				{StartKey: roachpb.RKey("a"), EndKey: roachpb.RKey("c")},
   451  			},
   452  		},
   453  		{
   454  			key:        roachpb.RKey("e"),
   455  			maxResults: 2,
   456  			// ["c","e") and ["a","c").
   457  			expected: []roachpb.RangeDescriptor{
   458  				{StartKey: roachpb.RKey("c"), EndKey: roachpb.RKey("e")},
   459  			},
   460  			expectedPre: []roachpb.RangeDescriptor{
   461  				{StartKey: roachpb.RKey("a"), EndKey: roachpb.RKey("c")},
   462  			},
   463  		},
   464  		// Test RKeyMax.
   465  		{
   466  			key:        roachpb.RKeyMax,
   467  			maxResults: 2,
   468  			// ["e","g") and ["g","\xff\xff")
   469  			expected: []roachpb.RangeDescriptor{
   470  				{StartKey: roachpb.RKey("g"), EndKey: roachpb.RKey("\xff\xff")},
   471  			},
   472  			expectedPre: []roachpb.RangeDescriptor{
   473  				{StartKey: roachpb.RKey("e"), EndKey: roachpb.RKey("g")},
   474  			},
   475  		},
   476  		// Test Meta2KeyMax.
   477  		{
   478  			key:        roachpb.RKey(keys.Meta2KeyMax),
   479  			maxResults: 1,
   480  			// ["","a")
   481  			expected: []roachpb.RangeDescriptor{
   482  				{StartKey: roachpb.RKeyMin, EndKey: roachpb.RKey("a")},
   483  			},
   484  		},
   485  	}
   486  
   487  	for _, test := range testCases {
   488  		t.Run(fmt.Sprintf("key=%s", test.key), func(t *testing.T) {
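        			// RangeLookup returns the descriptor(s) containing the key in rs and up to
        			// maxResults-1 descriptors prefetched in the reverse direction in preRs.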
   489  			rs, preRs, err := kv.RangeLookup(context.Background(), store.TestSender(),
   490  				test.key.AsRawKey(), roachpb.READ_UNCOMMITTED, test.maxResults-1, true /* prefetchReverse */)
   491  			if err != nil {
   492  				t.Fatalf("LookupRange error: %+v", err)
   493  			}
   494  
   495  			// Checks the results count.
   496  			if rsLen, preRsLen := len(rs), len(preRs); int64(rsLen+preRsLen) != test.maxResults {
   497  				t.Fatalf("returned results count, expected %d, but got %d+%d", test.maxResults, rsLen, preRsLen)
   498  			}
   499  			// Checks the range descriptors.
   500  			for _, rngSlice := range []struct {
   501  				expect, reply []roachpb.RangeDescriptor
   502  			}{
   503  				{test.expected, rs},
   504  				{test.expectedPre, preRs},
   505  			} {
   506  				for i, rng := range rngSlice.expect {
   507  					if !(rng.StartKey.Equal(rngSlice.reply[i].StartKey) && rng.EndKey.Equal(rngSlice.reply[i].EndKey)) {
   508  						t.Fatalf("returned range is not correct, expected %v, but got %v", rng, rngSlice.reply[i])
   509  					}
   510  				}
   511  			}
   512  		})
   513  	}
   514  }
   515  
   516  type leaseTransferTest struct {
   517  	mtc *multiTestContext
   518  	// Replicas of the range covering key "a" on the first and second stores.
   519  	replica0, replica1         *kvserver.Replica
   520  	replica0Desc, replica1Desc roachpb.ReplicaDescriptor
   521  	leftKey                    roachpb.Key
   522  	filterMu                   syncutil.Mutex
   523  	filter                     func(filterArgs kvserverbase.FilterArgs) *roachpb.Error
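        	// waitForTransferBlocked and transferBlocked work with the
        	// LeaseTransferBlockedOnExtensionEvent testing knob: when the flag is set,
        	// the knob signals transferBlocked once a lease transfer is blocked on an
        	// extension (see setupLeaseTransferTest).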
   524  	waitForTransferBlocked     atomic.Value
   525  	transferBlocked            chan struct{}
   526  }
   527  
   528  func setupLeaseTransferTest(t *testing.T) *leaseTransferTest {
   529  	l := &leaseTransferTest{
   530  		leftKey: roachpb.Key("a"),
   531  	}
   532  
   533  	cfg := kvserver.TestStoreConfig(nil)
   534  	cfg.Clock = nil // manual clock
   535  	// Ensure the node liveness duration isn't too short. By default it is 900ms
   536  	// for TestStoreConfig().
   537  	cfg.RangeLeaseRaftElectionTimeoutMultiplier =
   538  		float64((9 * time.Second) / cfg.RaftElectionTimeout())
   539  	cfg.TestingKnobs.EvalKnobs.TestingEvalFilter =
   540  		func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
   541  			l.filterMu.Lock()
   542  			filterCopy := l.filter
   543  			l.filterMu.Unlock()
   544  			if filterCopy != nil {
   545  				return filterCopy(filterArgs)
   546  			}
   547  			return nil
   548  		}
   549  
   550  	l.waitForTransferBlocked.Store(false)
   551  	l.transferBlocked = make(chan struct{})
   552  	cfg.TestingKnobs.LeaseTransferBlockedOnExtensionEvent = func(
   553  		_ roachpb.ReplicaDescriptor) {
   554  		if l.waitForTransferBlocked.Load().(bool) {
   555  			l.transferBlocked <- struct{}{}
   556  			l.waitForTransferBlocked.Store(false)
   557  		}
   558  	}
   559  
   560  	l.mtc = &multiTestContext{}
   561  	// This test was written before the multiTestContext started creating many
   562  	// system ranges at startup, and hasn't been updated to take that into account.
   563  	l.mtc.startWithSingleRange = true
   564  	l.mtc.storeConfig = &cfg
   565  	l.mtc.Start(t, 2)
   566  	l.mtc.initGossipNetwork()
   567  
   568  	// First, do a write; we'll use it to determine when the dust has settled.
   569  	l.leftKey = roachpb.Key("a")
   570  	incArgs := incrementArgs(l.leftKey, 1)
   571  	if _, pErr := kv.SendWrapped(context.Background(), l.mtc.distSenders[0], incArgs); pErr != nil {
   572  		t.Fatal(pErr)
   573  	}
   574  
   575  	// Get the left range's ID.
   576  	rangeID := l.mtc.stores[0].LookupReplica(keys.MustAddr(l.leftKey)).RangeID
   577  
   578  	// Replicate the left range onto node 1.
   579  	l.mtc.replicateRange(rangeID, 1)
   580  
   581  	l.replica0 = l.mtc.stores[0].LookupReplica(roachpb.RKey("a"))
   582  	l.replica1 = l.mtc.stores[1].LookupReplica(roachpb.RKey("a"))
   583  	{
   584  		var err error
   585  		if l.replica0Desc, err = l.replica0.GetReplicaDescriptor(); err != nil {
   586  			t.Fatal(err)
   587  		}
   588  		if l.replica1Desc, err = l.replica1.GetReplicaDescriptor(); err != nil {
   589  			t.Fatal(err)
   590  		}
   591  	}
   592  
   593  	// Check that replica0 can serve reads OK.
   594  	if pErr := l.sendRead(0); pErr != nil {
   595  		t.Fatal(pErr)
   596  	}
   597  	return l
   598  }
   599  
   600  func (l *leaseTransferTest) sendRead(storeIdx int) *roachpb.Error {
   601  	desc := l.mtc.stores[storeIdx].LookupReplica(keys.MustAddr(l.leftKey))
   602  	replicaDesc, err := desc.GetReplicaDescriptor()
   603  	if err != nil {
   604  		return roachpb.NewError(err)
   605  	}
   606  	_, pErr := kv.SendWrappedWith(
   607  		context.Background(),
   608  		l.mtc.senders[storeIdx],
   609  		roachpb.Header{RangeID: desc.RangeID, Replica: replicaDesc},
   610  		getArgs(l.leftKey),
   611  	)
   612  	if pErr != nil {
   613  		log.Warningf(context.Background(), "%v", pErr)
   614  	}
   615  	return pErr
   616  }
   617  
   618  // checkHasLease checks that the lease for the left range is owned by the
   619  // replica on the given store. The check is executed in a retry loop because
   620  // the lease may not have been applied yet.
   621  func (l *leaseTransferTest) checkHasLease(t *testing.T, storeIdx int) {
   622  	t.Helper()
   623  	testutils.SucceedsSoon(t, func() error {
   624  		return l.sendRead(storeIdx).GoError()
   625  	})
   626  }
   627  
   628  // setFilter is a helper function to enable/disable the blocking of
   629  // RequestLeaseRequests on replica1. This function will notify that an
   630  // extension is blocked on the passed in channel and will wait on the same
   631  // channel to unblock the extension. Note that once an extension is blocked,
   632  // the filter is cleared.
   633  func (l *leaseTransferTest) setFilter(setTo bool, extensionSem chan struct{}) {
   634  	l.filterMu.Lock()
   635  	defer l.filterMu.Unlock()
   636  	if !setTo {
   637  		l.filter = nil
   638  		return
   639  	}
   640  	l.filter = func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
   641  		if filterArgs.Sid != l.mtc.stores[1].Ident.StoreID {
   642  			return nil
   643  		}
   644  		llReq, ok := filterArgs.Req.(*roachpb.RequestLeaseRequest)
   645  		if !ok {
   646  			return nil
   647  		}
   648  		if llReq.Lease.Replica == l.replica1Desc {
   649  			// Notify the main thread that the extension is in progress and wait for
   650  			// the signal to proceed.
   651  			l.filterMu.Lock()
   652  			l.filter = nil
   653  			l.filterMu.Unlock()
   654  			extensionSem <- struct{}{}
   655  			log.Infof(filterArgs.Ctx, "filter blocking request: %s", llReq)
   656  			<-extensionSem
   657  			log.Infof(filterArgs.Ctx, "filter unblocking lease request")
   658  		}
   659  		return nil
   660  	}
   661  }
   662  
   663  // forceLeaseExtension moves the clock forward close to the lease's expiration,
   664  // and then performs a read on the range, which will force the lease to be
   665  // renewed. This assumes the lease is not epoch-based.
   666  func (l *leaseTransferTest) forceLeaseExtension(storeIdx int, lease roachpb.Lease) error {
   667  	// Set the clock close to the lease's expiration.
   668  	l.mtc.manualClock.Set(lease.Expiration.WallTime - 10)
   669  	err := l.sendRead(storeIdx).GoError()
   670  	// We can sometimes receive an error from our renewal attempt because the
   671  	// lease transfer ends up causing the renewal to re-propose and second
   672  	// attempt fails because it's already been renewed. This used to work
   673  	// before we compared the proposer's lease with the actual lease because
   674  	// the renewed lease still encompassed the previous request.
   675  	if errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) {
   676  		err = nil
   677  	}
   678  	return err
   679  }
   680  
   681  // ensureLeaderAndRaftState is a helper function that blocks until leader is
   682  // the raft leader and follower is up to date.
   683  func (l *leaseTransferTest) ensureLeaderAndRaftState(
   684  	t *testing.T, leader *kvserver.Replica, follower roachpb.ReplicaDescriptor,
   685  ) {
   686  	t.Helper()
   687  	leaderDesc, err := leader.GetReplicaDescriptor()
   688  	if err != nil {
   689  		t.Fatal(err)
   690  	}
   691  	testutils.SucceedsSoon(t, func() error {
   692  		r := l.mtc.getRaftLeader(l.replica0.RangeID)
   693  		if r == nil {
   694  			return errors.Errorf("could not find raft leader replica for range %d", l.replica0.RangeID)
   695  		}
   696  		desc, err := r.GetReplicaDescriptor()
   697  		if err != nil {
   698  			return errors.Wrap(err, "could not get replica descriptor")
   699  		}
   700  		if desc != leaderDesc {
   701  			return errors.Errorf(
   702  				"expected replica with id %v to be raft leader, instead got id %v",
   703  				leaderDesc.ReplicaID,
   704  				desc.ReplicaID,
   705  			)
   706  		}
   707  		return nil
   708  	})
   709  
   710  	testutils.SucceedsSoon(t, func() error {
   711  		status := leader.RaftStatus()
   712  		progress, ok := status.Progress[uint64(follower.ReplicaID)]
   713  		if !ok {
   714  			return errors.Errorf(
   715  				"replica %v progress not found in progress map: %v",
   716  				follower.ReplicaID,
   717  				status.Progress,
   718  			)
   719  		}
   720  		if progress.Match < status.Commit {
   721  			return errors.Errorf("replica %v failed to catch up", follower.ReplicaID)
   722  		}
   723  		return nil
   724  	})
   725  }
   726  
   727  func TestLeaseExpirationBasedRangeTransfer(t *testing.T) {
   728  	defer leaktest.AfterTest(t)()
   729  
   730  	l := setupLeaseTransferTest(t)
   731  	defer l.mtc.Stop()
   732  	origLease, _ := l.replica0.GetLease()
   733  	{
   734  		// Transferring the lease to ourself should be a no-op.
   735  		if err := l.replica0.AdminTransferLease(context.Background(), l.replica0Desc.StoreID); err != nil {
   736  			t.Fatal(err)
   737  		}
   738  		newLease, _ := l.replica0.GetLease()
   739  		if !origLease.Equivalent(newLease) {
   740  			t.Fatalf("original lease %v and new lease %v not equivalent", origLease, newLease)
   741  		}
   742  	}
   743  
   744  	{
   745  		// An invalid target should result in an error.
   746  		const expected = "unable to find store .* in range"
   747  		if err := l.replica0.AdminTransferLease(context.Background(), 1000); !testutils.IsError(err, expected) {
   748  			t.Fatalf("expected %s, but found %v", expected, err)
   749  		}
   750  	}
   751  
   752  	if err := l.replica0.AdminTransferLease(context.Background(), l.replica1Desc.StoreID); err != nil {
   753  		t.Fatal(err)
   754  	}
   755  
   756  	// Check that replica0 doesn't serve reads any more.
   757  	pErr := l.sendRead(0)
   758  	nlhe, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError)
   759  	if !ok {
   760  		t.Fatalf("expected %T, got %s", &roachpb.NotLeaseHolderError{}, pErr)
   761  	}
   762  	if !nlhe.LeaseHolder.Equal(&l.replica1Desc) {
   763  		t.Fatalf("expected lease holder %+v, got %+v",
   764  			l.replica1Desc, nlhe.LeaseHolder)
   765  	}
   766  
   767  	// Check that replica1 now has the lease.
   768  	l.checkHasLease(t, 1)
   769  
   770  	replica1Lease, _ := l.replica1.GetLease()
   771  
   772  	// We'd like to verify the timestamp cache's low water mark, but this is
   773  	// impossible to determine precisely in all cases because it may have
   774  	// been subsumed by future tscache accesses. So instead of checking the
   775  	// low water mark, we make sure that the high water mark is equal to or
   776  	// greater than the new lease start time, which is less than the
   777  	// previous lease's expiration time.
   778  	if highWater := l.replica1.GetTSCacheHighWater(); highWater.Less(replica1Lease.Start) {
   779  		t.Fatalf("expected timestamp cache high water %s, but found %s",
   780  			replica1Lease.Start, highWater)
   781  	}
   782  
   783  }
   784  
   785  // TestLeaseExpirationBasedRangeTransferWithExtension makes replica1
   786  // extend its lease and transfers the lease immediately after
   787  // that. It tests that the transfer still happens (it waits until the
   788  // extension is done).
   789  func TestLeaseExpirationBasedRangeTransferWithExtension(t *testing.T) {
   790  	defer leaktest.AfterTest(t)()
   791  
   792  	l := setupLeaseTransferTest(t)
   793  	defer l.mtc.Stop()
   794  	// Ensure that replica1 has the lease.
   795  	if err := l.replica0.AdminTransferLease(context.Background(), l.replica1Desc.StoreID); err != nil {
   796  		t.Fatal(err)
   797  	}
   798  	l.checkHasLease(t, 1)
   799  
   800  	extensionSem := make(chan struct{})
   801  	l.setFilter(true, extensionSem)
   802  
   803  	// Initiate an extension.
   804  	renewalErrCh := make(chan error)
   805  	go func() {
   806  		lease, _ := l.replica1.GetLease()
   807  		renewalErrCh <- l.forceLeaseExtension(1, lease)
   808  	}()
   809  
   810  	// Wait for extension to be blocked.
   811  	<-extensionSem
   812  	l.waitForTransferBlocked.Store(true)
   813  	// Initiate a transfer.
   814  	transferErrCh := make(chan error)
   815  	go func() {
   816  		// Transfer back from replica1 to replica0.
   817  		err := l.replica1.AdminTransferLease(context.Background(), l.replica0Desc.StoreID)
   818  		// Ignore not leaseholder errors which can arise due to re-proposals.
   819  		if errors.HasType(err, (*roachpb.NotLeaseHolderError)(nil)) {
   820  			err = nil
   821  		}
   822  		transferErrCh <- err
   823  	}()
   824  	// Wait for the transfer to be blocked by the extension.
   825  	<-l.transferBlocked
   826  	// Now unblock the extension.
   827  	extensionSem <- struct{}{}
   828  	l.checkHasLease(t, 0)
   829  	l.setFilter(false, nil)
   830  
   831  	if err := <-renewalErrCh; err != nil {
   832  		t.Errorf("unexpected error from lease renewal: %+v", err)
   833  	}
   834  	if err := <-transferErrCh; err != nil {
   835  		t.Errorf("unexpected error from lease transfer: %+v", err)
   836  	}
   837  }
   838  
   839  // TestLeaseExpirationBasedDrainTransfer verifies that a draining store attempts to transfer away
   840  // range leases owned by its replicas.
   841  func TestLeaseExpirationBasedDrainTransfer(t *testing.T) {
   842  	defer leaktest.AfterTest(t)()
   843  
   844  	l := setupLeaseTransferTest(t)
   845  	defer l.mtc.Stop()
   846  	// We have to ensure that replica0 is the raft leader and that replica1 has
   847  	// caught up to replica0, as the draining code doesn't transfer leases to
   848  	// replicas that are behind.
   849  	l.ensureLeaderAndRaftState(t, l.replica0, l.replica1Desc)
   850  	l.mtc.stores[0].SetDraining(true, nil /* reporter */)
   851  
   852  	// Check that replica0 doesn't serve reads any more.
   853  	pErr := l.sendRead(0)
   854  	nlhe, ok := pErr.GetDetail().(*roachpb.NotLeaseHolderError)
   855  	if !ok {
   856  		t.Fatalf("expected %T, got %s", &roachpb.NotLeaseHolderError{}, pErr)
   857  	}
   858  	if nlhe.LeaseHolder == nil || !nlhe.LeaseHolder.Equal(&l.replica1Desc) {
   859  		t.Fatalf("expected lease holder %+v, got %+v",
   860  			l.replica1Desc, nlhe.LeaseHolder)
   861  	}
   862  
   863  	// Check that replica1 now has the lease.
   864  	l.checkHasLease(t, 1)
   865  
   866  	l.mtc.stores[0].SetDraining(false, nil /* reporter */)
   867  }
   868  
   869  // TestLeaseExpirationBasedDrainTransferWithExtension verifies that
   870  // a draining store waits for any in-progress lease requests to
   871  // complete before transferring away the new lease.
   872  func TestLeaseExpirationBasedDrainTransferWithExtension(t *testing.T) {
   873  	defer leaktest.AfterTest(t)()
   874  
   875  	l := setupLeaseTransferTest(t)
   876  	defer l.mtc.Stop()
   877  	// Ensure that replica1 has the lease.
   878  	if err := l.replica0.AdminTransferLease(context.Background(), l.replica1Desc.StoreID); err != nil {
   879  		t.Fatal(err)
   880  	}
   881  	l.checkHasLease(t, 1)
   882  
   883  	extensionSem := make(chan struct{})
   884  	l.setFilter(true, extensionSem)
   885  
   886  	// Initiate an extension.
   887  	renewalErrCh := make(chan error)
   888  	go func() {
   889  		lease, _ := l.replica1.GetLease()
   890  		renewalErrCh <- l.forceLeaseExtension(1, lease)
   891  	}()
   892  
   893  	// Wait for extension to be blocked.
   894  	<-extensionSem
   895  
   896  	// Make sure that replica 0 is up to date enough to receive the lease.
   897  	l.ensureLeaderAndRaftState(t, l.replica1, l.replica0Desc)
   898  
   899  	// Drain node 1 with an extension in progress.
   900  	go func() {
   901  		l.mtc.stores[1].SetDraining(true, nil /* reporter */)
   902  	}()
   903  	// Now unblock the extension.
   904  	extensionSem <- struct{}{}
   905  
   906  	l.checkHasLease(t, 0)
   907  	l.setFilter(false, nil)
   908  
   909  	if err := <-renewalErrCh; err != nil {
   910  		t.Errorf("unexpected error from lease renewal: %+v", err)
   911  	}
   912  }
   913  
   914  // TestRangeLimitTxnMaxTimestamp verifies that on lease transfer, the
   915  // normal limiting of a txn's max timestamp to the first observed
   916  // timestamp on a node is extended to include the lease start
   917  // timestamp. This disallows the possibility that a write to another
   918  // replica of the range (on node n1) happened at a later timestamp
   919  // than the originally observed timestamp for the node which now owns
   920  // the lease (n2). This can happen if the replication of the write
   921  // doesn't make it from n1 to n2 before the transaction observes n2's
   922  // clock time.
   923  func TestRangeLimitTxnMaxTimestamp(t *testing.T) {
   924  	defer leaktest.AfterTest(t)()
   925  	cfg := kvserver.TestStoreConfig(nil)
   926  	cfg.RangeLeaseRaftElectionTimeoutMultiplier =
   927  		float64((9 * time.Second) / cfg.RaftElectionTimeout())
   928  	cfg.Clock = nil // manual clock
   929  	mtc := &multiTestContext{}
   930  	mtc.storeConfig = &cfg
   931  	keyA := roachpb.Key("a")
   932  	// Create a new clock for node2 to allow drift between the two wall clocks.
   933  	manual1 := hlc.NewManualClock(100) // node1 clock is @t=100
   934  	clock1 := hlc.NewClock(manual1.UnixNano, 250*time.Nanosecond)
   935  	manual2 := hlc.NewManualClock(98) // node2 clock is @t=98
   936  	clock2 := hlc.NewClock(manual2.UnixNano, 250*time.Nanosecond)
   937  	mtc.clocks = []*hlc.Clock{clock1, clock2}
   938  
   939  	// Start a transaction using node2 as a gateway.
   940  	txn := roachpb.MakeTransaction("test", keyA, 1, clock2.Now(), 250 /* maxOffsetNs */)
   941  	// Simulate a read to another range on node2 by setting the observed timestamp.
   942  	txn.UpdateObservedTimestamp(2, clock2.Now())
   943  
   944  	defer mtc.Stop()
   945  	mtc.Start(t, 2)
   946  
   947  	// Do a write on node1 to establish a key with its timestamp @t=100.
   948  	if _, pErr := kv.SendWrapped(
   949  		context.Background(), mtc.distSenders[0], putArgs(keyA, []byte("value")),
   950  	); pErr != nil {
   951  		t.Fatal(pErr)
   952  	}
   953  
   954  	// Up-replicate the data in the range to node2.
   955  	replica1 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
   956  	mtc.replicateRange(replica1.RangeID, 1)
   957  
   958  	// Transfer the lease from node1 to node2.
   959  	replica2 := mtc.stores[1].LookupReplica(roachpb.RKey(keyA))
   960  	replica2Desc, err := replica2.GetReplicaDescriptor()
   961  	if err != nil {
   962  		t.Fatal(err)
   963  	}
   964  	testutils.SucceedsSoon(t, func() error {
   965  		if err := replica1.AdminTransferLease(context.Background(), replica2Desc.StoreID); err != nil {
   966  			t.Fatal(err)
   967  		}
   968  		lease, _ := replica2.GetLease()
   969  		if lease.Replica.NodeID != replica2.NodeID() {
   970  			return errors.Errorf("expected lease transfer to node2: %s", lease)
   971  		}
   972  		return nil
   973  	})
   974  	// Verify that after the lease transfer, node2's clock has advanced to at least 100.
   975  	if now1, now2 := clock1.Now(), clock2.Now(); now2.WallTime < now1.WallTime {
   976  		t.Fatalf("expected node2's clock walltime to be >= %d; got %d", now1.WallTime, now2.WallTime)
   977  	}
   978  
   979  	// Send a get request for keyA to node2, which is now the
   980  	// leaseholder. If the max timestamp were not being properly limited,
   981  	// we would end up incorrectly reading nothing for keyA. Instead we
   982  	// expect to see an uncertainty interval error.
   983  	h := roachpb.Header{Txn: &txn}
   984  	if _, pErr := kv.SendWrappedWith(
   985  		context.Background(), mtc.distSenders[0], h, getArgs(keyA),
   986  	); !testutils.IsPError(pErr, "uncertainty") {
   987  		t.Fatalf("expected an uncertainty interval error; got %v", pErr)
   988  	}
   989  }
   990  
   991  // TestLeaseMetricsOnSplitAndTransfer verifies that lease-related metrics
   992  // are updated after splitting a range and then initiating one successful
   993  // and one failing lease transfer.
   994  func TestLeaseMetricsOnSplitAndTransfer(t *testing.T) {
   995  	defer leaktest.AfterTest(t)()
   996  	var injectLeaseTransferError atomic.Value
   997  	sc := kvserver.TestStoreConfig(nil)
   998  	sc.TestingKnobs.DisableSplitQueue = true
   999  	sc.TestingKnobs.DisableMergeQueue = true
  1000  	sc.TestingKnobs.EvalKnobs.TestingEvalFilter =
  1001  		func(filterArgs kvserverbase.FilterArgs) *roachpb.Error {
  1002  			if args, ok := filterArgs.Req.(*roachpb.TransferLeaseRequest); ok {
  1003  				if val := injectLeaseTransferError.Load(); val != nil && val.(bool) {
  1004  					// Note that we can't just return an error here as we only
  1005  					// end up counting failures in the metrics if the command
  1006  					// makes it through to being executed. So use a fake store ID.
  1007  					args.Lease.Replica.StoreID = roachpb.StoreID(1000)
  1008  				}
  1009  			}
  1010  			return nil
  1011  		}
  1012  	sc.Clock = nil // manual clock
  1013  	mtc := &multiTestContext{
  1014  		storeConfig: &sc,
  1015  		// This test was written before the multiTestContext started creating many
  1016  		// system ranges at startup, and hasn't been updated to take that into
  1017  		// account.
  1018  		startWithSingleRange: true,
  1019  	}
  1020  	defer mtc.Stop()
  1021  	mtc.Start(t, 2)
  1022  
  1023  	// Up-replicate to two replicas.
  1024  	keyMinReplica0 := mtc.stores[0].LookupReplica(roachpb.RKeyMin)
  1025  	mtc.replicateRange(keyMinReplica0.RangeID, 1)
  1026  
  1027  	// Split the key space at key "a".
  1028  	splitKey := roachpb.RKey("a")
  1029  	splitArgs := adminSplitArgs(splitKey.AsRawKey())
  1030  	if _, pErr := kv.SendWrapped(
  1031  		context.Background(), mtc.stores[0].TestSender(), splitArgs,
  1032  	); pErr != nil {
  1033  		t.Fatal(pErr)
  1034  	}
  1035  
  1036  	// Now, a successful transfer from LHS replica 0 to replica 1.
  1037  	injectLeaseTransferError.Store(false)
  1038  	if err := mtc.dbs[0].AdminTransferLease(
  1039  		context.Background(), keyMinReplica0.Desc().StartKey.AsRawKey(), mtc.stores[1].StoreID(),
  1040  	); err != nil {
  1041  		t.Fatalf("unable to transfer lease to replica 1: %+v", err)
  1042  	}
  1043  	// Wait for all replicas to process.
  1044  	testutils.SucceedsSoon(t, func() error {
  1045  		for i := 0; i < 2; i++ {
  1046  			r := mtc.stores[i].LookupReplica(roachpb.RKeyMin)
  1047  			if l, _ := r.GetLease(); l.Replica.StoreID != mtc.stores[1].StoreID() {
  1048  				return errors.Errorf("expected lease to transfer to replica 2: got %s", l)
  1049  			}
  1050  		}
  1051  		return nil
  1052  	})
  1053  
  1054  	// Next a failed transfer from RHS replica 0 to replica 1.
  1055  	injectLeaseTransferError.Store(true)
  1056  	keyAReplica0 := mtc.stores[0].LookupReplica(splitKey)
  1057  	if err := mtc.dbs[0].AdminTransferLease(
  1058  		context.Background(), keyAReplica0.Desc().StartKey.AsRawKey(), mtc.stores[1].StoreID(),
  1059  	); err == nil {
  1060  		t.Fatal("expected an error transferring to an unknown store ID")
  1061  	}
  1062  
  1063  	metrics := mtc.stores[0].Metrics()
  1064  	if a, e := metrics.LeaseTransferSuccessCount.Count(), int64(1); a != e {
  1065  		t.Errorf("expected %d lease transfer successes; got %d", e, a)
  1066  	}
  1067  	if a, e := metrics.LeaseTransferErrorCount.Count(), int64(1); a != e {
  1068  		t.Errorf("expected %d lease transfer errors; got %d", e, a)
  1069  	}
  1070  
  1071  	// Expire current leases and put a key to RHS of split to request
  1072  	// an epoch-based lease.
  1073  	testutils.SucceedsSoon(t, func() error {
  1074  		mtc.advanceClock(context.Background())
  1075  		if err := mtc.stores[0].DB().Put(context.Background(), "a", "foo"); err != nil {
  1076  			return err
  1077  		}
  1078  
  1079  		// Update replication gauges for all stores and verify we have 1 each of
  1080  		// expiration and epoch leases.
  1081  		var expirationLeases int64
  1082  		var epochLeases int64
  1083  		for i := range mtc.stores {
  1084  			if err := mtc.stores[i].ComputeMetrics(context.Background(), 0); err != nil {
  1085  				return err
  1086  			}
  1087  			metrics = mtc.stores[i].Metrics()
  1088  			expirationLeases += metrics.LeaseExpirationCount.Value()
  1089  			epochLeases += metrics.LeaseEpochCount.Value()
  1090  		}
  1091  		if a, e := expirationLeases, int64(1); a != e {
  1092  			return errors.Errorf("expected %d expiration lease count; got %d", e, a)
  1093  		}
  1094  		if a, e := epochLeases, int64(1); a != e {
  1095  			return errors.Errorf("expected %d epoch lease count; got %d", e, a)
  1096  		}
  1097  		return nil
  1098  	})
  1099  }
  1100  
  1101  // Test that leases held before a restart are not used after the restart.
  1102  // See replica.mu.minLeaseProposedTS for the reasons why this isn't allowed.
  1103  func TestLeaseNotUsedAfterRestart(t *testing.T) {
  1104  	defer leaktest.AfterTest(t)()
  1105  
  1106  	ctx := context.Background()
  1107  
  1108  	sc := kvserver.TestStoreConfig(nil)
  1109  	sc.Clock = nil // manual clock
  1110  	var leaseAcquisitionTrap atomic.Value
  1111  	// Disable the split queue so that no ranges are split. This makes it easy
  1112  	// below to trap any lease request and infer that it refers to the range we're
  1113  	// interested in.
  1114  	sc.TestingKnobs.DisableSplitQueue = true
  1115  	sc.TestingKnobs.LeaseRequestEvent = func(ts hlc.Timestamp) {
  1116  		val := leaseAcquisitionTrap.Load()
  1117  		if val == nil {
  1118  			return
  1119  		}
  1120  		trapCallback := val.(func(ts hlc.Timestamp))
  1121  		if trapCallback != nil {
  1122  			trapCallback(ts)
  1123  		}
  1124  	}
  1125  	mtc := &multiTestContext{storeConfig: &sc}
  1126  	defer mtc.Stop()
  1127  	mtc.Start(t, 1)
  1128  
  1129  	key := []byte("a")
  1130  	// Send a read, to acquire a lease.
  1131  	getArgs := getArgs(key)
  1132  	if _, err := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), getArgs); err != nil {
  1133  		t.Fatal(err)
  1134  	}
  1135  
  1136  	preRestartLease, _ := mtc.stores[0].LookupReplica(key).GetLease()
  1137  
  1138  	mtc.manualClock.Increment(1e9)
  1139  
  1140  	// Restart the mtc. Before we do that, we're installing a callback used to
  1141  	// assert that a new lease has been requested. The callback is installed
  1142  	// before the restart, as the lease might be requested at any time and for
  1143  	// many reasons by background processes, even before we send the read below.
  1144  	leaseAcquisitionCh := make(chan error)
  1145  	var once sync.Once
  1146  	leaseAcquisitionTrap.Store(func(_ hlc.Timestamp) {
  1147  		once.Do(func() {
  1148  			close(leaseAcquisitionCh)
  1149  		})
  1150  	})
  1151  
  1152  	log.Info(ctx, "restarting")
  1153  	mtc.restart()
  1154  
  1155  	// Send another read and check that the pre-existing lease has not been used.
  1156  	// Concretely, we check that a new lease is requested.
  1157  	if _, err := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), getArgs); err != nil {
  1158  		t.Fatal(err)
  1159  	}
  1160  	// Check that the Send above triggered a lease acquisition.
  1161  	select {
  1162  	case <-leaseAcquisitionCh:
  1163  	case <-time.After(time.Second):
  1164  		t.Fatalf("read did not acquire a new lease")
  1165  	}
  1166  
  1167  	postRestartLease, _ := mtc.stores[0].LookupReplica(key).GetLease()
  1168  
  1169  	// Verify that not only is a new lease requested, it also gets a new sequence
  1170  	// number. This makes sure that previously proposed commands actually fail at
  1171  	// apply time.
  1172  	if preRestartLease.Sequence == postRestartLease.Sequence {
  1173  		t.Fatalf("lease was not replaced:\nprev: %v\nnow:  %v", preRestartLease, postRestartLease)
  1174  	}
  1175  }
  1176  
  1177  // Test that a lease extension (a RequestLeaseRequest that doesn't change the
  1178  // lease holder) is not blocked by ongoing reads. The test relies on the fact
  1179  // that RequestLeaseRequest does not declare that it touches the whole key span of
  1180  // the range, and thus doesn't conflict through the command queue with other reads.
  1181  func TestLeaseExtensionNotBlockedByRead(t *testing.T) {
  1182  	defer leaktest.AfterTest(t)()
  1183  	readBlocked := make(chan struct{})
  1184  	cmdFilter := func(fArgs kvserverbase.FilterArgs) *roachpb.Error {
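        		// UserPriority 42 marks the read sent by this test; block only that request.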
  1185  		if fArgs.Hdr.UserPriority == 42 {
  1186  			// Signal that the read is blocked.
  1187  			readBlocked <- struct{}{}
  1188  			// Wait for read to be unblocked.
  1189  			<-readBlocked
  1190  		}
  1191  		return nil
  1192  	}
  1193  	srv, _, _ := serverutils.StartServer(t,
  1194  		base.TestServerArgs{
  1195  			Knobs: base.TestingKnobs{
  1196  				Store: &kvserver.StoreTestingKnobs{
  1197  					EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1198  						TestingEvalFilter: cmdFilter,
  1199  					},
  1200  				},
  1201  			},
  1202  		})
  1203  	s := srv.(*server.TestServer)
  1204  	defer s.Stopper().Stop(context.Background())
  1205  
  1206  	store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID())
  1207  	if err != nil {
  1208  		t.Fatal(err)
  1209  	}
  1210  
  1211  	// Start a read and wait for it to block.
  1212  	key := roachpb.Key("a")
  1213  	errChan := make(chan error)
  1214  	go func() {
  1215  		getReq := roachpb.GetRequest{
  1216  			RequestHeader: roachpb.RequestHeader{
  1217  				Key: key,
  1218  			},
  1219  		}
  1220  		if _, pErr := kv.SendWrappedWith(context.Background(), s.DB().NonTransactionalSender(),
  1221  			roachpb.Header{UserPriority: 42},
  1222  			&getReq); pErr != nil {
  1223  			errChan <- pErr.GoError()
  1224  		}
  1225  	}()
  1226  
  1227  	select {
  1228  	case err := <-errChan:
  1229  		t.Fatal(err)
  1230  	case <-readBlocked:
  1231  		// Send the lease request.
  1232  		rKey, err := keys.Addr(key)
  1233  		if err != nil {
  1234  			t.Fatal(err)
  1235  		}
  1236  		repl := store.LookupReplica(rKey)
  1237  		if repl == nil {
  1238  			t.Fatalf("replica for key %s not found", rKey)
  1239  		}
  1240  		replDesc, found := repl.Desc().GetReplicaDescriptor(store.StoreID())
  1241  		if !found {
  1242  			t.Fatalf("replica descriptor for key %s not found", rKey)
  1243  		}
  1244  
  1245  		leaseReq := roachpb.RequestLeaseRequest{
  1246  			RequestHeader: roachpb.RequestHeader{
  1247  				Key: key,
  1248  			},
  1249  			Lease: roachpb.Lease{
  1250  				Start:      s.Clock().Now(),
  1251  				Expiration: s.Clock().Now().Add(time.Second.Nanoseconds(), 0).Clone(),
  1252  				Replica:    replDesc,
  1253  			},
  1254  		}
  1255  
  1256  		for {
  1257  			curLease, _, err := s.GetRangeLease(context.Background(), key)
  1258  			if err != nil {
  1259  				t.Fatal(err)
  1260  			}
  1261  			leaseReq.PrevLease = curLease
  1262  
  1263  			_, pErr := kv.SendWrapped(context.Background(), s.DB().NonTransactionalSender(), &leaseReq)
  1264  			if _, ok := pErr.GetDetail().(*roachpb.AmbiguousResultError); ok {
  1265  				log.Infof(context.Background(), "retrying lease after %s", pErr)
  1266  				continue
  1267  			}
  1268  			if _, ok := pErr.GetDetail().(*roachpb.LeaseRejectedError); ok {
  1269  				// Lease rejected? Try again. The extension should work because
  1270  				// extending is idempotent (assuming the PrevLease matches).
  1271  				log.Infof(context.Background(), "retrying lease after %s", pErr)
  1272  				continue
  1273  			}
  1274  			if pErr != nil {
  1275  				t.Errorf("%T %s", pErr.GetDetail(), pErr) // NB: don't fatal or shutdown hangs
  1276  			}
  1277  			break
  1278  		}
  1279  		// Unblock the read.
  1280  		readBlocked <- struct{}{}
  1281  	}
  1282  }
  1283  
  1284  // LeaseInfo runs a LeaseInfoRequest using the specified DB and read consistency.
  1285  func LeaseInfo(
  1286  	t *testing.T,
  1287  	db *kv.DB,
  1288  	rangeDesc roachpb.RangeDescriptor,
  1289  	readConsistency roachpb.ReadConsistencyType,
  1290  ) roachpb.LeaseInfoResponse {
  1291  	leaseInfoReq := &roachpb.LeaseInfoRequest{
  1292  		RequestHeader: roachpb.RequestHeader{
  1293  			Key: rangeDesc.StartKey.AsRawKey(),
  1294  		},
  1295  	}
  1296  	reply, pErr := kv.SendWrappedWith(context.Background(), db.NonTransactionalSender(), roachpb.Header{
  1297  		ReadConsistency: readConsistency,
  1298  	}, leaseInfoReq)
  1299  	if pErr != nil {
  1300  		t.Fatal(pErr)
  1301  	}
  1302  	return *(reply.(*roachpb.LeaseInfoResponse))
  1303  }
  1304  
  1305  func TestLeaseInfoRequest(t *testing.T) {
  1306  	defer leaktest.AfterTest(t)()
  1307  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
  1308  	defer tc.Stopper().Stop(context.Background())
  1309  
  1310  	kvDB0 := tc.Servers[0].DB()
  1311  	kvDB1 := tc.Servers[1].DB()
  1312  
  1313  	key := []byte("a")
  1314  	rangeDesc, err := tc.LookupRange(key)
  1315  	if err != nil {
  1316  		t.Fatal(err)
  1317  	}
  1318  	replicas := make([]roachpb.ReplicaDescriptor, 3)
  1319  	for i := 0; i < 3; i++ {
  1320  		var ok bool
  1321  		replicas[i], ok = rangeDesc.GetReplicaDescriptor(tc.Servers[i].GetFirstStoreID())
  1322  		if !ok {
  1323  			t.Fatalf("expected to find replica in server %d", i)
  1324  		}
  1325  	}
  1326  
  1327  	// Transfer the lease to Servers[0] so we start in a known state. Otherwise,
  1328  	// there might be already a lease owned by a random node.
  1329  	err = tc.TransferRangeLease(rangeDesc, tc.Target(0))
  1330  	if err != nil {
  1331  		t.Fatal(err)
  1332  	}
  1333  
  1334  	// Now test the LeaseInfo. We might need to loop until the node we query has
  1335  	// applied the lease.
  1336  	testutils.SucceedsSoon(t, func() error {
  1337  		leaseHolderReplica := LeaseInfo(t, kvDB0, rangeDesc, roachpb.INCONSISTENT).Lease.Replica
  1338  		if leaseHolderReplica != replicas[0] {
  1339  			return fmt.Errorf("lease holder should be replica %+v, but is: %+v",
  1340  				replicas[0], leaseHolderReplica)
  1341  		}
  1342  		return nil
  1343  	})
  1344  
  1345  	// Transfer the lease to Server 1 and check that LeaseInfoRequest gets the
  1346  	// right answer.
  1347  	err = tc.TransferRangeLease(rangeDesc, tc.Target(1))
  1348  	if err != nil {
  1349  		t.Fatal(err)
  1350  	}
  1351  	// An inconsistent LeaseInfoRequest on the old lease holder should give us the
  1352  	// right answer immediately, since the old holder has definitely applied the
  1353  	// transfer before TransferRangeLease returned.
  1354  	leaseHolderReplica := LeaseInfo(t, kvDB0, rangeDesc, roachpb.INCONSISTENT).Lease.Replica
  1355  	if !leaseHolderReplica.Equal(replicas[1]) {
  1356  		t.Fatalf("lease holder should be replica %+v, but is: %+v",
  1357  			replicas[1], leaseHolderReplica)
  1358  	}
  1359  
  1360  	// A read on the new lease holder does not necessarily succeed immediately,
  1361  	// since it might take a while for it to apply the transfer.
  1362  	testutils.SucceedsSoon(t, func() error {
  1363  		// We can't reliably do a CONSISTENT read here, even though we're reading
  1364  		// from the supposed lease holder, because this node might initially be
  1365  		// unaware of the new lease and so the request might bounce around for a
  1366  		// while (see #8816).
  1367  		leaseHolderReplica = LeaseInfo(t, kvDB1, rangeDesc, roachpb.INCONSISTENT).Lease.Replica
  1368  		if !leaseHolderReplica.Equal(replicas[1]) {
  1369  			return errors.Errorf("lease holder should be replica %+v, but is: %+v",
  1370  				replicas[1], leaseHolderReplica)
  1371  		}
  1372  		return nil
  1373  	})
  1374  
  1375  	// Transfer the lease to Server 2 and check that LeaseInfoRequest gets the
  1376  	// right answer.
  1377  	err = tc.TransferRangeLease(rangeDesc, tc.Target(2))
  1378  	if err != nil {
  1379  		t.Fatal(err)
  1380  	}
  1381  
  1382  	// We're now going to ask servers[1] for the lease info. We don't use kvDB1;
  1383  	// instead we go directly to the store because otherwise the DistSender might
  1384  	// use an old, cached version of the range descriptor that doesn't have the
  1385  	// local replica in it (and so the request would be routed away).
  1386  	// TODO(andrei): Add a batch option to not use the range cache.
  1387  	s, err := tc.Servers[1].Stores().GetStore(tc.Servers[1].GetFirstStoreID())
  1388  	if err != nil {
  1389  		t.Fatal(err)
  1390  	}
  1391  	leaseInfoReq := &roachpb.LeaseInfoRequest{
  1392  		RequestHeader: roachpb.RequestHeader{
  1393  			Key: rangeDesc.StartKey.AsRawKey(),
  1394  		},
  1395  	}
  1396  	reply, pErr := kv.SendWrappedWith(
  1397  		context.Background(), s, roachpb.Header{
  1398  			RangeID:         rangeDesc.RangeID,
  1399  			ReadConsistency: roachpb.INCONSISTENT,
  1400  		}, leaseInfoReq)
  1401  	if pErr != nil {
  1402  		t.Fatal(pErr)
  1403  	}
  1404  	resp := *(reply.(*roachpb.LeaseInfoResponse))
  1405  	leaseHolderReplica = resp.Lease.Replica
  1406  
  1407  	if !leaseHolderReplica.Equal(replicas[2]) {
  1408  		t.Fatalf("lease holder should be replica %s, but is: %s", replicas[2], leaseHolderReplica)
  1409  	}
  1410  
  1411  	// TODO(andrei): test the side-effect of LeaseInfoRequest when there's no
  1412  	// active lease - the node getting the request is supposed to acquire the
  1413  	// lease. This requires a way to expire leases; the TestCluster probably needs
  1414  	// to use a mock clock.
  1415  }
  1416  
  1417  // Test that an error encountered by a read-only "NonKV" command is not
  1418  // swallowed, and doesn't otherwise cause a panic.
  1419  // We had a bug caused by the fact that errors for these commands weren't passed
  1420  // through the epilogue returned by replica.beginCommands() and were getting
  1421  // swallowed.
  1422  func TestErrorHandlingForNonKVCommand(t *testing.T) {
  1423  	defer leaktest.AfterTest(t)()
  1424  	cmdFilter := func(fArgs kvserverbase.FilterArgs) *roachpb.Error {
  1425  		if fArgs.Hdr.UserPriority == 42 {
  1426  			return roachpb.NewErrorf("injected error")
  1427  		}
  1428  		return nil
  1429  	}
  1430  	srv, _, _ := serverutils.StartServer(t,
  1431  		base.TestServerArgs{
  1432  			Knobs: base.TestingKnobs{
  1433  				Store: &kvserver.StoreTestingKnobs{
  1434  					EvalKnobs: kvserverbase.BatchEvalTestingKnobs{
  1435  						TestingEvalFilter: cmdFilter,
  1436  					},
  1437  				},
  1438  			},
  1439  		})
  1440  	s := srv.(*server.TestServer)
  1441  	defer s.Stopper().Stop(context.Background())
  1442  
  1443  	// Send the lease request.
  1444  	key := roachpb.Key("a")
  1445  	leaseReq := roachpb.LeaseInfoRequest{
  1446  		RequestHeader: roachpb.RequestHeader{
  1447  			Key: key,
  1448  		},
  1449  	}
  1450  	_, pErr := kv.SendWrappedWith(
  1451  		context.Background(),
  1452  		s.DB().NonTransactionalSender(),
  1453  		roachpb.Header{UserPriority: 42},
  1454  		&leaseReq,
  1455  	)
  1456  	if !testutils.IsPError(pErr, "injected error") {
  1457  		t.Fatalf("expected error %q, got: %s", "injected error", pErr)
  1458  	}
  1459  }
  1460  
  1461  func TestRangeInfo(t *testing.T) {
  1462  	defer leaktest.AfterTest(t)()
  1463  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
  1464  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1465  	storeCfg.Clock = nil // manual clock
  1466  	mtc := &multiTestContext{
  1467  		storeConfig: &storeCfg,
  1468  		// This test was written before the multiTestContext started creating many
  1469  		// system ranges at startup, and hasn't been updated to take that into
  1470  		// account.
  1471  		startWithSingleRange: true,
  1472  	}
  1473  	defer mtc.Stop()
  1474  	mtc.Start(t, 2)
  1475  
  1476  	// Up-replicate to two replicas.
  1477  	mtc.replicateRange(mtc.stores[0].LookupReplica(roachpb.RKeyMin).RangeID, 1)
  1478  
  1479  	// Split the key space at key "a".
  1480  	splitKey := roachpb.RKey("a")
  1481  	splitArgs := adminSplitArgs(splitKey.AsRawKey())
  1482  	if _, pErr := kv.SendWrapped(
  1483  		context.Background(), mtc.stores[0].TestSender(), splitArgs,
  1484  	); pErr != nil {
  1485  		t.Fatal(pErr)
  1486  	}
  1487  
  1488  	// Get the replicas for each side of the split. This is done within
  1489  	// a SucceedsSoon loop to ensure the split completes.
  1490  	var lhsReplica0, lhsReplica1, rhsReplica0, rhsReplica1 *kvserver.Replica
  1491  	testutils.SucceedsSoon(t, func() error {
  1492  		lhsReplica0 = mtc.stores[0].LookupReplica(roachpb.RKeyMin)
  1493  		lhsReplica1 = mtc.stores[1].LookupReplica(roachpb.RKeyMin)
  1494  		rhsReplica0 = mtc.stores[0].LookupReplica(splitKey)
  1495  		rhsReplica1 = mtc.stores[1].LookupReplica(splitKey)
  1496  		if lhsReplica0 == rhsReplica0 || lhsReplica1 == rhsReplica1 {
  1497  			return errors.Errorf("replicas not post-split %v, %v, %v, %v",
  1498  				lhsReplica0, lhsReplica1, rhsReplica0, rhsReplica1)
  1499  		}
  1500  		return nil
  1501  	})
  1502  	lhsLease, _ := lhsReplica0.GetLease()
  1503  	rhsLease, _ := rhsReplica0.GetLease()
  1504  
  1505  	// Verify range info is not set if unrequested.
  1506  	getArgs := getArgs(splitKey.AsRawKey())
  1507  	reply, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], getArgs)
  1508  	if pErr != nil {
  1509  		t.Fatal(pErr)
  1510  	}
  1511  	if len(reply.Header().RangeInfos) > 0 {
  1512  		t.Errorf("expected empty range infos if unrequested; got %v", reply.Header().RangeInfos)
  1513  	}
  1514  
  1515  	// Verify range info on a get request.
  1516  	h := roachpb.Header{
  1517  		ReturnRangeInfo: true,
  1518  	}
  1519  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, getArgs)
  1520  	if pErr != nil {
  1521  		t.Fatal(pErr)
  1522  	}
  1523  	expRangeInfos := []roachpb.RangeInfo{
  1524  		{
  1525  			Desc:  *rhsReplica0.Desc(),
  1526  			Lease: rhsLease,
  1527  		},
  1528  	}
  1529  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1530  		t.Errorf("on get reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1531  	}
  1532  
  1533  	// Verify range info on a put request.
  1534  	putArgs := putArgs(splitKey.AsRawKey(), []byte("foo"))
  1535  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, putArgs)
  1536  	if pErr != nil {
  1537  		t.Fatal(pErr)
  1538  	}
  1539  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1540  		t.Errorf("on put reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1541  	}
  1542  
  1543  	// Verify range info on an admin request.
  1544  	adminArgs := &roachpb.AdminTransferLeaseRequest{
  1545  		RequestHeader: roachpb.RequestHeader{
  1546  			Key: splitKey.AsRawKey(),
  1547  		},
  1548  		Target: rhsLease.Replica.StoreID,
  1549  	}
  1550  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, adminArgs)
  1551  	if pErr != nil {
  1552  		t.Fatal(pErr)
  1553  	}
  1554  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1555  		t.Errorf("on admin reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1556  	}
  1557  
  1558  	// Verify multiple range infos on a scan request.
  1559  	scanArgs := roachpb.ScanRequest{
  1560  		RequestHeader: roachpb.RequestHeader{
  1561  			Key:    keys.SystemMax,
  1562  			EndKey: roachpb.KeyMax,
  1563  		},
  1564  	}
  1565  	txn := roachpb.MakeTransaction("test", roachpb.KeyMin, 1, mtc.clock().Now(), 0)
  1566  	h.Txn = &txn
  1567  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, &scanArgs)
  1568  	if pErr != nil {
  1569  		t.Fatal(pErr)
  1570  	}
  1571  	expRangeInfos = []roachpb.RangeInfo{
  1572  		{
  1573  			Desc:  *lhsReplica0.Desc(),
  1574  			Lease: lhsLease,
  1575  		},
  1576  		{
  1577  			Desc:  *rhsReplica0.Desc(),
  1578  			Lease: rhsLease,
  1579  		},
  1580  	}
  1581  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1582  		t.Errorf("on scan reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1583  	}
  1584  
  1585  	// Verify multiple range infos and order on a reverse scan request.
  1586  	revScanArgs := roachpb.ReverseScanRequest{
  1587  		RequestHeader: roachpb.RequestHeader{
  1588  			Key:    keys.SystemMax,
  1589  			EndKey: roachpb.KeyMax,
  1590  		},
  1591  	}
  1592  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, &revScanArgs)
  1593  	if pErr != nil {
  1594  		t.Fatal(pErr)
  1595  	}
  1596  	expRangeInfos = []roachpb.RangeInfo{
  1597  		{
  1598  			Desc:  *rhsReplica0.Desc(),
  1599  			Lease: rhsLease,
  1600  		},
  1601  		{
  1602  			Desc:  *lhsReplica0.Desc(),
  1603  			Lease: lhsLease,
  1604  		},
  1605  	}
  1606  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1607  		t.Errorf("on reverse scan reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1608  	}
  1609  
  1610  	// Change lease holders for both ranges and re-scan.
  1611  	for _, r := range []*kvserver.Replica{lhsReplica1, rhsReplica1} {
  1612  		replDesc, err := r.GetReplicaDescriptor()
  1613  		if err != nil {
  1614  			t.Fatal(err)
  1615  		}
  1616  		if err = mtc.dbs[0].AdminTransferLease(context.Background(),
  1617  			r.Desc().StartKey.AsRawKey(), replDesc.StoreID); err != nil {
  1618  			t.Fatalf("unable to transfer lease to replica %s: %+v", r, err)
  1619  		}
  1620  	}
  1621  	reply, pErr = kv.SendWrappedWith(context.Background(), mtc.distSenders[0], h, &scanArgs)
  1622  	if pErr != nil {
  1623  		t.Fatal(pErr)
  1624  	}
  1625  	// Read the expected lease from replica0 rather than replica1 as it may serve
  1626  	// a follower read which will contain the new lease information before
  1627  	// replica1 has applied the lease transfer.
  1628  	lhsLease, _ = lhsReplica0.GetLease()
  1629  	rhsLease, _ = rhsReplica0.GetLease()
  1630  	expRangeInfos = []roachpb.RangeInfo{
  1631  		{
  1632  			Desc:  *lhsReplica1.Desc(),
  1633  			Lease: lhsLease,
  1634  		},
  1635  		{
  1636  			Desc:  *rhsReplica1.Desc(),
  1637  			Lease: rhsLease,
  1638  		},
  1639  	}
  1640  	if !reflect.DeepEqual(reply.Header().RangeInfos, expRangeInfos) {
  1641  		t.Errorf("on scan reply, expected %+v; got %+v", expRangeInfos, reply.Header().RangeInfos)
  1642  	}
  1643  }
  1644  
  1645  // TestDrainRangeRejection verifies that an attempt to transfer a range to a
  1646  // draining store fails.
  1647  func TestDrainRangeRejection(t *testing.T) {
  1648  	defer leaktest.AfterTest(t)()
  1649  	mtc := &multiTestContext{}
  1650  	defer mtc.Stop()
  1651  	mtc.Start(t, 2)
  1652  
  1653  	repl, err := mtc.stores[0].GetReplica(1)
  1654  	if err != nil {
  1655  		t.Fatal(err)
  1656  	}
  1657  
  1658  	drainingIdx := 1
  1659  	mtc.stores[drainingIdx].SetDraining(true, nil /* reporter */)
  1660  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA,
  1661  		roachpb.ReplicationTarget{
  1662  			NodeID:  mtc.idents[drainingIdx].NodeID,
  1663  			StoreID: mtc.idents[drainingIdx].StoreID,
  1664  		})
  1665  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); !testutils.IsError(err, "store is draining") {
  1666  		t.Fatalf("unexpected error: %+v", err)
  1667  	}
  1668  }
  1669  
  1670  func TestChangeReplicasGeneration(t *testing.T) {
  1671  	defer leaktest.AfterTest(t)()
  1672  	mtc := &multiTestContext{}
  1673  	defer mtc.Stop()
  1674  	mtc.Start(t, 2)
  1675  
  1676  	repl, err := mtc.stores[0].GetReplica(1)
  1677  	if err != nil {
  1678  		t.Fatal(err)
  1679  	}
  1680  
  1681  	oldGeneration := repl.Desc().Generation
  1682  	chgs := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, roachpb.ReplicationTarget{
  1683  		NodeID:  mtc.idents[1].NodeID,
  1684  		StoreID: mtc.idents[1].StoreID,
  1685  	})
  1686  	if _, err := repl.ChangeReplicas(context.Background(), repl.Desc(), kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeUnderReplicated, "", chgs); err != nil {
  1687  		t.Fatalf("unexpected error: %v", err)
  1688  	}
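        	// Expected generation changes for the addition (mirroring the removal
        	// comment below):
        	//  +1 for adding the learner
        	//  +1 for promoting the learner to a voter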
  1689  	assert.EqualValues(t, repl.Desc().Generation, oldGeneration+2)
  1690  
  1691  	oldGeneration = repl.Desc().Generation
  1692  	oldDesc := repl.Desc()
  1693  	chgs[0].ChangeType = roachpb.REMOVE_REPLICA
  1694  	newDesc, err := repl.ChangeReplicas(context.Background(), oldDesc, kvserver.SnapshotRequest_REBALANCE, kvserverpb.ReasonRangeOverReplicated, "", chgs)
  1695  	if err != nil {
  1696  		t.Fatalf("unexpected error: %v", err)
  1697  	}
  1698  	// Generation changes:
  1699  	// +1 for entering joint config due to demotion
  1700  	// +1 for transitioning out of joint config
  1701  	// +1 for removing learner
  1702  	assert.EqualValues(t, repl.Desc().Generation, oldGeneration+3, "\nold: %+v\nnew: %+v", oldDesc, newDesc)
  1703  }
  1704  
  1705  func TestSystemZoneConfigs(t *testing.T) {
  1706  	defer leaktest.AfterTest(t)()
  1707  
  1708  	// This test is relatively slow and resource intensive. When run under
  1709  	// stressrace on a loaded machine (as in the nightly tests), sometimes the
  1710  	// SucceedsSoon conditions below take longer than the allotted time (#25273).
  1711  	if testing.Short() || testutils.NightlyStress() || util.RaceEnabled {
  1712  		t.Skip()
  1713  	}
  1714  
  1715  	// This test relies on concurrently waiting for a value to change in the
  1716  	// underlying engine(s). Since the teeing engine does not respond well to
  1717  	// value mismatches, whether transient or permanent, skip this test if the
  1718  	// teeing engine is being used. See
  1719  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  1720  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  1721  		t.Skip("disabled on teeing engine")
  1722  	}
  1723  
  1724  	ctx := context.Background()
  1725  	tc := testcluster.StartTestCluster(t, 7, base.TestClusterArgs{
  1726  		ServerArgs: base.TestServerArgs{
  1727  			Knobs: base.TestingKnobs{
  1728  				Store: &kvserver.StoreTestingKnobs{
  1729  					// Disable LBS because when the scan is happening at the rate it's happening
  1730  					// below, it's possible that one of the system ranges triggers a split.
  1731  					DisableLoadBasedSplitting: true,
  1732  				},
  1733  			},
  1734  			// Scan like a bat out of hell to ensure replication and replica GC
  1735  			// happen in a timely manner.
  1736  			ScanInterval: 50 * time.Millisecond,
  1737  		},
  1738  	})
  1739  	defer tc.Stopper().Stop(ctx)
  1740  	log.Info(ctx, "TestSystemZoneConfig: test cluster started")
  1741  
  1742  	expectedSystemRanges, err := tc.Servers[0].ExpectedInitialRangeCount()
  1743  	if err != nil {
  1744  		t.Fatal(err)
  1745  	}
  1746  	expectedUserRanges := 1
  1747  	expectedSystemRanges -= expectedUserRanges
  1748  	systemNumReplicas := int(*zonepb.DefaultSystemZoneConfig().NumReplicas)
  1749  	userNumReplicas := int(*zonepb.DefaultZoneConfig().NumReplicas)
  1750  	expectedReplicas := expectedSystemRanges*systemNumReplicas + expectedUserRanges*userNumReplicas
  1751  	log.Infof(ctx, "TestSystemZoneConfig: expecting %d system ranges and %d user ranges",
  1752  		expectedSystemRanges, expectedUserRanges)
  1753  	log.Infof(ctx, "TestSystemZoneConfig: expected (%dx%d) + (%dx%d) = %d replicas total",
  1754  		expectedSystemRanges, systemNumReplicas, expectedUserRanges, userNumReplicas, expectedReplicas)
  1755  
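        	// waitForReplicas scans each server's engine for range descriptors,
        	// rejects any descriptor that still lists learners or disagrees with what
        	// another store reported, and checks that the total voter count matches
        	// expectedReplicas.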
  1756  	waitForReplicas := func() error {
  1757  		replicas := make(map[roachpb.RangeID]roachpb.RangeDescriptor)
  1758  		for _, s := range tc.Servers {
  1759  			if err := kvserver.IterateRangeDescriptors(ctx, s.Engines()[0], func(desc roachpb.RangeDescriptor) (bool, error) {
  1760  				if len(desc.Replicas().Learners()) > 0 {
  1761  					return false, fmt.Errorf("descriptor contains learners: %v", desc)
  1762  				}
  1763  				if existing, ok := replicas[desc.RangeID]; ok && !existing.Equal(desc) {
  1764  					return false, fmt.Errorf("mismatch between\n%s\n%s", &existing, &desc)
  1765  				}
  1766  				replicas[desc.RangeID] = desc
  1767  				return false, nil
  1768  			}); err != nil {
  1769  				return err
  1770  			}
  1771  		}
  1772  		var totalReplicas int
  1773  		for _, desc := range replicas {
  1774  			totalReplicas += len(desc.Replicas().Voters())
  1775  		}
  1776  		if totalReplicas != expectedReplicas {
  1777  			return fmt.Errorf("got %d voters, want %d; details: %+v", totalReplicas, expectedReplicas, replicas)
  1778  		}
  1779  		return nil
  1780  	}
  1781  
  1782  	// Wait until we're down to the expected number of replicas. This is
  1783  	// effectively waiting on replica GC to kick in to destroy any replicas that
  1784  	// got removed during rebalancing of the initial ranges, since the testcluster
  1785  	// waits until nothing is underreplicated but not until all rebalancing has
  1786  	// settled down.
  1787  	testutils.SucceedsSoon(t, waitForReplicas)
  1788  	log.Info(ctx, "TestSystemZoneConfig: initial replication succeeded")
  1789  
  1790  	// Update the meta zone config to have more replicas and expect the number
  1791  	// of replicas to go up accordingly after running all replicas through the
  1792  	// replicate queue.
  1793  	sqlDB := sqlutils.MakeSQLRunner(tc.ServerConn(0))
  1794  	sqlutils.SetZoneConfig(t, sqlDB, "RANGE meta", "num_replicas: 7")
  1795  	expectedReplicas += 2
  1796  	testutils.SucceedsSoon(t, waitForReplicas)
  1797  	log.Info(ctx, "TestSystemZoneConfig: up-replication of meta ranges succeeded")
  1798  
  1799  	// Do the same thing, but down-replicating the timeseries range.
  1800  	sqlutils.SetZoneConfig(t, sqlDB, "RANGE timeseries", "num_replicas: 1")
  1801  	expectedReplicas -= 2
  1802  	testutils.SucceedsSoon(t, waitForReplicas)
  1803  	log.Info(ctx, "TestSystemZoneConfig: down-replication of timeseries ranges succeeded")
  1804  
  1805  	// Up-replicate the system.jobs table to demonstrate that it is configured
  1806  	// independently from the system database.
  1807  	sqlutils.SetZoneConfig(t, sqlDB, "TABLE system.jobs", "num_replicas: 7")
  1808  	expectedReplicas += 2
  1809  	testutils.SucceedsSoon(t, waitForReplicas)
  1810  	log.Info(ctx, "TestSystemZoneConfig: up-replication of jobs table succeeded")
  1811  
  1812  	// Finally, verify the system ranges. Note that in a new cluster there are
  1813  	// two system ranges, which we have to take into account here.
  1814  	sqlutils.SetZoneConfig(t, sqlDB, "RANGE system", "num_replicas: 7")
  1815  	expectedReplicas += 4
  1816  	testutils.SucceedsSoon(t, waitForReplicas)
  1817  	log.Info(ctx, "TestSystemZoneConfig: up-replication of system ranges succeeded")
  1818  }
  1819  
  1820  func TestClearRange(t *testing.T) {
  1821  	defer leaktest.AfterTest(t)()
  1822  
  1823  	ctx := context.Background()
  1824  	stopper := stop.NewStopper()
  1825  	defer stopper.Stop(ctx)
  1826  	store := createTestStoreWithConfig(t, stopper, kvserver.TestStoreConfig(nil))
  1827  
  1828  	clearRange := func(start, end roachpb.Key) {
  1829  		t.Helper()
  1830  		if _, err := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), &roachpb.ClearRangeRequest{
  1831  			RequestHeader: roachpb.RequestHeader{
  1832  				Key:    start,
  1833  				EndKey: end,
  1834  			},
  1835  		}); err != nil {
  1836  			t.Fatal(err)
  1837  		}
  1838  	}
  1839  
  1840  	verifyKeysWithPrefix := func(prefix roachpb.Key, expectedKeys []roachpb.Key) {
  1841  		t.Helper()
  1842  		start := prefix
  1843  		end := prefix.PrefixEnd()
  1844  		kvs, err := storage.Scan(store.Engine(), start, end, 0 /* maxRows */)
  1845  		if err != nil {
  1846  			t.Fatal(err)
  1847  		}
  1848  		var actualKeys []roachpb.Key
  1849  		for _, kv := range kvs {
  1850  			actualKeys = append(actualKeys, kv.Key.Key)
  1851  		}
  1852  		if !reflect.DeepEqual(expectedKeys, actualKeys) {
  1853  			t.Fatalf("expected %v, but got %v", expectedKeys, actualKeys)
  1854  		}
  1855  	}
  1856  
  1857  	rng, _ := randutil.NewPseudoRand()
  1858  
  1859  	// Write three keys with values small enough to use individual deletions
  1860  	// (sm1-sm3) and three keys with values large enough to require a range
  1861  	// deletion tombstone (lg1-lg3).
  1862  	sm, sm1, sm2, sm3 := roachpb.Key("sm"), roachpb.Key("sm1"), roachpb.Key("sm2"), roachpb.Key("sm3")
  1863  	lg, lg1, lg2, lg3 := roachpb.Key("lg"), roachpb.Key("lg1"), roachpb.Key("lg2"), roachpb.Key("lg3")
  1864  	for _, key := range []roachpb.Key{sm1, sm2, sm3} {
  1865  		if err := store.DB().Put(ctx, key, "sm-val"); err != nil {
  1866  			t.Fatal(err)
  1867  		}
  1868  	}
  1869  	for _, key := range []roachpb.Key{lg1, lg2, lg3} {
  1870  		if err := store.DB().Put(
  1871  			ctx, key, randutil.RandBytes(rng, batcheval.ClearRangeBytesThreshold),
  1872  		); err != nil {
  1873  			t.Fatal(err)
  1874  		}
  1875  	}
  1876  	verifyKeysWithPrefix(sm, []roachpb.Key{sm1, sm2, sm3})
  1877  	verifyKeysWithPrefix(lg, []roachpb.Key{lg1, lg2, lg3})
  1878  
  1879  	// Verify that a ClearRange request from [sm1, sm3) removes sm1 and sm2.
  1880  	clearRange(sm1, sm3)
  1881  	verifyKeysWithPrefix(sm, []roachpb.Key{sm3})
  1882  
  1883  	// Verify that a ClearRange request from [lg1, lg3) removes lg1 and lg2.
  1884  	clearRange(lg1, lg3)
  1885  	verifyKeysWithPrefix(lg, []roachpb.Key{lg3})
  1886  
  1887  	// Verify that only the large ClearRange request used a range deletion
  1888  	// tombstone by checking for the presence of a suggested compaction.
  1889  	verifyKeysWithPrefix(keys.LocalStoreSuggestedCompactionsMin,
  1890  		[]roachpb.Key{keys.StoreSuggestedCompactionKey(lg1, lg3)})
  1891  }
  1892  
  1893  // TestLeaseTransferInSnapshotUpdatesTimestampCache prevents a regression of
  1894  // #34025. A Replica is targeted for a lease transfer when it needs a
  1895  // Raft snapshot to catch up. Normally we try to prevent this case, but it is
  1896  // possible and hard to prevent entirely. The Replica will only learn that it is
  1897  // the new leaseholder when it applies the snapshot. When doing so, it should
  1898  // make sure to apply the lease-related side-effects to its in-memory state.
  1899  func TestLeaseTransferInSnapshotUpdatesTimestampCache(t *testing.T) {
  1900  	defer leaktest.AfterTest(t)()
  1901  
  1902  	ctx := context.Background()
  1903  	sc := kvserver.TestStoreConfig(nil)
  1904  	// We'll control replication by hand.
  1905  	sc.TestingKnobs.DisableReplicateQueue = true
  1906  	// Avoid fighting with the merge queue while trying to reproduce this race.
  1907  	sc.TestingKnobs.DisableMergeQueue = true
  1908  	mtc := &multiTestContext{storeConfig: &sc}
  1909  	defer mtc.Stop()
  1910  	mtc.Start(t, 3)
  1911  	store2 := mtc.Store(2)
  1912  
  1913  	keyA := roachpb.Key("a")
  1914  	keyB := roachpb.Key("b")
  1915  	keyC := roachpb.Key("c")
  1916  
  1917  	// First, do a couple of writes; we'll use these to determine when
  1918  	// the dust has settled.
  1919  	incA := incrementArgs(keyA, 1)
  1920  	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), incA); pErr != nil {
  1921  		t.Fatal(pErr)
  1922  	}
  1923  	incC := incrementArgs(keyC, 2)
  1924  	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), incC); pErr != nil {
  1925  		t.Fatal(pErr)
  1926  	}
  1927  
  1928  	// Split the system range from the rest of the keyspace.
  1929  	splitArgs := adminSplitArgs(keys.SystemMax)
  1930  	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), splitArgs); pErr != nil {
  1931  		t.Fatal(pErr)
  1932  	}
  1933  
  1934  	// Get the range's ID.
  1935  	repl0 := mtc.stores[0].LookupReplica(roachpb.RKey(keyA))
  1936  	rangeID := repl0.RangeID
  1937  
  1938  	// Replicate the range onto nodes 1 and 2.
  1939  	// Wait for all replicas to be caught up.
  1940  	mtc.replicateRange(rangeID, 1, 2)
  1941  	mtc.waitForValues(keyA, []int64{1, 1, 1})
  1942  	mtc.waitForValues(keyC, []int64{2, 2, 2})
  1943  
  1944  	// Create a transaction that will try to write "under" a served read.
  1945  	// The read will have been served by the original leaseholder (node 0)
  1946  	// and the write will be attempted on the new leaseholder (node 2).
  1947  	// It should not succeed because it should run into the timestamp cache.
  1948  	db := mtc.dbs[0]
  1949  	txnOld := kv.NewTxn(ctx, db, 0 /* gatewayNodeID */)
  1950  
  1951  	// Perform a write with txnOld so that its timestamp gets set.
  1952  	if _, err := txnOld.Inc(ctx, keyB, 3); err != nil {
  1953  		t.Fatal(err)
  1954  	}
  1955  
  1956  	// Read keyC with txnOld, which is updated below. This prevents the
  1957  	// transaction from refreshing when it hits the serializable error.
  1958  	if _, err := txnOld.Get(ctx, keyC); err != nil {
  1959  		t.Fatal(err)
  1960  	}
  1961  
  1962  	// Ensure that the transaction sends its first heartbeat so that it creates
  1963  	// its transaction record and doesn't run into trouble with the low water
  1964  	// mark of the new leaseholder's timestamp cache. Amusingly, if the bug
  1965  	// we're regression testing against here still existed, we would not have
  1966  	// to do this.
  1967  	hb, hbH := heartbeatArgs(txnOld.TestingCloneTxn(), mtc.clock().Now())
  1968  	if _, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), hbH, hb); pErr != nil {
  1969  		t.Fatal(pErr)
  1970  	}
  1971  
  1972  	// Another client comes along at a higher timestamp and reads. We should
  1973  	// never be able to write under this time or we would be rewriting history.
  1974  	if _, err := db.Get(ctx, keyA); err != nil {
  1975  		t.Fatal(err)
  1976  	}
  1977  
  1978  	// Partition node 2 from the rest of its range. Once partitioned, perform
  1979  	// another write and truncate the Raft log on the two connected nodes. This
  1980  	// ensures that when node 2 comes back up it will require a snapshot
  1981  	// from Raft.
  1982  	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
  1983  		rangeID:            rangeID,
  1984  		RaftMessageHandler: store2,
  1985  	})
  1986  
  1987  	if _, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), incC); pErr != nil {
  1988  		t.Fatal(pErr)
  1989  	}
  1990  	mtc.waitForValues(keyC, []int64{4, 4, 2})
  1991  
  1992  	// Truncate the log at index+1 (log entries below index+1 are removed, so this
  1993  	// includes the increment). This necessitates a snapshot when the
  1994  	// partitioned replica rejoins the rest of the range.
  1995  	index, err := repl0.GetLastIndex()
  1996  	if err != nil {
  1997  		t.Fatal(err)
  1998  	}
  1999  	truncArgs := truncateLogArgs(index+1, rangeID)
  2000  	truncArgs.Key = keyA
  2001  	if _, err := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), truncArgs); err != nil {
  2002  		t.Fatal(err)
  2003  	}
  2004  
  2005  	// Finally, transfer the lease to node 2 while it is still unavailable and
  2006  	// behind. We try to avoid this case when picking new leaseholders in practice,
  2007  	// but we're never 100% successful.
  2008  	if err := repl0.AdminTransferLease(ctx, store2.Ident.StoreID); err != nil {
  2009  		t.Fatal(err)
  2010  	}
  2011  
  2012  	// Remove the partition. A snapshot to node 2 should follow. This snapshot
  2013  	// will inform node 2 that it is the new leaseholder for the range. Node 2
  2014  	// should act accordingly and update its internal state to reflect this.
  2015  	mtc.transport.Listen(store2.Ident.StoreID, store2)
  2016  	mtc.waitForValues(keyC, []int64{4, 4, 4})
  2017  
  2018  	// Perform a write on the new leaseholder underneath the previously served
  2019  	// read. This write should hit the timestamp cache and flag the txn for a
  2020  	// restart when we try to commit it below. With the bug in #34025, the new
  2021  	// leaseholder who heard about the lease transfer from a snapshot had an
  2022  	// empty timestamp cache and would simply let us write under the previous
  2023  	// read.
  2024  	if _, err := txnOld.Inc(ctx, keyA, 4); err != nil {
  2025  		t.Fatal(err)
  2026  	}
  2027  	const exp = `TransactionRetryError: retry txn \(RETRY_SERIALIZABLE\)`
  2028  	if err := txnOld.Commit(ctx); !testutils.IsError(err, exp) {
  2029  		t.Fatalf("expected retry error, got: %v; did we write under a read?", err)
  2030  	}
  2031  }
  2032  
  2033  // TestConcurrentAdminChangeReplicasRequests ensures that when two attempts to
  2034  // change replicas for a range race, only one will succeed.
  2035  func TestConcurrentAdminChangeReplicasRequests(t *testing.T) {
  2036  	defer leaktest.AfterTest(t)()
  2037  	// With 5 nodes the test is set up to have 2 actors trying to change the
  2038  	// replication concurrently. The first one attempts to change the replication
  2039  	// from [1] to [1, 2, 3, 4], and the second one starts by assuming that the
  2040  	// first actor's initial request succeeded, expects [1, 2], and tries
  2041  	// to move the replication to [1, 2, 4, 5]. One of these actors should
  2042  	// succeed.
  2043  	const numNodes = 5
  2044  	tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
  2045  		ReplicationMode: base.ReplicationManual,
  2046  	})
  2047  	ctx := context.Background()
  2048  	defer tc.Stopper().Stop(ctx)
  2049  	key := roachpb.Key("a")
  2050  	db := tc.Servers[0].DB()
  2051  	rangeInfo, err := getRangeInfo(ctx, db, key)
  2052  	require.Nil(t, err)
  2053  	require.Len(t, rangeInfo.Desc.InternalReplicas, 1)
  2054  	targets1, targets2 := makeReplicationTargets(2, 3, 4), makeReplicationTargets(4, 5)
  2055  	expects1 := rangeInfo.Desc
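        	// expects2 models the descriptor as it would look if the first actor's
        	// initial addition of a replica on node 2 had already applied (the
        	// scenario described in the comment at the top of the test).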
  2056  	expects2 := rangeInfo.Desc
  2057  	expects2.InternalReplicas = append(expects2.InternalReplicas, roachpb.ReplicaDescriptor{
  2058  		NodeID:    2,
  2059  		StoreID:   2,
  2060  		ReplicaID: expects2.NextReplicaID,
  2061  	})
  2062  	expects2.NextReplicaID++
  2063  	var err1, err2 error
  2064  	var res1, res2 *roachpb.RangeDescriptor
  2065  	var wg sync.WaitGroup
  2066  	wg.Add(2)
  2067  	go func() {
  2068  		res1, err1 = db.AdminChangeReplicas(
  2069  			ctx, key, expects1, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, targets1...))
  2070  		wg.Done()
  2071  	}()
  2072  	go func() {
  2073  		res2, err2 = db.AdminChangeReplicas(
  2074  			ctx, key, expects2, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, targets2...))
  2075  		wg.Done()
  2076  	}()
  2077  	wg.Wait()
  2078  
  2079  	infoAfter, err := getRangeInfo(ctx, db, key)
  2080  	require.Nil(t, err)
  2081  
  2082  	assert.Falsef(t, err1 == nil && err2 == nil,
  2083  		"expected one of racing AdminChangeReplicasRequests to fail but neither did")
  2084  	// It is possible that an error can occur due to a rejected snapshot from the
  2085  	// target range. We don't want to fail the test if we got one of those.
  2086  	isSnapshotErr := func(err error) bool {
  2087  		return testutils.IsError(err, "snapshot failed:")
  2088  	}
  2089  	atLeastOneIsSnapshotErr := isSnapshotErr(err1) || isSnapshotErr(err2)
  2090  	assert.Falsef(t, err1 != nil && err2 != nil && !atLeastOneIsSnapshotErr,
  2091  		"expected only one of racing AdminChangeReplicasRequests to fail but both "+
  2092  			"had errors and neither were snapshot: %v %v", err1, err2)
  2093  	replicaNodeIDs := func(desc roachpb.RangeDescriptor) (ids []int) {
  2094  		for _, r := range desc.InternalReplicas {
  2095  			ids = append(ids, int(r.NodeID))
  2096  		}
  2097  		return ids
  2098  	}
  2099  	if err1 == nil {
  2100  		assert.ElementsMatch(t, replicaNodeIDs(infoAfter.Desc), []int{1, 2, 3, 4})
  2101  		assert.EqualValues(t, infoAfter.Desc, *res1)
  2102  	} else if err2 == nil {
  2103  		assert.ElementsMatch(t, replicaNodeIDs(infoAfter.Desc), []int{1, 2, 4, 5})
  2104  		assert.EqualValues(t, infoAfter.Desc, *res2)
  2105  	}
  2106  }
  2107  
  2108  // TestRandomConcurrentAdminChangeReplicasRequests ensures that when multiple
  2109  // AdminChangeReplicasRequests are issued concurrently, so long as requests
  2110  // provide the value of the RangeDescriptor, they will not accidentally
  2111  // perform replication changes. In particular this test runs a number of
  2112  // concurrent actors which all use the same expectations of the RangeDescriptor
  2113  // and verifies that at most one actor succeeds in making all of its changes.
  2114  func TestRandomConcurrentAdminChangeReplicasRequests(t *testing.T) {
  2115  	defer leaktest.AfterTest(t)()
  2116  	const numNodes = 6
  2117  	tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
  2118  		ReplicationMode: base.ReplicationManual,
  2119  	})
  2120  	ctx := context.Background()
  2121  	defer tc.Stopper().Stop(ctx)
  2122  	const actors = 10
  2123  	errors := make([]error, actors)
  2124  	var wg sync.WaitGroup
  2125  	key := roachpb.Key("a")
  2126  	db := tc.Servers[0].DB()
  2127  	require.Nil(t, db.AdminRelocateRange(ctx, key, makeReplicationTargets(1, 2, 3)))
  2128  	// Random targets consisting of a random number of nodes from the set of nodes
  2129  	// in the cluster which currently do not have a replica.
  2130  	pickTargets := func() []roachpb.ReplicationTarget {
  2131  		availableIDs := make([]int, 0, numNodes-3)
  2132  		for id := 4; id <= numNodes; id++ {
  2133  			availableIDs = append(availableIDs, id)
  2134  		}
  2135  		rand.Shuffle(len(availableIDs), func(i, j int) {
  2136  			availableIDs[i], availableIDs[j] = availableIDs[j], availableIDs[i]
  2137  		})
  2138  		n := rand.Intn(len(availableIDs)) + 1
  2139  		return makeReplicationTargets(availableIDs[:n]...)
  2140  	}
  2141  	// TODO(ajwerner): consider doing this read inside the addReplicas function
  2142  	// and then allowing multiple writes to overlap and validate that the state
  2143  	// corresponds to a valid history of events.
  2144  	rangeInfo, err := getRangeInfo(ctx, db, key)
  2145  	require.Nil(t, err)
  2146  	addReplicas := func() error {
  2147  		_, err := db.AdminChangeReplicas(
  2148  			ctx, key, rangeInfo.Desc, roachpb.MakeReplicationChanges(
  2149  				roachpb.ADD_REPLICA, pickTargets()...))
  2150  		return err
  2151  	}
  2152  	wg.Add(actors)
  2153  	for i := 0; i < actors; i++ {
  2154  		go func(i int) { errors[i] = addReplicas(); wg.Done() }(i)
  2155  	}
  2156  	wg.Wait()
  2157  	var gotSuccess bool
  2158  	for _, err := range errors {
  2159  		if err != nil {
  2160  			const exp = "change replicas of .* failed: descriptor changed" +
  2161  				"|snapshot failed:"
  2162  			assert.True(t, testutils.IsError(err, exp), err)
  2163  		} else if gotSuccess {
  2164  			t.Error("expected only one success")
  2165  		} else {
  2166  			gotSuccess = true
  2167  		}
  2168  	}
  2169  }
  2170  
  2171  // TestReplicaTombstone ensures that tombstones are written when we expect
  2172  // them to be. Tombstones are laid down when replicas are removed.
  2173  // Replicas are removed for several reasons:
  2174  //
  2175  //  (1)   In response to a ChangeReplicasTrigger which removes it.
  2176  //  (2)   In response to a ReplicaTooOldError from a sent raft message.
  2177  //  (3)   Due to the replica GC queue detecting a replica is not in the range.
  2178  //  (3.1) When the replica detects the range has been merged away.
  2179  //  (4)   Due to a raft message addressed to a newer replica ID.
  2180  //  (4.1) When the older replica is not initialized.
  2181  //  (5)   Due to a merge.
  2182  //  (6)   Due to snapshot which subsumes a range.
  2183  //
  2184  // This test creates all of these scenarios and ensures that tombstones are
  2185  // written at sane values.
  2186  func TestReplicaTombstone(t *testing.T) {
  2187  	defer leaktest.AfterTest(t)()
  2188  
  2189  	// This test relies on concurrently waiting for a value to change in the
  2190  	// underlying engine(s). Since the teeing engine does not respond well to
  2191  	// value mismatches, whether transient or permanent, skip this test if the
  2192  	// teeing engine is being used. See
  2193  	// https://github.com/cockroachdb/cockroach/issues/42656 for more context.
  2194  	if storage.DefaultStorageEngine == enginepb.EngineTypeTeePebbleRocksDB {
  2195  		t.Skip("disabled on teeing engine")
  2196  	}
  2197  
  2198  	t.Run("(1) ChangeReplicasTrigger", func(t *testing.T) {
  2199  		defer leaktest.AfterTest(t)()
  2200  		ctx := context.Background()
  2201  		tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
  2202  			ServerArgs: base.TestServerArgs{
  2203  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2204  					DisableReplicaGCQueue: true,
  2205  				}},
  2206  			},
  2207  			ReplicationMode: base.ReplicationManual,
  2208  		})
  2209  		defer tc.Stopper().Stop(ctx)
  2210  
  2211  		key := tc.ScratchRange(t)
  2212  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2213  		desc, err := tc.LookupRange(key)
  2214  		require.NoError(t, err)
  2215  		rangeID := desc.RangeID
  2216  		tc.AddReplicasOrFatal(t, key, tc.Target(1))
  2217  		// Partition node 2 from receiving responses but not requests.
  2218  		// This will lead to it applying the ChangeReplicasTrigger which removes
  2219  		// it rather than receiving a ReplicaTooOldError.
  2220  		store, _ := getFirstStoreReplica(t, tc.Server(1), key)
  2221  		funcs := noopRaftHandlerFuncs()
  2222  		funcs.dropResp = func(*kvserver.RaftMessageResponse) bool {
  2223  			return true
  2224  		}
  2225  		tc.Servers[1].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2226  			rangeID:                    desc.RangeID,
  2227  			RaftMessageHandler:         store,
  2228  			unreliableRaftHandlerFuncs: funcs,
  2229  		})
  2230  		tc.RemoveReplicasOrFatal(t, key, tc.Target(1))
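        		// The removed replica had replica ID 2, so we expect the tombstone to
        		// advertise NextReplicaID 3.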
  2231  		tombstone := waitForTombstone(t, store.Engine(), rangeID)
  2232  		require.Equal(t, roachpb.ReplicaID(3), tombstone.NextReplicaID)
  2233  	})
  2234  	t.Run("(2) ReplicaTooOldError", func(t *testing.T) {
  2235  		defer leaktest.AfterTest(t)()
  2236  		ctx := context.Background()
  2237  		tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2238  			ServerArgs: base.TestServerArgs{
  2239  				RaftConfig: base.RaftConfig{
  2240  					// Make the tick interval short so we don't need to wait too long for
  2241  					// the partitioned node to time out but increase the lease timeout
  2242  					// so expiration-based leases still work.
  2243  					RaftTickInterval:                        time.Millisecond,
  2244  					RangeLeaseRaftElectionTimeoutMultiplier: 10000,
  2245  				},
  2246  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2247  					DisableReplicaGCQueue: true,
  2248  				}},
  2249  			},
  2250  			ReplicationMode: base.ReplicationManual,
  2251  		})
  2252  		defer tc.Stopper().Stop(ctx)
  2253  
  2254  		key := tc.ScratchRange(t)
  2255  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2256  		desc, err := tc.LookupRange(key)
  2257  		require.NoError(t, err)
  2258  		rangeID := desc.RangeID
  2259  		tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
  2260  		require.NoError(t,
  2261  			tc.WaitForVoters(key, tc.Target(1), tc.Target(2)))
  2262  		store, repl := getFirstStoreReplica(t, tc.Server(2), key)
  2263  		// Partition the range such that it hears responses but does not hear
  2264  		// requests. It should destroy the local replica due to a
  2265  		// ReplicaTooOldError.
  2266  		sawTooOld := make(chan struct{}, 1)
  2267  		raftFuncs := noopRaftHandlerFuncs()
  2268  		raftFuncs.dropResp = func(resp *kvserver.RaftMessageResponse) bool {
  2269  			if pErr, ok := resp.Union.GetValue().(*roachpb.Error); ok {
  2270  				if _, isTooOld := pErr.GetDetail().(*roachpb.ReplicaTooOldError); isTooOld {
  2271  					select {
  2272  					case sawTooOld <- struct{}{}:
  2273  					default:
  2274  					}
  2275  				}
  2276  			}
  2277  			return false
  2278  		}
  2279  		raftFuncs.dropReq = func(req *kvserver.RaftMessageRequest) bool {
  2280  			return req.ToReplica.StoreID == store.StoreID()
  2281  		}
  2282  		tc.Servers[2].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2283  			rangeID:                    desc.RangeID,
  2284  			RaftMessageHandler:         store,
  2285  			unreliableRaftHandlerFuncs: raftFuncs,
  2286  		})
  2287  		tc.RemoveReplicasOrFatal(t, key, tc.Target(2))
  2288  		testutils.SucceedsSoon(t, func() error {
  2289  			repl.UnquiesceAndWakeLeader()
  2290  			if len(sawTooOld) == 0 {
  2291  				return errors.New("still haven't seen ReplicaTooOldError")
  2292  			}
  2293  			return nil
  2294  		})
  2295  		// Wait until we're sure that the replica has seen ReplicaTooOld,
  2296  		// then go look for the tombstone.
  2297  		<-sawTooOld
  2298  		tombstone := waitForTombstone(t, store.Engine(), rangeID)
  2299  		require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID)
  2300  	})
  2301  	t.Run("(3) ReplicaGCQueue", func(t *testing.T) {
  2302  		defer leaktest.AfterTest(t)()
  2303  
  2304  		ctx := context.Background()
  2305  		tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2306  			ServerArgs: base.TestServerArgs{
  2307  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2308  					DisableReplicaGCQueue: true,
  2309  				}},
  2310  			},
  2311  			ReplicationMode: base.ReplicationManual,
  2312  		})
  2313  		defer tc.Stopper().Stop(ctx)
  2314  
  2315  		key := tc.ScratchRange(t)
  2316  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2317  		desc, err := tc.LookupRange(key)
  2318  		require.NoError(t, err)
  2319  		rangeID := desc.RangeID
  2320  		tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
  2321  		// Partition node 2 from receiving any raft messages.
  2322  		// It will never find out it has been removed. We'll remove it
  2323  		// with a manual replica GC.
  2324  		store, _ := getFirstStoreReplica(t, tc.Server(2), key)
  2325  		tc.Servers[2].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2326  			rangeID:            desc.RangeID,
  2327  			RaftMessageHandler: store,
  2328  		})
  2329  		tc.RemoveReplicasOrFatal(t, key, tc.Target(2))
  2330  		repl, err := store.GetReplica(desc.RangeID)
  2331  		require.NoError(t, err)
  2332  		require.NoError(t, store.ManualReplicaGC(repl))
  2333  		tombstone := waitForTombstone(t, store.Engine(), rangeID)
  2334  		require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID)
  2335  	})
  2336  	// This case also detects the tombstone for nodes which processed the merge.
  2337  	t.Run("(3.1) (5) replica GC queue and merge", func(t *testing.T) {
  2338  		defer leaktest.AfterTest(t)()
  2339  
  2340  		ctx := context.Background()
  2341  		tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
  2342  			ServerArgs: base.TestServerArgs{
  2343  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2344  					DisableReplicaGCQueue: true,
  2345  				}},
  2346  			},
  2347  			ReplicationMode: base.ReplicationManual,
  2348  		})
  2349  		defer tc.Stopper().Stop(ctx)
  2350  
  2351  		key := tc.ScratchRange(t)
  2352  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2353  		tc.AddReplicasOrFatal(t, key, tc.Target(1))
  2354  		keyA := append(key[:len(key):len(key)], 'a')
  2355  		_, desc, err := tc.SplitRange(keyA)
  2356  		require.NoError(t, err)
  2357  		require.NoError(t, tc.WaitForSplitAndInitialization(keyA))
  2358  		tc.AddReplicasOrFatal(t, key, tc.Target(3))
  2359  		tc.AddReplicasOrFatal(t, keyA, tc.Target(2))
  2360  		rangeID := desc.RangeID
  2361  		// Partition node 2 from all raft communication.
  2362  		store, _ := getFirstStoreReplica(t, tc.Server(2), keyA)
  2363  		tc.Servers[2].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2364  			rangeID:            desc.RangeID,
  2365  			RaftMessageHandler: store,
  2366  		})
  2367  
  2368  		// We'll move the range from server 2 to 3 and merge key and keyA.
  2369  		// Server 2 won't hear about any of that.
  2370  		tc.RemoveReplicasOrFatal(t, keyA, tc.Target(2))
  2371  		tc.AddReplicasOrFatal(t, keyA, tc.Target(3))
  2372  		require.NoError(t, tc.WaitForSplitAndInitialization(keyA))
  2373  		require.NoError(t, tc.Server(0).DB().AdminMerge(ctx, key))
  2374  		// Run replica GC on server 2.
  2375  		repl, err := store.GetReplica(desc.RangeID)
  2376  		require.NoError(t, err)
  2377  		require.NoError(t, store.ManualReplicaGC(repl))
  2378  		// Verify the tombstone generated from replica GC of a merged range.
  2379  		tombstone := waitForTombstone(t, store.Engine(), rangeID)
  2380  		require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID)
  2381  		// Verify the tombstone generated from processing a merge trigger.
  2382  		store3, _ := getFirstStoreReplica(t, tc.Server(0), key)
  2383  		tombstone = waitForTombstone(t, store3.Engine(), rangeID)
  2384  		require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID)
  2385  	})
  2386  	t.Run("(4) (4.1) raft messages to newer replicaID ", func(t *testing.T) {
  2387  		defer leaktest.AfterTest(t)()
  2388  		ctx := context.Background()
  2389  		tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2390  			ServerArgs: base.TestServerArgs{
  2391  				RaftConfig: base.RaftConfig{
  2392  					// Make the tick interval short so we don't need to wait too long
  2393  					// for a heartbeat to be sent. Increase the election timeout so
  2394  					// expiration based leases still work.
  2395  					RaftTickInterval:                        time.Millisecond,
  2396  					RangeLeaseRaftElectionTimeoutMultiplier: 10000,
  2397  				},
  2398  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2399  					DisableReplicaGCQueue: true,
  2400  				}},
  2401  			},
  2402  			ReplicationMode: base.ReplicationManual,
  2403  		})
  2404  		defer tc.Stopper().Stop(ctx)
  2405  
  2406  		key := tc.ScratchRange(t)
  2407  		desc, err := tc.LookupRange(key)
  2408  		require.NoError(t, err)
  2409  		rangeID := desc.RangeID
  2410  		tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
  2411  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2412  		store, repl := getFirstStoreReplica(t, tc.Server(2), key)
  2413  		// Set up a partition for everything but heartbeats on store 2.
  2414  		// Make ourselves a tool to block snapshots until we've heard a
  2415  		// heartbeat above a certain replica ID.
  2416  		var waiter struct {
  2417  			syncutil.Mutex
  2418  			sync.Cond
  2419  			minHeartbeatReplicaID roachpb.ReplicaID
  2420  			blockSnapshot         bool
  2421  		}
  2422  		waiter.L = &waiter.Mutex
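        		// waitForSnapshot blocks until a heartbeat addressed to at least
        		// minHeartbeatReplicaID has been observed (see recordHeartbeat).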
  2423  		waitForSnapshot := func() {
  2424  			waiter.Lock()
  2425  			defer waiter.Unlock()
  2426  			for waiter.blockSnapshot {
  2427  				waiter.Wait()
  2428  			}
  2429  		}
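        		// recordHeartbeat unblocks pending snapshots once a heartbeat for a
        		// sufficiently new replica ID has been seen.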
  2430  		recordHeartbeat := func(replicaID roachpb.ReplicaID) {
  2431  			waiter.Lock()
  2432  			defer waiter.Unlock()
  2433  			if waiter.blockSnapshot && replicaID >= waiter.minHeartbeatReplicaID {
  2434  				waiter.blockSnapshot = false
  2435  				waiter.Broadcast()
  2436  			}
  2437  		}
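        		// setMinHeartbeat re-arms the blocker: snapshots are held until a
        		// heartbeat for at least the given replica ID arrives.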
  2438  		setMinHeartbeat := func(replicaID roachpb.ReplicaID) {
  2439  			waiter.Lock()
  2440  			defer waiter.Unlock()
  2441  			waiter.minHeartbeatReplicaID = replicaID
  2442  			waiter.blockSnapshot = true
  2443  		}
  2444  		setMinHeartbeat(repl.ReplicaID() + 1)
  2445  		tc.Servers[2].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2446  			rangeID:            desc.RangeID,
  2447  			RaftMessageHandler: store,
  2448  			unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{
  2449  				dropResp: func(*kvserver.RaftMessageResponse) bool {
  2450  					return true
  2451  				},
  2452  				dropReq: func(*kvserver.RaftMessageRequest) bool {
  2453  					return true
  2454  				},
  2455  				dropHB: func(hb *kvserver.RaftHeartbeat) bool {
  2456  					recordHeartbeat(hb.ToReplicaID)
  2457  					return false
  2458  				},
  2459  				snapErr: func(*kvserver.SnapshotRequest_Header) error {
  2460  					waitForSnapshot()
  2461  					return errors.New("boom")
  2462  				},
  2463  			},
  2464  		})
  2465  		// Remove the current replica from the node, it will not hear about this.
  2466  		tc.RemoveReplicasOrFatal(t, key, tc.Target(2))
  2467  		// Try to add it back as a learner. We'll wait until it's heard about
  2468  		// this as a heartbeat. This demonstrates case (4) where a raft message
  2469  		// to a newer replica ID (in this case a heartbeat) removes an initialized
  2470  		// Replica.
  2471  		_, err = tc.AddReplicas(key, tc.Target(2))
  2472  		require.Regexp(t, "boom", err)
  2473  		tombstone := waitForTombstone(t, store.Engine(), rangeID)
  2474  		require.Equal(t, roachpb.ReplicaID(4), tombstone.NextReplicaID)
  2475  		// Try adding it again, and again block the snapshot until a heartbeat
  2476  		// at a higher ID has been sent. This is case (4.1) where a raft message
  2477  		// removes an uninitialized Replica.
  2478  		//
  2479  		// Note that this case represents a potential memory leak. If we hear about
  2480  		// a Replica and then either never receive a snapshot or for whatever reason
  2481  		// fail to receive one, and then never hear from the range again, we
  2482  		// may leak in-memory state about this replica.
  2483  		//
  2484  		// We could replica GC these replicas without too much extra work but they
  2485  		// also should be rare. Note this is not new with learner replicas.
  2486  		setMinHeartbeat(5)
  2487  		_, err = tc.AddReplicas(key, tc.Target(2))
  2488  		require.Regexp(t, "boom", err)
  2489  		// We will start out reading the old tombstone so keep retrying.
  2490  		testutils.SucceedsSoon(t, func() error {
  2491  			tombstone = waitForTombstone(t, store.Engine(), rangeID)
  2492  			if tombstone.NextReplicaID != 5 {
  2493  				return errors.Errorf("read tombstone with NextReplicaID %d, want %d",
  2494  					tombstone.NextReplicaID, 5)
  2495  			}
  2496  			return nil
  2497  		})
  2498  	})
  2499  	t.Run("(6) subsumption via snapshot", func(t *testing.T) {
  2500  		defer leaktest.AfterTest(t)()
  2501  
  2502  		ctx := context.Background()
  2503  		var proposalFilter atomic.Value
  2504  		noopProposalFilter := func(kvserverbase.ProposalFilterArgs) *roachpb.Error {
  2505  			return nil
  2506  		}
  2507  		proposalFilter.Store(noopProposalFilter)
  2508  		tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2509  			ServerArgs: base.TestServerArgs{
  2510  				Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2511  					DisableReplicaGCQueue: true,
  2512  					TestingProposalFilter: kvserverbase.ReplicaProposalFilter(
  2513  						func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  2514  							return proposalFilter.
  2515  								Load().(func(kvserverbase.ProposalFilterArgs) *roachpb.Error)(args)
  2516  						},
  2517  					),
  2518  				}},
  2519  			},
  2520  			ReplicationMode: base.ReplicationManual,
  2521  		})
  2522  		defer tc.Stopper().Stop(ctx)
  2523  
  2524  		key := tc.ScratchRange(t)
  2525  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2526  		tc.AddReplicasOrFatal(t, key, tc.Target(1), tc.Target(2))
  2527  		keyA := append(key[:len(key):len(key)], 'a')
  2528  		lhsDesc, rhsDesc, err := tc.SplitRange(keyA)
  2529  		require.NoError(t, err)
  2530  		require.NoError(t, tc.WaitForSplitAndInitialization(key))
  2531  		require.NoError(t, tc.WaitForSplitAndInitialization(keyA))
  2532  		require.NoError(t, tc.WaitForVoters(key, tc.Target(1), tc.Target(2)))
  2533  		require.NoError(t, tc.WaitForVoters(keyA, tc.Target(1), tc.Target(2)))
  2534  
  2535  		// We're going to block the RHS and LHS of node 2 as soon as the merge
  2536  		// attempts to propose the command to commit the merge. This should prevent
  2537  		// the merge from being applied on node 2. Then we'll manually force a
  2538  		// snapshot to be sent to the LHS of store 2 after the merge commits.
  2539  		store, repl := getFirstStoreReplica(t, tc.Server(2), key)
  2540  		var partActive atomic.Value
  2541  		partActive.Store(false)
  2542  		raftFuncs := noopRaftHandlerFuncs()
  2543  		raftFuncs.dropReq = func(req *kvserver.RaftMessageRequest) bool {
  2544  			return partActive.Load().(bool) && req.Message.Type == raftpb.MsgApp
  2545  		}
  2546  		tc.Servers[2].RaftTransport().Listen(store.StoreID(), &unreliableRaftHandler{
  2547  			rangeID:                    lhsDesc.RangeID,
  2548  			unreliableRaftHandlerFuncs: raftFuncs,
  2549  			RaftMessageHandler: &unreliableRaftHandler{
  2550  				rangeID:                    rhsDesc.RangeID,
  2551  				RaftMessageHandler:         store,
  2552  				unreliableRaftHandlerFuncs: raftFuncs,
  2553  			},
  2554  		})
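        		// Once we see the merge's commit command being proposed, activate the
        		// partition so node 2 stops receiving MsgApp and never applies the merge.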
  2555  		proposalFilter.Store(func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  2556  			merge := args.Cmd.ReplicatedEvalResult.Merge
  2557  			if merge != nil && merge.LeftDesc.RangeID == lhsDesc.RangeID {
  2558  				partActive.Store(true)
  2559  			}
  2560  			return nil
  2561  		})
  2562  		require.NoError(t, tc.Server(0).DB().AdminMerge(ctx, key))
  2563  		var tombstone roachpb.RangeTombstone
  2564  		testutils.SucceedsSoon(t, func() (err error) {
  2565  			// One of the two other stores better be the raft leader eventually.
  2566  			// We keep trying to send snapshots until one takes.
  2567  			for i := range []int{0, 1} {
  2568  				s, r := getFirstStoreReplica(t, tc.Server(i), key)
  2569  				err = s.ManualRaftSnapshot(r, repl.ReplicaID())
  2570  				if err == nil {
  2571  					break
  2572  				}
  2573  			}
  2574  			if err != nil {
  2575  				return err
  2576  			}
  2577  			tombstoneKey := keys.RangeTombstoneKey(rhsDesc.RangeID)
  2578  			ok, err := storage.MVCCGetProto(
  2579  				context.Background(), store.Engine(), tombstoneKey, hlc.Timestamp{}, &tombstone, storage.MVCCGetOptions{},
  2580  			)
  2581  			require.NoError(t, err)
  2582  			if !ok {
  2583  				return errors.New("no tombstone found")
  2584  			}
  2585  			return nil
  2586  		})
  2587  		require.Equal(t, roachpb.ReplicaID(math.MaxInt32), tombstone.NextReplicaID)
  2588  	})
  2589  }
  2590  
  2591  // TestAdminRelocateRangeSafety exercises a situation where calls to
  2592  // AdminRelocateRange can race with calls to ChangeReplicas and verifies
  2593  // that such races do not leave the range in an under-replicated state.
  2594  func TestAdminRelocateRangeSafety(t *testing.T) {
  2595  	defer leaktest.AfterTest(t)()
  2596  
  2597  	// The test is going to verify that when a replica removal due to a
  2598  	// Replica.ChangeReplicas call coincides with the removal phase of an
  2599  	// AdminRelocateRangeRequest, one of the removals will fail.
  2600  	// In order to ensure that the AdminChangeReplicas command coincides with
  2601  	// the remove phase of AdminRelocateRange, the test injects a response
  2602  	// filter which, when useSeenAdd holds true, signals on seenAdd when it sees
  2603  	// an AdminChangeReplicasRequest which added a replica.
  2604  	const numNodes = 4
  2605  	var useSeenAdd atomic.Value
  2606  	useSeenAdd.Store(false)
  2607  	seenAdd := make(chan struct{}, 1)
  2608  	responseFilter := func(ctx context.Context, ba roachpb.BatchRequest, _ *roachpb.BatchResponse) *roachpb.Error {
  2609  		if ba.IsSingleRequest() {
  2610  			changeReplicas, ok := ba.Requests[0].GetInner().(*roachpb.AdminChangeReplicasRequest)
  2611  			if ok && changeReplicas.Changes()[0].ChangeType == roachpb.ADD_REPLICA && useSeenAdd.Load().(bool) {
  2612  				seenAdd <- struct{}{}
  2613  			}
  2614  		}
  2615  		return nil
  2616  	}
  2617  	tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
  2618  		ReplicationMode: base.ReplicationManual,
  2619  		ServerArgs: base.TestServerArgs{
  2620  			Knobs: base.TestingKnobs{
  2621  				Store: &kvserver.StoreTestingKnobs{
  2622  					TestingResponseFilter: responseFilter,
  2623  				},
  2624  			},
  2625  		},
  2626  	})
  2627  	ctx := context.Background()
  2628  	defer tc.Stopper().Stop(ctx)
  2629  	db := tc.Servers[rand.Intn(numNodes)].DB()
  2630  
  2631  	// The test assumes from the way that the range gets set up that the lease
  2632  	// holder is node 1 and from the relocate call below that the range in
  2633  	// question has replicas on nodes 1-3. Make the call to AdminRelocateRange
  2634  	// to set up the replication and then verify the assumed state.
  2635  
  2636  	key := roachpb.Key("a")
  2637  	assert.Nil(t, db.AdminRelocateRange(ctx, key, makeReplicationTargets(1, 2, 3)))
  2638  	rangeInfo, err := getRangeInfo(ctx, db, key)
  2639  	assert.Nil(t, err)
  2640  	assert.Len(t, rangeInfo.Desc.InternalReplicas, 3)
  2641  	assert.Equal(t, rangeInfo.Lease.Replica.NodeID, roachpb.NodeID(1))
  2642  	for id := roachpb.StoreID(1); id <= 3; id++ {
  2643  		_, hasReplica := rangeInfo.Desc.GetReplicaDescriptor(id)
  2644  		assert.Truef(t, hasReplica, "missing replica descriptor for store %d", id)
  2645  	}
  2646  
  2647  	// The test now proceeds to use AdminRelocateRange to move a replica from node
  2648  	// 3 to node 4. The call will first add 4 and then
  2649  	// remove 3. Concurrently a separate goroutine will attempt to remove the
  2650  	// replica on node 2. The ResponseFilter passed in the TestingKnobs will
  2651  	// prevent the remove call from proceeding until after the Add of 4 has
  2652  	// completed.
  2653  
  2654  	// Code above verified r1 is the leaseholder, so use it to ChangeReplicas.
  2655  	r1, _, err := tc.Servers[0].Stores().GetReplicaForRangeID(rangeInfo.Desc.RangeID)
  2656  	assert.Nil(t, err)
  2657  	expDescAfterAdd := rangeInfo.Desc // for use with ChangeReplicas
  2658  	expDescAfterAdd.NextReplicaID++
  2659  	expDescAfterAdd.InternalReplicas = append(expDescAfterAdd.InternalReplicas, roachpb.ReplicaDescriptor{
  2660  		NodeID:    4,
  2661  		StoreID:   4,
  2662  		ReplicaID: 4,
  2663  	})
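        	// expDescAfterAdd mirrors what the descriptor should look like once the
        	// relocate's add phase has placed a replica on node 4; ChangeReplicas below
        	// uses it as its expectation, so the removal only succeeds against that state.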
  2664  	var relocateErr, changeErr error
  2665  	var changedDesc *roachpb.RangeDescriptor // only populated if changeErr == nil
  2666  	change := func() {
  2667  		<-seenAdd
  2668  		chgs := roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, makeReplicationTargets(2)...)
  2669  		changedDesc, changeErr = r1.ChangeReplicas(ctx, &expDescAfterAdd, kvserver.SnapshotRequest_REBALANCE, "replicate", "testing", chgs)
  2670  	}
  2671  	relocate := func() {
  2672  		relocateErr = db.AdminRelocateRange(ctx, key, makeReplicationTargets(1, 2, 4))
  2673  	}
  2674  	useSeenAdd.Store(true)
  2675  	var wg sync.WaitGroup
  2676  	wg.Add(2)
  2677  	go func() { relocate(); wg.Done() }()
  2678  	go func() { change(); wg.Done() }()
  2679  	wg.Wait()
  2680  	rangeInfo, err = getRangeInfo(ctx, db, key)
  2681  	assert.Nil(t, err)
  2682  	assert.True(t, len(rangeInfo.Desc.InternalReplicas) >= 3)
  2683  	assert.Falsef(t, relocateErr == nil && changeErr == nil,
  2684  		"expected one of racing AdminRelocateReplicas and ChangeReplicas "+
  2685  			"to fail but neither did")
  2686  	assert.Falsef(t, relocateErr != nil && changeErr != nil,
  2687  		"expected only one of racing AdminRelocateReplicas and ChangeReplicas "+
  2688  			"to fail but both did")
  2689  	if changeErr == nil {
  2690  		assert.EqualValues(t, *changedDesc, rangeInfo.Desc)
  2691  	}
  2692  }
  2693  
  2694  // TestChangeReplicasLeaveAtomicRacesWithMerge exercises a hazardous case which
  2695  // arises during concurrent AdminChangeReplicas requests. The code reads the
  2696  // descriptor from the range-ID local keyspace, checks to make sure that the
  2697  // read descriptor matches the expectation, and then uses the raw bytes that
  2698  // were read in a CPut with the update. The code contains an optimization to
  2699  // transition out of joint consensus even if the read descriptor does not match
  2700  // the expectation. That optimization did not verify anything about the read
  2701  // descriptor, not even if it was nil.
  2702  //
  2703  // This test wants to exercise this scenario. We need to get the replica in
  2704  // a state where it has an outgoing voter and then we need to have two
  2705  // different requests trying to make changes, of which only the merge succeeds. The
  2706  // race is that the second command will notice the voter outgoing and will
  2707  // attempt to fix it.  In order to do that it reads the range descriptor to
  2708  // ensure that it has not changed (and to get the raw bytes of the range
  2709  // descriptor for use in a CPut, as the current API only uses the in-memory
  2710  // value and the encoding is not necessarily stable).
  2711  //
  2712  // The test also contains a variant whereby the range is re-split at the
  2713  // same key producing a range descriptor with a different range ID.
  2714  //
  2715  // See https://github.com/cockroachdb/cockroach/issues/40877.
  2716  func TestChangeReplicasLeaveAtomicRacesWithMerge(t *testing.T) {
  2717  	defer leaktest.AfterTest(t)()
  2718  	testutils.RunTrueAndFalse(t, "resplit", func(t *testing.T, resplit bool) {
  2719  		const numNodes = 4
  2720  		var stopAfterJointConfig atomic.Value
  2721  		stopAfterJointConfig.Store(false)
  2722  		var rangeToBlockRangeDescriptorRead atomic.Value
  2723  		rangeToBlockRangeDescriptorRead.Store(roachpb.RangeID(0))
  2724  		blockRangeDescriptorReadChan := make(chan struct{}, 1)
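        		// blockOnChangeReplicasRead stalls single Get requests against the range
        		// descriptor key of the range recorded in rangeToBlockRangeDescriptorRead.
        		// It consumes one value from blockRangeDescriptorReadChan and then blocks
        		// on a second receive, which only returns once the channel is closed.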
  2725  		blockOnChangeReplicasRead := kvserverbase.ReplicaRequestFilter(func(ctx context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  2726  			if req, isGet := ba.GetArg(roachpb.Get); !isGet ||
  2727  				ba.RangeID != rangeToBlockRangeDescriptorRead.Load().(roachpb.RangeID) ||
  2728  				!ba.IsSingleRequest() ||
  2729  				!bytes.HasSuffix([]byte(req.(*roachpb.GetRequest).Key),
  2730  					[]byte(keys.LocalRangeDescriptorSuffix)) {
  2731  				return nil
  2732  			}
  2733  			select {
  2734  			case <-blockRangeDescriptorReadChan:
  2735  				<-blockRangeDescriptorReadChan
  2736  			case <-ctx.Done():
  2737  			default:
  2738  			}
  2739  			return nil
  2740  		})
  2741  		tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{
  2742  			ServerArgs: base.TestServerArgs{
  2743  				Knobs: base.TestingKnobs{
  2744  					Store: &kvserver.StoreTestingKnobs{
  2745  						TestingRequestFilter: blockOnChangeReplicasRead,
  2746  						ReplicaAddStopAfterJointConfig: func() bool {
  2747  							return stopAfterJointConfig.Load().(bool)
  2748  						},
  2749  					},
  2750  				},
  2751  			},
  2752  			ReplicationMode: base.ReplicationManual,
  2753  		})
  2754  		ctx := context.Background()
  2755  		defer tc.Stopper().Stop(ctx)
  2756  
  2757  		// We want to first get into a joint consensus scenario.
  2758  		// Then we want to issue a ChangeReplicasRequest on a goroutine that will
  2759  		// block trying to read the RHS's range descriptor. Then we'll merge the RHS
  2760  		// away.
  2761  
  2762  		// Set up a userspace range to mess around with.
  2763  		lhs := tc.ScratchRange(t)
  2764  		_, err := tc.AddReplicas(lhs, tc.Targets(1, 2)...)
  2765  		require.NoError(t, err)
  2766  
  2767  		// Split it and then we're going to try to up-replicate.
  2768  		// We're going to have one goroutine trying to ADD the 4th node
  2769  		// and another goroutine trying to move out of a joint config on both
  2770  		// sides and then merge the range. We ensure that the first goroutine
  2771  		// blocks and the second one succeeds. This will test that the first
  2772  		// goroutine detects reading the nil descriptor.
  2773  		rhs := append(lhs[:len(lhs):len(lhs)], 'a')
  2774  		lhsDesc, rhsDesc := &roachpb.RangeDescriptor{}, &roachpb.RangeDescriptor{}
  2775  		*lhsDesc, *rhsDesc, err = tc.SplitRange(rhs)
  2776  		require.NoError(t, err)
  2777  
  2778  		err = tc.WaitForSplitAndInitialization(rhs)
  2779  		require.NoError(t, err)
  2780  
  2781  		// Manually construct the batch because the (*DB).AdminChangeReplicas does
  2782  		// not yet support atomic replication changes.
  2783  		db := tc.Servers[0].DB()
  2784  		swapReplicas := func(key roachpb.Key, desc roachpb.RangeDescriptor, add, remove int) (*roachpb.RangeDescriptor, error) {
  2785  			return db.AdminChangeReplicas(ctx, key, desc, []roachpb.ReplicationChange{
  2786  				{ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(add)},
  2787  				{ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(remove)},
  2788  			})
  2789  		}
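        		// Each swap adds and removes a replica atomically and therefore passes
        		// through a joint configuration; with stopAfterJointConfig set, the change
        		// stops there, leaving the range in the joint config.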
  2790  
  2791  		// Move the RHS and LHS to 3 from 2.
  2792  		_, err = swapReplicas(lhs, *lhsDesc, 3, 2)
  2793  		require.NoError(t, err)
  2794  		stopAfterJointConfig.Store(true) // keep the RHS in a joint config.
  2795  		rhsDesc, err = swapReplicas(rhs, *rhsDesc, 3, 2)
  2796  		require.NoError(t, err)
  2797  		stopAfterJointConfig.Store(false)
  2798  
  2799  		// Run a goroutine which sends an AdminChangeReplicasRequest which will try to
  2800  		// move the range out of joint config but will end up blocking on
  2801  		// blockRangeDescriptorReadChan until we close it later.
  2802  		rangeToBlockRangeDescriptorRead.Store(rhsDesc.RangeID)
  2803  		blockRangeDescriptorReadChan <- struct{}{}
  2804  		var wg sync.WaitGroup
  2805  
  2806  		defer func() {
  2807  			// Unblock the original add on the separate goroutine to ensure that it
  2808  			// properly handles reading a nil range descriptor.
  2809  			close(blockRangeDescriptorReadChan)
  2810  			wg.Wait()
  2811  		}()
  2812  		wg.Add(1)
  2813  
  2814  		go func() {
  2815  			defer wg.Done()
  2816  			_, err := db.AdminChangeReplicas(
  2817  				ctx, rhs, *rhsDesc, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(2)),
  2818  			)
  2819  			// We'll ultimately fail because we're going to race with the work below.
  2820  			msg := "descriptor changed"
  2821  			if resplit {
  2822  				// We don't convert ConditionFailedError to the "descriptor changed"
  2823  				// error if the range ID changed.
  2824  				msg = "unexpected value"
  2825  			}
  2826  			require.True(t, testutils.IsError(err, msg), err)
  2827  		}()
  2828  		// Wait until our goroutine is blocked.
  2829  		testutils.SucceedsSoon(t, func() error {
  2830  			if len(blockRangeDescriptorReadChan) != 0 {
  2831  				return errors.New("not blocked yet")
  2832  			}
  2833  			return nil
  2834  		})
  2835  		// Remove the learner replica (left because the joint config was demoting
  2836  		// a voter) which as a side effect exits the joint config.
  2837  		_, err = tc.RemoveReplicas(rhs, tc.Target(2))
  2838  		require.NoError(t, err)
  2839  		// Merge the RHS away.
  2840  		err = db.AdminMerge(ctx, lhs)
  2841  		require.NoError(t, err)
  2842  		if resplit {
  2843  			require.NoError(t, db.AdminSplit(ctx, lhs, rhs, hlc.Timestamp{WallTime: math.MaxInt64}))
  2844  			err = tc.WaitForSplitAndInitialization(rhs)
  2845  			require.NoError(t, err)
  2846  		}
  2847  	})
  2848  }
  2849  
  2850  // This test is designed to demonstrate that it is not possible to have pending
  2851  // proposals concurrent with a TransferLeaseRequest. This property ensures that
  2852  // we cannot possibly receive AmbiguousResultError due to an outgoing leaseholder
  2853  // being removed while still having pending proposals for a lease which did not
  2854  // expire (i.e. was transferred cooperatively using TransferLease rather than
  2855  // being taken with a RequestLease).
  2856  //
  2857  // At the time of writing this test there were three hazardous cases which are now
  2858  // avoided:
  2859  //
  2860  //  (1) The outgoing leaseholder learns about its removal before applying the
  2861  //      lease transfer. This could happen if it has a lot left to apply but it
  2862  //      does indeed know in its log that it is either no longer the leaseholder
  2863  //      or that some of its commands will apply successfully.
  2864  //
  2865  //  (2) The replica learns about its removal after applying the lease transfer
  2866  //      but it potentially still has pending commands which it thinks might
  2867  //      have been proposed. This can occur if there are commands which are
  2868  //      proposed after the lease transfer has been proposed but before the lease
  2869  //      transfer has applied. This can also occur if commands are re-ordered
  2870  //      by raft due to a leadership change.
  2871  //
  2872  //  (3) The replica learns about its removal after applying the lease transfer
  2873  //      but proposed a command evaluated under the old lease after the lease
  2874  //      transfer has been applied. This can occur if commands are evaluated
  2875  //      before the lease transfer is proposed but are not inserted into the
  2876  //      proposal buffer until after it has been applied.
  2877  //
  2878  // None of these cases are possible any longer as latches now prevent writes
  2879  // from occurring concurrently with TransferLeaseRequests. (1) is prevented
  2880  // because all proposals will need to apply before the TransferLeaseRequest
  2881  // can be evaluated. (2) and (3) are not possible because either the commands
  2882  // in question acquire their latches before the TransferLeaseRequest in which
  2883  // case they'll apply before the TransferLease can be proposed or they acquire
  2884  // their latches after the TransferLease applies in which case they will fail
  2885  // due to NotLeaseHolderError prior to application.
  2886  func TestTransferLeaseBlocksWrites(t *testing.T) {
  2887  	defer leaktest.AfterTest(t)()
  2888  
  2889  	// We want to verify that we will not propose a TransferLeaseRequest while
  2890  	// there is an outstanding proposal.
  2891  	var scratchRangeID atomic.Value
  2892  	scratchRangeID.Store(roachpb.RangeID(0))
  2893  	blockInc := make(chan chan struct{})
  2894  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2895  		ServerArgs: base.TestServerArgs{
  2896  			Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{
  2897  				TestingProposalFilter: kvserverbase.ReplicaProposalFilter(
  2898  					func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  2899  						if args.Req.RangeID != scratchRangeID.Load().(roachpb.RangeID) {
  2900  							return nil
  2901  						}
  2902  						// Block increment requests on blockInc.
  2903  						if _, isInc := args.Req.GetArg(roachpb.Increment); isInc {
  2904  							unblock := make(chan struct{})
  2905  							blockInc <- unblock
  2906  							<-unblock
  2907  						}
  2908  						return nil
  2909  					},
  2910  				),
  2911  			}},
  2912  		},
  2913  		ReplicationMode: base.ReplicationManual,
  2914  	})
  2915  	defer tc.Stopper().Stop(context.Background())
  2916  
  2917  	scratch := tc.ScratchRange(t)
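        	// makeKey produces a unique key within the scratch range by appending a
        	// random UUID suffix to the range's start key.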
  2918  	makeKey := func() roachpb.Key {
  2919  		return append(scratch[:len(scratch):len(scratch)], uuid.MakeV4().String()...)
  2920  	}
  2921  	desc := tc.AddReplicasOrFatal(t, scratch, tc.Target(1), tc.Target(2))
  2922  	scratchRangeID.Store(desc.RangeID)
  2923  	require.NoError(t, tc.WaitForVoters(scratch, tc.Target(1), tc.Target(2)))
  2924  
  2925  	// Launch a goroutine to increment a value; it will block in the proposal
  2926  	// filter.
  2927  	incErr := make(chan error)
  2928  	go func() {
  2929  		_, err := tc.Server(1).DB().Inc(context.Background(), makeKey(), 1)
  2930  		incErr <- err
  2931  	}()
  2932  
  2933  	// Wait for the increment to be blocked on the proposal filter so we know
  2934  	// it holds a write latch.
  2935  	unblock := <-blockInc
  2936  
  2937  	// Launch a goroutine to transfer the lease to store 1.
  2938  	transferErr := make(chan error)
  2939  	go func() {
  2940  		transferErr <- tc.TransferRangeLease(desc, tc.Target(1))
  2941  	}()
  2942  
  2943  	// Ensure that the lease transfer doesn't succeed.
  2944  	// We don't wait that long because we don't want this test to take too long.
  2945  	// The theory is that if we weren't acquiring latches over the keyspace then
  2946  	// the lease transfer could succeed before we unblocked the increment request.
  2947  	select {
  2948  	case <-time.After(100 * time.Millisecond):
  2949  	case err := <-transferErr:
  2950  		t.Fatalf("did not expect transfer to complete, got %v", err)
  2951  	}
  2952  
  2953  	close(unblock)
  2954  	require.NoError(t, <-incErr)
  2955  	require.NoError(t, <-transferErr)
  2956  }
  2957  
  2958  // TestStrictGCEnforcement ensures that strict GC enforcement is respected and
  2959  // furthermore is responsive to changes in protected timestamps and to changes
  2960  // in the zone configs.
  2961  func TestStrictGCEnforcement(t *testing.T) {
  2962  	defer leaktest.AfterTest(t)()
  2963  
  2964  	// The unfortunate thing about this test is that the gcttl is in seconds and
  2965  	// we need to wait for the replica's lease start time to be sufficiently old.
  2966  	// It takes about two seconds. All of that time is in setup.
  2967  	if testing.Short() {
  2968  		return
  2969  	}
  2970  	ctx := context.Background()
  2971  
  2972  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
  2973  		ReplicationMode: base.ReplicationManual,
  2974  	})
  2975  	defer tc.Stopper().Stop(ctx)
  2976  
  2977  	sqlDB := sqlutils.MakeSQLRunner(tc.ServerConn(0))
  2978  	sqlDB.Exec(t, `CREATE TABLE foo (i INT PRIMARY KEY)`)
  2979  
  2980  	var (
  2981  		db         = tc.Server(0).DB()
  2982  		getTableID = func() (tableID uint32) {
  2983  			sqlDB.QueryRow(t, `SELECT table_id FROM crdb_internal.tables`+
  2984  				` WHERE name = 'foo' AND database_name = current_database()`).Scan(&tableID)
  2985  			return tableID
  2986  		}
  2987  		tableID       = getTableID()
  2988  		tenSecondsAgo hlc.Timestamp // written in setup
  2989  		tableKey      = keys.SystemSQLCodec.TablePrefix(tableID)
  2990  		tableSpan     = roachpb.Span{Key: tableKey, EndKey: tableKey.PrefixEnd()}
  2991  		mkRecord      = func() ptpb.Record {
  2992  			return ptpb.Record{
  2993  				ID:        uuid.MakeV4(),
  2994  				Timestamp: tenSecondsAgo.Add(-10*time.Second.Nanoseconds(), 0),
  2995  				Spans:     []roachpb.Span{tableSpan},
  2996  			}
  2997  		}
  2998  		mkStaleTxn = func() *kv.Txn {
  2999  			txn := db.NewTxn(ctx, "foo")
  3000  			txn.SetFixedTimestamp(ctx, tenSecondsAgo)
  3001  			return txn
  3002  		}
  3003  		getRejectedMsg = func() string {
  3004  			return tenSecondsAgo.String() + " must be after replica GC threshold "
  3005  		}
  3006  		performScan = func() error {
  3007  			txn := mkStaleTxn()
  3008  			_, err := txn.Scan(ctx, tableKey, tableKey.PrefixEnd(), 1)
  3009  			return err
  3010  		}
  3011  		assertScanRejected = func(t *testing.T) {
  3012  			t.Helper()
  3013  			require.Regexp(t, getRejectedMsg(), performScan())
  3014  		}
  3015  
  3016  		assertScanOk = func(t *testing.T) {
  3017  			t.Helper()
  3018  			require.NoError(t, performScan())
  3019  		}
  3020  		// Make sure the protected timestamp cache has been updated. Once it has,
  3021  		// we know it won't be updated again for minutes; it is read on startup.
  3022  		waitForCacheAfter = func(t *testing.T, min hlc.Timestamp) {
  3023  			t.Helper()
  3024  			testutils.SucceedsSoon(t, func() error {
  3025  				for i := 0; i < tc.NumServers(); i++ {
  3026  					ptp := tc.Server(i).ExecutorConfig().(sql.ExecutorConfig).ProtectedTimestampProvider
  3027  					if ptp.Iterate(ctx, tableKey, tableKey, func(record *ptpb.Record) (wantMore bool) {
  3028  						return false
  3029  					}).Less(min) {
  3030  						return errors.Errorf("not yet read")
  3031  					}
  3032  				}
  3033  				return nil
  3034  			})
  3035  		}
  3036  		setGCTTL = func(t *testing.T, object string, exp int) {
  3037  			t.Helper()
  3038  			testutils.SucceedsSoon(t, func() error {
  3039  				sqlDB.Exec(t, `ALTER `+object+` CONFIGURE ZONE USING gc.ttlseconds = `+strconv.Itoa(exp))
  3040  				for i := 0; i < tc.NumServers(); i++ {
  3041  					s := tc.Server(i)
  3042  					_, r := getFirstStoreReplica(t, s, tableKey)
  3043  					if _, z := r.DescAndZone(); z.GC.TTLSeconds != int32(exp) {
  3044  						_, sysCfg := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key)
  3045  						require.NoError(t, sysCfg.MaybeGossipSystemConfig(ctx))
  3046  						return errors.Errorf("expected %d, got %d", exp, z.GC.TTLSeconds)
  3047  					}
  3048  				}
  3049  				return nil
  3050  			})
  3051  		}
  3052  		setStrictGC = func(t *testing.T, val bool) {
  3053  			t.Helper()
  3054  			sqlDB.Exec(t, `SET CLUSTER SETTING kv.gc_ttl.strict_enforcement.enabled = `+fmt.Sprint(val))
  3055  			testutils.SucceedsSoon(t, func() error {
  3056  				for i := 0; i < tc.NumServers(); i++ {
  3057  					s, r := getFirstStoreReplica(t, tc.Server(i), keys.SystemConfigSpan.Key)
  3058  					if kvserver.StrictGCEnforcement.Get(&s.ClusterSettings().SV) != val {
  3059  						require.NoError(t, r.MaybeGossipSystemConfig(ctx))
  3060  						return errors.Errorf("expected %v, got %v", val, !val)
  3061  					}
  3062  				}
  3063  				return nil
  3064  			})
  3065  		}
  3066  		setTableGCTTL = func(t *testing.T, exp int) {
  3067  			t.Helper()
  3068  			setGCTTL(t, "TABLE foo", exp)
  3069  		}
  3070  		setSystemGCTTL = func(t *testing.T, exp int) {
  3071  			// TODO(ajwerner): adapt this to test that the system ranges are unaffected.
  3072  			t.Helper()
  3073  			setGCTTL(t, "RANGE system", exp)
  3074  		}
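        		// refreshPastLeaseStart refreshes each node's protected timestamp state to
        		// just past the replica's lease start and has the replica re-read it, so
        		// that strict GC enforcement can take effect on the table's range.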
  3075  		refreshPastLeaseStart = func(t *testing.T) {
  3076  			for i := 0; i < tc.NumServers(); i++ {
  3077  				ptp := tc.Server(i).ExecutorConfig().(sql.ExecutorConfig).ProtectedTimestampProvider
  3078  				_, r := getFirstStoreReplica(t, tc.Server(i), tableKey)
  3079  				l, _ := r.GetLease()
  3080  				require.NoError(t, ptp.Refresh(ctx, l.Start.Next()))
  3081  				r.ReadProtectedTimestamps(ctx)
  3082  			}
  3083  		}
  3084  	)
  3085  
  3086  	{
  3087  		// Setup the initial state to be sure that we'll actually strictly enforce
  3088  		// gc ttls.
  3089  		tc.SplitRangeOrFatal(t, tableKey)
  3090  		_, err := tc.AddReplicas(tableKey, tc.Target(1), tc.Target(2))
  3091  		require.NoError(t, err)
  3092  		_, err = tc.AddReplicas(keys.SystemConfigSpan.Key, tc.Target(1), tc.Target(2))
  3093  		require.NoError(t, err)
  3094  
  3095  		setTableGCTTL(t, 1)
  3096  		waitForCacheAfter(t, hlc.Timestamp{})
  3097  
  3098  		defer sqlDB.Exec(t, `SET CLUSTER SETTING kv.gc_ttl.strict_enforcement.enabled = DEFAULT`)
  3099  		setStrictGC(t, true)
  3100  		tenSecondsAgo = tc.Server(0).Clock().Now().Add(-10*time.Second.Nanoseconds(), 0)
  3101  	}
  3102  
  3103  	t.Run("strict enforcement", func(t *testing.T) {
  3104  		refreshPastLeaseStart(t)
  3105  		assertScanRejected(t)
  3106  	})
  3107  	t.Run("disable strict enforcement", func(t *testing.T) {
  3108  		setStrictGC(t, false)
  3109  		defer setStrictGC(t, true)
  3110  		assertScanOk(t)
  3111  	})
  3112  	t.Run("zone config changes are respected", func(t *testing.T) {
  3113  		setTableGCTTL(t, 60)
  3114  		assertScanOk(t)
  3115  		setTableGCTTL(t, 1)
  3116  		assertScanRejected(t)
  3117  	})
  3118  	t.Run("system ranges are unaffected", func(t *testing.T) {
  3119  		setSystemGCTTL(t, 1)
  3120  		txn := mkStaleTxn()
  3121  		descriptorTable := keys.SystemSQLCodec.TablePrefix(keys.DescriptorTableID)
  3122  		_, err := txn.Scan(ctx, descriptorTable, descriptorTable.PrefixEnd(), 1)
  3123  		require.NoError(t, err)
  3124  	})
  3125  	t.Run("protected timestamps are respected", func(t *testing.T) {
  3126  		waitForCacheAfter(t, hlc.Timestamp{})
  3127  		ptp := tc.Server(0).ExecutorConfig().(sql.ExecutorConfig).ProtectedTimestampProvider
  3128  		assertScanRejected(t)
  3129  		// Create a protected timestamp, don't verify it, make sure it's not
  3130  		// respected.
  3131  		rec := mkRecord()
  3132  		require.NoError(t, db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  3133  			return ptp.Protect(ctx, txn, &rec)
  3134  		}))
  3135  		assertScanRejected(t)
  3136  
  3137  		require.NoError(t, ptp.Verify(ctx, rec.ID))
  3138  		assertScanOk(t)
  3139  
  3140  		// Transfer the lease and demonstrate that the query succeeds because we're
  3141  		// cautious in the face of lease transfers.
  3142  		desc, err := tc.LookupRange(tableKey)
  3143  		require.NoError(t, err)
  3144  		require.NoError(t, tc.TransferRangeLease(desc, tc.Target(1)))
  3145  		assertScanOk(t)
  3146  	})
  3147  }
  3148  
  3149  // TestProposalOverhead ensures that the command overhead for put operations
  3150  // is as expected. It exists to prevent changes which might increase the
  3151  // byte overhead of replicating commands.
  3152  //
  3153  // Note that it intentionally avoids using a system range, which would incur
  3154  // extra overhead due to the logical op log.
  3155  func TestProposalOverhead(t *testing.T) {
  3156  	defer leaktest.AfterTest(t)()
  3157  
  3158  	var overhead uint32
  3159  	var key atomic.Value
  3160  	key.Store(roachpb.Key{})
  3161  	filter := func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
  3162  		if len(args.Req.Requests) != 1 {
  3163  			return nil
  3164  		}
  3165  		req, ok := args.Req.GetArg(roachpb.Put)
  3166  		if !ok {
  3167  			return nil
  3168  		}
  3169  		put := req.(*roachpb.PutRequest)
  3170  		if !bytes.Equal(put.Key, key.Load().(roachpb.Key)) {
  3171  			return nil
  3172  		}
  3173  		// Sometimes the logical portion of the timestamp can be non-zero, which makes
  3174  		// the overhead non-deterministic.
  3175  		args.Cmd.ReplicatedEvalResult.Timestamp.Logical = 0
  3176  		atomic.StoreUint32(&overhead, uint32(args.Cmd.Size()-args.Cmd.WriteBatch.Size()))
  3177  		// We don't want to print the WriteBatch because it's explicitly
  3178  		// excluded from the size computation. Nil'ing it out does not
  3179  		// affect the memory held by the caller because neither `args` nor
  3180  		// `args.Cmd` are pointers.
  3181  		args.Cmd.WriteBatch = nil
  3182  		t.Logf(pretty.Sprint(args.Cmd))
  3183  		return nil
  3184  	}
  3185  	tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{
  3186  		ServerArgs: base.TestServerArgs{
  3187  			Knobs: base.TestingKnobs{
  3188  				Store: &kvserver.StoreTestingKnobs{TestingProposalFilter: filter},
  3189  			},
  3190  		},
  3191  	})
  3192  	ctx := context.Background()
  3193  	defer tc.Stopper().Stop(ctx)
  3194  
  3195  	db := tc.Server(0).DB()
  3196  	// NB: the expected overhead reflects the space overhead currently
  3197  	// present in Raft commands. This test will fail if that overhead
  3198  	// changes. Try to make this number go down and not up. It slightly
  3199  	// undercounts because our proposal filter is called before
  3200  	// maxLeaseIndex is filled in. The difference between the user and system
  3201  	// overhead is that user ranges do not have rangefeeds on by default whereas
  3202  	// system ranges do.
  3203  	const (
  3204  		expectedUserOverhead uint32 = 42
  3205  	)
  3206  	t.Run("user-key overhead", func(t *testing.T) {
  3207  		userKey := tc.ScratchRange(t)
  3208  		k := roachpb.Key(encoding.EncodeStringAscending(userKey, "foo"))
  3209  		key.Store(k)
  3210  		require.NoError(t, db.Put(ctx, k, "v"))
  3211  		require.Equal(t, expectedUserOverhead, atomic.LoadUint32(&overhead))
  3212  	})
  3213  
  3214  }
  3215  
  3216  // getRangeInfo retrieves range info by performing a get against the provided
  3217  // key and setting the ReturnRangeInfo flag to true.
  3218  func getRangeInfo(
  3219  	ctx context.Context, db *kv.DB, key roachpb.Key,
  3220  ) (ri *roachpb.RangeInfo, err error) {
  3221  	err = db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  3222  		b := txn.NewBatch()
  3223  		b.Header.ReturnRangeInfo = true
  3224  		b.AddRawRequest(roachpb.NewGet(key))
  3225  		if err = db.Run(ctx, b); err != nil {
  3226  			return err
  3227  		}
  3228  		resp := b.RawResponse()
  3229  		ri = &resp.Responses[0].GetInner().Header().RangeInfos[0]
  3230  		return nil
  3231  	})
  3232  	return ri, err
  3233  }
  3234  
  3235  // makeReplicationTargets creates a slice of replication targets where each
  3236  // target has a NodeID and StoreID with a value corresponding to an id in ids.
  3237  func makeReplicationTargets(ids ...int) (targets []roachpb.ReplicationTarget) {
  3238  	for _, id := range ids {
  3239  		targets = append(targets, roachpb.ReplicationTarget{
  3240  			NodeID:  roachpb.NodeID(id),
  3241  			StoreID: roachpb.StoreID(id),
  3242  		})
  3243  	}
  3244  	return targets
  3245  }