github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_merge_test.go (about)

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver_test
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"math/rand"
    19  	"reflect"
    20  	"regexp"
    21  	"strconv"
    22  	"strings"
    23  	"sync"
    24  	"sync/atomic"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/cockroachdb/cockroach/pkg/base"
    29  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    30  	"github.com/cockroachdb/cockroach/pkg/gossip"
    31  	"github.com/cockroachdb/cockroach/pkg/keys"
    32  	"github.com/cockroachdb/cockroach/pkg/kv"
    33  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    34  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    35  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    36  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
    37  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    38  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/txnwait"
    39  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    40  	"github.com/cockroachdb/cockroach/pkg/rpc"
    41  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    42  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    43  	"github.com/cockroachdb/cockroach/pkg/storage"
    44  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    45  	"github.com/cockroachdb/cockroach/pkg/testutils"
    46  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    47  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    48  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    49  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    50  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    51  	"github.com/cockroachdb/cockroach/pkg/util/log"
    52  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    53  	"github.com/cockroachdb/cockroach/pkg/util/randutil"
    54  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    55  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    56  	"github.com/cockroachdb/errors"
    57  	"github.com/gogo/protobuf/proto"
    58  	"github.com/stretchr/testify/assert"
    59  	"github.com/stretchr/testify/require"
    60  	"go.etcd.io/etcd/raft/raftpb"
    61  )
    62  
    63  func adminMergeArgs(key roachpb.Key) *roachpb.AdminMergeRequest {
    64  	return &roachpb.AdminMergeRequest{
    65  		RequestHeader: roachpb.RequestHeader{
    66  			Key: key,
    67  		},
    68  	}
    69  }
    70  
    71  // createSplitRanges issues an AdminSplit command for the key "b". It returns
    72  // the descriptors for the ranges to the left and right of the split.
    73  func createSplitRanges(
    74  	ctx context.Context, store *kvserver.Store,
    75  ) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor, error) {
    76  	args := adminSplitArgs(roachpb.Key("b"))
    77  	if _, err := kv.SendWrapped(ctx, store.TestSender(), args); err != nil {
    78  		return nil, nil, err.GoError()
    79  	}
    80  
    81  	lhsDesc := store.LookupReplica(roachpb.RKey("a")).Desc()
    82  	rhsDesc := store.LookupReplica(roachpb.RKey("c")).Desc()
    83  
    84  	if bytes.Equal(lhsDesc.StartKey, rhsDesc.StartKey) {
    85  		return nil, nil, fmt.Errorf("split ranges have the same start key: %q = %q",
    86  			lhsDesc.StartKey, rhsDesc.StartKey)
    87  	}
    88  
    89  	return lhsDesc, rhsDesc, nil
    90  }
    91  
    92  // TestStoreRangeMergeTwoEmptyRanges tries to merge two empty ranges together.
    93  func TestStoreRangeMergeTwoEmptyRanges(t *testing.T) {
    94  	defer leaktest.AfterTest(t)()
    95  
    96  	ctx := context.Background()
    97  	storeCfg := kvserver.TestStoreConfig(nil)
    98  	storeCfg.TestingKnobs.DisableMergeQueue = true
    99  	mtc := &multiTestContext{storeConfig: &storeCfg}
   100  	mtc.Start(t, 1)
   101  	defer mtc.Stop()
   102  	store := mtc.Store(0)
   103  
   104  	lhsDesc, _, err := createSplitRanges(ctx, store)
   105  	if err != nil {
   106  		t.Fatal(err)
   107  	}
   108  
   109  	// Merge the RHS back into the LHS.
   110  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   111  	_, pErr := kv.SendWrapped(ctx, store.TestSender(), args)
   112  	if pErr != nil {
   113  		t.Fatal(pErr)
   114  	}
   115  
   116  	// Verify the merge by looking up keys from both ranges.
   117  	lhsRepl := store.LookupReplica(roachpb.RKey("a"))
   118  	rhsRepl := store.LookupReplica(roachpb.RKey("c"))
   119  
   120  	if !reflect.DeepEqual(lhsRepl, rhsRepl) {
   121  		t.Fatalf("ranges were not merged: %s != %s", lhsRepl, rhsRepl)
   122  	}
   123  
   124  	// The LHS has been split once and merged once, so it should have received
   125  	// two generation bumps.
   126  	if e, a := int64(2), lhsRepl.Desc().Generation; e != a {
   127  		t.Fatalf("expected LHS to have generation %d, but got %d", e, a)
   128  	}
   129  }
   130  
   131  func getEngineKeySet(t *testing.T, e storage.Engine) map[string]struct{} {
   132  	t.Helper()
   133  	kvs, err := storage.Scan(e, roachpb.KeyMin, roachpb.KeyMax, 0 /* max */)
   134  	if err != nil {
   135  		t.Fatal(err)
   136  	}
   137  	out := map[string]struct{}{}
   138  	for _, kv := range kvs {
   139  		out[string(kv.Key.Key)] = struct{}{}
   140  	}
   141  	return out
   142  }
   143  
   144  // TestStoreRangeMergeMetadataCleanup tests that all metadata of a
   145  // subsumed range is cleaned up on merge.
   146  func TestStoreRangeMergeMetadataCleanup(t *testing.T) {
   147  	defer leaktest.AfterTest(t)()
   148  
   149  	ctx := context.Background()
   150  	storeCfg := kvserver.TestStoreConfig(nil)
   151  	storeCfg.TestingKnobs.DisableMergeQueue = true
   152  	mtc := &multiTestContext{storeConfig: &storeCfg}
   153  	mtc.Start(t, 1)
   154  	defer mtc.Stop()
   155  	store := mtc.Store(0)
   156  
   157  	content := roachpb.Key("testing!")
   158  
   159  	// Write some values left of the proposed split key.
   160  	pArgs := putArgs(roachpb.Key("aaa"), content)
   161  	if _, pErr := kv.SendWrapped(ctx, store.TestSender(), pArgs); pErr != nil {
   162  		t.Fatal(pErr)
   163  	}
   164  
   165  	// Collect all the keys.
   166  	preKeys := getEngineKeySet(t, store.Engine())
   167  
   168  	// Split the range.
   169  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store)
   170  	if err != nil {
   171  		t.Fatal(err)
   172  	}
   173  
   174  	// Write some values right of the split key.
   175  	pArgs = putArgs(roachpb.Key("ccc"), content)
   176  	if _, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
   177  		RangeID: rhsDesc.RangeID,
   178  	}, pArgs); pErr != nil {
   179  		t.Fatal(pErr)
   180  	}
   181  
   182  	// Merge the b range back into the a range.
   183  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   184  	if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
   185  		t.Fatal(pErr)
   186  	}
   187  
   188  	// Collect all the keys again.
   189  	postKeys := getEngineKeySet(t, store.Engine())
   190  
   191  	// Compute the new keys.
   192  	for k := range preKeys {
   193  		delete(postKeys, k)
   194  	}
   195  
   196  	tombstoneKey := string(keys.RangeTombstoneKey(rhsDesc.RangeID))
   197  	if _, ok := postKeys[tombstoneKey]; !ok {
   198  		t.Errorf("tombstone key (%s) missing after merge", roachpb.Key(tombstoneKey))
   199  	}
   200  	delete(postKeys, tombstoneKey)
   201  
   202  	// Keep only the subsumed range's local keys.
   203  	localRangeKeyPrefix := string(keys.MakeRangeIDPrefix(rhsDesc.RangeID))
   204  	for k := range postKeys {
   205  		if !strings.HasPrefix(k, localRangeKeyPrefix) {
   206  			delete(postKeys, k)
   207  		}
   208  	}
   209  
   210  	if numKeys := len(postKeys); numKeys > 0 {
   211  		var buf bytes.Buffer
   212  		fmt.Fprintf(&buf, "%d keys were not cleaned up:\n", numKeys)
   213  		for k := range postKeys {
   214  			fmt.Fprintf(&buf, "%s (%q)\n", roachpb.Key(k), k)
   215  		}
   216  		t.Fatal(buf.String())
   217  	}
   218  }
   219  
   220  // TestStoreRangeMergeWithData attempts to merge two ranges, each containing
   221  // data.
   222  func TestStoreRangeMergeWithData(t *testing.T) {
   223  	defer leaktest.AfterTest(t)()
   224  
   225  	for _, retries := range []int64{0, 3} {
   226  		t.Run(fmt.Sprintf("retries=%d", retries), func(t *testing.T) {
   227  			mergeWithData(t, retries)
   228  		})
   229  	}
   230  }
   231  
   232  func mergeWithData(t *testing.T, retries int64) {
   233  	ctx := context.Background()
   234  	storeCfg := kvserver.TestStoreConfig(nil)
   235  	storeCfg.TestingKnobs.DisableReplicateQueue = true
   236  	storeCfg.TestingKnobs.DisableMergeQueue = true
   237  	storeCfg.Clock = nil // manual clock
   238  
   239  	// Maybe inject some retryable errors when the merge transaction commits.
   240  	var mtc *multiTestContext
   241  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
   242  		for _, req := range ba.Requests {
   243  			if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil {
   244  				if atomic.AddInt64(&retries, -1) >= 0 {
   245  					return roachpb.NewError(
   246  						roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err"))
   247  				}
   248  			}
   249  			if req.GetSubsume() != nil {
   250  				// Introduce targeted chaos by forcing a lease acquisition before
   251  				// Subsume can execute. This triggers an unusual code path where the
   252  				// lease acquisition, not Subsume, notices the merge and installs a
   253  				// mergeComplete channel on the replica.
   254  				mtc.advanceClock(ctx)
   255  			}
   256  		}
   257  		return nil
   258  	}
   259  
   260  	mtc = &multiTestContext{
   261  		storeConfig: &storeCfg,
   262  		// This test was written before the multiTestContext started creating many
   263  		// system ranges at startup, and hasn't been update to take that into
   264  		// account.
   265  		startWithSingleRange: true,
   266  	}
   267  
   268  	var store1, store2 *kvserver.Store
   269  	mtc.Start(t, 1)
   270  	store1, store2 = mtc.stores[0], mtc.stores[0]
   271  	defer mtc.Stop()
   272  
   273  	lhsDesc, rhsDesc, pErr := createSplitRanges(ctx, store1)
   274  	if pErr != nil {
   275  		t.Fatal(pErr)
   276  	}
   277  
   278  	content := []byte("testing!")
   279  
   280  	// Write some values left and right of the proposed split key.
   281  	pArgs := putArgs(roachpb.Key("aaa"), content)
   282  	if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), pArgs); pErr != nil {
   283  		t.Fatal(pErr)
   284  	}
   285  	pArgs = putArgs(roachpb.Key("ccc"), content)
   286  	if _, pErr := kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{
   287  		RangeID: rhsDesc.RangeID,
   288  	}, pArgs); pErr != nil {
   289  		t.Fatal(pErr)
   290  	}
   291  
   292  	// Confirm the values are there.
   293  	gArgs := getArgs(roachpb.Key("aaa"))
   294  	if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil {
   295  		t.Fatal(pErr)
   296  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   297  		t.Fatal(err)
   298  	} else if !bytes.Equal(replyBytes, content) {
   299  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   300  	}
   301  	gArgs = getArgs(roachpb.Key("ccc"))
   302  	if reply, pErr := kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{
   303  		RangeID: rhsDesc.RangeID,
   304  	}, gArgs); pErr != nil {
   305  		t.Fatal(pErr)
   306  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   307  		t.Fatal(err)
   308  	} else if !bytes.Equal(replyBytes, content) {
   309  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   310  	}
   311  
   312  	// Merge the b range back into the a range.
   313  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   314  	if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), args); pErr != nil {
   315  		t.Fatal(pErr)
   316  	}
   317  
   318  	// Verify no intents remains on range descriptor keys.
   319  	for _, key := range []roachpb.Key{keys.RangeDescriptorKey(lhsDesc.StartKey), keys.RangeDescriptorKey(rhsDesc.StartKey)} {
   320  		if _, _, err := storage.MVCCGet(
   321  			ctx, store1.Engine(), key, store1.Clock().Now(), storage.MVCCGetOptions{},
   322  		); err != nil {
   323  			t.Fatal(err)
   324  		}
   325  	}
   326  
   327  	// Verify the merge by looking up keys from both ranges.
   328  	lhsRepl := store1.LookupReplica(roachpb.RKey("a"))
   329  	rhsRepl := store1.LookupReplica(roachpb.RKey("c"))
   330  
   331  	if lhsRepl != rhsRepl {
   332  		t.Fatalf("ranges were not merged %+v=%+v", lhsRepl.Desc(), rhsRepl.Desc())
   333  	}
   334  	if startKey := lhsRepl.Desc().StartKey; !bytes.Equal(startKey, roachpb.RKeyMin) {
   335  		t.Fatalf("The start key is not equal to KeyMin %q=%q", startKey, roachpb.RKeyMin)
   336  	}
   337  	if endKey := rhsRepl.Desc().EndKey; !bytes.Equal(endKey, roachpb.RKeyMax) {
   338  		t.Fatalf("The end key is not equal to KeyMax %q=%q", endKey, roachpb.RKeyMax)
   339  	}
   340  
   341  	// Try to get values from after the merge.
   342  	gArgs = getArgs(roachpb.Key("aaa"))
   343  	if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil {
   344  		t.Fatal(pErr)
   345  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   346  		t.Fatal(err)
   347  	} else if !bytes.Equal(replyBytes, content) {
   348  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   349  	}
   350  	gArgs = getArgs(roachpb.Key("ccc"))
   351  	if reply, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{
   352  		RangeID: rhsRepl.RangeID,
   353  	}, gArgs); pErr != nil {
   354  		t.Fatal(pErr)
   355  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   356  		t.Fatal(err)
   357  	} else if !bytes.Equal(replyBytes, content) {
   358  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   359  	}
   360  
   361  	// Put new values after the merge on both sides.
   362  	pArgs = putArgs(roachpb.Key("aaaa"), content)
   363  	if _, pErr := kv.SendWrapped(ctx, store1.TestSender(), pArgs); pErr != nil {
   364  		t.Fatal(pErr)
   365  	}
   366  	pArgs = putArgs(roachpb.Key("cccc"), content)
   367  	if _, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{
   368  		RangeID: rhsRepl.RangeID,
   369  	}, pArgs); pErr != nil {
   370  		t.Fatal(pErr)
   371  	}
   372  
   373  	// Try to get the newly placed values.
   374  	gArgs = getArgs(roachpb.Key("aaaa"))
   375  	if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil {
   376  		t.Fatal(pErr)
   377  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   378  		t.Fatal(err)
   379  	} else if !bytes.Equal(replyBytes, content) {
   380  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   381  	}
   382  	gArgs = getArgs(roachpb.Key("cccc"))
   383  	if reply, pErr := kv.SendWrapped(ctx, store1.TestSender(), gArgs); pErr != nil {
   384  		t.Fatal(pErr)
   385  	} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   386  		t.Fatal(err)
   387  	} else if !bytes.Equal(replyBytes, content) {
   388  		t.Fatalf("actual value %q did not match expected value %q", replyBytes, content)
   389  	}
   390  
   391  	gArgs = getArgs(roachpb.Key("cccc"))
   392  	if _, pErr := kv.SendWrappedWith(ctx, store2, roachpb.Header{
   393  		RangeID: rhsDesc.RangeID,
   394  	}, gArgs); !testutils.IsPError(
   395  		pErr, `r2 was not found`,
   396  	) {
   397  		t.Fatalf("expected get on rhs to fail after merge, but got err=%v", pErr)
   398  	}
   399  
   400  	if atomic.LoadInt64(&retries) >= 0 {
   401  		t.Fatalf("%d retries remaining (expected less than zero)", retries)
   402  	}
   403  }
   404  
   405  // TestStoreRangeMergeTimestampCache verifies that the timestamp cache on the
   406  // LHS is properly updated after a merge.
   407  func TestStoreRangeMergeTimestampCache(t *testing.T) {
   408  	defer leaktest.AfterTest(t)()
   409  
   410  	testutils.RunTrueAndFalse(t, "disjoint-leaseholders", mergeCheckingTimestampCaches)
   411  }
   412  
   413  func mergeCheckingTimestampCaches(t *testing.T, disjointLeaseholders bool) {
   414  	ctx := context.Background()
   415  	storeCfg := kvserver.TestStoreConfig(nil)
   416  	storeCfg.TestingKnobs.DisableMergeQueue = true
   417  	mtc := &multiTestContext{storeConfig: &storeCfg}
   418  	var lhsStore, rhsStore *kvserver.Store
   419  	if disjointLeaseholders {
   420  		mtc.Start(t, 2)
   421  		lhsStore, rhsStore = mtc.Store(0), mtc.Store(1)
   422  	} else {
   423  		mtc.Start(t, 1)
   424  		lhsStore, rhsStore = mtc.Store(0), mtc.Store(0)
   425  	}
   426  	defer mtc.Stop()
   427  
   428  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, lhsStore)
   429  	if err != nil {
   430  		t.Fatal(err)
   431  	}
   432  
   433  	if disjointLeaseholders {
   434  		mtc.replicateRange(lhsDesc.RangeID, 1)
   435  		mtc.replicateRange(rhsDesc.RangeID, 1)
   436  		mtc.transferLease(ctx, rhsDesc.RangeID, 0, 1)
   437  		testutils.SucceedsSoon(t, func() error {
   438  			rhsRepl, err := rhsStore.GetReplica(rhsDesc.RangeID)
   439  			if err != nil {
   440  				return err
   441  			}
   442  			if !rhsRepl.OwnsValidLease(mtc.clock().Now()) {
   443  				return errors.New("rhs store does not own valid lease for rhs range")
   444  			}
   445  			return nil
   446  		})
   447  	}
   448  
   449  	// Write a key to the RHS.
   450  	rhsKey := roachpb.Key("c")
   451  	if _, pErr := kv.SendWrappedWith(ctx, rhsStore, roachpb.Header{
   452  		RangeID: rhsDesc.RangeID,
   453  	}, incrementArgs(rhsKey, 1)); pErr != nil {
   454  		t.Fatal(pErr)
   455  	}
   456  
   457  	readTS := mtc.clock().Now()
   458  
   459  	// Simulate a read on the RHS from a node with a newer clock.
   460  	var ba roachpb.BatchRequest
   461  	ba.Timestamp = readTS
   462  	ba.RangeID = rhsDesc.RangeID
   463  	ba.Add(getArgs(rhsKey))
   464  	if br, pErr := rhsStore.Send(ctx, ba); pErr != nil {
   465  		t.Fatal(pErr)
   466  	} else if v, err := br.Responses[0].GetGet().Value.GetInt(); err != nil {
   467  		t.Fatal(err)
   468  	} else if v != 1 {
   469  		t.Fatalf("expected 1, but got %d", v)
   470  	} else if br.Timestamp != readTS {
   471  		t.Fatalf("expected read to execute at %v, but executed at %v", readTS, br.Timestamp)
   472  	}
   473  
   474  	// Simulate a txn abort on the RHS from a node with a newer clock. Because
   475  	// the transaction record for the pushee was not yet written, this will bump
   476  	// the timestamp cache to record the abort.
   477  	pushee := roachpb.MakeTransaction("pushee", rhsKey, roachpb.MinUserPriority, readTS, 0)
   478  	pusher := roachpb.MakeTransaction("pusher", rhsKey, roachpb.MaxUserPriority, readTS, 0)
   479  	ba = roachpb.BatchRequest{}
   480  	ba.Timestamp = mtc.clock().Now()
   481  	ba.RangeID = rhsDesc.RangeID
   482  	ba.Add(pushTxnArgs(&pusher, &pushee, roachpb.PUSH_ABORT))
   483  	if br, pErr := rhsStore.Send(ctx, ba); pErr != nil {
   484  		t.Fatal(pErr)
   485  	} else if txn := br.Responses[0].GetPushTxn().PusheeTxn; txn.Status != roachpb.ABORTED {
   486  		t.Fatalf("expected aborted pushee, but got %v", txn)
   487  	}
   488  
   489  	// Merge the RHS back into the LHS.
   490  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   491  	if _, pErr := kv.SendWrapped(ctx, lhsStore.TestSender(), args); pErr != nil {
   492  		t.Fatal(pErr)
   493  	}
   494  
   495  	// After the merge, attempt to write under the read. The batch should get
   496  	// forwarded to a timestamp after the read.
   497  	ba = roachpb.BatchRequest{}
   498  	ba.Timestamp = readTS
   499  	ba.RangeID = lhsDesc.RangeID
   500  	ba.Add(incrementArgs(rhsKey, 1))
   501  	if br, pErr := lhsStore.Send(ctx, ba); pErr != nil {
   502  		t.Fatal(pErr)
   503  	} else if br.Timestamp.LessEq(readTS) {
   504  		t.Fatalf("expected write to execute after %v, but executed at %v", readTS, br.Timestamp)
   505  	}
   506  
   507  	// Attempt to create a transaction record for the pushee transaction, which
   508  	// was aborted before the merge. This should be rejected with a transaction
   509  	// aborted error. The reason will depend on whether the leaseholders were
   510  	// disjoint or not because disjoint leaseholders will lead to a loss of
   511  	// resolution in the timestamp cache. Either way though, the transaction
   512  	// should not be allowed to create its record.
   513  	hb, hbH := heartbeatArgs(&pushee, mtc.clock().Now())
   514  	ba = roachpb.BatchRequest{}
   515  	ba.Header = hbH
   516  	ba.RangeID = lhsDesc.RangeID
   517  	ba.Add(hb)
   518  	var expReason roachpb.TransactionAbortedReason
   519  	if disjointLeaseholders {
   520  		expReason = roachpb.ABORT_REASON_TIMESTAMP_CACHE_REJECTED
   521  	} else {
   522  		expReason = roachpb.ABORT_REASON_ABORTED_RECORD_FOUND
   523  	}
   524  	if _, pErr := lhsStore.Send(ctx, ba); pErr == nil {
   525  		t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr)
   526  	} else if abortErr, ok := pErr.GetDetail().(*roachpb.TransactionAbortedError); !ok {
   527  		t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr)
   528  	} else if abortErr.Reason != expReason {
   529  		t.Fatalf("expected TransactionAbortedError(%s) but got %v", expReason, pErr)
   530  	}
   531  }
   532  
   533  // TestStoreRangeMergeTimestampCacheCausality verifies that range merges update
   534  // the clock on the subsuming store as necessary to preserve causality.
   535  //
   536  // The test simulates a particularly diabolical sequence of events in which
   537  // causality information is not communicated through the normal channels.
   538  // Suppose two adjacent ranges, A and B, are collocated on S2, S3, and S4. (S1
   539  // is omitted for consistency with the store numbering in the test itself.) S3
   540  // holds the lease on A, while S4 holds the lease on B. Every store's clock
   541  // starts at time T1.
   542  //
   543  // To merge A and B, S3 will launch a merge transaction that sends several RPCs
   544  // to S4. Suppose that, just before S4 begins executing the Subsume request, a
   545  // read sneaks in for some key K at a large timestamp T3. S4 will bump its clock
   546  // from T1 to T3, so when the Subsume goes to determine the current time to use
   547  // for the FreezeStart field in the Subsume response, it will use T3. When S3
   548  // completes the merge, it will thus use T3 as the timestamp cache's low water
   549  // mark for the keys that previously belonged to B.
   550  //
   551  // Importantly, S3 must also update its clock from T1 to T3. Otherwise, as this
   552  // test demonstrates, it is possible for S3 to send a lease to another store, in
   553  // this case S2, that begins at T2. S2 will then assume it is free to accept a
   554  // write at T2, when in fact we already served a read at T3. This would be a
   555  // serializability violation!
   556  //
   557  // Note that there are several mechanisms that *almost* prevent this problem. If
   558  // the read of K at T3 occurs slightly earlier, the batch response for Subsume
   559  // will set the Now field to T3, which S3 will use to bump its clock.
   560  // (BatchResponse.Now is computed when the batch is received, not when it
   561  // finishes executing.) If S3 receives a write for K at T2, it will a) properly
   562  // bump the write to T4, because its timestamp cache is up to date, and then b)
   563  // bump its clock to T4. Or if S4 were to send a single RPC to S3, S3 would bump
   564  // its clock based on the BatchRequest.Timestamp.
   565  //
   566  // In short, this sequence of events is likely to be exceedingly unlikely in
   567  // practice, but is subtle enough to warrant a test.
   568  func TestStoreRangeMergeTimestampCacheCausality(t *testing.T) {
   569  	defer leaktest.AfterTest(t)()
   570  
   571  	ctx := context.Background()
   572  	storeCfg := kvserver.TestStoreConfig(nil /* clock */)
   573  	storeCfg.TestingKnobs.DisableMergeQueue = true
   574  	storeCfg.Clock = nil // manual clock
   575  	mtc := &multiTestContext{storeConfig: &storeCfg}
   576  	var readTS hlc.Timestamp
   577  	rhsKey := roachpb.Key("c")
   578  	mtc.storeConfig.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
   579  		if ba.IsSingleSubsumeRequest() {
   580  			// Before we execute a Subsume request, execute a read on the same store
   581  			// at a much higher timestamp.
   582  			gba := roachpb.BatchRequest{}
   583  			gba.RangeID = ba.RangeID
   584  			gba.Timestamp = ba.Timestamp.Add(42 /* wallTime */, 0 /* logical */)
   585  			gba.Add(getArgs(rhsKey))
   586  			store := mtc.Store(int(ba.Header.Replica.StoreID - 1))
   587  			gbr, pErr := store.Send(ctx, gba)
   588  			if pErr != nil {
   589  				t.Error(pErr) // different goroutine, so can't use t.Fatal
   590  			}
   591  			readTS = gbr.Timestamp
   592  		}
   593  		return nil
   594  	}
   595  	for i := 0; i < 4; i++ {
   596  		clock := hlc.NewClock(hlc.NewManualClock(123).UnixNano, time.Millisecond /* maxOffset */)
   597  		mtc.clocks = append(mtc.clocks, clock)
   598  	}
   599  	mtc.Start(t, 4)
   600  	defer mtc.Stop()
   601  	distSender := mtc.distSenders[0]
   602  
   603  	for _, key := range []roachpb.Key{roachpb.Key("a"), roachpb.Key("b")} {
   604  		if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key)); pErr != nil {
   605  			t.Fatal(pErr)
   606  		}
   607  	}
   608  
   609  	lhsRangeID := mtc.Store(0).LookupReplica(roachpb.RKey("a")).RangeID
   610  	rhsRangeID := mtc.Store(0).LookupReplica(roachpb.RKey("b")).RangeID
   611  
   612  	// Replicate [a, b) to s2, s3, and s4, and put the lease on s3.
   613  	mtc.replicateRange(lhsRangeID, 1, 2, 3)
   614  	mtc.transferLease(ctx, lhsRangeID, 0, 2)
   615  	mtc.unreplicateRange(lhsRangeID, 0)
   616  
   617  	// Replicate [b, Max) to s2, s3, and s4, and put the lease on s4.
   618  	mtc.replicateRange(rhsRangeID, 1, 2, 3)
   619  	mtc.transferLease(ctx, rhsRangeID, 0, 3)
   620  	mtc.unreplicateRange(rhsRangeID, 0)
   621  
   622  	// N.B. We isolate r1 on s1 so that node liveness heartbeats do not interfere
   623  	// with our precise clock management on s2, s3, and s4.
   624  
   625  	// Write a key to [b, Max).
   626  	if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(rhsKey, 1)); pErr != nil {
   627  		t.Fatal(pErr)
   628  	}
   629  
   630  	// Wait for all relevant stores to have the same value. This indirectly
   631  	// ensures the lease transfers have applied on all relevant stores.
   632  	mtc.waitForValues(rhsKey, []int64{0, 1, 1, 1})
   633  
   634  	// Merge [a, b) and [b, Max). Our request filter above will intercept the
   635  	// merge and execute a read with a large timestamp immediately before the
   636  	// Subsume request executes.
   637  	if _, pErr := kv.SendWrappedWith(ctx, mtc.Store(2), roachpb.Header{
   638  		RangeID: lhsRangeID,
   639  	}, adminMergeArgs(roachpb.Key("a"))); pErr != nil {
   640  		t.Fatal(pErr)
   641  	}
   642  
   643  	// Immediately transfer the lease on the merged range [a, Max) from s3 to s2.
   644  	// To test that it is, in fact, the merge trigger that properly bumps s3's
   645  	// clock, s3 must not send or receive any requests before it transfers the
   646  	// lease, as those requests could bump s3's clock through other code paths.
   647  	mtc.transferLease(ctx, lhsRangeID, 2, 1)
   648  	testutils.SucceedsSoon(t, func() error {
   649  		lhsRepl1, err := mtc.Store(1).GetReplica(lhsRangeID)
   650  		if err != nil {
   651  			return err
   652  		}
   653  		if !lhsRepl1.OwnsValidLease(mtc.clocks[1].Now()) {
   654  			return errors.New("s2 does not own valid lease for lhs range")
   655  		}
   656  		return nil
   657  	})
   658  
   659  	// Attempt to write at the same time as the read. The write's timestamp
   660  	// should be forwarded to after the read.
   661  	ba := roachpb.BatchRequest{}
   662  	ba.Timestamp = readTS
   663  	ba.RangeID = lhsRangeID
   664  	ba.Add(incrementArgs(rhsKey, 1))
   665  	if br, pErr := mtc.Store(1).Send(ctx, ba); pErr != nil {
   666  		t.Fatal(pErr)
   667  	} else if br.Timestamp.LessEq(readTS) {
   668  		t.Fatalf("expected write to execute after %v, but executed at %v", readTS, br.Timestamp)
   669  	}
   670  }
   671  
   672  // TestStoreRangeMergeLastRange verifies that merging the last range fails.
   673  func TestStoreRangeMergeLastRange(t *testing.T) {
   674  	defer leaktest.AfterTest(t)()
   675  
   676  	ctx := context.Background()
   677  	mtc := multiTestContext{
   678  		// This test was written before the multiTestContext started creating many
   679  		// system ranges at startup, and hasn't been update to take that into
   680  		// account.
   681  		startWithSingleRange: true,
   682  	}
   683  	mtc.Start(t, 1)
   684  	defer mtc.Stop()
   685  	store := mtc.Store(0)
   686  
   687  	// Merge last range.
   688  	_, pErr := kv.SendWrapped(ctx, store.TestSender(), adminMergeArgs(roachpb.KeyMin))
   689  	if !testutils.IsPError(pErr, "cannot merge final range") {
   690  		t.Fatalf("expected 'cannot merge final range' error; got %s", pErr)
   691  	}
   692  }
   693  
   694  func TestStoreRangeMergeTxnFailure(t *testing.T) {
   695  	defer leaktest.AfterTest(t)()
   696  
   697  	ctx := context.Background()
   698  	storeCfg := kvserver.TestStoreConfig(nil)
   699  	storeCfg.TestingKnobs.DisableSplitQueue = true
   700  	storeCfg.TestingKnobs.DisableMergeQueue = true
   701  
   702  	// Install a store filter that maybe injects retryable errors into a merge
   703  	// transaction before ultimately aborting the merge.
   704  	var retriesBeforeFailure int64
   705  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
   706  		for _, req := range ba.Requests {
   707  			if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil {
   708  				if atomic.AddInt64(&retriesBeforeFailure, -1) >= 0 {
   709  					return roachpb.NewError(
   710  						roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err"))
   711  				}
   712  				return roachpb.NewError(errors.New("injected permafail"))
   713  			}
   714  		}
   715  		return nil
   716  	}
   717  
   718  	mtc := &multiTestContext{storeConfig: &storeCfg}
   719  	mtc.Start(t, 1)
   720  	defer mtc.Stop()
   721  	store := mtc.Store(0)
   722  	kvDB := store.DB()
   723  
   724  	if err := kvDB.Put(ctx, "aa", "val"); err != nil {
   725  		t.Fatal(err)
   726  	}
   727  	if err := kvDB.Put(ctx, "cc", "val"); err != nil {
   728  		t.Fatal(err)
   729  	}
   730  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store)
   731  	if err != nil {
   732  		t.Fatal(err)
   733  	}
   734  
   735  	verifyLHSAndRHSLive := func() {
   736  		t.Helper()
   737  		for _, tc := range []struct {
   738  			rangeID roachpb.RangeID
   739  			key     roachpb.Key
   740  		}{
   741  			{lhsDesc.RangeID, roachpb.Key("aa")},
   742  			{rhsDesc.RangeID, roachpb.Key("cc")},
   743  		} {
   744  			if reply, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
   745  				RangeID: tc.rangeID,
   746  			}, getArgs(tc.key)); pErr != nil {
   747  				t.Fatal(pErr)
   748  			} else if replyBytes, err := reply.(*roachpb.GetResponse).Value.GetBytes(); err != nil {
   749  				t.Fatal(err)
   750  			} else if !bytes.Equal(replyBytes, []byte("val")) {
   751  				t.Fatalf("actual value %q did not match expected value %q", replyBytes, []byte("val"))
   752  			}
   753  		}
   754  	}
   755  
   756  	attemptMerge := func() {
   757  		t.Helper()
   758  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   759  		_, pErr := kv.SendWrapped(ctx, store.TestSender(), args)
   760  		if exp := "injected permafail"; !testutils.IsPError(pErr, exp) {
   761  			t.Fatalf("expected %q error, but got %q", exp, pErr)
   762  		}
   763  	}
   764  
   765  	verifyLHSAndRHSLive()
   766  
   767  	atomic.StoreInt64(&retriesBeforeFailure, 0)
   768  	attemptMerge()
   769  	verifyLHSAndRHSLive()
   770  	if atomic.LoadInt64(&retriesBeforeFailure) >= 0 {
   771  		t.Fatalf("%d retries remaining (expected less than zero)", retriesBeforeFailure)
   772  	}
   773  
   774  	atomic.StoreInt64(&retriesBeforeFailure, 3)
   775  	attemptMerge()
   776  	verifyLHSAndRHSLive()
   777  	if atomic.LoadInt64(&retriesBeforeFailure) >= 0 {
   778  		t.Fatalf("%d retries remaining (expected less than zero)", retriesBeforeFailure)
   779  	}
   780  }
   781  
   782  // TestStoreRangeSplitMergeGeneration verifies that splits and merges both
   783  // update the range descriptor generations of the involved ranges according to
   784  // the comment on the RangeDescriptor.Generation field.
   785  func TestStoreRangeSplitMergeGeneration(t *testing.T) {
   786  	defer leaktest.AfterTest(t)()
   787  
   788  	testutils.RunTrueAndFalse(t, "rhsHasHigherGen", func(t *testing.T, rhsHasHigherGen bool) {
   789  		s, _, _ := serverutils.StartServer(t, base.TestServerArgs{
   790  			Knobs: base.TestingKnobs{
   791  				Store: &kvserver.StoreTestingKnobs{
   792  					// Disable both splits and merges so that we're in full
   793  					// control over them.
   794  					DisableMergeQueue: true,
   795  					DisableSplitQueue: true,
   796  				},
   797  			},
   798  		})
   799  		defer s.Stopper().Stop(context.Background())
   800  
   801  		leftKey := roachpb.Key("z")
   802  		rightKey := leftKey.Next().Next()
   803  
   804  		// First, split at the left key for convenience, so that we can check
   805  		// leftDesc.StartKey == leftKey later.
   806  		_, _, err := s.SplitRange(leftKey)
   807  		assert.NoError(t, err)
   808  
   809  		store, err := s.GetStores().(*kvserver.Stores).GetStore(s.GetFirstStoreID())
   810  		assert.NoError(t, err)
   811  		leftRepl := store.LookupReplica(keys.MustAddr(leftKey))
   812  		assert.NotNil(t, leftRepl)
   813  		preSplitGen := leftRepl.Desc().Generation
   814  		leftDesc, rightDesc, err := s.SplitRange(rightKey)
   815  		assert.NoError(t, err)
   816  
   817  		// Split should increment the LHS' generation and also propagate the result
   818  		// to the RHS.
   819  		assert.Equal(t, preSplitGen+1, leftDesc.Generation)
   820  		assert.Equal(t, preSplitGen+1, rightDesc.Generation)
   821  
   822  		if rhsHasHigherGen {
   823  			// Split the RHS again to increment its generation once more, so that
   824  			// we get (assuming preSplitGen=1):
   825  			//
   826  			// |--left@2---||---right@3---||--don't care--|
   827  			//
   828  			rightDesc, _, err = s.SplitRange(rightKey.Next())
   829  			assert.NoError(t, err)
   830  			assert.Equal(t, preSplitGen+2, rightDesc.Generation)
   831  		} else {
   832  			// Split and merge the LHS to increment the generation (it ends up
   833  			// being incremented by two). Note that leftKey.Next() is still in
   834  			// the left range. Assuming preSplitGen=1, we'll end up in the
   835  			// situation:
   836  			//
   837  			// |--left@4---||---right@2---|
   838  			var tmpRightDesc roachpb.RangeDescriptor
   839  			leftDesc, tmpRightDesc, err = s.SplitRange(leftKey.Next())
   840  			assert.Equal(t, preSplitGen+2, leftDesc.Generation)
   841  			assert.Equal(t, preSplitGen+2, tmpRightDesc.Generation)
   842  			assert.NoError(t, err)
   843  			leftDesc, err = s.MergeRanges(leftKey)
   844  			assert.NoError(t, err)
   845  			assert.Equal(t, preSplitGen+3, leftDesc.Generation)
   846  		}
   847  
   848  		// Make sure the split/merge shenanigans above didn't get the range
   849  		// descriptors confused.
   850  		assert.Equal(t, leftKey, leftDesc.StartKey.AsRawKey())
   851  		assert.Equal(t, rightKey, rightDesc.StartKey.AsRawKey())
   852  
   853  		// Merge the two ranges back to verify that the resulting descriptor
   854  		// has the correct generation.
   855  		mergedDesc, err := s.MergeRanges(leftKey)
   856  		assert.NoError(t, err)
   857  
   858  		maxPreMergeGen := leftDesc.Generation
   859  		if rhsGen := rightDesc.Generation; rhsGen > maxPreMergeGen {
   860  			maxPreMergeGen = rhsGen
   861  		}
   862  
   863  		assert.Equal(t, maxPreMergeGen+1, mergedDesc.Generation)
   864  		assert.Equal(t, leftDesc.RangeID, mergedDesc.RangeID)
   865  	})
   866  }
   867  
   868  // TestStoreRangeMergeStats starts by splitting a range, then writing random
   869  // data to both sides of the split. It then merges the ranges and verifies the
   870  // merged range has stats consistent with recomputations.
   871  func TestStoreRangeMergeStats(t *testing.T) {
   872  	defer leaktest.AfterTest(t)()
   873  	ctx := context.Background()
   874  	storeCfg := kvserver.TestStoreConfig(nil)
   875  	storeCfg.TestingKnobs.DisableMergeQueue = true
   876  	storeCfg.Clock = nil // manual clock
   877  	mtc := &multiTestContext{storeConfig: &storeCfg}
   878  	mtc.Start(t, 1)
   879  	defer mtc.Stop()
   880  	store := mtc.Store(0)
   881  
   882  	// Split the range.
   883  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store)
   884  	if err != nil {
   885  		t.Fatal(err)
   886  	}
   887  
   888  	// Write some values left and right of the proposed split key.
   889  	kvserver.WriteRandomDataToRange(t, store, lhsDesc.RangeID, []byte("aaa"))
   890  	kvserver.WriteRandomDataToRange(t, store, rhsDesc.RangeID, []byte("ccc"))
   891  
   892  	// Litter some abort span records. txn1 will leave a record on the LHS, txn2
   893  	// will leave a record on the RHS, and txn3 will leave a record on both. This
   894  	// tests whether the merge code properly accounts for merging abort span
   895  	// records for the same transaction.
   896  	txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
   897  	if err := txn1.Put(ctx, "a-txn1", "val"); err != nil {
   898  		t.Fatal(err)
   899  	}
   900  	txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
   901  	if err := txn2.Put(ctx, "c-txn2", "val"); err != nil {
   902  		t.Fatal(err)
   903  	}
   904  	txn3 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
   905  	if err := txn3.Put(ctx, "a-txn3", "val"); err != nil {
   906  		t.Fatal(err)
   907  	}
   908  	if err := txn3.Put(ctx, "c-txn3", "val"); err != nil {
   909  		t.Fatal(err)
   910  	}
   911  	hiPriTxn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
   912  	hiPriTxn.TestingSetPriority(enginepb.MaxTxnPriority)
   913  	for _, key := range []string{"a-txn1", "c-txn2", "a-txn3", "c-txn3"} {
   914  		if err := hiPriTxn.Put(ctx, key, "val"); err != nil {
   915  			t.Fatal(err)
   916  		}
   917  	}
   918  	if err := hiPriTxn.Commit(ctx); err != nil {
   919  		t.Fatal(err)
   920  	}
   921  	// Leave txn1-txn3 open so that their abort span records exist during the
   922  	// merge below.
   923  
   924  	// Get the range stats for both ranges now that we have data.
   925  	snap := store.Engine().NewSnapshot()
   926  	defer snap.Close()
   927  	msA, err := stateloader.Make(lhsDesc.RangeID).LoadMVCCStats(ctx, snap)
   928  	if err != nil {
   929  		t.Fatal(err)
   930  	}
   931  	msB, err := stateloader.Make(rhsDesc.RangeID).LoadMVCCStats(ctx, snap)
   932  	if err != nil {
   933  		t.Fatal(err)
   934  	}
   935  
   936  	// Stats should agree with recomputation.
   937  	if err := verifyRecomputedStats(snap, lhsDesc, msA, mtc.manualClock.UnixNano()); err != nil {
   938  		t.Fatalf("failed to verify range A's stats before split: %+v", err)
   939  	}
   940  	if err := verifyRecomputedStats(snap, rhsDesc, msB, mtc.manualClock.UnixNano()); err != nil {
   941  		t.Fatalf("failed to verify range B's stats before split: %+v", err)
   942  	}
   943  
   944  	mtc.manualClock.Increment(100)
   945  
   946  	// Merge the b range back into the a range.
   947  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
   948  	if _, err := kv.SendWrapped(ctx, store.TestSender(), args); err != nil {
   949  		t.Fatal(err)
   950  	}
   951  	replMerged := store.LookupReplica(lhsDesc.StartKey)
   952  
   953  	// Get the range stats for the merged range and verify.
   954  	snap = store.Engine().NewSnapshot()
   955  	defer snap.Close()
   956  	msMerged, err := stateloader.Make(replMerged.RangeID).LoadMVCCStats(ctx, snap)
   957  	if err != nil {
   958  		t.Fatal(err)
   959  	}
   960  
   961  	// Merged stats should agree with recomputation.
   962  	nowNanos := mtc.manualClock.UnixNano()
   963  	msMerged.AgeTo(nowNanos)
   964  	if err := verifyRecomputedStats(snap, replMerged.Desc(), msMerged, nowNanos); err != nil {
   965  		t.Errorf("failed to verify range's stats after merge: %+v", err)
   966  	}
   967  }
   968  
   969  func TestStoreRangeMergeInFlightTxns(t *testing.T) {
   970  	defer leaktest.AfterTest(t)()
   971  
   972  	ctx := context.Background()
   973  	storeCfg := kvserver.TestStoreConfig(nil)
   974  	storeCfg.TestingKnobs.DisableReplicateQueue = true
   975  	storeCfg.TestingKnobs.DisableMergeQueue = true
   976  	mtc := &multiTestContext{storeConfig: &storeCfg}
   977  	mtc.Start(t, 1)
   978  	defer mtc.Stop()
   979  	store := mtc.Store(0)
   980  
   981  	// Create two adjacent ranges.
   982  	setupReplicas := func() (lhsDesc, rhsDesc *roachpb.RangeDescriptor, err error) {
   983  		lhsDesc, rhsDesc, err = createSplitRanges(ctx, store)
   984  		if err != nil {
   985  			return nil, nil, err
   986  		}
   987  		return lhsDesc, rhsDesc, nil
   988  	}
   989  
   990  	// Verify that a transaction can span a merge.
   991  	t.Run("valid", func(t *testing.T) {
   992  		lhsDesc, _, err := setupReplicas()
   993  		if err != nil {
   994  			t.Fatal(err)
   995  		}
   996  		lhsKey, rhsKey := roachpb.Key("aa"), roachpb.Key("cc")
   997  
   998  		txn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
   999  		// Put the key on the RHS side first so ownership of the transaction record
  1000  		// will need to transfer to the LHS range during the merge.
  1001  		if err := txn.Put(ctx, rhsKey, t.Name()); err != nil {
  1002  			t.Fatal(err)
  1003  		}
  1004  		if err := txn.Put(ctx, lhsKey, t.Name()); err != nil {
  1005  			t.Fatal(err)
  1006  		}
  1007  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1008  		if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
  1009  			t.Fatal(pErr)
  1010  		}
  1011  		if err := txn.Commit(ctx); err != nil {
  1012  			t.Fatal(err)
  1013  		}
  1014  
  1015  		for _, key := range []roachpb.Key{lhsKey, rhsKey} {
  1016  			kv, err := store.DB().Get(ctx, key)
  1017  			if err != nil {
  1018  				t.Fatal(err)
  1019  			} else if string(kv.ValueBytes()) != t.Name() {
  1020  				t.Fatalf("actual value %q did not match expected value %q", kv.ValueBytes(), t.Name())
  1021  			}
  1022  		}
  1023  	})
  1024  
  1025  	// Verify that a transaction's abort span records are preserved when the
  1026  	// transaction spans a merge.
  1027  	t.Run("abort-span", func(t *testing.T) {
  1028  		lhsDesc, _, err := setupReplicas()
  1029  		if err != nil {
  1030  			t.Fatal(err)
  1031  		}
  1032  		rhsKey := roachpb.Key("cc")
  1033  
  1034  		// Create a transaction that will be aborted before the merge but won't
  1035  		// realize until after the merge.
  1036  		txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  1037  		// Put the key on the RHS side so ownership of the transaction record and
  1038  		// abort span records will need to transfer to the LHS during the merge.
  1039  		if err := txn1.Put(ctx, rhsKey, t.Name()); err != nil {
  1040  			t.Fatal(err)
  1041  		}
  1042  
  1043  		// Create and commit a txn that aborts txn1.
  1044  		txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  1045  		txn2.TestingSetPriority(enginepb.MaxTxnPriority)
  1046  		if err := txn2.Put(ctx, rhsKey, "muhahahah"); err != nil {
  1047  			t.Fatal(err)
  1048  		}
  1049  		if err := txn2.Commit(ctx); err != nil {
  1050  			t.Fatal(err)
  1051  		}
  1052  
  1053  		// Complete the merge.
  1054  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1055  		if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
  1056  			t.Fatal(pErr)
  1057  		}
  1058  		expErr := "TransactionAbortedError(ABORT_REASON_ABORT_SPAN)"
  1059  		if _, err := txn1.Get(ctx, rhsKey); !testutils.IsError(err, regexp.QuoteMeta(expErr)) {
  1060  			t.Fatalf("expected %s but got %v", expErr, err)
  1061  		}
  1062  	})
  1063  
  1064  	// Verify that the transaction wait queue on the right-hand range in a merge
  1065  	// is cleared if the merge commits.
  1066  	t.Run("wait-queue", func(t *testing.T) {
  1067  		lhsDesc, rhsDesc, err := setupReplicas()
  1068  		if err != nil {
  1069  			t.Fatal(err)
  1070  		}
  1071  		rhsKey := roachpb.Key("cc")
  1072  
  1073  		// Set a timeout, and set the the transaction liveness threshold to
  1074  		// something much larger than our timeout. We want transactions to get stuck
  1075  		// in the transaction wait queue and trigger the timeout if we forget to
  1076  		// clear it.
  1077  		var cancel func()
  1078  		ctx, cancel = context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration)
  1079  		defer cancel()
  1080  		defer txnwait.TestingOverrideTxnLivenessThreshold(2 * testutils.DefaultSucceedsSoonDuration)
  1081  
  1082  		// Create a transaction that won't complete until after the merge.
  1083  		txn1 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  1084  		// Put the key on the RHS side so ownership of the transaction record and
  1085  		// abort span records will need to transfer to the LHS during the merge.
  1086  		if err := txn1.Put(ctx, rhsKey, t.Name()); err != nil {
  1087  			t.Fatal(err)
  1088  		}
  1089  
  1090  		// Create a txn that will conflict with txn1.
  1091  		txn2 := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  1092  		txn2ErrCh := make(chan error)
  1093  		go func() {
  1094  			// Get should block on txn1's intent until txn1 commits.
  1095  			kv, err := txn2.Get(ctx, rhsKey)
  1096  			if err != nil {
  1097  				txn2ErrCh <- err
  1098  			} else if string(kv.ValueBytes()) != t.Name() {
  1099  				txn2ErrCh <- errors.Errorf("actual value %q did not match expected value %q", kv.ValueBytes(), t.Name())
  1100  			}
  1101  			txn2ErrCh <- nil
  1102  		}()
  1103  
  1104  		// Wait for txn2 to realize it conflicts with txn1 and enter its wait queue.
  1105  		{
  1106  			repl, err := store.GetReplica(rhsDesc.RangeID)
  1107  			if err != nil {
  1108  				t.Fatal(err)
  1109  			}
  1110  			for {
  1111  				if _, ok := repl.GetConcurrencyManager().TxnWaitQueue().TrackedTxns()[txn1.ID()]; ok {
  1112  					break
  1113  				}
  1114  				select {
  1115  				case <-time.After(10 * time.Millisecond):
  1116  				case <-ctx.Done():
  1117  					t.Fatal("timed out waiting for txn2 to enter wait queue")
  1118  				}
  1119  			}
  1120  		}
  1121  
  1122  		// Complete the merge.
  1123  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1124  		if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
  1125  			t.Fatal(pErr)
  1126  		}
  1127  
  1128  		if err := txn1.Commit(ctx); err != nil {
  1129  			t.Fatal(err)
  1130  		}
  1131  
  1132  		// Now that txn1 has committed, txn2's get operation should complete.
  1133  		select {
  1134  		case err := <-txn2ErrCh:
  1135  			if err != nil {
  1136  				t.Fatal(err)
  1137  			}
  1138  		case <-ctx.Done():
  1139  			t.Fatal("timed out waiting for txn2 to complete get")
  1140  		}
  1141  
  1142  		if err := txn2.Commit(ctx); err != nil {
  1143  			t.Fatal(err)
  1144  		}
  1145  	})
  1146  }
  1147  
  1148  // TestStoreRangeMergeSplitRace_MergeWins (occasionally) reproduces a race where
  1149  // a concurrent merge and split could deadlock. It exercises the case where the
  1150  // merge commits and the split aborts. See the SplitWins variant of this test
  1151  // for the inverse case.
  1152  //
  1153  // The bug works like this. A merge of adjacent ranges P and Q and a split of Q
  1154  // execute concurrently, though the merge executes with an earlier timestamp.
  1155  // The merge updates Q's meta2 range descriptor. The split updates Q's local
  1156  // range descriptor, then tries to update Q's meta2 range descriptor, but runs
  1157  // into the merge's intent and attempts to push the merge. Under our current
  1158  // concurrency control strategy, this results in the split waiting for the merge
  1159  // to complete. The merge then tries to update Q's local range descriptor but
  1160  // runs into the split's intent. While pushing the split, the merge realizes
  1161  // that waiting for the split to complete would cause deadlock, so it aborts the
  1162  // split instead.
  1163  //
  1164  // But before the split can clean up its transaction record and intents, the
  1165  // merge locks Q and launches a goroutine to unlock Q when the merge commits.
  1166  // Then the merge completes, which has a weird side effect: the split's push of
  1167  // the merge will succeed! How is this possible? The split's push request is not
  1168  // guaranteed to notice that the split has been aborted before it notices that
  1169  // the merge has completed. So the aborted split winds up resolving the merge's
  1170  // intent on Q's meta2 range descriptor and leaving its own intent in its place.
  1171  //
  1172  // In the past, the merge watcher goroutine would perform a range lookup for Q;
  1173  // this would indirectly wait for the merge to complete by waiting for its
  1174  // intent in meta2 to be resolved. In this case, however, it's the *split*'s
  1175  // intent that the watcher goroutine sees. This intent can't be resolved because
  1176  // the split's transaction record is located on the locked range Q! And so Q can
  1177  // never be unlocked.
  1178  //
  1179  // This bug was fixed by teaching the watcher goroutine to push the merge
  1180  // transaction directly instead of doing so indirectly by querying meta2.
  1181  //
  1182  // Attempting a foolproof reproduction of the bug proved challenging and would
  1183  // have required a mess of store filters. This test takes a simpler approach of
  1184  // running the necessary split and a merge concurrently and allowing the race
  1185  // scheduler to occasionally strike the right interleaving. At the time of
  1186  // writing, the test would reliably reproduce the bug in about 50 runs (about
  1187  // ten seconds of stress on an eight core laptop).
  1188  func TestStoreRangeMergeSplitRace_MergeWins(t *testing.T) {
  1189  	defer leaktest.AfterTest(t)()
  1190  
  1191  	ctx := context.Background()
  1192  	storeCfg := kvserver.TestStoreConfig(nil)
  1193  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1194  	mtc := &multiTestContext{storeConfig: &storeCfg}
  1195  	mtc.Start(t, 1)
  1196  	defer mtc.Stop()
  1197  	distSender := mtc.distSenders[0]
  1198  
  1199  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, mtc.Store(0))
  1200  	if err != nil {
  1201  		t.Fatal(err)
  1202  	}
  1203  
  1204  	splitErrCh := make(chan error)
  1205  	go func() {
  1206  		time.Sleep(10 * time.Millisecond)
  1207  		splitArgs := adminSplitArgs(rhsDesc.StartKey.AsRawKey().Next())
  1208  		_, pErr := kv.SendWrapped(ctx, distSender, splitArgs)
  1209  		splitErrCh <- pErr.GoError()
  1210  	}()
  1211  
  1212  	mergeArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1213  	if _, pErr := kv.SendWrapped(ctx, distSender, mergeArgs); pErr != nil {
  1214  		t.Fatal(pErr)
  1215  	}
  1216  
  1217  	if err := <-splitErrCh; err != nil {
  1218  		t.Fatal(err)
  1219  	}
  1220  }
  1221  
  1222  // TestStoreRangeMergeSplitRace_SplitWins reproduces a race where a concurrent
  1223  // merge and split could deadlock. It exercises the case where the split commits
  1224  // and the merge aborts. See the MergeWins variant of this test for the inverse
  1225  // case.
  1226  //
  1227  // The bug works like this. A merge of adjacent ranges P and Q and a split of Q
  1228  // execute concurrently, though the merge executes with an earlier timestamp.
  1229  // First, the merge transaction reads Q's local range descriptor to determine
  1230  // the combined range's range descriptor. Then it writes an intent to update P's
  1231  // local range descriptor.
  1232  //
  1233  // Next, the split transaction runs from start to finish, updating Q's local
  1234  // descriptor and its associated meta2 record. Notably, the split transaction
  1235  // does not encounter any intents from the merge transaction, since the merge
  1236  // transaction's only intent so far is on P's local range descriptor, and so the
  1237  // split transaction can happily commit.
  1238  //
  1239  // The merge transaction then continues, writing an intent on Q's local
  1240  // descriptor. Since the merge transaction is executing at an earlier timestamp
  1241  // than the split transaction, the intent is written "under" the updated
  1242  // descriptor written by the split transaction.
  1243  //
  1244  // In the past, the merge transaction would simply push its commit timestamp
  1245  // forward and proceed, even though, upon committing, it would discover that it
  1246  // was forbidden from committing with a pushed timestamp and abort instead. (For
  1247  // why merge transactions cannot forward their commit timestamps, see the
  1248  // discussion on the retry loop within AdminMerge.) This was problematic. Before
  1249  // the doomed merge transaction attempted to commit, it would send a Subsume
  1250  // request, launching a merge watcher goroutine on Q. This watcher goroutine
  1251  // could incorrectly think that the merge transaction committed. Why? To
  1252  // determine whether a merge has truly aborted, the watcher goroutine sends a
  1253  // Get(/Meta2/QEndKey) request with a read uncommitted isolation level. If the
  1254  // Get request returns either nil or a descriptor for a different range, the
  1255  // merge is assumed to have committed. In this case, unfortunately, QEndKey is
  1256  // Q's end key post-split. After all, the split has committed and updated
  1257  // Q's in-memory descriptor. The split transaction's intents are cleaned up
  1258  // asynchronously, however, and since the watcher goroutine is not performing a
  1259  // consistent read it will not wait for the intents to be cleaned up. So
  1260  // Get(/Meta2/QEndKey) might return nil, in which case the watcher goroutine
  1261  // will incorrectly infer that the merge committed. (Note that the watcher
  1262  // goroutine can't perform a consistent read, as that would look up the
  1263  // transaction record on Q and deadlock, since Q is blocked for merging.)
  1264  //
  1265  // The bug was fixed by updating Q's local descriptor with a conditional put
  1266  // instead of a put. This forces the merge transaction to fail early if writing
  1267  // the intent would require forwarding the commit timestamp. In other words,
  1268  // this ensures that the merge watcher goroutine is never launched if the RHS
  1269  // local descriptor is updated while the merge transaction is executing.
  1270  func TestStoreRangeMergeSplitRace_SplitWins(t *testing.T) {
  1271  	defer leaktest.AfterTest(t)()
  1272  
  1273  	ctx := context.Background()
  1274  	storeCfg := kvserver.TestStoreConfig(nil)
  1275  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1276  
  1277  	var distSender *kvcoord.DistSender
  1278  	var lhsDescKey atomic.Value
  1279  	var launchSplit int64
  1280  	var mergeRetries int64
  1281  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  1282  		for _, req := range ba.Requests {
  1283  			if cput := req.GetConditionalPut(); cput != nil {
  1284  				if v := lhsDescKey.Load(); v != nil && v.(roachpb.Key).Equal(cput.Key) {
  1285  					// If this is the first merge attempt, launch the split
  1286  					// before the merge's first write succeeds.
  1287  					if atomic.CompareAndSwapInt64(&launchSplit, 1, 0) {
  1288  						_, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(roachpb.Key("c")))
  1289  						return pErr
  1290  					}
  1291  					// Otherwise, record that the merge retried and proceed.
  1292  					atomic.AddInt64(&mergeRetries, 1)
  1293  				}
  1294  			}
  1295  		}
  1296  		return nil
  1297  	}
  1298  
  1299  	mtc := &multiTestContext{storeConfig: &storeCfg}
  1300  	mtc.Start(t, 1)
  1301  	defer mtc.Stop()
  1302  	distSender = mtc.distSenders[0]
  1303  
  1304  	lhsDesc, _, err := createSplitRanges(ctx, mtc.Store(0))
  1305  	if err != nil {
  1306  		t.Fatal(err)
  1307  	}
  1308  	lhsDescKey.Store(keys.RangeDescriptorKey(lhsDesc.StartKey))
  1309  	atomic.StoreInt64(&launchSplit, 1)
  1310  
  1311  	mergeArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1312  	if _, pErr := kv.SendWrapped(ctx, distSender, mergeArgs); pErr != nil {
  1313  		t.Fatal(pErr)
  1314  	}
  1315  	if atomic.LoadInt64(&mergeRetries) == 0 {
  1316  		t.Fatal("expected merge to retry at least once due to concurrent split")
  1317  	}
  1318  }
  1319  
  1320  // TestStoreRangeMergeRHSLeaseExpiration verifies that, if the right-hand range
  1321  // in a merge loses its lease while a merge is in progress, the new leaseholder
  1322  // does not incorrectly serve traffic before the merge completes.
  1323  func TestStoreRangeMergeRHSLeaseExpiration(t *testing.T) {
  1324  	defer leaktest.AfterTest(t)()
  1325  
  1326  	ctx := context.Background()
  1327  	storeCfg := kvserver.TestStoreConfig(nil)
  1328  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1329  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1330  	storeCfg.Clock = nil // manual clock
  1331  
  1332  	// The synchronization in this test is tricky. The merge transaction is
  1333  	// controlled by the AdminMerge function and normally commits quite quickly,
  1334  	// but we need to ensure an expiration of the RHS's lease occurs while the
  1335  	// merge transaction is open. To do so we install various hooks to observe
  1336  	// and control requests. It's easiest to understand these hooks after you've
  1337  	// read the meat of the test.
  1338  
  1339  	// Install a hook to control when the merge transaction commits.
  1340  	mergeEndTxnReceived := make(chan *roachpb.Transaction, 10) // headroom in case the merge transaction retries
  1341  	finishMerge := make(chan struct{})
  1342  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  1343  		for _, r := range ba.Requests {
  1344  			if et := r.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil {
  1345  				mergeEndTxnReceived <- ba.Txn
  1346  				<-finishMerge
  1347  			}
  1348  		}
  1349  		return nil
  1350  	}
  1351  
  1352  	// Install a hook to observe when a get or a put request for a special key,
  1353  	// rhsSentinel, acquires latches and begins evaluating.
  1354  	const reqConcurrency = 10
  1355  	rhsSentinel := roachpb.Key("rhs-sentinel")
  1356  	reqAcquiredLatch := make(chan struct{}, reqConcurrency)
  1357  	storeCfg.TestingKnobs.TestingLatchFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  1358  		for _, r := range ba.Requests {
  1359  			req := r.GetInner()
  1360  			switch req.Method() {
  1361  			case roachpb.Get, roachpb.Put:
  1362  				if req.Header().Key.Equal(rhsSentinel) {
  1363  					reqAcquiredLatch <- struct{}{}
  1364  				}
  1365  			}
  1366  		}
  1367  		return nil
  1368  	}
  1369  
  1370  	mtc := &multiTestContext{
  1371  		storeConfig: &storeCfg,
  1372  		// This test was written before the multiTestContext started creating many
  1373  		// system ranges at startup, and hasn't been update to take that into
  1374  		// account.
  1375  		startWithSingleRange: true,
  1376  	}
  1377  
  1378  	mtc.Start(t, 2)
  1379  	defer mtc.Stop()
  1380  
  1381  	// Create the ranges to be merged. Put both ranges on both stores, but give
  1382  	// the second store the lease on the RHS. The LHS is largely irrelevant. What
  1383  	// matters is that the RHS exists on two stores so we can transfer its lease
  1384  	// during the merge.
  1385  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, mtc.stores[0])
  1386  	if err != nil {
  1387  		t.Fatal(err)
  1388  	}
  1389  	mtc.replicateRange(lhsDesc.RangeID, 1)
  1390  	mtc.replicateRange(rhsDesc.RangeID, 1)
  1391  	mtc.transferLease(ctx, rhsDesc.RangeID, 0, 1)
  1392  
  1393  	// Launch the merge.
  1394  	mergeErr := make(chan error)
  1395  	go func() {
  1396  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1397  		_, pErr := kv.SendWrapped(ctx, mtc.stores[0].TestSender(), args)
  1398  		mergeErr <- pErr.GoError()
  1399  	}()
  1400  
  1401  	// Wait for the merge transaction to send its EndTxn request. It won't
  1402  	// be able to complete just yet, thanks to the hook we installed above.
  1403  	mergeTxn := <-mergeEndTxnReceived
  1404  
  1405  	// Now's our chance to move the lease on the RHS from the second store to the
  1406  	// first. This isn't entirely straightforward. The replica on the second store
  1407  	// is aware of the merge and is refusing all traffic, so we can't just send a
  1408  	// TransferLease request. Instead, we need to expire the second store's lease,
  1409  	// then acquire the lease on the first store.
  1410  
  1411  	// Before doing so, however, ensure that the merge transaction has written
  1412  	// its transaction record so that it doesn't run into trouble with the low
  1413  	// water mark of the new leaseholder's timestamp cache. This could result in
  1414  	// the transaction being inadvertently aborted during its first attempt,
  1415  	// which this test is not designed to handle. If the merge transaction did
  1416  	// abort then the get requests could complete on r2 before the merge retried.
  1417  	hb, hbH := heartbeatArgs(mergeTxn, mtc.clock().Now())
  1418  	if _, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), hbH, hb); pErr != nil {
  1419  		t.Fatal(pErr)
  1420  	}
  1421  
  1422  	// Turn off liveness heartbeats on the second store, then advance the clock
  1423  	// past the liveness expiration time. This expires all leases on all stores.
  1424  	mtc.nodeLivenesses[1].PauseHeartbeat(true)
  1425  	mtc.advanceClock(ctx)
  1426  
  1427  	// Manually heartbeat the liveness on the first store to ensure it's
  1428  	// considered live. The automatic heartbeat might not come for a while.
  1429  	require.NoError(t, mtc.heartbeatLiveness(ctx, 0))
  1430  
  1431  	// Send several get and put requests to the RHS. The first of these to
  1432  	// arrive will acquire the lease; the remaining requests will wait for that
  1433  	// lease acquisition to complete. Then all requests should block waiting for
  1434  	// the Subsume request to complete. By sending several of these requests in
  1435  	// parallel, we attempt to trigger a race where a request could slip through
  1436  	// on the replica between when the new lease is installed and when the
  1437  	// mergeComplete channel is installed.
  1438  	//
  1439  	// Note that the first request would never hit this race on its own. Nor would
  1440  	// any request that arrived early enough to see an outdated lease in
  1441  	// Replica.mu.state.Lease. All of these requests joined the in-progress lease
  1442  	// acquisition and blocked until the lease command acquires its latches,
  1443  	// at which point the mergeComplete channel was updated. To hit the race, the
  1444  	// request needed to arrive exactly between the update to
  1445  	// Replica.mu.state.Lease and the update to Replica.mu.mergeComplete.
  1446  	//
  1447  	// This race has since been fixed by installing the mergeComplete channel
  1448  	// before the new lease.
  1449  	reqErrs := make(chan *roachpb.Error) // closed when all reqs done
  1450  	var wg sync.WaitGroup
  1451  	wg.Add(reqConcurrency)
  1452  	go func() {
  1453  		wg.Wait()
  1454  		close(reqErrs)
  1455  	}()
  1456  
  1457  	for i := 0; i < reqConcurrency; i++ {
  1458  		go func(i int) {
  1459  			defer wg.Done()
  1460  			// For this test to have a shot at triggering a race, this log message
  1461  			// must be interleaved with the "new range lease" message, like so:
  1462  			//
  1463  			//     I180821 21:57:53.799207 388 storage/client_merge_test.go:1079  starting get 5
  1464  			//     I180821 21:57:53.800122 72 storage/replica_proposal.go:214  [s1,r2/1:{b-/Max}] new range lease ...
  1465  			//     I180821 21:57:53.800447 318 storage/client_merge_test.go:1079  starting get 6
  1466  			//
  1467  			// When this test was written, it would always produce the above
  1468  			// interleaving, and successfully trigger the race when run with the race
  1469  			// detector enabled about 50% of the time.
  1470  			log.Infof(ctx, "starting req %d", i)
  1471  			var req roachpb.Request
  1472  			if i%2 == 0 {
  1473  				req = getArgs(rhsSentinel)
  1474  			} else {
  1475  				req = putArgs(rhsSentinel, []byte(fmt.Sprintf("val%d", i)))
  1476  			}
  1477  			_, pErr := kv.SendWrappedWith(ctx, mtc.stores[0].TestSender(), roachpb.Header{
  1478  				RangeID: rhsDesc.RangeID,
  1479  			}, req)
  1480  			reqErrs <- pErr
  1481  		}(i)
  1482  		time.Sleep(time.Millisecond)
  1483  	}
  1484  
  1485  	// Wait for the get and put requests to acquire latches, which is as far as
  1486  	// they can get while the merge is in progress. Then wait a little bit
  1487  	// longer. This tests that the requests really do get stuck waiting for the
  1488  	// merge to complete without depending too heavily on implementation
  1489  	// details.
  1490  	for i := 0; i < reqConcurrency; i++ {
  1491  		select {
  1492  		case <-reqAcquiredLatch:
  1493  			// Latch acquired.
  1494  		case pErr := <-reqErrs:
  1495  			// Requests may never make it to the latch acquisition if s1 has not
  1496  			// yet learned s2's lease is expired. Instead, we'll see a
  1497  			// NotLeaseholderError.
  1498  			require.IsType(t, &roachpb.NotLeaseHolderError{}, pErr.GetDetail())
  1499  		}
  1500  	}
  1501  	time.Sleep(50 * time.Millisecond)
  1502  
  1503  	// Finally, allow the merge to complete. It should complete successfully.
  1504  	close(finishMerge)
  1505  	require.NoError(t, <-mergeErr)
  1506  
  1507  	// Because the merge completed successfully, r2 has ceased to exist. We
  1508  	// therefore *must* see only RangeNotFoundErrors here from every pending get
  1509  	// and put request. Anything else is a consistency error (or a bug in the
  1510  	// test).
  1511  	for pErr := range reqErrs {
  1512  		require.IsType(t, &roachpb.RangeNotFoundError{}, pErr.GetDetail())
  1513  	}
  1514  }
  1515  
  1516  // TestStoreRangeMergeConcurrentRequests tests merging ranges that are serving
  1517  // other traffic concurrently.
  1518  func TestStoreRangeMergeConcurrentRequests(t *testing.T) {
  1519  	defer leaktest.AfterTest(t)()
  1520  
  1521  	ctx := context.Background()
  1522  	storeCfg := kvserver.TestStoreConfig(nil)
  1523  	storeCfg.TestingKnobs.DisableSplitQueue = true
  1524  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1525  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1526  	storeCfg.Clock = nil // manual clock
  1527  
  1528  	var mtc *multiTestContext
  1529  	storeCfg.TestingKnobs.TestingResponseFilter = func(
  1530  		ctx context.Context, ba roachpb.BatchRequest, _ *roachpb.BatchResponse,
  1531  	) *roachpb.Error {
  1532  		del := ba.Requests[0].GetDelete()
  1533  		if del != nil && bytes.HasSuffix(del.Key, keys.LocalRangeDescriptorSuffix) && rand.Int()%4 == 0 {
  1534  			// After every few deletions of the local range descriptor, expire all
  1535  			// range leases. This makes the following sequence of events quite likely:
  1536  			//
  1537  			//     1. The merge transaction begins and lays down deletion intents for
  1538  			//        the meta2 and local copies of the RHS range descriptor.
  1539  			//     2. The RHS replica loses its lease, thanks to the following call to
  1540  			//        mtc.advanceClock.
  1541  			//     3. A Get request arrives at the RHS replica and triggers a
  1542  			//        synchronous lease acquisition. The lease acquisition notices
  1543  			//        that a merge is in progress and installs a mergeComplete
  1544  			//        channel.
  1545  			//     4. The Get request blocks on the newly installed mergeComplete
  1546  			//        channel.
  1547  			//     5. The Subsume request arrives. (Or, if the merge transaction is
  1548  			//        incorrectly pipelined, the QueryIntent request for the RHS range
  1549  			//        descriptor key that precedes the Subsume request arrives.)
  1550  			//
  1551  			// This scenario previously caused deadlock. The merge was not able to
  1552  			// complete until the Subsume request completed, but the Subsume request
  1553  			// was unable to acquire latches until the Get request finished, which
  1554  			// was itself waiting for the merge to complete. Whoops!
  1555  			mtc.advanceClock(ctx)
  1556  		}
  1557  		return nil
  1558  	}
  1559  
  1560  	mtc = &multiTestContext{storeConfig: &storeCfg}
  1561  	mtc.Start(t, 1)
  1562  	defer mtc.Stop()
  1563  	store := mtc.Store(0)
  1564  
  1565  	keys := []roachpb.Key{
  1566  		roachpb.Key("a1"), roachpb.Key("a2"), roachpb.Key("a3"),
  1567  		roachpb.Key("c1"), roachpb.Key("c2"), roachpb.Key("c3"),
  1568  	}
  1569  
  1570  	for _, k := range keys {
  1571  		if err := store.DB().Put(ctx, k, "val"); err != nil {
  1572  			t.Fatal(err)
  1573  		}
  1574  	}
  1575  
  1576  	// Failures in this test often present as a deadlock. Set a short timeout to
  1577  	// limit the damage.
  1578  	ctx, cancel := context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration)
  1579  	defer cancel()
  1580  
  1581  	const numGetWorkers = 16
  1582  	const numMerges = 16
  1583  
  1584  	var numGets int64
  1585  	doneCh := make(chan struct{})
  1586  	g := ctxgroup.WithContext(ctx)
  1587  	for i := 0; i < numGetWorkers; i++ {
  1588  		g.GoCtx(func(ctx context.Context) error {
  1589  			for {
  1590  				select {
  1591  				case <-ctx.Done():
  1592  					return ctx.Err()
  1593  				case <-doneCh:
  1594  					return nil
  1595  				default:
  1596  				}
  1597  				key := keys[rand.Intn(len(keys))]
  1598  				if kv, err := store.DB().Get(ctx, key); err != nil {
  1599  					return err
  1600  				} else if v := string(kv.ValueBytes()); v != "val" {
  1601  					return fmt.Errorf(`expected "val", but got %q`, v)
  1602  				}
  1603  				atomic.AddInt64(&numGets, 1)
  1604  			}
  1605  		})
  1606  	}
  1607  
  1608  	for i := 0; i < numMerges; i++ {
  1609  		lhsDesc, _, err := createSplitRanges(ctx, store)
  1610  		if err != nil {
  1611  			t.Fatal(err)
  1612  		}
  1613  		args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1614  		if _, pErr := kv.SendWrapped(ctx, store.TestSender(), args); pErr != nil {
  1615  			t.Fatal(pErr)
  1616  		}
  1617  	}
  1618  
  1619  	close(doneCh)
  1620  	if err := g.Wait(); err != nil {
  1621  		t.Fatal(err)
  1622  	}
  1623  
  1624  	// Expect that each worker was able to issue one at least one get request
  1625  	// during every split/merge cycle. Empirical evidence suggests that this a
  1626  	// very conservative estimate that is unlikely to be flaky.
  1627  	if n := atomic.LoadInt64(&numGets); n < numGetWorkers*numMerges {
  1628  		t.Fatalf("suspiciously low numGets (expected at least %d): %d", numGetWorkers*numMerges, n)
  1629  	}
  1630  }
  1631  
  1632  // TestStoreReplicaGCAfterMerge verifies that the replica GC queue writes the
  1633  // correct tombstone when it GCs a replica of range that has been merged away.
  1634  //
  1635  // Consider the following sequence of events observed in a real cluster:
  1636  //
  1637  //     1. Adjacent ranges Q and R are slated to be merged. Q has replicas on
  1638  //        stores S1, S2, and S3, while R has replicas on S1, S2, and S4.
  1639  //     2. To collocate Q and R, the merge queue adds a replica of R on S3 and
  1640  //        removes the replica on S4. The replica on S4 is queued for garbage
  1641  //        collection, but is not yet processed.
  1642  //     3. The merge transaction commits, deleting R's range descriptor from the
  1643  //        meta2 index.
  1644  //     4. The replica GC queue processes the former replica of R on S4. It
  1645  //        performs a consistent lookup of R's start key in the meta2 index to
  1646  //        determine whether the replica is still a member of R. Since R has been
  1647  //        deleted, the lookup returns Q's range descriptor, not R's.
  1648  //
  1649  // The replica GC queue would previously fail to notice that it had received Q's
  1650  // range descriptor, not R's. It would then proceed to call store.RemoveReplica
  1651  // with Q's descriptor, which would write a replica tombstone for Q, when in
  1652  // fact the replica tombstone needed to be written for R. Without the correct
  1653  // replica tombstone, if S4 received a slow Raft message for the now-GC'd
  1654  // replica, it would incorrectly construct an uninitialized replica and panic.
  1655  //
  1656  // This test also ensures that the nodes which processes the Merge writes a
  1657  // tombstone which prevents the range from being resurrected by a raft message.
  1658  //
  1659  // This test's approach to simulating this sequence of events is based on
  1660  // TestReplicaGCRace.
  1661  func TestStoreReplicaGCAfterMerge(t *testing.T) {
  1662  	defer leaktest.AfterTest(t)()
  1663  
  1664  	ctx := context.Background()
  1665  	storeCfg := kvserver.TestStoreConfig(nil)
  1666  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1667  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  1668  	storeCfg.TestingKnobs.DisableMergeQueue = true
  1669  	storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true
  1670  	mtc := &multiTestContext{storeConfig: &storeCfg}
  1671  	mtc.Start(t, 2)
  1672  	defer mtc.Stop()
  1673  	store0, store1 := mtc.Store(0), mtc.Store(1)
  1674  
  1675  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  1676  	mtc.replicateRange(rngID, 1)
  1677  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  1678  	if err != nil {
  1679  		t.Fatal(err)
  1680  	}
  1681  
  1682  	mtc.unreplicateRange(lhsDesc.RangeID, 1)
  1683  	mtc.unreplicateRange(rhsDesc.RangeID, 1)
  1684  
  1685  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1686  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  1687  	if pErr != nil {
  1688  		t.Fatal(pErr)
  1689  	}
  1690  
  1691  	for _, rangeID := range []roachpb.RangeID{lhsDesc.RangeID, rhsDesc.RangeID} {
  1692  		repl, err := store1.GetReplica(rangeID)
  1693  		if err != nil {
  1694  			t.Fatal(err)
  1695  		}
  1696  		if err := store1.ManualReplicaGC(repl); err != nil {
  1697  			t.Fatal(err)
  1698  		}
  1699  		if _, err := store1.GetReplica(rangeID); err == nil {
  1700  			t.Fatalf("replica of r%d not gc'd from s1", rangeID)
  1701  		}
  1702  	}
  1703  
  1704  	rhsReplDesc0, ok := rhsDesc.GetReplicaDescriptor(store0.StoreID())
  1705  	if !ok {
  1706  		t.Fatalf("expected %s to have a replica on %s", rhsDesc, store0)
  1707  	}
  1708  	rhsReplDesc1, ok := rhsDesc.GetReplicaDescriptor(store1.StoreID())
  1709  	if !ok {
  1710  		t.Fatalf("expected %s to have a replica on %s", rhsDesc, store1)
  1711  	}
  1712  
  1713  	transport := kvserver.NewRaftTransport(
  1714  		log.AmbientContext{Tracer: mtc.storeConfig.Settings.Tracer},
  1715  		cluster.MakeTestingClusterSettings(),
  1716  		nodedialer.New(mtc.rpcContext, gossip.AddressResolver(mtc.gossips[0])),
  1717  		nil, /* grpcServer */
  1718  		mtc.transportStopper,
  1719  	)
  1720  	errChan := errorChannelTestHandler(make(chan *roachpb.Error, 1))
  1721  	transport.Listen(store0.StoreID(), errChan)
  1722  	transport.Listen(store1.StoreID(), errChan)
  1723  
  1724  	sendHeartbeat := func(
  1725  		rangeID roachpb.RangeID,
  1726  		fromReplDesc, toReplDesc roachpb.ReplicaDescriptor,
  1727  	) {
  1728  		// Try several times, as the message may be dropped (see #18355).
  1729  		for i := 0; i < 5; i++ {
  1730  			if sent := transport.SendAsync(&kvserver.RaftMessageRequest{
  1731  				FromReplica: fromReplDesc,
  1732  				ToReplica:   toReplDesc,
  1733  				Heartbeats: []kvserver.RaftHeartbeat{
  1734  					{
  1735  						RangeID:       rangeID,
  1736  						FromReplicaID: fromReplDesc.ReplicaID,
  1737  						ToReplicaID:   toReplDesc.ReplicaID,
  1738  						Commit:        42,
  1739  					},
  1740  				},
  1741  			}, rpc.DefaultClass); !sent {
  1742  				t.Fatal("failed to send heartbeat")
  1743  			}
  1744  			select {
  1745  			case pErr := <-errChan:
  1746  				switch pErr.GetDetail().(type) {
  1747  				case *roachpb.RaftGroupDeletedError:
  1748  					return
  1749  				default:
  1750  					t.Fatalf("unexpected error type %T: %s", pErr.GetDetail(), pErr)
  1751  				}
  1752  			case <-time.After(time.Second):
  1753  			}
  1754  		}
  1755  		t.Fatal("did not get expected RaftGroupDeleted error")
  1756  	}
  1757  
  1758  	// Send a heartbeat to the now-GC'd replica on the stores. If the replica
  1759  	// tombstone was not written correctly when the replica was GC'd, this will
  1760  	// cause a panic.
  1761  	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, rhsReplDesc1)
  1762  	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, rhsReplDesc0)
  1763  
  1764  	// Send a heartbeat to a fictional replicas on with a large replica ID.
  1765  	// This tests an implementation detail: the replica tombstone that gets
  1766  	// written in this case will use the maximum possible replica ID, so the
  1767  	// stores should ignore heartbeats for replicas of the range with _any_
  1768  	// replica ID.
  1769  	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc0, roachpb.ReplicaDescriptor{
  1770  		NodeID:    store1.Ident.NodeID,
  1771  		StoreID:   store1.Ident.StoreID,
  1772  		ReplicaID: 123456,
  1773  	})
  1774  
  1775  	sendHeartbeat(rhsDesc.RangeID, rhsReplDesc1, roachpb.ReplicaDescriptor{
  1776  		NodeID:    store0.Ident.NodeID,
  1777  		StoreID:   store0.Ident.StoreID,
  1778  		ReplicaID: 123456,
  1779  	})
  1780  
  1781  	// Be extra paranoid and verify the exact value of the replica tombstone.
  1782  	checkTombstone := func(eng storage.Engine) {
  1783  		var rhsTombstone roachpb.RangeTombstone
  1784  		rhsTombstoneKey := keys.RangeTombstoneKey(rhsDesc.RangeID)
  1785  		ok, err = storage.MVCCGetProto(ctx, eng, rhsTombstoneKey, hlc.Timestamp{},
  1786  			&rhsTombstone, storage.MVCCGetOptions{})
  1787  		if err != nil {
  1788  			t.Fatal(err)
  1789  		} else if !ok {
  1790  			t.Fatalf("missing range tombstone at key %s", rhsTombstoneKey)
  1791  		}
  1792  		if e, a := roachpb.ReplicaID(math.MaxInt32), rhsTombstone.NextReplicaID; e != a {
  1793  			t.Fatalf("expected next replica ID to be %d, but got %d", e, a)
  1794  		}
  1795  	}
  1796  	checkTombstone(store0.Engine())
  1797  	checkTombstone(store1.Engine())
  1798  }
  1799  
  1800  // TestStoreRangeMergeAddReplicaRace verifies that when an add replica request
  1801  // occurs concurrently with a merge, one of them is aborted with a "descriptor
  1802  // changed" CPut error.
  1803  func TestStoreRangeMergeAddReplicaRace(t *testing.T) {
  1804  	defer leaktest.AfterTest(t)()
  1805  	ctx := context.Background()
  1806  	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
  1807  		ReplicationMode: base.ReplicationManual,
  1808  	})
  1809  	defer tc.Stopper().Stop(ctx)
  1810  
  1811  	scratchStartKey := tc.ScratchRange(t)
  1812  	origDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
  1813  	splitKey := scratchStartKey.Next()
  1814  	beforeDesc, _ := tc.SplitRangeOrFatal(t, splitKey)
  1815  
  1816  	mergeErrCh, addErrCh := make(chan error, 1), make(chan error, 1)
  1817  	go func() {
  1818  		mergeErrCh <- tc.Server(0).DB().AdminMerge(ctx, scratchStartKey)
  1819  	}()
  1820  	go func() {
  1821  		_, err := tc.Server(0).DB().AdminChangeReplicas(
  1822  			ctx, scratchStartKey, beforeDesc, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1)))
  1823  		addErrCh <- err
  1824  	}()
  1825  	mergeErr := <-mergeErrCh
  1826  	addErr := <-addErrCh
  1827  	afterDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
  1828  
  1829  	const acceptableMergeErr = `unexpected value: raw_bytes|ranges not collocated` +
  1830  		`|cannot merge range with non-voter replicas`
  1831  	if mergeErr == nil && testutils.IsError(addErr, `descriptor changed: \[expected\]`) {
  1832  		// Merge won the race, no add happened.
  1833  		require.Len(t, afterDesc.Replicas().Voters(), 1)
  1834  		require.Equal(t, origDesc.EndKey, afterDesc.EndKey)
  1835  	} else if addErr == nil && testutils.IsError(mergeErr, acceptableMergeErr) {
  1836  		// Add won the race, no merge happened.
  1837  		require.Len(t, afterDesc.Replicas().Voters(), 2)
  1838  		require.Equal(t, beforeDesc.EndKey, afterDesc.EndKey)
  1839  	} else {
  1840  		t.Fatalf(`expected exactly one of merge or add to succeed got: [merge] %v [add] %v`,
  1841  			mergeErr, addErr)
  1842  	}
  1843  }
  1844  
  1845  // TestStoreRangeMergeResplitAddReplicaRace tests a diabolical edge case in the
  1846  // merge/add replica race. If two replicas merge and then split at the previous
  1847  // boundary, the descriptor will look unchanged and our usual CPut protection
  1848  // would fail. For this reason, we introduced RangeDescriptor.Generation.
  1849  //
  1850  // Note that splits will not increment the generation counter until the cluster
  1851  // version includes VersionRangeMerges. That's ok, because a sequence of splits
  1852  // alone will always result in a descriptor with a smaller end key. Only a
  1853  // sequence of splits AND merges can result in an unchanged end key, and merges
  1854  // always increment the generation counter.
  1855  func TestStoreRangeMergeResplitAddReplicaRace(t *testing.T) {
  1856  	defer leaktest.AfterTest(t)()
  1857  	ctx := context.Background()
  1858  	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
  1859  		ReplicationMode: base.ReplicationManual,
  1860  	})
  1861  	defer tc.Stopper().Stop(ctx)
  1862  
  1863  	scratchStartKey := tc.ScratchRange(t)
  1864  	splitKey := scratchStartKey.Next()
  1865  	origDesc, _ := tc.SplitRangeOrFatal(t, splitKey)
  1866  	require.NoError(t, tc.Server(0).DB().AdminMerge(ctx, scratchStartKey))
  1867  	resplitDesc, _ := tc.SplitRangeOrFatal(t, splitKey)
  1868  
  1869  	assert.Equal(t, origDesc.RangeID, resplitDesc.RangeID)
  1870  	assert.Equal(t, origDesc.StartKey, resplitDesc.StartKey)
  1871  	assert.Equal(t, origDesc.EndKey, resplitDesc.EndKey)
  1872  	assert.Equal(t, origDesc.Replicas().All(), resplitDesc.Replicas().All())
  1873  	assert.NotEqual(t, origDesc.Generation, resplitDesc.Generation)
  1874  
  1875  	_, err := tc.Server(0).DB().AdminChangeReplicas(
  1876  		ctx, scratchStartKey, origDesc, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, tc.Target(1)))
  1877  	if !testutils.IsError(err, `descriptor changed`) {
  1878  		t.Fatalf(`expected "descriptor changed" error got: %+v`, err)
  1879  	}
  1880  }
  1881  
  1882  func TestStoreRangeMergeSlowUnabandonedFollower_NoSplit(t *testing.T) {
  1883  	defer leaktest.AfterTest(t)()
  1884  
  1885  	ctx := context.Background()
  1886  	storeCfg := kvserver.TestStoreConfig(nil)
  1887  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1888  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  1889  	mtc := &multiTestContext{storeConfig: &storeCfg}
  1890  	mtc.Start(t, 3)
  1891  	defer mtc.Stop()
  1892  	store0, store2 := mtc.Store(0), mtc.Store(2)
  1893  
  1894  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  1895  	mtc.replicateRange(rngID, 1, 2)
  1896  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  1897  	if err != nil {
  1898  		t.Fatal(err)
  1899  	}
  1900  
  1901  	// Wait for store2 to hear about the split.
  1902  	testutils.SucceedsSoon(t, func() error {
  1903  		if rhsRepl2, err := store2.GetReplica(rhsDesc.RangeID); err != nil || !rhsRepl2.IsInitialized() {
  1904  			return errors.Errorf("store2 has not yet processed split. err: %v", err)
  1905  		}
  1906  		return nil
  1907  	})
  1908  
  1909  	// Block Raft traffic to the LHS replica on store2, by holding its raftMu, so
  1910  	// that its LHS isn't aware there's a merge in progress.
  1911  	lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID)
  1912  	if err != nil {
  1913  		t.Fatal(err)
  1914  	}
  1915  	lhsRepl2.RaftLock()
  1916  
  1917  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1918  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  1919  	if pErr != nil {
  1920  		t.Fatal(pErr)
  1921  	}
  1922  
  1923  	// Verify that store2 won't inadvertently GC the RHS before it's heard about
  1924  	// the merge. This is a tricky case for the replica GC queue, as meta2 will
  1925  	// indicate that the range has been merged away.
  1926  	rhsRepl2, err := store2.GetReplica(rhsDesc.RangeID)
  1927  	if err != nil {
  1928  		t.Fatal(err)
  1929  	}
  1930  	if err := store2.ManualReplicaGC(rhsRepl2); err != nil {
  1931  		t.Fatal(err)
  1932  	}
  1933  	if _, err := store2.GetReplica(rhsDesc.RangeID); err != nil {
  1934  		t.Fatalf("non-abandoned rhs replica unexpectedly GC'd before merge")
  1935  	}
  1936  
  1937  	// Restore communication with store2. Give it the lease to force all commands
  1938  	// to be applied, including the merge trigger.
  1939  	lhsRepl2.RaftUnlock()
  1940  	mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2)
  1941  }
  1942  
  1943  func TestStoreRangeMergeSlowUnabandonedFollower_WithSplit(t *testing.T) {
  1944  	defer leaktest.AfterTest(t)()
  1945  
  1946  	ctx := context.Background()
  1947  	storeCfg := kvserver.TestStoreConfig(nil)
  1948  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  1949  	mtc := &multiTestContext{storeConfig: &storeCfg}
  1950  	mtc.Start(t, 3)
  1951  	defer mtc.Stop()
  1952  	store0, store2 := mtc.Store(0), mtc.Store(2)
  1953  
  1954  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  1955  	mtc.replicateRange(rngID, 1, 2)
  1956  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  1957  	if err != nil {
  1958  		t.Fatal(err)
  1959  	}
  1960  
  1961  	// Wait for store2 to hear about the split.
  1962  	testutils.SucceedsSoon(t, func() error {
  1963  		_, err := store2.GetReplica(rhsDesc.RangeID)
  1964  		return err
  1965  	})
  1966  
  1967  	// Start dropping all Raft traffic to the LHS on store2 so that it won't be
  1968  	// aware that there is a merge in progress.
  1969  	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
  1970  		rangeID:            lhsDesc.RangeID,
  1971  		RaftMessageHandler: store2,
  1972  	})
  1973  
  1974  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  1975  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  1976  	if pErr != nil {
  1977  		t.Fatal(pErr)
  1978  	}
  1979  
  1980  	// Now split the newly merged range splits back out at exactly the same key.
  1981  	// When the replica GC queue looks in meta2 it will find the new RHS range, of
  1982  	// which store2 is a member. Note that store2 does not yet have an initialized
  1983  	// replica for this range, since it would intersect with the old RHS replica.
  1984  	_, newRHSDesc, err := createSplitRanges(ctx, store0)
  1985  	if err != nil {
  1986  		t.Fatal(err)
  1987  	}
  1988  
  1989  	// Remove the LHS replica from store2.
  1990  	mtc.unreplicateRange(lhsDesc.RangeID, 2)
  1991  
  1992  	// Transfer the lease on the new RHS to store2 and wait for it to apply. This
  1993  	// will force its replica to of the new RHS to become up to date, which
  1994  	// indirectly tests that the replica GC queue cleans up both the LHS replica
  1995  	// and the old RHS replica.
  1996  	mtc.transferLease(ctx, newRHSDesc.RangeID, 0, 2)
  1997  	testutils.SucceedsSoon(t, func() error {
  1998  		rhsRepl, err := store2.GetReplica(newRHSDesc.RangeID)
  1999  		if err != nil {
  2000  			return err
  2001  		}
  2002  		if !rhsRepl.OwnsValidLease(mtc.clock().Now()) {
  2003  			return errors.New("rhs store does not own valid lease for rhs range")
  2004  		}
  2005  		return nil
  2006  	})
  2007  }
  2008  
  2009  func TestStoreRangeMergeSlowAbandonedFollower(t *testing.T) {
  2010  	defer leaktest.AfterTest(t)()
  2011  
  2012  	ctx := context.Background()
  2013  	storeCfg := kvserver.TestStoreConfig(nil)
  2014  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2015  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  2016  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  2017  	mtc := &multiTestContext{storeConfig: &storeCfg}
  2018  	mtc.Start(t, 3)
  2019  	defer mtc.Stop()
  2020  	store0, store2 := mtc.Store(0), mtc.Store(2)
  2021  
  2022  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2023  	mtc.replicateRange(rngID, 1, 2)
  2024  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  2025  	if err != nil {
  2026  		t.Fatal(err)
  2027  	}
  2028  
  2029  	// Wait for store2 to hear about the split.
  2030  	var rhsRepl2 *kvserver.Replica
  2031  	testutils.SucceedsSoon(t, func() error {
  2032  		if rhsRepl2, err = store2.GetReplica(rhsDesc.RangeID); err != nil || !rhsRepl2.IsInitialized() {
  2033  			return errors.New("store2 has not yet processed split")
  2034  		}
  2035  		return nil
  2036  	})
  2037  
  2038  	// Block Raft traffic to the LHS replica on store2, by holding its raftMu, so
  2039  	// that its LHS isn't aware there's a merge in progress.
  2040  	lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID)
  2041  	if err != nil {
  2042  		t.Fatal(err)
  2043  	}
  2044  	lhsRepl2.RaftLock()
  2045  
  2046  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2047  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2048  	if pErr != nil {
  2049  		t.Fatal(pErr)
  2050  	}
  2051  
  2052  	// Remove store2 from the range after the merge. It won't hear about this yet,
  2053  	// but we'll be able to commit the configuration change because we have two
  2054  	// other live members.
  2055  	mtc.unreplicateRange(lhsDesc.RangeID, 2)
  2056  
  2057  	// Verify that store2 won't inadvertently GC the RHS before it's heard about
  2058  	// the merge. This is a particularly tricky case for the replica GC queue, as
  2059  	// meta2 will indicate that the range has been merged away AND that store2 is
  2060  	// not a member of the new range.
  2061  	if err := store2.ManualReplicaGC(rhsRepl2); err != nil {
  2062  		t.Fatal(err)
  2063  	}
  2064  	if _, err := store2.GetReplica(rhsDesc.RangeID); err != nil {
  2065  		t.Fatal("rhs replica on store2 destroyed before lhs applied merge")
  2066  	}
  2067  
  2068  	// Flush store2's queued requests.
  2069  	lhsRepl2.RaftUnlock()
  2070  
  2071  	// Ensure that the unblocked merge eventually applies and subsumes the RHS.
  2072  	// In general this will happen due to receiving a ReplicaTooOldError but
  2073  	// it may require the replica GC queue. In rare cases the LHS will never
  2074  	// hear about the merge and may need to be GC'd on its own.
  2075  	testutils.SucceedsSoon(t, func() error {
  2076  		// Make the the LHS gets destroyed.
  2077  		if lhsRepl, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
  2078  			if err := store2.ManualReplicaGC(lhsRepl); err != nil {
  2079  				t.Fatal(err)
  2080  			}
  2081  		}
  2082  		if rhsRepl, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
  2083  			if err := store2.ManualReplicaGC(rhsRepl); err != nil {
  2084  				t.Fatal(err)
  2085  			}
  2086  			return errors.New("rhs not yet destroyed")
  2087  		}
  2088  		return nil
  2089  	})
  2090  }
  2091  
  2092  func TestStoreRangeMergeAbandonedFollowers(t *testing.T) {
  2093  	defer leaktest.AfterTest(t)()
  2094  
  2095  	ctx := context.Background()
  2096  	storeCfg := kvserver.TestStoreConfig(nil)
  2097  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  2098  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  2099  	storeCfg.TestingKnobs.DisableSplitQueue = true
  2100  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2101  	storeCfg.TestingKnobs.DisableEagerReplicaRemoval = true
  2102  	mtc := &multiTestContext{storeConfig: &storeCfg}
  2103  	mtc.Start(t, 3)
  2104  	defer mtc.Stop()
  2105  	store2 := mtc.Store(2)
  2106  
  2107  	rngID := mtc.Store(0).LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2108  	mtc.replicateRange(rngID, 1, 2)
  2109  
  2110  	// Split off three ranges.
  2111  	keys := []roachpb.RKey{roachpb.RKey("a"), roachpb.RKey("b"), roachpb.RKey("c")}
  2112  	for _, key := range keys {
  2113  		splitArgs := adminSplitArgs(key.AsRawKey())
  2114  		if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], splitArgs); pErr != nil {
  2115  			t.Fatal(pErr)
  2116  		}
  2117  	}
  2118  
  2119  	// Wait for store2 to hear about all three splits.
  2120  	var repls []*kvserver.Replica
  2121  	testutils.SucceedsSoon(t, func() error {
  2122  		repls = nil
  2123  		for _, key := range keys {
  2124  			repl := store2.LookupReplica(key) /* end */
  2125  			if repl == nil || !repl.Desc().StartKey.Equal(key) {
  2126  				return fmt.Errorf("replica for key %q is missing or has wrong start key: %s", key, repl)
  2127  			}
  2128  			repls = append(repls, repl)
  2129  		}
  2130  		return nil
  2131  	})
  2132  
  2133  	// Remove all replicas from store2.
  2134  	for _, repl := range repls {
  2135  		mtc.unreplicateRange(repl.RangeID, 2)
  2136  	}
  2137  
  2138  	// Merge all three ranges together. store2 won't hear about this merge.
  2139  	for i := 0; i < 2; i++ {
  2140  		if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], adminMergeArgs(roachpb.Key("a"))); pErr != nil {
  2141  			t.Fatal(pErr)
  2142  		}
  2143  	}
  2144  
  2145  	// Verify that the abandoned ranges on store2 can only be GC'd from left to
  2146  	// right.
  2147  	if err := store2.ManualReplicaGC(repls[2]); err != nil {
  2148  		t.Fatal(err)
  2149  	}
  2150  	if _, err := store2.GetReplica(repls[2].RangeID); err != nil {
  2151  		t.Fatal("c replica on store2 destroyed before b")
  2152  	}
  2153  	if err := store2.ManualReplicaGC(repls[1]); err != nil {
  2154  		t.Fatal(err)
  2155  	}
  2156  	if _, err := store2.GetReplica(repls[1].RangeID); err != nil {
  2157  		t.Fatal("b replica on store2 destroyed before a")
  2158  	}
  2159  	if err := store2.ManualReplicaGC(repls[0]); err != nil {
  2160  		t.Fatal(err)
  2161  	}
  2162  	if _, err := store2.GetReplica(repls[0].RangeID); err == nil {
  2163  		t.Fatal("a replica not destroyed")
  2164  	}
  2165  
  2166  	if err := store2.ManualReplicaGC(repls[2]); err != nil {
  2167  		t.Fatal(err)
  2168  	}
  2169  	if _, err := store2.GetReplica(repls[2].RangeID); err != nil {
  2170  		t.Fatal("c replica on store2 destroyed before b")
  2171  	}
  2172  	if err := store2.ManualReplicaGC(repls[1]); err != nil {
  2173  		t.Fatal(err)
  2174  	}
  2175  	if _, err := store2.GetReplica(repls[1].RangeID); err == nil {
  2176  		t.Fatal("b replica not destroyed")
  2177  	}
  2178  
  2179  	if err := store2.ManualReplicaGC(repls[2]); err != nil {
  2180  		t.Fatal(err)
  2181  	}
  2182  	if _, err := store2.GetReplica(repls[2].RangeID); err == nil {
  2183  		t.Fatal("c replica not destroyed")
  2184  	}
  2185  }
  2186  
  2187  // TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected verifies
  2188  // that the replica GC queue will clean up an abandoned RHS replica whose
  2189  // destroyStatus is destroyReasonMergePending. The RHS replica ends up in this
  2190  // state when its merge watcher goroutine notices that the merge committed, and
  2191  // thus marks it as destroyed with reason destroyReasonMergePending, but the
  2192  // corresponding LHS is rebalanced off the store before it can apply the merge
  2193  // trigger. The replica GC queue would previously refuse to GC the abandoned
  2194  // RHS, as it interpreted destroyReasonMergePending to mean that the RHS replica
  2195  // had already been garbage collected.
  2196  func TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected(t *testing.T) {
  2197  	defer leaktest.AfterTest(t)()
  2198  
  2199  	ctx := context.Background()
  2200  	storeCfg := kvserver.TestStoreConfig(nil)
  2201  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  2202  	mtc := &multiTestContext{storeConfig: &storeCfg}
  2203  	mtc.Start(t, 3)
  2204  	defer mtc.Stop()
  2205  	store0, store2 := mtc.Store(0), mtc.Store(2)
  2206  
  2207  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2208  	mtc.replicateRange(rngID, 1, 2)
  2209  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  2210  	if err != nil {
  2211  		t.Fatal(err)
  2212  	}
  2213  
  2214  	// Make store2 the leaseholder for the RHS and wait for the lease transfer to
  2215  	// apply.
  2216  	mtc.transferLease(ctx, rhsDesc.RangeID, 0, 2)
  2217  	testutils.SucceedsSoon(t, func() error {
  2218  		rhsRepl, err := store2.GetReplica(rhsDesc.RangeID)
  2219  		if err != nil {
  2220  			return err
  2221  		}
  2222  		if !rhsRepl.OwnsValidLease(mtc.clock().Now()) {
  2223  			return errors.New("store2 does not own valid lease for rhs range")
  2224  		}
  2225  		return nil
  2226  	})
  2227  
  2228  	// Start dropping all Raft traffic to the LHS replica on store2 so that it
  2229  	// won't be aware that there is a merge in progress.
  2230  	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
  2231  		rangeID:            lhsDesc.RangeID,
  2232  		RaftMessageHandler: store2,
  2233  	})
  2234  
  2235  	// Perform the merge. The LHS replica on store2 whon't hear about this merge
  2236  	// and thus won't subsume its RHS replica. The RHS replica's merge watcher
  2237  	// goroutine will, however, notice the merge and mark the RHS replica as
  2238  	// destroyed with reason destroyReasonMergePending.
  2239  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2240  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2241  	if pErr != nil {
  2242  		t.Fatal(pErr)
  2243  	}
  2244  
  2245  	// Remove the merged range from store2. Its replicas of both the LHS and RHS
  2246  	// are now eligible for GC.
  2247  	mtc.unreplicateRange(lhsDesc.RangeID, 2)
  2248  
  2249  	// Note that we purposely do not call store.ManualReplicaGC here, as that
  2250  	// calls replicaGCQueue.process directly, bypassing the logic in
  2251  	// baseQueue.MaybeAdd and baseQueue.Add. We specifically want to test that
  2252  	// queuing logic, which has been broken in the past.
  2253  	testutils.SucceedsSoon(t, func() error {
  2254  		if _, err := store2.GetReplica(lhsDesc.RangeID); err == nil {
  2255  			return errors.New("lhs not destroyed")
  2256  		}
  2257  		if _, err := store2.GetReplica(rhsDesc.RangeID); err == nil {
  2258  			return errors.New("rhs not destroyed")
  2259  		}
  2260  		return nil
  2261  	})
  2262  }
  2263  
  2264  func TestStoreRangeMergeDeadFollowerBeforeTxn(t *testing.T) {
  2265  	defer leaktest.AfterTest(t)()
  2266  
  2267  	ctx := context.Background()
  2268  	var mtc *multiTestContext
  2269  	storeCfg := kvserver.TestStoreConfig(nil)
  2270  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2271  	mtc = &multiTestContext{storeConfig: &storeCfg}
  2272  	mtc.Start(t, 3)
  2273  	defer mtc.Stop()
  2274  	store0 := mtc.Store(0)
  2275  
  2276  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2277  	mtc.replicateRange(rngID, 1, 2)
  2278  	lhsDesc, _, err := createSplitRanges(ctx, store0)
  2279  	if err != nil {
  2280  		t.Fatal(err)
  2281  	}
  2282  
  2283  	mtc.stopStore(2)
  2284  
  2285  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2286  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2287  	expErr := "waiting for all left-hand replicas to initialize"
  2288  	if !testutils.IsPError(pErr, expErr) {
  2289  		t.Fatalf("expected %q error, but got %v", expErr, pErr)
  2290  	}
  2291  }
  2292  
  2293  func TestStoreRangeMergeDeadFollowerDuringTxn(t *testing.T) {
  2294  	defer leaktest.AfterTest(t)()
  2295  
  2296  	ctx := context.Background()
  2297  	var mtc *multiTestContext
  2298  	storeCfg := kvserver.TestStoreConfig(nil)
  2299  	storeCfg.TestingKnobs.DisableMergeQueue = true
  2300  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  2301  		if ba.IsSingleSubsumeRequest() && mtc.Store(2) != nil {
  2302  			mtc.stopStore(2)
  2303  		}
  2304  		return nil
  2305  	}
  2306  	mtc = &multiTestContext{storeConfig: &storeCfg}
  2307  	mtc.Start(t, 3)
  2308  	defer mtc.Stop()
  2309  	store0 := mtc.Store(0)
  2310  
  2311  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2312  	mtc.replicateRange(rngID, 1, 2)
  2313  	lhsDesc, _, err := createSplitRanges(ctx, store0)
  2314  	if err != nil {
  2315  		t.Fatal(err)
  2316  	}
  2317  
  2318  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2319  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2320  	expErr := "merge failed: waiting for all right-hand replicas to catch up"
  2321  	if !testutils.IsPError(pErr, expErr) {
  2322  		t.Fatalf("expected %q error, but got %v", expErr, pErr)
  2323  	}
  2324  }
  2325  
  2326  func TestStoreRangeReadoptedLHSFollower(t *testing.T) {
  2327  	defer leaktest.AfterTest(t)()
  2328  
  2329  	run := func(t *testing.T, withMerge bool) {
  2330  		ctx := context.Background()
  2331  		storeCfg := kvserver.TestStoreConfig(nil)
  2332  		storeCfg.TestingKnobs.DisableReplicateQueue = true
  2333  		storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  2334  		storeCfg.TestingKnobs.DisableMergeQueue = true
  2335  		mtc := &multiTestContext{storeConfig: &storeCfg}
  2336  		mtc.Start(t, 3)
  2337  		defer mtc.Stop()
  2338  		store0, store2 := mtc.Store(0), mtc.Store(2)
  2339  
  2340  		// Create two ranges on store0 and store1.
  2341  		lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  2342  		if err != nil {
  2343  			t.Fatal(err)
  2344  		}
  2345  		mtc.replicateRange(lhsDesc.RangeID, 1)
  2346  		mtc.replicateRange(rhsDesc.RangeID, 1)
  2347  
  2348  		// Abandon a replica of the LHS on store2.
  2349  		mtc.replicateRange(lhsDesc.RangeID, 2)
  2350  		var lhsRepl2 *kvserver.Replica
  2351  		testutils.SucceedsSoon(t, func() error {
  2352  			lhsRepl2, err = store2.GetReplica(lhsDesc.RangeID)
  2353  			if err != nil {
  2354  				return err
  2355  			}
  2356  			if !lhsRepl2.IsInitialized() {
  2357  				// Make sure the replica is initialized before unreplicating.
  2358  				// Uninitialized replicas that have a replicaID are hard to
  2359  				// GC (not implemented at the time of writing).
  2360  				return errors.Errorf("%s not initialized", lhsRepl2)
  2361  			}
  2362  			return nil
  2363  		})
  2364  		mtc.unreplicateRange(lhsDesc.RangeID, 2)
  2365  
  2366  		if withMerge {
  2367  			// Merge the two ranges together.
  2368  			args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2369  			_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2370  			if pErr != nil {
  2371  				t.Fatal(pErr)
  2372  			}
  2373  		}
  2374  
  2375  		// Attempt to re-add the merged range to store2. This should succeed
  2376  		// immediately because there are no overlapping replicas that would interfere
  2377  		// with the widening of the existing LHS replica.
  2378  		if _, err := mtc.dbs[0].AdminChangeReplicas(
  2379  			ctx, lhsDesc.StartKey.AsRawKey(),
  2380  			*lhsDesc,
  2381  			roachpb.MakeReplicationChanges(
  2382  				roachpb.ADD_REPLICA,
  2383  				roachpb.ReplicationTarget{
  2384  					NodeID:  mtc.idents[2].NodeID,
  2385  					StoreID: mtc.idents[2].StoreID,
  2386  				}),
  2387  		); !testutils.IsError(err, "descriptor changed") {
  2388  			t.Fatal(err)
  2389  		}
  2390  
  2391  		if err := store2.ManualReplicaGC(lhsRepl2); err != nil {
  2392  			t.Fatal(err)
  2393  		}
  2394  
  2395  		mtc.replicateRange(lhsDesc.RangeID, 2)
  2396  		// Give store2 the lease to force all commands to be applied, including the
  2397  		// ChangeReplicas.
  2398  		mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2)
  2399  	}
  2400  
  2401  	testutils.RunTrueAndFalse(t, "withMerge", run)
  2402  }
  2403  
// slowSnapRaftHandler delays any snapshots to rangeID until waitCh is closed.
// All other handling is delegated to the embedded RaftMessageHandler.
type slowSnapRaftHandler struct {
	// rangeID is the range whose incoming snapshots HandleSnapshot holds back.
	rangeID roachpb.RangeID
	// waitCh is the channel HandleSnapshot blocks on. unblock closes it and
	// resets it to nil; a nil waitCh means snapshots are no longer delayed.
	// Reads and writes happen while holding the embedded Mutex.
	waitCh  chan struct{}
	// RaftMessageHandler is the wrapped handler that snapshots are forwarded to
	// once they are allowed through.
	kvserver.RaftMessageHandler
	// Mutex guards waitCh.
	syncutil.Mutex
}
  2411  
  2412  func (h *slowSnapRaftHandler) unblock() {
  2413  	h.Lock()
  2414  	if h.waitCh != nil {
  2415  		close(h.waitCh)
  2416  		h.waitCh = nil
  2417  	}
  2418  	h.Unlock()
  2419  }
  2420  
  2421  func (h *slowSnapRaftHandler) HandleSnapshot(
  2422  	header *kvserver.SnapshotRequest_Header, respStream kvserver.SnapshotResponseStream,
  2423  ) error {
  2424  	if header.RaftMessageRequest.RangeID == h.rangeID {
  2425  		h.Lock()
  2426  		waitCh := h.waitCh
  2427  		h.Unlock()
  2428  		if waitCh != nil {
  2429  			<-waitCh
  2430  		}
  2431  	}
  2432  	return h.RaftMessageHandler.HandleSnapshot(header, respStream)
  2433  }
  2434  
// TestStoreRangeMergeUninitializedLHSFollower reproduces a rare bug in which a
// replica of the RHS of a merge could be garbage collected too soon.
//
// Consider two adjacent ranges, A and B. Suppose the replica of
// A on the last store, S3, is uninitialized, e.g. because A was recently
// created by a split and S3 has neither processed the split trigger nor
// received a snapshot. The leaseholder for A will attempt to send a Raft
// snapshot to bring S3's replica up to date, but this Raft snapshot may be
// delayed due to a busy Raft snapshot queue or a slow network.
//
// Now suppose a merge of A and B commits before S3 receives a Raft snapshot for
// A. There is a small window of time in which S3 can garbage collect its
// replica of B! When S3 looks up B's meta2 descriptor, it will find that B has
// been merged away. S3 will then try to prove that B's local left neighbor is
// generationally up-to-date; if it is, it is safe to GC B. Usually, S3 would
// determine A to be B's left neighbor, realize that A has not yet processed the
// merge, and correctly refuse to GC its replica of B. In this case, however,
// S3's replica of A is uninitialized and thus doesn't know its start and end
// key, so S3 will instead discover some more-distant left neighbor of B. This
// distant neighbor might very well be up-to-date, and S3 will incorrectly
// conclude that it can GC its replica of B!
//
// So say S3 GCs its replica of B. There are now two paths that A might take.
// The happy case is that A receives a Raft snapshot that postdates the merge.
// The unhappy case is that A receives a Raft snapshot that predates the merge,
// and is then required to apply the merge via a MsgApp. Since there is no
// longer a replica of B on S3, applying the merge trigger will explode.
//
// The solution was to require that all LHS replicas are initialized before
// beginning a merge transaction. This ensures that the replica GC queue will
// always discover the correct left neighbor when considering whether a subsumed
// range can be GC'd.
//
// In this test S3 corresponds to store2 (the third store of the
// multiTestContext).
func TestStoreRangeMergeUninitializedLHSFollower(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	storeCfg := kvserver.TestStoreConfig(nil)
	storeCfg.TestingKnobs.DisableReplicateQueue = true
	mtc := &multiTestContext{storeConfig: &storeCfg}
	mtc.Start(t, 3)
	defer mtc.Stop()
	store0, store2 := mtc.Store(0), mtc.Store(2)
	distSender := mtc.distSenders[0]

	// split issues an AdminSplit at key and returns the ID of the range that
	// store0 reports as containing key afterwards.
	split := func(key roachpb.RKey) roachpb.RangeID {
		t.Helper()
		if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key.AsRawKey())); pErr != nil {
			t.Fatal(pErr)
		}
		return store0.LookupReplica(key).RangeID
	}

	// We'll create two ranges, A and B, as described in the comment on this test
	// function.
	aKey, bKey := roachpb.RKey("a"), roachpb.RKey("b")

	// Put range 1 on all three stores.
	rngID := store0.LookupReplica(aKey).Desc().RangeID
	mtc.replicateRange(rngID, 1, 2)

	// Create range B and wait for store2 to process the split.
	bRangeID := split(bKey)
	var bRepl2 *kvserver.Replica
	testutils.SucceedsSoon(t, func() (err error) {
		// NOTE(review): the message below says "split of c", but the range
		// being waited on here is B (bRangeID); the wording looks stale.
		if bRepl2, err = store2.GetReplica(bRangeID); err != nil || !bRepl2.IsInitialized() {
			return errors.New("store2 has not yet processed split of c")
		}
		return nil
	})

	// Now we want to create range A, but we need to make sure store2's replica of
	// A is not initialized. This requires dropping all Raft traffic to store2
	// from range 1, which will be the LHS of the split, so that store2's replica
	// of range 1 never processes the split trigger, which would create an
	// initialized replica of A.
	unreliableHandler := &unreliableRaftHandler{
		rangeID:            rngID,
		RaftMessageHandler: store2,
	}
	mtc.transport.Listen(store2.Ident.StoreID, unreliableHandler)

	// Perform the split of A, now that store2 won't be able to initialize its
	// replica of A.
	aRangeID := split(aKey)

	// Wedge a Raft snapshot that's destined for A. This allows us to capture a
	// pre-merge Raft snapshot, which we'll let loose after the merge commits.
	// The slow-snapshot handler wraps the unreliable handler, so both effects
	// stay in place on store2.
	slowSnapHandler := &slowSnapRaftHandler{
		rangeID:            aRangeID,
		waitCh:             make(chan struct{}),
		RaftMessageHandler: unreliableHandler,
	}
	// Make sure the snapshot is released even if the test bails out early;
	// unblock is safe to call more than once.
	defer slowSnapHandler.unblock()
	mtc.transport.Listen(store2.Ident.StoreID, slowSnapHandler)

	// Remove the replica of range 1 on store2. If we were to leave it in place,
	// store2 would refuse to GC its replica of B after the merge commits, because
	// the left neighbor of B would be this out-of-date replica of range 1.
	// (Remember that we refused to let it process the split of A.) So we need to
	// remove it so that B has no left neighbor and thus will be eligible for GC.
	{
		r1Repl2, err := store2.GetReplica(rngID)
		if err != nil {
			t.Fatal(err)
		}
		mtc.unreplicateRange(rngID, 2)
		testutils.SucceedsSoon(t, func() error {
			if err := store2.ManualReplicaGC(r1Repl2); err != nil {
				return err
			}
			if _, err := store2.GetReplica(rngID); err == nil {
				return errors.New("r1Repl2 still exists")
			}
			return nil
		})
	}

	// Launch the merge of A and B. The result is reported on mergeErr, which is
	// unbuffered, so the goroutine stays blocked until the result is received.
	mergeErr := make(chan error)
	go func() {
		_, pErr := kv.SendWrapped(ctx, distSender, adminMergeArgs(aKey.AsRawKey()))
		mergeErr <- pErr.GoError()
	}()

	// We want to assert that the merge does not complete until we allow store2's
	// replica of A to be initialized (by releasing the blocked Raft snapshot). A
	// happens-before assertion is nearly impossible to express, though, so
	// instead we just wait in the hope that, if the merge is buggy, it will
	// commit while we wait. Before the bug was fixed, this caused the test
	// to fail reliably.
	start := timeutil.Now()
	for timeutil.Since(start) < 50*time.Millisecond {
		if _, err := store2.GetReplica(bRangeID); err == nil {
			// Attempt to reproduce the exact fatal error described in the comment on
			// the test by running range B through the GC queue. If the bug is
			// present, GC will be successful and so the application of the merge
			// trigger on A will fail once we allow the Raft snapshot through. If the
			// bug is not present, we'll be unable to GC range B because it won't get
			// subsumed until after we allow the Raft snapshot through.
			// The GC error is deliberately ignored: a failed GC is the expected
			// outcome when the bug is absent.
			_ = store2.ManualReplicaGC(bRepl2)
		}
		time.Sleep(5 * time.Millisecond) // don't spin too hot to give the merge CPU time to complete
	}

	select {
	case err := <-mergeErr:
		t.Errorf("merge completed early (err: %v)", err)
		// The goroutine's single send has already been consumed, so closing the
		// channel here lets the receive further down return immediately (with a
		// nil error) instead of blocking forever.
		close(mergeErr)
	default:
	}

	// Allow store2's replica of A to initialize with a Raft snapshot that
	// predates the merge.
	slowSnapHandler.unblock()

	// Assert that the merge completes successfully.
	if err := <-mergeErr; err != nil {
		t.Fatal(err)
	}

	// Give store2 the lease on the merged range to force all commands to be
	// applied, including the merge trigger.
	mtc.transferLease(ctx, aRangeID, 0, 2)
}
  2599  
  2600  // TestStoreRangeMergeWatcher verifies that the watcher goroutine for a merge's
  2601  // RHS does not erroneously permit traffic after the merge commits.
  2602  func TestStoreRangeMergeWatcher(t *testing.T) {
  2603  	defer leaktest.AfterTest(t)()
  2604  
  2605  	testutils.RunTrueAndFalse(t, "inject-failures", testMergeWatcher)
  2606  }
  2607  
  2608  func testMergeWatcher(t *testing.T, injectFailures bool) {
  2609  	ctx := context.Background()
  2610  	storeCfg := kvserver.TestStoreConfig(nil)
  2611  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  2612  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  2613  
  2614  	var mergeTxnRetries, pushTxnRetries, meta2GetRetries int64
  2615  	if injectFailures {
  2616  		mergeTxnRetries = 3
  2617  		pushTxnRetries = 3
  2618  		meta2GetRetries = 3
  2619  	}
  2620  
  2621  	// Maybe inject some retryable errors when the merge transaction commits.
  2622  	var mtc *multiTestContext
  2623  	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
  2624  		for _, req := range ba.Requests {
  2625  			if et := req.GetEndTxn(); et != nil && et.InternalCommitTrigger.GetMergeTrigger() != nil {
  2626  				if atomic.AddInt64(&mergeTxnRetries, -1) >= 0 {
  2627  					return roachpb.NewError(
  2628  						roachpb.NewTransactionRetryError(roachpb.RETRY_SERIALIZABLE, "filter err"))
  2629  				}
  2630  			}
  2631  			if pt := req.GetPushTxn(); pt != nil {
  2632  				if atomic.AddInt64(&pushTxnRetries, -1) >= 0 {
  2633  					return roachpb.NewErrorf("injected failure")
  2634  				}
  2635  			}
  2636  			if g := req.GetGet(); g != nil && ba.ReadConsistency == roachpb.READ_UNCOMMITTED {
  2637  				if atomic.AddInt64(&meta2GetRetries, -1) >= 0 {
  2638  					return roachpb.NewErrorf("injected failure")
  2639  				}
  2640  			}
  2641  		}
  2642  		return nil
  2643  	}
  2644  
  2645  	mtc = &multiTestContext{
  2646  		storeConfig: &storeCfg,
  2647  		// This test was written before the multiTestContext started creating many
  2648  		// system ranges at startup, and hasn't been update to take that into
  2649  		// account.
  2650  		startWithSingleRange: true,
  2651  	}
  2652  
  2653  	mtc.Start(t, 3)
  2654  	defer mtc.Stop()
  2655  	store0, store2 := mtc.Store(0), mtc.Store(2)
  2656  
  2657  	// Make store0 the leaseholder of the LHS and store2 the leaseholder of the
  2658  	// RHS. We'll be forcing store2's LHS to fall behind. This creates an
  2659  	// interesting scenario in which the leaseholder for the RHS has very
  2660  	// out-of-date information about the status of the merge.
  2661  	rngID := store0.LookupReplica(roachpb.RKey("a")).Desc().RangeID
  2662  	mtc.replicateRange(rngID, 1, 2)
  2663  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store0)
  2664  	if err != nil {
  2665  		t.Fatal(err)
  2666  	}
  2667  	mtc.transferLease(ctx, rhsDesc.RangeID, 0, 2)
  2668  
  2669  	// After the LHS replica on store2 processes the split, block Raft traffic to
  2670  	// it by holding its raftMu, so that it isn't aware there's a merge in
  2671  	// progress.
  2672  	lhsRepl2, err := store2.GetReplica(lhsDesc.RangeID)
  2673  	if err != nil {
  2674  		t.Fatal(err)
  2675  	}
  2676  	testutils.SucceedsSoon(t, func() error {
  2677  		if !lhsRepl2.Desc().Equal(lhsDesc) {
  2678  			return errors.New("store2 has not processed split")
  2679  		}
  2680  		return nil
  2681  	})
  2682  	lhsRepl2.RaftLock()
  2683  
  2684  	args := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  2685  	_, pErr := kv.SendWrapped(ctx, store0.TestSender(), args)
  2686  	if pErr != nil {
  2687  		t.Fatal(pErr)
  2688  	}
  2689  
  2690  	// Immediately after the merge completes, send a request to the RHS which will
  2691  	// be handled by the leaseholder, on store2. This exercises a tricky scenario.
  2692  	// We've forced store2's LHS replica to fall behind, so it can't subsume
  2693  	// store2's RHS. store2's RHS is watching for the merge to complete, however,
  2694  	// and will notice that the merge has committed before the LHS does.
  2695  	getErr := make(chan error)
  2696  	go func() {
  2697  		_, pErr = kv.SendWrappedWith(ctx, store2.TestSender(), roachpb.Header{
  2698  			RangeID: rhsDesc.RangeID,
  2699  		}, getArgs(rhsDesc.StartKey.AsRawKey()))
  2700  		getErr <- pErr.GoError()
  2701  	}()
  2702  
  2703  	// Restore communication with store2. Give it the lease to force all commands
  2704  	// to be applied, including the merge trigger.
  2705  	lhsRepl2.RaftUnlock()
  2706  	mtc.transferLease(ctx, lhsDesc.RangeID, 0, 2)
  2707  
  2708  	// We *must* see a RangeNotFound error from the get request we sent earlier
  2709  	// because we sent it after the merge completed. Anything else is a
  2710  	// consistency error (or a bug in the test).
  2711  	if err := <-getErr; !testutils.IsError(err, "r2 was not found") {
  2712  		t.Fatalf("expected RangeNotFound error from get after merge, but got %v", err)
  2713  	}
  2714  }
  2715  
// TestStoreRangeMergeSlowWatcher verifies that the watcher goroutine for the
// RHS of a merge does not erroneously permit traffic after the merge commits,
// even if the watcher goroutine is so slow in noticing the merge that another
// merge occurs.
//
// This test is a more complicated version of TestStoreRangeMergeWatcher that
// exercises a rare but important edge case.
//
// The test creates three ranges, [a, b), [b, c), and [c, /Max). Hereafter these
// ranges will be referred to as A, B, and C, respectively. store0 holds the
// lease on A and C, while store1 holds the lease on B. The test will execute
// two merges such that first A subsumes B, then AB subsumes C. The idea is to
// inform store1 that the A <- B merge is in progress so that it locks B down,
// but then keep it in the dark about the status of the merge for long enough
// that the AB <- C merge commits.
//
// When store1's merge watcher goroutine looks up whether the A <- B merge
// commit occurred in meta2 with a Get(/Meta2/c) request, it won't find the
// descriptor for B, which would indicate that the merge aborted, nor the
// descriptor for AB, which would indicate that the merge committed. Instead it
// will find no descriptor at all, since the AB <- C merge has committed and the
// descriptor for the merged range ABC is stored at /Meta2/Max, not /Meta2/c.
func TestStoreRangeMergeSlowWatcher(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	aKey, bKey, cKey := roachpb.RKey("a"), roachpb.RKey("b"), roachpb.RKey("c")
	storeCfg := kvserver.TestStoreConfig(nil)
	storeCfg.TestingKnobs.DisableReplicateQueue = true
	// mtc, store0 and store1 are declared up front because the request filter
	// below refers to store1 before the stores exist; they are assigned once
	// the multiTestContext has been started.
	var mtc *multiTestContext
	var store0, store1 *kvserver.Store

	// Force PushTxn requests generated by the watcher goroutine to wait on a
	// channel. This is how we control when store1's merge watcher goroutine hears
	// about the status of the A <- B merge.
	// (The waiting is implemented with a sync.Cond on syn rather than a literal
	// channel.)
	var syn syncutil.Mutex
	cond := sync.NewCond(&syn)
	storeCfg.TestingKnobs.TestingRequestFilter = func(_ context.Context, ba roachpb.BatchRequest) *roachpb.Error {
		// syn is held for the whole filter; cond.Wait releases it while blocked
		// and reacquires it before returning.
		syn.Lock()
		defer syn.Unlock()
		for _, req := range ba.Requests {
			// We can detect PushTxn requests generated by the watcher goroutine
			// because they use the minimum transaction priority. Note that we
			// only block the watcher goroutine on store1 so that we only interfere
			// with the first merge (A <- B) and not the later merge (AB <- C).
			if pt := req.GetPushTxn(); pt != nil && pt.PusherTxn.Priority == enginepb.MinTxnPriority &&
				ba.GatewayNodeID == store1.Ident.NodeID {
				cond.Wait()
			}
			if et := req.GetEndTxn(); et != nil && !et.Commit && ba.Txn.Name == "merge" {
				// The merge transaction needed to restart for some reason. To avoid
				// deadlocking, we need to allow the watcher goroutine's PushTxn request
				// through so that it allows traffic on the range again. We'll try again
				// with the restarted merge transaction.
				cond.Signal()
			}
		}
		return nil
	}

	// Record whether we've seen a request to Get(/Meta2/c) that returned nil.
	// This verifies that we're actually testing what we claim to.
	// sawMeta2Req is accessed atomically: it is written from the response
	// filter and read by the test at the end.
	var sawMeta2Req int64
	meta2CKey := keys.RangeMetaKey(cKey).AsRawKey()
	storeCfg.TestingKnobs.TestingResponseFilter = func(
		ctx context.Context, ba roachpb.BatchRequest, br *roachpb.BatchResponse,
	) *roachpb.Error {
		for i, req := range ba.Requests {
			if g := req.GetGet(); g != nil && g.Key.Equal(meta2CKey) && br.Responses[i].GetGet().Value == nil {
				atomic.StoreInt64(&sawMeta2Req, 1)
			}
		}
		return nil
	}

	mtc = &multiTestContext{storeConfig: &storeCfg}
	mtc.Start(t, 3)
	defer mtc.Stop()
	store0, store1 = mtc.Store(0), mtc.Store(1)

	// Create and place the ranges as described in the comment on this test.
	rngID := store0.LookupReplica(aKey).Desc().RangeID
	mtc.replicateRange(rngID, 1, 2)
	// Note: from here on the local slice named keys shadows the imported keys
	// package (which was used above to build meta2CKey).
	keys := []roachpb.RKey{aKey, bKey, cKey}
	for _, key := range keys {
		splitArgs := adminSplitArgs(key.AsRawKey())
		if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], splitArgs); pErr != nil {
			t.Fatal(pErr)
		}
	}
	bRangeID := store0.LookupReplica(bKey).RangeID
	mtc.transferLease(ctx, bRangeID, 0, 1)

	// Warm the DistSender cache on each node. We'll be blocking requests to B
	// during the test, and we don't want requests headed for A or C to get routed
	// to B while it's blocked because of a stale DistSender cache.
	for _, key := range keys {
		for _, distSender := range mtc.distSenders {
			if _, pErr := kv.SendWrapped(ctx, distSender, getArgs(key.AsRawKey())); pErr != nil {
				t.Fatal(pErr)
			}
		}
	}

	// Force the replica of A on store1 to fall behind so that it doesn't apply
	// any merge triggers. This makes the watcher goroutine responsible for
	// marking B as destroyed.
	// The Raft lock is held until the test function returns.
	aRepl1 := store1.LookupReplica(aKey)
	aRepl1.RaftLock()
	defer aRepl1.RaftUnlock()

	// Merge A <- B.
	mergeArgs := adminMergeArgs(aKey.AsRawKey())
	if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], mergeArgs); pErr != nil {
		t.Fatal(pErr)
	}

	// Immediately after the merge completes, send a request to B.
	// The request is sent directly to store1 with B's range ID; its outcome is
	// delivered on getErr and checked at the end of the test.
	getErr := make(chan error)
	go func() {
		_, pErr := kv.SendWrappedWith(ctx, store1.TestSender(), roachpb.Header{
			RangeID: bRangeID,
		}, getArgs(bKey.AsRawKey()))
		getErr <- pErr.GoError()
	}()

	// Merge AB <- C.
	// The same mergeArgs (keyed at "a") are reused: after the first merge the
	// range starting at "a" is AB, so this merges it with its right neighbor C.
	if _, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], mergeArgs); pErr != nil {
		t.Fatal(pErr)
	}

	// Synchronously ensure that the intent on meta2CKey has been cleaned up.
	// The merge committed, but the intent resolution happens asynchronously.
	_, pErr := kv.SendWrapped(ctx, mtc.distSenders[0], getArgs(meta2CKey))
	if pErr != nil {
		t.Fatal(pErr)
	}

	// With the meta2CKey intent cleaned up, allow store1's merge watcher
	// goroutine to proceed.
	cond.Signal()

	// We *must* see a RangeNotFound error from the get request we sent earlier
	// because we sent it after the merge completed. Anything else is a
	// consistency error (or a bug in the test).
	expErr := fmt.Sprintf("r%d was not found", bRangeID)
	if err := <-getErr; !testutils.IsError(err, expErr) {
		t.Fatalf("expected %q error from get after merge, but got %v", expErr, err)
	}

	// Guard against the test silently passing without ever exercising the
	// empty Get(/Meta2/c) response it is meant to cover.
	if atomic.LoadInt64(&sawMeta2Req) != 1 {
		t.Fatalf("test did not generate expected meta2 get request/response")
	}
}
  2870  
  2871  func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
  2872  	defer leaktest.AfterTest(t)()
  2873  
  2874  	// We will be testing the SSTs written on store2's engine.
  2875  	var receivingEng, sendingEng storage.Engine
  2876  	ctx := context.Background()
  2877  	storeCfg := kvserver.TestStoreConfig(nil)
  2878  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  2879  	storeCfg.TestingKnobs.DisableReplicaGCQueue = true
  2880  	storeCfg.Clock = nil // manual clock
  2881  	storeCfg.TestingKnobs.BeforeSnapshotSSTIngestion = func(
  2882  		inSnap kvserver.IncomingSnapshot,
  2883  		snapType kvserver.SnapshotRequest_Type,
  2884  		sstNames []string,
  2885  	) error {
  2886  		// Only verify snapshots of type RAFT and on the range under exercise
  2887  		// (range 2). Note that the keys of range 2 aren't verified in this
  2888  		// functions. Unreplicated range-id local keys are not verified because
  2889  		// there are too many keys and the other replicated keys are verified later
  2890  		// on in the test. This function verifies that the subsumed replicas have
  2891  		// been handled properly.
  2892  		if snapType != kvserver.SnapshotRequest_RAFT || inSnap.State.Desc.RangeID != roachpb.RangeID(2) {
  2893  			return nil
  2894  		}
  2895  		// The seven SSTs we are expecting to ingest are in the following order:
  2896  		// 1. Replicated range-id local keys of the range in the snapshot.
  2897  		// 2. Range-local keys of the range in the snapshot.
  2898  		// 3. User keys of the range in the snapshot.
  2899  		// 4. Unreplicated range-id local keys of the range in the snapshot.
  2900  		// 5. SST to clear range-id local keys of the subsumed replica with
  2901  		//    RangeID 3.
  2902  		// 6. SST to clear range-id local keys of the subsumed replica with
  2903  		//    RangeID 4.
  2904  		// 7. SST to clear the user keys of the subsumed replicas.
  2905  		//
  2906  		// NOTE: There are no range-local keys in [d, /Max) in the store we're
  2907  		// sending a snapshot to, so we aren't expecting an SST to clear those
  2908  		// keys.
  2909  		if len(sstNames) != 7 {
  2910  			return errors.Errorf("expected to ingest 7 SSTs, got %d SSTs", len(sstNames))
  2911  		}
  2912  
  2913  		// Only try to predict SSTs 3 and 5-7. SSTs 1, 2 and 4 are excluded in
  2914  		// the test since the state of the Raft log can be non-deterministic
  2915  		// with extra entries being appended to the sender's log after the
  2916  		// snapshot has already been sent.
  2917  		var sstNamesSubset []string
  2918  		sstNamesSubset = append(sstNamesSubset, sstNames[2])
  2919  		sstNamesSubset = append(sstNamesSubset, sstNames[4:]...)
  2920  
  2921  		// Construct the expected SSTs and ensure that they are byte-by-byte
  2922  		// equal. This verification ensures that the SSTs have the same
  2923  		// tombstones and range deletion tombstones.
  2924  		var expectedSSTs [][]byte
  2925  
  2926  		// Construct SST #1 through #3 as numbered above, but only ultimately
  2927  		// keep the 3rd one.
  2928  		keyRanges := rditer.MakeReplicatedKeyRanges(inSnap.State.Desc)
  2929  		it := rditer.NewReplicaDataIterator(inSnap.State.Desc, sendingEng, true /* replicatedOnly */, false /* seekEnd */)
  2930  		defer it.Close()
  2931  		// Write a range deletion tombstone to each of the SSTs then put in the
  2932  		// kv entries from the sender of the snapshot.
  2933  		for _, r := range keyRanges {
  2934  			sstFile := &storage.MemFile{}
  2935  			sst := storage.MakeIngestionSSTWriter(sstFile)
  2936  			if err := sst.ClearRange(r.Start, r.End); err != nil {
  2937  				return err
  2938  			}
  2939  
  2940  			// Keep adding kv data to the SST until the the key exceeds the
  2941  			// bounds of the range, then proceed to the next range.
  2942  			for ; ; it.Next() {
  2943  				valid, err := it.Valid()
  2944  				if err != nil {
  2945  					return err
  2946  				}
  2947  				if !valid || r.End.Key.Compare(it.Key().Key) <= 0 {
  2948  					if err := sst.Finish(); err != nil {
  2949  						return err
  2950  					}
  2951  					sst.Close()
  2952  					expectedSSTs = append(expectedSSTs, sstFile.Data())
  2953  					break
  2954  				}
  2955  				if err := sst.Put(it.Key(), it.Value()); err != nil {
  2956  					return err
  2957  				}
  2958  			}
  2959  		}
  2960  		expectedSSTs = expectedSSTs[2:]
  2961  
  2962  		// Construct SSTs #5 and #6: range-id local keys of subsumed replicas
  2963  		// with RangeIDs 3 and 4.
  2964  		for _, rangeID := range []roachpb.RangeID{roachpb.RangeID(3), roachpb.RangeID(4)} {
  2965  			sstFile := &storage.MemFile{}
  2966  			sst := storage.MakeIngestionSSTWriter(sstFile)
  2967  			defer sst.Close()
  2968  			r := rditer.MakeRangeIDLocalKeyRange(rangeID, false /* replicatedOnly */)
  2969  			if err := sst.ClearRange(r.Start, r.End); err != nil {
  2970  				return err
  2971  			}
  2972  			tombstoneKey := keys.RangeTombstoneKey(rangeID)
  2973  			tombstoneValue := &roachpb.RangeTombstone{NextReplicaID: math.MaxInt32}
  2974  			if err := storage.MVCCBlindPutProto(context.Background(), &sst, nil, tombstoneKey, hlc.Timestamp{}, tombstoneValue, nil); err != nil {
  2975  				return err
  2976  			}
  2977  			err := sst.Finish()
  2978  			if err != nil {
  2979  				return err
  2980  			}
  2981  			expectedSSTs = append(expectedSSTs, sstFile.Data())
  2982  		}
  2983  
  2984  		// Construct SST #7: user key range of subsumed replicas.
  2985  		sstFile := &storage.MemFile{}
  2986  		sst := storage.MakeIngestionSSTWriter(sstFile)
  2987  		defer sst.Close()
  2988  		desc := roachpb.RangeDescriptor{
  2989  			StartKey: roachpb.RKey("d"),
  2990  			EndKey:   roachpb.RKeyMax,
  2991  		}
  2992  		r := rditer.MakeUserKeyRange(&desc)
  2993  		if err := storage.ClearRangeWithHeuristic(receivingEng, &sst, r.Start.Key, r.End.Key); err != nil {
  2994  			return err
  2995  		}
  2996  		err := sst.Finish()
  2997  		if err != nil {
  2998  			return err
  2999  		}
  3000  		expectedSSTs = append(expectedSSTs, sstFile.Data())
  3001  
  3002  		var mismatchedSstsIdx []int
  3003  		// Iterate over all the tested SSTs and check that they're byte-by-byte equal.
  3004  		for i := range sstNamesSubset {
  3005  			actualSST, err := receivingEng.ReadFile(sstNamesSubset[i])
  3006  			if err != nil {
  3007  				return err
  3008  			}
  3009  			if !bytes.Equal(actualSST, expectedSSTs[i]) {
  3010  				mismatchedSstsIdx = append(mismatchedSstsIdx, i)
  3011  			}
  3012  		}
  3013  		if len(mismatchedSstsIdx) != 0 {
  3014  			return errors.Errorf("SST indices %v don't match", mismatchedSstsIdx)
  3015  		}
  3016  		return nil
  3017  	}
  3018  	mtc := &multiTestContext{
  3019  		storeConfig: &storeCfg,
  3020  		// This test was written before the multiTestContext started creating many
  3021  		// system ranges at startup, and hasn't been update to take that into
  3022  		// account.
  3023  		startWithSingleRange: true,
  3024  	}
  3025  	mtc.Start(t, 3)
  3026  	defer mtc.Stop()
  3027  	store0, store2 := mtc.Store(0), mtc.Store(2)
  3028  	sendingEng = store0.Engine()
  3029  	receivingEng = store2.Engine()
  3030  	distSender := mtc.distSenders[0]
  3031  
  3032  	// Create three fully-caught-up, adjacent ranges on all three stores.
  3033  	mtc.replicateRange(roachpb.RangeID(1), 1, 2)
  3034  	for _, key := range []roachpb.Key{roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c")} {
  3035  		if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(key)); pErr != nil {
  3036  			t.Fatal(pErr)
  3037  		}
  3038  		if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(key, 1)); pErr != nil {
  3039  			t.Fatal(pErr)
  3040  		}
  3041  		mtc.waitForValues(key, []int64{1, 1, 1})
  3042  	}
  3043  
  3044  	// Put some keys in [d, /Max) so the subsumed replica of [c, /Max) with range
  3045  	// ID 4 has tombstones. We will clear uncontained key range of subsumed
  3046  	// replicas, so when we are receiving a snapshot for [a, d), we expect to
  3047  	// clear the keys in [d, /Max).
  3048  	for i := 0; i < 10; i++ {
  3049  		key := roachpb.Key("d" + strconv.Itoa(i))
  3050  		if _, pErr := kv.SendWrapped(ctx, distSender, incrementArgs(key, 1)); pErr != nil {
  3051  			t.Fatal(pErr)
  3052  		}
  3053  		mtc.waitForValues(key, []int64{1, 1, 1})
  3054  	}
  3055  
  3056  	aRepl0 := store0.LookupReplica(roachpb.RKey("a"))
  3057  
  3058  	// Start dropping all Raft traffic to the first range on store2.
  3059  	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
  3060  		rangeID:            aRepl0.RangeID,
  3061  		RaftMessageHandler: store2,
  3062  	})
  3063  
  3064  	// Merge [a, b) into [b, c), then [a, c) into [c, /Max).
  3065  	for i := 0; i < 2; i++ {
  3066  		if _, pErr := kv.SendWrapped(ctx, distSender, adminMergeArgs(roachpb.Key("a"))); pErr != nil {
  3067  			t.Fatal(pErr)
  3068  		}
  3069  	}
  3070  
  3071  	// Split [a, /Max) into [a, d) and [d, /Max). This means the Raft snapshot
  3072  	// will span both a merge and a split.
  3073  	if _, pErr := kv.SendWrapped(ctx, distSender, adminSplitArgs(roachpb.Key("d"))); pErr != nil {
  3074  		t.Fatal(pErr)
  3075  	}
  3076  
  3077  	// Truncate the logs of the LHS.
  3078  	index := func() uint64 {
  3079  		repl := store0.LookupReplica(roachpb.RKey("a"))
  3080  		index, err := repl.GetLastIndex()
  3081  		if err != nil {
  3082  			t.Fatal(err)
  3083  		}
  3084  		// Truncate the log at index+1 (log entries < N are removed, so this
  3085  		// includes the merge).
  3086  		truncArgs := &roachpb.TruncateLogRequest{
  3087  			RequestHeader: roachpb.RequestHeader{Key: roachpb.Key("a")},
  3088  			Index:         index,
  3089  			RangeID:       repl.RangeID,
  3090  		}
  3091  		if _, err := kv.SendWrapped(ctx, mtc.distSenders[0], truncArgs); err != nil {
  3092  			t.Fatal(err)
  3093  		}
  3094  		return index
  3095  	}()
  3096  
  3097  	beforeRaftSnaps := store2.Metrics().RangeSnapshotsNormalApplied.Count()
  3098  
  3099  	// Restore Raft traffic to the LHS on store2.
  3100  	log.Infof(ctx, "restored traffic to store 2")
  3101  	mtc.transport.Listen(store2.Ident.StoreID, &unreliableRaftHandler{
  3102  		rangeID:            aRepl0.RangeID,
  3103  		RaftMessageHandler: store2,
  3104  		unreliableRaftHandlerFuncs: unreliableRaftHandlerFuncs{
  3105  			dropReq: func(req *kvserver.RaftMessageRequest) bool {
  3106  				// Make sure that even going forward no MsgApp for what we just
  3107  				// truncated can make it through. The Raft transport is asynchronous
  3108  				// so this is necessary to make the test pass reliably - otherwise
  3109  				// the follower on store2 may catch up without needing a snapshot,
  3110  				// tripping up the test.
  3111  				//
  3112  				// NB: the Index on the message is the log index that _precedes_ any of the
  3113  				// entries in the MsgApp, so filter where msg.Index < index, not <= index.
  3114  				return req.Message.Type == raftpb.MsgApp && req.Message.Index < index
  3115  			},
  3116  			// Don't drop heartbeats or responses.
  3117  			dropHB:   func(*kvserver.RaftHeartbeat) bool { return false },
  3118  			dropResp: func(*kvserver.RaftMessageResponse) bool { return false },
  3119  		},
  3120  	})
  3121  
  3122  	// Wait for all replicas to catch up to the same point. Because we truncated
  3123  	// the log while store2 was unavailable, this will require a Raft snapshot.
  3124  	testutils.SucceedsSoon(t, func() error {
  3125  		afterRaftSnaps := store2.Metrics().RangeSnapshotsNormalApplied.Count()
  3126  		if afterRaftSnaps <= beforeRaftSnaps {
  3127  			return errors.New("expected store2 to apply at least 1 additional raft snapshot")
  3128  		}
  3129  
  3130  		// Verify that the sets of keys in store0 and store2 are identical.
  3131  		storeKeys0 := getEngineKeySet(t, store0.Engine())
  3132  		storeKeys2 := getEngineKeySet(t, store2.Engine())
  3133  		dRepl0 := store0.LookupReplica(roachpb.RKey("d"))
  3134  		ignoreKey := func(k string) bool {
  3135  			// Unreplicated keys for the remaining ranges are allowed to differ.
  3136  			for _, id := range []roachpb.RangeID{1, aRepl0.RangeID, dRepl0.RangeID} {
  3137  				if strings.HasPrefix(k, string(keys.MakeRangeIDUnreplicatedPrefix(id))) {
  3138  					return true
  3139  				}
  3140  			}
  3141  			return false
  3142  		}
  3143  		for k := range storeKeys0 {
  3144  			if ignoreKey(k) {
  3145  				continue
  3146  			}
  3147  			if _, ok := storeKeys2[k]; !ok {
  3148  				return fmt.Errorf("store2 missing key %s", roachpb.Key(k))
  3149  			}
  3150  		}
  3151  		for k := range storeKeys2 {
  3152  			if ignoreKey(k) {
  3153  				continue
  3154  			}
  3155  			if _, ok := storeKeys0[k]; !ok {
  3156  				return fmt.Errorf("store2 has extra key %s", roachpb.Key(k))
  3157  			}
  3158  		}
  3159  		return nil
  3160  	})
  3161  }
  3162  
  3163  // TestStoreRangeMergeDuringShutdown verifies that a shutdown of a store
  3164  // containing the RHS of a merge can occur cleanly. This previously triggered
  3165  // a fatal error (#27552).
  3166  func TestStoreRangeMergeDuringShutdown(t *testing.T) {
  3167  	defer leaktest.AfterTest(t)()
  3168  
  3169  	ctx := context.Background()
  3170  	storeCfg := kvserver.TestStoreConfig(nil)
  3171  	storeCfg.TestingKnobs.DisableSplitQueue = true
  3172  	storeCfg.TestingKnobs.DisableMergeQueue = true
  3173  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  3174  	storeCfg.Clock = nil // manual clock
  3175  
  3176  	// Install a filter that triggers a shutdown when stop is non-zero and the
  3177  	// rhsDesc requests a new lease.
  3178  	var mtc *multiTestContext
  3179  	var state struct {
  3180  		syncutil.Mutex
  3181  		rhsDesc        *roachpb.RangeDescriptor
  3182  		stop, stopping bool
  3183  	}
  3184  	storeCfg.TestingKnobs.TestingPostApplyFilter = func(args kvserverbase.ApplyFilterArgs) (int, *roachpb.Error) {
  3185  		state.Lock()
  3186  		if state.stop && !state.stopping && args.RangeID == state.rhsDesc.RangeID && args.IsLeaseRequest {
  3187  			// Shut down the store. The lease acquisition will notice that a merge is
  3188  			// in progress and attempt to run a task to watch for its completion.
  3189  			// Shutting down the store before running leasePostApply will prevent that
  3190  			// task from launching. This error path would previously fatal a node
  3191  			// incorrectly (#27552).
  3192  			state.stopping = true
  3193  			state.Unlock()
  3194  			go mtc.Stop()
  3195  			// Sleep to give the shutdown time to propagate. The test appeared to work
  3196  			// without this sleep, but best to be somewhat robust to different
  3197  			// goroutine schedules.
  3198  			time.Sleep(10 * time.Millisecond)
  3199  		} else {
  3200  			state.Unlock()
  3201  		}
  3202  		return 0, nil
  3203  	}
  3204  
  3205  	mtc = &multiTestContext{
  3206  		storeConfig: &storeCfg,
  3207  		// This test was written before the multiTestContext started creating many
  3208  		// system ranges at startup, and hasn't been update to take that into
  3209  		// account.
  3210  		startWithSingleRange: true,
  3211  	}
  3212  	mtc.Start(t, 1)
  3213  	store := mtc.Store(0)
  3214  	stopper := mtc.engineStoppers[0]
  3215  
  3216  	_, rhsDesc, err := createSplitRanges(ctx, store)
  3217  	if err != nil {
  3218  		t.Fatal(err)
  3219  	}
  3220  	state.Lock()
  3221  	state.rhsDesc = rhsDesc
  3222  	state.Unlock()
  3223  
  3224  	// Simulate a merge transaction by launching a transaction that lays down
  3225  	// intents on the two copies of the RHS range descriptor.
  3226  	txn := kv.NewTxn(ctx, store.DB(), 0 /* gatewayNodeID */)
  3227  	if err := txn.Del(ctx, keys.RangeDescriptorKey(rhsDesc.StartKey)); err != nil {
  3228  		t.Fatal(err)
  3229  	}
  3230  	if err := txn.Del(ctx, keys.RangeMetaKey(rhsDesc.StartKey)); err != nil {
  3231  		t.Fatal(err)
  3232  	}
  3233  
  3234  	// Indicate to the store filter installed above that the next lease
  3235  	// acquisition for the RHS should trigger a shutdown.
  3236  	state.Lock()
  3237  	state.stop = true
  3238  	state.Unlock()
  3239  
  3240  	// Expire all leases.
  3241  	mtc.advanceClock(ctx)
  3242  
  3243  	// Send a dummy get request on the RHS to force a lease acquisition. We expect
  3244  	// this to fail, as quiescing stores cannot acquire leases.
  3245  	err = stopper.RunTaskWithErr(ctx, "test-get-rhs-key", func(ctx context.Context) error {
  3246  		_, err := store.DB().Get(ctx, "dummy-rhs-key")
  3247  		return err
  3248  	})
  3249  	if exp := "not lease holder"; !testutils.IsError(err, exp) {
  3250  		t.Fatalf("expected %q error, but got %v", err, exp)
  3251  	}
  3252  }
  3253  
  3254  func TestMergeQueue(t *testing.T) {
  3255  	defer leaktest.AfterTest(t)()
  3256  
  3257  	ctx := context.Background()
  3258  	manualClock := hlc.NewManualClock(123)
  3259  	clock := hlc.NewClock(manualClock.UnixNano, time.Nanosecond)
  3260  	storeCfg := kvserver.TestStoreConfig(nil)
  3261  	storeCfg.TestingKnobs.DisableSplitQueue = true
  3262  	storeCfg.TestingKnobs.DisableReplicateQueue = true
  3263  	storeCfg.TestingKnobs.DisableScanner = true
  3264  	rangeMinBytes := int64(1 << 10) // 1KB
  3265  	storeCfg.DefaultZoneConfig.RangeMinBytes = &rangeMinBytes
  3266  	sv := &storeCfg.Settings.SV
  3267  	kvserverbase.MergeQueueEnabled.Override(sv, true)
  3268  	kvserver.MergeQueueInterval.Override(sv, 0) // process greedily
  3269  	var mtc multiTestContext
  3270  	// This test was written before the multiTestContext started creating many
  3271  	// system ranges at startup, and hasn't been update to take that into account.
  3272  	mtc.startWithSingleRange = true
  3273  
  3274  	mtc.storeConfig = &storeCfg
  3275  	// Inject clock for manipulation in tests.
  3276  	mtc.storeConfig.Clock = clock
  3277  	mtc.Start(t, 2)
  3278  	defer mtc.Stop()
  3279  	mtc.initGossipNetwork() // needed for the non-collocated case's rebalancing to work
  3280  	store := mtc.Store(0)
  3281  	store.SetMergeQueueActive(true)
  3282  
  3283  	split := func(t *testing.T, key roachpb.Key, expirationTime hlc.Timestamp) {
  3284  		t.Helper()
  3285  		args := adminSplitArgs(key)
  3286  		args.ExpirationTime = expirationTime
  3287  		if _, pErr := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), args); pErr != nil {
  3288  			t.Fatal(pErr)
  3289  		}
  3290  	}
  3291  
  3292  	clearRange := func(t *testing.T, start, end roachpb.RKey) {
  3293  		if _, pErr := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), &roachpb.ClearRangeRequest{
  3294  			RequestHeader: roachpb.RequestHeader{Key: start.AsRawKey(), EndKey: end.AsRawKey()},
  3295  		}); pErr != nil {
  3296  			t.Fatal(pErr)
  3297  		}
  3298  	}
  3299  
  3300  	// Create two empty ranges, a - b and b - c, by splitting at a, b, and c.
  3301  	lhsStartKey := roachpb.RKey("a")
  3302  	rhsStartKey := roachpb.RKey("b")
  3303  	rhsEndKey := roachpb.RKey("c")
  3304  	for _, k := range []roachpb.RKey{lhsStartKey, rhsStartKey, rhsEndKey} {
  3305  		split(t, k.AsRawKey(), hlc.Timestamp{} /* expirationTime */)
  3306  	}
  3307  	lhs := func() *kvserver.Replica { return store.LookupReplica(lhsStartKey) }
  3308  	rhs := func() *kvserver.Replica { return store.LookupReplica(rhsStartKey) }
  3309  
  3310  	// setThresholds simulates a zone config update that updates the ranges'
  3311  	// minimum and maximum sizes.
  3312  	setZones := func(zone zonepb.ZoneConfig) {
  3313  		lhs().SetZoneConfig(&zone)
  3314  		rhs().SetZoneConfig(&zone)
  3315  	}
  3316  
  3317  	rng, _ := randutil.NewPseudoRand()
  3318  	randBytes := randutil.RandBytes(rng, int(*storeCfg.DefaultZoneConfig.RangeMinBytes))
  3319  
  3320  	reset := func(t *testing.T) {
  3321  		t.Helper()
  3322  		clearRange(t, lhsStartKey, rhsEndKey)
  3323  		for _, k := range []roachpb.RKey{lhsStartKey, rhsStartKey} {
  3324  			if err := store.DB().Put(ctx, k, randBytes); err != nil {
  3325  				t.Fatal(err)
  3326  			}
  3327  		}
  3328  		setZones(*storeCfg.DefaultZoneConfig)
  3329  		store.MustForceMergeScanAndProcess() // drain any merges that might already be queued
  3330  		split(t, roachpb.Key("b"), hlc.Timestamp{} /* expirationTime */)
  3331  	}
  3332  
  3333  	verifyMerged := func(t *testing.T) {
  3334  		t.Helper()
  3335  		repl := store.LookupReplica(rhsStartKey)
  3336  		if !repl.Desc().StartKey.Equal(lhsStartKey) {
  3337  			t.Fatalf("ranges unexpectedly unmerged")
  3338  		}
  3339  	}
  3340  
  3341  	verifyUnmerged := func(t *testing.T) {
  3342  		t.Helper()
  3343  		repl := store.LookupReplica(rhsStartKey)
  3344  		if repl.Desc().StartKey.Equal(lhsStartKey) {
  3345  			t.Fatalf("ranges unexpectedly merged")
  3346  		}
  3347  	}
  3348  
  3349  	t.Run("sanity", func(t *testing.T) {
  3350  		// Check that ranges are not trivially merged after reset.
  3351  		reset(t)
  3352  		store.MustForceMergeScanAndProcess()
  3353  		verifyUnmerged(t)
  3354  		reset(t)
  3355  		store.MustForceMergeScanAndProcess()
  3356  		verifyUnmerged(t)
  3357  	})
  3358  
  3359  	t.Run("both-empty", func(t *testing.T) {
  3360  		reset(t)
  3361  		clearRange(t, lhsStartKey, rhsEndKey)
  3362  		store.MustForceMergeScanAndProcess()
  3363  		verifyMerged(t)
  3364  	})
  3365  
  3366  	t.Run("lhs-undersize", func(t *testing.T) {
  3367  		reset(t)
  3368  		zone := protoutil.Clone(storeCfg.DefaultZoneConfig).(*zonepb.ZoneConfig)
  3369  		*zone.RangeMinBytes *= 2
  3370  		lhs().SetZoneConfig(zone)
  3371  		store.MustForceMergeScanAndProcess()
  3372  		verifyMerged(t)
  3373  	})
  3374  
  3375  	t.Run("combined-threshold", func(t *testing.T) {
  3376  		reset(t)
  3377  
  3378  		// The ranges are individually beneath the minimum size threshold, but
  3379  		// together they'll exceed the maximum size threshold.
  3380  		zone := protoutil.Clone(storeCfg.DefaultZoneConfig).(*zonepb.ZoneConfig)
  3381  		zone.RangeMinBytes = proto.Int64(lhs().GetMVCCStats().Total() + 1)
  3382  		zone.RangeMaxBytes = proto.Int64(lhs().GetMVCCStats().Total()*2 - 1)
  3383  		setZones(*zone)
  3384  		store.MustForceMergeScanAndProcess()
  3385  		verifyUnmerged(t)
  3386  
  3387  		// Once the maximum size threshold is increased, the merge can occur.
  3388  		zone.RangeMaxBytes = proto.Int64(*zone.RangeMaxBytes + 1)
  3389  		setZones(*zone)
  3390  		store.MustForceMergeScanAndProcess()
  3391  		verifyMerged(t)
  3392  	})
  3393  
  3394  	t.Run("non-collocated", func(t *testing.T) {
  3395  		reset(t)
  3396  		verifyUnmerged(t)
  3397  		rhsRangeID := rhs().RangeID
  3398  		mtc.replicateRange(rhsRangeID, 1)
  3399  		mtc.transferLease(ctx, rhsRangeID, 0, 1)
  3400  		mtc.unreplicateRange(rhsRangeID, 0)
  3401  		require.NoError(t, mtc.waitForUnreplicated(rhsRangeID, 0))
  3402  
  3403  		clearRange(t, lhsStartKey, rhsEndKey)
  3404  		store.MustForceMergeScanAndProcess()
  3405  		verifyMerged(t)
  3406  	})
  3407  
  3408  	// TODO(jeffreyxiao): Add subtest to consider load when making merging
  3409  	// decisions.
  3410  
  3411  	t.Run("sticky-bit", func(t *testing.T) {
  3412  		reset(t)
  3413  		store.MustForceMergeScanAndProcess()
  3414  		verifyUnmerged(t)
  3415  
  3416  		// Perform manual merge and verify that no merge occurred.
  3417  		split(t, rhsStartKey.AsRawKey(), hlc.MaxTimestamp /* expirationTime */)
  3418  		clearRange(t, lhsStartKey, rhsEndKey)
  3419  		store.MustForceMergeScanAndProcess()
  3420  		verifyUnmerged(t)
  3421  
  3422  		// Delete sticky bit and verify that merge occurs.
  3423  		unsplitArgs := &roachpb.AdminUnsplitRequest{
  3424  			RequestHeader: roachpb.RequestHeader{
  3425  				Key: rhsStartKey.AsRawKey(),
  3426  			},
  3427  		}
  3428  		if _, err := kv.SendWrapped(ctx, store.DB().NonTransactionalSender(), unsplitArgs); err != nil {
  3429  			t.Fatal(err)
  3430  		}
  3431  		store.MustForceMergeScanAndProcess()
  3432  		verifyMerged(t)
  3433  	})
  3434  
  3435  	t.Run("sticky-bit-expiration", func(t *testing.T) {
  3436  		manualSplitTTL := time.Millisecond * 200
  3437  		reset(t)
  3438  		store.MustForceMergeScanAndProcess()
  3439  		verifyUnmerged(t)
  3440  
  3441  		// Perform manual merge and verify that no merge occurred.
  3442  		split(t, rhsStartKey.AsRawKey(), clock.Now().Add(manualSplitTTL.Nanoseconds(), 0) /* expirationTime */)
  3443  		clearRange(t, lhsStartKey, rhsEndKey)
  3444  		store.MustForceMergeScanAndProcess()
  3445  		verifyUnmerged(t)
  3446  
  3447  		// Sticky bit is not expired yet.
  3448  		manualClock.Set(manualSplitTTL.Nanoseconds())
  3449  		store.MustForceMergeScanAndProcess()
  3450  		verifyUnmerged(t)
  3451  
  3452  		// Sticky bit is expired.
  3453  		manualClock.Set(manualSplitTTL.Nanoseconds() * 2)
  3454  		store.MustForceMergeScanAndProcess()
  3455  		verifyMerged(t)
  3456  	})
  3457  }
  3458  
  3459  func TestInvalidSubsumeRequest(t *testing.T) {
  3460  	defer leaktest.AfterTest(t)()
  3461  
  3462  	ctx := context.Background()
  3463  	var mtc multiTestContext
  3464  	mtc.Start(t, 1)
  3465  	defer mtc.Stop()
  3466  	store := mtc.Store(0)
  3467  
  3468  	// A Subsume request that succeeds when it shouldn't will wedge a
  3469  	// store because it waits for a merge that is not actually in progress. Set a
  3470  	// short timeout to limit the damage.
  3471  	ctx, cancel := context.WithTimeout(ctx, testutils.DefaultSucceedsSoonDuration)
  3472  	defer cancel()
  3473  
  3474  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store)
  3475  	if err != nil {
  3476  		t.Fatal(err)
  3477  	}
  3478  
  3479  	getSnapArgs := roachpb.SubsumeRequest{
  3480  		RequestHeader: roachpb.RequestHeader{Key: rhsDesc.StartKey.AsRawKey()},
  3481  		LeftDesc:      *lhsDesc,
  3482  		RightDesc:     *rhsDesc,
  3483  	}
  3484  
  3485  	// Subsume with an incorrect RightDesc should fail.
  3486  	{
  3487  		badRHSDesc := *rhsDesc
  3488  		badRHSDesc.EndKey = badRHSDesc.EndKey.Next()
  3489  		badArgs := getSnapArgs
  3490  		badArgs.RightDesc = badRHSDesc
  3491  		_, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
  3492  			RangeID: rhsDesc.RangeID,
  3493  		}, &badArgs)
  3494  		if exp := "RHS range bounds do not match"; !testutils.IsPError(pErr, exp) {
  3495  			t.Fatalf("expected %q error, but got %v", exp, pErr)
  3496  		}
  3497  	}
  3498  
  3499  	// Subsume from a non-neighboring LHS should fail.
  3500  	{
  3501  		badArgs := getSnapArgs
  3502  		badArgs.LeftDesc.EndKey = badArgs.LeftDesc.EndKey.Next()
  3503  		_, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
  3504  			RangeID: rhsDesc.RangeID,
  3505  		}, &badArgs)
  3506  		if exp := "ranges are not adjacent"; !testutils.IsPError(pErr, exp) {
  3507  			t.Fatalf("expected %q error, but got %v", exp, pErr)
  3508  		}
  3509  	}
  3510  
  3511  	// Subsume without an intent on the local range descriptor should fail.
  3512  	_, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
  3513  		RangeID: rhsDesc.RangeID,
  3514  	}, &getSnapArgs)
  3515  	if exp := "range missing intent on its local descriptor"; !testutils.IsPError(pErr, exp) {
  3516  		t.Fatalf("expected %q error, but got %v", exp, pErr)
  3517  	}
  3518  
  3519  	// Subsume when a non-deletion intent is present on the
  3520  	// local range descriptor should fail.
  3521  	err = store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  3522  		if err := txn.Put(ctx, keys.RangeDescriptorKey(rhsDesc.StartKey), "garbage"); err != nil {
  3523  			return err
  3524  		}
  3525  		// NB: Subsume intentionally takes place outside of the txn so
  3526  		// that it sees an intent rather than the value the txn just wrote.
  3527  		_, pErr := kv.SendWrappedWith(ctx, store.TestSender(), roachpb.Header{
  3528  			RangeID: rhsDesc.RangeID,
  3529  		}, &getSnapArgs)
  3530  		if exp := "non-deletion intent on local range descriptor"; !testutils.IsPError(pErr, exp) {
  3531  			return fmt.Errorf("expected %q error, but got %v", exp, pErr)
  3532  		}
  3533  		return nil
  3534  	})
  3535  	if err != nil {
  3536  		t.Fatal(err)
  3537  	}
  3538  }
  3539  
  3540  func BenchmarkStoreRangeMerge(b *testing.B) {
  3541  	ctx := context.Background()
  3542  	var mtc multiTestContext
  3543  	mtc.Start(b, 1)
  3544  	defer mtc.Stop()
  3545  	store := mtc.Store(0)
  3546  
  3547  	lhsDesc, rhsDesc, err := createSplitRanges(ctx, store)
  3548  	if err != nil {
  3549  		b.Fatal(err)
  3550  	}
  3551  
  3552  	// Write some values left and right of the proposed split key.
  3553  	kvserver.WriteRandomDataToRange(b, store, lhsDesc.RangeID, []byte("aaa"))
  3554  	kvserver.WriteRandomDataToRange(b, store, rhsDesc.RangeID, []byte("ccc"))
  3555  
  3556  	// Create args to merge the b range back into the a range.
  3557  	mArgs := adminMergeArgs(lhsDesc.StartKey.AsRawKey())
  3558  
  3559  	b.ResetTimer()
  3560  	for i := 0; i < b.N; i++ {
  3561  		// Merge the ranges.
  3562  		b.StartTimer()
  3563  		if _, err := kv.SendWrapped(ctx, store.TestSender(), mArgs); err != nil {
  3564  			b.Fatal(err)
  3565  		}
  3566  
  3567  		// Split the range.
  3568  		b.StopTimer()
  3569  		if _, _, err := createSplitRanges(ctx, store); err != nil {
  3570  			b.Fatal(err)
  3571  		}
  3572  	}
  3573  }