github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_lease_test.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver_test
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"runtime"
    17  	"sync"
    18  	"sync/atomic"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/base"
    23  	"github.com/cockroachdb/cockroach/pkg/config"
    24  	"github.com/cockroachdb/cockroach/pkg/gossip"
    25  	"github.com/cockroachdb/cockroach/pkg/keys"
    26  	"github.com/cockroachdb/cockroach/pkg/kv"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    29  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    30  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    31  	"github.com/cockroachdb/cockroach/pkg/testutils"
    32  	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
    33  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    34  	"github.com/cockroachdb/cockroach/pkg/util/log"
    35  	"github.com/cockroachdb/errors"
    36  	"github.com/stretchr/testify/require"
    37  )
    38  
    39  // TestStoreRangeLease verifies that regular ranges (not some special ones at
    40  // the start of the key space) get epoch-based range leases if enabled and
    41  // expiration-based otherwise.
    42  func TestStoreRangeLease(t *testing.T) {
    43  	defer leaktest.AfterTest(t)()
    44  
    45  	testutils.RunTrueAndFalse(t, "enableEpoch", func(t *testing.T, enableEpoch bool) {
    46  		sc := kvserver.TestStoreConfig(nil)
    47  		sc.TestingKnobs.DisableMergeQueue = true
    48  		sc.EnableEpochRangeLeases = enableEpoch
    49  		mtc := &multiTestContext{storeConfig: &sc}
    50  		defer mtc.Stop()
    51  		mtc.Start(t, 1)
    52  
    53  		// NodeLivenessKeyMax is a static split point, so this is always
    54  		// the start key of the first range that uses epoch-based
    55  		// leases. Splitting on it here is redundant, but we want to include
    56  		// it in our tests of lease types below.
    57  		splitKeys := []roachpb.Key{
    58  			keys.NodeLivenessKeyMax, roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("c"),
    59  		}
    60  		for _, splitKey := range splitKeys {
    61  			splitArgs := adminSplitArgs(splitKey)
    62  			if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
    63  				t.Fatal(pErr)
    64  			}
    65  		}
    66  
    67  		rLeft := mtc.stores[0].LookupReplica(roachpb.RKeyMin)
    68  		lease, _ := rLeft.GetLease()
    69  		if lt := lease.Type(); lt != roachpb.LeaseExpiration {
    70  			t.Fatalf("expected lease type expiration; got %d", lt)
    71  		}
    72  
    73  		// After the expiration, expect an epoch lease for all the ranges if
    74  		// we've enabled epoch based range leases.
    75  		for _, key := range splitKeys {
    76  			repl := mtc.stores[0].LookupReplica(roachpb.RKey(key))
    77  			lease, _ = repl.GetLease()
    78  			if enableEpoch {
    79  				if lt := lease.Type(); lt != roachpb.LeaseEpoch {
    80  					t.Fatalf("expected lease type epoch; got %d", lt)
    81  				}
    82  			} else {
    83  				if lt := lease.Type(); lt != roachpb.LeaseExpiration {
    84  					t.Fatalf("expected lease type expiration; got %d", lt)
    85  				}
    86  			}
    87  		}
    88  	})
    89  }
    90  
    91  // TestStoreRangeLeaseSwitcheroo verifies that ranges can be switched
    92  // between expiration and epoch and back.
    93  func TestStoreRangeLeaseSwitcheroo(t *testing.T) {
    94  	defer leaktest.AfterTest(t)()
    95  	sc := kvserver.TestStoreConfig(nil)
    96  	sc.TestingKnobs.DisableMergeQueue = true
    97  	sc.EnableEpochRangeLeases = true
    98  	sc.Clock = nil // manual clock
    99  	mtc := &multiTestContext{storeConfig: &sc}
   100  	defer mtc.Stop()
   101  	mtc.Start(t, 1)
   102  
   103  	splitKey := roachpb.Key("a")
   104  	splitArgs := adminSplitArgs(splitKey)
   105  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
   106  		t.Fatal(pErr)
   107  	}
   108  
   109  	// Allow leases to expire and send commands to ensure we
   110  	// re-acquire, then check types again.
   111  	mtc.advanceClock(context.Background())
   112  	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
   113  		t.Fatalf("failed to increment: %+v", err)
   114  	}
   115  
   116  	// We started with epoch ranges enabled, so verify we have an epoch lease.
   117  	repl := mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
   118  	lease, _ := repl.GetLease()
   119  	if lt := lease.Type(); lt != roachpb.LeaseEpoch {
   120  		t.Fatalf("expected lease type epoch; got %d", lt)
   121  	}
   122  
   123  	// Stop the store and reverse the epoch range lease setting.
   124  	mtc.stopStore(0)
   125  	sc.EnableEpochRangeLeases = false
   126  	mtc.restartStore(0)
   127  
   128  	mtc.advanceClock(context.Background())
   129  	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
   130  		t.Fatalf("failed to increment: %+v", err)
   131  	}
   132  
   133  	// Verify we end up with an expiration lease on restart.
   134  	repl = mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
   135  	lease, _ = repl.GetLease()
   136  	if lt := lease.Type(); lt != roachpb.LeaseExpiration {
   137  		t.Fatalf("expected lease type expiration; got %d", lt)
   138  	}
   139  
   140  	// Now, one more time, switch back to epoch-based.
   141  	mtc.stopStore(0)
   142  	sc.EnableEpochRangeLeases = true
   143  	mtc.restartStore(0)
   144  
   145  	mtc.advanceClock(context.Background())
   146  	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
   147  		t.Fatalf("failed to increment: %+v", err)
   148  	}
   149  
   150  	// Verify we end up with an epoch lease on restart.
   151  	repl = mtc.stores[0].LookupReplica(roachpb.RKey(splitKey))
   152  	lease, _ = repl.GetLease()
   153  	if lt := lease.Type(); lt != roachpb.LeaseEpoch {
   154  		t.Fatalf("expected lease type epoch; got %d", lt)
   155  	}
   156  }
   157  
   158  // TestStoreGossipSystemData verifies that the system-config and node-liveness
   159  // data is gossiped at startup.
   160  func TestStoreGossipSystemData(t *testing.T) {
   161  	defer leaktest.AfterTest(t)()
   162  	sc := kvserver.TestStoreConfig(nil)
   163  	sc.TestingKnobs.DisableMergeQueue = true
   164  	sc.EnableEpochRangeLeases = true
   165  	mtc := &multiTestContext{storeConfig: &sc}
   166  	defer mtc.Stop()
   167  	mtc.Start(t, 1)
   168  
   169  	splitKey := keys.SystemConfigSplitKey
   170  	splitArgs := adminSplitArgs(splitKey)
   171  	if _, pErr := kv.SendWrapped(context.Background(), mtc.distSenders[0], splitArgs); pErr != nil {
   172  		t.Fatal(pErr)
   173  	}
   174  	if _, err := mtc.dbs[0].Inc(context.Background(), splitKey, 1); err != nil {
   175  		t.Fatalf("failed to increment: %+v", err)
   176  	}
   177  
   178  	mtc.stopStore(0)
   179  
   180  	getSystemConfig := func() *config.SystemConfig {
   181  		systemConfig := mtc.gossips[0].GetSystemConfig()
   182  		return systemConfig
   183  	}
   184  	getNodeLiveness := func() kvserverpb.Liveness {
   185  		var liveness kvserverpb.Liveness
   186  		if err := mtc.gossips[0].GetInfoProto(gossip.MakeNodeLivenessKey(1), &liveness); err == nil {
   187  			return liveness
   188  		}
   189  		return kvserverpb.Liveness{}
   190  	}
   191  
   192  	// Clear the system-config and node liveness gossip data. This is necessary
   193  	// because multiTestContext.restartStore reuse the Gossip structure.
   194  	if err := mtc.gossips[0].AddInfoProto(
   195  		gossip.KeySystemConfig, &config.SystemConfigEntries{}, 0); err != nil {
   196  		t.Fatal(err)
   197  	}
   198  	if err := mtc.gossips[0].AddInfoProto(
   199  		gossip.MakeNodeLivenessKey(1), &kvserverpb.Liveness{}, 0); err != nil {
   200  		t.Fatal(err)
   201  	}
   202  	testutils.SucceedsSoon(t, func() error {
   203  		if !getSystemConfig().DefaultZoneConfig.Equal(sc.DefaultZoneConfig) {
   204  			return errors.New("system config not empty")
   205  		}
   206  		if getNodeLiveness() != (kvserverpb.Liveness{}) {
   207  			return errors.New("node liveness not empty")
   208  		}
   209  		return nil
   210  	})
   211  
   212  	// Restart the store and verify that both the system-config and node-liveness
   213  	// data is gossiped.
   214  	mtc.restartStore(0)
   215  	testutils.SucceedsSoon(t, func() error {
   216  		if !getSystemConfig().DefaultZoneConfig.Equal(sc.DefaultZoneConfig) {
   217  			return errors.New("system config not gossiped")
   218  		}
   219  		if getNodeLiveness() == (kvserverpb.Liveness{}) {
   220  			return errors.New("node liveness not gossiped")
   221  		}
   222  		return nil
   223  	})
   224  }
   225  
   226  // TestGossipSystemConfigOnLeaseChange verifies that the system-config gets
   227  // re-gossiped on lease transfer even if it hasn't changed. This helps prevent
   228  // situations where a previous leaseholder can restart and not receive the
   229  // system config because it was the original source of it within the gossip
   230  // network.
   231  func TestGossipSystemConfigOnLeaseChange(t *testing.T) {
   232  	defer leaktest.AfterTest(t)()
   233  	sc := kvserver.TestStoreConfig(nil)
   234  	sc.TestingKnobs.DisableReplicateQueue = true
   235  	mtc := &multiTestContext{storeConfig: &sc}
   236  	defer mtc.Stop()
   237  	const numStores = 3
   238  	mtc.Start(t, numStores)
   239  
   240  	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(keys.SystemConfigSpan.Key)).RangeID
   241  	mtc.replicateRange(rangeID, 1, 2)
   242  
   243  	initialStoreIdx := -1
   244  	for i := range mtc.stores {
   245  		if mtc.stores[i].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
   246  			initialStoreIdx = i
   247  		}
   248  	}
   249  	if initialStoreIdx == -1 {
   250  		t.Fatalf("no store has gossiped system config; gossip contents: %+v", mtc.stores[0].Gossip().GetInfoStatus())
   251  	}
   252  
   253  	newStoreIdx := (initialStoreIdx + 1) % numStores
   254  	mtc.transferLease(context.Background(), rangeID, initialStoreIdx, newStoreIdx)
   255  
   256  	testutils.SucceedsSoon(t, func() error {
   257  		if mtc.stores[initialStoreIdx].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
   258  			return errors.New("system config still most recently gossiped by original leaseholder")
   259  		}
   260  		if !mtc.stores[newStoreIdx].Gossip().InfoOriginatedHere(gossip.KeySystemConfig) {
   261  			return errors.New("system config not most recently gossiped by new leaseholder")
   262  		}
   263  		return nil
   264  	})
   265  }
   266  
   267  func TestGossipNodeLivenessOnLeaseChange(t *testing.T) {
   268  	defer leaktest.AfterTest(t)()
   269  	sc := kvserver.TestStoreConfig(nil)
   270  	sc.TestingKnobs.DisableReplicateQueue = true
   271  	mtc := &multiTestContext{storeConfig: &sc}
   272  	defer mtc.Stop()
   273  	const numStores = 3
   274  	mtc.Start(t, numStores)
   275  
   276  	rangeID := mtc.stores[0].LookupReplica(roachpb.RKey(keys.NodeLivenessSpan.Key)).RangeID
   277  	mtc.replicateRange(rangeID, 1, 2)
   278  
   279  	// Turn off liveness heartbeats on all nodes to ensure that updates to node
   280  	// liveness are not triggering gossiping.
   281  	for i := range mtc.nodeLivenesses {
   282  		mtc.nodeLivenesses[i].PauseHeartbeat(true)
   283  	}
   284  
   285  	nodeLivenessKey := gossip.MakeNodeLivenessKey(1)
   286  
   287  	initialStoreIdx := -1
   288  	for i := range mtc.stores {
   289  		if mtc.stores[i].Gossip().InfoOriginatedHere(nodeLivenessKey) {
   290  			initialStoreIdx = i
   291  		}
   292  	}
   293  	if initialStoreIdx == -1 {
   294  		t.Fatalf("no store has gossiped %s; gossip contents: %+v",
   295  			nodeLivenessKey, mtc.stores[0].Gossip().GetInfoStatus())
   296  	}
   297  	log.Infof(context.Background(), "%s gossiped from n%d",
   298  		nodeLivenessKey, mtc.stores[initialStoreIdx].Ident.NodeID)
   299  
   300  	newStoreIdx := (initialStoreIdx + 1) % numStores
   301  	mtc.transferLease(context.Background(), rangeID, initialStoreIdx, newStoreIdx)
   302  
   303  	testutils.SucceedsSoon(t, func() error {
   304  		if mtc.stores[initialStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
   305  			return fmt.Errorf("%s still most recently gossiped by original leaseholder", nodeLivenessKey)
   306  		}
   307  		if !mtc.stores[newStoreIdx].Gossip().InfoOriginatedHere(nodeLivenessKey) {
   308  			return fmt.Errorf("%s not most recently gossiped by new leaseholder", nodeLivenessKey)
   309  		}
   310  		return nil
   311  	})
   312  }
   313  
   314  // TestCannotTransferLeaseToVoterOutgoing ensures that the evaluation of lease
   315  // requests for nodes which are already in the VOTER_OUTGOING state will fail.
   316  func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) {
   317  	defer leaktest.AfterTest(t)()
   318  	ctx := context.Background()
   319  
   320  	knobs, ltk := makeReplicationTestKnobs()
   321  	// Add a testing knob to allow us to block the change replicas command
   322  	// while it is being proposed. When we detect that the change replicas
   323  	// command to move n3 to VOTER_OUTGOING has been evaluated, we'll send
   324  	// the request to transfer the lease to n3. The hope is that it will
   325  	// get past the sanity above latch acquisition prior to change replicas
   326  	// command committing.
   327  	var scratchRangeID atomic.Value
   328  	scratchRangeID.Store(roachpb.RangeID(0))
   329  	changeReplicasChan := make(chan chan struct{}, 1)
   330  	shouldBlock := func(args kvserverbase.ProposalFilterArgs) bool {
   331  		// Block if a ChangeReplicas command is removing a node from our range.
   332  		return args.Req.RangeID == scratchRangeID.Load().(roachpb.RangeID) &&
   333  			args.Cmd.ReplicatedEvalResult.ChangeReplicas != nil &&
   334  			len(args.Cmd.ReplicatedEvalResult.ChangeReplicas.Removed()) > 0
   335  	}
   336  	blockIfShould := func(args kvserverbase.ProposalFilterArgs) {
   337  		if shouldBlock(args) {
   338  			ch := make(chan struct{})
   339  			changeReplicasChan <- ch
   340  			<-ch
   341  		}
   342  	}
   343  	knobs.Store.(*kvserver.StoreTestingKnobs).TestingProposalFilter = func(args kvserverbase.ProposalFilterArgs) *roachpb.Error {
   344  		blockIfShould(args)
   345  		return nil
   346  	}
   347  	tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
   348  		ServerArgs:      base.TestServerArgs{Knobs: knobs},
   349  		ReplicationMode: base.ReplicationManual,
   350  	})
   351  	defer tc.Stopper().Stop(ctx)
   352  
   353  	scratchStartKey := tc.ScratchRange(t)
   354  	desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Targets(1, 2)...)
   355  	scratchRangeID.Store(desc.RangeID)
   356  	// Make sure n1 has the lease to start with.
   357  	err := tc.Server(0).DB().AdminTransferLease(context.Background(),
   358  		scratchStartKey, tc.Target(0).StoreID)
   359  	require.NoError(t, err)
   360  
   361  	// The test proceeds as follows:
   362  	//
   363  	//  - Send an AdminChangeReplicasRequest to remove n3 and add n4
   364  	//  - Block the step that moves n3 to VOTER_OUTGOING on changeReplicasChan
   365  	//  - Send an AdminLeaseTransfer to make n3 the leaseholder
   366  	//  - Try really hard to make sure that the lease transfer at least gets to
   367  	//    latch acquisition before unblocking the ChangeReplicas.
   368  	//  - Unblock the ChangeReplicas.
   369  	//  - Make sure the lease transfer fails.
   370  
   371  	ltk.withStopAfterJointConfig(func() {
   372  		var wg sync.WaitGroup
   373  		wg.Add(1)
   374  		go func() {
   375  			defer wg.Done()
   376  			_, err = tc.Server(0).DB().AdminChangeReplicas(ctx,
   377  				scratchStartKey, desc, []roachpb.ReplicationChange{
   378  					{ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(2)},
   379  					{ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(3)},
   380  				})
   381  			require.NoError(t, err)
   382  		}()
   383  		ch := <-changeReplicasChan
   384  		wg.Add(1)
   385  		go func() {
   386  			defer wg.Done()
   387  			err := tc.Server(0).DB().AdminTransferLease(context.Background(),
   388  				scratchStartKey, tc.Target(2).StoreID)
   389  			require.Error(t, err)
   390  			require.Regexp(t,
   391  				// The error generated during evaluation.
   392  				"replica.*of type VOTER_DEMOTING cannot hold lease|"+
   393  					// If the lease transfer request has not yet made it to the latching
   394  					// phase by the time we close(ch) below, we can receive the following
   395  					// error due to the sanity checking which happens in
   396  					// AdminTransferLease before attempting to evaluate the lease
   397  					// transfer.
   398  					// We have a sleep loop below to try to encourage the lease transfer
   399  					// to make it past that sanity check prior to letting the change
   400  					// of replicas proceed.
   401  					"cannot transfer lease to replica of type VOTER_DEMOTING", err.Error())
   402  		}()
   403  		// Try really hard to make sure that our request makes it past the
   404  		// sanity check error to the evaluation error.
   405  		for i := 0; i < 100; i++ {
   406  			runtime.Gosched()
   407  			time.Sleep(time.Microsecond)
   408  		}
   409  		close(ch)
   410  		wg.Wait()
   411  	})
   412  
   413  }
   414  
   415  // Test the error returned by attempts to create a txn record after a lease
   416  // transfer.
   417  func TestTimestampCacheErrorAfterLeaseTransfer(t *testing.T) {
   418  	defer leaktest.AfterTest(t)()
   419  	ctx := context.Background()
   420  	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
   421  	defer tc.Stopper().Stop(ctx)
   422  
   423  	key := []byte("a")
   424  	rangeDesc, err := tc.LookupRange(key)
   425  	require.NoError(t, err)
   426  
   427  	// Transfer the lease to Servers[0] so we start in a known state. Otherwise,
   428  	// there might be already a lease owned by a random node.
   429  	require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(0)))
   430  
   431  	// Start a txn and perform a write, so that a txn record has to be created by
   432  	// the EndTxn.
   433  	txn := tc.Servers[0].DB().NewTxn(ctx, "test")
   434  	require.NoError(t, txn.Put(ctx, "a", "val"))
   435  	// After starting the transaction, transfer the lease. This will wipe the
   436  	// timestamp cache, which means that the txn record will not be able to be
   437  	// created (because someone might have already aborted the txn).
   438  	require.NoError(t, tc.TransferRangeLease(rangeDesc, tc.Target(1)))
   439  
   440  	err = txn.Commit(ctx)
   441  	require.Regexp(t, `TransactionAbortedError\(ABORT_REASON_NEW_LEASE_PREVENTS_TXN\)`, err)
   442  }