github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/client_closed_timestamp_test.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/require"
)

// TestClosedTimestampWorksWhenRequestsAreSentToNonLeaseHolders ensures that
// errant closed timestamp requests sent to non-leaseholder nodes do not
// prevent future closed timestamps from being created if that node later
// becomes the leaseholder. See #48553 for more details.
func TestClosedTimestampWorksWhenRequestsAreSentToNonLeaseHolders(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	// Set a very long Raft election timeout so we don't risk node liveness
	// failures and the subsequent unexpected lease transfers under extreme
	// stress.
	serverArgs := base.TestServerArgs{
		RaftConfig: base.RaftConfig{RaftElectionTimeoutTicks: 1000},
	}
	tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
		ServerArgs:      serverArgs,
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// We want to ensure that node 3 has a high liveness epoch, make it the
	// leaseholder of the range, and then trigger a request for an MLAI from
	// node 1. Then we make node 1 the leaseholder and ensure that it can
	// still close timestamps.
	db1 := tc.Server(0).DB()
	sqlRunner := sqlutils.MakeSQLRunner(tc.ServerConn(0))

	// Set a very short closed timestamp target duration so that we don't need to
	// wait long for the closed timestamp machinery to propagate information.
	const closeInterval = 10 * time.Millisecond
	sqlRunner.Exec(t, "SET CLUSTER SETTING kv.closed_timestamp.target_duration = '"+
		closeInterval.String()+"'")

	// To give node 3 a large liveness epoch, synthesize a liveness record with
	// epoch 1000 for it before starting the node.
	require.NoError(t, db1.Put(ctx, keys.NodeLivenessKey(3),
		&kvserverpb.Liveness{
			NodeID:     3,
			Epoch:      1000,
			Expiration: hlc.LegacyTimestamp{WallTime: 1},
		}))
	tc.AddServer(t, serverArgs)

	// Create our scratch range and up-replicate it.
	k := tc.ScratchRange(t)
	_, err := tc.AddReplicas(k, tc.Target(1), tc.Target(2))
	require.NoError(t, err)
	require.NoError(t, tc.WaitForVoters(k, tc.Target(1), tc.Target(2)))

	// Wrap the lease transfer in a retry loop to deal with errors caused by
	// n3's initial node liveness. We could alternatively wait for n3 to become
	// live, but retrying the transfer is simpler and works.
	transferLease := func(desc *roachpb.RangeDescriptor, target roachpb.ReplicationTarget) {
		testutils.SucceedsSoon(t, func() error {
			return tc.TransferRangeLease(*desc, target)
		})
	}

	// transferLeaseAndWaitForClosed will transfer the lease to the specified
	// serverIdx. It will ensure that the lease transfer happens and then will
	// call afterLease. It will then wait until the closed timestamp moves
	// forward a few intervals.
	transferLeaseAndWaitForClosed := func(serverIdx int, afterLease func()) {
		_, repl := getFirstStoreReplica(t, tc.Server(serverIdx), k)
		target := tc.Target(serverIdx)
		transferLease(repl.Desc(), target)
		testutils.SucceedsSoon(t, func() error {
			if !repl.OwnsValidLease(db1.Clock().Now()) {
				return errors.Errorf("don't yet have the lease")
			}
			return nil
		})
		if afterLease != nil {
			afterLease()
		}
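		// Record the replica's current maximum closed timestamp as a baseline;
		// below we wait for the closed timestamp to advance well past it.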
		nowClosed, ok := repl.MaxClosed(ctx)
		require.True(t, ok)
		lease, _ := repl.GetLease()
		if lease.Replica.NodeID != target.NodeID {
			t.Fatalf("lease was unexpectedly transferred away which should" +
				" not happen given the very long timeouts")
		}
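		// Require the closed timestamp to advance several close intervals past
		// the baseline, which demonstrates that this replica can still close
		// timestamps now that it holds the lease.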
		const closedMultiple = 5
		targetClosed := nowClosed.Add(closedMultiple*closeInterval.Nanoseconds(), 0)
		testutils.SucceedsSoon(t, func() error {
			curLease, _ := repl.GetLease()
			if !lease.Equivalent(curLease) {
				t.Fatalf("lease was unexpectedly transferred away which should" +
					" not happen given the very long timeouts")
			}
			closed, ok := repl.MaxClosed(ctx)
			require.True(t, ok)
			if closed.Less(targetClosed) {
				return errors.Errorf("closed timestamp %v not yet after target %v", closed, targetClosed)
			}
			return nil
		})
	}

	// Our new server should have a liveness epoch of 1000.
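	// Transfer the lease to n3 and, while it holds the lease, request an MLAI
	// from node 1, which is not the leaseholder. This is the errant request
	// described in #48553.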
	s3, repl3 := getFirstStoreReplica(t, tc.Server(2), k)
	transferLeaseAndWaitForClosed(2, func() {
		s3.RequestClosedTimestamp(1, repl3.RangeID)
	})

	// At this point we expect there's a high chance that the request made its
	// way to n1. Now we're going to transfer the lease to n1 and make sure that
	// the closed timestamp advances.
	transferLeaseAndWaitForClosed(0, nil)
}