github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/container/container_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package container_test // intentionally test from external package

import (
	"context"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/container"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	providertestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider/testutils"
	transporttestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/transport/testutils"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/require"
)

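// LateBoundDialer forwards Dial and Ready to a ChanDialer that is only
// installed once both Containers have been created and linked up (see
// setupTwoNodeTest); it must not be used before then.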
type LateBoundDialer struct {
	Wrapped *transporttestutils.ChanDialer
}

func (d *LateBoundDialer) Dial(ctx context.Context, nodeID roachpb.NodeID) (ctpb.Client, error) {
	return d.Wrapped.Dial(ctx, nodeID)
}

func (d *LateBoundDialer) Ready(nodeID roachpb.NodeID) bool {
	return d.Wrapped.Ready(nodeID)
}

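// TestContainer bundles a Container with the test-only state around it: the
// node it pretends to be, the RangeIDs for which a Refresh was requested, the
// late-bound dialer, and the TestClock used to pace the closer loop.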
type TestContainer struct {
	*container.Container
	NodeID    roachpb.NodeID
	Refreshed struct {
		syncutil.Mutex
		RangeIDs []roachpb.RangeID
	}
	Dialer    *LateBoundDialer
	TestClock *providertestutils.TestClock
}

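// prepareContainer assembles a single, not yet started TestContainer. The
// Container is driven by a TestClock and records Refresh requests in
// tc.Refreshed; its Dialer is wired up to a peer later, in setupTwoNodeTest.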
func prepareContainer() *TestContainer {
	stopper := stop.NewStopper()

	tc := &TestContainer{}

	tc.TestClock = providertestutils.NewTestClock(stopper)

	refresh := func(requested ...roachpb.RangeID) {
		tc.Refreshed.Lock()
		tc.Refreshed.RangeIDs = append(tc.Refreshed.RangeIDs, requested...)
		tc.Refreshed.Unlock()
	}

	st := cluster.MakeTestingClusterSettings()

	// Set the target duration to a second and the close fraction so small
	// that the Provider will essentially close in a hot loop. In this test
	// we'll block in the clock to pace the Provider's closer loop.
	closedts.TargetDuration.Override(&st.SV, time.Second)
	closedts.CloseFraction.Override(&st.SV, 1e-9)

	// We perform a little dance with the Dialer. It needs to be hooked up to
	// the Server, but the Server is only created in NewContainer. The Dialer
	// isn't actually used until later, so the wrapped ChanDialer can be filled
	// in after both Containers have been created (see setupTwoNodeTest).
	tc.Dialer = &LateBoundDialer{}

	cfg := container.Config{
		Settings: st,
		Stopper:  stopper,
		Clock:    tc.TestClock.LiveNow,
		Refresh:  refresh,
		Dialer:   tc.Dialer,
	}

	tc.Container = container.NewContainer(cfg)
	return tc
}

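// setupTwoNodeTest starts two TestContainers as n1 and n2 and links them via
// in-memory ChanDialers so that each node's clients can reach the other
// node's Server. The returned shutdown func stops both containers.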
func setupTwoNodeTest() (_ *TestContainer, _ *TestContainer, shutdown func()) {
	c1 := prepareContainer()
	c2 := prepareContainer()

	c1.NodeID = roachpb.NodeID(1)
	c2.NodeID = roachpb.NodeID(2)

	c1.Start(c1.NodeID)
	c2.Start(c2.NodeID)

	// Link the containers.
	c1.Dialer.Wrapped = transporttestutils.NewChanDialer(c1.Stopper, c2.Server)
	c2.Dialer.Wrapped = transporttestutils.NewChanDialer(c2.Stopper, c1.Server)

	return c1, c2, func() {
		// Oh, the joy of multiple stoppers. Stop them in parallel and wait
		// for both to finish.
		var wg sync.WaitGroup
		wg.Add(2)
		go func() {
			defer wg.Done()
			c1.Stopper.Stop(context.Background())
		}()
		go func() {
			defer wg.Done()
			c2.Stopper.Stop(context.Background())
		}()
		wg.Wait()
	}
}

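// TestTwoNodes runs the closed timestamp subsystem end-to-end across two
// linked containers: proposals are tracked and released on n1, the closer
// loop is paced manually via the TestClock, n2 subscribes to n1's updates
// over the test transport, and an epoch change verifies that previously
// closed timestamps remain readable.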
func TestTwoNodes(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()

	c1, c2, shutdown := setupTwoNodeTest()
	defer shutdown()
	defer func() {
		t.Logf("n1 -> n2: %s", pretty.Sprint(c1.Dialer.Wrapped.Transcript(c2.NodeID)))
		t.Logf("n2 -> n1: %s", pretty.Sprint(c2.Dialer.Wrapped.Transcript(c1.NodeID)))
	}()
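	// The liveness epochs used throughout the test. A closed timestamp only
	// allows serving a read if the caller's epoch matches the epoch under
	// which the timestamp was closed; the assertions below rely on this.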
	const (
		ep0 ctpb.Epoch = iota
		ep1
		ep2
	)
	// Initially, can't serve random things for either n1 or n2.
	require.True(t, c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(5), ep0, ctpb.LAI(0)).IsEmpty(),
	)
	require.True(t, c1.Container.Provider.MaxClosed(
		c2.NodeID, roachpb.RangeID(5), ep0, ctpb.LAI(0)).IsEmpty(),
	)

	// Track and release a command.
	ts, release := c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(17), ctpb.LAI(12))

	// The command is forced above ts=0.2. This is just an artifact of how the
	// Tracker is implemented - it closes out 0.1 first, so it begins by forcing
	// commands just above that.
	require.Equal(t, hlc.Timestamp{Logical: 2}, ts)

	// The clock gives a timestamp to the Provider, which should close out the
	// current timestamp and set up 2E9-1E9=1E9 as the next one it wants to close.
	// We do this twice (for the same timestamp) to make sure that the Provider
	// not only read the tick, but also processed it. Otherwise, it becomes hard
	// to write the remainder of the test because the commands we track below may
	// fall into either case, and may be forced above the old or new timestamp.
	for i := 0; i < 2; i++ {
		c1.TestClock.Tick(hlc.Timestamp{WallTime: 2e9}, ep1, nil)
	}

	// The Tracker still won't let us serve anything, even though it has closed out
	// 0.1 - this is because it has no information about any ranges at that timestamp.
	// (Note that the Tracker may not have processed the closing yet, so if there were
	// a bug here, this test would fail flakily - that's ok).
	require.True(t, c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12)).IsEmpty(),
	)

	// Two more commands come in.
	ts, release = c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(17), ctpb.LAI(16))
	require.Equal(t, hlc.Timestamp{WallTime: 1e9, Logical: 1}, ts)

	ts, release = c1.Tracker.Track(ctx)
	release(ctx, ep1, roachpb.RangeID(8), ctpb.LAI(88))
	require.Equal(t, hlc.Timestamp{WallTime: 1e9, Logical: 1}, ts)

	// Now another tick. Shortly after it, we should be able to serve below 1E9,
	// and 2E9 should be the next planned closed timestamp (though we can only
	// verify the former).
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 3e9}, ep1, nil)

	testutils.SucceedsSoon(t, func() error {
		if c1.Container.Provider.MaxClosed(
			c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
		).Less(hlc.Timestamp{WallTime: 1e9}) {
			return errors.New("still can't serve")
		}
		return nil
	})

	// Shouldn't be able to serve the same thing if we haven't caught up yet.
	require.False(t, !c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(11),
	).Less(hlc.Timestamp{WallTime: 1e9}))

	// Shouldn't be able to serve at a higher timestamp.
	require.False(t, !c1.Container.Provider.MaxClosed(
		c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
	).Less(hlc.Timestamp{WallTime: 1e9, Logical: 1}))

	// Now things get a little more interesting. Tell node2 to get a stream of
	// information from node1. We do this via Request, which as a side effect lets
	// us ascertain that this request makes it to n1.
	c2.Clients.Request(roachpb.NodeID(1), roachpb.RangeID(18))
	testutils.SucceedsSoon(t, func() error {
		exp := []roachpb.RangeID{18}
		c1.Refreshed.Lock()
		defer c1.Refreshed.Unlock()
		if !reflect.DeepEqual(exp, c1.Refreshed.RangeIDs) {
			return errors.Errorf("still waiting for %v: currently %v", exp, c1.Refreshed.RangeIDs)
		}
		return nil
	})

	// And n2 should soon also be able to serve follower reads for a range led by
	// n1 when it has caught up.
	testutils.SucceedsSoon(t, func() error {
		if c2.Container.Provider.MaxClosed(
			c1.NodeID, roachpb.RangeID(17), ep1, ctpb.LAI(12),
		).Less(hlc.Timestamp{WallTime: 1e9}) {
			return errors.New("n2 still can't serve")
		}
		return nil
	})

	// Remember the other proposals we tracked above on n1: (r17, 16) and
	// (r8, 88). Feeding another timestamp to n1, we should see them closed out
	// at t=2E9, and both n1 and n2 should automatically be able to serve them
	// soon thereafter.
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 4e9}, ep1, nil)

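	// checkEpoch1Reads asserts that both n1 and n2 can serve follower reads at
	// ts for the epoch-1 proposals (r17, LAI 16) and (r8, LAI 88), and that
	// reads are refused for a lagging LAI or a mismatched epoch.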
	checkEpoch1Reads := func(ts hlc.Timestamp) {
		t.Helper()
		for i, c := range []*TestContainer{c1, c2} {
			for _, tuple := range []struct {
				roachpb.RangeID
				ctpb.LAI
			}{
				{17, 16},
				{8, 88},
			} {
				testutils.SucceedsSoon(t, func() error {
					t.Helper()
					if c.Container.Provider.MaxClosed(
						c1.NodeID, tuple.RangeID, ep1, tuple.LAI,
					).Less(ts) {
						return errors.Errorf("n%d still can't serve (r%d,%d) @ %s", i+1, tuple.RangeID, tuple.LAI, ts)
					}
					return nil
				})
				// Still can't serve when not caught up.
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep1, tuple.LAI-1,
				).Less(ts))
				// Can serve when more than caught up.
				require.True(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep1, tuple.LAI+1,
				).Less(ts))
				// Can't serve in a different epoch, whether larger or smaller.
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep0, tuple.LAI,
				).Less(ts))
				require.False(t, !c.Container.Provider.MaxClosed(
					c1.NodeID, tuple.RangeID, ep2, tuple.LAI,
				).Less(ts))
			}
		}
	}
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})

	// Tick again in epoch 1 and ensure that reads at t=3E9 can be safely served.
	// 3E9 gets closed out under the first epoch in this tick with 4E9 as the
	// timestamp to be closed next due to the 1s target interval.
	c1.TestClock.Tick(hlc.Timestamp{WallTime: 5e9}, ep1, nil)
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// Uh-oh! n1 must've missed a heartbeat. The epoch goes up by one. This means
	// that soon (after the next tick) timestamps should be closed out under the
	// new epoch. The timestamp at which this happens is doctored to make sure the
	// Storage holds on to the past information, because we want to end-to-end test
	// that this all works out. Consequently, we next Tick at the rotation interval
	// plus the target duration (so that the next closed timestamp is the
	// rotation interval).
	c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 5e9}, ep2, nil)

	// Previously valid reads should remain valid.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// Once the above tick has made it to the Tracker, commands get forced above
	// the next scheduled closed timestamp, i.e. the tick's timestamp minus the
	// target duration. The SucceedsSoon retries until the ep2 tick has actually
	// been processed.
	testutils.SucceedsSoon(t, func() error {
		ts, release = c1.Tracker.Track(ctx)
		release(ctx, ep2, roachpb.RangeID(123), ctpb.LAI(456))
		if !(&hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 4e9, Logical: 1}).Equal(ts) {
			return errors.Errorf("command still forced to %v", ts)
		}
		return nil
	})

	// Previously valid reads should remain valid.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// With the next tick, epoch two fully goes into effect (as the first epoch two
	// timestamp gets closed out). We do this twice to make sure it's processed before
	// the test proceeds.
	for i := 0; i < 2; i++ {
		c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 6e9}, ep2, nil)
	}

	// Previously valid reads should remain valid. Note that this is because the
	// storage keeps historical data, and we've fine-tuned the epoch flip so that
	// it happens after the epoch 1 information rotates into another bucket and
	// thus is preserved. If the epoch changed at a smaller timestamp, that
	// would've wiped out the first epoch's information.
	//
	// TODO(tschottdorf): we could make the storage smarter so that it forces a
	// rotation when the epoch changes, at the expense of pushing out historical
	// information earlier. Frequent epoch changes could lead to very little
	// historical information in the storage. Probably better not to risk that.
	checkEpoch1Reads(hlc.Timestamp{WallTime: 2e9})
	checkEpoch1Reads(hlc.Timestamp{WallTime: 3e9})

	// Another second, another tick. Now the proposal tracked during epoch 2 should
	// be readable from followers (as `scale+5E9` gets closed out).
	c1.TestClock.Tick(hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 7e9}, ep2, nil)
	for i, c := range []*TestContainer{c1, c2} {
		rangeID := roachpb.RangeID(123)
		lai := ctpb.LAI(456)
		epoch := ep2
		ts := hlc.Timestamp{WallTime: int64(container.StorageBucketScale) + 5e9}

		testutils.SucceedsSoon(t, func() error {
			if c.Container.Provider.MaxClosed(
				c1.NodeID, rangeID, epoch, lai,
			).Less(ts) {
				return errors.Errorf("n%d still can't serve (r%d,%d) @ %s", i+1, rangeID, lai, ts)
			}
			return nil
		})

		// Still can't serve when not caught up.
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch, lai-1,
		).Less(ts))

		// Can serve when more than caught up.
		require.True(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch, lai+1,
		).Less(ts))

		// Can't serve in a different epoch, whether larger or smaller.
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch-1, lai,
		).Less(ts))
		require.False(t, !c.Container.Provider.MaxClosed(
			c1.NodeID, rangeID, epoch+1, lai,
		).Less(ts))
	}
}