github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/closedts/provider/provider_test.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package provider_test

import (
	"context"
	"reflect"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider"
	providertestutils "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/provider/testutils"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/stretchr/testify/require"
	"golang.org/x/sync/errgroup"
)

func TestProviderSubscribeNotify(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	stopper := stop.NewStopper()
	defer stopper.Stop(ctx)

	st := cluster.MakeTestingClusterSettings()
	// We'll only unleash the closer loop when the test is basically done, and
	// once we do that we want it to run aggressively.
	// Testing that the closer loop works as advertised is left to another test.
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	storage := &providertestutils.TestStorage{}
	unblockClockCh := make(chan struct{})
	cfg := &provider.Config{
		NodeID:   2, // note that we're not using 1, just for kicks
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
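		// The Clock blocks until the test closes unblockClockCh (or the stopper
		// quiesces) and then only ever returns an injected error. The closer
		// loop therefore never manages to close out a timestamp in this test;
		// unblocking it merely serves to wake up subscribers (see the reader
		// below).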
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			select {
			case <-stopper.ShouldQuiesce():
			case <-unblockClockCh:
			}
			return hlc.Timestamp{}, ctpb.Epoch(1), errors.New("injected clock error")
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			panic("should never be called")
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	// We won't touch n1 in this test, so this entry should never pop up.
	unseenEntry := ctpb.Entry{
		ClosedTimestamp: hlc.Timestamp{WallTime: 456},
		Epoch:           17,
	}
	cfg.Storage.Add(1, unseenEntry)

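	// entryAt constructs the i-th test Entry: Epoch, closed timestamp, and MLAI
	// are all derived from i, so the entries are mutually distinguishable and
	// strictly ordered by Epoch.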
	entryAt := func(i int) ctpb.Entry {
		return ctpb.Entry{
			ClosedTimestamp: hlc.Timestamp{WallTime: int64(i) * 1e9},
			Epoch:           ctpb.Epoch(i),
			MLAI: map[roachpb.RangeID]ctpb.LAI{
				roachpb.RangeID(i): ctpb.LAI(10 * i),
			},
		}
	}

	const numEntries = 10 // must be even
	var entries []ctpb.Entry
	for i := 0; i < numEntries; i++ {
		entries = append(entries, entryAt(i))
	}

	var readerSeq int32 // atomically
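	// reader subscribes to the Provider and consumes Entries from its
	// subscription, checking the invariants documented inside. Once it has seen
	// every expected Entry at least twice, it cancels its subscription and
	// unblocks the closer's Clock.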
	reader := func() error {
		i := atomic.AddInt32(&readerSeq, 1)
		ctx, cancel := context.WithCancel(context.Background())
		defer cancel()
		ctx = logtags.AddTag(ctx, "reader", int(i))

		log.Infof(ctx, "starting")
		defer log.Infof(ctx, "done")

		ch := make(chan ctpb.Entry)
		stopper.RunWorker(ctx, func(ctx context.Context) {
			p.Subscribe(ctx, ch)
		})

		var sl []ctpb.Entry // for debug purposes only
		// Read entries off the subscription. We check two invariants:
		// 1. we see each Entry (identified via its Epoch) at least twice
		//    (morally exactly twice, but the Provider gives a weaker guarantee)
		// 2. An Entry can only be observed after the previous Entry has been seen
		//    at least once. That is, to see Epoch X, we need to have seen Epoch X-1.
		//
		// These could be sharpened somewhat, but only at a distinct loss of clarity
		// in this part of the test.
		//
		// Examples, writing only the Epoch (which uniquely identifies the Entry in this test):
		// OK:
		// - 1 2 3 1 2 3
		//   First writer sends everything before second writer.
		// - 1 1 2 3 1 2 3
		//   First writer sends everything before second, but first 1 gets duplicated by Provider.
		// - 1 2 3 1 2 3 1 2 3
		//   Same as last, but whole thing gets duplicated.
		// - 1 2 3 2 3 1 2 3
		//   Only 2 3 gets duplicated.
		// Not OK:
		// - 1 1 2 3 3
		//   Two seen only once.
		// - 1 3 2 1 2 3
		//   Three observed before two.
		m := map[ctpb.Epoch]int{-1: 2} // pretend we've seen Epoch -1 twice, streamlines code below
		expM := map[ctpb.Epoch]int{-1: 2}
		for _, entry := range entries {
			expM[entry.Epoch] = 2
		}
		for {
			select {
			case <-time.After(10 * time.Second):
				return errors.Errorf("nothing emitted after %v", sl)
			case entry, ok := <-ch: // implies runtime.Gosched
				if !ok {
					if ctx.Err() != nil {
						// Expected, we must've canceled the context below earlier, which means the
						// checks were successful.
						return nil
					}
					return errors.New("sender closed channel before reader canceled their context")
				}
				sl = append(sl, entry)
				log.Infof(ctx, "got %d entries now, latest: %+v", len(sl), entry)
				diagErr := errors.Errorf("saw: %v", sl)
				prevEpo := entry.Epoch - 1
				if m[prevEpo] < 1 {
					return errors.Wrapf(
						diagErr,
						"entry for epoch %d received before a matching entry for immediately preceding epoch %d",
						entry.Epoch, prevEpo,
					)
				}
				m[entry.Epoch]++
				if m[entry.Epoch] > 2 {
					m[entry.Epoch] = 2
				}

				if reflect.DeepEqual(expM, m) && ctx.Err() == nil {
					log.Info(ctx, "canceling subscription")
					cancel()
					// As a little gotcha, we need to work around the implementation a tiny bit.
					// The provider uses a sync.Cond to notify clients and it is likely waiting
					// for new activity for our subscription. Thus, it's not going to notice
					// that this client is going away; it would notice if the Provider's closer
					// did its job (we've blocked it so far) because that periodically wakes
					// up all clients, rain or shine. So we unblock it now; the Clock is set up
					// to return errors, so as a nice little benefit we verify that even in that
					// case the subscription does get woken up.
					close(unblockClockCh)
				}
			}
		}
	}

	// Add some entries via Notify, and race them with various subscriptions. Note
	// that in reality, we have only a single notification going on for the local node
	// (run by a Provider goroutine). But the data that comes in from other nodes uses
	// the same mechanism, and it's nice to get coverage for it. In particular, during
	// reconnections, you could imagine two notification streams for the same NodeID
	// being active in parallel.
	var g errgroup.Group
	for i := range []struct{}{{}, {}} { // twice
		i := i // goroutine copy
		g.Go(func() error {
			ctx := logtags.AddTag(context.Background(), "writer", i)
			log.Info(ctx, "starting")
			defer log.Info(ctx, "done")
			nCh := p.Notify(roachpb.NodeID(2))
			defer close(nCh)
			for _, entry := range entries {
				nCh <- entry // implies runtime.Gosched
				log.Infof(ctx, "wrote %s", entry)
			}
			return nil
		})
	}

	for i := 0; i < 1; i++ { // HACK
		g.Go(reader)
	}
	if err := g.Wait(); err != nil {
		t.Fatal(err)
	}

	testutils.SucceedsSoon(t, func() error {
		snap := storage.Snapshot()
		require.Equal(t, 2, len(snap))    // definitely should have records about two nodes
		require.Equal(t, 1, len(snap[1])) // one persisted entry for n1
		// Morally this is true immediately, but consider that the goroutine consuming
		// from the writer threads above may have read the entry but not put it into
		// the Storage yet. The reader threads would usually remove this race, but
		// they can be satisfied early by a duplicate that is emitted during the
		// switchover from storage to subscription.
		if exp, act := 2*numEntries, len(snap[2]); exp < act {
			t.Fatalf("got %d entries in storage, expected no more than %d", act, exp)
		} else if exp > act {
			return errors.Errorf("storage has %d entries, need %d", act, exp)
		}
		return nil
	})
}

// TestProviderSubscribeConcurrent prevents regression of a bug that improperly
// handled concurrent subscriptions.
func TestProviderSubscribeConcurrent(t *testing.T) {
	defer leaktest.AfterTest(t)()

	st := cluster.MakeTestingClusterSettings()
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	stopper := stop.NewStopper()
	storage := &providertestutils.TestStorage{}

	var ts int64 // atomic
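	// The Close callback hands back a strictly increasing closed timestamp (via
	// the ts counter), so the Provider's closer loop always has something new to
	// distribute to subscribers.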
	cfg := &provider.Config{
		NodeID:   1,
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			return hlc.Timestamp{}, 1, nil
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			return hlc.Timestamp{
					WallTime: atomic.AddInt64(&ts, 1),
				}, map[roachpb.RangeID]ctpb.LAI{
					1: ctpb.LAI(atomic.LoadInt64(&ts)),
				}, true
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer cancel()
	const n = 10
	var wg sync.WaitGroup
	wg.Add(n)
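	// Launch n concurrent subscribers. Each one drains its channel until the
	// Provider closes it as part of the stopper shutting down below.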
	for i := 0; i < n; i++ {
		go func() {
			defer wg.Done()
			ch := make(chan ctpb.Entry, 3)
			p.Subscribe(ctx, ch)
			// Read from channel until stopper stops Provider (and in turn Provider
			// closes channel).
			for range ch {
			}
		}()
	}
	stopper.Stop(context.Background())
	wg.Wait()
}

// TestProviderTargetDurationSetting ensures that setting the target duration
// to zero disables closed timestamp updates and that setting it back to a
// positive value re-enables them.
func TestProviderTargetDurationSetting(t *testing.T) {
	defer leaktest.AfterTest(t)()

	st := cluster.MakeTestingClusterSettings()
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	closedts.CloseFraction.Override(&st.SV, 1.0)

	stopper := stop.NewStopper()
	storage := &providertestutils.TestStorage{}
	defer stopper.Stop(context.Background())

	var ts int64 // atomic
	var called int
	calledCh := make(chan struct{})
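	// Close signals calledCh every time the closer invokes it. On its first
	// invocation it also sets the target duration to zero, which should stop
	// further invocations until the setting is raised again below.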
	cfg := &provider.Config{
		NodeID:   1,
		Settings: st,
		Stopper:  stopper,
		Storage:  storage,
		Clock: func(roachpb.NodeID) (hlc.Timestamp, ctpb.Epoch, error) {
			return hlc.Timestamp{}, 1, nil
		},
		Close: func(next hlc.Timestamp, expCurEpoch ctpb.Epoch) (hlc.Timestamp, map[roachpb.RangeID]ctpb.LAI, bool) {
			if called++; called == 1 {
				closedts.TargetDuration.Override(&st.SV, 0)
			}
			select {
			case calledCh <- struct{}{}:
			case <-stopper.ShouldQuiesce():
			}
			return hlc.Timestamp{
					WallTime: atomic.AddInt64(&ts, 1),
				}, map[roachpb.RangeID]ctpb.LAI{
					1: ctpb.LAI(atomic.LoadInt64(&ts)),
				}, true
		},
	}

	p := provider.NewProvider(cfg)
	p.Start()

	// Get called once. While it's being called, we set the target duration to 0,
	// disabling the updates. We wait someTime and ensure we don't get called
	// again. Then we re-enable the setting and ensure we do get called.
	<-calledCh
	const someTime = 10 * time.Millisecond
	select {
	case <-calledCh:
		t.Fatal("expected no updates to be sent")
	case <-time.After(someTime):
	}
	closedts.TargetDuration.Override(&st.SV, time.Millisecond)
	<-calledCh
}