github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_protected_timestamp_test.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"testing"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    20  	"github.com/cockroachdb/cockroach/pkg/keys"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    25  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    26  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    27  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    28  	"github.com/cockroachdb/errors"
    29  	"github.com/stretchr/testify/require"
    30  )
    31  
    32  // TestProtectedTimestampRecordApplies exercises
    33  // Replica.protectedTimestampWillApply() at a low level.
    34  // It does so by passing a Replica connected to an already
    35  // shut down store to a variety of test cases.
    36  func TestProtectedTimestampRecordApplies(t *testing.T) {
    37  	defer leaktest.AfterTest(t)()
    38  	ctx := context.Background()
    39  
    40  	makeArgs := func(r *Replica, ts, aliveAt hlc.Timestamp) roachpb.AdminVerifyProtectedTimestampRequest {
    41  		args := roachpb.AdminVerifyProtectedTimestampRequest{
    42  			RecordID:      uuid.MakeV4(),
    43  			Protected:     ts,
    44  			RecordAliveAt: aliveAt,
    45  		}
    46  		args.Key, args.EndKey = r.Desc().StartKey.AsRawKey(), r.Desc().EndKey.AsRawKey()
    47  		return args
    48  	}
    49  	for _, testCase := range []struct {
    50  		name string
    51  		// Note that the store underneath the passed in Replica has been stopped.
    52  		// This leaves the test to mutate the Replica state as it sees fit.
    53  		test func(t *testing.T, r *Replica, mt *manualCache)
    54  	}{
    55  
    56  		// Test that if the lease started after the timestamp at which the record
    57  		// was known to be live then we know that the Replica cannot GC until it
    58  		// reads protected timestamp state after the lease start time. If the
    59  		// relevant record is not found then it must have been removed.
    60  		{
    61  			name: "lease started after",
    62  			test: func(t *testing.T, r *Replica, mt *manualCache) {
    63  				r.mu.state.Lease.Start = r.store.Clock().Now()
    64  				l, _ := r.GetLease()
    65  				aliveAt := l.Start.Prev()
    66  				ts := aliveAt.Prev()
    67  				args := makeArgs(r, ts, aliveAt)
    68  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
    69  				require.True(t, willApply)
    70  				require.NoError(t, err)
    71  			},
    72  		},
    73  		// If the GC threshold is already newer than the timestamp we want to
    74  		// protect then we failed.
    75  		{
    76  			name: "gc threshold is after ts",
    77  			test: func(t *testing.T, r *Replica, mt *manualCache) {
    78  				thresh := r.store.Clock().Now()
    79  				r.mu.state.GCThreshold = &thresh
    80  				ts := thresh.Prev().Prev()
    81  				aliveAt := ts.Next()
    82  				args := makeArgs(r, ts, aliveAt)
    83  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
    84  				require.False(t, willApply)
    85  				require.NoError(t, err)
    86  			},
    87  		},
    88  		// If the GC threshold we're about to protect is newer than the timestamp
    89  		// we want to protect then we're almost certain to fail. Treat it as a
    90  		// failure.
    91  		{
    92  			name: "pending GC threshold is newer than the timestamp we want to protect",
    93  			test: func(t *testing.T, r *Replica, mt *manualCache) {
    94  				thresh := r.store.Clock().Now()
    95  				require.NoError(t, r.markPendingGC(hlc.Timestamp{}, thresh))
    96  				ts := thresh.Prev().Prev()
    97  				aliveAt := ts.Next()
    98  				args := makeArgs(r, ts, aliveAt)
    99  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   100  				require.False(t, willApply)
   101  				require.NoError(t, err)
   102  			},
   103  		},
   104  		// If the timestamp at which the record is known to be alive is newer than
   105  		// our current view of the protected timestamp subsystem and we don't
   106  		// already see the record, then we will refresh. In this case we refresh
   107  		// and find it. We also verify that we cannot set the pending gc threshold
   108  		// to above the timestamp we put in the record.
   109  		{
   110  			name: "newer aliveAt triggers refresh leading to success",
   111  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   112  				ts := r.store.Clock().Now()
   113  				aliveAt := ts.Next()
   114  				mt.asOf = ts.Prev()
   115  				args := makeArgs(r, ts, aliveAt)
   116  				mt.refresh = func(_ context.Context, refreshTo hlc.Timestamp) error {
   117  					require.Equal(t, refreshTo, aliveAt)
   118  					mt.records = append(mt.records, &ptpb.Record{
   119  						ID:        args.RecordID,
   120  						Timestamp: ts,
   121  						Spans: []roachpb.Span{
   122  							{
   123  								Key:    roachpb.Key(r.startKey()),
   124  								EndKey: roachpb.Key(r.startKey().Next()),
   125  							},
   126  						},
   127  					})
   128  					mt.asOf = refreshTo.Next()
   129  					return nil
   130  				}
   131  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   132  				require.True(t, willApply)
   133  				require.NoError(t, err)
   134  				require.Equal(t,
   135  					fmt.Sprintf("cannot set gc threshold to %v because read at %v < min %v",
   136  						ts.Next(), ts, aliveAt.Next()),
   137  					r.markPendingGC(ts, ts.Next()).Error())
   138  			},
   139  		},
   140  		// If the timestamp at which the record is known to be alive is older than
   141  		// our current view of the protected timestamp subsystem and we don't
   142  		// already see the record, then we know that the record must have been
   143  		// deleted already. Ensure we fail.
   144  		{
   145  			name: "record does not exist",
   146  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   147  				ts := r.store.Clock().Now()
   148  				aliveAt := ts.Next()
   149  				mt.asOf = aliveAt.Next()
   150  				args := makeArgs(r, ts, aliveAt)
   151  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   152  				require.False(t, willApply)
   153  				require.NoError(t, err)
   154  			},
   155  		},
   156  		// If we see the record then we know we're good.
   157  		{
   158  			name: "record already exists",
   159  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   160  				ts := r.store.Clock().Now()
   161  				aliveAt := ts.Next()
   162  				args := makeArgs(r, ts, aliveAt)
   163  				mt.asOf = aliveAt.Next()
   164  				mt.records = append(mt.records, &ptpb.Record{
   165  					ID:        args.RecordID,
   166  					Timestamp: ts,
   167  					Spans: []roachpb.Span{
   168  						{
   169  							Key:    keys.MinKey,
   170  							EndKey: keys.MaxKey,
   171  						},
   172  					},
   173  				})
   174  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   175  				require.True(t, willApply)
   176  				require.NoError(t, err)
   177  			},
   178  		},
   179  		// Ensure that a failure to Refresh propagates.
   180  		{
   181  			name: "refresh fails",
   182  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   183  				ts := r.store.Clock().Now()
   184  				aliveAt := ts.Next()
   185  				mt.asOf = ts.Prev()
   186  				mt.refresh = func(_ context.Context, refreshTo hlc.Timestamp) error {
   187  					return errors.New("boom")
   188  				}
   189  				args := makeArgs(r, ts, aliveAt)
   190  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   191  				require.False(t, willApply)
   192  				require.EqualError(t, err, "boom")
   193  			},
   194  		},
   195  		// Ensure NLE propagates.
   196  		{
   197  			name: "not leaseholder before refresh",
   198  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   199  				r.mu.Lock()
   200  				lease := *r.mu.state.Lease
   201  				lease.Sequence++
   202  				lease.Replica = roachpb.ReplicaDescriptor{
   203  					ReplicaID: 2,
   204  					StoreID:   2,
   205  					NodeID:    2,
   206  				}
   207  				r.mu.state.Lease = &lease
   208  				r.mu.Unlock()
   209  				ts := r.store.Clock().Now()
   210  				aliveAt := ts.Prev().Prev()
   211  				mt.asOf = ts.Prev()
   212  				args := makeArgs(r, ts, aliveAt)
   213  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   214  				require.False(t, willApply)
   215  				require.Regexp(t, "NotLeaseHolderError", err.Error())
   216  			},
   217  		},
   218  		// Ensure NLE after performing a refresh propagates.
   219  		{
   220  			name: "not leaseholder after refresh",
   221  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   222  				ts := r.store.Clock().Now()
   223  				aliveAt := ts.Next()
   224  				mt.asOf = ts.Prev()
   225  				mt.refresh = func(ctx context.Context, refreshTo hlc.Timestamp) error {
   226  					r.mu.Lock()
   227  					defer r.mu.Unlock()
   228  					lease := *r.mu.state.Lease
   229  					lease.Sequence++
   230  					lease.Replica = roachpb.ReplicaDescriptor{
   231  						ReplicaID: 2,
   232  						StoreID:   2,
   233  						NodeID:    2,
   234  					}
   235  					r.mu.state.Lease = &lease
   236  					return nil
   237  				}
   238  				args := makeArgs(r, ts, aliveAt)
   239  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   240  				require.False(t, willApply)
   241  				require.Regexp(t, "NotLeaseHolderError", err.Error())
   242  			},
   243  		},
   244  		// If refresh succeeds but the timestamp of the cache does not advance as
   245  		// anticipated, ensure that an assertion failure error is returned.
   246  		{
   247  			name: "successful refresh does not update timestamp (assertion failure)",
   248  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   249  				ts := r.store.Clock().Now()
   250  				aliveAt := ts.Next()
   251  				mt.asOf = ts.Prev()
   252  				mt.refresh = func(_ context.Context, refreshTo hlc.Timestamp) error {
   253  					return nil
   254  				}
   255  				args := makeArgs(r, ts, aliveAt)
   256  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   257  				require.False(t, willApply)
   258  				require.EqualError(t, err, "cache was not updated after being refreshed")
   259  				require.True(t, errors.IsAssertionFailure(err), "%v", err)
   260  			},
   261  		},
   262  		// If a request header is for a key span which is not owned by this replica,
   263  		// ensure that a roachpb.RangeKeyMismatchError is returned.
   264  		{
   265  			name: "request span is respected",
   266  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   267  				ts := r.store.Clock().Now()
   268  				aliveAt := ts.Next()
   269  				mt.asOf = ts.Prev()
   270  				args := makeArgs(r, ts, aliveAt)
   271  				r.mu.state.Desc.StartKey = roachpb.RKey(keys.TableDataMax)
   272  				willApply, err := r.protectedTimestampRecordApplies(ctx, &args)
   273  				require.False(t, willApply)
   274  				require.EqualError(t, err, "key range /Min-/Max outside of bounds of range /Table/Max-/Max")
   275  			},
   276  		},
   277  	} {
   278  		t.Run(testCase.name, func(t *testing.T) {
   279  			tc := testContext{}
   280  			tsc := TestStoreConfig(nil)
   281  			mc := &manualCache{}
   282  			tsc.ProtectedTimestampCache = mc
   283  			// Under extreme stressrace scenarios the single replica can somehow
   284  			// lose the lease. Make the timeout extremely long.
   285  			tsc.RaftConfig.RangeLeaseRaftElectionTimeoutMultiplier = 100
   286  			stopper := stop.NewStopper()
   287  			tc.StartWithStoreConfig(t, stopper, tsc)
   288  			stopper.Stop(ctx)
   289  			testCase.test(t, tc.repl, mc)
   290  		})
   291  	}
   292  }
   293  
   294  // TestCheckProtectedTimestampsForGC exercises
   295  // Replica.checkProtectedTimestampsForGC() at a low level.
   296  // It does so by passing a Replica connected to an already
   297  // shut down store to a variety of test cases.
   298  func TestCheckProtectedTimestampsForGC(t *testing.T) {
   299  	defer leaktest.AfterTest(t)()
   300  	ctx := context.Background()
   301  
   302  	makePolicy := func(ttlSec int32) zonepb.GCPolicy {
   303  		return zonepb.GCPolicy{TTLSeconds: ttlSec}
   304  	}
   305  	for _, testCase := range []struct {
   306  		name string
   307  		// Note that the store underneath the passed in Replica has been stopped.
   308  		// This leaves the test to mutate the Replica state as it sees fit.
   309  		test func(t *testing.T, r *Replica, mt *manualCache)
   310  	}{
   311  		{
   312  			name: "lease is too new",
   313  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   314  				r.mu.state.Lease.Start = r.store.Clock().Now()
   315  				canGC, _, gcTimestamp, _ := r.checkProtectedTimestampsForGC(ctx, makePolicy(10))
   316  				require.False(t, canGC)
   317  				require.Zero(t, gcTimestamp)
   318  			},
   319  		},
   320  		{
   321  			name: "have overlapping but new enough that it's okay",
   322  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   323  				ts := r.store.Clock().Now()
   324  				mt.asOf = r.store.Clock().Now().Next()
   325  				mt.records = append(mt.records, &ptpb.Record{
   326  					ID:        uuid.MakeV4(),
   327  					Timestamp: ts,
   328  					Spans: []roachpb.Span{
   329  						{
   330  							Key:    keys.MinKey,
   331  							EndKey: keys.MaxKey,
   332  						},
   333  					},
   334  				})
   335  				// We should allow gc to proceed with the normal new threshold if that
   336  				// threshold is earlier than all of the records.
   337  				canGC, _, gcTimestamp, _ := r.checkProtectedTimestampsForGC(ctx, makePolicy(10))
   338  				require.True(t, canGC)
   339  				require.Equal(t, mt.asOf, gcTimestamp)
   340  			},
   341  		},
   342  		{
   343  			// In this case we have a record which protects some data but we can
   344  			// set the threshold to a later point.
   345  			name: "have overlapping but can still GC some",
   346  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   347  				ts := r.store.Clock().Now().Add(-11*time.Second.Nanoseconds(), 0)
   348  				mt.asOf = r.store.Clock().Now().Next()
   349  				mt.records = append(mt.records, &ptpb.Record{
   350  					ID:        uuid.MakeV4(),
   351  					Timestamp: ts,
   352  					Spans: []roachpb.Span{
   353  						{
   354  							Key:    keys.MinKey,
   355  							EndKey: keys.MaxKey,
   356  						},
   357  					},
   358  				})
   359  				// We should allow gc to proceed up to the timestamp which precedes the
   360  				// protected timestamp. This means we expect a GC timestamp 10 seconds
   361  				// after ts.Prev() given the policy.
   362  				canGC, _, gcTimestamp, _ := r.checkProtectedTimestampsForGC(ctx, makePolicy(10))
   363  				require.True(t, canGC)
   364  				require.Equal(t, ts.Prev().Add(10*time.Second.Nanoseconds(), 0), gcTimestamp)
   365  			},
   366  		},
   367  		{
   368  			// In this case we have a record which is right up against the GC
   369  			// threshold.
   370  			name: "have overlapping but have already GC'd right up to the threshold",
   371  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   372  				r.mu.Lock()
   373  				th := *r.mu.state.GCThreshold
   374  				r.mu.Unlock()
   375  				mt.asOf = r.store.Clock().Now().Next()
   376  				mt.records = append(mt.records, &ptpb.Record{
   377  					ID:        uuid.MakeV4(),
   378  					Timestamp: th.Next(),
   379  					Spans: []roachpb.Span{
   380  						{
   381  							Key:    keys.MinKey,
   382  							EndKey: keys.MaxKey,
   383  						},
   384  					},
   385  				})
   386  				// We should not allow GC if the threshold is already the predecessor
   387  				// of the earliest valid record.
   388  				canGC, _, gcTimestamp, _ := r.checkProtectedTimestampsForGC(ctx, makePolicy(10))
   389  				require.False(t, canGC)
   390  				require.Zero(t, gcTimestamp)
   391  			},
   392  		},
   393  		{
   394  			name: "failed record does not prevent GC",
   395  			test: func(t *testing.T, r *Replica, mt *manualCache) {
   396  				ts := r.store.Clock().Now()
   397  				id := uuid.MakeV4()
   398  				thresh := ts.Next()
   399  				r.mu.state.GCThreshold = &thresh
   400  				mt.asOf = thresh.Next()
   401  				mt.records = append(mt.records, &ptpb.Record{
   402  					ID:        id,
   403  					Timestamp: ts,
   404  					Spans: []roachpb.Span{
   405  						{
   406  							Key:    keys.MinKey,
   407  							EndKey: keys.MaxKey,
   408  						},
   409  					},
   410  				})
   411  				canGC, _, gcTimestamp, _ := r.checkProtectedTimestampsForGC(ctx, makePolicy(10))
   412  				require.True(t, canGC)
   413  				require.Equal(t, mt.asOf, gcTimestamp)
   414  			},
   415  		},
   416  	} {
   417  		t.Run(testCase.name, func(t *testing.T) {
   418  			tc := testContext{}
   419  			tsc := TestStoreConfig(nil)
   420  			mc := &manualCache{}
   421  			tsc.ProtectedTimestampCache = mc
   422  			stopper := stop.NewStopper()
   423  			tc.StartWithStoreConfig(t, stopper, tsc)
   424  			stopper.Stop(ctx)
   425  			testCase.test(t, tc.repl, mc)
   426  		})
   427  	}
   428  }
   429  
   430  type manualCache struct {
   431  	asOf    hlc.Timestamp
   432  	records []*ptpb.Record
   433  	refresh func(ctx context.Context, asOf hlc.Timestamp) error
   434  }
   435  
   436  func (c *manualCache) Iterate(
   437  	ctx context.Context, start, end roachpb.Key, it protectedts.Iterator,
   438  ) hlc.Timestamp {
   439  	query := roachpb.Span{Key: start, EndKey: end}
   440  	for _, r := range c.records {
   441  		for _, sp := range r.Spans {
   442  			if query.Overlaps(sp) {
   443  				it(r)
   444  				break
   445  			}
   446  		}
   447  	}
   448  	return c.asOf
   449  }
   450  
   451  func (c *manualCache) Refresh(ctx context.Context, asOf hlc.Timestamp) error {
   452  	if c.refresh == nil {
   453  		c.asOf = asOf
   454  		return nil
   455  	}
   456  	return c.refresh(ctx, asOf)
   457  }
   458  
   459  func (c *manualCache) QueryRecord(
   460  	ctx context.Context, id uuid.UUID,
   461  ) (exists bool, asOf hlc.Timestamp) {
   462  	for _, r := range c.records {
   463  		if r.ID == id {
   464  			return true, c.asOf
   465  		}
   466  	}
   467  	return false, c.asOf
   468  }
   469  
// Compile-time assertion that *manualCache implements protectedts.Cache.
var _ protectedts.Cache = (*manualCache)(nil)