vitess.io/vitess@v0.16.2/go/vt/vtgate/buffer/buffer_test.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package buffer
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"testing"
    24  	"time"
    25  
    26  	"vitess.io/vitess/go/vt/topo/topoproto"
    27  	"vitess.io/vitess/go/vt/vterrors"
    28  
    29  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    30  	vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
    31  )
    32  
    33  const (
    34  	keyspace = "ks1"
    35  	shard    = "0"
    36  	// shard2 is only used for tests with two concurrent failovers.
    37  	shard2 = "-80"
    38  )
    39  
    40  var (
    41  	failoverErr = vterrors.New(vtrpcpb.Code_CLUSTER_EVENT,
    42  		"vttablet: rpc error: code = 17 desc = gRPCServerError: retry: operation not allowed in state SHUTTING_DOWN")
    43  	nonFailoverErr = vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION,
    44  		"vttablet: rpc error: code = 9 desc = gRPCServerError: retry: TODO(mberlin): Insert here any realistic error not caused by a failover")
    45  
    46  	statsKeyJoined = fmt.Sprintf("%s.%s", keyspace, shard)
    47  
    48  	statsKeyJoinedFailoverEndDetected = statsKeyJoined + "." + string(stopFailoverEndDetected)
    49  
    50  	statsKeyJoinedWindowExceeded = statsKeyJoined + "." + string(evictedWindowExceeded)
    51  
    52  	statsKeyJoinedLastReparentTooRecent = statsKeyJoined + "." + string(skippedLastReparentTooRecent)
    53  	statsKeyJoinedLastFailoverTooRecent = statsKeyJoined + "." + string(skippedLastFailoverTooRecent)
    54  
    55  	oldPrimary = &topodatapb.Tablet{
    56  		Alias:    &topodatapb.TabletAlias{Cell: "cell1", Uid: 100},
    57  		Keyspace: keyspace,
    58  		Shard:    shard,
    59  		Type:     topodatapb.TabletType_PRIMARY,
    60  		PortMap:  map[string]int32{"vt": int32(100)},
    61  	}
    62  	newPrimary = &topodatapb.Tablet{
    63  		Alias:    &topodatapb.TabletAlias{Cell: "cell1", Uid: 101},
    64  		Keyspace: keyspace,
    65  		Shard:    shard,
    66  		Type:     topodatapb.TabletType_PRIMARY,
    67  		PortMap:  map[string]int32{"vt": int32(101)},
    68  	}
    69  )
    70  
    71  func TestBuffering(t *testing.T) {
    72  	testAllImplementations(t, testBuffering1)
    73  }
    74  
    75  func testBuffering1(t *testing.T, fail failover) {
    76  	resetVariables()
    77  	defer checkVariables(t)
    78  
    79  	// Create the buffer.
    80  	now := time.Now()
    81  	cfg := NewDefaultConfig()
    82  	cfg.Enabled = true
    83  	// Dry-run mode will apply to other keyspaces and shards. Not tested here.
    84  	cfg.DryRun = true
    85  	cfg.Shards = map[string]bool{
    86  		topoproto.KeyspaceShardString(keyspace, shard): true,
    87  	}
    88  	cfg.now = func() time.Time { return now }
    89  
    90  	b := New(cfg)
    91  
    92  	// Simulate that the current primary reports its ExternallyReparentedTimestamp.
    93  	// vtgate sees this at startup. Additional periodic updates will be sent out
    94  	// after this. If the TabletExternallyReparented RPC is called regularly by
    95  	// an external failover tool, the timestamp will be increased (even though
    96  	// the primary did not change.)
    97  	fail(b, oldPrimary, keyspace, shard, now)
    98  
    99  	// First request with failover error starts buffering.
   100  	stopped := issueRequest(context.Background(), t, b, failoverErr)
   101  	if err := waitForRequestsInFlight(b, 1); err != nil {
   102  		t.Fatal(err)
   103  	}
   104  	// Start counter must have been increased.
   105  	if got, want := starts.Counts()[statsKeyJoined], int64(1); got != want {
   106  		t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want)
   107  	}
   108  
   109  	// Subsequent requests with errors not related to the failover are not buffered.
   110  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nonFailoverErr); err != nil || retryDone != nil {
   111  		t.Fatalf("requests with non-failover errors must never be buffered. err: %v retryDone: %v", err, retryDone)
   112  	}
   113  
   114  	// Subsequent requests are buffered (if their error is nil or caused by the failover).
   115  	stopped2 := issueRequest(context.Background(), t, b, nil)
   116  	stopped3 := issueRequest(context.Background(), t, b, failoverErr)
   117  	if err := waitForRequestsInFlight(b, 3); err != nil {
   118  		t.Fatal(err)
   119  	}
   120  
   121  	// Mimic the failover end.
   122  	now = now.Add(1 * time.Second)
   123  	fail(b, newPrimary, keyspace, shard, now)
   124  
   125  	// Check that the drain is successful.
   126  	if err := <-stopped; err != nil {
   127  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   128  	}
   129  	if err := <-stopped2; err != nil {
   130  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   131  	}
   132  	if err := <-stopped3; err != nil {
   133  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   134  	}
   135  	// Failover time should have been be published.
   136  	durations := failoverDurationSumMs.Counts()
   137  	if _, ok := durations[statsKeyJoined]; !ok {
   138  		t.Fatalf("a failover time must have been recorded: %v", durations)
   139  	}
   140  	// Recorded max buffer usage should be 3 now.
   141  	if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(3); got != want {
   142  		t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want)
   143  	}
   144  	// Stop counter should have been increased.
   145  	if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(1); got != want {
   146  		t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want)
   147  	}
   148  	// Utilization in percentage has increased.
   149  	if got, want := utilizationSum.Counts()[statsKeyJoined], int64(30); got != want {
   150  		t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want)
   151  	}
   152  	// Drain will reset the state to "idle" eventually.
   153  	if err := waitForState(b, stateIdle); err != nil {
   154  		t.Fatal(err)
   155  	}
   156  
   157  	// Second failover: Buffering is skipped because last failover is too recent.
   158  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil {
   159  		t.Fatalf("subsequent failovers must be skipped due to -buffer_min_time_between_failovers setting. err: %v retryDone: %v", err, retryDone)
   160  	}
   161  	if got, want := requestsSkipped.Counts()[statsKeyJoinedLastFailoverTooRecent], int64(1); got != want {
   162  		t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want)
   163  	}
   164  
   165  	// Second failover is buffered if enough time has passed.
   166  	now = now.Add(cfg.MinTimeBetweenFailovers)
   167  	stopped4 := issueRequest(context.Background(), t, b, failoverErr)
   168  	if err := waitForRequestsInFlight(b, 1); err != nil {
   169  		t.Fatal(err)
   170  	}
   171  	// Recorded max buffer usage should be 1 for the second failover.
   172  	if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(1); got != want {
   173  		t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want)
   174  	}
   175  	// Start counter must have been increased for the second failover.
   176  	if got, want := starts.Counts()[statsKeyJoined], int64(2); got != want {
   177  		t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want)
   178  	}
   179  	// Stop buffering.
   180  	fail(b, oldPrimary, keyspace, shard, now)
   181  
   182  	if err := <-stopped4; err != nil {
   183  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   184  	}
   185  	if err := waitForState(b, stateIdle); err != nil {
   186  		t.Fatal(err)
   187  	}
   188  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   189  		t.Fatal(err)
   190  	}
   191  
   192  	// Stop counter must have been increased for the second failover.
   193  	if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(2); got != want {
   194  		t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want)
   195  	}
   196  	// Utilization in percentage has increased.
   197  	if got, want := utilizationSum.Counts()[statsKeyJoined], int64(40); got != want {
   198  		t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want)
   199  	}
   200  }
   201  
   202  // TestDryRun tests the case when only the dry-run mode is enabled globally.
   203  func TestDryRun(t *testing.T) {
   204  	testAllImplementations(t, testDryRun1)
   205  }
   206  
   207  func testDryRun1(t *testing.T, fail failover) {
   208  	resetVariables()
   209  
   210  	cfg := NewDefaultConfig()
   211  	cfg.DryRun = true
   212  
   213  	b := New(cfg)
   214  
   215  	// Request does not get buffered.
   216  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil {
   217  		t.Fatalf("requests must not be buffered during dry-run. err: %v retryDone: %v", err, retryDone)
   218  	}
   219  	// But the internal state changes though.
   220  	if err := waitForState(b, stateBuffering); err != nil {
   221  		t.Fatal(err)
   222  	}
   223  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   224  		t.Fatal(err)
   225  	}
   226  	if got, want := starts.Counts()[statsKeyJoined], int64(1); got != want {
   227  		t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want)
   228  	}
   229  	if got, want := lastRequestsDryRunMax.Counts()[statsKeyJoined], int64(1); got != want {
   230  		t.Fatalf("dry-run request count did not increase: got = %v, want = %v", got, want)
   231  	}
   232  
   233  	// End of failover is tracked as well.
   234  	fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   235  
   236  	if err := waitForState(b, stateIdle); err != nil {
   237  		t.Fatal(err)
   238  	}
   239  	if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(1); got != want {
   240  		t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want)
   241  	}
   242  	if got, want := utilizationDryRunSum.Counts()[statsKeyJoined], int64(10); got != want {
   243  		t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want)
   244  	}
   245  }
   246  
   247  // TestPassthrough tests the case when no failover is in progress and
   248  // requests have no failover related error.
   249  func TestPassthrough(t *testing.T) {
   250  	testAllImplementations(t, testPassthrough1)
   251  }
   252  
   253  func testPassthrough1(t *testing.T, fail failover) {
   254  	cfg := NewDefaultConfig()
   255  	cfg.Enabled = true
   256  	cfg.Shards = map[string]bool{
   257  		topoproto.KeyspaceShardString(keyspace, shard): true,
   258  	}
   259  
   260  	b := New(cfg)
   261  
   262  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nil); err != nil || retryDone != nil {
   263  		t.Fatalf("requests with no error must never be buffered. err: %v retryDone: %v", err, retryDone)
   264  	}
   265  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nonFailoverErr); err != nil || retryDone != nil {
   266  		t.Fatalf("requests with non-failover errors must never be buffered. err: %v retryDone: %v", err, retryDone)
   267  	}
   268  
   269  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   270  		t.Fatal(err)
   271  	}
   272  }
   273  
   274  // TestLastReparentTooRecentBufferingSkipped tests that buffering is skipped if
   275  // we see the reparent (end) *before* any request failures due to it.
   276  // We must not start buffering because we already observed the trigger for
   277  // stopping buffering (the reparent) and may not see it again.
   278  func TestLastReparentTooRecentBufferingSkipped(t *testing.T) {
   279  	testAllImplementations(t, testLastReparentTooRecentBufferingSkipped1)
   280  }
   281  
   282  func testLastReparentTooRecentBufferingSkipped1(t *testing.T, fail failover) {
   283  	resetVariables()
   284  
   285  	now := time.Now()
   286  	cfg := NewDefaultConfig()
   287  	cfg.Enabled = true
   288  	cfg.now = func() time.Time { return now }
   289  	b := New(cfg)
   290  
   291  	// Simulate that the old primary notified us about its reparented timestamp
   292  	// very recently (time.Now()).
   293  	// vtgate should see this immediately after the start.
   294  	fail(b, oldPrimary, keyspace, shard, now)
   295  
   296  	// Failover to new primary. Its end is detected faster than the beginning.
   297  	// Do not start buffering.
   298  	now = now.Add(1 * time.Second)
   299  	fail(b, newPrimary, keyspace, shard, now)
   300  
   301  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil {
   302  		t.Fatalf("requests where the failover end was recently detected before the start must not be buffered. err: %v retryDone: %v", err, retryDone)
   303  	}
   304  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   305  		t.Fatal(err)
   306  	}
   307  	if got, want := requestsSkipped.Counts()[statsKeyJoinedLastReparentTooRecent], int64(1); got != want {
   308  		t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want)
   309  	}
   310  	if got, want := requestsBuffered.Counts()[statsKeyJoined], int64(0); got != want {
   311  		t.Fatalf("no request should have been tracked as buffered: got = %v, want = %v", got, want)
   312  	}
   313  }
   314  
   315  // TestLastReparentTooRecentBuffering explicitly tests that the "too recent"
   316  // skipping of the buffering does NOT get triggered because enough time has
   317  // elapsed since the last seen reparent.
   318  func TestLastReparentTooRecentBuffering(t *testing.T) {
   319  	testAllImplementations(t, testLastReparentTooRecentBuffering1)
   320  }
   321  
   322  func testLastReparentTooRecentBuffering1(t *testing.T, fail failover) {
   323  	resetVariables()
   324  
   325  	now := time.Now()
   326  	cfg := NewDefaultConfig()
   327  	cfg.Enabled = true
   328  	cfg.now = func() time.Time { return now }
   329  	b := New(cfg)
   330  
   331  	// Simulate that the old primary notified us about its reparented timestamp
   332  	// very recently (time.Now()).
   333  	// vtgate should see this immediately after the start.
   334  	fail(b, oldPrimary, keyspace, shard, now)
   335  
   336  	// Failover to new primary. Do not issue any requests before or after i.e.
   337  	// there was 0 QPS traffic and no buffering was started.
   338  	now = now.Add(1 * time.Second)
   339  	fail(b, newPrimary, keyspace, shard, now)
   340  
   341  	// After we're past the --buffer_min_time_between_failovers threshold, go
   342  	// through a failover with non-zero QPS.
   343  	now = now.Add(cfg.MinTimeBetweenFailovers)
   344  	// We're seeing errors first.
   345  	stopped := issueRequest(context.Background(), t, b, failoverErr)
   346  	if err := waitForRequestsInFlight(b, 1); err != nil {
   347  		t.Fatal(err)
   348  	}
   349  	// And then the failover end.
   350  	fail(b, newPrimary, keyspace, shard, now)
   351  
   352  	// Check that the drain is successful.
   353  	if err := <-stopped; err != nil {
   354  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   355  	}
   356  	// Drain will reset the state to "idle" eventually.
   357  	if err := waitForState(b, stateIdle); err != nil {
   358  		t.Fatal(err)
   359  	}
   360  
   361  	if got, want := requestsSkipped.Counts()[statsKeyJoinedLastReparentTooRecent], int64(0); got != want {
   362  		t.Fatalf("request should not have been skipped: got = %v, want = %v", got, want)
   363  	}
   364  	if got, want := requestsBuffered.Counts()[statsKeyJoined], int64(1); got != want {
   365  		t.Fatalf("request should have been tracked as buffered: got = %v, want = %v", got, want)
   366  	}
   367  }
   368  
   369  // TestPassthroughDuringDrain tests the behavior of requests while the buffer is
   370  // in the drain phase: They should not be buffered and passed through instead.
   371  func TestPassthroughDuringDrain(t *testing.T) {
   372  	testAllImplementations(t, testPassthroughDuringDrain1)
   373  }
   374  
   375  func testPassthroughDuringDrain1(t *testing.T, fail failover) {
   376  	cfg := NewDefaultConfig()
   377  	cfg.Enabled = true
   378  	cfg.Shards = map[string]bool{
   379  		topoproto.KeyspaceShardString(keyspace, shard): true,
   380  	}
   381  	b := New(cfg)
   382  
   383  	// Buffer one request.
   384  	markRetryDone := make(chan struct{})
   385  	stopped := issueRequestAndBlockRetry(context.Background(), t, b, failoverErr, markRetryDone)
   386  	if err := waitForRequestsInFlight(b, 1); err != nil {
   387  		t.Fatal(err)
   388  	}
   389  
   390  	// Stop buffering and trigger drain.
   391  	fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   392  
   393  	if got, want := b.getOrCreateBuffer(keyspace, shard).testGetState(), stateDraining; got != want {
   394  		t.Fatalf("wrong expected state. got = %v, want = %v", got, want)
   395  	}
   396  
   397  	// Requests during the drain will be passed through and not buffered.
   398  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nil); err != nil || retryDone != nil {
   399  		t.Fatalf("requests with no error must not be buffered during a drain. err: %v retryDone: %v", err, retryDone)
   400  	}
   401  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil {
   402  		t.Fatalf("requests with failover errors must not be buffered during a drain. err: %v retryDone: %v", err, retryDone)
   403  	}
   404  
   405  	// Finish draining by telling the buffer that the retry is done.
   406  	close(markRetryDone)
   407  	<-stopped
   408  
   409  	// Wait for the drain to complete to avoid races with other tests.
   410  	if err := waitForState(b, stateIdle); err != nil {
   411  		t.Fatal(err)
   412  	}
   413  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   414  		t.Fatal(err)
   415  	}
   416  }
   417  
   418  // TestPassthroughIgnoredKeyspaceOrShard tests that the explicit whitelisting
   419  // of keyspaces (and optionally shards) ignores entries which are not listed.
   420  func TestPassthroughIgnoredKeyspaceOrShard(t *testing.T) {
   421  	testAllImplementations(t, testPassthroughIgnoredKeyspaceOrShard1)
   422  }
   423  
   424  func testPassthroughIgnoredKeyspaceOrShard1(t *testing.T, fail failover) {
   425  	cfg := NewDefaultConfig()
   426  	cfg.Enabled = true
   427  	cfg.Shards = map[string]bool{
   428  		topoproto.KeyspaceShardString(keyspace, shard): true,
   429  	}
   430  	b := New(cfg)
   431  
   432  	ignoredKeyspace := "ignored_ks"
   433  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), ignoredKeyspace, shard, failoverErr); err != nil || retryDone != nil {
   434  		t.Fatalf("requests for ignored keyspaces must not be buffered. err: %v retryDone: %v", err, retryDone)
   435  	}
   436  	statsKeyJoined := strings.Join([]string{ignoredKeyspace, shard, skippedDisabled}, ".")
   437  	if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want {
   438  		t.Fatalf("request was not skipped as disabled: got = %v, want = %v", got, want)
   439  	}
   440  
   441  	ignoredShard := "ff-"
   442  	if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, ignoredShard, failoverErr); err != nil || retryDone != nil {
   443  		t.Fatalf("requests for ignored shards must not be buffered. err: %v retryDone: %v", err, retryDone)
   444  	}
   445  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   446  		t.Fatal(err)
   447  	}
   448  	statsKeyJoined = strings.Join([]string{keyspace, ignoredShard, skippedDisabled}, ".")
   449  	if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want {
   450  		t.Fatalf("request was not skipped as disabled: got = %v, want = %v", got, want)
   451  	}
   452  }
   453  
   454  // TestRequestCanceled_ExplicitEnd stops the buffering because the we see the
   455  // new primary.
   456  func TestRequestCanceled_ExplicitEnd(t *testing.T) {
   457  	testAllImplementations(t, func(t *testing.T, fail failover) {
   458  		t.Helper()
   459  		testRequestCanceled(t, true, fail)
   460  	})
   461  }
   462  
   463  // TestRequestCanceled_MaxDurationEnd stops the buffering because the max
   464  // failover duration is reached.
   465  func TestRequestCanceled_MaxDurationEnd(t *testing.T) {
   466  	testAllImplementations(t, func(t *testing.T, fail failover) {
   467  		t.Helper()
   468  		testRequestCanceled(t, false, fail)
   469  	})
   470  }
   471  
   472  // testRequestCanceled tests the case when a buffered request is canceled
   473  // (more precisively its context) before the failover/buffering ends.
   474  func testRequestCanceled(t *testing.T, explicitEnd bool, fail failover) {
   475  	resetVariables()
   476  	defer checkVariables(t)
   477  
   478  	cfg := NewDefaultConfig()
   479  	cfg.Enabled = true
   480  	cfg.Keyspaces = map[string]bool{keyspace: true}
   481  
   482  	if !explicitEnd {
   483  		cfg.Window = 100 * time.Millisecond
   484  		cfg.MaxFailoverDuration = 100 * time.Millisecond
   485  	}
   486  
   487  	b := New(cfg)
   488  
   489  	// Buffer 2 requests. The second will be canceled and the first will be drained.
   490  	stopped1 := issueRequest(context.Background(), t, b, failoverErr)
   491  	if err := waitForRequestsInFlight(b, 1); err != nil {
   492  		t.Fatal(err)
   493  	}
   494  	ctx2, cancel2 := context.WithCancel(context.Background())
   495  	stopped2 := issueRequest(ctx2, t, b, failoverErr)
   496  	if err := waitForRequestsInFlight(b, 2); err != nil {
   497  		t.Fatal(err)
   498  	}
   499  
   500  	// Cancel second request before buffering stops.
   501  	cancel2()
   502  	// Canceled request will see an error from the buffer.
   503  	if err := isCanceledError(<-stopped2); err != nil {
   504  		t.Fatal(err)
   505  	}
   506  	if err := waitForRequestsInFlight(b, 1); err != nil {
   507  		t.Fatal(err)
   508  	}
   509  	// Recorded max buffer usage stay at 2 although the second request was canceled.
   510  	if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(2); got != want {
   511  		t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want)
   512  	}
   513  
   514  	if explicitEnd {
   515  		fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   516  	}
   517  
   518  	// Failover will end eventually.
   519  	if err := waitForState(b, stateIdle); err != nil {
   520  		t.Fatal(err)
   521  	}
   522  	// First request must have been drained without an error.
   523  	if err := <-stopped1; err != nil {
   524  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   525  	}
   526  
   527  	// If buffering stopped implicitly, the explicit signal will still happen
   528  	// shortly after. In that case, the buffer should ignore it.
   529  	if !explicitEnd {
   530  		fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   531  	}
   532  	if err := waitForState(b, stateIdle); err != nil {
   533  		t.Fatal(err)
   534  	}
   535  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   536  		t.Fatal(err)
   537  	}
   538  }
   539  
   540  func TestEviction(t *testing.T) {
   541  	testAllImplementations(t, testEviction1)
   542  }
   543  
   544  func testEviction1(t *testing.T, fail failover) {
   545  	resetVariables()
   546  	defer checkVariables(t)
   547  
   548  	cfg := NewDefaultConfig()
   549  	cfg.Enabled = true
   550  	cfg.Shards = map[string]bool{
   551  		topoproto.KeyspaceShardString(keyspace, shard): true,
   552  	}
   553  	cfg.Size = 2
   554  	b := New(cfg)
   555  
   556  	stopped1 := issueRequest(context.Background(), t, b, failoverErr)
   557  	// This wait is important because each request gets inserted asynchronously
   558  	// in the buffer. Usually, they end up in the correct order (1, 2), but there
   559  	// is a chance that it's reversed (2, 1). This wait ensures that 1 goes into
   560  	// the buffer first.
   561  	if err := waitForRequestsInFlight(b, 1); err != nil {
   562  		t.Fatal(err)
   563  	}
   564  	stopped2 := issueRequest(context.Background(), t, b, failoverErr)
   565  	if err := waitForRequestsInFlight(b, 2); err != nil {
   566  		t.Fatal(err)
   567  	}
   568  
   569  	// Third request will evict the oldest.
   570  	stopped3 := issueRequest(context.Background(), t, b, failoverErr)
   571  
   572  	// Evicted request will see an error from the buffer.
   573  	if err := isEvictedError(<-stopped1); err != nil {
   574  		t.Fatal(err)
   575  	}
   576  
   577  	// End of failover. Stop buffering.
   578  	fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   579  
   580  	if err := <-stopped2; err != nil {
   581  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   582  	}
   583  	if err := <-stopped3; err != nil {
   584  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   585  	}
   586  	if err := waitForState(b, stateIdle); err != nil {
   587  		t.Fatal(err)
   588  	}
   589  	if err := waitForPoolSlots(b, 2); err != nil {
   590  		t.Fatal(err)
   591  	}
   592  }
   593  
   594  // TestEvictionNotPossible tests the case that the buffer is a) fully in use
   595  // by two failovers and b) the second failover doesn't use any slot in the
   596  // buffer and therefore cannot evict older entries.
   597  func TestEvictionNotPossible(t *testing.T) {
   598  	testAllImplementations(t, testEvictionNotPossible1)
   599  }
   600  
   601  func testEvictionNotPossible1(t *testing.T, fail failover) {
   602  	resetVariables()
   603  	defer checkVariables(t)
   604  
   605  	cfg := NewDefaultConfig()
   606  	cfg.Enabled = true
   607  	cfg.Shards = map[string]bool{
   608  		topoproto.KeyspaceShardString(keyspace, shard):  true,
   609  		topoproto.KeyspaceShardString(keyspace, shard2): true,
   610  	}
   611  	cfg.Size = 1
   612  
   613  	b := New(cfg)
   614  
   615  	// Make the buffer full (applies to all failovers).
   616  	// Also triggers buffering for the first shard.
   617  	stoppedFirstFailover := issueRequest(context.Background(), t, b, failoverErr)
   618  	if err := waitForRequestsInFlight(b, 1); err != nil {
   619  		t.Fatal(err)
   620  	}
   621  
   622  	// Newer requests of the second failover cannot evict anything because
   623  	// they have no entries buffered.
   624  	retryDone, bufferErr := b.WaitForFailoverEnd(context.Background(), keyspace, shard2, failoverErr)
   625  	if bufferErr == nil || retryDone != nil {
   626  		t.Fatalf("buffer should have returned an error because it's full: err: %v retryDone: %v", bufferErr, retryDone)
   627  	}
   628  	if got, want := vterrors.Code(bufferErr), vtrpcpb.Code_UNAVAILABLE; got != want {
   629  		t.Fatalf("wrong error code for evicted buffered request. got = %v, want = %v", got, want)
   630  	}
   631  	if got, want := bufferErr.Error(), bufferFullError.Error(); !strings.Contains(got, want) {
   632  		t.Fatalf("evicted buffered request should return a different error message. got = %v, want substring = %v", got, want)
   633  	}
   634  
   635  	// End of failover. Stop buffering.
   636  	fail(b, newPrimary, keyspace, shard, time.Unix(1, 0))
   637  
   638  	if err := <-stoppedFirstFailover; err != nil {
   639  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   640  	}
   641  	// Wait for the failover end to avoid races.
   642  	if err := waitForState(b, stateIdle); err != nil {
   643  		t.Fatal(err)
   644  	}
   645  	if err := waitForPoolSlots(b, 1); err != nil {
   646  		t.Fatal(err)
   647  	}
   648  	statsKeyJoined := strings.Join([]string{keyspace, shard2, string(skippedBufferFull)}, ".")
   649  	if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want {
   650  		t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want)
   651  	}
   652  }
   653  
   654  func TestWindow(t *testing.T) {
   655  	testAllImplementations(t, testWindow1)
   656  }
   657  
   658  func testWindow1(t *testing.T, fail failover) {
   659  	resetVariables()
   660  	defer checkVariables(t)
   661  
   662  	cfg := NewDefaultConfig()
   663  	cfg.Enabled = true
   664  	cfg.Shards = map[string]bool{
   665  		topoproto.KeyspaceShardString(keyspace, shard):  true,
   666  		topoproto.KeyspaceShardString(keyspace, shard2): true,
   667  	}
   668  	cfg.Size = 1
   669  	cfg.Window = 1 * time.Millisecond
   670  
   671  	b := New(cfg)
   672  
   673  	// Buffer one request.
   674  	t.Logf("first request exceeds its window")
   675  	stopped1 := issueRequest(context.Background(), t, b, failoverErr)
   676  
   677  	// Let it go out of the buffering window and expire.
   678  	if err := <-stopped1; err != nil {
   679  		t.Fatalf("buffering should have stopped after exceeding the window without an error: %v", err)
   680  	}
   681  	// Verify that the window was actually exceeded.
   682  	if err := waitForRequestsExceededWindow(1); err != nil {
   683  		t.Fatal(err)
   684  	}
   685  
   686  	// Increase the window and buffer a request again
   687  	// (queue becomes not empty a second time).
   688  	cfg.Window = 10 * time.Minute
   689  
   690  	// This is a hack. The buffering semaphore gets released asynchronously.
   691  	// Sometimes the next issueRequest tries to acquire before that release
   692  	// and ends up failing. Waiting for the previous goroutines to exit ensures
   693  	// that the sema will get released.
   694  	b.waitForShutdown()
   695  
   696  	// This time the request does not go out of window and gets evicted by a third
   697  	// request instead.
   698  	t.Logf("second request does not exceed its window")
   699  	stopped2 := issueRequest(context.Background(), t, b, failoverErr)
   700  	if err := waitForRequestsInFlight(b, 1); err != nil {
   701  		t.Fatal(err)
   702  	}
   703  
   704  	// Third request will evict the second one.
   705  	t.Logf("third request evicts the second request")
   706  	stopped3 := issueRequest(context.Background(), t, b, failoverErr)
   707  
   708  	// Evicted request will see an error from the buffer.
   709  	if err := isEvictedError(<-stopped2); err != nil {
   710  		t.Fatal(err)
   711  	}
   712  	// Block until the third request is buffered. Avoids data race with *window.
   713  	if err := waitForRequestsInFlight(b, 1); err != nil {
   714  		t.Fatal(err)
   715  	}
   716  
   717  	// Verify that the window was not exceeded.
   718  	if got, want := requestsEvicted.Counts()[statsKeyJoinedWindowExceeded], int64(1); got != want {
   719  		t.Fatalf("second or third request should not have exceed its buffering window. got = %v, want = %v", got, want)
   720  	}
   721  
   722  	// Reduce the window again.
   723  	cfg.Window = 100 * time.Millisecond
   724  
   725  	// Fourth request evicts the third
   726  	t.Logf("fourth request exceeds its window (and evicts the third)")
   727  	stopped4 := issueRequest(context.Background(), t, b, failoverErr)
   728  	if err := isEvictedError(<-stopped3); err != nil {
   729  		t.Fatal(err)
   730  	}
   731  
   732  	// Fourth request will exceed its window and finish early.
   733  	if err := <-stopped4; err != nil {
   734  		t.Fatalf("buffering should have stopped after 10ms without an error: %v", err)
   735  	}
   736  	// Verify that the window was actually exceeded.
   737  	if err := waitForRequestsExceededWindow(2); err != nil {
   738  		t.Fatal(err)
   739  	}
   740  
   741  	// At this point the buffer is empty but buffering is still active.
   742  	// Simulate that the buffering stops because the max duration (10m) was reached.
   743  	b.getOrCreateBuffer(keyspace, shard).stopBufferingDueToMaxDuration()
   744  	// Wait for the failover end to avoid races.
   745  	if err := waitForState(b, stateIdle); err != nil {
   746  		t.Fatal(err)
   747  	}
   748  	if err := waitForPoolSlots(b, 1); err != nil {
   749  		t.Fatal(err)
   750  	}
   751  }
   752  
   753  // TestShutdown tests that Buffer.Shutdown() unblocks any pending bufferings
   754  // immediately.
   755  func TestShutdown(t *testing.T) {
   756  	testAllImplementations(t, testShutdown1)
   757  }
   758  
   759  func testShutdown1(t *testing.T, fail failover) {
   760  	resetVariables()
   761  	defer checkVariables(t)
   762  
   763  	cfg := NewDefaultConfig()
   764  	cfg.Enabled = true
   765  	b := New(cfg)
   766  
   767  	// Buffer one request.
   768  	stopped1 := issueRequest(context.Background(), t, b, failoverErr)
   769  	if err := waitForRequestsInFlight(b, 1); err != nil {
   770  		t.Fatal(err)
   771  	}
   772  
   773  	// Shutdown buffer and unblock buffered request immediately.
   774  	b.Shutdown()
   775  
   776  	// Request must have been drained without an error.
   777  	if err := <-stopped1; err != nil {
   778  		t.Fatalf("request should have been buffered and not returned an error: %v", err)
   779  	}
   780  
   781  	if err := waitForPoolSlots(b, cfg.Size); err != nil {
   782  		t.Fatal(err)
   783  	}
   784  }