vitess.io/vitess@v0.16.2/go/vt/vtgate/buffer/buffer_test.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package buffer 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "testing" 24 "time" 25 26 "vitess.io/vitess/go/vt/topo/topoproto" 27 "vitess.io/vitess/go/vt/vterrors" 28 29 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 30 vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" 31 ) 32 33 const ( 34 keyspace = "ks1" 35 shard = "0" 36 // shard2 is only used for tests with two concurrent failovers. 37 shard2 = "-80" 38 ) 39 40 var ( 41 failoverErr = vterrors.New(vtrpcpb.Code_CLUSTER_EVENT, 42 "vttablet: rpc error: code = 17 desc = gRPCServerError: retry: operation not allowed in state SHUTTING_DOWN") 43 nonFailoverErr = vterrors.New(vtrpcpb.Code_FAILED_PRECONDITION, 44 "vttablet: rpc error: code = 9 desc = gRPCServerError: retry: TODO(mberlin): Insert here any realistic error not caused by a failover") 45 46 statsKeyJoined = fmt.Sprintf("%s.%s", keyspace, shard) 47 48 statsKeyJoinedFailoverEndDetected = statsKeyJoined + "." + string(stopFailoverEndDetected) 49 50 statsKeyJoinedWindowExceeded = statsKeyJoined + "." + string(evictedWindowExceeded) 51 52 statsKeyJoinedLastReparentTooRecent = statsKeyJoined + "." + string(skippedLastReparentTooRecent) 53 statsKeyJoinedLastFailoverTooRecent = statsKeyJoined + "." + string(skippedLastFailoverTooRecent) 54 55 oldPrimary = &topodatapb.Tablet{ 56 Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 100}, 57 Keyspace: keyspace, 58 Shard: shard, 59 Type: topodatapb.TabletType_PRIMARY, 60 PortMap: map[string]int32{"vt": int32(100)}, 61 } 62 newPrimary = &topodatapb.Tablet{ 63 Alias: &topodatapb.TabletAlias{Cell: "cell1", Uid: 101}, 64 Keyspace: keyspace, 65 Shard: shard, 66 Type: topodatapb.TabletType_PRIMARY, 67 PortMap: map[string]int32{"vt": int32(101)}, 68 } 69 ) 70 71 func TestBuffering(t *testing.T) { 72 testAllImplementations(t, testBuffering1) 73 } 74 75 func testBuffering1(t *testing.T, fail failover) { 76 resetVariables() 77 defer checkVariables(t) 78 79 // Create the buffer. 80 now := time.Now() 81 cfg := NewDefaultConfig() 82 cfg.Enabled = true 83 // Dry-run mode will apply to other keyspaces and shards. Not tested here. 84 cfg.DryRun = true 85 cfg.Shards = map[string]bool{ 86 topoproto.KeyspaceShardString(keyspace, shard): true, 87 } 88 cfg.now = func() time.Time { return now } 89 90 b := New(cfg) 91 92 // Simulate that the current primary reports its ExternallyReparentedTimestamp. 93 // vtgate sees this at startup. Additional periodic updates will be sent out 94 // after this. If the TabletExternallyReparented RPC is called regularly by 95 // an external failover tool, the timestamp will be increased (even though 96 // the primary did not change.) 97 fail(b, oldPrimary, keyspace, shard, now) 98 99 // First request with failover error starts buffering. 100 stopped := issueRequest(context.Background(), t, b, failoverErr) 101 if err := waitForRequestsInFlight(b, 1); err != nil { 102 t.Fatal(err) 103 } 104 // Start counter must have been increased. 105 if got, want := starts.Counts()[statsKeyJoined], int64(1); got != want { 106 t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want) 107 } 108 109 // Subsequent requests with errors not related to the failover are not buffered. 110 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nonFailoverErr); err != nil || retryDone != nil { 111 t.Fatalf("requests with non-failover errors must never be buffered. err: %v retryDone: %v", err, retryDone) 112 } 113 114 // Subsequent requests are buffered (if their error is nil or caused by the failover). 115 stopped2 := issueRequest(context.Background(), t, b, nil) 116 stopped3 := issueRequest(context.Background(), t, b, failoverErr) 117 if err := waitForRequestsInFlight(b, 3); err != nil { 118 t.Fatal(err) 119 } 120 121 // Mimic the failover end. 122 now = now.Add(1 * time.Second) 123 fail(b, newPrimary, keyspace, shard, now) 124 125 // Check that the drain is successful. 126 if err := <-stopped; err != nil { 127 t.Fatalf("request should have been buffered and not returned an error: %v", err) 128 } 129 if err := <-stopped2; err != nil { 130 t.Fatalf("request should have been buffered and not returned an error: %v", err) 131 } 132 if err := <-stopped3; err != nil { 133 t.Fatalf("request should have been buffered and not returned an error: %v", err) 134 } 135 // Failover time should have been be published. 136 durations := failoverDurationSumMs.Counts() 137 if _, ok := durations[statsKeyJoined]; !ok { 138 t.Fatalf("a failover time must have been recorded: %v", durations) 139 } 140 // Recorded max buffer usage should be 3 now. 141 if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(3); got != want { 142 t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want) 143 } 144 // Stop counter should have been increased. 145 if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(1); got != want { 146 t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want) 147 } 148 // Utilization in percentage has increased. 149 if got, want := utilizationSum.Counts()[statsKeyJoined], int64(30); got != want { 150 t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want) 151 } 152 // Drain will reset the state to "idle" eventually. 153 if err := waitForState(b, stateIdle); err != nil { 154 t.Fatal(err) 155 } 156 157 // Second failover: Buffering is skipped because last failover is too recent. 158 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil { 159 t.Fatalf("subsequent failovers must be skipped due to -buffer_min_time_between_failovers setting. err: %v retryDone: %v", err, retryDone) 160 } 161 if got, want := requestsSkipped.Counts()[statsKeyJoinedLastFailoverTooRecent], int64(1); got != want { 162 t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want) 163 } 164 165 // Second failover is buffered if enough time has passed. 166 now = now.Add(cfg.MinTimeBetweenFailovers) 167 stopped4 := issueRequest(context.Background(), t, b, failoverErr) 168 if err := waitForRequestsInFlight(b, 1); err != nil { 169 t.Fatal(err) 170 } 171 // Recorded max buffer usage should be 1 for the second failover. 172 if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(1); got != want { 173 t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want) 174 } 175 // Start counter must have been increased for the second failover. 176 if got, want := starts.Counts()[statsKeyJoined], int64(2); got != want { 177 t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want) 178 } 179 // Stop buffering. 180 fail(b, oldPrimary, keyspace, shard, now) 181 182 if err := <-stopped4; err != nil { 183 t.Fatalf("request should have been buffered and not returned an error: %v", err) 184 } 185 if err := waitForState(b, stateIdle); err != nil { 186 t.Fatal(err) 187 } 188 if err := waitForPoolSlots(b, cfg.Size); err != nil { 189 t.Fatal(err) 190 } 191 192 // Stop counter must have been increased for the second failover. 193 if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(2); got != want { 194 t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want) 195 } 196 // Utilization in percentage has increased. 197 if got, want := utilizationSum.Counts()[statsKeyJoined], int64(40); got != want { 198 t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want) 199 } 200 } 201 202 // TestDryRun tests the case when only the dry-run mode is enabled globally. 203 func TestDryRun(t *testing.T) { 204 testAllImplementations(t, testDryRun1) 205 } 206 207 func testDryRun1(t *testing.T, fail failover) { 208 resetVariables() 209 210 cfg := NewDefaultConfig() 211 cfg.DryRun = true 212 213 b := New(cfg) 214 215 // Request does not get buffered. 216 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil { 217 t.Fatalf("requests must not be buffered during dry-run. err: %v retryDone: %v", err, retryDone) 218 } 219 // But the internal state changes though. 220 if err := waitForState(b, stateBuffering); err != nil { 221 t.Fatal(err) 222 } 223 if err := waitForPoolSlots(b, cfg.Size); err != nil { 224 t.Fatal(err) 225 } 226 if got, want := starts.Counts()[statsKeyJoined], int64(1); got != want { 227 t.Fatalf("buffering start was not tracked: got = %v, want = %v", got, want) 228 } 229 if got, want := lastRequestsDryRunMax.Counts()[statsKeyJoined], int64(1); got != want { 230 t.Fatalf("dry-run request count did not increase: got = %v, want = %v", got, want) 231 } 232 233 // End of failover is tracked as well. 234 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 235 236 if err := waitForState(b, stateIdle); err != nil { 237 t.Fatal(err) 238 } 239 if got, want := stops.Counts()[statsKeyJoinedFailoverEndDetected], int64(1); got != want { 240 t.Fatalf("buffering stop was not tracked: got = %v, want = %v", got, want) 241 } 242 if got, want := utilizationDryRunSum.Counts()[statsKeyJoined], int64(10); got != want { 243 t.Fatalf("wrong buffer utilization: got = %v, want = %v", got, want) 244 } 245 } 246 247 // TestPassthrough tests the case when no failover is in progress and 248 // requests have no failover related error. 249 func TestPassthrough(t *testing.T) { 250 testAllImplementations(t, testPassthrough1) 251 } 252 253 func testPassthrough1(t *testing.T, fail failover) { 254 cfg := NewDefaultConfig() 255 cfg.Enabled = true 256 cfg.Shards = map[string]bool{ 257 topoproto.KeyspaceShardString(keyspace, shard): true, 258 } 259 260 b := New(cfg) 261 262 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nil); err != nil || retryDone != nil { 263 t.Fatalf("requests with no error must never be buffered. err: %v retryDone: %v", err, retryDone) 264 } 265 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nonFailoverErr); err != nil || retryDone != nil { 266 t.Fatalf("requests with non-failover errors must never be buffered. err: %v retryDone: %v", err, retryDone) 267 } 268 269 if err := waitForPoolSlots(b, cfg.Size); err != nil { 270 t.Fatal(err) 271 } 272 } 273 274 // TestLastReparentTooRecentBufferingSkipped tests that buffering is skipped if 275 // we see the reparent (end) *before* any request failures due to it. 276 // We must not start buffering because we already observed the trigger for 277 // stopping buffering (the reparent) and may not see it again. 278 func TestLastReparentTooRecentBufferingSkipped(t *testing.T) { 279 testAllImplementations(t, testLastReparentTooRecentBufferingSkipped1) 280 } 281 282 func testLastReparentTooRecentBufferingSkipped1(t *testing.T, fail failover) { 283 resetVariables() 284 285 now := time.Now() 286 cfg := NewDefaultConfig() 287 cfg.Enabled = true 288 cfg.now = func() time.Time { return now } 289 b := New(cfg) 290 291 // Simulate that the old primary notified us about its reparented timestamp 292 // very recently (time.Now()). 293 // vtgate should see this immediately after the start. 294 fail(b, oldPrimary, keyspace, shard, now) 295 296 // Failover to new primary. Its end is detected faster than the beginning. 297 // Do not start buffering. 298 now = now.Add(1 * time.Second) 299 fail(b, newPrimary, keyspace, shard, now) 300 301 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil { 302 t.Fatalf("requests where the failover end was recently detected before the start must not be buffered. err: %v retryDone: %v", err, retryDone) 303 } 304 if err := waitForPoolSlots(b, cfg.Size); err != nil { 305 t.Fatal(err) 306 } 307 if got, want := requestsSkipped.Counts()[statsKeyJoinedLastReparentTooRecent], int64(1); got != want { 308 t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want) 309 } 310 if got, want := requestsBuffered.Counts()[statsKeyJoined], int64(0); got != want { 311 t.Fatalf("no request should have been tracked as buffered: got = %v, want = %v", got, want) 312 } 313 } 314 315 // TestLastReparentTooRecentBuffering explicitly tests that the "too recent" 316 // skipping of the buffering does NOT get triggered because enough time has 317 // elapsed since the last seen reparent. 318 func TestLastReparentTooRecentBuffering(t *testing.T) { 319 testAllImplementations(t, testLastReparentTooRecentBuffering1) 320 } 321 322 func testLastReparentTooRecentBuffering1(t *testing.T, fail failover) { 323 resetVariables() 324 325 now := time.Now() 326 cfg := NewDefaultConfig() 327 cfg.Enabled = true 328 cfg.now = func() time.Time { return now } 329 b := New(cfg) 330 331 // Simulate that the old primary notified us about its reparented timestamp 332 // very recently (time.Now()). 333 // vtgate should see this immediately after the start. 334 fail(b, oldPrimary, keyspace, shard, now) 335 336 // Failover to new primary. Do not issue any requests before or after i.e. 337 // there was 0 QPS traffic and no buffering was started. 338 now = now.Add(1 * time.Second) 339 fail(b, newPrimary, keyspace, shard, now) 340 341 // After we're past the --buffer_min_time_between_failovers threshold, go 342 // through a failover with non-zero QPS. 343 now = now.Add(cfg.MinTimeBetweenFailovers) 344 // We're seeing errors first. 345 stopped := issueRequest(context.Background(), t, b, failoverErr) 346 if err := waitForRequestsInFlight(b, 1); err != nil { 347 t.Fatal(err) 348 } 349 // And then the failover end. 350 fail(b, newPrimary, keyspace, shard, now) 351 352 // Check that the drain is successful. 353 if err := <-stopped; err != nil { 354 t.Fatalf("request should have been buffered and not returned an error: %v", err) 355 } 356 // Drain will reset the state to "idle" eventually. 357 if err := waitForState(b, stateIdle); err != nil { 358 t.Fatal(err) 359 } 360 361 if got, want := requestsSkipped.Counts()[statsKeyJoinedLastReparentTooRecent], int64(0); got != want { 362 t.Fatalf("request should not have been skipped: got = %v, want = %v", got, want) 363 } 364 if got, want := requestsBuffered.Counts()[statsKeyJoined], int64(1); got != want { 365 t.Fatalf("request should have been tracked as buffered: got = %v, want = %v", got, want) 366 } 367 } 368 369 // TestPassthroughDuringDrain tests the behavior of requests while the buffer is 370 // in the drain phase: They should not be buffered and passed through instead. 371 func TestPassthroughDuringDrain(t *testing.T) { 372 testAllImplementations(t, testPassthroughDuringDrain1) 373 } 374 375 func testPassthroughDuringDrain1(t *testing.T, fail failover) { 376 cfg := NewDefaultConfig() 377 cfg.Enabled = true 378 cfg.Shards = map[string]bool{ 379 topoproto.KeyspaceShardString(keyspace, shard): true, 380 } 381 b := New(cfg) 382 383 // Buffer one request. 384 markRetryDone := make(chan struct{}) 385 stopped := issueRequestAndBlockRetry(context.Background(), t, b, failoverErr, markRetryDone) 386 if err := waitForRequestsInFlight(b, 1); err != nil { 387 t.Fatal(err) 388 } 389 390 // Stop buffering and trigger drain. 391 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 392 393 if got, want := b.getOrCreateBuffer(keyspace, shard).testGetState(), stateDraining; got != want { 394 t.Fatalf("wrong expected state. got = %v, want = %v", got, want) 395 } 396 397 // Requests during the drain will be passed through and not buffered. 398 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, nil); err != nil || retryDone != nil { 399 t.Fatalf("requests with no error must not be buffered during a drain. err: %v retryDone: %v", err, retryDone) 400 } 401 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, shard, failoverErr); err != nil || retryDone != nil { 402 t.Fatalf("requests with failover errors must not be buffered during a drain. err: %v retryDone: %v", err, retryDone) 403 } 404 405 // Finish draining by telling the buffer that the retry is done. 406 close(markRetryDone) 407 <-stopped 408 409 // Wait for the drain to complete to avoid races with other tests. 410 if err := waitForState(b, stateIdle); err != nil { 411 t.Fatal(err) 412 } 413 if err := waitForPoolSlots(b, cfg.Size); err != nil { 414 t.Fatal(err) 415 } 416 } 417 418 // TestPassthroughIgnoredKeyspaceOrShard tests that the explicit whitelisting 419 // of keyspaces (and optionally shards) ignores entries which are not listed. 420 func TestPassthroughIgnoredKeyspaceOrShard(t *testing.T) { 421 testAllImplementations(t, testPassthroughIgnoredKeyspaceOrShard1) 422 } 423 424 func testPassthroughIgnoredKeyspaceOrShard1(t *testing.T, fail failover) { 425 cfg := NewDefaultConfig() 426 cfg.Enabled = true 427 cfg.Shards = map[string]bool{ 428 topoproto.KeyspaceShardString(keyspace, shard): true, 429 } 430 b := New(cfg) 431 432 ignoredKeyspace := "ignored_ks" 433 if retryDone, err := b.WaitForFailoverEnd(context.Background(), ignoredKeyspace, shard, failoverErr); err != nil || retryDone != nil { 434 t.Fatalf("requests for ignored keyspaces must not be buffered. err: %v retryDone: %v", err, retryDone) 435 } 436 statsKeyJoined := strings.Join([]string{ignoredKeyspace, shard, skippedDisabled}, ".") 437 if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want { 438 t.Fatalf("request was not skipped as disabled: got = %v, want = %v", got, want) 439 } 440 441 ignoredShard := "ff-" 442 if retryDone, err := b.WaitForFailoverEnd(context.Background(), keyspace, ignoredShard, failoverErr); err != nil || retryDone != nil { 443 t.Fatalf("requests for ignored shards must not be buffered. err: %v retryDone: %v", err, retryDone) 444 } 445 if err := waitForPoolSlots(b, cfg.Size); err != nil { 446 t.Fatal(err) 447 } 448 statsKeyJoined = strings.Join([]string{keyspace, ignoredShard, skippedDisabled}, ".") 449 if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want { 450 t.Fatalf("request was not skipped as disabled: got = %v, want = %v", got, want) 451 } 452 } 453 454 // TestRequestCanceled_ExplicitEnd stops the buffering because the we see the 455 // new primary. 456 func TestRequestCanceled_ExplicitEnd(t *testing.T) { 457 testAllImplementations(t, func(t *testing.T, fail failover) { 458 t.Helper() 459 testRequestCanceled(t, true, fail) 460 }) 461 } 462 463 // TestRequestCanceled_MaxDurationEnd stops the buffering because the max 464 // failover duration is reached. 465 func TestRequestCanceled_MaxDurationEnd(t *testing.T) { 466 testAllImplementations(t, func(t *testing.T, fail failover) { 467 t.Helper() 468 testRequestCanceled(t, false, fail) 469 }) 470 } 471 472 // testRequestCanceled tests the case when a buffered request is canceled 473 // (more precisively its context) before the failover/buffering ends. 474 func testRequestCanceled(t *testing.T, explicitEnd bool, fail failover) { 475 resetVariables() 476 defer checkVariables(t) 477 478 cfg := NewDefaultConfig() 479 cfg.Enabled = true 480 cfg.Keyspaces = map[string]bool{keyspace: true} 481 482 if !explicitEnd { 483 cfg.Window = 100 * time.Millisecond 484 cfg.MaxFailoverDuration = 100 * time.Millisecond 485 } 486 487 b := New(cfg) 488 489 // Buffer 2 requests. The second will be canceled and the first will be drained. 490 stopped1 := issueRequest(context.Background(), t, b, failoverErr) 491 if err := waitForRequestsInFlight(b, 1); err != nil { 492 t.Fatal(err) 493 } 494 ctx2, cancel2 := context.WithCancel(context.Background()) 495 stopped2 := issueRequest(ctx2, t, b, failoverErr) 496 if err := waitForRequestsInFlight(b, 2); err != nil { 497 t.Fatal(err) 498 } 499 500 // Cancel second request before buffering stops. 501 cancel2() 502 // Canceled request will see an error from the buffer. 503 if err := isCanceledError(<-stopped2); err != nil { 504 t.Fatal(err) 505 } 506 if err := waitForRequestsInFlight(b, 1); err != nil { 507 t.Fatal(err) 508 } 509 // Recorded max buffer usage stay at 2 although the second request was canceled. 510 if got, want := lastRequestsInFlightMax.Counts()[statsKeyJoined], int64(2); got != want { 511 t.Fatalf("wrong value for BufferRequestsInFlightMax: got = %v, want = %v", got, want) 512 } 513 514 if explicitEnd { 515 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 516 } 517 518 // Failover will end eventually. 519 if err := waitForState(b, stateIdle); err != nil { 520 t.Fatal(err) 521 } 522 // First request must have been drained without an error. 523 if err := <-stopped1; err != nil { 524 t.Fatalf("request should have been buffered and not returned an error: %v", err) 525 } 526 527 // If buffering stopped implicitly, the explicit signal will still happen 528 // shortly after. In that case, the buffer should ignore it. 529 if !explicitEnd { 530 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 531 } 532 if err := waitForState(b, stateIdle); err != nil { 533 t.Fatal(err) 534 } 535 if err := waitForPoolSlots(b, cfg.Size); err != nil { 536 t.Fatal(err) 537 } 538 } 539 540 func TestEviction(t *testing.T) { 541 testAllImplementations(t, testEviction1) 542 } 543 544 func testEviction1(t *testing.T, fail failover) { 545 resetVariables() 546 defer checkVariables(t) 547 548 cfg := NewDefaultConfig() 549 cfg.Enabled = true 550 cfg.Shards = map[string]bool{ 551 topoproto.KeyspaceShardString(keyspace, shard): true, 552 } 553 cfg.Size = 2 554 b := New(cfg) 555 556 stopped1 := issueRequest(context.Background(), t, b, failoverErr) 557 // This wait is important because each request gets inserted asynchronously 558 // in the buffer. Usually, they end up in the correct order (1, 2), but there 559 // is a chance that it's reversed (2, 1). This wait ensures that 1 goes into 560 // the buffer first. 561 if err := waitForRequestsInFlight(b, 1); err != nil { 562 t.Fatal(err) 563 } 564 stopped2 := issueRequest(context.Background(), t, b, failoverErr) 565 if err := waitForRequestsInFlight(b, 2); err != nil { 566 t.Fatal(err) 567 } 568 569 // Third request will evict the oldest. 570 stopped3 := issueRequest(context.Background(), t, b, failoverErr) 571 572 // Evicted request will see an error from the buffer. 573 if err := isEvictedError(<-stopped1); err != nil { 574 t.Fatal(err) 575 } 576 577 // End of failover. Stop buffering. 578 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 579 580 if err := <-stopped2; err != nil { 581 t.Fatalf("request should have been buffered and not returned an error: %v", err) 582 } 583 if err := <-stopped3; err != nil { 584 t.Fatalf("request should have been buffered and not returned an error: %v", err) 585 } 586 if err := waitForState(b, stateIdle); err != nil { 587 t.Fatal(err) 588 } 589 if err := waitForPoolSlots(b, 2); err != nil { 590 t.Fatal(err) 591 } 592 } 593 594 // TestEvictionNotPossible tests the case that the buffer is a) fully in use 595 // by two failovers and b) the second failover doesn't use any slot in the 596 // buffer and therefore cannot evict older entries. 597 func TestEvictionNotPossible(t *testing.T) { 598 testAllImplementations(t, testEvictionNotPossible1) 599 } 600 601 func testEvictionNotPossible1(t *testing.T, fail failover) { 602 resetVariables() 603 defer checkVariables(t) 604 605 cfg := NewDefaultConfig() 606 cfg.Enabled = true 607 cfg.Shards = map[string]bool{ 608 topoproto.KeyspaceShardString(keyspace, shard): true, 609 topoproto.KeyspaceShardString(keyspace, shard2): true, 610 } 611 cfg.Size = 1 612 613 b := New(cfg) 614 615 // Make the buffer full (applies to all failovers). 616 // Also triggers buffering for the first shard. 617 stoppedFirstFailover := issueRequest(context.Background(), t, b, failoverErr) 618 if err := waitForRequestsInFlight(b, 1); err != nil { 619 t.Fatal(err) 620 } 621 622 // Newer requests of the second failover cannot evict anything because 623 // they have no entries buffered. 624 retryDone, bufferErr := b.WaitForFailoverEnd(context.Background(), keyspace, shard2, failoverErr) 625 if bufferErr == nil || retryDone != nil { 626 t.Fatalf("buffer should have returned an error because it's full: err: %v retryDone: %v", bufferErr, retryDone) 627 } 628 if got, want := vterrors.Code(bufferErr), vtrpcpb.Code_UNAVAILABLE; got != want { 629 t.Fatalf("wrong error code for evicted buffered request. got = %v, want = %v", got, want) 630 } 631 if got, want := bufferErr.Error(), bufferFullError.Error(); !strings.Contains(got, want) { 632 t.Fatalf("evicted buffered request should return a different error message. got = %v, want substring = %v", got, want) 633 } 634 635 // End of failover. Stop buffering. 636 fail(b, newPrimary, keyspace, shard, time.Unix(1, 0)) 637 638 if err := <-stoppedFirstFailover; err != nil { 639 t.Fatalf("request should have been buffered and not returned an error: %v", err) 640 } 641 // Wait for the failover end to avoid races. 642 if err := waitForState(b, stateIdle); err != nil { 643 t.Fatal(err) 644 } 645 if err := waitForPoolSlots(b, 1); err != nil { 646 t.Fatal(err) 647 } 648 statsKeyJoined := strings.Join([]string{keyspace, shard2, string(skippedBufferFull)}, ".") 649 if got, want := requestsSkipped.Counts()[statsKeyJoined], int64(1); got != want { 650 t.Fatalf("skipped request was not tracked: got = %v, want = %v", got, want) 651 } 652 } 653 654 func TestWindow(t *testing.T) { 655 testAllImplementations(t, testWindow1) 656 } 657 658 func testWindow1(t *testing.T, fail failover) { 659 resetVariables() 660 defer checkVariables(t) 661 662 cfg := NewDefaultConfig() 663 cfg.Enabled = true 664 cfg.Shards = map[string]bool{ 665 topoproto.KeyspaceShardString(keyspace, shard): true, 666 topoproto.KeyspaceShardString(keyspace, shard2): true, 667 } 668 cfg.Size = 1 669 cfg.Window = 1 * time.Millisecond 670 671 b := New(cfg) 672 673 // Buffer one request. 674 t.Logf("first request exceeds its window") 675 stopped1 := issueRequest(context.Background(), t, b, failoverErr) 676 677 // Let it go out of the buffering window and expire. 678 if err := <-stopped1; err != nil { 679 t.Fatalf("buffering should have stopped after exceeding the window without an error: %v", err) 680 } 681 // Verify that the window was actually exceeded. 682 if err := waitForRequestsExceededWindow(1); err != nil { 683 t.Fatal(err) 684 } 685 686 // Increase the window and buffer a request again 687 // (queue becomes not empty a second time). 688 cfg.Window = 10 * time.Minute 689 690 // This is a hack. The buffering semaphore gets released asynchronously. 691 // Sometimes the next issueRequest tries to acquire before that release 692 // and ends up failing. Waiting for the previous goroutines to exit ensures 693 // that the sema will get released. 694 b.waitForShutdown() 695 696 // This time the request does not go out of window and gets evicted by a third 697 // request instead. 698 t.Logf("second request does not exceed its window") 699 stopped2 := issueRequest(context.Background(), t, b, failoverErr) 700 if err := waitForRequestsInFlight(b, 1); err != nil { 701 t.Fatal(err) 702 } 703 704 // Third request will evict the second one. 705 t.Logf("third request evicts the second request") 706 stopped3 := issueRequest(context.Background(), t, b, failoverErr) 707 708 // Evicted request will see an error from the buffer. 709 if err := isEvictedError(<-stopped2); err != nil { 710 t.Fatal(err) 711 } 712 // Block until the third request is buffered. Avoids data race with *window. 713 if err := waitForRequestsInFlight(b, 1); err != nil { 714 t.Fatal(err) 715 } 716 717 // Verify that the window was not exceeded. 718 if got, want := requestsEvicted.Counts()[statsKeyJoinedWindowExceeded], int64(1); got != want { 719 t.Fatalf("second or third request should not have exceed its buffering window. got = %v, want = %v", got, want) 720 } 721 722 // Reduce the window again. 723 cfg.Window = 100 * time.Millisecond 724 725 // Fourth request evicts the third 726 t.Logf("fourth request exceeds its window (and evicts the third)") 727 stopped4 := issueRequest(context.Background(), t, b, failoverErr) 728 if err := isEvictedError(<-stopped3); err != nil { 729 t.Fatal(err) 730 } 731 732 // Fourth request will exceed its window and finish early. 733 if err := <-stopped4; err != nil { 734 t.Fatalf("buffering should have stopped after 10ms without an error: %v", err) 735 } 736 // Verify that the window was actually exceeded. 737 if err := waitForRequestsExceededWindow(2); err != nil { 738 t.Fatal(err) 739 } 740 741 // At this point the buffer is empty but buffering is still active. 742 // Simulate that the buffering stops because the max duration (10m) was reached. 743 b.getOrCreateBuffer(keyspace, shard).stopBufferingDueToMaxDuration() 744 // Wait for the failover end to avoid races. 745 if err := waitForState(b, stateIdle); err != nil { 746 t.Fatal(err) 747 } 748 if err := waitForPoolSlots(b, 1); err != nil { 749 t.Fatal(err) 750 } 751 } 752 753 // TestShutdown tests that Buffer.Shutdown() unblocks any pending bufferings 754 // immediately. 755 func TestShutdown(t *testing.T) { 756 testAllImplementations(t, testShutdown1) 757 } 758 759 func testShutdown1(t *testing.T, fail failover) { 760 resetVariables() 761 defer checkVariables(t) 762 763 cfg := NewDefaultConfig() 764 cfg.Enabled = true 765 b := New(cfg) 766 767 // Buffer one request. 768 stopped1 := issueRequest(context.Background(), t, b, failoverErr) 769 if err := waitForRequestsInFlight(b, 1); err != nil { 770 t.Fatal(err) 771 } 772 773 // Shutdown buffer and unblock buffered request immediately. 774 b.Shutdown() 775 776 // Request must have been drained without an error. 777 if err := <-stopped1; err != nil { 778 t.Fatalf("request should have been buffered and not returned an error: %v", err) 779 } 780 781 if err := waitForPoolSlots(b, cfg.Size); err != nil { 782 t.Fatal(err) 783 } 784 }