google.golang.org/grpc@v1.72.2/xds/internal/balancer/outlierdetection/balancer_test.go

/*
 *
 * Copyright 2022 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package outlierdetection

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	"google.golang.org/grpc"
	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/balancer/pickfirst/pickfirstleaf"
	"google.golang.org/grpc/balancer/weightedroundrobin"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/internal/balancer/stub"
	"google.golang.org/grpc/internal/channelz"
	"google.golang.org/grpc/internal/grpcsync"
	"google.golang.org/grpc/internal/grpctest"
	iserviceconfig "google.golang.org/grpc/internal/serviceconfig"
	"google.golang.org/grpc/internal/stubserver"
	"google.golang.org/grpc/internal/testutils"
	"google.golang.org/grpc/internal/testutils/roundrobin"
	"google.golang.org/grpc/peer"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/resolver/manual"
	"google.golang.org/grpc/serviceconfig"
	"google.golang.org/grpc/status"
	"google.golang.org/grpc/xds/internal/balancer/clusterimpl"

	testgrpc "google.golang.org/grpc/interop/grpc_testing"
	testpb "google.golang.org/grpc/interop/grpc_testing"
)

var (
	defaultTestTimeout      = 5 * time.Second
	defaultTestShortTimeout = 10 * time.Millisecond
)

type s struct {
	grpctest.Tester
}

func Test(t *testing.T) {
	grpctest.RunSubTests(t, s{})
}

// TestParseConfig verifies the ParseConfig() method in the Outlier Detection
// Balancer.
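// The default values asserted below are expected to match the outlier
// detection defaults specified in gRFC A50.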
func (s) TestParseConfig(t *testing.T) {
	const errParseConfigName = "errParseConfigBalancer"
	stub.Register(errParseConfigName, stub.BalancerFuncs{
		ParseConfig: func(json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
			return nil, errors.New("some error")
		},
	})

	parser := bb{}
	const (
		defaultInterval                       = iserviceconfig.Duration(10 * time.Second)
		defaultBaseEjectionTime               = iserviceconfig.Duration(30 * time.Second)
		defaultMaxEjectionTime                = iserviceconfig.Duration(300 * time.Second)
		defaultMaxEjectionPercent             = 10
		defaultSuccessRateStdevFactor         = 1900
		defaultEnforcingSuccessRate           = 100
		defaultSuccessRateMinimumHosts        = 5
		defaultSuccessRateRequestVolume       = 100
		defaultFailurePercentageThreshold     = 85
		defaultEnforcingFailurePercentage     = 0
		defaultFailurePercentageMinimumHosts  = 5
		defaultFailurePercentageRequestVolume = 50
	)
	tests := []struct {
		name    string
		input   string
		wantCfg serviceconfig.LoadBalancingConfig
		wantErr string
	}{
		{
			name: "no-fields-set-should-get-default",
			input: `{
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "some-top-level-fields-set",
			input: `{
				"interval": "15s",
				"maxEjectionTime": "350s",
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for unset fields.
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(15 * time.Second),
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    iserviceconfig.Duration(350 * time.Second),
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-but-no-fields",
			input: `{
				"successRateEjection": {},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get defaults of success-rate-ejection struct.
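			// (stdevFactor 1900, enforcementPercentage 100, minimumHosts 5,
			// requestVolume 100, per the defaults declared above).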
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           defaultSuccessRateStdevFactor,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          defaultSuccessRateMinimumHosts,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-partially-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"minimumHosts": 5
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for others in success rate
			// ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          5,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-fully-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"enforcementPercentage": 50,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: 50,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-but-no-fields",
			input: `{
				"failurePercentageEjection": {},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get defaults of failure percentage ejection layer.
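			// (threshold 85, enforcementPercentage 0, minimumHosts 5,
			// requestVolume 50, per the defaults declared above).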
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             defaultFailurePercentageThreshold,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          defaultFailurePercentageMinimumHosts,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-partially-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"minimumHosts": 10
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for others in failure
			// percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          10,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-fully-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"enforcementPercentage": 100,
					"minimumHosts": 10,
					"requestVolume": 40
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: 100,
					MinimumHosts:          10,
					RequestVolume:         40,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{ // to make sure zero values aren't overwritten by defaults
			name: "lb-config-every-field-set-zero-value",
			input: `{
				"interval": "0s",
				"baseEjectionTime": "0s",
				"maxEjectionTime": "0s",
				"maxEjectionPercent": 0,
				"successRateEjection": {
					"stdevFactor": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"failurePercentageEjection": {
					"threshold": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				SuccessRateEjection:       &SuccessRateEjection{},
				FailurePercentageEjection: &FailurePercentageEjection{},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "lb-config-every-field-set",
			input: `{
				"interval": "10s",
				"baseEjectionTime": "30s",
				"maxEjectionTime": "300s",
				"maxEjectionPercent": 10,
				"successRateEjection": {
					"stdevFactor": 1900,
					"enforcementPercentage": 100,
					"minimumHosts": 5,
					"requestVolume": 100
				},
				"failurePercentageEjection": {
					"threshold": 85,
					"enforcementPercentage": 5,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(10 * time.Second),
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1900,
					EnforcementPercentage: 100,
					MinimumHosts:          5,
					RequestVolume:         100,
				},
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             85,
					EnforcementPercentage: 5,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name:    "interval-is-negative",
			input:   `{"interval": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.interval = -10s; must be >= 0",
		},
		{
			name:    "base-ejection-time-is-negative",
			input:   `{"baseEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.base_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-time-is-negative",
			input:   `{"maxEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-percent-is-greater-than-100",
			input:   `{"maxEjectionPercent": 150}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_percent = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-success-rate-is-greater-than-100",
			input: `{
				"successRateEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.SuccessRateEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "failure-percentage-threshold-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"threshold": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.threshold = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-failure-percentage-ejection-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "child-policy-present-but-parse-error",
			input: `{
				"childPolicy": [
					{
						"errParseConfigBalancer": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantErr: "error parsing loadBalancingConfig for policy \"errParseConfigBalancer\"",
		},
		{
			name: "no-supported-child-policy",
			input: `{
				"childPolicy": [
					{
						"doesNotExistBalancer": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantErr: "invalid loadBalancingConfig: no supported policies found",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			gotCfg, gotErr :=
				parser.ParseConfig(json.RawMessage(test.input))
			if gotErr != nil && !strings.Contains(gotErr.Error(), test.wantErr) {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if (gotErr != nil) != (test.wantErr != "") {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if test.wantErr != "" {
				return
			}
			if diff := cmp.Diff(gotCfg, test.wantCfg); diff != "" {
				t.Fatalf("ParseConfig(%v) got unexpected output, diff (-got +want): %v", string(test.input), diff)
			}
		})
	}
}

// Equal reports whether lbc is equivalent to lbc2, including their child
// policies.
func (lbc *LBConfig) Equal(lbc2 *LBConfig) bool {
	if !lbc.EqualIgnoringChildPolicy(lbc2) {
		return false
	}
	return cmp.Equal(lbc.ChildPolicy, lbc2.ChildPolicy)
}

type subConnWithState struct {
	sc    balancer.SubConn
	state balancer.SubConnState
}

func setup(t *testing.T) (*outlierDetectionBalancer, *testutils.BalancerClientConn, func()) {
	t.Helper()
	builder := balancer.Get(Name)
	if builder == nil {
		t.Fatalf("balancer.Get(%q) returned nil", Name)
	}
	tcc := testutils.NewBalancerClientConn(t)
	ch := channelz.RegisterChannel(nil, "test channel")
	t.Cleanup(func() { channelz.RemoveEntry(ch.ID) })
	odB := builder.Build(tcc, balancer.BuildOptions{ChannelzParent: ch})
	return odB.(*outlierDetectionBalancer), tcc, odB.Close
}

type emptyChildConfig struct {
	serviceconfig.LoadBalancingConfig
}

// TestChildBasicOperations tests basic operations of the Outlier Detection
// Balancer and its interaction with its child. The following scenarios are
// tested, in a step-by-step fashion:
// 1. The Outlier Detection Balancer receives its first good configuration. The
// balancer is expected to create a child and send the child its configuration.
// 2. The Outlier Detection Balancer receives new configuration that specifies
// a child's type, and the new type immediately reports READY inline. The first
// child balancer should be closed and the second child balancer should receive
// a config update.
// 3. The Outlier Detection Balancer is closed. The second child balancer
// should be closed.
func (s) TestChildBasicOperations(t *testing.T) {
	bc := emptyChildConfig{}

	ccsCh := testutils.NewChannel()
	closeCh := testutils.NewChannel()

	stub.Register(t.Name()+"child1", stub.BalancerFuncs{
		UpdateClientConnState: func(_ *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccsCh.Send(ccs.BalancerConfig)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	stub.Register(t.Name()+"child2", stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			// UpdateState inline to READY to complete the graceful switch
			// process synchronously from any UpdateClientConnState call.
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker:            &testutils.TestConstPicker{},
			})
			ccsCh.Send(nil)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	od, tcc, _ := setup(t)

	// This first config update should cause a child to be built and forwarded
	// its first update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child1",
				Config: bc,
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	cr, err := ccsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the first child balancer: %v", err)
	}
	if _, ok := cr.(emptyChildConfig); !ok {
		t.Fatalf("Received child policy config of type %T, want %T", cr, emptyChildConfig{})
	}

	// This UpdateClientConnState call should cause the first child balancer
	// to close, and a new child to be created and also forwarded its first
	// config update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: math.MaxInt64,
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child2",
				Config: emptyChildConfig{},
			},
		},
	})

	// Verify that the inline UpdateState() call from the new child eventually
	// makes its way to the ClientConn.
	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case state := <-tcc.NewStateCh:
		if state != connectivity.Ready {
			t.Fatalf("ClientConn received connectivity state %v, want %v", state, connectivity.Ready)
		}
	}

	// Verify the first child balancer closed.
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the first child balancer to be closed: %v", err)
	}
	// Verify the second child balancer received its first config update.
	if _, err = ccsCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the second child balancer: %v", err)
	}
	// Closing the Outlier Detection Balancer should close the newly created
	// child.
	od.Close()
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the second child balancer to be closed: %v", err)
	}
}

// TestUpdateAddresses tests the functionality of UpdateAddresses and any
// changes in the addresses/plurality of those addresses for a SubConn. The
// balancer is set up with two upstreams, with one of the upstreams being
// ejected. Initially, there is one SubConn for each address. The following
// scenarios are tested, in a step-by-step fashion:
// 1. The SubConn not currently ejected switches addresses to the address that
// is ejected. This should cause the SubConn to get ejected.
// 2. Update this same SubConn to multiple addresses. This should cause the
// SubConn to get unejected, as it is no longer being tracked by Outlier
// Detection at that point.
// 3. Update this same SubConn to different addresses, still multiple. This
// should be a no-op, as the SubConn is still not being tracked by Outlier
// Detection.
// 4. Update this same SubConn to a single address, which is ejected. This
// should cause the SubConn to be ejected.
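// In other words, Outlier Detection tracks a SubConn only while it has
// exactly one address; the scenarios below exercise each plurality
// transition.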
func (s) TestUpdateAddresses(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           iserviceconfig.Duration(10 * time.Second),
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          2,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Set up the system so that one address is ejected and one isn't.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 successful RPC calls on the first SubConn (the first
		// call to picker.Pick).
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{})
		}
		pi, err = picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 failed RPC calls on the second SubConn (the second call
		// to picker.Pick). Thus, when the interval timer algorithm is run,
		// the second SubConn's address should be ejected, which will allow us
		// to further test UpdateAddresses() logic.
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}
		od.intervalTimerAlgorithm()
		// Verify StateListener() got called with TRANSIENT_FAILURE for the
		// child with the address that was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw2,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}

	// Update scw1 to another address that is currently ejected. This should
	// cause scw1 to get ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})

	// Verify that the address update gets forwarded to the ClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
	case <-tcc.UpdateAddressesAddrsCh:
	}
	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
	gotSCWS, err := scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to multiple addresses. This should cause scw1 to get
	// unejected, as it is no longer being tracked by Outlier Detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address1"},
		{Addr: "address2"},
	})
	// Verify scw1 got unejected (StateListener called with recent state).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.Idle},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to a different list of multiple addresses. A change of
	// addresses in which the plurality goes from multiple to multiple should
	// be a no-op, as the address continues to be ignored by outlier
	// detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address2"},
		{Addr: "address3"},
	})
	// Verify no downstream effects.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err := scsCh.Receive(sCtx); err == nil {
		t.Fatalf("no SubConn update should have been sent (no SubConn got ejected/unejected)")
	}

	// Update scw1 back to a single address, which is ejected. This should
	// cause the SubConn to be re-ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})
	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}
}

// scwsEqual returns an error if gotSCWS does not match wantSCWS, ignoring the
// subConnWrapper's scUpdateCh field.
func scwsEqual(gotSCWS subConnWithState, wantSCWS subConnWithState) error {
	if gotSCWS.sc != wantSCWS.sc || !cmp.Equal(gotSCWS.state, wantSCWS.state, cmp.AllowUnexported(subConnWrapper{}, endpointInfo{}, balancer.SubConnState{}), cmpopts.IgnoreFields(subConnWrapper{}, "scUpdateCh")) {
		return fmt.Errorf("received SubConnState: %+v, want %+v", gotSCWS, wantSCWS)
	}
	return nil
}

// rrPicker is a test picker that round robins across its SubConns.
type rrPicker struct {
	scs  []balancer.SubConn
	next int
}

func (rrp *rrPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) {
	sc := rrp.scs[rrp.next]
	rrp.next = (rrp.next + 1) % len(rrp.scs)
	return balancer.PickResult{SubConn: sc}, nil
}

// TestDurationOfInterval tests the configured interval timer.
// The following scenarios are tested:
// 1. The Outlier Detection Balancer receives its first config. The balancer
// should configure the timer with whatever is directly specified on the
// config.
// 2. The Outlier Detection Balancer receives a subsequent config. The
// balancer should configure the timer with the configured interval minus the
// time elapsed since the previous timer started.
// 3. The Outlier Detection Balancer receives a no-op configuration. The
// balancer should not configure a timer at all.
func (s) TestDurationOfInterval(t *testing.T) {
	stub.Register(t.Name(), stub.BalancerFuncs{})

	od, _, cleanup := setup(t)
	defer func(af func(d time.Duration, f func()) *time.Timer) {
		cleanup()
		afterFunc = af
	}(afterFunc)

	durationChan := testutils.NewChannel()
	afterFunc = func(dur time.Duration, _ func()) *time.Timer {
		durationChan.Send(dur)
		return time.NewTimer(math.MaxInt64)
	}

	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(8 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	d, err := durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur := d.(time.Duration)
	// The configured duration should be 8 seconds, what the balancer was
	// configured with.
	if dur != 8*time.Second {
		t.Fatalf("configured duration should have been 8 seconds to start timer")
	}

	// Override time.Now to return time.Now() + 5 seconds. This represents 5
	// seconds already passing for the next check in UpdateClientConnState.
	defer func(n func() time.Time) {
		now = n
	}(now)
	now = func() time.Time {
		return time.Now().Add(time.Second * 5)
	}

	// UpdateClientConnState with an interval of 9 seconds. Due to 5 seconds
	// already passing (from the overridden time.Now function), this should
	// start an interval timer of ~4 seconds.
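	// 9s (configured interval) - 5s (simulated elapsed time) = ~4s remaining.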
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(9 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	d, err = durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur = d.(time.Duration)
	if dur.Seconds() < 3.5 || 4.5 < dur.Seconds() {
		t.Fatalf("configured duration should have been around 4 seconds to start timer")
	}

	// UpdateClientConnState with a no-op config. This shouldn't configure the
	// interval timer at all due to it being a no-op.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(10 * time.Second),
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// No timer should have been started.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err = durationChan.Receive(sCtx); err == nil {
		t.Fatal("No timer should have started.")
	}
}

// TestEjectUnejectSuccessRate tests the functionality of the interval timer
// algorithm when configured with SuccessRateEjection. The Outlier Detection
// Balancer will be set up with 3 SubConns, each with a different address.
// It tests the following scenarios, in a step-by-step fashion:
// 1. The three addresses each have five successes. The interval timer
// algorithm should not eject any of the addresses.
// 2. Two of the addresses have five successes, the third has five failures.
// The interval timer algorithm should eject the third address with five
// failures.
// 3. The interval timer algorithm is run at a later time past max ejection
// time. The interval timer algorithm should uneject the third address.
func (s) TestEjectUnejectSuccessRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2, scw3},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Give each of the three upstream addresses five successes. This
		// should cause none of the addresses to be ejected, as none of them
		// are outliers according to the success rate algorithm.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()

		// Verify no StateListener() call on the child, as no addresses got
		// ejected (an ejected address would cause a StateListener call).
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Since no addresses are ejected, a SubConn update should be
		// forwarded down to the child.
		od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{
			ConnectivityState: connectivity.Connecting,
		})

		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw1,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Give two of the upstream addresses five successes each, and the
		// third upstream address five failures. This should cause the address
		// with five failures to be ejected according to the success rate
		// algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		if got, want := pi.SubConn, scw3.(*subConnWrapper).SubConn; got != want {
			t.Fatalf("Unexpected SubConn chosen by picker: got %v, want %v", got, want)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// Should eject the address that always errored.
		od.intervalTimerAlgorithm()
		// Due to the address being ejected, the SubConn with that address
		// should be ejected, meaning a TRANSIENT_FAILURE connectivity state
		// gets reported to the child.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
		// Only one address should be ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Now that an address is ejected, SubConn updates for SubConns using
		// that address should not be forwarded downward. These SubConn
		// updates will be cached to update the child sometime in the future
		// when the address gets unejected.
		od.updateSubConnState(scw3.(*subConnWrapper), balancer.SubConnState{
			ConnectivityState: connectivity.Connecting,
		})
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("SubConn update should not have been forwarded (the SubConn is ejected)")
		}

		// Override now to return a time far past the max ejection time set in
		// the configuration, so that the next interval timer algorithm run
		// unejects any ejected addresses.
		defer func(n func() time.Time) {
			now = n
		}(now)
		now = func() time.Time {
			return time.Now().Add(time.Second * 1000)
		}
		od.intervalTimerAlgorithm()

		// The unejected SubConn should report the latest persisted state,
		// which is CONNECTING from earlier.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}

// TestEjectFailureRate tests the functionality of the interval timer
// algorithm when configured with FailurePercentageEjection, and also the
// functionality of no-op configuration. The Outlier Detection Balancer will
// be set up with 3 SubConns, each with a different address. It tests the
// following scenarios, in a step-by-step fashion:
// 1. The three addresses each have five successes. The interval timer
// algorithm should not eject any of the addresses.
// 2. Two of the addresses have five successes, the third has five failures.
// The interval timer algorithm should eject the third address with five
// failures.
// 3. The Outlier Detection Balancer receives a subsequent no-op config
// update. The balancer should uneject all ejected addresses.
func (s) TestEjectFailureRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			if scw1 != nil {
				// UpdateClientConnState was already called; no need to
				// recreate SubConns.
				return nil
			}
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           500,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	od.UpdateState(balancer.State{
		ConnectivityState: connectivity.Ready,
		Picker: &rrPicker{
			scs: []balancer.SubConn{scw1, scw2, scw3},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Give each upstream address five successes. This should cause none
		// of the addresses to be ejected, as none of them exceed the failure
		// percentage threshold.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Give two upstream addresses five successes each, and one upstream
		// address five failures. This should cause the address with five
		// failures to be ejected according to the failure percentage
		// algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// Should eject the address that always errored.
		od.intervalTimerAlgorithm()

		// Verify StateListener() got called with TRANSIENT_FAILURE for the
		// child with the address that was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Verify only one address got ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Upon the Outlier Detection balancer being reconfigured with a no-op
		// configuration, every ejected SubConn should be unejected.
		od.UpdateClientConnState(balancer.ClientConnState{
			ResolverState: resolver.State{
				Endpoints: []resolver.Endpoint{
					{Addresses: []resolver.Address{{Addr: "address1"}}},
					{Addresses: []resolver.Address{{Addr: "address2"}}},
					{Addresses: []resolver.Address{{Addr: "address3"}}},
				},
			},
			BalancerConfig: &LBConfig{
				Interval:           math.MaxInt64,
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name:   t.Name(),
					Config: emptyChildConfig{},
				},
			},
		})
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Idle},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}

// TestConcurrentOperations calls different operations on the balancer in
// separate goroutines to test for any race conditions and deadlocks. It also
// uses a child balancer which verifies that no operations on the child get
// called after the child balancer is closed.
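// Data races in these paths are generally only reported when the test is run
// with the -race detector; deadlocks surface as test timeouts.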
func (s) TestConcurrentOperations(t *testing.T) {
	closed := grpcsync.NewEvent()
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(*stub.BalancerData, balancer.ClientConnState) error {
			if closed.HasFired() {
				t.Error("UpdateClientConnState was called after Close(), which breaks the balancer API")
			}
			return nil
		},
		ResolverError: func(*stub.BalancerData, error) {
			if closed.HasFired() {
				t.Error("ResolverError was called after Close(), which breaks the balancer API")
			}
		},
		Close: func(*stub.BalancerData) {
			closed.Fire()
		},
		ExitIdle: func(*stub.BalancerData) {
			if closed.HasFired() {
				t.Error("ExitIdle was called after Close(), which breaks the balancer API")
			}
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			// Have both success rate and failure percentage configured to
			// step through all of the interval timer code.
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           500,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	scw1, err := od.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
	if err != nil {
		t.Fatalf("error in od.NewSubConn call: %v", err)
	}

	scw2, err := od.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{})
	if err != nil {
		t.Fatalf("error in od.NewSubConn call: %v", err)
	}

	scw3, err := od.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{})
	if err != nil {
		t.Fatalf("error in od.NewSubConn call: %v", err)
	}

	od.UpdateState(balancer.State{
		ConnectivityState: connectivity.Ready,
		Picker: &rrPicker{
			scs: []balancer.SubConn{scw2, scw3},
		},
	})

	var picker balancer.Picker
	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker = <-tcc.NewPickerCh:
	}

	finished := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-finished:
				return
			default:
			}
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				continue
			}
			// Report one success and one failure for each picked RPC,
			// feeding both the success rate and failure percentage
			// algorithms.
			pi.Done(balancer.DoneInfo{})
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
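			// Sleep a tiny amount to yield and let the other goroutines make
			// progress.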
			time.Sleep(1 * time.Nanosecond)
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-finished:
				return
			default:
			}
			od.intervalTimerAlgorithm()
		}
	}()

	// Call Outlier Detection's balancer.ClientConn operations asynchronously;
	// the API provides no guarantee that balancer.ClientConn operations are
	// called synchronously.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for {
			select {
			case <-finished:
				return
			default:
			}
			od.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw2, scw3},
				},
			})
			time.Sleep(1 * time.Nanosecond)
		}
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		od.NewSubConn([]resolver.Address{{Addr: "address4"}}, balancer.NewSubConnOptions{})
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		scw1.Shutdown()
	}()

	wg.Add(1)
	go func() {
		defer wg.Done()
		od.UpdateAddresses(scw2, []resolver.Address{{Addr: "address3"}})
	}()

	// Call balancer.Balancer methods synchronously in this goroutine,
	// upholding the balancer.Balancer API guarantee of synchronous calls.
	od.UpdateClientConnState(balancer.ClientConnState{ // This will delete addresses and flip to a no-op config.
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "address1"}}}},
		},
		BalancerConfig: &LBConfig{
			Interval: math.MaxInt64,
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// Call balancer.Balancer methods synchronously in this goroutine,
	// upholding the balancer.Balancer API guarantee.
	od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{
		ConnectivityState: connectivity.Connecting,
	})
	od.ResolverError(errors.New("some error"))
	od.ExitIdle()
	od.Close()
	close(finished)
	wg.Wait()
}

// Test verifies that outlier detection doesn't eject subchannels created by
// the new pickfirst balancer when pickfirst is a non-leaf policy, i.e. not
// under a petiole policy. When pickfirst is not under a petiole policy, it
// will not register a health listener. pickfirst will still set the address
// attribute to disable ejection through the raw connectivity listener. When
// Outlier Detection processes a health update and sees the health listener is
// enabled but a health listener is not registered, it will drop the ejection
// update.
func (s) TestPickFirstHealthListenerDisabled(t *testing.T) {
	backend := &stubserver.StubServer{
		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
			return nil, errors.New("some error")
		},
	}
	if err := backend.StartServer(); err != nil {
		t.Fatalf("Failed to start backend: %v", err)
	}
	defer backend.Stop()
	t.Logf("Started bad TestService backend at: %q", backend.Address)

	// The interval is intentionally kept very large; the interval algorithm
	// will be triggered manually.
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(500 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name: pickfirstleaf.Name,
		},
	}

	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.Data.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.Data.(balancer.Balancer).Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
	}
	cc, err := grpc.NewClient(backend.Address, opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	testServiceClient := testgrpc.NewTestServiceClient(cc)
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Failing requests should not cause ejection.
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})

	// Run the interval algorithm.
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od := <-lbChan:
		od.intervalTimerAlgorithm()
	}

	shortCtx, shortCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
	defer shortCancel()
	testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Ready)
}

// Tests handling of endpoints with multiple addresses. The test creates two
// endpoints, each with two addresses. The first endpoint has a backend that
// always returns errors. The test verifies that the first endpoint is ejected
// after running the intervalTimerAlgorithm. It then stops the unhealthy
// backend and verifies that the second backend in the first endpoint is
// dialed, but doesn't receive requests due to the endpoint's ejection status.
// Next, it stops the connected backend in the second endpoint and verifies
// that requests start going to the second address in the second endpoint.
// Finally, it reduces the ejection time and runs the intervalTimerAlgorithm
// again, verifying that the first endpoint is unejected and requests reach
// both endpoints.
// Tests handling of endpoints with multiple addresses. The test creates two
// endpoints, each with two addresses; the first endpoint has a backend that
// always returns errors. It verifies that the first endpoint is ejected after
// running the intervalTimerAlgorithm. It then stops the unhealthy backend and
// verifies that the second backend in the first endpoint is dialed but
// receives no requests due to the endpoint's ejected status. Next, it stops
// the connected backend in the second endpoint and verifies that requests
// start going to the second address in that endpoint. Finally, it zeroes the
// ejection times, runs the intervalTimerAlgorithm again, and verifies that
// the first endpoint is unejected and requests reach both endpoints.
func (s) TestMultipleAddressesPerEndpoint(t *testing.T) {
	unhealthyBackend := &stubserver.StubServer{
		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
			return nil, errors.New("some error")
		},
	}
	if err := unhealthyBackend.StartServer(); err != nil {
		t.Fatalf("Failed to start backend: %v", err)
	}
	defer unhealthyBackend.Stop()
	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)

	healthyBackends := make([]*stubserver.StubServer, 3)
	for i := 0; i < 3; i++ {
		healthyBackends[i] = stubserver.StartTestService(t, nil)
		defer healthyBackends[i].Stop()
	}

	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
	if err != nil {
		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
	}
	// The interval is intentionally kept very large; the interval algorithm
	// will be triggered manually.
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name:   weightedroundrobin.Name,
			Config: wrrCfg,
		},
	}

	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.Data.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.Data.(balancer.Balancer).Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)
	r := manual.NewBuilderWithScheme("whatever")
	endpoints := []resolver.Endpoint{
		{
			Addresses: []resolver.Address{
				{Addr: unhealthyBackend.Address},
				{Addr: healthyBackends[0].Address},
			},
		},
		{
			Addresses: []resolver.Address{
				{Addr: healthyBackends[1].Address},
				{Addr: healthyBackends[2].Address},
			},
		},
	}

	r.InitialState(resolver.State{
		Endpoints: endpoints,
	})
	dialer := testutils.NewBlockingDialer()
	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
		grpc.WithResolvers(r),
		grpc.WithContextDialer(dialer.DialContext),
	}
	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	client := testgrpc.NewTestServiceClient(cc)
	client.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Wait until both endpoints start receiving requests.
	addrsSeen := map[string]bool{}
	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
		var peer peer.Peer
		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
		addrsSeen[peer.String()] = true
	}

	if len(addrsSeen) < 2 {
		t.Fatal("Context timed out waiting for requests to reach both endpoints.")
	}

	// Make 2 requests to each endpoint and verify the first endpoint gets
	// ejected.
	for i := 0; i < 2*len(endpoints); i++ {
		client.EmptyCall(ctx, &testpb.Empty{})
	}
	var od *outlierDetectionBalancer
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od = <-lbChan:
	}
	od.intervalTimerAlgorithm()

	// The first endpoint should be ejected; requests should only go to
	// endpoints[1].
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
	}

	// Shut down the unhealthy backend. The second address in the endpoint
	// should be connected, but it should remain ejected by outlier detection.
	hold := dialer.Hold(healthyBackends[0].Address)
	unhealthyBackend.Stop()
	if !hold.Wait(ctx) {
		t.Fatalf("Timed out waiting for second address in endpoint[0] with address %q to be contacted", healthyBackends[0].Address)
	}
	hold.Resume()

	// Verify requests go only to healthyBackends[1] for a short time.
	shortCtx, cancel := context.WithTimeout(ctx, defaultTestShortTimeout)
	defer cancel()
	for ; shortCtx.Err() == nil; <-time.After(time.Millisecond) {
		var peer peer.Peer
		if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
			if status.Code(err) != codes.DeadlineExceeded {
				t.Fatalf("EmptyCall() returned unexpected error %v", err)
			}
			break
		}
		if got, want := peer.Addr.String(), healthyBackends[1].Address; got != want {
			t.Fatalf("EmptyCall() went to unexpected backend: got %q, want %q", got, want)
		}
	}

	// Shut down the connected backend in endpoints[1]; requests should start
	// going to the second address in the same endpoint.
	healthyBackends[1].Stop()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[1]}); err != nil {
		t.Fatalf("RPCs didn't go to second address in the second endpoint: %v", err)
	}

	// Zero the ejection times and run the interval algorithm again; it should
	// uneject endpoints[0].
	odCfg.MaxEjectionTime = 0
	odCfg.BaseEjectionTime = 0
	<-time.After(time.Millisecond)
	r.UpdateState(resolver.State{Endpoints: endpoints})
	od.intervalTimerAlgorithm()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[1]}); err != nil {
		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
	}
}
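
// Zeroing BaseEjectionTime and MaxEjectionTime above forces unejection on the
// next interval pass: per gRFC A50, an endpoint is unejected once it has been
// ejected longer than a duration derived from those knobs and the number of
// times it has been ejected. An illustrative sketch of that timing rule (not
// the balancer's actual code); it also shows why the test waits a millisecond
// before re-running the interval algorithm:
func shouldUneject(ejectedAt, now time.Time, timesEjected int64, base, maxEj time.Duration) bool {
	backoff := time.Duration(timesEjected) * base // repeat offenders stay ejected longer
	limit := base
	if maxEj > limit {
		limit = maxEj
	}
	if backoff > limit {
		backoff = limit // never stay ejected longer than the configured cap
	}
	// With base == maxEj == 0, this is true as soon as any time has passed.
	return now.After(ejectedAt.Add(backoff))
}
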
// Tests that removing an address from an endpoint resets its ejection state.
// The test creates two endpoints, each with two addresses; the first endpoint
// has a backend that always returns errors. It verifies that the first
// endpoint is ejected after running the intervalTimerAlgorithm, then sends a
// resolver update that removes the first address of the ejected endpoint and
// verifies that requests start reaching the remaining address of the first
// endpoint.
func (s) TestEjectionStateResetsWhenEndpointAddressesChange(t *testing.T) {
	unhealthyBackend := &stubserver.StubServer{
		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
			return nil, errors.New("some error")
		},
	}
	if err := unhealthyBackend.StartServer(); err != nil {
		t.Fatalf("Failed to start backend: %v", err)
	}
	defer unhealthyBackend.Stop()
	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)

	healthyBackends := make([]*stubserver.StubServer, 3)
	for i := 0; i < 3; i++ {
		healthyBackends[i] = stubserver.StartTestService(t, nil)
		defer healthyBackends[i].Stop()
	}

	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
	if err != nil {
		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
	}
	// The interval is intentionally kept very large; the interval algorithm
	// will be triggered manually.
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name:   weightedroundrobin.Name,
			Config: wrrCfg,
		},
	}

	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.Data.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.Data.(balancer.Balancer).Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)
	r := manual.NewBuilderWithScheme("whatever")
	endpoints := []resolver.Endpoint{
		{
			Addresses: []resolver.Address{
				{Addr: unhealthyBackend.Address},
				{Addr: healthyBackends[0].Address},
			},
		},
		{
			Addresses: []resolver.Address{
				{Addr: healthyBackends[1].Address},
				{Addr: healthyBackends[2].Address},
			},
		},
	}

	r.InitialState(resolver.State{
		Endpoints: endpoints,
	})
	dialer := testutils.NewBlockingDialer()
	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
		grpc.WithResolvers(r),
		grpc.WithContextDialer(dialer.DialContext),
	}
	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	client := testgrpc.NewTestServiceClient(cc)
	client.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Wait until both endpoints start receiving requests.
	addrsSeen := map[string]bool{}
	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
		var peer peer.Peer
		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
		addrsSeen[peer.String()] = true
	}

	if len(addrsSeen) < 2 {
		t.Fatal("Context timed out waiting for requests to reach both endpoints.")
	}

	// Make 2 requests to each endpoint and verify the first endpoint gets
	// ejected.
	for i := 0; i < 2*len(endpoints); i++ {
		client.EmptyCall(ctx, &testpb.Empty{})
	}
	var od *outlierDetectionBalancer
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od = <-lbChan:
	}
	od.intervalTimerAlgorithm()

	// The first endpoint should be ejected; requests should only go to
	// endpoints[1].
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
	}

	// Remove the first address from the first endpoint. This makes the first
	// endpoint a new endpoint for outlier detection, resetting its ejection
	// status.
	r.UpdateState(resolver.State{Endpoints: []resolver.Endpoint{
		{Addresses: []resolver.Address{endpoints[0].Addresses[1]}},
		endpoints[1],
	}})
	od.intervalTimerAlgorithm()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the expected addresses of both endpoints: %v", err)
	}
}
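
// The reset verified above falls out of how per-endpoint ejection state is
// tracked: if the bookkeeping is keyed by the endpoint's set of addresses,
// any change to that set produces a new key and therefore fresh, un-ejected
// state. A hypothetical sketch of such keying; the real balancer's
// bookkeeping is more involved:
func endpointKey(ep resolver.Endpoint) string {
	addrs := make([]string, 0, len(ep.Addresses))
	for _, a := range ep.Addresses {
		addrs = append(addrs, a.Addr)
	}
	// Joining the raw addresses is enough for illustration; a production
	// implementation would also need an order-independent representation.
	return strings.Join(addrs, ";")
}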