google.golang.org/grpc@v1.74.2/xds/internal/balancer/outlierdetection/balancer_test.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package outlierdetection 20 21 import ( 22 "context" 23 "encoding/json" 24 "errors" 25 "fmt" 26 "math" 27 "strings" 28 "sync" 29 "testing" 30 "time" 31 32 "github.com/google/go-cmp/cmp" 33 "github.com/google/go-cmp/cmp/cmpopts" 34 "google.golang.org/grpc" 35 "google.golang.org/grpc/balancer" 36 "google.golang.org/grpc/balancer/pickfirst/pickfirstleaf" 37 "google.golang.org/grpc/balancer/weightedroundrobin" 38 "google.golang.org/grpc/codes" 39 "google.golang.org/grpc/connectivity" 40 "google.golang.org/grpc/credentials/insecure" 41 "google.golang.org/grpc/internal/balancer/stub" 42 "google.golang.org/grpc/internal/channelz" 43 "google.golang.org/grpc/internal/grpcsync" 44 "google.golang.org/grpc/internal/grpctest" 45 iserviceconfig "google.golang.org/grpc/internal/serviceconfig" 46 "google.golang.org/grpc/internal/stubserver" 47 "google.golang.org/grpc/internal/testutils" 48 "google.golang.org/grpc/internal/testutils/roundrobin" 49 "google.golang.org/grpc/peer" 50 "google.golang.org/grpc/resolver" 51 "google.golang.org/grpc/resolver/manual" 52 "google.golang.org/grpc/serviceconfig" 53 "google.golang.org/grpc/status" 54 "google.golang.org/grpc/xds/internal/balancer/clusterimpl" 55 56 testgrpc "google.golang.org/grpc/interop/grpc_testing" 57 testpb 
"google.golang.org/grpc/interop/grpc_testing"
)

var (
	defaultTestTimeout      = 5 * time.Second
	defaultTestShortTimeout = 10 * time.Millisecond
)

type s struct {
	grpctest.Tester
}

func Test(t *testing.T) {
	grpctest.RunSubTests(t, s{})
}

// TestParseConfig verifies the ParseConfig() method in the Outlier Detection
// Balancer.
func (s) TestParseConfig(t *testing.T) {
	// Register a stub child balancer whose ParseConfig always fails, to
	// exercise the child-policy parse-error path below.
	const errParseConfigName = "errParseConfigBalancer"
	stub.Register(errParseConfigName, stub.BalancerFuncs{
		ParseConfig: func(json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
			return nil, errors.New("some error")
		},
	})

	parser := bb{}
	// Expected defaults for every outlier detection knob when the service
	// config leaves it unset.
	const (
		defaultInterval                       = iserviceconfig.Duration(10 * time.Second)
		defaultBaseEjectionTime               = iserviceconfig.Duration(30 * time.Second)
		defaultMaxEjectionTime                = iserviceconfig.Duration(300 * time.Second)
		defaultMaxEjectionPercent             = 10
		defaultSuccessRateStdevFactor         = 1900
		defaultEnforcingSuccessRate           = 100
		defaultSuccessRateMinimumHosts        = 5
		defaultSuccessRateRequestVolume       = 100
		defaultFailurePercentageThreshold     = 85
		defaultEnforcingFailurePercentage     = 0
		defaultFailurePercentageMinimumHosts  = 5
		defaultFailurePercentageRequestVolume = 50
	)
	tests := []struct {
		name    string
		input   string
		wantCfg serviceconfig.LoadBalancingConfig
		wantErr string
	}{
		{
			name: "no-fields-set-should-get-default",
			input: `{
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "some-top-level-fields-set",
			input: `{
				"interval": "15s",
				"maxEjectionTime": "350s",
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for unset fields.
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(15 * time.Second),
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    iserviceconfig.Duration(350 * time.Second),
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-but-no-fields",
			input: `{
				"successRateEjection": {},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get defaults of success-rate-ejection struct.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           defaultSuccessRateStdevFactor,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          defaultSuccessRateMinimumHosts,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-partially-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"minimumHosts": 5
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for others in success rate
			// ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          5,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-fully-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"enforcementPercentage": 50,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: 50,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-but-no-fields",
			input: `{
				"failurePercentageEjection": {},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get defaults of failure percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             defaultFailurePercentageThreshold,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          defaultFailurePercentageMinimumHosts,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-partially-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"minimumHosts": 10
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			// Should get set fields + defaults for others in failure
			// percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          10,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-fully-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"enforcementPercentage": 100,
					"minimumHosts": 10,
					"requestVolume": 40
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: 100,
					MinimumHosts:          10,
					RequestVolume:         40,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{ // to make sure zero values aren't overwritten by defaults
			name: "lb-config-every-field-set-zero-value",
			input: `{
				"interval": "0s",
				"baseEjectionTime": "0s",
				"maxEjectionTime": "0s",
				"maxEjectionPercent": 0,
				"successRateEjection": {
					"stdevFactor": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"failurePercentageEjection": {
					"threshold": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				SuccessRateEjection:       &SuccessRateEjection{},
				FailurePercentageEjection: &FailurePercentageEjection{},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "lb-config-every-field-set",
			input: `{
				"interval": "10s",
				"baseEjectionTime": "30s",
				"maxEjectionTime": "300s",
				"maxEjectionPercent": 10,
				"successRateEjection": {
					"stdevFactor": 1900,
					"enforcementPercentage": 100,
					"minimumHosts": 5,
					"requestVolume": 100
				},
				"failurePercentageEjection": {
					"threshold": 85,
					"enforcementPercentage": 5,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
					{
						"xds_cluster_impl_experimental": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(10 * time.Second),
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1900,
					EnforcementPercentage: 100,
					MinimumHosts:          5,
					RequestVolume:         100,
				},
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             85,
					EnforcementPercentage: 5,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name:    "interval-is-negative",
			input:   `{"interval": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.interval = -10s; must be >= 0",
		},
		{
			name:    "base-ejection-time-is-negative",
			input:   `{"baseEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.base_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-time-is-negative",
			input:   `{"maxEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-percent-is-greater-than-100",
			input:   `{"maxEjectionPercent": 150}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_percent = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-success-rate-is-greater-than-100",
			input: `{
				"successRateEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.SuccessRateEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "failure-percentage-threshold-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"threshold": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.threshold = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-failure-percentage-ejection-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "child-policy-present-but-parse-error",
			input: `{
				"childPolicy": [
					{
						"errParseConfigBalancer": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantErr: "error parsing loadBalancingConfig for policy \"errParseConfigBalancer\"",
		},
		{
			name: "no-supported-child-policy",
			input: `{
				"childPolicy": [
					{
						"doesNotExistBalancer": {
							"cluster": "test_cluster"
						}
					}
				]
			}`,
			wantErr: "invalid loadBalancingConfig: no supported policies found",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			gotCfg, gotErr := parser.ParseConfig(json.RawMessage(test.input))
			// An error is acceptable only if it contains the expected
			// substring; a nil/non-nil mismatch with wantErr is a failure
			// either way.
			if gotErr != nil && !strings.Contains(gotErr.Error(), test.wantErr) {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if (gotErr != nil) != (test.wantErr != "") {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if test.wantErr != "" {
				return
			}
			if diff := cmp.Diff(gotCfg, test.wantCfg); diff != "" {
				t.Fatalf("parseConfig(%v) got unexpected output, diff (-got +want): %v", string(test.input), diff)
			}
		})
	}
}

// Equal reports whether two LBConfigs are equal, comparing the child policy
// configuration in addition to the outlier detection fields. It is picked up
// by cmp.Equal/cmp.Diff in these tests.
func (lbc *LBConfig) Equal(lbc2 *LBConfig) bool {
	if !lbc.EqualIgnoringChildPolicy(lbc2) {
		return false
	}
	return cmp.Equal(lbc.ChildPolicy, lbc2.ChildPolicy)
}

// subConnWithState pairs a SubConn with a state update received for it, for
// sending over test channels.
type subConnWithState struct {
	sc    balancer.SubConn
	state balancer.SubConnState
}

// setup builds an outlier detection balancer on top of a
// testutils.BalancerClientConn, and returns the balancer, the test ClientConn,
// and a function that closes the balancer.
func setup(t *testing.T) (*outlierDetectionBalancer, *testutils.BalancerClientConn, func()) {
	t.Helper()
	builder := balancer.Get(Name)
	if builder == nil {
		t.Fatalf("balancer.Get(%q) returned nil", Name)
	}
	tcc := testutils.NewBalancerClientConn(t)
	// Register a channelz channel as the parent so the balancer can log
	// against it; removed again when the test ends.
	ch := channelz.RegisterChannel(nil, "test channel")
	t.Cleanup(func() { channelz.RemoveEntry(ch.ID) })
	odB := builder.Build(tcc, balancer.BuildOptions{ChannelzParent: ch})
	return odB.(*outlierDetectionBalancer), tcc, odB.Close
}

// emptyChildConfig is a child policy configuration with no fields, used with
// the stub balancers registered in these tests.
type emptyChildConfig struct {
	serviceconfig.LoadBalancingConfig
}

// TestChildBasicOperations tests basic operations of the Outlier Detection
// Balancer and its interaction with its child. The following scenarios are
// tested, in a step by step fashion:
// 1. The Outlier Detection Balancer receives its first good configuration. The
// balancer is expected to create a child and send the child its configuration.
// 2. The Outlier Detection Balancer receives new configuration that specifies a
// child's type, and the new type immediately reports READY inline. The first
// child balancer should be closed and the second child balancer should receive
// a config update.
// 3. The Outlier Detection Balancer is closed. The second child balancer should
// be closed.
func (s) TestChildBasicOperations(t *testing.T) {
	bc := emptyChildConfig{}

	ccsCh := testutils.NewChannel()
	closeCh := testutils.NewChannel()

	stub.Register(t.Name()+"child1", stub.BalancerFuncs{
		UpdateClientConnState: func(_ *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccsCh.Send(ccs.BalancerConfig)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	stub.Register(t.Name()+"child2", stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			// UpdateState inline to READY to complete graceful switch process
			// synchronously from any UpdateClientConnState call.
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker:            &testutils.TestConstPicker{},
			})
			ccsCh.Send(nil)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	od, tcc, _ := setup(t)

	// This first config update should cause a child to be built and forwarded
	// its first update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child1",
				Config: bc,
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	cr, err := ccsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the first child balancer: %v", err)
	}
	if _, ok := cr.(emptyChildConfig); !ok {
		t.Fatalf("Received child policy config of type %T, want %T", cr, emptyChildConfig{})
	}

	// This Update Client Conn State call should cause the first child balancer
	// to close, and a new child to be created and also forwarded its first
	// config update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: math.MaxInt64,
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child2",
				Config: emptyChildConfig{},
			},
		},
	})

	// Verify inline UpdateState() call from the new child eventually makes its
	// way to the Test Client Conn.
	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case state := <-tcc.NewStateCh:
		if state != connectivity.Ready {
			t.Fatalf("ClientConn received connectivity state %v, want %v", state, connectivity.Ready)
		}
	}

	// Verify the first child balancer closed.
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the first child balancer to be closed: %v", err)
	}
	// Verify the second child balancer received its first config update.
	if _, err = ccsCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the second child balancer: %v", err)
	}
	// Closing the Outlier Detection Balancer should close the newly created
	// child.
	od.Close()
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the second child balancer to be closed: %v", err)
	}
}

// TestUpdateAddresses tests the functionality of UpdateAddresses and any
// changes in the addresses/plurality of those addresses for a SubConn. The
// Balancer is set up with two upstreams, with one of the upstreams being
// ejected. Initially, there is one SubConn for each address. The following
// scenarios are tested, in a step by step fashion:
// 1. The SubConn not currently ejected switches addresses to the address that
// is ejected. This should cause the SubConn to get ejected.
// 2. Update this same SubConn to multiple addresses. This should cause the
// SubConn to get unejected, as it is no longer being tracked by Outlier
// Detection at that point.
// 3. Update this same SubConn to different addresses, still multiple. This
// should be a noop, as the SubConn is still no longer being tracked by Outlier
// Detection.
// 4. Update this same SubConn to a single address which is ejected. This
// should cause the SubConn to be ejected.
func (s) TestUpdateAddresses(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2 balancer.SubConn
	var err error
	connectivityCh := make(chan struct{})
	// Stub child that creates one SubConn per address and reports READY with a
	// round robin picker over both SubConns.
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw1.Connect()
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) {
					// Signals the test once the second SubConn reaches READY.
					if state.ConnectivityState == connectivity.Ready {
						close(connectivityCh)
					}
				},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2.Connect()
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           iserviceconfig.Duration(10 * time.Second),
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          2,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Transition SubConns to READY so that they can register a health listener.
	for range 2 {
		select {
		case <-ctx.Done():
			t.Fatalf("Timed out waiting for creation of new SubConn.")
		case sc := <-tcc.NewSubConnCh:
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
		}
	}

	// Register health listeners after all the connectivity updates are
	// processed to avoid data races while accessing the health listener within
	// the TestClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("Context timed out waiting for all SubConns to become READY.")
	case <-connectivityCh:
	}

	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
	})
	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
	})

	// Setup the system to where one address is ejected and one address
	// isn't.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 successful RPC calls on the first SubConn (the first call
		// to picker.Pick).
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{})
		}
		pi, err = picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 failed RPC calls on the second SubConn (the second call to
		// picker.Pick). Thus, when the interval timer algorithm is run, the
		// second SubConn's address should be ejected, which will allow us to
		// further test UpdateAddresses() logic.
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}
		od.intervalTimerAlgorithm()
		// verify StateListener() got called with TRANSIENT_FAILURE for child
		// with address that was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw2,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}

	// Update scw1 to another address that is currently ejected. This should
	// cause scw1 to get ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})

	// Verify that update addresses gets forwarded to ClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
	case <-tcc.UpdateAddressesAddrsCh:
	}
	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
	gotSCWS, err := scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to multiple addresses. This should cause scw1 to get
	// unejected, as it is no longer being tracked for Outlier Detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address1"},
		{Addr: "address2"},
	})
	// Verify scw1 got unejected (StateListener called with recent state).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to a different multiple addresses list. A change of addresses
	// in which the plurality goes from multiple to multiple should be a no-op,
	// as the address continues to be ignored by outlier detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address2"},
		{Addr: "address3"},
	})
	// Verify no downstream effects.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err := scsCh.Receive(sCtx); err == nil {
		t.Fatalf("no SubConn update should have been sent (no SubConn got ejected/unejected)")
	}

	// Update scw1 back to a single address, which is ejected. This should cause
	// the SubConn to be re-ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})
	// Verify scw1 got ejected (StateListener called with TRANSIENT FAILURE).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}
}

// scwsEqual returns a non-nil error if the two subConnWithState values differ
// in either the SubConn identity or the reported state. The comparison ignores
// the unexported scUpdateCh field of subConnWrapper.
func scwsEqual(gotSCWS subConnWithState, wantSCWS subConnWithState) error {
	if gotSCWS.sc != wantSCWS.sc || !cmp.Equal(gotSCWS.state, wantSCWS.state, cmp.AllowUnexported(subConnWrapper{}, endpointInfo{}, balancer.SubConnState{}), cmpopts.IgnoreFields(subConnWrapper{}, "scUpdateCh")) {
		return fmt.Errorf("received SubConnState: %+v, want %+v", gotSCWS, wantSCWS)
	}
	return nil
}

// rrPicker is a test picker that round robins across its SubConns.
type rrPicker struct {
	scs  []balancer.SubConn
	next int
}

// Pick returns the next SubConn in round robin order.
func (rrp *rrPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) {
	sc := rrp.scs[rrp.next]
	rrp.next = (rrp.next + 1) % len(rrp.scs)
	return balancer.PickResult{SubConn: sc}, nil
}

// TestDurationOfInterval tests the configured interval timer.
// The following scenarios are tested:
// 1. The Outlier Detection Balancer receives its first config. The balancer
// should configure the timer with whatever is directly specified on the config.
// 2. The Outlier Detection Balancer receives a subsequent config. The balancer
// should configure with whatever interval is configured minus the difference
// between the current time and the previous start timestamp.
// 3. The Outlier Detection Balancer receives a no-op configuration. The
// balancer should not configure a timer at all.
func (s) TestDurationOfInterval(t *testing.T) {
	stub.Register(t.Name(), stub.BalancerFuncs{})

	od, _, cleanup := setup(t)
	// Restore the package-level afterFunc hook (overridden below) when the
	// test finishes.
	defer func(af func(d time.Duration, f func()) *time.Timer) {
		cleanup()
		afterFunc = af
	}(afterFunc)

	// Capture the duration the balancer configures its interval timer with,
	// without ever letting that timer actually fire.
	durationChan := testutils.NewChannel()
	afterFunc = func(dur time.Duration, _ func()) *time.Timer {
		durationChan.Send(dur)
		return time.NewTimer(math.MaxInt64)
	}

	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(8 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	d, err := durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur := d.(time.Duration)
	// The configured duration should be 8 seconds - what the balancer was
	// configured with.
	if dur != 8*time.Second {
		t.Fatalf("configured duration should have been 8 seconds to start timer")
	}

	// Override time.Now to time.Now() + 5 seconds. This will represent 5
	// seconds already passing for the next check in UpdateClientConnState.
	defer func(n func() time.Time) {
		now = n
	}(now)
	now = func() time.Time {
		return time.Now().Add(time.Second * 5)
	}

	// UpdateClientConnState with an interval of 9 seconds. Due to 5 seconds
	// already passing (from overridden time.Now function), this should start an
	// interval timer of ~4 seconds.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(9 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	d, err = durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur = d.(time.Duration)
	// Allow some slack around the expected ~4s since real wall-clock time
	// elapses between the two updates.
	if dur.Seconds() < 3.5 || 4.5 < dur.Seconds() {
		t.Fatalf("configured duration should have been around 4 seconds to start timer")
	}

	// UpdateClientConnState with a no-op config. This shouldn't configure the
	// interval timer at all due to it being a no-op.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(10 * time.Second),
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// No timer should have been started.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err = durationChan.Receive(sCtx); err == nil {
		t.Fatal("No timer should have started.")
	}
}

// TestEjectUnejectSuccessRate tests the functionality of the interval timer
// algorithm when configured with SuccessRateEjection. The Outlier Detection
// Balancer will be set up with 3 SubConns, each with a different address.
// It tests the following scenarios, in a step by step fashion:
// 1. The three addresses each have 5 successes. The interval timer algorithm should
// not eject any of the addresses.
// 2. Two of the addresses have 5 successes, the third has five failures.
// The interval timer algorithm should eject the third address with five failures.
// 3. The interval timer algorithm is run at a later time past max ejection
// time. The interval timer algorithm should uneject the third address.
func (s) TestEjectUnejectSuccessRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	// Closed once scw3 (the last SubConn created) reports READY, signaling
	// that all connectivity updates have been processed.
	connectivityCh := make(chan struct{})
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw1.Connect()
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2.Connect()
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) {
					if state.ConnectivityState == connectivity.Ready {
						close(connectivityCh)
					}
				},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3.Connect()
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2, scw3},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Transition the SubConns to READY so that they can register health
	// listeners.
	for range 3 {
		select {
		case <-ctx.Done():
			t.Fatalf("Timed out waiting for creation of new SubConn.")
		case sc := <-tcc.NewSubConnCh:
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
		}
	}

	// Register health listeners after all the connectivity updates are
	// processed to avoid data races while accessing the health listener within
	// the TestClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("Context timed out waiting for all SubConns to become READY.")
	case <-connectivityCh:
	}

	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
	})
	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
	})
	scw3.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw3, state: healthState})
	})

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Set each of the three upstream addresses to have five successes each.
		// This should cause none of the addresses to be ejected as none of them
		// are outliers according to the success rate algorithm.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()

		// verify no StateListener() call on the child, as no addresses got
		// ejected (ejected address will cause an StateListener call).
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Since no addresses are ejected, a SubConn update should forward down
		// to the child.
		od.scUpdateCh.Put(&scHealthUpdate{
			scw: scw1.(*subConnWrapper),
			state: balancer.SubConnState{
				ConnectivityState: connectivity.Connecting,
			}},
		)

		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw1,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Set two of the upstream addresses to have five successes each, and
		// one of the upstream addresses to have five failures. This should
		// cause the address which has five failures to be ejected according to
		// the SuccessRateAlgorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		// The round robin picker guarantees the third pick lands on scw3.
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		if got, want := pi.SubConn, scw3.(*subConnWrapper).SubConn; got != want {
			t.Fatalf("Unexpected SubConn chosen by picker: got %v, want %v", got, want)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// should eject address that always errored.
		od.intervalTimerAlgorithm()
		// Due to the address being ejected, the SubConn with that address
		// should be ejected, meaning a TRANSIENT_FAILURE connectivity state
		// gets reported to the child.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
		// Only one address should be ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Now that an address is ejected, SubConn updates for SubConns using
		// that address should not be forwarded downward. These SubConn updates
		// will be cached to update the child sometime in the future when the
		// address gets unejected.
		od.scUpdateCh.Put(&scHealthUpdate{
			scw:   scw3.(*subConnWrapper),
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		})
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("SubConn update should not have been forwarded (the SubConn is ejected)")
		}

		// Override now to cause the interval timer algorithm to always uneject
		// the ejected address. This will always uneject the ejected address
		// because this time is set way past the max ejection time set in the
		// configuration, which will make the next interval timer algorithm run
		// uneject any ejected addresses.
		defer func(n func() time.Time) {
			now = n
		}(now)
		now = func() time.Time {
			return time.Now().Add(time.Second * 1000)
		}
		od.intervalTimerAlgorithm()

		// unejected SubConn should report latest persisted state - which is
		// connecting from earlier.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}

// TestEjectFailureRate tests the functionality of the interval timer algorithm
// when configured with FailurePercentageEjection, and also the functionality of
// noop configuration. The Outlier Detection Balancer will be set up with 3
// SubConns, each with a different address. It tests the following scenarios, in
// a step by step fashion:
// 1. The three addresses each have 5 successes. The interval timer algorithm
// should not eject any of the addresses.
// 2. Two of the addresses have 5 successes, the third has five failures. The
// interval timer algorithm should eject the third address with five failures.
// 3. The Outlier Detection Balancer receives a subsequent noop config update.
// The balancer should uneject all ejected addresses.
//
// NOTE(review): the LBConfig this test installs uses SuccessRateEjection
// (StdevFactor 500), not FailurePercentageEjection as the paragraph above
// says — confirm which algorithm this test is intended to exercise.
func (s) TestEjectFailureRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	connectivityCh := make(chan struct{})
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			if scw1 != nil { // UpdateClientConnState was already called, no need to recreate SubConns.
				return nil
			}
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw1.Connect()
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2.Connect()
			// scw3's listener closes connectivityCh on READY so the test
			// knows all three SubConns have processed connectivity updates.
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(scs balancer.SubConnState) {
					if scs.ConnectivityState == connectivity.Ready {
						close(connectivityCh)
					}
				},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3.Connect()
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           500,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	od.UpdateState(balancer.State{
		ConnectivityState: connectivity.Ready,
		Picker: &rrPicker{
			scs: []balancer.SubConn{scw1, scw2, scw3},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Transition the SubConns to READY so that they can register health
	// listeners.
	for range 3 {
		select {
		case <-ctx.Done():
			t.Fatal("Timed out waiting for creation of new SubConn.")
		case sc := <-tcc.NewSubConnCh:
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
		}
	}
	// Register health listeners after all the connectivity updates are
	// processed to avoid data races while accessing the health listener within
	// the TestClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("Context timed out waiting for all SubConns to become READY.")
	case <-connectivityCh:
	}

	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
	})
	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
	})
	scw3.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw3, state: healthState})
	})

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Set each upstream address to have five successes each. This should
		// cause none of the addresses to be ejected as none of them are below
		// the failure percentage threshold.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Set two upstream addresses to have five successes each, and one
		// upstream address to have five failures. This should cause the address
		// with five failures to be ejected according to the Failure Percentage
		// Algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// should eject address that always errored.
		od.intervalTimerAlgorithm()

		// verify StateListener() got called with TRANSIENT_FAILURE for child
		// in address that was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// verify only one address got ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// upon the Outlier Detection balancer being reconfigured with a noop
		// configuration, every ejected SubConn should be unejected.
		od.UpdateClientConnState(balancer.ClientConnState{
			ResolverState: resolver.State{
				Endpoints: []resolver.Endpoint{
					{Addresses: []resolver.Address{{Addr: "address1"}}},
					{Addresses: []resolver.Address{{Addr: "address2"}}},
					{Addresses: []resolver.Address{{Addr: "address3"}}},
				},
			},
			BalancerConfig: &LBConfig{
				Interval:           math.MaxInt64,
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name:   t.Name(),
					Config: emptyChildConfig{},
				},
			},
		})
		// The unejected SubConn reports its latest persisted state, which is
		// CONNECTING from before the ejection.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}

// TestConcurrentOperations calls different operations on the balancer in
// separate goroutines to test for any race conditions and deadlocks. It also
// uses a child balancer which verifies that no operations on the child get
// called after the child balancer is closed.
1507 func (s) TestConcurrentOperations(t *testing.T) { 1508 closed := grpcsync.NewEvent() 1509 stub.Register(t.Name(), stub.BalancerFuncs{ 1510 UpdateClientConnState: func(*stub.BalancerData, balancer.ClientConnState) error { 1511 if closed.HasFired() { 1512 t.Error("UpdateClientConnState was called after Close(), which breaks the balancer API") 1513 } 1514 return nil 1515 }, 1516 ResolverError: func(*stub.BalancerData, error) { 1517 if closed.HasFired() { 1518 t.Error("ResolverError was called after Close(), which breaks the balancer API") 1519 } 1520 }, 1521 Close: func(*stub.BalancerData) { 1522 closed.Fire() 1523 }, 1524 ExitIdle: func(*stub.BalancerData) { 1525 if closed.HasFired() { 1526 t.Error("ExitIdle was called after Close(), which breaks the balancer API") 1527 } 1528 }, 1529 }) 1530 1531 od, tcc, cleanup := setup(t) 1532 defer func() { 1533 cleanup() 1534 }() 1535 1536 od.UpdateClientConnState(balancer.ClientConnState{ 1537 ResolverState: resolver.State{ 1538 Endpoints: []resolver.Endpoint{ 1539 {Addresses: []resolver.Address{{Addr: "address1"}}}, 1540 {Addresses: []resolver.Address{{Addr: "address2"}}}, 1541 {Addresses: []resolver.Address{{Addr: "address3"}}}, 1542 }, 1543 }, 1544 BalancerConfig: &LBConfig{ 1545 Interval: math.MaxInt64, // so the interval will never run unless called manually in test. 
1546 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 1547 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 1548 MaxEjectionPercent: 10, 1549 SuccessRateEjection: &SuccessRateEjection{ // Have both Success Rate and Failure Percentage to step through all the interval timer code 1550 StdevFactor: 500, 1551 EnforcementPercentage: 100, 1552 MinimumHosts: 3, 1553 RequestVolume: 3, 1554 }, 1555 FailurePercentageEjection: &FailurePercentageEjection{ 1556 Threshold: 50, 1557 EnforcementPercentage: 100, 1558 MinimumHosts: 3, 1559 RequestVolume: 3, 1560 }, 1561 ChildPolicy: &iserviceconfig.BalancerConfig{ 1562 Name: t.Name(), 1563 Config: emptyChildConfig{}, 1564 }, 1565 }, 1566 }) 1567 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1568 defer cancel() 1569 1570 scw1, err := od.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{}) 1571 if err != nil { 1572 t.Fatalf("error in od.NewSubConn call: %v", err) 1573 } 1574 if err != nil { 1575 t.Fatalf("error in od.NewSubConn call: %v", err) 1576 } 1577 1578 scw2, err := od.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{}) 1579 if err != nil { 1580 t.Fatalf("error in od.NewSubConn call: %v", err) 1581 } 1582 1583 scw3, err := od.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{}) 1584 if err != nil { 1585 t.Fatalf("error in od.NewSubConn call: %v", err) 1586 } 1587 1588 od.UpdateState(balancer.State{ 1589 ConnectivityState: connectivity.Ready, 1590 Picker: &rrPicker{ 1591 scs: []balancer.SubConn{scw2, scw3}, 1592 }, 1593 }) 1594 1595 var picker balancer.Picker 1596 select { 1597 case <-ctx.Done(): 1598 t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn") 1599 case picker = <-tcc.NewPickerCh: 1600 } 1601 1602 finished := make(chan struct{}) 1603 var wg sync.WaitGroup 1604 wg.Add(1) 1605 go func() { 1606 defer wg.Done() 1607 for { 1608 select { 1609 case <-finished: 1610 
return 1611 default: 1612 } 1613 pi, err := picker.Pick(balancer.PickInfo{}) 1614 if err != nil { 1615 continue 1616 } 1617 pi.Done(balancer.DoneInfo{}) 1618 pi.Done(balancer.DoneInfo{Err: errors.New("some error")}) 1619 time.Sleep(1 * time.Nanosecond) 1620 } 1621 }() 1622 1623 wg.Add(1) 1624 go func() { 1625 defer wg.Done() 1626 for { 1627 select { 1628 case <-finished: 1629 return 1630 default: 1631 } 1632 od.intervalTimerAlgorithm() 1633 } 1634 }() 1635 1636 // call Outlier Detection's balancer.ClientConn operations asynchronously. 1637 // balancer.ClientConn operations have no guarantee from the API to be 1638 // called synchronously. 1639 wg.Add(1) 1640 go func() { 1641 defer wg.Done() 1642 for { 1643 select { 1644 case <-finished: 1645 return 1646 default: 1647 } 1648 od.UpdateState(balancer.State{ 1649 ConnectivityState: connectivity.Ready, 1650 Picker: &rrPicker{ 1651 scs: []balancer.SubConn{scw2, scw3}, 1652 }, 1653 }) 1654 time.Sleep(1 * time.Nanosecond) 1655 } 1656 }() 1657 1658 wg.Add(1) 1659 go func() { 1660 defer wg.Done() 1661 od.NewSubConn([]resolver.Address{{Addr: "address4"}}, balancer.NewSubConnOptions{}) 1662 }() 1663 1664 wg.Add(1) 1665 go func() { 1666 defer wg.Done() 1667 scw1.Shutdown() 1668 }() 1669 1670 wg.Add(1) 1671 go func() { 1672 defer wg.Done() 1673 od.UpdateAddresses(scw2, []resolver.Address{{Addr: "address3"}}) 1674 }() 1675 1676 // Call balancer.Balancers synchronously in this goroutine, upholding the 1677 // balancer.Balancer API guarantee of synchronous calls. 
1678 od.UpdateClientConnState(balancer.ClientConnState{ // This will delete addresses and flip to no op 1679 ResolverState: resolver.State{ 1680 Endpoints: []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "address1"}}}}, 1681 }, 1682 BalancerConfig: &LBConfig{ 1683 Interval: math.MaxInt64, 1684 ChildPolicy: &iserviceconfig.BalancerConfig{ 1685 Name: t.Name(), 1686 Config: emptyChildConfig{}, 1687 }, 1688 }, 1689 }) 1690 1691 // Call balancer.Balancers synchronously in this goroutine, upholding the 1692 // balancer.Balancer API guarantee. 1693 od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{ 1694 ConnectivityState: connectivity.Connecting, 1695 }) 1696 od.ResolverError(errors.New("some error")) 1697 od.ExitIdle() 1698 od.Close() 1699 close(finished) 1700 wg.Wait() 1701 } 1702 1703 // Test verifies that outlier detection doesn't eject subchannels created by 1704 // the new pickfirst balancer when pickfirst is a non-leaf policy, i.e. not 1705 // under a petiole policy. When pickfirst is not under a petiole policy, it will 1706 // not register a health listener. pickfirst will still set the address 1707 // attribute to disable ejection through the raw connectivity listener. When 1708 // Outlier Detection processes a health update and sees the health listener is 1709 // enabled but a health listener is not registered, it will drop the ejection 1710 // update. 1711 func (s) TestPickFirstHealthListenerDisabled(t *testing.T) { 1712 backend := &stubserver.StubServer{ 1713 EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { 1714 return nil, errors.New("some error") 1715 }, 1716 } 1717 if err := backend.StartServer(); err != nil { 1718 t.Fatalf("Failed to start backend: %v", err) 1719 } 1720 defer backend.Stop() 1721 t.Logf("Started bad TestService backend at: %q", backend.Address) 1722 1723 // The interval is intentionally kept very large, the interval algorithm 1724 // will be triggered manually. 
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(500 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name: pickfirstleaf.Name,
		},
	}

	// Wrap the outlier detection balancer in a stub so the test can capture
	// the *outlierDetectionBalancer and inject odCfg on every config update.
	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.ChildBalancer.Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.ChildBalancer.UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
	}
	cc, err := grpc.NewClient(backend.Address, opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	testServiceClient := testgrpc.NewTestServiceClient(cc)
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Failing request should not cause ejection.
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
	testServiceClient.EmptyCall(ctx, &testpb.Empty{})

	// Run the interval algorithm.
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od := <-lbChan:
		od.intervalTimerAlgorithm()
	}

	// The channel must remain READY: no ejection should have happened.
	shortCtx, shortCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
	defer shortCancel()
	testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Ready)
}

// Tests handling of endpoints with multiple addresses. The test creates two
// endpoints, each with two addresses. The first endpoint has a backend that
// always returns errors. The test verifies that the first endpoint is ejected
// after running the intervalTimerAlgorithm. The test stops the unhealthy
// backend and verifies that the second backend in the first endpoint is dialed
// but it doesn't receive requests due to its ejection status. The test stops
// the connected backend in the second endpoint and verifies that requests
// start going to the second address in the second endpoint. The test reduces
// the ejection interval and runs the intervalTimerAlgorithm again. The test
// verifies that the first endpoint is unejected and requests reach both
// endpoints.
func (s) TestMultipleAddressesPerEndpoint(t *testing.T) {
	// Backend whose EmptyCall always returns an error: its RPCs count as
	// failures toward outlier detection's failure-percentage algorithm.
	unhealthyBackend := &stubserver.StubServer{
		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
			return nil, errors.New("some error")
		},
	}
	if err := unhealthyBackend.StartServer(); err != nil {
		t.Fatalf("Failed to start backend: %v", err)
	}
	defer unhealthyBackend.Stop()
	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)

	// Three healthy backends; combined with the unhealthy one they form two
	// endpoints with two addresses each (see `endpoints` below).
	healthyBackends := make([]*stubserver.StubServer, 3)
	for i := 0; i < 3; i++ {
		healthyBackends[i] = stubserver.StartTestService(t, nil)
		defer healthyBackends[i].Stop()
	}

	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
	if err != nil {
		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
	}
	// The interval is intentionally kept very large, the interval algorithm
	// will be triggered manually via od.intervalTimerAlgorithm(). With a
	// request volume of 2 and a threshold of 50%, the endpoint backed by the
	// unhealthy backend is guaranteed to be ejected once the algorithm runs.
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name:   weightedroundrobin.Name,
			Config: wrrCfg,
		},
	}

	// Stub LB policy that wraps the outlier detection balancer. It captures
	// the *outlierDetectionBalancer on lbChan so the test can trigger the
	// interval algorithm manually, and injects odCfg on every ClientConn
	// state update (odCfg is mutated near the end of the test).
	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.ChildBalancer.Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.ChildBalancer.UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)
	r := manual.NewBuilderWithScheme("whatever")
	// endpoints[0] has the unhealthy backend as its first address;
	// endpoints[1] contains only healthy backends.
	endpoints := []resolver.Endpoint{
		{
			Addresses: []resolver.Address{
				{Addr: unhealthyBackend.Address},
				{Addr: healthyBackends[0].Address},
			},
		},
		{
			Addresses: []resolver.Address{
				{Addr: healthyBackends[1].Address},
				{Addr: healthyBackends[2].Address},
			},
		},
	}

	r.InitialState(resolver.State{
		Endpoints: endpoints,
	})
	// The blocking dialer lets the test pause the connection attempt to
	// healthyBackends[0] later on, to observe the failover within
	// endpoints[0].
	dialer := testutils.NewBlockingDialer()
	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
		grpc.WithResolvers(r),
		grpc.WithContextDialer(dialer.DialContext),
	}
	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	client := testgrpc.NewTestServiceClient(cc)
	// Kick the channel out of IDLE; the RPC result itself is irrelevant here
	// (it may fail if it lands on the unhealthy backend), so the error is
	// deliberately ignored.
	client.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Wait until both endpoints start receiving requests. RPC errors are
	// ignored on purpose: calls to the unhealthy backend fail, but the peer
	// is still recorded.
	addrsSeen := map[string]bool{}
	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
		var peer peer.Peer
		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
		addrsSeen[peer.String()] = true
	}

	if len(addrsSeen) < 2 {
		t.Fatalf("Context timed out waiting for requests to reach both endpoints.")
	}

	// Make 2 requests to each endpoint and verify the first endpoint gets
	// ejected. Four RPCs total satisfy the configured RequestVolume of 2 per
	// endpoint.
	for i := 0; i < 2*len(endpoints); i++ {
		client.EmptyCall(ctx, &testpb.Empty{})
	}
	var od *outlierDetectionBalancer
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od = <-lbChan:
	}
	// Run the ejection algorithm manually (Interval is effectively infinite).
	od.intervalTimerAlgorithm()

	// The first endpoint should be ejected, requests should only go to
	// endpoints[1].
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
	}

	// Shutdown the unhealthy backend. The second address in the endpoint should
	// be connected, but it should be ejected by outlier detection. The hold
	// proves the channel actually attempts healthyBackends[0] after the first
	// address goes down.
	hold := dialer.Hold(healthyBackends[0].Address)
	unhealthyBackend.Stop()
	if hold.Wait(ctx) != true {
		t.Fatalf("Timeout waiting for second address in endpoint[0] with address %q to be contacted", healthyBackends[0].Address)
	}
	hold.Resume()

	// Verify requests go only to healthyBackends[1] for a short time. Note
	// that healthyBackends[2] shares the endpoint but only one address per
	// endpoint is connected; endpoints[0] remains ejected even though its
	// second address is now reachable. A DeadlineExceeded error here means
	// the overall test context expired, which ends the check loop.
	shortCtx, cancel := context.WithTimeout(ctx, defaultTestShortTimeout)
	defer cancel()
	for ; shortCtx.Err() == nil; <-time.After(time.Millisecond) {
		var peer peer.Peer
		if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
			if status.Code(err) != codes.DeadlineExceeded {
				t.Fatalf("EmptyCall() returned unexpected error %v", err)
			}
			break
		}
		if got, want := peer.Addr.String(), healthyBackends[1].Address; got != want {
			t.Fatalf("EmptyCall() went to unexpected backend: got %q, want %q", got, want)
		}
	}

	// Shutdown the connected backend in endpoints[1], requests should start
	// going to the second address in the same endpoint.
	healthyBackends[1].Stop()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[1]}); err != nil {
		t.Fatalf("RPCs didn't go to second address in the second endpoint: %v", err)
	}

	// Reduce the ejection interval and run the interval algorithm again, it
	// should uneject endpoints[0]. The resolver update pushes the mutated
	// odCfg (via the stub's UpdateClientConnState) to the balancer; the short
	// sleep ensures the zero ejection times have elapsed before the algorithm
	// runs.
	odCfg.MaxEjectionTime = 0
	odCfg.BaseEjectionTime = 0
	<-time.After(time.Millisecond)
	r.UpdateState(resolver.State{Endpoints: endpoints})
	od.intervalTimerAlgorithm()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[1]}); err != nil {
		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
	}
}

// Tests that removing an address from an endpoint resets its ejection state.
// The test creates two endpoints, each with two addresses. The first endpoint
// has a backend that always returns errors. The test verifies that the first
// endpoint is ejected after running the intervalTimerAlgorithm. The test sends
// a resolver update that removes the first address in the ejected endpoint.
// The test verifies that requests start reaching the remaining address from
// the first endpoint.
func (s) TestEjectionStateResetsWhenEndpointAddressesChange(t *testing.T) {
	// Backend whose EmptyCall always returns an error: its RPCs count as
	// failures toward outlier detection's failure-percentage algorithm.
	unhealthyBackend := &stubserver.StubServer{
		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
			return nil, errors.New("some error")
		},
	}
	if err := unhealthyBackend.StartServer(); err != nil {
		t.Fatalf("Failed to start backend: %v", err)
	}
	defer unhealthyBackend.Stop()
	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)

	// Three healthy backends; combined with the unhealthy one they form two
	// endpoints with two addresses each (see `endpoints` below).
	healthyBackends := make([]*stubserver.StubServer, 3)
	for i := 0; i < 3; i++ {
		healthyBackends[i] = stubserver.StartTestService(t, nil)
		defer healthyBackends[i].Stop()
	}

	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
	if err != nil {
		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
	}
	// The interval is intentionally kept very large, the interval algorithm
	// will be triggered manually via od.intervalTimerAlgorithm(). With a
	// request volume of 2 and a threshold of 50%, the endpoint backed by the
	// unhealthy backend is guaranteed to be ejected once the algorithm runs.
	odCfg := &LBConfig{
		Interval:         iserviceconfig.Duration(300 * time.Second),
		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
		FailurePercentageEjection: &FailurePercentageEjection{
			Threshold:             50,
			EnforcementPercentage: 100,
			MinimumHosts:          0,
			RequestVolume:         2,
		},
		MaxEjectionPercent: 100,
		ChildPolicy: &iserviceconfig.BalancerConfig{
			Name:   weightedroundrobin.Name,
			Config: wrrCfg,
		},
	}

	// Stub LB policy that wraps the outlier detection balancer. It captures
	// the *outlierDetectionBalancer on lbChan so the test can trigger the
	// interval algorithm manually, and injects odCfg on every ClientConn
	// state update.
	lbChan := make(chan *outlierDetectionBalancer, 1)
	bf := stub.BalancerFuncs{
		Init: func(bd *stub.BalancerData) {
			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
		},
		Close: func(bd *stub.BalancerData) {
			bd.ChildBalancer.Close()
		},
		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccs.BalancerConfig = odCfg
			return bd.ChildBalancer.UpdateClientConnState(ccs)
		},
	}

	stub.Register(t.Name(), bf)
	r := manual.NewBuilderWithScheme("whatever")
	// endpoints[0] has the unhealthy backend as its first address;
	// endpoints[1] contains only healthy backends.
	endpoints := []resolver.Endpoint{
		{
			Addresses: []resolver.Address{
				{Addr: unhealthyBackend.Address},
				{Addr: healthyBackends[0].Address},
			},
		},
		{
			Addresses: []resolver.Address{
				{Addr: healthyBackends[1].Address},
				{Addr: healthyBackends[2].Address},
			},
		},
	}

	r.InitialState(resolver.State{
		Endpoints: endpoints,
	})
	dialer := testutils.NewBlockingDialer()
	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
		grpc.WithResolvers(r),
		grpc.WithContextDialer(dialer.DialContext),
	}
	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
	if err != nil {
		t.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	client := testgrpc.NewTestServiceClient(cc)
	// Kick the channel out of IDLE; the RPC result itself is irrelevant here
	// (it may fail if it lands on the unhealthy backend), so the error is
	// deliberately ignored.
	client.EmptyCall(ctx, &testpb.Empty{})
	testutils.AwaitState(ctx, t, cc, connectivity.Ready)

	// Wait until both endpoints start receiving requests. RPC errors are
	// ignored on purpose: calls to the unhealthy backend fail, but the peer
	// is still recorded.
	addrsSeen := map[string]bool{}
	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
		var peer peer.Peer
		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
		addrsSeen[peer.String()] = true
	}

	if len(addrsSeen) < 2 {
		t.Fatalf("Context timed out waiting for requests to reach both endpoints.")
	}

	// Make 2 requests to each endpoint and verify the first endpoint gets
	// ejected. Four RPCs total satisfy the configured RequestVolume of 2 per
	// endpoint.
	for i := 0; i < 2*len(endpoints); i++ {
		client.EmptyCall(ctx, &testpb.Empty{})
	}
	var od *outlierDetectionBalancer
	select {
	case <-ctx.Done():
		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
	case od = <-lbChan:
	}
	// Run the ejection algorithm manually (Interval is effectively infinite).
	od.intervalTimerAlgorithm()

	// The first endpoint should be ejected, requests should only go to
	// endpoints[1].
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
	}

	// Remove the first address from the first endpoint. This makes the first
	// endpoint a new endpoint for outlier detection, resetting its ejection
	// status, so its remaining (healthy) address should begin receiving RPCs
	// again alongside endpoints[1].
	r.UpdateState(resolver.State{Endpoints: []resolver.Endpoint{
		{Addresses: []resolver.Address{endpoints[0].Addresses[1]}},
		endpoints[1],
	}})
	od.intervalTimerAlgorithm()
	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[0]}); err != nil {
		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
	}
}