google.golang.org/grpc@v1.62.1/xds/internal/balancer/outlierdetection/balancer_test.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package outlierdetection 20 21 import ( 22 "context" 23 "encoding/json" 24 "errors" 25 "fmt" 26 "math" 27 "strings" 28 "sync" 29 "testing" 30 "time" 31 32 "github.com/google/go-cmp/cmp" 33 "github.com/google/go-cmp/cmp/cmpopts" 34 "google.golang.org/grpc/balancer" 35 "google.golang.org/grpc/connectivity" 36 "google.golang.org/grpc/internal/balancer/stub" 37 "google.golang.org/grpc/internal/channelz" 38 "google.golang.org/grpc/internal/grpcsync" 39 "google.golang.org/grpc/internal/grpctest" 40 iserviceconfig "google.golang.org/grpc/internal/serviceconfig" 41 "google.golang.org/grpc/internal/testutils" 42 "google.golang.org/grpc/resolver" 43 "google.golang.org/grpc/serviceconfig" 44 "google.golang.org/grpc/xds/internal/balancer/clusterimpl" 45 ) 46 47 var ( 48 defaultTestTimeout = 5 * time.Second 49 defaultTestShortTimeout = 10 * time.Millisecond 50 ) 51 52 type s struct { 53 grpctest.Tester 54 } 55 56 func Test(t *testing.T) { 57 grpctest.RunSubTests(t, s{}) 58 } 59 60 // TestParseConfig verifies the ParseConfig() method in the Outlier Detection 61 // Balancer. 62 func (s) TestParseConfig(t *testing.T) { 63 const errParseConfigName = "errParseConfigBalancer" 64 stub.Register(errParseConfigName, stub.BalancerFuncs{ 65 ParseConfig: func(json.RawMessage) (serviceconfig.LoadBalancingConfig, error) { 66 return nil, errors.New("some error") 67 }, 68 }) 69 70 parser := bb{} 71 const ( 72 defaultInterval = iserviceconfig.Duration(10 * time.Second) 73 defaultBaseEjectionTime = iserviceconfig.Duration(30 * time.Second) 74 defaultMaxEjectionTime = iserviceconfig.Duration(300 * time.Second) 75 defaultMaxEjectionPercent = 10 76 defaultSuccessRateStdevFactor = 1900 77 defaultEnforcingSuccessRate = 100 78 defaultSuccessRateMinimumHosts = 5 79 defaultSuccessRateRequestVolume = 100 80 defaultFailurePercentageThreshold = 85 81 defaultEnforcingFailurePercentage = 0 82 defaultFailurePercentageMinimumHosts = 5 83 defaultFailurePercentageRequestVolume = 50 84 ) 85 tests := []struct { 86 name string 87 input string 88 wantCfg serviceconfig.LoadBalancingConfig 89 wantErr string 90 }{ 91 { 92 name: "no-fields-set-should-get-default", 93 input: `{ 94 "childPolicy": [ 95 { 96 "xds_cluster_impl_experimental": { 97 "cluster": "test_cluster" 98 } 99 } 100 ] 101 }`, 102 wantCfg: &LBConfig{ 103 Interval: defaultInterval, 104 BaseEjectionTime: defaultBaseEjectionTime, 105 MaxEjectionTime: defaultMaxEjectionTime, 106 MaxEjectionPercent: defaultMaxEjectionPercent, 107 ChildPolicy: &iserviceconfig.BalancerConfig{ 108 Name: "xds_cluster_impl_experimental", 109 Config: &clusterimpl.LBConfig{ 110 Cluster: "test_cluster", 111 }, 112 }, 113 }, 114 }, 115 116 { 117 name: "some-top-level-fields-set", 118 input: `{ 119 "interval": "15s", 120 "maxEjectionTime": "350s", 121 "childPolicy": [ 122 { 123 "xds_cluster_impl_experimental": { 124 "cluster": "test_cluster" 125 } 126 } 127 ] 128 }`, 129 // Should get set fields + defaults for unset fields. 130 wantCfg: &LBConfig{ 131 Interval: iserviceconfig.Duration(15 * time.Second), 132 BaseEjectionTime: defaultBaseEjectionTime, 133 MaxEjectionTime: iserviceconfig.Duration(350 * time.Second), 134 MaxEjectionPercent: defaultMaxEjectionPercent, 135 ChildPolicy: &iserviceconfig.BalancerConfig{ 136 Name: "xds_cluster_impl_experimental", 137 Config: &clusterimpl.LBConfig{ 138 Cluster: "test_cluster", 139 }, 140 }, 141 }, 142 }, 143 { 144 name: "success-rate-ejection-present-but-no-fields", 145 input: `{ 146 "successRateEjection": {}, 147 "childPolicy": [ 148 { 149 "xds_cluster_impl_experimental": { 150 "cluster": "test_cluster" 151 } 152 } 153 ] 154 }`, 155 // Should get defaults of success-rate-ejection struct. 156 wantCfg: &LBConfig{ 157 Interval: defaultInterval, 158 BaseEjectionTime: defaultBaseEjectionTime, 159 MaxEjectionTime: defaultMaxEjectionTime, 160 MaxEjectionPercent: defaultMaxEjectionPercent, 161 SuccessRateEjection: &SuccessRateEjection{ 162 StdevFactor: defaultSuccessRateStdevFactor, 163 EnforcementPercentage: defaultEnforcingSuccessRate, 164 MinimumHosts: defaultSuccessRateMinimumHosts, 165 RequestVolume: defaultSuccessRateRequestVolume, 166 }, 167 ChildPolicy: &iserviceconfig.BalancerConfig{ 168 Name: "xds_cluster_impl_experimental", 169 Config: &clusterimpl.LBConfig{ 170 Cluster: "test_cluster", 171 }, 172 }, 173 }, 174 }, 175 { 176 name: "success-rate-ejection-present-partially-set", 177 input: `{ 178 "successRateEjection": { 179 "stdevFactor": 1000, 180 "minimumHosts": 5 181 }, 182 "childPolicy": [ 183 { 184 "xds_cluster_impl_experimental": { 185 "cluster": "test_cluster" 186 } 187 } 188 ] 189 }`, 190 // Should get set fields + defaults for others in success rate 191 // ejection layer. 192 wantCfg: &LBConfig{ 193 Interval: defaultInterval, 194 BaseEjectionTime: defaultBaseEjectionTime, 195 MaxEjectionTime: defaultMaxEjectionTime, 196 MaxEjectionPercent: defaultMaxEjectionPercent, 197 SuccessRateEjection: &SuccessRateEjection{ 198 StdevFactor: 1000, 199 EnforcementPercentage: defaultEnforcingSuccessRate, 200 MinimumHosts: 5, 201 RequestVolume: defaultSuccessRateRequestVolume, 202 }, 203 ChildPolicy: &iserviceconfig.BalancerConfig{ 204 Name: "xds_cluster_impl_experimental", 205 Config: &clusterimpl.LBConfig{ 206 Cluster: "test_cluster", 207 }, 208 }, 209 }, 210 }, 211 { 212 name: "success-rate-ejection-present-fully-set", 213 input: `{ 214 "successRateEjection": { 215 "stdevFactor": 1000, 216 "enforcementPercentage": 50, 217 "minimumHosts": 5, 218 "requestVolume": 50 219 }, 220 "childPolicy": [ 221 { 222 "xds_cluster_impl_experimental": { 223 "cluster": "test_cluster" 224 } 225 } 226 ] 227 }`, 228 wantCfg: &LBConfig{ 229 Interval: defaultInterval, 230 BaseEjectionTime: defaultBaseEjectionTime, 231 MaxEjectionTime: defaultMaxEjectionTime, 232 MaxEjectionPercent: defaultMaxEjectionPercent, 233 SuccessRateEjection: &SuccessRateEjection{ 234 StdevFactor: 1000, 235 EnforcementPercentage: 50, 236 MinimumHosts: 5, 237 RequestVolume: 50, 238 }, 239 ChildPolicy: &iserviceconfig.BalancerConfig{ 240 Name: "xds_cluster_impl_experimental", 241 Config: &clusterimpl.LBConfig{ 242 Cluster: "test_cluster", 243 }, 244 }, 245 }, 246 }, 247 { 248 name: "failure-percentage-ejection-present-but-no-fields", 249 input: `{ 250 "failurePercentageEjection": {}, 251 "childPolicy": [ 252 { 253 "xds_cluster_impl_experimental": { 254 "cluster": "test_cluster" 255 } 256 } 257 ] 258 }`, 259 // Should get defaults of failure percentage ejection layer. 260 wantCfg: &LBConfig{ 261 Interval: defaultInterval, 262 BaseEjectionTime: defaultBaseEjectionTime, 263 MaxEjectionTime: defaultMaxEjectionTime, 264 MaxEjectionPercent: defaultMaxEjectionPercent, 265 FailurePercentageEjection: &FailurePercentageEjection{ 266 Threshold: defaultFailurePercentageThreshold, 267 EnforcementPercentage: defaultEnforcingFailurePercentage, 268 MinimumHosts: defaultFailurePercentageMinimumHosts, 269 RequestVolume: defaultFailurePercentageRequestVolume, 270 }, 271 ChildPolicy: &iserviceconfig.BalancerConfig{ 272 Name: "xds_cluster_impl_experimental", 273 Config: &clusterimpl.LBConfig{ 274 Cluster: "test_cluster", 275 }, 276 }, 277 }, 278 }, 279 { 280 name: "failure-percentage-ejection-present-partially-set", 281 input: `{ 282 "failurePercentageEjection": { 283 "threshold": 80, 284 "minimumHosts": 10 285 }, 286 "childPolicy": [ 287 { 288 "xds_cluster_impl_experimental": { 289 "cluster": "test_cluster" 290 } 291 } 292 ] 293 }`, 294 // Should get set fields + defaults for others in success rate 295 // ejection layer. 296 wantCfg: &LBConfig{ 297 Interval: defaultInterval, 298 BaseEjectionTime: defaultBaseEjectionTime, 299 MaxEjectionTime: defaultMaxEjectionTime, 300 MaxEjectionPercent: defaultMaxEjectionPercent, 301 FailurePercentageEjection: &FailurePercentageEjection{ 302 Threshold: 80, 303 EnforcementPercentage: defaultEnforcingFailurePercentage, 304 MinimumHosts: 10, 305 RequestVolume: defaultFailurePercentageRequestVolume, 306 }, 307 ChildPolicy: &iserviceconfig.BalancerConfig{ 308 Name: "xds_cluster_impl_experimental", 309 Config: &clusterimpl.LBConfig{ 310 Cluster: "test_cluster", 311 }, 312 }, 313 }, 314 }, 315 { 316 name: "failure-percentage-ejection-present-fully-set", 317 input: `{ 318 "failurePercentageEjection": { 319 "threshold": 80, 320 "enforcementPercentage": 100, 321 "minimumHosts": 10, 322 "requestVolume": 40 323 }, 324 "childPolicy": [ 325 { 326 "xds_cluster_impl_experimental": { 327 "cluster": "test_cluster" 328 } 329 } 330 ] 331 }`, 332 wantCfg: &LBConfig{ 333 Interval: defaultInterval, 334 BaseEjectionTime: defaultBaseEjectionTime, 335 MaxEjectionTime: defaultMaxEjectionTime, 336 MaxEjectionPercent: defaultMaxEjectionPercent, 337 FailurePercentageEjection: &FailurePercentageEjection{ 338 Threshold: 80, 339 EnforcementPercentage: 100, 340 MinimumHosts: 10, 341 RequestVolume: 40, 342 }, 343 ChildPolicy: &iserviceconfig.BalancerConfig{ 344 Name: "xds_cluster_impl_experimental", 345 Config: &clusterimpl.LBConfig{ 346 Cluster: "test_cluster", 347 }, 348 }, 349 }, 350 }, 351 { // to make sure zero values aren't overwritten by defaults 352 name: "lb-config-every-field-set-zero-value", 353 input: `{ 354 "interval": "0s", 355 "baseEjectionTime": "0s", 356 "maxEjectionTime": "0s", 357 "maxEjectionPercent": 0, 358 "successRateEjection": { 359 "stdevFactor": 0, 360 "enforcementPercentage": 0, 361 "minimumHosts": 0, 362 "requestVolume": 0 363 }, 364 "failurePercentageEjection": { 365 "threshold": 0, 366 "enforcementPercentage": 0, 367 "minimumHosts": 0, 368 "requestVolume": 0 369 }, 370 "childPolicy": [ 371 { 372 "xds_cluster_impl_experimental": { 373 "cluster": "test_cluster" 374 } 375 } 376 ] 377 }`, 378 wantCfg: &LBConfig{ 379 SuccessRateEjection: &SuccessRateEjection{}, 380 FailurePercentageEjection: &FailurePercentageEjection{}, 381 ChildPolicy: &iserviceconfig.BalancerConfig{ 382 Name: "xds_cluster_impl_experimental", 383 Config: &clusterimpl.LBConfig{ 384 Cluster: "test_cluster", 385 }, 386 }, 387 }, 388 }, 389 { 390 name: "lb-config-every-field-set", 391 input: `{ 392 "interval": "10s", 393 "baseEjectionTime": "30s", 394 "maxEjectionTime": "300s", 395 "maxEjectionPercent": 10, 396 "successRateEjection": { 397 "stdevFactor": 1900, 398 "enforcementPercentage": 100, 399 "minimumHosts": 5, 400 "requestVolume": 100 401 }, 402 "failurePercentageEjection": { 403 "threshold": 85, 404 "enforcementPercentage": 5, 405 "minimumHosts": 5, 406 "requestVolume": 50 407 }, 408 "childPolicy": [ 409 { 410 "xds_cluster_impl_experimental": { 411 "cluster": "test_cluster" 412 } 413 } 414 ] 415 }`, 416 wantCfg: &LBConfig{ 417 Interval: iserviceconfig.Duration(10 * time.Second), 418 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 419 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 420 MaxEjectionPercent: 10, 421 SuccessRateEjection: &SuccessRateEjection{ 422 StdevFactor: 1900, 423 EnforcementPercentage: 100, 424 MinimumHosts: 5, 425 RequestVolume: 100, 426 }, 427 FailurePercentageEjection: &FailurePercentageEjection{ 428 Threshold: 85, 429 EnforcementPercentage: 5, 430 MinimumHosts: 5, 431 RequestVolume: 50, 432 }, 433 ChildPolicy: &iserviceconfig.BalancerConfig{ 434 Name: "xds_cluster_impl_experimental", 435 Config: &clusterimpl.LBConfig{ 436 Cluster: "test_cluster", 437 }, 438 }, 439 }, 440 }, 441 { 442 name: "interval-is-negative", 443 input: `{"interval": "-10s"}`, 444 wantErr: "OutlierDetectionLoadBalancingConfig.interval = -10s; must be >= 0", 445 }, 446 { 447 name: "base-ejection-time-is-negative", 448 input: `{"baseEjectionTime": "-10s"}`, 449 wantErr: "OutlierDetectionLoadBalancingConfig.base_ejection_time = -10s; must be >= 0", 450 }, 451 { 452 name: "max-ejection-time-is-negative", 453 input: `{"maxEjectionTime": "-10s"}`, 454 wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_time = -10s; must be >= 0", 455 }, 456 { 457 name: "max-ejection-percent-is-greater-than-100", 458 input: `{"maxEjectionPercent": 150}`, 459 wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_percent = 150; must be <= 100", 460 }, 461 { 462 name: "enforcement-percentage-success-rate-is-greater-than-100", 463 input: `{ 464 "successRateEjection": { 465 "enforcementPercentage": 150 466 } 467 }`, 468 wantErr: "OutlierDetectionLoadBalancingConfig.SuccessRateEjection.enforcement_percentage = 150; must be <= 100", 469 }, 470 { 471 name: "failure-percentage-threshold-is-greater-than-100", 472 input: `{ 473 "failurePercentageEjection": { 474 "threshold": 150 475 } 476 }`, 477 wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.threshold = 150; must be <= 100", 478 }, 479 { 480 name: "enforcement-percentage-failure-percentage-ejection-is-greater-than-100", 481 input: `{ 482 "failurePercentageEjection": { 483 "enforcementPercentage": 150 484 } 485 }`, 486 wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.enforcement_percentage = 150; must be <= 100", 487 }, 488 { 489 name: "child-policy-present-but-parse-error", 490 input: `{ 491 "childPolicy": [ 492 { 493 "errParseConfigBalancer": { 494 "cluster": "test_cluster" 495 } 496 } 497 ] 498 }`, 499 wantErr: "error parsing loadBalancingConfig for policy \"errParseConfigBalancer\"", 500 }, 501 { 502 name: "no-supported-child-policy", 503 input: `{ 504 "childPolicy": [ 505 { 506 "doesNotExistBalancer": { 507 "cluster": "test_cluster" 508 } 509 } 510 ] 511 }`, 512 wantErr: "invalid loadBalancingConfig: no supported policies found", 513 }, 514 } 515 for _, test := range tests { 516 t.Run(test.name, func(t *testing.T) { 517 gotCfg, gotErr := parser.ParseConfig(json.RawMessage(test.input)) 518 if gotErr != nil && !strings.Contains(gotErr.Error(), test.wantErr) { 519 t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr) 520 } 521 if (gotErr != nil) != (test.wantErr != "") { 522 t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr) 523 } 524 if test.wantErr != "" { 525 return 526 } 527 if diff := cmp.Diff(gotCfg, test.wantCfg); diff != "" { 528 t.Fatalf("parseConfig(%v) got unexpected output, diff (-got +want): %v", string(test.input), diff) 529 } 530 }) 531 } 532 } 533 534 func (lbc *LBConfig) Equal(lbc2 *LBConfig) bool { 535 if !lbc.EqualIgnoringChildPolicy(lbc2) { 536 return false 537 } 538 return cmp.Equal(lbc.ChildPolicy, lbc2.ChildPolicy) 539 } 540 541 type subConnWithState struct { 542 sc balancer.SubConn 543 state balancer.SubConnState 544 } 545 546 func setup(t *testing.T) (*outlierDetectionBalancer, *testutils.BalancerClientConn, func()) { 547 t.Helper() 548 builder := balancer.Get(Name) 549 if builder == nil { 550 t.Fatalf("balancer.Get(%q) returned nil", Name) 551 } 552 tcc := testutils.NewBalancerClientConn(t) 553 odB := builder.Build(tcc, balancer.BuildOptions{ChannelzParentID: channelz.NewIdentifierForTesting(channelz.RefChannel, time.Now().Unix(), nil)}) 554 return odB.(*outlierDetectionBalancer), tcc, odB.Close 555 } 556 557 type emptyChildConfig struct { 558 serviceconfig.LoadBalancingConfig 559 } 560 561 // TestChildBasicOperations tests basic operations of the Outlier Detection 562 // Balancer and it's interaction with it's child. The following scenarios are 563 // tested, in a step by step fashion: 564 // 1. The Outlier Detection Balancer receives it's first good configuration. The 565 // balancer is expected to create a child and sent the child it's configuration. 566 // 2. The Outlier Detection Balancer receives new configuration that specifies a 567 // child's type, and the new type immediately reports READY inline. The first 568 // child balancer should be closed and the second child balancer should receive 569 // a config update. 570 // 3. The Outlier Detection Balancer is closed. The second child balancer should 571 // be closed. 572 func (s) TestChildBasicOperations(t *testing.T) { 573 bc := emptyChildConfig{} 574 575 ccsCh := testutils.NewChannel() 576 closeCh := testutils.NewChannel() 577 578 stub.Register(t.Name()+"child1", stub.BalancerFuncs{ 579 UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error { 580 ccsCh.Send(ccs.BalancerConfig) 581 return nil 582 }, 583 Close: func(bd *stub.BalancerData) { 584 closeCh.Send(nil) 585 }, 586 }) 587 588 stub.Register(t.Name()+"child2", stub.BalancerFuncs{ 589 UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error { 590 // UpdateState inline to READY to complete graceful switch process 591 // synchronously from any UpdateClientConnState call. 592 bd.ClientConn.UpdateState(balancer.State{ 593 ConnectivityState: connectivity.Ready, 594 Picker: &testutils.TestConstPicker{}, 595 }) 596 ccsCh.Send(nil) 597 return nil 598 }, 599 Close: func(bd *stub.BalancerData) { 600 closeCh.Send(nil) 601 }, 602 }) 603 604 od, tcc, _ := setup(t) 605 606 // This first config update should cause a child to be built and forwarded 607 // it's first update. 608 od.UpdateClientConnState(balancer.ClientConnState{ 609 BalancerConfig: &LBConfig{ 610 ChildPolicy: &iserviceconfig.BalancerConfig{ 611 Name: t.Name() + "child1", 612 Config: bc, 613 }, 614 }, 615 }) 616 617 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 618 defer cancel() 619 cr, err := ccsCh.Receive(ctx) 620 if err != nil { 621 t.Fatalf("timed out waiting for UpdateClientConnState on the first child balancer: %v", err) 622 } 623 if _, ok := cr.(emptyChildConfig); !ok { 624 t.Fatalf("Received child policy config of type %T, want %T", cr, emptyChildConfig{}) 625 } 626 627 // This Update Client Conn State call should cause the first child balancer 628 // to close, and a new child to be created and also forwarded it's first 629 // config update. 630 od.UpdateClientConnState(balancer.ClientConnState{ 631 BalancerConfig: &LBConfig{ 632 Interval: math.MaxInt64, 633 ChildPolicy: &iserviceconfig.BalancerConfig{ 634 Name: t.Name() + "child2", 635 Config: emptyChildConfig{}, 636 }, 637 }, 638 }) 639 640 // Verify inline UpdateState() call from the new child eventually makes it's 641 // way to the Test Client Conn. 642 select { 643 case <-ctx.Done(): 644 t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn") 645 case state := <-tcc.NewStateCh: 646 if state != connectivity.Ready { 647 t.Fatalf("ClientConn received connectivity state %v, want %v", state, connectivity.Ready) 648 } 649 } 650 651 // Verify the first child balancer closed. 652 if _, err = closeCh.Receive(ctx); err != nil { 653 t.Fatalf("timed out waiting for the first child balancer to be closed: %v", err) 654 } 655 // Verify the second child balancer received it's first config update. 656 if _, err = ccsCh.Receive(ctx); err != nil { 657 t.Fatalf("timed out waiting for UpdateClientConnState on the second child balancer: %v", err) 658 } 659 // Closing the Outlier Detection Balancer should close the newly created 660 // child. 661 od.Close() 662 if _, err = closeCh.Receive(ctx); err != nil { 663 t.Fatalf("timed out waiting for the second child balancer to be closed: %v", err) 664 } 665 } 666 667 // TestUpdateAddresses tests the functionality of UpdateAddresses and any 668 // changes in the addresses/plurality of those addresses for a SubConn. The 669 // Balancer is set up with two upstreams, with one of the upstreams being 670 // ejected. Initially, there is one SubConn for each address. The following 671 // scenarios are tested, in a step by step fashion: 672 // 1. The SubConn not currently ejected switches addresses to the address that 673 // is ejected. This should cause the SubConn to get ejected. 674 // 2. Update this same SubConn to multiple addresses. This should cause the 675 // SubConn to get unejected, as it is no longer being tracked by Outlier 676 // Detection at that point. 677 // 3. Update this same SubConn to different addresses, still multiple. This 678 // should be a noop, as the SubConn is still no longer being tracked by Outlier 679 // Detection. 680 // 4. Update this same SubConn to the a single address which is ejected. This 681 // should cause the SubConn to be ejected. 682 func (s) TestUpdateAddresses(t *testing.T) { 683 scsCh := testutils.NewChannel() 684 var scw1, scw2 balancer.SubConn 685 var err error 686 stub.Register(t.Name(), stub.BalancerFuncs{ 687 UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error { 688 scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{ 689 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) }, 690 }) 691 if err != nil { 692 t.Errorf("error in od.NewSubConn call: %v", err) 693 } 694 scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{ 695 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) }, 696 }) 697 if err != nil { 698 t.Errorf("error in od.NewSubConn call: %v", err) 699 } 700 bd.ClientConn.UpdateState(balancer.State{ 701 ConnectivityState: connectivity.Ready, 702 Picker: &rrPicker{ 703 scs: []balancer.SubConn{scw1, scw2}, 704 }, 705 }) 706 return nil 707 }, 708 }) 709 710 od, tcc, cleanup := setup(t) 711 defer cleanup() 712 713 od.UpdateClientConnState(balancer.ClientConnState{ 714 ResolverState: resolver.State{ 715 Addresses: []resolver.Address{ 716 {Addr: "address1"}, 717 {Addr: "address2"}, 718 }, 719 }, 720 BalancerConfig: &LBConfig{ 721 Interval: iserviceconfig.Duration(10 * time.Second), 722 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 723 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 724 MaxEjectionPercent: 10, 725 FailurePercentageEjection: &FailurePercentageEjection{ 726 Threshold: 50, 727 EnforcementPercentage: 100, 728 MinimumHosts: 2, 729 RequestVolume: 3, 730 }, 731 ChildPolicy: &iserviceconfig.BalancerConfig{ 732 Name: t.Name(), 733 Config: emptyChildConfig{}, 734 }, 735 }, 736 }) 737 738 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 739 defer cancel() 740 741 // Setup the system to where one address is ejected and one address 742 // isn't. 743 select { 744 case <-ctx.Done(): 745 t.Fatal("timeout while waiting for a UpdateState call on the ClientConn") 746 case picker := <-tcc.NewPickerCh: 747 pi, err := picker.Pick(balancer.PickInfo{}) 748 if err != nil { 749 t.Fatalf("picker.Pick failed with error: %v", err) 750 } 751 // Simulate 5 successful RPC calls on the first SubConn (the first call 752 // to picker.Pick). 753 for c := 0; c < 5; c++ { 754 pi.Done(balancer.DoneInfo{}) 755 } 756 pi, err = picker.Pick(balancer.PickInfo{}) 757 if err != nil { 758 t.Fatalf("picker.Pick failed with error: %v", err) 759 } 760 // Simulate 5 failed RPC calls on the second SubConn (the second call to 761 // picker.Pick). Thus, when the interval timer algorithm is run, the 762 // second SubConn's address should be ejected, which will allow us to 763 // further test UpdateAddresses() logic. 764 for c := 0; c < 5; c++ { 765 pi.Done(balancer.DoneInfo{Err: errors.New("some error")}) 766 } 767 od.intervalTimerAlgorithm() 768 // verify StateListener() got called with TRANSIENT_FAILURE for child 769 // with address that was ejected. 770 gotSCWS, err := scsCh.Receive(ctx) 771 if err != nil { 772 t.Fatalf("Error waiting for Sub Conn update: %v", err) 773 } 774 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 775 sc: scw2, 776 state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}, 777 }); err != nil { 778 t.Fatalf("Error in Sub Conn update: %v", err) 779 } 780 } 781 782 // Update scw1 to another address that is currently ejected. This should 783 // cause scw1 to get ejected. 784 od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}}) 785 786 // Verify that update addresses gets forwarded to ClientConn. 787 select { 788 case <-ctx.Done(): 789 t.Fatal("timeout while waiting for a UpdateState call on the ClientConn") 790 case <-tcc.UpdateAddressesAddrsCh: 791 } 792 // Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE). 793 gotSCWS, err := scsCh.Receive(ctx) 794 if err != nil { 795 t.Fatalf("Error waiting for Sub Conn update: %v", err) 796 } 797 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 798 sc: scw1, 799 state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}, 800 }); err != nil { 801 t.Fatalf("Error in Sub Conn update: %v", err) 802 } 803 804 // Update scw1 to multiple addresses. This should cause scw1 to get 805 // unejected, as is it no longer being tracked for Outlier Detection. 806 od.UpdateAddresses(scw1, []resolver.Address{ 807 {Addr: "address1"}, 808 {Addr: "address2"}, 809 }) 810 // Verify scw1 got unejected (StateListener called with recent state). 811 gotSCWS, err = scsCh.Receive(ctx) 812 if err != nil { 813 t.Fatalf("Error waiting for Sub Conn update: %v", err) 814 } 815 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 816 sc: scw1, 817 state: balancer.SubConnState{ConnectivityState: connectivity.Idle}, 818 }); err != nil { 819 t.Fatalf("Error in Sub Conn update: %v", err) 820 } 821 822 // Update scw1 to a different multiple addresses list. A change of addresses 823 // in which the plurality goes from multiple to multiple should be a no-op, 824 // as the address continues to be ignored by outlier detection. 825 od.UpdateAddresses(scw1, []resolver.Address{ 826 {Addr: "address2"}, 827 {Addr: "address3"}, 828 }) 829 // Verify no downstream effects. 830 sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) 831 defer cancel() 832 if _, err := scsCh.Receive(sCtx); err == nil { 833 t.Fatalf("no SubConn update should have been sent (no SubConn got ejected/unejected)") 834 } 835 836 // Update scw1 back to a single address, which is ejected. This should cause 837 // the SubConn to be re-ejected. 838 od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}}) 839 // Verify scw1 got ejected (StateListener called with TRANSIENT FAILURE). 840 gotSCWS, err = scsCh.Receive(ctx) 841 if err != nil { 842 t.Fatalf("Error waiting for Sub Conn update: %v", err) 843 } 844 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 845 sc: scw1, 846 state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}, 847 }); err != nil { 848 t.Fatalf("Error in Sub Conn update: %v", err) 849 } 850 } 851 852 func scwsEqual(gotSCWS subConnWithState, wantSCWS subConnWithState) error { 853 if gotSCWS.sc != wantSCWS.sc || !cmp.Equal(gotSCWS.state, wantSCWS.state, cmp.AllowUnexported(subConnWrapper{}, addressInfo{}), cmpopts.IgnoreFields(subConnWrapper{}, "scUpdateCh")) { 854 return fmt.Errorf("received SubConnState: %+v, want %+v", gotSCWS, wantSCWS) 855 } 856 return nil 857 } 858 859 type rrPicker struct { 860 scs []balancer.SubConn 861 next int 862 } 863 864 func (rrp *rrPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) { 865 sc := rrp.scs[rrp.next] 866 rrp.next = (rrp.next + 1) % len(rrp.scs) 867 return balancer.PickResult{SubConn: sc}, nil 868 } 869 870 // TestDurationOfInterval tests the configured interval timer. 871 // The following scenarios are tested: 872 // 1. The Outlier Detection Balancer receives it's first config. The balancer 873 // should configure the timer with whatever is directly specified on the config. 874 // 2. The Outlier Detection Balancer receives a subsequent config. The balancer 875 // should configure with whatever interval is configured minus the difference 876 // between the current time and the previous start timestamp. 877 // 3. The Outlier Detection Balancer receives a no-op configuration. The 878 // balancer should not configure a timer at all. 879 func (s) TestDurationOfInterval(t *testing.T) { 880 stub.Register(t.Name(), stub.BalancerFuncs{}) 881 882 od, _, cleanup := setup(t) 883 defer func(af func(d time.Duration, f func()) *time.Timer) { 884 cleanup() 885 afterFunc = af 886 }(afterFunc) 887 888 durationChan := testutils.NewChannel() 889 afterFunc = func(dur time.Duration, _ func()) *time.Timer { 890 durationChan.Send(dur) 891 return time.NewTimer(math.MaxInt64) 892 } 893 894 od.UpdateClientConnState(balancer.ClientConnState{ 895 BalancerConfig: &LBConfig{ 896 Interval: iserviceconfig.Duration(8 * time.Second), 897 SuccessRateEjection: &SuccessRateEjection{ 898 StdevFactor: 1900, 899 EnforcementPercentage: 100, 900 MinimumHosts: 5, 901 RequestVolume: 100, 902 }, 903 ChildPolicy: &iserviceconfig.BalancerConfig{ 904 Name: t.Name(), 905 Config: emptyChildConfig{}, 906 }, 907 }, 908 }) 909 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 910 defer cancel() 911 d, err := durationChan.Receive(ctx) 912 if err != nil { 913 t.Fatalf("Error receiving duration from afterFunc() call: %v", err) 914 } 915 dur := d.(time.Duration) 916 // The configured duration should be 8 seconds - what the balancer was 917 // configured with. 918 if dur != 8*time.Second { 919 t.Fatalf("configured duration should have been 8 seconds to start timer") 920 } 921 922 // Override time.Now to time.Now() + 5 seconds. This will represent 5 923 // seconds already passing for the next check in UpdateClientConnState. 924 defer func(n func() time.Time) { 925 now = n 926 }(now) 927 now = func() time.Time { 928 return time.Now().Add(time.Second * 5) 929 } 930 931 // UpdateClientConnState with an interval of 9 seconds. Due to 5 seconds 932 // already passing (from overridden time.Now function), this should start an 933 // interval timer of ~4 seconds. 934 od.UpdateClientConnState(balancer.ClientConnState{ 935 BalancerConfig: &LBConfig{ 936 Interval: iserviceconfig.Duration(9 * time.Second), 937 SuccessRateEjection: &SuccessRateEjection{ 938 StdevFactor: 1900, 939 EnforcementPercentage: 100, 940 MinimumHosts: 5, 941 RequestVolume: 100, 942 }, 943 ChildPolicy: &iserviceconfig.BalancerConfig{ 944 Name: t.Name(), 945 Config: emptyChildConfig{}, 946 }, 947 }, 948 }) 949 950 d, err = durationChan.Receive(ctx) 951 if err != nil { 952 t.Fatalf("Error receiving duration from afterFunc() call: %v", err) 953 } 954 dur = d.(time.Duration) 955 if dur.Seconds() < 3.5 || 4.5 < dur.Seconds() { 956 t.Fatalf("configured duration should have been around 4 seconds to start timer") 957 } 958 959 // UpdateClientConnState with a no-op config. This shouldn't configure the 960 // interval timer at all due to it being a no-op. 961 od.UpdateClientConnState(balancer.ClientConnState{ 962 BalancerConfig: &LBConfig{ 963 Interval: iserviceconfig.Duration(10 * time.Second), 964 ChildPolicy: &iserviceconfig.BalancerConfig{ 965 Name: t.Name(), 966 Config: emptyChildConfig{}, 967 }, 968 }, 969 }) 970 971 // No timer should have been started. 972 sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) 973 defer cancel() 974 if _, err = durationChan.Receive(sCtx); err == nil { 975 t.Fatal("No timer should have started.") 976 } 977 } 978 979 // TestEjectUnejectSuccessRate tests the functionality of the interval timer 980 // algorithm when configured with SuccessRateEjection. The Outlier Detection 981 // Balancer will be set up with 3 SubConns, each with a different address. 982 // It tests the following scenarios, in a step by step fashion: 983 // 1. The three addresses each have 5 successes. The interval timer algorithm should 984 // not eject any of the addresses. 985 // 2. Two of the addresses have 5 successes, the third has five failures. The 986 // interval timer algorithm should eject the third address with five failures. 987 // 3. The interval timer algorithm is run at a later time past max ejection 988 // time. The interval timer algorithm should uneject the third address. 989 func (s) TestEjectUnejectSuccessRate(t *testing.T) { 990 scsCh := testutils.NewChannel() 991 var scw1, scw2, scw3 balancer.SubConn 992 var err error 993 stub.Register(t.Name(), stub.BalancerFuncs{ 994 UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error { 995 scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{ 996 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) }, 997 }) 998 if err != nil { 999 t.Errorf("error in od.NewSubConn call: %v", err) 1000 } 1001 scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{ 1002 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) }, 1003 }) 1004 if err != nil { 1005 t.Errorf("error in od.NewSubConn call: %v", err) 1006 } 1007 scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{ 1008 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) }, 1009 }) 1010 if err != nil { 1011 t.Errorf("error in od.NewSubConn call: %v", err) 1012 } 1013 bd.ClientConn.UpdateState(balancer.State{ 1014 ConnectivityState: connectivity.Ready, 1015 Picker: &rrPicker{ 1016 scs: []balancer.SubConn{scw1, scw2, scw3}, 1017 }, 1018 }) 1019 return nil 1020 }, 1021 }) 1022 1023 od, tcc, cleanup := setup(t) 1024 defer func() { 1025 cleanup() 1026 }() 1027 1028 od.UpdateClientConnState(balancer.ClientConnState{ 1029 ResolverState: resolver.State{ 1030 Addresses: []resolver.Address{ 1031 {Addr: "address1"}, 1032 {Addr: "address2"}, 1033 {Addr: "address3"}, 1034 }, 1035 }, 1036 BalancerConfig: &LBConfig{ 1037 Interval: math.MaxInt64, // so the interval will never run unless called manually in test. 1038 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 1039 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 1040 MaxEjectionPercent: 10, 1041 FailurePercentageEjection: &FailurePercentageEjection{ 1042 Threshold: 50, 1043 EnforcementPercentage: 100, 1044 MinimumHosts: 3, 1045 RequestVolume: 3, 1046 }, 1047 ChildPolicy: &iserviceconfig.BalancerConfig{ 1048 Name: t.Name(), 1049 Config: emptyChildConfig{}, 1050 }, 1051 }, 1052 }) 1053 1054 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1055 defer cancel() 1056 1057 select { 1058 case <-ctx.Done(): 1059 t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn") 1060 case picker := <-tcc.NewPickerCh: 1061 // Set each of the three upstream addresses to have five successes each. 1062 // This should cause none of the addresses to be ejected as none of them 1063 // are outliers according to the success rate algorithm. 1064 for i := 0; i < 3; i++ { 1065 pi, err := picker.Pick(balancer.PickInfo{}) 1066 if err != nil { 1067 t.Fatalf("picker.Pick failed with error: %v", err) 1068 } 1069 for c := 0; c < 5; c++ { 1070 pi.Done(balancer.DoneInfo{}) 1071 } 1072 } 1073 1074 od.intervalTimerAlgorithm() 1075 1076 // verify no StateListener() call on the child, as no addresses got 1077 // ejected (ejected address will cause an StateListener call). 1078 sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) 1079 defer cancel() 1080 if _, err := scsCh.Receive(sCtx); err == nil { 1081 t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)") 1082 } 1083 1084 // Since no addresses are ejected, a SubConn update should forward down 1085 // to the child. 1086 od.updateSubConnState(scw1.(*subConnWrapper).SubConn, balancer.SubConnState{ 1087 ConnectivityState: connectivity.Connecting, 1088 }) 1089 1090 gotSCWS, err := scsCh.Receive(ctx) 1091 if err != nil { 1092 t.Fatalf("Error waiting for Sub Conn update: %v", err) 1093 } 1094 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 1095 sc: scw1, 1096 state: balancer.SubConnState{ConnectivityState: connectivity.Connecting}, 1097 }); err != nil { 1098 t.Fatalf("Error in Sub Conn update: %v", err) 1099 } 1100 1101 // Set two of the upstream addresses to have five successes each, and 1102 // one of the upstream addresses to have five failures. This should 1103 // cause the address which has five failures to be ejected according to 1104 // the SuccessRateAlgorithm. 1105 for i := 0; i < 2; i++ { 1106 pi, err := picker.Pick(balancer.PickInfo{}) 1107 if err != nil { 1108 t.Fatalf("picker.Pick failed with error: %v", err) 1109 } 1110 for c := 0; c < 5; c++ { 1111 pi.Done(balancer.DoneInfo{}) 1112 } 1113 } 1114 pi, err := picker.Pick(balancer.PickInfo{}) 1115 if err != nil { 1116 t.Fatalf("picker.Pick failed with error: %v", err) 1117 } 1118 for c := 0; c < 5; c++ { 1119 pi.Done(balancer.DoneInfo{Err: errors.New("some error")}) 1120 } 1121 1122 // should eject address that always errored. 1123 od.intervalTimerAlgorithm() 1124 // Due to the address being ejected, the SubConn with that address 1125 // should be ejected, meaning a TRANSIENT_FAILURE connectivity state 1126 // gets reported to the child. 1127 gotSCWS, err = scsCh.Receive(ctx) 1128 if err != nil { 1129 t.Fatalf("Error waiting for Sub Conn update: %v", err) 1130 } 1131 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 1132 sc: scw3, 1133 state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}, 1134 }); err != nil { 1135 t.Fatalf("Error in Sub Conn update: %v", err) 1136 } 1137 // Only one address should be ejected. 1138 sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) 1139 defer cancel() 1140 if _, err := scsCh.Receive(sCtx); err == nil { 1141 t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)") 1142 } 1143 1144 // Now that an address is ejected, SubConn updates for SubConns using 1145 // that address should not be forwarded downward. These SubConn updates 1146 // will be cached to update the child sometime in the future when the 1147 // address gets unejected. 1148 od.updateSubConnState(pi.SubConn, balancer.SubConnState{ 1149 ConnectivityState: connectivity.Connecting, 1150 }) 1151 sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) 1152 defer cancel() 1153 if _, err := scsCh.Receive(sCtx); err == nil { 1154 t.Fatalf("SubConn update should not have been forwarded (the SubConn is ejected)") 1155 } 1156 1157 // Override now to cause the interval timer algorithm to always uneject 1158 // the ejected address. This will always uneject the ejected address 1159 // because this time is set way past the max ejection time set in the 1160 // configuration, which will make the next interval timer algorithm run 1161 // uneject any ejected addresses. 1162 defer func(n func() time.Time) { 1163 now = n 1164 }(now) 1165 now = func() time.Time { 1166 return time.Now().Add(time.Second * 1000) 1167 } 1168 od.intervalTimerAlgorithm() 1169 1170 // unejected SubConn should report latest persisted state - which is 1171 // connecting from earlier. 1172 gotSCWS, err = scsCh.Receive(ctx) 1173 if err != nil { 1174 t.Fatalf("Error waiting for Sub Conn update: %v", err) 1175 } 1176 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 1177 sc: scw3, 1178 state: balancer.SubConnState{ConnectivityState: connectivity.Connecting}, 1179 }); err != nil { 1180 t.Fatalf("Error in Sub Conn update: %v", err) 1181 } 1182 } 1183 } 1184 1185 // TestEjectFailureRate tests the functionality of the interval timer algorithm 1186 // when configured with FailurePercentageEjection, and also the functionality of 1187 // noop configuration. The Outlier Detection Balancer will be set up with 3 1188 // SubConns, each with a different address. It tests the following scenarios, in 1189 // a step by step fashion: 1190 // 1. The three addresses each have 5 successes. The interval timer algorithm 1191 // should not eject any of the addresses. 1192 // 2. Two of the addresses have 5 successes, the third has five failures. The 1193 // interval timer algorithm should eject the third address with five failures. 1194 // 3. The Outlier Detection Balancer receives a subsequent noop config update. 1195 // The balancer should uneject all ejected addresses. 1196 func (s) TestEjectFailureRate(t *testing.T) { 1197 scsCh := testutils.NewChannel() 1198 var scw1, scw2, scw3 balancer.SubConn 1199 var err error 1200 stub.Register(t.Name(), stub.BalancerFuncs{ 1201 UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error { 1202 if scw1 != nil { // UpdateClientConnState was already called, no need to recreate SubConns. 1203 return nil 1204 } 1205 scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{ 1206 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) }, 1207 }) 1208 if err != nil { 1209 t.Errorf("error in od.NewSubConn call: %v", err) 1210 } 1211 scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{ 1212 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) }, 1213 }) 1214 if err != nil { 1215 t.Errorf("error in od.NewSubConn call: %v", err) 1216 } 1217 scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{ 1218 StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) }, 1219 }) 1220 if err != nil { 1221 t.Errorf("error in od.NewSubConn call: %v", err) 1222 } 1223 return nil 1224 }, 1225 }) 1226 1227 od, tcc, cleanup := setup(t) 1228 defer func() { 1229 cleanup() 1230 }() 1231 1232 od.UpdateClientConnState(balancer.ClientConnState{ 1233 ResolverState: resolver.State{ 1234 Addresses: []resolver.Address{ 1235 {Addr: "address1"}, 1236 {Addr: "address2"}, 1237 {Addr: "address3"}, 1238 }, 1239 }, 1240 BalancerConfig: &LBConfig{ 1241 Interval: math.MaxInt64, // so the interval will never run unless called manually in test. 1242 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 1243 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 1244 MaxEjectionPercent: 10, 1245 SuccessRateEjection: &SuccessRateEjection{ 1246 StdevFactor: 500, 1247 EnforcementPercentage: 100, 1248 MinimumHosts: 3, 1249 RequestVolume: 3, 1250 }, 1251 ChildPolicy: &iserviceconfig.BalancerConfig{ 1252 Name: t.Name(), 1253 Config: emptyChildConfig{}, 1254 }, 1255 }, 1256 }) 1257 1258 od.UpdateState(balancer.State{ 1259 ConnectivityState: connectivity.Ready, 1260 Picker: &rrPicker{ 1261 scs: []balancer.SubConn{scw1, scw2, scw3}, 1262 }, 1263 }) 1264 1265 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1266 defer cancel() 1267 1268 select { 1269 case <-ctx.Done(): 1270 t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn") 1271 case picker := <-tcc.NewPickerCh: 1272 // Set each upstream address to have five successes each. This should 1273 // cause none of the addresses to be ejected as none of them are below 1274 // the failure percentage threshold. 1275 for i := 0; i < 3; i++ { 1276 pi, err := picker.Pick(balancer.PickInfo{}) 1277 if err != nil { 1278 t.Fatalf("picker.Pick failed with error: %v", err) 1279 } 1280 for c := 0; c < 5; c++ { 1281 pi.Done(balancer.DoneInfo{}) 1282 } 1283 } 1284 1285 od.intervalTimerAlgorithm() 1286 sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) 1287 defer cancel() 1288 if _, err := scsCh.Receive(sCtx); err == nil { 1289 t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)") 1290 } 1291 1292 // Set two upstream addresses to have five successes each, and one 1293 // upstream address to have five failures. This should cause the address 1294 // with five failures to be ejected according to the Failure Percentage 1295 // Algorithm. 1296 for i := 0; i < 2; i++ { 1297 pi, err := picker.Pick(balancer.PickInfo{}) 1298 if err != nil { 1299 t.Fatalf("picker.Pick failed with error: %v", err) 1300 } 1301 for c := 0; c < 5; c++ { 1302 pi.Done(balancer.DoneInfo{}) 1303 } 1304 } 1305 pi, err := picker.Pick(balancer.PickInfo{}) 1306 if err != nil { 1307 t.Fatalf("picker.Pick failed with error: %v", err) 1308 } 1309 for c := 0; c < 5; c++ { 1310 pi.Done(balancer.DoneInfo{Err: errors.New("some error")}) 1311 } 1312 1313 // should eject address that always errored. 1314 od.intervalTimerAlgorithm() 1315 1316 // verify StateListener() got called with TRANSIENT_FAILURE for child 1317 // in address that was ejected. 1318 gotSCWS, err := scsCh.Receive(ctx) 1319 if err != nil { 1320 t.Fatalf("Error waiting for Sub Conn update: %v", err) 1321 } 1322 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 1323 sc: scw3, 1324 state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure}, 1325 }); err != nil { 1326 t.Fatalf("Error in Sub Conn update: %v", err) 1327 } 1328 1329 // verify only one address got ejected. 1330 sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) 1331 defer cancel() 1332 if _, err := scsCh.Receive(sCtx); err == nil { 1333 t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)") 1334 } 1335 1336 // upon the Outlier Detection balancer being reconfigured with a noop 1337 // configuration, every ejected SubConn should be unejected. 1338 od.UpdateClientConnState(balancer.ClientConnState{ 1339 ResolverState: resolver.State{ 1340 Addresses: []resolver.Address{ 1341 {Addr: "address1"}, 1342 {Addr: "address2"}, 1343 {Addr: "address3"}, 1344 }, 1345 }, 1346 BalancerConfig: &LBConfig{ 1347 Interval: math.MaxInt64, 1348 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 1349 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 1350 MaxEjectionPercent: 10, 1351 ChildPolicy: &iserviceconfig.BalancerConfig{ 1352 Name: t.Name(), 1353 Config: emptyChildConfig{}, 1354 }, 1355 }, 1356 }) 1357 gotSCWS, err = scsCh.Receive(ctx) 1358 if err != nil { 1359 t.Fatalf("Error waiting for Sub Conn update: %v", err) 1360 } 1361 if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{ 1362 sc: scw3, 1363 state: balancer.SubConnState{ConnectivityState: connectivity.Idle}, 1364 }); err != nil { 1365 t.Fatalf("Error in Sub Conn update: %v", err) 1366 } 1367 } 1368 } 1369 1370 // TestConcurrentOperations calls different operations on the balancer in 1371 // separate goroutines to test for any race conditions and deadlocks. It also 1372 // uses a child balancer which verifies that no operations on the child get 1373 // called after the child balancer is closed. 1374 func (s) TestConcurrentOperations(t *testing.T) { 1375 closed := grpcsync.NewEvent() 1376 stub.Register(t.Name(), stub.BalancerFuncs{ 1377 UpdateClientConnState: func(*stub.BalancerData, balancer.ClientConnState) error { 1378 if closed.HasFired() { 1379 t.Error("UpdateClientConnState was called after Close(), which breaks the balancer API") 1380 } 1381 return nil 1382 }, 1383 ResolverError: func(*stub.BalancerData, error) { 1384 if closed.HasFired() { 1385 t.Error("ResolverError was called after Close(), which breaks the balancer API") 1386 } 1387 }, 1388 Close: func(*stub.BalancerData) { 1389 closed.Fire() 1390 }, 1391 ExitIdle: func(*stub.BalancerData) { 1392 if closed.HasFired() { 1393 t.Error("ExitIdle was called after Close(), which breaks the balancer API") 1394 } 1395 }, 1396 }) 1397 1398 od, tcc, cleanup := setup(t) 1399 defer func() { 1400 cleanup() 1401 }() 1402 1403 od.UpdateClientConnState(balancer.ClientConnState{ 1404 ResolverState: resolver.State{ 1405 Addresses: []resolver.Address{ 1406 {Addr: "address1"}, 1407 {Addr: "address2"}, 1408 {Addr: "address3"}, 1409 }, 1410 }, 1411 BalancerConfig: &LBConfig{ 1412 Interval: math.MaxInt64, // so the interval will never run unless called manually in test. 1413 BaseEjectionTime: iserviceconfig.Duration(30 * time.Second), 1414 MaxEjectionTime: iserviceconfig.Duration(300 * time.Second), 1415 MaxEjectionPercent: 10, 1416 SuccessRateEjection: &SuccessRateEjection{ // Have both Success Rate and Failure Percentage to step through all the interval timer code 1417 StdevFactor: 500, 1418 EnforcementPercentage: 100, 1419 MinimumHosts: 3, 1420 RequestVolume: 3, 1421 }, 1422 FailurePercentageEjection: &FailurePercentageEjection{ 1423 Threshold: 50, 1424 EnforcementPercentage: 100, 1425 MinimumHosts: 3, 1426 RequestVolume: 3, 1427 }, 1428 ChildPolicy: &iserviceconfig.BalancerConfig{ 1429 Name: t.Name(), 1430 Config: emptyChildConfig{}, 1431 }, 1432 }, 1433 }) 1434 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1435 defer cancel() 1436 1437 scw1, err := od.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{}) 1438 if err != nil { 1439 t.Fatalf("error in od.NewSubConn call: %v", err) 1440 } 1441 if err != nil { 1442 t.Fatalf("error in od.NewSubConn call: %v", err) 1443 } 1444 1445 scw2, err := od.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{}) 1446 if err != nil { 1447 t.Fatalf("error in od.NewSubConn call: %v", err) 1448 } 1449 1450 scw3, err := od.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{}) 1451 if err != nil { 1452 t.Fatalf("error in od.NewSubConn call: %v", err) 1453 } 1454 1455 od.UpdateState(balancer.State{ 1456 ConnectivityState: connectivity.Ready, 1457 Picker: &rrPicker{ 1458 scs: []balancer.SubConn{scw2, scw3}, 1459 }, 1460 }) 1461 1462 var picker balancer.Picker 1463 select { 1464 case <-ctx.Done(): 1465 t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn") 1466 case picker = <-tcc.NewPickerCh: 1467 } 1468 1469 finished := make(chan struct{}) 1470 var wg sync.WaitGroup 1471 wg.Add(1) 1472 go func() { 1473 defer wg.Done() 1474 for { 1475 select { 1476 case <-finished: 1477 return 1478 default: 1479 } 1480 pi, err := picker.Pick(balancer.PickInfo{}) 1481 if err != nil { 1482 continue 1483 } 1484 pi.Done(balancer.DoneInfo{}) 1485 pi.Done(balancer.DoneInfo{Err: errors.New("some error")}) 1486 time.Sleep(1 * time.Nanosecond) 1487 } 1488 }() 1489 1490 wg.Add(1) 1491 go func() { 1492 defer wg.Done() 1493 for { 1494 select { 1495 case <-finished: 1496 return 1497 default: 1498 } 1499 od.intervalTimerAlgorithm() 1500 } 1501 }() 1502 1503 // call Outlier Detection's balancer.ClientConn operations asynchronously. 1504 // balancer.ClientConn operations have no guarantee from the API to be 1505 // called synchronously. 1506 wg.Add(1) 1507 go func() { 1508 defer wg.Done() 1509 for { 1510 select { 1511 case <-finished: 1512 return 1513 default: 1514 } 1515 od.UpdateState(balancer.State{ 1516 ConnectivityState: connectivity.Ready, 1517 Picker: &rrPicker{ 1518 scs: []balancer.SubConn{scw2, scw3}, 1519 }, 1520 }) 1521 time.Sleep(1 * time.Nanosecond) 1522 } 1523 }() 1524 1525 wg.Add(1) 1526 go func() { 1527 defer wg.Done() 1528 od.NewSubConn([]resolver.Address{{Addr: "address4"}}, balancer.NewSubConnOptions{}) 1529 }() 1530 1531 wg.Add(1) 1532 go func() { 1533 defer wg.Done() 1534 scw1.Shutdown() 1535 }() 1536 1537 wg.Add(1) 1538 go func() { 1539 defer wg.Done() 1540 od.UpdateAddresses(scw2, []resolver.Address{{Addr: "address3"}}) 1541 }() 1542 1543 // Call balancer.Balancers synchronously in this goroutine, upholding the 1544 // balancer.Balancer API guarantee of synchronous calls. 1545 od.UpdateClientConnState(balancer.ClientConnState{ // This will delete addresses and flip to no op 1546 ResolverState: resolver.State{ 1547 Addresses: []resolver.Address{{Addr: "address1"}}, 1548 }, 1549 BalancerConfig: &LBConfig{ 1550 Interval: math.MaxInt64, 1551 ChildPolicy: &iserviceconfig.BalancerConfig{ 1552 Name: t.Name(), 1553 Config: emptyChildConfig{}, 1554 }, 1555 }, 1556 }) 1557 1558 // Call balancer.Balancers synchronously in this goroutine, upholding the 1559 // balancer.Balancer API guarantee. 1560 od.updateSubConnState(scw1.(*subConnWrapper).SubConn, balancer.SubConnState{ 1561 ConnectivityState: connectivity.Connecting, 1562 }) 1563 od.ResolverError(errors.New("some error")) 1564 od.ExitIdle() 1565 od.Close() 1566 close(finished) 1567 wg.Wait() 1568 }