google.golang.org/grpc@v1.74.2/xds/internal/balancer/outlierdetection/balancer_test.go (about)

     1  /*
     2   *
     3   * Copyright 2022 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package outlierdetection
    20  
    21  import (
    22  	"context"
    23  	"encoding/json"
    24  	"errors"
    25  	"fmt"
    26  	"math"
    27  	"strings"
    28  	"sync"
    29  	"testing"
    30  	"time"
    31  
    32  	"github.com/google/go-cmp/cmp"
    33  	"github.com/google/go-cmp/cmp/cmpopts"
    34  	"google.golang.org/grpc"
    35  	"google.golang.org/grpc/balancer"
    36  	"google.golang.org/grpc/balancer/pickfirst/pickfirstleaf"
    37  	"google.golang.org/grpc/balancer/weightedroundrobin"
    38  	"google.golang.org/grpc/codes"
    39  	"google.golang.org/grpc/connectivity"
    40  	"google.golang.org/grpc/credentials/insecure"
    41  	"google.golang.org/grpc/internal/balancer/stub"
    42  	"google.golang.org/grpc/internal/channelz"
    43  	"google.golang.org/grpc/internal/grpcsync"
    44  	"google.golang.org/grpc/internal/grpctest"
    45  	iserviceconfig "google.golang.org/grpc/internal/serviceconfig"
    46  	"google.golang.org/grpc/internal/stubserver"
    47  	"google.golang.org/grpc/internal/testutils"
    48  	"google.golang.org/grpc/internal/testutils/roundrobin"
    49  	"google.golang.org/grpc/peer"
    50  	"google.golang.org/grpc/resolver"
    51  	"google.golang.org/grpc/resolver/manual"
    52  	"google.golang.org/grpc/serviceconfig"
    53  	"google.golang.org/grpc/status"
    54  	"google.golang.org/grpc/xds/internal/balancer/clusterimpl"
    55  
    56  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    57  	testpb "google.golang.org/grpc/interop/grpc_testing"
    58  )
    59  
var (
	// defaultTestTimeout bounds operations that are expected to complete.
	defaultTestTimeout = 5 * time.Second
	// defaultTestShortTimeout is much smaller; presumably used to wait for
	// events that are expected NOT to happen — confirm at call sites.
	defaultTestShortTimeout = 10 * time.Millisecond
)
    64  
// s wraps grpctest.Tester so that methods declared with an (s) receiver are
// discovered and executed as subtests by grpctest.RunSubTests.
type s struct {
	grpctest.Tester
}

// Test is the single go-test entry point; it dispatches to all (s) methods.
func Test(t *testing.T) {
	grpctest.RunSubTests(t, s{})
}
    72  
// TestParseConfig verifies the ParseConfig() method in the Outlier Detection
// Balancer.
func (s) TestParseConfig(t *testing.T) {
	// Register a stub balancer whose ParseConfig always fails, to drive the
	// "child-policy-present-but-parse-error" case below.
	const errParseConfigName = "errParseConfigBalancer"
	stub.Register(errParseConfigName, stub.BalancerFuncs{
		ParseConfig: func(json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
			return nil, errors.New("some error")
		},
	})

	parser := bb{}
	// Expected default values for fields left unset in the input JSON.
	const (
		defaultInterval                       = iserviceconfig.Duration(10 * time.Second)
		defaultBaseEjectionTime               = iserviceconfig.Duration(30 * time.Second)
		defaultMaxEjectionTime                = iserviceconfig.Duration(300 * time.Second)
		defaultMaxEjectionPercent             = 10
		defaultSuccessRateStdevFactor         = 1900
		defaultEnforcingSuccessRate           = 100
		defaultSuccessRateMinimumHosts        = 5
		defaultSuccessRateRequestVolume       = 100
		defaultFailurePercentageThreshold     = 85
		defaultEnforcingFailurePercentage     = 0
		defaultFailurePercentageMinimumHosts  = 5
		defaultFailurePercentageRequestVolume = 50
	)
	tests := []struct {
		name    string
		input   string
		wantCfg serviceconfig.LoadBalancingConfig
		wantErr string
	}{
		{
			name: "no-fields-set-should-get-default",
			input: `{
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},

		{
			name: "some-top-level-fields-set",
			input: `{
				"interval": "15s",
				"maxEjectionTime": "350s",
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for unset fields.
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(15 * time.Second),
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    iserviceconfig.Duration(350 * time.Second),
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-but-no-fields",
			input: `{
				"successRateEjection": {},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get defaults of success-rate-ejection struct.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           defaultSuccessRateStdevFactor,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          defaultSuccessRateMinimumHosts,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-partially-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"minimumHosts": 5
				},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for others in success rate
			// ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          5,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-fully-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"enforcementPercentage": 50,
					"minimumHosts": 5,
					"requestVolume": 50
				},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: 50,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-but-no-fields",
			input: `{
				"failurePercentageEjection": {},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get defaults of failure percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             defaultFailurePercentageThreshold,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          defaultFailurePercentageMinimumHosts,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-partially-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"minimumHosts": 10
				},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for others in success rate
			// ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          10,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-fully-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"enforcementPercentage": 100,
					"minimumHosts": 10,
					"requestVolume": 40
                },
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: 100,
					MinimumHosts:          10,
					RequestVolume:         40,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{ // to make sure zero values aren't overwritten by defaults
			name: "lb-config-every-field-set-zero-value",
			input: `{
				"interval": "0s",
				"baseEjectionTime": "0s",
				"maxEjectionTime": "0s",
				"maxEjectionPercent": 0,
				"successRateEjection": {
					"stdevFactor": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"failurePercentageEjection": {
					"threshold": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				SuccessRateEjection:       &SuccessRateEjection{},
				FailurePercentageEjection: &FailurePercentageEjection{},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "lb-config-every-field-set",
			input: `{
				"interval": "10s",
				"baseEjectionTime": "30s",
				"maxEjectionTime": "300s",
				"maxEjectionPercent": 10,
				"successRateEjection": {
					"stdevFactor": 1900,
					"enforcementPercentage": 100,
					"minimumHosts": 5,
					"requestVolume": 100
				},
				"failurePercentageEjection": {
					"threshold": 85,
					"enforcementPercentage": 5,
					"minimumHosts": 5,
					"requestVolume": 50
				},
                "childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(10 * time.Second),
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1900,
					EnforcementPercentage: 100,
					MinimumHosts:          5,
					RequestVolume:         100,
				},
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             85,
					EnforcementPercentage: 5,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name:    "interval-is-negative",
			input:   `{"interval": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.interval = -10s; must be >= 0",
		},
		{
			name:    "base-ejection-time-is-negative",
			input:   `{"baseEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.base_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-time-is-negative",
			input:   `{"maxEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-percent-is-greater-than-100",
			input:   `{"maxEjectionPercent": 150}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_percent = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-success-rate-is-greater-than-100",
			input: `{
				"successRateEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.SuccessRateEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "failure-percentage-threshold-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"threshold": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.threshold = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-failure-percentage-ejection-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "child-policy-present-but-parse-error",
			input: `{
				"childPolicy": [
				{
					"errParseConfigBalancer": {
						"cluster": "test_cluster"
					}
				}
			]
			}`,
			wantErr: "error parsing loadBalancingConfig for policy \"errParseConfigBalancer\"",
		},
		{
			name: "no-supported-child-policy",
			input: `{
				"childPolicy": [
				{
					"doesNotExistBalancer": {
						"cluster": "test_cluster"
					}
				}
			]
			}`,
			wantErr: "invalid loadBalancingConfig: no supported policies found",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			gotCfg, gotErr := parser.ParseConfig(json.RawMessage(test.input))
			// Together these two checks assert that an error occurs iff
			// wantErr is non-empty, and that any returned error message
			// contains wantErr as a substring.
			if gotErr != nil && !strings.Contains(gotErr.Error(), test.wantErr) {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if (gotErr != nil) != (test.wantErr != "") {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			// Error cases have no config to compare.
			if test.wantErr != "" {
				return
			}
			if diff := cmp.Diff(gotCfg, test.wantCfg); diff != "" {
				t.Fatalf("parseConfig(%v) got unexpected output, diff (-got +want): %v", string(test.input), diff)
			}
		})
	}
}
   546  
   547  func (lbc *LBConfig) Equal(lbc2 *LBConfig) bool {
   548  	if !lbc.EqualIgnoringChildPolicy(lbc2) {
   549  		return false
   550  	}
   551  	return cmp.Equal(lbc.ChildPolicy, lbc2.ChildPolicy)
   552  }
   553  
// subConnWithState pairs a SubConn with a state delivered to one of its
// listeners, so tests can assert on which SubConn received which update.
type subConnWithState struct {
	sc    balancer.SubConn    // the SubConn the update was delivered for
	state balancer.SubConnState // the state carried by that update
}
   558  
// setup builds an outlier detection balancer wired to a fake balancer
// ClientConn. It registers a channelz channel as the balancer's parent
// (removed via t.Cleanup) and returns the balancer, the fake ClientConn for
// observing the balancer's upcalls, and a function that closes the balancer.
func setup(t *testing.T) (*outlierDetectionBalancer, *testutils.BalancerClientConn, func()) {
	t.Helper()
	builder := balancer.Get(Name)
	if builder == nil {
		t.Fatalf("balancer.Get(%q) returned nil", Name)
	}
	tcc := testutils.NewBalancerClientConn(t)
	ch := channelz.RegisterChannel(nil, "test channel")
	t.Cleanup(func() { channelz.RemoveEntry(ch.ID) })
	odB := builder.Build(tcc, balancer.BuildOptions{ChannelzParent: ch})
	return odB.(*outlierDetectionBalancer), tcc, odB.Close
}
   571  
// emptyChildConfig is a no-op LoadBalancingConfig used as a placeholder child
// policy configuration in these tests.
type emptyChildConfig struct {
	serviceconfig.LoadBalancingConfig
}
   575  
// TestChildBasicOperations tests basic operations of the Outlier Detection
// Balancer and its interaction with its child. The following scenarios are
// tested, in a step by step fashion:
// 1. The Outlier Detection Balancer receives its first good configuration. The
// balancer is expected to create a child and send the child its configuration.
// 2. The Outlier Detection Balancer receives new configuration that specifies a
// child's type, and the new type immediately reports READY inline. The first
// child balancer should be closed and the second child balancer should receive
// a config update.
// 3. The Outlier Detection Balancer is closed. The second child balancer should
// be closed.
func (s) TestChildBasicOperations(t *testing.T) {
	bc := emptyChildConfig{}

	// ccsCh observes config updates reaching the children; closeCh observes
	// child Close() calls. Both stub children share these channels.
	ccsCh := testutils.NewChannel()
	closeCh := testutils.NewChannel()

	stub.Register(t.Name()+"child1", stub.BalancerFuncs{
		UpdateClientConnState: func(_ *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccsCh.Send(ccs.BalancerConfig)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	stub.Register(t.Name()+"child2", stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			// UpdateState inline to READY to complete graceful switch process
			// synchronously from any UpdateClientConnState call.
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker:            &testutils.TestConstPicker{},
			})
			ccsCh.Send(nil)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	od, tcc, _ := setup(t)

	// This first config update should cause a child to be built and forwarded
	// its first update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child1",
				Config: bc,
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	cr, err := ccsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the first child balancer: %v", err)
	}
	if _, ok := cr.(emptyChildConfig); !ok {
		t.Fatalf("Received child policy config of type %T, want %T", cr, emptyChildConfig{})
	}

	// This Update Client Conn State call should cause the first child balancer
	// to close, and a new child to be created and also forwarded its first
	// config update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: math.MaxInt64,
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child2",
				Config: emptyChildConfig{},
			},
		},
	})

	// Verify inline UpdateState() call from the new child eventually makes its
	// way to the Test Client Conn.
	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case state := <-tcc.NewStateCh:
		if state != connectivity.Ready {
			t.Fatalf("ClientConn received connectivity state %v, want %v", state, connectivity.Ready)
		}
	}

	// Verify the first child balancer closed.
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the first child balancer to be closed: %v", err)
	}
	// Verify the second child balancer received its first config update.
	if _, err = ccsCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the second child balancer: %v", err)
	}
	// Closing the Outlier Detection Balancer should close the newly created
	// child.
	od.Close()
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the second child balancer to be closed: %v", err)
	}
}
   681  
   682  // TestUpdateAddresses tests the functionality of UpdateAddresses and any
   683  // changes in the addresses/plurality of those addresses for a SubConn. The
   684  // Balancer is set up with two upstreams, with one of the upstreams being
   685  // ejected. Initially, there is one SubConn for each address. The following
   686  // scenarios are tested, in a step by step fashion:
   687  // 1. The SubConn not currently ejected switches addresses to the address that
   688  // is ejected. This should cause the SubConn to get ejected.
   689  // 2. Update this same SubConn to multiple addresses. This should cause the
   690  // SubConn to get unejected, as it is no longer being tracked by Outlier
   691  // Detection at that point.
   692  // 3. Update this same SubConn to different addresses, still multiple. This
   693  // should be a noop, as the SubConn is still no longer being tracked by Outlier
   694  // Detection.
// 4. Update this same SubConn to a single address which is ejected. This
// should cause the SubConn to be ejected.
   697  func (s) TestUpdateAddresses(t *testing.T) {
   698  	scsCh := testutils.NewChannel()
   699  	var scw1, scw2 balancer.SubConn
   700  	var err error
   701  	connectivityCh := make(chan struct{})
   702  	stub.Register(t.Name(), stub.BalancerFuncs{
   703  		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
   704  			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
   705  				StateListener: func(balancer.SubConnState) {},
   706  			})
   707  			if err != nil {
   708  				t.Errorf("error in od.NewSubConn call: %v", err)
   709  			}
   710  			scw1.Connect()
   711  			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
   712  				StateListener: func(state balancer.SubConnState) {
   713  					if state.ConnectivityState == connectivity.Ready {
   714  						close(connectivityCh)
   715  					}
   716  				},
   717  			})
   718  			if err != nil {
   719  				t.Errorf("error in od.NewSubConn call: %v", err)
   720  			}
   721  			scw2.Connect()
   722  			bd.ClientConn.UpdateState(balancer.State{
   723  				ConnectivityState: connectivity.Ready,
   724  				Picker: &rrPicker{
   725  					scs: []balancer.SubConn{scw1, scw2},
   726  				},
   727  			})
   728  			return nil
   729  		},
   730  	})
   731  
   732  	od, tcc, cleanup := setup(t)
   733  	defer cleanup()
   734  
   735  	od.UpdateClientConnState(balancer.ClientConnState{
   736  		ResolverState: resolver.State{
   737  			Endpoints: []resolver.Endpoint{
   738  				{Addresses: []resolver.Address{{Addr: "address1"}}},
   739  				{Addresses: []resolver.Address{{Addr: "address2"}}},
   740  			},
   741  		},
   742  		BalancerConfig: &LBConfig{
   743  			Interval:           iserviceconfig.Duration(10 * time.Second),
   744  			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
   745  			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
   746  			MaxEjectionPercent: 10,
   747  			FailurePercentageEjection: &FailurePercentageEjection{
   748  				Threshold:             50,
   749  				EnforcementPercentage: 100,
   750  				MinimumHosts:          2,
   751  				RequestVolume:         3,
   752  			},
   753  			ChildPolicy: &iserviceconfig.BalancerConfig{
   754  				Name:   t.Name(),
   755  				Config: emptyChildConfig{},
   756  			},
   757  		},
   758  	})
   759  
   760  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   761  	defer cancel()
   762  
   763  	// Transition SubConns to READY so that they can register a health listener.
   764  	for range 2 {
   765  		select {
   766  		case <-ctx.Done():
   767  			t.Fatalf("Timed out waiting for creation of new SubConn.")
   768  		case sc := <-tcc.NewSubConnCh:
   769  			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
   770  			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
   771  		}
   772  	}
   773  
   774  	// Register health listeners after all the connectivity updates are
   775  	// processed to avoid data races while accessing the health listener within
   776  	// the TestClientConn.
   777  	select {
   778  	case <-ctx.Done():
   779  		t.Fatal("Context timed out waiting for all SubConns to become READY.")
   780  	case <-connectivityCh:
   781  	}
   782  
   783  	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
   784  		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
   785  	})
   786  	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
   787  		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
   788  	})
   789  
   790  	// Setup the system to where one address is ejected and one address
   791  	// isn't.
   792  	select {
   793  	case <-ctx.Done():
   794  		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
   795  	case picker := <-tcc.NewPickerCh:
   796  		pi, err := picker.Pick(balancer.PickInfo{})
   797  		if err != nil {
   798  			t.Fatalf("picker.Pick failed with error: %v", err)
   799  		}
   800  		// Simulate 5 successful RPC calls on the first SubConn (the first call
   801  		// to picker.Pick).
   802  		for c := 0; c < 5; c++ {
   803  			pi.Done(balancer.DoneInfo{})
   804  		}
   805  		pi, err = picker.Pick(balancer.PickInfo{})
   806  		if err != nil {
   807  			t.Fatalf("picker.Pick failed with error: %v", err)
   808  		}
   809  		// Simulate 5 failed RPC calls on the second SubConn (the second call to
   810  		// picker.Pick). Thus, when the interval timer algorithm is run, the
   811  		// second SubConn's address should be ejected, which will allow us to
   812  		// further test UpdateAddresses() logic.
   813  		for c := 0; c < 5; c++ {
   814  			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
   815  		}
   816  		od.intervalTimerAlgorithm()
   817  		// verify StateListener() got called with TRANSIENT_FAILURE for child
   818  		// with address that was ejected.
   819  		gotSCWS, err := scsCh.Receive(ctx)
   820  		if err != nil {
   821  			t.Fatalf("Error waiting for Sub Conn update: %v", err)
   822  		}
   823  		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
   824  			sc:    scw2,
   825  			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
   826  		}); err != nil {
   827  			t.Fatalf("Error in Sub Conn update: %v", err)
   828  		}
   829  	}
   830  
   831  	// Update scw1 to another address that is currently ejected. This should
   832  	// cause scw1 to get ejected.
   833  	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})
   834  
   835  	// Verify that update addresses gets forwarded to ClientConn.
   836  	select {
   837  	case <-ctx.Done():
   838  		t.Fatal("timeout while waiting for a UpdateState call on the ClientConn")
   839  	case <-tcc.UpdateAddressesAddrsCh:
   840  	}
   841  	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
   842  	gotSCWS, err := scsCh.Receive(ctx)
   843  	if err != nil {
   844  		t.Fatalf("Error waiting for Sub Conn update: %v", err)
   845  	}
   846  	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
   847  		sc:    scw1,
   848  		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
   849  	}); err != nil {
   850  		t.Fatalf("Error in Sub Conn update: %v", err)
   851  	}
   852  
   853  	// Update scw1 to multiple addresses. This should cause scw1 to get
   854  	// unejected, as is it no longer being tracked for Outlier Detection.
   855  	od.UpdateAddresses(scw1, []resolver.Address{
   856  		{Addr: "address1"},
   857  		{Addr: "address2"},
   858  	})
   859  	// Verify scw1 got unejected (StateListener called with recent state).
   860  	gotSCWS, err = scsCh.Receive(ctx)
   861  	if err != nil {
   862  		t.Fatalf("Error waiting for Sub Conn update: %v", err)
   863  	}
   864  	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
   865  		sc:    scw1,
   866  		state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
   867  	}); err != nil {
   868  		t.Fatalf("Error in Sub Conn update: %v", err)
   869  	}
   870  
   871  	// Update scw1 to a different multiple addresses list. A change of addresses
   872  	// in which the plurality goes from multiple to multiple should be a no-op,
   873  	// as the address continues to be ignored by outlier detection.
   874  	od.UpdateAddresses(scw1, []resolver.Address{
   875  		{Addr: "address2"},
   876  		{Addr: "address3"},
   877  	})
   878  	// Verify no downstream effects.
   879  	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
   880  	defer cancel()
   881  	if _, err := scsCh.Receive(sCtx); err == nil {
   882  		t.Fatalf("no SubConn update should have been sent (no SubConn got ejected/unejected)")
   883  	}
   884  
   885  	// Update scw1 back to a single address, which is ejected. This should cause
   886  	// the SubConn to be re-ejected.
   887  	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})
   888  	// Verify scw1 got ejected (StateListener called with TRANSIENT FAILURE).
   889  	gotSCWS, err = scsCh.Receive(ctx)
   890  	if err != nil {
   891  		t.Fatalf("Error waiting for Sub Conn update: %v", err)
   892  	}
   893  	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
   894  		sc:    scw1,
   895  		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
   896  	}); err != nil {
   897  		t.Fatalf("Error in Sub Conn update: %v", err)
   898  	}
   899  }
   900  
   901  func scwsEqual(gotSCWS subConnWithState, wantSCWS subConnWithState) error {
   902  	if gotSCWS.sc != wantSCWS.sc || !cmp.Equal(gotSCWS.state, wantSCWS.state, cmp.AllowUnexported(subConnWrapper{}, endpointInfo{}, balancer.SubConnState{}), cmpopts.IgnoreFields(subConnWrapper{}, "scUpdateCh")) {
   903  		return fmt.Errorf("received SubConnState: %+v, want %+v", gotSCWS, wantSCWS)
   904  	}
   905  	return nil
   906  }
   907  
// rrPicker is a test-only picker that returns SubConns from scs in
// round-robin order.
type rrPicker struct {
	scs  []balancer.SubConn // SubConns to cycle through.
	next int                // index of the SubConn returned by the next Pick.
}
   912  
   913  func (rrp *rrPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) {
   914  	sc := rrp.scs[rrp.next]
   915  	rrp.next = (rrp.next + 1) % len(rrp.scs)
   916  	return balancer.PickResult{SubConn: sc}, nil
   917  }
   918  
// TestDurationOfInterval tests the configured interval timer.
// The following scenarios are tested:
// 1. The Outlier Detection Balancer receives its first config. The balancer
// should configure the timer with whatever is directly specified on the config.
// 2. The Outlier Detection Balancer receives a subsequent config. The balancer
// should configure with whatever interval is configured minus the difference
// between the current time and the previous start timestamp.
// 3. The Outlier Detection Balancer receives a no-op configuration. The
// balancer should not configure a timer at all.
func (s) TestDurationOfInterval(t *testing.T) {
	// Register a stub child policy that does nothing; only the interval timer
	// logic of the outlier detection balancer is exercised here.
	stub.Register(t.Name(), stub.BalancerFuncs{})

	od, _, cleanup := setup(t)
	// Restore the overridden afterFunc (and run the balancer cleanup) when
	// the test finishes.
	defer func(af func(d time.Duration, f func()) *time.Timer) {
		cleanup()
		afterFunc = af
	}(afterFunc)

	// Intercept timer creation: capture the duration the balancer requests
	// and hand back a timer that effectively never fires, so the interval
	// algorithm never runs on its own.
	durationChan := testutils.NewChannel()
	afterFunc = func(dur time.Duration, _ func()) *time.Timer {
		durationChan.Send(dur)
		return time.NewTimer(math.MaxInt64)
	}

	// Scenario 1: first config — timer should use the configured interval
	// verbatim.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(8 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	d, err := durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur := d.(time.Duration)
	// The configured duration should be 8 seconds - what the balancer was
	// configured with.
	if dur != 8*time.Second {
		t.Fatalf("configured duration should have been 8 seconds to start timer")
	}

	// Override time.Now to time.Now() + 5 seconds. This will represent 5
	// seconds already passing for the next check in UpdateClientConnState.
	defer func(n func() time.Time) {
		now = n
	}(now)
	now = func() time.Time {
		return time.Now().Add(time.Second * 5)
	}

	// Scenario 2: UpdateClientConnState with an interval of 9 seconds. Due to
	// 5 seconds already passing (from overridden time.Now function), this
	// should start an interval timer of ~4 seconds.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(9 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	d, err = durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur = d.(time.Duration)
	// Allow half a second of slack on either side of the expected 4s to
	// avoid flakiness from real wall-clock time elapsing during the test.
	if dur.Seconds() < 3.5 || 4.5 < dur.Seconds() {
		t.Fatalf("configured duration should have been around 4 seconds to start timer")
	}

	// Scenario 3: UpdateClientConnState with a no-op config (no ejection
	// algorithm set). This shouldn't configure the interval timer at all due
	// to it being a no-op.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(10 * time.Second),
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// No timer should have been started.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err = durationChan.Receive(sCtx); err == nil {
		t.Fatal("No timer should have started.")
	}
}
  1027  
// TestEjectUnejectSuccessRate tests the functionality of the interval timer
// algorithm when configured with SuccessRateEjection. The Outlier Detection
// Balancer will be set up with 3 SubConns, each with a different address.
// It tests the following scenarios, in a step by step fashion:
// 1. The three addresses each have 5 successes. The interval timer algorithm should
// not eject any of the addresses.
// 2. Two of the addresses have 5 successes, the third has five failures. The
// interval timer algorithm should eject the third address with five failures.
// 3. The interval timer algorithm is run at a later time past max ejection
// time. The interval timer algorithm should uneject the third address.
//
// NOTE(review): despite the test name, the LBConfig below sets
// FailurePercentageEjection rather than SuccessRateEjection. With 5/5
// failures on one address either algorithm ejects it — confirm which
// algorithm was intended to be under test.
func (s) TestEjectUnejectSuccessRate(t *testing.T) {
	// scsCh receives every health-state update the child balancer observes,
	// so the test can assert on eject/uneject notifications.
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	// connectivityCh is closed once the third SubConn reports READY, which
	// signals that all connectivity updates have been processed.
	connectivityCh := make(chan struct{})
	// Child policy stub: creates three SubConns (one address each) and
	// publishes a round-robin picker over them.
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw1.Connect()
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2.Connect()
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) {
					if state.ConnectivityState == connectivity.Ready {
						close(connectivityCh)
					}
				},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3.Connect()
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2, scw3},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Transition the SubConns to READY so that they can register health
	// listeners.
	for range 3 {
		select {
		case <-ctx.Done():
			t.Fatalf("Timed out waiting for creation of new SubConn.")
		case sc := <-tcc.NewSubConnCh:
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
		}
	}

	// Register health listeners after all the connectivity updates are
	// processed to avoid data races while accessing the health listener within
	// the TestClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("Context timed out waiting for all SubConns to become READY.")
	case <-connectivityCh:
	}

	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
	})
	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
	})
	scw3.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw3, state: healthState})
	})

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Scenario 1: Set each of the three upstream addresses to have five
		// successes each. This should cause none of the addresses to be
		// ejected as none of them are outliers according to the success rate
		// algorithm.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()

		// verify no StateListener() call on the child, as no addresses got
		// ejected (ejected address will cause an StateListener call).
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Since no addresses are ejected, a SubConn update should forward down
		// to the child.
		od.scUpdateCh.Put(&scHealthUpdate{
			scw: scw1.(*subConnWrapper),
			state: balancer.SubConnState{
				ConnectivityState: connectivity.Connecting,
			}},
		)

		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw1,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Scenario 2: Set two of the upstream addresses to have five successes
		// each, and one of the upstream addresses to have five failures. This
		// should cause the address which has five failures to be ejected
		// according to the SuccessRateAlgorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// The rrPicker hands out SubConns in order, so the third pick must be
		// scw3 — the one that will record the failures.
		if got, want := pi.SubConn, scw3.(*subConnWrapper).SubConn; got != want {
			t.Fatalf("Unexpected SubConn chosen by picker: got %v, want %v", got, want)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// should eject address that always errored.
		od.intervalTimerAlgorithm()
		// Due to the address being ejected, the SubConn with that address
		// should be ejected, meaning a TRANSIENT_FAILURE connectivity state
		// gets reported to the child.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
		// Only one address should be ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Now that an address is ejected, SubConn updates for SubConns using
		// that address should not be forwarded downward. These SubConn updates
		// will be cached to update the child sometime in the future when the
		// address gets unejected.
		od.scUpdateCh.Put(&scHealthUpdate{
			scw:   scw3.(*subConnWrapper),
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		})
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("SubConn update should not have been forwarded (the SubConn is ejected)")
		}

		// Scenario 3: Override now to cause the interval timer algorithm to
		// always uneject the ejected address. This will always uneject the
		// ejected address because this time is set way past the max ejection
		// time set in the configuration, which will make the next interval
		// timer algorithm run uneject any ejected addresses.
		defer func(n func() time.Time) {
			now = n
		}(now)
		now = func() time.Time {
			return time.Now().Add(time.Second * 1000)
		}
		od.intervalTimerAlgorithm()

		// unejected SubConn should report latest persisted state - which is
		// connecting from earlier.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}
  1279  
// TestEjectFailureRate tests the functionality of the interval timer algorithm
// when configured with FailurePercentageEjection, and also the functionality of
// noop configuration. The Outlier Detection Balancer will be set up with 3
// SubConns, each with a different address. It tests the following scenarios, in
// a step by step fashion:
// 1. The three addresses each have 5 successes. The interval timer algorithm
// should not eject any of the addresses.
// 2. Two of the addresses have 5 successes, the third has five failures. The
// interval timer algorithm should eject the third address with five failures.
// 3. The Outlier Detection Balancer receives a subsequent noop config update.
// The balancer should uneject all ejected addresses.
//
// NOTE(review): despite the test name, the LBConfig below sets
// SuccessRateEjection rather than FailurePercentageEjection. With 5/5
// failures on one address either algorithm ejects it — confirm which
// algorithm was intended to be under test.
func (s) TestEjectFailureRate(t *testing.T) {
	// scsCh receives every health-state update the child balancer observes,
	// so the test can assert on eject/uneject notifications.
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	// connectivityCh is closed once the third SubConn reports READY, which
	// signals that all connectivity updates have been processed.
	connectivityCh := make(chan struct{})
	// Child policy stub: creates three SubConns (one address each) on the
	// first UpdateClientConnState call only; the later noop reconfiguration
	// must not recreate them.
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			if scw1 != nil { // UpdateClientConnState was already called, no need to recreate SubConns.
				return nil
			}
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw1.Connect()
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(balancer.SubConnState) {},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2.Connect()
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(scs balancer.SubConnState) {
					if scs.ConnectivityState == connectivity.Ready {
						close(connectivityCh)
					}
				},
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3.Connect()
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           500,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// Publish a round-robin picker over the three SubConns created above by
	// the stub child (UpdateClientConnState above runs the stub
	// synchronously, so scw1-3 are set at this point).
	od.UpdateState(balancer.State{
		ConnectivityState: connectivity.Ready,
		Picker: &rrPicker{
			scs: []balancer.SubConn{scw1, scw2, scw3},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Transition the SubConns to READY so that they can register health
	// listeners.
	for range 3 {
		select {
		case <-ctx.Done():
			t.Fatal("Timed out waiting for creation of new SubConn.")
		case sc := <-tcc.NewSubConnCh:
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting})
			sc.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready})
		}
	}
	// Register health listeners after all the connectivity updates are
	// processed to avoid data races while accessing the health listener within
	// the TestClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("Context timed out waiting for all SubConns to become READY.")
	case <-connectivityCh:
	}

	scw1.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw1, state: healthState})
	})
	scw2.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw2, state: healthState})
	})
	scw3.RegisterHealthListener(func(healthState balancer.SubConnState) {
		scsCh.Send(subConnWithState{sc: scw3, state: healthState})
	})

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Scenario 1: Set each upstream address to have five successes each.
		// This should cause none of the addresses to be ejected as none of
		// them are below the failure percentage threshold.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Scenario 2: Set two upstream addresses to have five successes each,
		// and one upstream address to have five failures. This should cause
		// the address with five failures to be ejected according to the
		// Failure Percentage Algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// should eject address that always errored.
		od.intervalTimerAlgorithm()

		// verify StateListener() got called with TRANSIENT_FAILURE for child
		// in address that was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// verify only one address got ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Scenario 3: upon the Outlier Detection balancer being reconfigured
		// with a noop configuration (no ejection algorithm set), every
		// ejected SubConn should be unejected.
		od.UpdateClientConnState(balancer.ClientConnState{
			ResolverState: resolver.State{
				Endpoints: []resolver.Endpoint{
					{Addresses: []resolver.Address{{Addr: "address1"}}},
					{Addresses: []resolver.Address{{Addr: "address2"}}},
					{Addresses: []resolver.Address{{Addr: "address3"}}},
				},
			},
			BalancerConfig: &LBConfig{
				Interval:           math.MaxInt64,
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name:   t.Name(),
					Config: emptyChildConfig{},
				},
			},
		})
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}
  1502  
  1503  // TestConcurrentOperations calls different operations on the balancer in
  1504  // separate goroutines to test for any race conditions and deadlocks. It also
  1505  // uses a child balancer which verifies that no operations on the child get
  1506  // called after the child balancer is closed.
  1507  func (s) TestConcurrentOperations(t *testing.T) {
  1508  	closed := grpcsync.NewEvent()
  1509  	stub.Register(t.Name(), stub.BalancerFuncs{
  1510  		UpdateClientConnState: func(*stub.BalancerData, balancer.ClientConnState) error {
  1511  			if closed.HasFired() {
  1512  				t.Error("UpdateClientConnState was called after Close(), which breaks the balancer API")
  1513  			}
  1514  			return nil
  1515  		},
  1516  		ResolverError: func(*stub.BalancerData, error) {
  1517  			if closed.HasFired() {
  1518  				t.Error("ResolverError was called after Close(), which breaks the balancer API")
  1519  			}
  1520  		},
  1521  		Close: func(*stub.BalancerData) {
  1522  			closed.Fire()
  1523  		},
  1524  		ExitIdle: func(*stub.BalancerData) {
  1525  			if closed.HasFired() {
  1526  				t.Error("ExitIdle was called after Close(), which breaks the balancer API")
  1527  			}
  1528  		},
  1529  	})
  1530  
  1531  	od, tcc, cleanup := setup(t)
  1532  	defer func() {
  1533  		cleanup()
  1534  	}()
  1535  
  1536  	od.UpdateClientConnState(balancer.ClientConnState{
  1537  		ResolverState: resolver.State{
  1538  			Endpoints: []resolver.Endpoint{
  1539  				{Addresses: []resolver.Address{{Addr: "address1"}}},
  1540  				{Addresses: []resolver.Address{{Addr: "address2"}}},
  1541  				{Addresses: []resolver.Address{{Addr: "address3"}}},
  1542  			},
  1543  		},
  1544  		BalancerConfig: &LBConfig{
  1545  			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
  1546  			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
  1547  			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
  1548  			MaxEjectionPercent: 10,
  1549  			SuccessRateEjection: &SuccessRateEjection{ // Have both Success Rate and Failure Percentage to step through all the interval timer code
  1550  				StdevFactor:           500,
  1551  				EnforcementPercentage: 100,
  1552  				MinimumHosts:          3,
  1553  				RequestVolume:         3,
  1554  			},
  1555  			FailurePercentageEjection: &FailurePercentageEjection{
  1556  				Threshold:             50,
  1557  				EnforcementPercentage: 100,
  1558  				MinimumHosts:          3,
  1559  				RequestVolume:         3,
  1560  			},
  1561  			ChildPolicy: &iserviceconfig.BalancerConfig{
  1562  				Name:   t.Name(),
  1563  				Config: emptyChildConfig{},
  1564  			},
  1565  		},
  1566  	})
  1567  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1568  	defer cancel()
  1569  
  1570  	scw1, err := od.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
  1571  	if err != nil {
  1572  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1573  	}
  1574  	if err != nil {
  1575  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1576  	}
  1577  
  1578  	scw2, err := od.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{})
  1579  	if err != nil {
  1580  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1581  	}
  1582  
  1583  	scw3, err := od.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{})
  1584  	if err != nil {
  1585  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1586  	}
  1587  
  1588  	od.UpdateState(balancer.State{
  1589  		ConnectivityState: connectivity.Ready,
  1590  		Picker: &rrPicker{
  1591  			scs: []balancer.SubConn{scw2, scw3},
  1592  		},
  1593  	})
  1594  
  1595  	var picker balancer.Picker
  1596  	select {
  1597  	case <-ctx.Done():
  1598  		t.Fatalf("timeout while waiting for a UpdateState call on the ClientConn")
  1599  	case picker = <-tcc.NewPickerCh:
  1600  	}
  1601  
  1602  	finished := make(chan struct{})
  1603  	var wg sync.WaitGroup
  1604  	wg.Add(1)
  1605  	go func() {
  1606  		defer wg.Done()
  1607  		for {
  1608  			select {
  1609  			case <-finished:
  1610  				return
  1611  			default:
  1612  			}
  1613  			pi, err := picker.Pick(balancer.PickInfo{})
  1614  			if err != nil {
  1615  				continue
  1616  			}
  1617  			pi.Done(balancer.DoneInfo{})
  1618  			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
  1619  			time.Sleep(1 * time.Nanosecond)
  1620  		}
  1621  	}()
  1622  
  1623  	wg.Add(1)
  1624  	go func() {
  1625  		defer wg.Done()
  1626  		for {
  1627  			select {
  1628  			case <-finished:
  1629  				return
  1630  			default:
  1631  			}
  1632  			od.intervalTimerAlgorithm()
  1633  		}
  1634  	}()
  1635  
  1636  	// call Outlier Detection's balancer.ClientConn operations asynchronously.
  1637  	// balancer.ClientConn operations have no guarantee from the API to be
  1638  	// called synchronously.
  1639  	wg.Add(1)
  1640  	go func() {
  1641  		defer wg.Done()
  1642  		for {
  1643  			select {
  1644  			case <-finished:
  1645  				return
  1646  			default:
  1647  			}
  1648  			od.UpdateState(balancer.State{
  1649  				ConnectivityState: connectivity.Ready,
  1650  				Picker: &rrPicker{
  1651  					scs: []balancer.SubConn{scw2, scw3},
  1652  				},
  1653  			})
  1654  			time.Sleep(1 * time.Nanosecond)
  1655  		}
  1656  	}()
  1657  
  1658  	wg.Add(1)
  1659  	go func() {
  1660  		defer wg.Done()
  1661  		od.NewSubConn([]resolver.Address{{Addr: "address4"}}, balancer.NewSubConnOptions{})
  1662  	}()
  1663  
  1664  	wg.Add(1)
  1665  	go func() {
  1666  		defer wg.Done()
  1667  		scw1.Shutdown()
  1668  	}()
  1669  
  1670  	wg.Add(1)
  1671  	go func() {
  1672  		defer wg.Done()
  1673  		od.UpdateAddresses(scw2, []resolver.Address{{Addr: "address3"}})
  1674  	}()
  1675  
  1676  	// Call balancer.Balancers synchronously in this goroutine, upholding the
  1677  	// balancer.Balancer API guarantee of synchronous calls.
  1678  	od.UpdateClientConnState(balancer.ClientConnState{ // This will delete addresses and flip to no op
  1679  		ResolverState: resolver.State{
  1680  			Endpoints: []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "address1"}}}},
  1681  		},
  1682  		BalancerConfig: &LBConfig{
  1683  			Interval: math.MaxInt64,
  1684  			ChildPolicy: &iserviceconfig.BalancerConfig{
  1685  				Name:   t.Name(),
  1686  				Config: emptyChildConfig{},
  1687  			},
  1688  		},
  1689  	})
  1690  
  1691  	// Call balancer.Balancers synchronously in this goroutine, upholding the
  1692  	// balancer.Balancer API guarantee.
  1693  	od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{
  1694  		ConnectivityState: connectivity.Connecting,
  1695  	})
  1696  	od.ResolverError(errors.New("some error"))
  1697  	od.ExitIdle()
  1698  	od.Close()
  1699  	close(finished)
  1700  	wg.Wait()
  1701  }
  1702  
  1703  // Test verifies that outlier detection doesn't eject subchannels created by
  1704  // the new pickfirst balancer when pickfirst is a non-leaf policy, i.e. not
  1705  // under a petiole policy. When pickfirst is not under a petiole policy, it will
  1706  // not register a health listener. pickfirst will still set the address
  1707  // attribute to disable ejection through the raw connectivity listener. When
  1708  // Outlier Detection processes a health update and sees the health listener is
  1709  // enabled but a health listener is not registered, it will drop the ejection
  1710  // update.
  1711  func (s) TestPickFirstHealthListenerDisabled(t *testing.T) {
  1712  	backend := &stubserver.StubServer{
  1713  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1714  			return nil, errors.New("some error")
  1715  		},
  1716  	}
  1717  	if err := backend.StartServer(); err != nil {
  1718  		t.Fatalf("Failed to start backend: %v", err)
  1719  	}
  1720  	defer backend.Stop()
  1721  	t.Logf("Started bad TestService backend at: %q", backend.Address)
  1722  
  1723  	// The interval is intentionally kept very large, the interval algorithm
  1724  	// will be triggered manually.
  1725  	odCfg := &LBConfig{
  1726  		Interval:         iserviceconfig.Duration(300 * time.Second),
  1727  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  1728  		MaxEjectionTime:  iserviceconfig.Duration(500 * time.Second),
  1729  		FailurePercentageEjection: &FailurePercentageEjection{
  1730  			Threshold:             50,
  1731  			EnforcementPercentage: 100,
  1732  			MinimumHosts:          0,
  1733  			RequestVolume:         2,
  1734  		},
  1735  		MaxEjectionPercent: 100,
  1736  		ChildPolicy: &iserviceconfig.BalancerConfig{
  1737  			Name: pickfirstleaf.Name,
  1738  		},
  1739  	}
  1740  
  1741  	lbChan := make(chan *outlierDetectionBalancer, 1)
  1742  	bf := stub.BalancerFuncs{
  1743  		Init: func(bd *stub.BalancerData) {
  1744  			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  1745  			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
  1746  		},
  1747  		Close: func(bd *stub.BalancerData) {
  1748  			bd.ChildBalancer.Close()
  1749  		},
  1750  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  1751  			ccs.BalancerConfig = odCfg
  1752  			return bd.ChildBalancer.UpdateClientConnState(ccs)
  1753  		},
  1754  	}
  1755  
  1756  	stub.Register(t.Name(), bf)
  1757  
  1758  	opts := []grpc.DialOption{
  1759  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  1760  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  1761  	}
  1762  	cc, err := grpc.NewClient(backend.Address, opts...)
  1763  	if err != nil {
  1764  		t.Fatalf("grpc.NewClient() failed: %v", err)
  1765  	}
  1766  	defer cc.Close()
  1767  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1768  	defer cancel()
  1769  	testServiceClient := testgrpc.NewTestServiceClient(cc)
  1770  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1771  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  1772  
  1773  	// Failing request should not cause ejection.
  1774  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1775  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1776  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1777  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1778  
  1779  	// Run the interval algorithm.
  1780  	select {
  1781  	case <-ctx.Done():
  1782  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  1783  	case od := <-lbChan:
  1784  		od.intervalTimerAlgorithm()
  1785  	}
  1786  
  1787  	shortCtx, shortCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
  1788  	defer shortCancel()
  1789  	testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Ready)
  1790  }
  1791  
  1792  // Tests handling of endpoints with multiple addresses. The test creates two
  1793  // endpoints, each with two addresses. The first endpoint has a backend that
  1794  // always returns errors. The test verifies that the first endpoint is ejected
  1795  // after running the intervalTimerAlgorithm. The test stops the unhealthy
  1796  // backend and verifies that the second backend in the first endpoint is dialed
  1797  // but it doesn't receive requests due to its ejection status. The test stops
  1798  // the connected backend in the second endpoint and verifies that requests
  1799  // start going to the second address in the second endpoint. The test reduces
  1800  // the ejection interval and runs the intervalTimerAlgorithm again. The test
  1801  // verifies that the first endpoint is unejected and requests reach both
  1802  // endpoints.
  1803  func (s) TestMultipleAddressesPerEndpoint(t *testing.T) {
  1804  	unhealthyBackend := &stubserver.StubServer{
  1805  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1806  			return nil, errors.New("some error")
  1807  		},
  1808  	}
  1809  	if err := unhealthyBackend.StartServer(); err != nil {
  1810  		t.Fatalf("Failed to start backend: %v", err)
  1811  	}
  1812  	defer unhealthyBackend.Stop()
  1813  	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)
  1814  
  1815  	healthyBackends := make([]*stubserver.StubServer, 3)
  1816  	for i := 0; i < 3; i++ {
  1817  		healthyBackends[i] = stubserver.StartTestService(t, nil)
  1818  		defer healthyBackends[i].Stop()
  1819  	}
  1820  
  1821  	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
  1822  	if err != nil {
  1823  		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
  1824  	}
  1825  	// The interval is intentionally kept very large, the interval algorithm
  1826  	// will be triggered manually.
  1827  	odCfg := &LBConfig{
  1828  		Interval:         iserviceconfig.Duration(300 * time.Second),
  1829  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  1830  		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
  1831  		FailurePercentageEjection: &FailurePercentageEjection{
  1832  			Threshold:             50,
  1833  			EnforcementPercentage: 100,
  1834  			MinimumHosts:          0,
  1835  			RequestVolume:         2,
  1836  		},
  1837  		MaxEjectionPercent: 100,
  1838  		ChildPolicy: &iserviceconfig.BalancerConfig{
  1839  			Name:   weightedroundrobin.Name,
  1840  			Config: wrrCfg,
  1841  		},
  1842  	}
  1843  
  1844  	lbChan := make(chan *outlierDetectionBalancer, 1)
  1845  	bf := stub.BalancerFuncs{
  1846  		Init: func(bd *stub.BalancerData) {
  1847  			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  1848  			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
  1849  		},
  1850  		Close: func(bd *stub.BalancerData) {
  1851  			bd.ChildBalancer.Close()
  1852  		},
  1853  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  1854  			ccs.BalancerConfig = odCfg
  1855  			return bd.ChildBalancer.UpdateClientConnState(ccs)
  1856  		},
  1857  	}
  1858  
  1859  	stub.Register(t.Name(), bf)
  1860  	r := manual.NewBuilderWithScheme("whatever")
  1861  	endpoints := []resolver.Endpoint{
  1862  		{
  1863  			Addresses: []resolver.Address{
  1864  				{Addr: unhealthyBackend.Address},
  1865  				{Addr: healthyBackends[0].Address},
  1866  			},
  1867  		},
  1868  		{
  1869  			Addresses: []resolver.Address{
  1870  				{Addr: healthyBackends[1].Address},
  1871  				{Addr: healthyBackends[2].Address},
  1872  			},
  1873  		},
  1874  	}
  1875  
  1876  	r.InitialState(resolver.State{
  1877  		Endpoints: endpoints,
  1878  	})
  1879  	dialer := testutils.NewBlockingDialer()
  1880  	opts := []grpc.DialOption{
  1881  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  1882  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  1883  		grpc.WithResolvers(r),
  1884  		grpc.WithContextDialer(dialer.DialContext),
  1885  	}
  1886  	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
  1887  	if err != nil {
  1888  		t.Fatalf("grpc.NewClient() failed: %v", err)
  1889  	}
  1890  	defer cc.Close()
  1891  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1892  	defer cancel()
  1893  	client := testgrpc.NewTestServiceClient(cc)
  1894  	client.EmptyCall(ctx, &testpb.Empty{})
  1895  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  1896  
  1897  	// Wait until both endpoints start receiving requests.
  1898  	addrsSeen := map[string]bool{}
  1899  	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
  1900  		var peer peer.Peer
  1901  		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
  1902  		addrsSeen[peer.String()] = true
  1903  	}
  1904  
  1905  	if len(addrsSeen) < 2 {
  1906  		t.Fatalf("Context timed out waiting for requests to reach both endpoints.")
  1907  	}
  1908  
  1909  	// Make 2 requests to each endpoint and verify the first endpoint gets
  1910  	// ejected.
  1911  	for i := 0; i < 2*len(endpoints); i++ {
  1912  		client.EmptyCall(ctx, &testpb.Empty{})
  1913  	}
  1914  	var od *outlierDetectionBalancer
  1915  	select {
  1916  	case <-ctx.Done():
  1917  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  1918  	case od = <-lbChan:
  1919  	}
  1920  	od.intervalTimerAlgorithm()
  1921  
  1922  	// The first endpoint should be ejected, requests should only go to
  1923  	// endpoints[1].
  1924  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
  1925  		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
  1926  	}
  1927  
  1928  	// Shutdown the unhealthy backend. The second address in the endpoint should
  1929  	// be connected, but it should be ejected by outlier detection.
  1930  	hold := dialer.Hold(healthyBackends[0].Address)
  1931  	unhealthyBackend.Stop()
  1932  	if hold.Wait(ctx) != true {
  1933  		t.Fatalf("Timeout waiting for second address in endpoint[0] with address %q to be contacted", healthyBackends[0].Address)
  1934  	}
  1935  	hold.Resume()
  1936  
  1937  	// Verify requests go only to healthyBackends[1] for a short time.
  1938  	shortCtx, cancel := context.WithTimeout(ctx, defaultTestShortTimeout)
  1939  	defer cancel()
  1940  	for ; shortCtx.Err() == nil; <-time.After(time.Millisecond) {
  1941  		var peer peer.Peer
  1942  		if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
  1943  			if status.Code(err) != codes.DeadlineExceeded {
  1944  				t.Fatalf("EmptyCall() returned unexpected error %v", err)
  1945  			}
  1946  			break
  1947  		}
  1948  		if got, want := peer.Addr.String(), healthyBackends[1].Address; got != want {
  1949  			t.Fatalf("EmptyCall() went to unexpected backend: got %q, want %q", got, want)
  1950  		}
  1951  	}
  1952  
  1953  	// shutdown the connected backend in endpoints[1], requests should start
  1954  	// going to the second address in the same endpoint.
  1955  	healthyBackends[1].Stop()
  1956  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[1]}); err != nil {
  1957  		t.Fatalf("RPCs didn't go to second address in the second endpoint: %v", err)
  1958  	}
  1959  
  1960  	// Reduce the ejection interval and run the interval algorithm again, it
  1961  	// should uneject endpoints[0].
  1962  	odCfg.MaxEjectionTime = 0
  1963  	odCfg.BaseEjectionTime = 0
  1964  	<-time.After(time.Millisecond)
  1965  	r.UpdateState(resolver.State{Endpoints: endpoints})
  1966  	od.intervalTimerAlgorithm()
  1967  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[1]}); err != nil {
  1968  		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
  1969  	}
  1970  }
  1971  
  1972  // Tests that removing an address from an endpoint resets its ejection state.
  1973  // The test creates two endpoints, each with two addresses. The first endpoint
  1974  // has a backend that always returns errors. The test verifies that the first
  1975  // endpoint is ejected after running the intervalTimerAlgorithm. The test sends
  1976  // a resolver update that removes the first address in the ejected endpoint. The
  1977  // test verifies that requests start reaching the remaining address from the
  1978  // first endpoint.
  1979  func (s) TestEjectionStateResetsWhenEndpointAddressesChange(t *testing.T) {
  1980  	unhealthyBackend := &stubserver.StubServer{
  1981  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1982  			return nil, errors.New("some error")
  1983  		},
  1984  	}
  1985  	if err := unhealthyBackend.StartServer(); err != nil {
  1986  		t.Fatalf("Failed to start backend: %v", err)
  1987  	}
  1988  	defer unhealthyBackend.Stop()
  1989  	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)
  1990  
  1991  	healthyBackends := make([]*stubserver.StubServer, 3)
  1992  	for i := 0; i < 3; i++ {
  1993  		healthyBackends[i] = stubserver.StartTestService(t, nil)
  1994  		defer healthyBackends[i].Stop()
  1995  	}
  1996  
  1997  	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
  1998  	if err != nil {
  1999  		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
  2000  	}
  2001  	// The interval is intentionally kept very large, the interval algorithm
  2002  	// will be triggered manually.
  2003  	odCfg := &LBConfig{
  2004  		Interval:         iserviceconfig.Duration(300 * time.Second),
  2005  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  2006  		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
  2007  		FailurePercentageEjection: &FailurePercentageEjection{
  2008  			Threshold:             50,
  2009  			EnforcementPercentage: 100,
  2010  			MinimumHosts:          0,
  2011  			RequestVolume:         2,
  2012  		},
  2013  		MaxEjectionPercent: 100,
  2014  		ChildPolicy: &iserviceconfig.BalancerConfig{
  2015  			Name:   weightedroundrobin.Name,
  2016  			Config: wrrCfg,
  2017  		},
  2018  	}
  2019  
  2020  	lbChan := make(chan *outlierDetectionBalancer, 1)
  2021  	bf := stub.BalancerFuncs{
  2022  		Init: func(bd *stub.BalancerData) {
  2023  			bd.ChildBalancer = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  2024  			lbChan <- bd.ChildBalancer.(*outlierDetectionBalancer)
  2025  		},
  2026  		Close: func(bd *stub.BalancerData) {
  2027  			bd.ChildBalancer.Close()
  2028  		},
  2029  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  2030  			ccs.BalancerConfig = odCfg
  2031  			return bd.ChildBalancer.UpdateClientConnState(ccs)
  2032  		},
  2033  	}
  2034  
  2035  	stub.Register(t.Name(), bf)
  2036  	r := manual.NewBuilderWithScheme("whatever")
  2037  	endpoints := []resolver.Endpoint{
  2038  		{
  2039  			Addresses: []resolver.Address{
  2040  				{Addr: unhealthyBackend.Address},
  2041  				{Addr: healthyBackends[0].Address},
  2042  			},
  2043  		},
  2044  		{
  2045  			Addresses: []resolver.Address{
  2046  				{Addr: healthyBackends[1].Address},
  2047  				{Addr: healthyBackends[2].Address},
  2048  			},
  2049  		},
  2050  	}
  2051  
  2052  	r.InitialState(resolver.State{
  2053  		Endpoints: endpoints,
  2054  	})
  2055  	dialer := testutils.NewBlockingDialer()
  2056  	opts := []grpc.DialOption{
  2057  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  2058  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  2059  		grpc.WithResolvers(r),
  2060  		grpc.WithContextDialer(dialer.DialContext),
  2061  	}
  2062  	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
  2063  	if err != nil {
  2064  		t.Fatalf("grpc.NewClient() failed: %v", err)
  2065  	}
  2066  	defer cc.Close()
  2067  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  2068  	defer cancel()
  2069  	client := testgrpc.NewTestServiceClient(cc)
  2070  	client.EmptyCall(ctx, &testpb.Empty{})
  2071  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  2072  
  2073  	// Wait until both endpoints start receiving requests.
  2074  	addrsSeen := map[string]bool{}
  2075  	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
  2076  		var peer peer.Peer
  2077  		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
  2078  		addrsSeen[peer.String()] = true
  2079  	}
  2080  
  2081  	if len(addrsSeen) < 2 {
  2082  		t.Fatalf("Context timed out waiting for requests to reach both endpoints.")
  2083  	}
  2084  
  2085  	// Make 2 requests to each endpoint and verify the first endpoint gets
  2086  	// ejected.
  2087  	for i := 0; i < 2*len(endpoints); i++ {
  2088  		client.EmptyCall(ctx, &testpb.Empty{})
  2089  	}
  2090  	var od *outlierDetectionBalancer
  2091  	select {
  2092  	case <-ctx.Done():
  2093  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  2094  	case od = <-lbChan:
  2095  	}
  2096  	od.intervalTimerAlgorithm()
  2097  
  2098  	// The first endpoint should be ejected, requests should only go to
  2099  	// endpoints[1].
  2100  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
  2101  		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
  2102  	}
  2103  
  2104  	// Remove the first address from the first endpoint. This makes the first
  2105  	// endpoint a new endpoint for outlier detection, resetting its ejection
  2106  	// status.
  2107  	r.UpdateState(resolver.State{Endpoints: []resolver.Endpoint{
  2108  		{Addresses: []resolver.Address{endpoints[0].Addresses[1]}},
  2109  		endpoints[1],
  2110  	}})
  2111  	od.intervalTimerAlgorithm()
  2112  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[0]}); err != nil {
  2113  		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
  2114  	}
  2115  }