google.golang.org/grpc@v1.72.2/xds/internal/balancer/outlierdetection/balancer_test.go

/*
 *
 * Copyright 2022 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package outlierdetection

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	"google.golang.org/grpc"
	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/balancer/pickfirst/pickfirstleaf"
	"google.golang.org/grpc/balancer/weightedroundrobin"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/internal/balancer/stub"
	"google.golang.org/grpc/internal/channelz"
	"google.golang.org/grpc/internal/grpcsync"
	"google.golang.org/grpc/internal/grpctest"
	iserviceconfig "google.golang.org/grpc/internal/serviceconfig"
	"google.golang.org/grpc/internal/stubserver"
	"google.golang.org/grpc/internal/testutils"
	"google.golang.org/grpc/internal/testutils/roundrobin"
	"google.golang.org/grpc/peer"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/resolver/manual"
	"google.golang.org/grpc/serviceconfig"
	"google.golang.org/grpc/status"
	"google.golang.org/grpc/xds/internal/balancer/clusterimpl"

	testgrpc "google.golang.org/grpc/interop/grpc_testing"
	testpb "google.golang.org/grpc/interop/grpc_testing"
)

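// defaultTestTimeout bounds how long the tests below wait for events that are
// expected to happen; defaultTestShortTimeout is used when verifying that an
// event does not happen.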
var (
	defaultTestTimeout      = 5 * time.Second
	defaultTestShortTimeout = 10 * time.Millisecond
)

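// s embeds grpctest.Tester; every test method defined on s is run as a
// subtest of Test via grpctest.RunSubTests.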
type s struct {
	grpctest.Tester
}

func Test(t *testing.T) {
	grpctest.RunSubTests(t, s{})
}

// TestParseConfig verifies the ParseConfig() method in the Outlier Detection
// Balancer.
func (s) TestParseConfig(t *testing.T) {
	const errParseConfigName = "errParseConfigBalancer"
	stub.Register(errParseConfigName, stub.BalancerFuncs{
		ParseConfig: func(json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
			return nil, errors.New("some error")
		},
	})

	parser := bb{}
	const (
		defaultInterval                       = iserviceconfig.Duration(10 * time.Second)
		defaultBaseEjectionTime               = iserviceconfig.Duration(30 * time.Second)
		defaultMaxEjectionTime                = iserviceconfig.Duration(300 * time.Second)
		defaultMaxEjectionPercent             = 10
		defaultSuccessRateStdevFactor         = 1900
		defaultEnforcingSuccessRate           = 100
		defaultSuccessRateMinimumHosts        = 5
		defaultSuccessRateRequestVolume       = 100
		defaultFailurePercentageThreshold     = 85
		defaultEnforcingFailurePercentage     = 0
		defaultFailurePercentageMinimumHosts  = 5
		defaultFailurePercentageRequestVolume = 50
	)
	tests := []struct {
		name    string
		input   string
		wantCfg serviceconfig.LoadBalancingConfig
		wantErr string
	}{
		{
			name: "no-fields-set-should-get-default",
			input: `{
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},

		{
			name: "some-top-level-fields-set",
			input: `{
				"interval": "15s",
				"maxEjectionTime": "350s",
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for unset fields.
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(15 * time.Second),
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    iserviceconfig.Duration(350 * time.Second),
				MaxEjectionPercent: defaultMaxEjectionPercent,
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-but-no-fields",
			input: `{
				"successRateEjection": {},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get defaults of success-rate-ejection struct.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           defaultSuccessRateStdevFactor,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          defaultSuccessRateMinimumHosts,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-partially-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"minimumHosts": 5
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for others in success rate
			// ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: defaultEnforcingSuccessRate,
					MinimumHosts:          5,
					RequestVolume:         defaultSuccessRateRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "success-rate-ejection-present-fully-set",
			input: `{
				"successRateEjection": {
					"stdevFactor": 1000,
					"enforcementPercentage": 50,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1000,
					EnforcementPercentage: 50,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-but-no-fields",
			input: `{
				"failurePercentageEjection": {},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get defaults of failure percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             defaultFailurePercentageThreshold,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          defaultFailurePercentageMinimumHosts,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-partially-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"minimumHosts": 10
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			// Should get set fields + defaults for others in the failure
			// percentage ejection layer.
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: defaultEnforcingFailurePercentage,
					MinimumHosts:          10,
					RequestVolume:         defaultFailurePercentageRequestVolume,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "failure-percentage-ejection-present-fully-set",
			input: `{
				"failurePercentageEjection": {
					"threshold": 80,
					"enforcementPercentage": 100,
					"minimumHosts": 10,
					"requestVolume": 40
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           defaultInterval,
				BaseEjectionTime:   defaultBaseEjectionTime,
				MaxEjectionTime:    defaultMaxEjectionTime,
				MaxEjectionPercent: defaultMaxEjectionPercent,
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             80,
					EnforcementPercentage: 100,
					MinimumHosts:          10,
					RequestVolume:         40,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{ // to make sure zero values aren't overwritten by defaults
			name: "lb-config-every-field-set-zero-value",
			input: `{
				"interval": "0s",
				"baseEjectionTime": "0s",
				"maxEjectionTime": "0s",
				"maxEjectionPercent": 0,
				"successRateEjection": {
					"stdevFactor": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"failurePercentageEjection": {
					"threshold": 0,
					"enforcementPercentage": 0,
					"minimumHosts": 0,
					"requestVolume": 0
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				SuccessRateEjection:       &SuccessRateEjection{},
				FailurePercentageEjection: &FailurePercentageEjection{},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name: "lb-config-every-field-set",
			input: `{
				"interval": "10s",
				"baseEjectionTime": "30s",
				"maxEjectionTime": "300s",
				"maxEjectionPercent": 10,
				"successRateEjection": {
					"stdevFactor": 1900,
					"enforcementPercentage": 100,
					"minimumHosts": 5,
					"requestVolume": 100
				},
				"failurePercentageEjection": {
					"threshold": 85,
					"enforcementPercentage": 5,
					"minimumHosts": 5,
					"requestVolume": 50
				},
				"childPolicy": [
				{
					"xds_cluster_impl_experimental": {
						"cluster": "test_cluster"
					}
				}
				]
			}`,
			wantCfg: &LBConfig{
				Interval:           iserviceconfig.Duration(10 * time.Second),
				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
				MaxEjectionPercent: 10,
				SuccessRateEjection: &SuccessRateEjection{
					StdevFactor:           1900,
					EnforcementPercentage: 100,
					MinimumHosts:          5,
					RequestVolume:         100,
				},
				FailurePercentageEjection: &FailurePercentageEjection{
					Threshold:             85,
					EnforcementPercentage: 5,
					MinimumHosts:          5,
					RequestVolume:         50,
				},
				ChildPolicy: &iserviceconfig.BalancerConfig{
					Name: "xds_cluster_impl_experimental",
					Config: &clusterimpl.LBConfig{
						Cluster: "test_cluster",
					},
				},
			},
		},
		{
			name:    "interval-is-negative",
			input:   `{"interval": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.interval = -10s; must be >= 0",
		},
		{
			name:    "base-ejection-time-is-negative",
			input:   `{"baseEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.base_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-time-is-negative",
			input:   `{"maxEjectionTime": "-10s"}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_time = -10s; must be >= 0",
		},
		{
			name:    "max-ejection-percent-is-greater-than-100",
			input:   `{"maxEjectionPercent": 150}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.max_ejection_percent = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-success-rate-is-greater-than-100",
			input: `{
				"successRateEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.SuccessRateEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "failure-percentage-threshold-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"threshold": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.threshold = 150; must be <= 100",
		},
		{
			name: "enforcement-percentage-failure-percentage-ejection-is-greater-than-100",
			input: `{
				"failurePercentageEjection": {
					"enforcementPercentage": 150
				}
			}`,
			wantErr: "OutlierDetectionLoadBalancingConfig.FailurePercentageEjection.enforcement_percentage = 150; must be <= 100",
		},
		{
			name: "child-policy-present-but-parse-error",
			input: `{
				"childPolicy": [
				{
					"errParseConfigBalancer": {
						"cluster": "test_cluster"
					}
				}
			]
			}`,
			wantErr: "error parsing loadBalancingConfig for policy \"errParseConfigBalancer\"",
		},
		{
			name: "no-supported-child-policy",
			input: `{
				"childPolicy": [
				{
					"doesNotExistBalancer": {
						"cluster": "test_cluster"
					}
				}
			]
			}`,
			wantErr: "invalid loadBalancingConfig: no supported policies found",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			gotCfg, gotErr := parser.ParseConfig(json.RawMessage(test.input))
			if gotErr != nil && !strings.Contains(gotErr.Error(), test.wantErr) {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if (gotErr != nil) != (test.wantErr != "") {
				t.Fatalf("ParseConfig(%v) = %v, wantErr %v", test.input, gotErr, test.wantErr)
			}
			if test.wantErr != "" {
				return
			}
			if diff := cmp.Diff(gotCfg, test.wantCfg); diff != "" {
				t.Fatalf("ParseConfig(%v) got unexpected output, diff (-got +want): %v", string(test.input), diff)
			}
		})
	}
}

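// A minimal sketch (hypothetical, not part of this file) of how a caller
// outside of tests might exercise the same parser, assuming the balancer is
// registered under Name and the child policy (e.g. round_robin) is
// registered:
//
//	if parser, ok := balancer.Get(Name).(balancer.ConfigParser); ok {
//		cfg, err := parser.ParseConfig(json.RawMessage(
//			`{"interval": "10s", "childPolicy": [{"round_robin": {}}]}`))
//		_, _ = cfg, err
//	}

// Equal lets cmp compare two LBConfig values in the tests above; it compares
// the child policy in addition to the fields covered by
// EqualIgnoringChildPolicy.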
func (lbc *LBConfig) Equal(lbc2 *LBConfig) bool {
	if !lbc.EqualIgnoringChildPolicy(lbc2) {
		return false
	}
	return cmp.Equal(lbc.ChildPolicy, lbc2.ChildPolicy)
}

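// subConnWithState pairs a SubConn with a state delivered to its
// StateListener, so tests can assert on both together.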
type subConnWithState struct {
	sc    balancer.SubConn
	state balancer.SubConnState
}

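// setup constructs the registered Outlier Detection balancer on top of a
// testutils.BalancerClientConn, returning the balancer, the testing
// ClientConn, and a function that closes the balancer.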
func setup(t *testing.T) (*outlierDetectionBalancer, *testutils.BalancerClientConn, func()) {
	t.Helper()
	builder := balancer.Get(Name)
	if builder == nil {
		t.Fatalf("balancer.Get(%q) returned nil", Name)
	}
	tcc := testutils.NewBalancerClientConn(t)
	ch := channelz.RegisterChannel(nil, "test channel")
	t.Cleanup(func() { channelz.RemoveEntry(ch.ID) })
	odB := builder.Build(tcc, balancer.BuildOptions{ChannelzParent: ch})
	return odB.(*outlierDetectionBalancer), tcc, odB.Close
}

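// emptyChildConfig is a placeholder child policy config handed to the stub
// child balancers registered throughout these tests.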
type emptyChildConfig struct {
	serviceconfig.LoadBalancingConfig
}

// TestChildBasicOperations tests basic operations of the Outlier Detection
// Balancer and its interaction with its child. The following scenarios are
// tested, in a step by step fashion:
// 1. The Outlier Detection Balancer receives its first good configuration. The
// balancer is expected to create a child and send the child its configuration.
// 2. The Outlier Detection Balancer receives new configuration that specifies a
// child's type, and the new type immediately reports READY inline. The first
// child balancer should be closed and the second child balancer should receive
// a config update.
// 3. The Outlier Detection Balancer is closed. The second child balancer should
// be closed.
func (s) TestChildBasicOperations(t *testing.T) {
	bc := emptyChildConfig{}

	ccsCh := testutils.NewChannel()
	closeCh := testutils.NewChannel()

	stub.Register(t.Name()+"child1", stub.BalancerFuncs{
		UpdateClientConnState: func(_ *stub.BalancerData, ccs balancer.ClientConnState) error {
			ccsCh.Send(ccs.BalancerConfig)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	stub.Register(t.Name()+"child2", stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			// UpdateState inline to READY to complete graceful switch process
			// synchronously from any UpdateClientConnState call.
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker:            &testutils.TestConstPicker{},
			})
			ccsCh.Send(nil)
			return nil
		},
		Close: func(*stub.BalancerData) {
			closeCh.Send(nil)
		},
	})

	od, tcc, _ := setup(t)

	// This first config update should cause a child to be built and forwarded
	// its first update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child1",
				Config: bc,
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	cr, err := ccsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the first child balancer: %v", err)
	}
	if _, ok := cr.(emptyChildConfig); !ok {
		t.Fatalf("Received child policy config of type %T, want %T", cr, emptyChildConfig{})
	}

	// This UpdateClientConnState call should cause the first child balancer
	// to close, and a new child to be created and also forwarded its first
	// config update.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: math.MaxInt64,
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name() + "child2",
				Config: emptyChildConfig{},
			},
		},
	})

	// Verify that the inline UpdateState() call from the new child eventually
	// makes its way to the testing ClientConn.
	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for an UpdateState call on the ClientConn")
	case state := <-tcc.NewStateCh:
		if state != connectivity.Ready {
			t.Fatalf("ClientConn received connectivity state %v, want %v", state, connectivity.Ready)
		}
	}

	// Verify the first child balancer closed.
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the first child balancer to be closed: %v", err)
	}
	// Verify the second child balancer received its first config update.
	if _, err = ccsCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for UpdateClientConnState on the second child balancer: %v", err)
	}
	// Closing the Outlier Detection Balancer should close the newly created
	// child.
	od.Close()
	if _, err = closeCh.Receive(ctx); err != nil {
		t.Fatalf("timed out waiting for the second child balancer to be closed: %v", err)
	}
}

// TestUpdateAddresses tests the functionality of UpdateAddresses and any
// changes in the addresses/plurality of those addresses for a SubConn. The
// Balancer is set up with two upstreams, with one of the upstreams being
// ejected. Initially, there is one SubConn for each address. The following
// scenarios are tested, in a step by step fashion:
// 1. The SubConn not currently ejected switches addresses to the address that
// is ejected. This should cause the SubConn to get ejected.
// 2. Update this same SubConn to multiple addresses. This should cause the
// SubConn to get unejected, as it is no longer being tracked by Outlier
// Detection at that point.
// 3. Update this same SubConn to different addresses, still multiple. This
// should be a no-op, as the SubConn is still no longer being tracked by
// Outlier Detection.
// 4. Update this same SubConn to a single address which is ejected. This
// should cause the SubConn to be ejected.
func (s) TestUpdateAddresses(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer cleanup()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           iserviceconfig.Duration(10 * time.Second),
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          2,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Set up the system so that one address is ejected and one address
	// isn't.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for an UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 successful RPC calls on the first SubConn (the first call
		// to picker.Pick).
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{})
		}
		pi, err = picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		// Simulate 5 failed RPC calls on the second SubConn (the second call to
		// picker.Pick). Thus, when the interval timer algorithm is run, the
		// second SubConn's address should be ejected, which will allow us to
		// further test UpdateAddresses() logic.
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}
		od.intervalTimerAlgorithm()
		// Verify StateListener() was called with TRANSIENT_FAILURE for the
		// SubConn whose address was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw2,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}

	// Update scw1 to another address that is currently ejected. This should
	// cause scw1 to get ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})

	// Verify that the address update gets forwarded to the ClientConn.
	select {
	case <-ctx.Done():
		t.Fatal("timeout while waiting for an UpdateAddresses call on the ClientConn")
	case <-tcc.UpdateAddressesAddrsCh:
	}
	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
	gotSCWS, err := scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to multiple addresses. This should cause scw1 to get
	// unejected, as it is no longer being tracked by Outlier Detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address1"},
		{Addr: "address2"},
	})
	// Verify scw1 got unejected (StateListener called with the most recent
	// state).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.Idle},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}

	// Update scw1 to a different list of multiple addresses. A change of
	// addresses in which the plurality goes from multiple to multiple should
	// be a no-op, as the address continues to be ignored by outlier detection.
	od.UpdateAddresses(scw1, []resolver.Address{
		{Addr: "address2"},
		{Addr: "address3"},
	})
	// Verify no downstream effects.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err := scsCh.Receive(sCtx); err == nil {
		t.Fatalf("no SubConn update should have been sent (no SubConn got ejected/unejected)")
	}

	// Update scw1 back to a single address, which is ejected. This should cause
	// the SubConn to be re-ejected.
	od.UpdateAddresses(scw1, []resolver.Address{{Addr: "address2"}})
	// Verify scw1 got ejected (StateListener called with TRANSIENT_FAILURE).
	gotSCWS, err = scsCh.Receive(ctx)
	if err != nil {
		t.Fatalf("Error waiting for Sub Conn update: %v", err)
	}
	if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
		sc:    scw1,
		state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
	}); err != nil {
		t.Fatalf("Error in Sub Conn update: %v", err)
	}
}

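// scwsEqual returns an error if the received subConnWithState does not match
// the wanted one, ignoring the wrapper's internal update channel.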
func scwsEqual(gotSCWS subConnWithState, wantSCWS subConnWithState) error {
	if gotSCWS.sc != wantSCWS.sc || !cmp.Equal(gotSCWS.state, wantSCWS.state, cmp.AllowUnexported(subConnWrapper{}, endpointInfo{}, balancer.SubConnState{}), cmpopts.IgnoreFields(subConnWrapper{}, "scUpdateCh")) {
		return fmt.Errorf("received SubConnState: %+v, want %+v", gotSCWS, wantSCWS)
	}
	return nil
}

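// rrPicker round-robins across a fixed set of SubConns, letting tests drive a
// deterministic sequence of picks over the configured addresses.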
type rrPicker struct {
	scs  []balancer.SubConn
	next int
}

func (rrp *rrPicker) Pick(balancer.PickInfo) (balancer.PickResult, error) {
	sc := rrp.scs[rrp.next]
	rrp.next = (rrp.next + 1) % len(rrp.scs)
	return balancer.PickResult{SubConn: sc}, nil
}

// TestDurationOfInterval tests the configured interval timer.
// The following scenarios are tested:
// 1. The Outlier Detection Balancer receives its first config. The balancer
// should configure the timer with whatever is directly specified on the config.
// 2. The Outlier Detection Balancer receives a subsequent config. The balancer
// should configure the timer with the configured interval minus the time
// elapsed since the previous timer started.
// 3. The Outlier Detection Balancer receives a no-op configuration. The
// balancer should not configure a timer at all.
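//
// Concretely, for scenario 2: if the previous timer started at T0 and a new
// config with interval I arrives at T1, the next timer should fire roughly
// I - (T1 - T0) later (about 4 seconds in this test: a 9 second interval with
// 5 seconds already elapsed).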
func (s) TestDurationOfInterval(t *testing.T) {
	stub.Register(t.Name(), stub.BalancerFuncs{})

	od, _, cleanup := setup(t)
	defer func(af func(d time.Duration, f func()) *time.Timer) {
		cleanup()
		afterFunc = af
	}(afterFunc)

	durationChan := testutils.NewChannel()
	afterFunc = func(dur time.Duration, _ func()) *time.Timer {
		durationChan.Send(dur)
		return time.NewTimer(math.MaxInt64)
	}

	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(8 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()
	d, err := durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur := d.(time.Duration)
	// The configured duration should be 8 seconds, which is what the balancer
	// was configured with.
	if dur != 8*time.Second {
		t.Fatalf("configured duration to start the timer should have been 8 seconds, got %v", dur)
	}

	// Override time.Now to time.Now() + 5 seconds. This will represent 5
	// seconds already passing for the next check in UpdateClientConnState.
	defer func(n func() time.Time) {
		now = n
	}(now)
	now = func() time.Time {
		return time.Now().Add(time.Second * 5)
	}

	// UpdateClientConnState with an interval of 9 seconds. Due to 5 seconds
	// already passing (from the overridden time.Now function), this should
	// start an interval timer of ~4 seconds.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(9 * time.Second),
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           1900,
				EnforcementPercentage: 100,
				MinimumHosts:          5,
				RequestVolume:         100,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	d, err = durationChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Error receiving duration from afterFunc() call: %v", err)
	}
	dur = d.(time.Duration)
	if dur.Seconds() < 3.5 || 4.5 < dur.Seconds() {
		t.Fatalf("configured duration to start the timer should have been around 4 seconds, got %v", dur)
	}

	// UpdateClientConnState with a no-op config. This shouldn't configure the
	// interval timer at all due to it being a no-op.
	od.UpdateClientConnState(balancer.ClientConnState{
		BalancerConfig: &LBConfig{
			Interval: iserviceconfig.Duration(10 * time.Second),
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	// No timer should have been started.
	sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer cancel()
	if _, err = durationChan.Receive(sCtx); err == nil {
		t.Fatal("No timer should have started.")
	}
}

// TestEjectUnejectSuccessRate tests the functionality of the interval timer
// algorithm when configured with SuccessRateEjection. The Outlier Detection
// Balancer will be set up with 3 SubConns, each with a different address.
// It tests the following scenarios, in a step by step fashion:
// 1. The three addresses each have five successes. The interval timer
// algorithm should not eject any of the addresses.
// 2. Two of the addresses have five successes, the third has five failures.
// The interval timer algorithm should eject the third address with five
// failures.
// 3. The interval timer algorithm is run at a later time past max ejection
// time. The interval timer algorithm should uneject the third address.
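//
// Per the outlier detection spec, the success rate algorithm treats an
// address as an outlier when its success rate falls below
// mean - (stdevFactor/1000)*stdev, computed across all addresses with
// sufficient request volume.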
func (s) TestEjectUnejectSuccessRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			bd.ClientConn.UpdateState(balancer.State{
				ConnectivityState: connectivity.Ready,
				Picker: &rrPicker{
					scs: []balancer.SubConn{scw1, scw2, scw3},
				},
			})
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			FailurePercentageEjection: &FailurePercentageEjection{
				Threshold:             50,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for an UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Set each of the three upstream addresses to have five successes each.
		// This should cause none of the addresses to be ejected as none of them
		// are outliers according to the success rate algorithm.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()

		// Verify no StateListener() call on the child, as no addresses got
		// ejected (an ejected address would cause a StateListener call).
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Since no addresses are ejected, a SubConn update should forward down
		// to the child.
		od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{
			ConnectivityState: connectivity.Connecting,
		})

		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw1,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Set two of the upstream addresses to have five successes each, and
		// one of the upstream addresses to have five failures. This should
		// cause the address which has five failures to be ejected according to
		// the success rate algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		if got, want := pi.SubConn, scw3.(*subConnWrapper).SubConn; got != want {
			t.Fatalf("Unexpected SubConn chosen by picker: got %v, want %v", got, want)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// Should eject the address that always errored.
		od.intervalTimerAlgorithm()
		// Due to the address being ejected, the SubConn with that address
		// should be ejected, meaning a TRANSIENT_FAILURE connectivity state
		// gets reported to the child.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
		// Only one address should be ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Now that an address is ejected, SubConn updates for SubConns using
		// that address should not be forwarded downward. These SubConn updates
		// will be cached to update the child sometime in the future when the
		// address gets unejected.
		od.updateSubConnState(scw3.(*subConnWrapper), balancer.SubConnState{
			ConnectivityState: connectivity.Connecting,
		})
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("SubConn update should not have been forwarded (the SubConn is ejected)")
		}

		// Override now so that the next interval timer algorithm run unejects
		// the ejected address: the overridden time is set well past the max
		// ejection time in the configuration, so any ejected address is due to
		// be unejected.
		defer func(n func() time.Time) {
			now = n
		}(now)
		now = func() time.Time {
			return time.Now().Add(time.Second * 1000)
		}
		od.intervalTimerAlgorithm()

		// The unejected SubConn should report the most recently persisted
		// state, which is CONNECTING from earlier.
		gotSCWS, err = scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.Connecting},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}
	}
}

// TestEjectFailureRate tests the functionality of the interval timer algorithm
// when configured with FailurePercentageEjection, and also the functionality of
// no-op configuration. The Outlier Detection Balancer will be set up with 3
// SubConns, each with a different address. It tests the following scenarios, in
// a step by step fashion:
// 1. The three addresses each have five successes. The interval timer algorithm
// should not eject any of the addresses.
// 2. Two of the addresses have five successes, the third has five failures. The
// interval timer algorithm should eject the third address with five failures.
// 3. The Outlier Detection Balancer receives a subsequent no-op config update.
// The balancer should uneject all ejected addresses.
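//
// Under the failure percentage algorithm, an address is treated as an outlier
// when its failure percentage over the interval exceeds the configured
// threshold.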
func (s) TestEjectFailureRate(t *testing.T) {
	scsCh := testutils.NewChannel()
	var scw1, scw2, scw3 balancer.SubConn
	var err error
	stub.Register(t.Name(), stub.BalancerFuncs{
		UpdateClientConnState: func(bd *stub.BalancerData, _ balancer.ClientConnState) error {
			if scw1 != nil { // UpdateClientConnState was already called, no need to recreate SubConns.
				return nil
			}
			scw1, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw1, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw2, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw2, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			scw3, err = bd.ClientConn.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{
				StateListener: func(state balancer.SubConnState) { scsCh.Send(subConnWithState{sc: scw3, state: state}) },
			})
			if err != nil {
				t.Errorf("error in od.NewSubConn call: %v", err)
			}
			return nil
		},
	})

	od, tcc, cleanup := setup(t)
	defer func() {
		cleanup()
	}()

	od.UpdateClientConnState(balancer.ClientConnState{
		ResolverState: resolver.State{
			Endpoints: []resolver.Endpoint{
				{Addresses: []resolver.Address{{Addr: "address1"}}},
				{Addresses: []resolver.Address{{Addr: "address2"}}},
				{Addresses: []resolver.Address{{Addr: "address3"}}},
			},
		},
		BalancerConfig: &LBConfig{
			Interval:           math.MaxInt64, // so the interval will never run unless called manually in test.
			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
			MaxEjectionPercent: 10,
			SuccessRateEjection: &SuccessRateEjection{
				StdevFactor:           500,
				EnforcementPercentage: 100,
				MinimumHosts:          3,
				RequestVolume:         3,
			},
			ChildPolicy: &iserviceconfig.BalancerConfig{
				Name:   t.Name(),
				Config: emptyChildConfig{},
			},
		},
	})

	od.UpdateState(balancer.State{
		ConnectivityState: connectivity.Ready,
		Picker: &rrPicker{
			scs: []balancer.SubConn{scw1, scw2, scw3},
		},
	})

	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	select {
	case <-ctx.Done():
		t.Fatalf("timeout while waiting for an UpdateState call on the ClientConn")
	case picker := <-tcc.NewPickerCh:
		// Set each upstream address to have five successes each. This should
		// cause none of the addresses to be ejected, as none of them exceed
		// the failure percentage threshold.
		for i := 0; i < 3; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}

		od.intervalTimerAlgorithm()
		sCtx, cancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("no SubConn update should have been sent (no SubConn got ejected)")
		}

		// Set two upstream addresses to have five successes each, and one
		// upstream address to have five failures. This should cause the address
		// with five failures to be ejected according to the failure percentage
		// algorithm.
		for i := 0; i < 2; i++ {
			pi, err := picker.Pick(balancer.PickInfo{})
			if err != nil {
				t.Fatalf("picker.Pick failed with error: %v", err)
			}
			for c := 0; c < 5; c++ {
				pi.Done(balancer.DoneInfo{})
			}
		}
		pi, err := picker.Pick(balancer.PickInfo{})
		if err != nil {
			t.Fatalf("picker.Pick failed with error: %v", err)
		}
		for c := 0; c < 5; c++ {
			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
		}

		// Should eject the address that always errored.
		od.intervalTimerAlgorithm()

		// Verify StateListener() was called with TRANSIENT_FAILURE for the
		// SubConn whose address was ejected.
		gotSCWS, err := scsCh.Receive(ctx)
		if err != nil {
			t.Fatalf("Error waiting for Sub Conn update: %v", err)
		}
		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
			sc:    scw3,
			state: balancer.SubConnState{ConnectivityState: connectivity.TransientFailure},
		}); err != nil {
			t.Fatalf("Error in Sub Conn update: %v", err)
		}

		// Verify only one address got ejected.
		sCtx, cancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
		defer cancel()
		if _, err := scsCh.Receive(sCtx); err == nil {
			t.Fatalf("Only one SubConn update should have been sent (only one SubConn got ejected)")
		}

		// Upon the Outlier Detection balancer being reconfigured with a no-op
		// configuration, every ejected SubConn should be unejected.
  1356  		od.UpdateClientConnState(balancer.ClientConnState{
  1357  			ResolverState: resolver.State{
  1358  				Endpoints: []resolver.Endpoint{
  1359  					{Addresses: []resolver.Address{{Addr: "address1"}}},
  1360  					{Addresses: []resolver.Address{{Addr: "address2"}}},
  1361  					{Addresses: []resolver.Address{{Addr: "address3"}}},
  1362  				},
  1363  			},
  1364  			BalancerConfig: &LBConfig{
  1365  				Interval:           math.MaxInt64,
  1366  				BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
  1367  				MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
  1368  				MaxEjectionPercent: 10,
  1369  				ChildPolicy: &iserviceconfig.BalancerConfig{
  1370  					Name:   t.Name(),
  1371  					Config: emptyChildConfig{},
  1372  				},
  1373  			},
  1374  		})
  1375  		gotSCWS, err = scsCh.Receive(ctx)
  1376  		if err != nil {
  1377  			t.Fatalf("Error waiting for SubConn update: %v", err)
  1378  		}
  1379  		if err = scwsEqual(gotSCWS.(subConnWithState), subConnWithState{
  1380  			sc:    scw3,
  1381  			state: balancer.SubConnState{ConnectivityState: connectivity.Idle},
  1382  		}); err != nil {
  1383  			t.Fatalf("Error in SubConn update: %v", err)
  1384  		}
  1385  	}
  1386  }
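
// The ejection decision exercised above reduces, per address, to a
// failure-percentage predicate. The helper below is an illustrative sketch of
// that check under gRFC A50 semantics; it is not the balancer's actual
// implementation, and the function name is invented for this example.
func failurePercentageShouldEject(successes, failures uint32, cfg *FailurePercentageEjection) bool {
	volume := successes + failures
	if volume == 0 || volume < cfg.RequestVolume {
		// Not enough requests were observed this interval to make a
		// meaningful decision.
		return false
	}
	failurePct := float64(failures) / float64(volume) * 100
	// Addresses at or above the configured threshold become ejection
	// candidates, still subject to EnforcementPercentage, MinimumHosts and
	// MaxEjectionPercent in the full algorithm.
	return failurePct >= float64(cfg.Threshold)
}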
  1387  
  1388  // TestConcurrentOperations calls different operations on the balancer in
  1389  // separate goroutines to test for any race conditions and deadlocks. It also
  1390  // uses a child balancer which verifies that no operations on the child get
  1391  // called after the child balancer is closed.
  1392  func (s) TestConcurrentOperations(t *testing.T) {
  1393  	closed := grpcsync.NewEvent()
  1394  	stub.Register(t.Name(), stub.BalancerFuncs{
  1395  		UpdateClientConnState: func(*stub.BalancerData, balancer.ClientConnState) error {
  1396  			if closed.HasFired() {
  1397  				t.Error("UpdateClientConnState was called after Close(), which breaks the balancer API")
  1398  			}
  1399  			return nil
  1400  		},
  1401  		ResolverError: func(*stub.BalancerData, error) {
  1402  			if closed.HasFired() {
  1403  				t.Error("ResolverError was called after Close(), which breaks the balancer API")
  1404  			}
  1405  		},
  1406  		Close: func(*stub.BalancerData) {
  1407  			closed.Fire()
  1408  		},
  1409  		ExitIdle: func(*stub.BalancerData) {
  1410  			if closed.HasFired() {
  1411  				t.Error("ExitIdle was called after Close(), which breaks the balancer API")
  1412  			}
  1413  		},
  1414  	})
  1415  
  1416  	od, tcc, cleanup := setup(t)
  1417  	defer cleanup()
  1420  
  1421  	od.UpdateClientConnState(balancer.ClientConnState{
  1422  		ResolverState: resolver.State{
  1423  			Endpoints: []resolver.Endpoint{
  1424  				{Addresses: []resolver.Address{{Addr: "address1"}}},
  1425  				{Addresses: []resolver.Address{{Addr: "address2"}}},
  1426  				{Addresses: []resolver.Address{{Addr: "address3"}}},
  1427  			},
  1428  		},
  1429  		BalancerConfig: &LBConfig{
  1430  			Interval:           math.MaxInt64, // So the interval timer will never fire unless triggered manually in the test.
  1431  			BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
  1432  			MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
  1433  			MaxEjectionPercent: 10,
  1434  			SuccessRateEjection: &SuccessRateEjection{ // Configure both Success Rate and Failure Percentage to exercise all of the interval timer code.
  1435  				StdevFactor:           500,
  1436  				EnforcementPercentage: 100,
  1437  				MinimumHosts:          3,
  1438  				RequestVolume:         3,
  1439  			},
  1440  			FailurePercentageEjection: &FailurePercentageEjection{
  1441  				Threshold:             50,
  1442  				EnforcementPercentage: 100,
  1443  				MinimumHosts:          3,
  1444  				RequestVolume:         3,
  1445  			},
  1446  			ChildPolicy: &iserviceconfig.BalancerConfig{
  1447  				Name:   t.Name(),
  1448  				Config: emptyChildConfig{},
  1449  			},
  1450  		},
  1451  	})
  1452  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1453  	defer cancel()
  1454  
  1455  	scw1, err := od.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
  1456  	if err != nil {
  1457  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1458  	}
  1462  
  1463  	scw2, err := od.NewSubConn([]resolver.Address{{Addr: "address2"}}, balancer.NewSubConnOptions{})
  1464  	if err != nil {
  1465  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1466  	}
  1467  
  1468  	scw3, err := od.NewSubConn([]resolver.Address{{Addr: "address3"}}, balancer.NewSubConnOptions{})
  1469  	if err != nil {
  1470  		t.Fatalf("error in od.NewSubConn call: %v", err)
  1471  	}
  1472  
  1473  	od.UpdateState(balancer.State{
  1474  		ConnectivityState: connectivity.Ready,
  1475  		Picker: &rrPicker{
  1476  			scs: []balancer.SubConn{scw2, scw3},
  1477  		},
  1478  	})
  1479  
  1480  	var picker balancer.Picker
  1481  	select {
  1482  	case <-ctx.Done():
  1483  		t.Fatalf("timeout while waiting for an UpdateState call on the ClientConn")
  1484  	case picker = <-tcc.NewPickerCh:
  1485  	}
  1486  
  1487  	finished := make(chan struct{})
  1488  	var wg sync.WaitGroup
  1489  	wg.Add(1)
  1490  	go func() {
  1491  		defer wg.Done()
  1492  		for {
  1493  			select {
  1494  			case <-finished:
  1495  				return
  1496  			default:
  1497  			}
  1498  			pi, err := picker.Pick(balancer.PickInfo{})
  1499  			if err != nil {
  1500  				continue
  1501  			}
  1502  			pi.Done(balancer.DoneInfo{})
  1503  			pi.Done(balancer.DoneInfo{Err: errors.New("some error")})
  1504  			time.Sleep(1 * time.Nanosecond)
  1505  		}
  1506  	}()
  1507  
  1508  	wg.Add(1)
  1509  	go func() {
  1510  		defer wg.Done()
  1511  		for {
  1512  			select {
  1513  			case <-finished:
  1514  				return
  1515  			default:
  1516  			}
  1517  			od.intervalTimerAlgorithm()
  1518  		}
  1519  	}()
  1520  
  1521  	// Call Outlier Detection's balancer.ClientConn operations asynchronously;
  1522  	// the API gives no guarantee that balancer.ClientConn operations are
  1523  	// called synchronously.
  1524  	wg.Add(1)
  1525  	go func() {
  1526  		defer wg.Done()
  1527  		for {
  1528  			select {
  1529  			case <-finished:
  1530  				return
  1531  			default:
  1532  			}
  1533  			od.UpdateState(balancer.State{
  1534  				ConnectivityState: connectivity.Ready,
  1535  				Picker: &rrPicker{
  1536  					scs: []balancer.SubConn{scw2, scw3},
  1537  				},
  1538  			})
  1539  			time.Sleep(1 * time.Nanosecond)
  1540  		}
  1541  	}()
  1542  
  1543  	wg.Add(1)
  1544  	go func() {
  1545  		defer wg.Done()
  1546  		od.NewSubConn([]resolver.Address{{Addr: "address4"}}, balancer.NewSubConnOptions{})
  1547  	}()
  1548  
  1549  	wg.Add(1)
  1550  	go func() {
  1551  		defer wg.Done()
  1552  		scw1.Shutdown()
  1553  	}()
  1554  
  1555  	wg.Add(1)
  1556  	go func() {
  1557  		defer wg.Done()
  1558  		od.UpdateAddresses(scw2, []resolver.Address{{Addr: "address3"}})
  1559  	}()
  1560  
  1561  	// Call balancer.Balancer methods synchronously in this goroutine, upholding
  1562  	// the balancer.Balancer API guarantee of synchronous calls.
  1563  	od.UpdateClientConnState(balancer.ClientConnState{ // This will delete addresses and flip to a no-op config.
  1564  		ResolverState: resolver.State{
  1565  			Endpoints: []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "address1"}}}},
  1566  		},
  1567  		BalancerConfig: &LBConfig{
  1568  			Interval: math.MaxInt64,
  1569  			ChildPolicy: &iserviceconfig.BalancerConfig{
  1570  				Name:   t.Name(),
  1571  				Config: emptyChildConfig{},
  1572  			},
  1573  		},
  1574  	})
  1575  
  1576  	// Call balancer.Balancer methods synchronously in this goroutine, upholding
  1577  	// the balancer.Balancer API guarantee.
  1578  	od.updateSubConnState(scw1.(*subConnWrapper), balancer.SubConnState{
  1579  		ConnectivityState: connectivity.Connecting,
  1580  	})
  1581  	od.ResolverError(errors.New("some error"))
  1582  	od.ExitIdle()
  1583  	od.Close()
  1584  	close(finished)
  1585  	wg.Wait()
  1586  }
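
// The use-after-Close checks in the stub child above follow a small reusable
// pattern: grpcsync.Event is a one-shot flag, so any balancer callback can
// cheaply detect calls that arrive after Close. This is a hedged sketch of
// that pattern; the helper name is invented for illustration and is not part
// of the balancer package.
func guardAgainstUseAfterClose(t *testing.T, closed *grpcsync.Event, op string, fn func()) {
	t.Helper()
	if closed.HasFired() {
		// The balancer.Balancer API guarantees no calls after Close; report
		// the violation instead of proceeding.
		t.Errorf("%s was called after Close(), which breaks the balancer API", op)
		return
	}
	fn()
}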
  1587  
  1588  // Test verifies that outlier detection doesn't eject subchannels created by
  1589  // the new pickfirst balancer when pickfirst is used outside of a petiole
  1590  // policy. When pickfirst is not under a petiole policy, it will not register
  1591  // a health listener, but it will still set the address attribute that
  1592  // disables ejection through the raw connectivity listener. When Outlier
  1593  // Detection processes a health update and sees that health listening is
  1594  // enabled but no health listener is registered, it drops the ejection update.
  1596  func (s) TestPickFirstHealthListenerDisabled(t *testing.T) {
  1597  	backend := &stubserver.StubServer{
  1598  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1599  			return nil, errors.New("some error")
  1600  		},
  1601  	}
  1602  	if err := backend.StartServer(); err != nil {
  1603  		t.Fatalf("Failed to start backend: %v", err)
  1604  	}
  1605  	defer backend.Stop()
  1606  	t.Logf("Started bad TestService backend at: %q", backend.Address)
  1607  
  1608  	// The interval is intentionally kept very large; the interval algorithm
  1609  	// will be triggered manually in the test.
  1610  	odCfg := &LBConfig{
  1611  		Interval:         iserviceconfig.Duration(300 * time.Second),
  1612  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  1613  		MaxEjectionTime:  iserviceconfig.Duration(500 * time.Second),
  1614  		FailurePercentageEjection: &FailurePercentageEjection{
  1615  			Threshold:             50,
  1616  			EnforcementPercentage: 100,
  1617  			MinimumHosts:          0,
  1618  			RequestVolume:         2,
  1619  		},
  1620  		MaxEjectionPercent: 100,
  1621  		ChildPolicy: &iserviceconfig.BalancerConfig{
  1622  			Name: pickfirstleaf.Name,
  1623  		},
  1624  	}
  1625  
  1626  	lbChan := make(chan *outlierDetectionBalancer, 1)
  1627  	bf := stub.BalancerFuncs{
  1628  		Init: func(bd *stub.BalancerData) {
  1629  			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  1630  			lbChan <- bd.Data.(*outlierDetectionBalancer)
  1631  		},
  1632  		Close: func(bd *stub.BalancerData) {
  1633  			bd.Data.(balancer.Balancer).Close()
  1634  		},
  1635  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  1636  			ccs.BalancerConfig = odCfg
  1637  			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
  1638  		},
  1639  	}
  1640  
  1641  	stub.Register(t.Name(), bf)
  1642  
  1643  	opts := []grpc.DialOption{
  1644  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  1645  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  1646  	}
  1647  	cc, err := grpc.NewClient(backend.Address, opts...)
  1648  	if err != nil {
  1649  		t.Fatalf("grpc.NewClient() failed: %v", err)
  1650  	}
  1651  	defer cc.Close()
  1652  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1653  	defer cancel()
  1654  	testServiceClient := testgrpc.NewTestServiceClient(cc)
  1655  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1656  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  1657  
  1658  	// Failing requests should not cause ejection.
  1659  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1660  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1661  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1662  	testServiceClient.EmptyCall(ctx, &testpb.Empty{})
  1663  
  1664  	// Run the interval algorithm.
  1665  	select {
  1666  	case <-ctx.Done():
  1667  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  1668  	case od := <-lbChan:
  1669  		od.intervalTimerAlgorithm()
  1670  	}
  1671  
  1672  	shortCtx, shortCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
  1673  	defer shortCancel()
  1674  	testutils.AwaitNoStateChange(shortCtx, t, cc, connectivity.Ready)
  1675  }
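
// A rough sketch of the "drop" behavior described in the comment above: when
// ejection is supposed to be delivered via the health listener but no health
// listener was registered (pickfirst outside a petiole policy), the update
// goes nowhere. The boolean parameters and helper name are assumptions made
// for illustration; the real balancer tracks this state on its SubConn
// wrapper.
func maybeDeliverEjection(healthEnabled, healthListenerRegistered bool, deliver func(balancer.SubConnState)) {
	if healthEnabled && !healthListenerRegistered {
		// Nobody is listening for health updates, so the ejection cannot be
		// surfaced; drop it rather than eject via raw connectivity.
		return
	}
	deliver(balancer.SubConnState{ConnectivityState: connectivity.TransientFailure})
}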
  1676  
  1677  // Tests handling of endpoints with multiple addresses. The test creates two
  1678  // endpoints, each with two addresses, where the first endpoint has a backend
  1679  // that always returns errors, and verifies that the first endpoint is ejected
  1680  // after running the intervalTimerAlgorithm. It then stops the unhealthy
  1681  // backend and verifies that the second backend in the first endpoint is
  1682  // dialed but doesn't receive requests due to the endpoint's ejection status.
  1683  // Next, it stops the connected backend in the second endpoint and verifies
  1684  // that requests start going to the second address in that endpoint. Finally,
  1685  // it reduces the ejection times, runs the intervalTimerAlgorithm again, and
  1686  // verifies that the first endpoint is unejected and requests reach both
  1687  // endpoints.
  1688  func (s) TestMultipleAddressesPerEndpoint(t *testing.T) {
  1689  	unhealthyBackend := &stubserver.StubServer{
  1690  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1691  			return nil, errors.New("some error")
  1692  		},
  1693  	}
  1694  	if err := unhealthyBackend.StartServer(); err != nil {
  1695  		t.Fatalf("Failed to start backend: %v", err)
  1696  	}
  1697  	defer unhealthyBackend.Stop()
  1698  	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)
  1699  
  1700  	healthyBackends := make([]*stubserver.StubServer, 3)
  1701  	for i := 0; i < 3; i++ {
  1702  		healthyBackends[i] = stubserver.StartTestService(t, nil)
  1703  		defer healthyBackends[i].Stop()
  1704  	}
  1705  
  1706  	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
  1707  	if err != nil {
  1708  		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
  1709  	}
  1710  	// The interval is intentionally kept very large; the interval algorithm
  1711  	// will be triggered manually in the test.
  1712  	odCfg := &LBConfig{
  1713  		Interval:         iserviceconfig.Duration(300 * time.Second),
  1714  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  1715  		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
  1716  		FailurePercentageEjection: &FailurePercentageEjection{
  1717  			Threshold:             50,
  1718  			EnforcementPercentage: 100,
  1719  			MinimumHosts:          0,
  1720  			RequestVolume:         2,
  1721  		},
  1722  		MaxEjectionPercent: 100,
  1723  		ChildPolicy: &iserviceconfig.BalancerConfig{
  1724  			Name:   weightedroundrobin.Name,
  1725  			Config: wrrCfg,
  1726  		},
  1727  	}
  1728  
  1729  	lbChan := make(chan *outlierDetectionBalancer, 1)
  1730  	bf := stub.BalancerFuncs{
  1731  		Init: func(bd *stub.BalancerData) {
  1732  			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  1733  			lbChan <- bd.Data.(*outlierDetectionBalancer)
  1734  		},
  1735  		Close: func(bd *stub.BalancerData) {
  1736  			bd.Data.(balancer.Balancer).Close()
  1737  		},
  1738  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  1739  			ccs.BalancerConfig = odCfg
  1740  			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
  1741  		},
  1742  	}
  1743  
  1744  	stub.Register(t.Name(), bf)
  1745  	r := manual.NewBuilderWithScheme("whatever")
  1746  	endpoints := []resolver.Endpoint{
  1747  		{
  1748  			Addresses: []resolver.Address{
  1749  				{Addr: unhealthyBackend.Address},
  1750  				{Addr: healthyBackends[0].Address},
  1751  			},
  1752  		},
  1753  		{
  1754  			Addresses: []resolver.Address{
  1755  				{Addr: healthyBackends[1].Address},
  1756  				{Addr: healthyBackends[2].Address},
  1757  			},
  1758  		},
  1759  	}
  1760  
  1761  	r.InitialState(resolver.State{
  1762  		Endpoints: endpoints,
  1763  	})
  1764  	dialer := testutils.NewBlockingDialer()
  1765  	opts := []grpc.DialOption{
  1766  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  1767  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  1768  		grpc.WithResolvers(r),
  1769  		grpc.WithContextDialer(dialer.DialContext),
  1770  	}
  1771  	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
  1772  	if err != nil {
  1773  		t.Fatalf("grpc.NewClient() failed: %v", err)
  1774  	}
  1775  	defer cc.Close()
  1776  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1777  	defer cancel()
  1778  	client := testgrpc.NewTestServiceClient(cc)
  1779  	client.EmptyCall(ctx, &testpb.Empty{})
  1780  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  1781  
  1782  	// Wait until both endpoints start receiving requests.
  1783  	addrsSeen := map[string]bool{}
  1784  	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
  1785  		var peer peer.Peer
  1786  		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
  1787  		addrsSeen[peer.String()] = true
  1788  	}
  1789  
  1790  	if len(addrsSeen) < 2 {
  1791  		t.Fatal("Context timed out waiting for requests to reach both endpoints.")
  1792  	}
  1793  
  1794  	// Make 2 requests to each endpoint and verify the first endpoint gets
  1795  	// ejected.
  1796  	for i := 0; i < 2*len(endpoints); i++ {
  1797  		client.EmptyCall(ctx, &testpb.Empty{})
  1798  	}
  1799  	var od *outlierDetectionBalancer
  1800  	select {
  1801  	case <-ctx.Done():
  1802  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  1803  	case od = <-lbChan:
  1804  	}
  1805  	od.intervalTimerAlgorithm()
  1806  
  1807  	// The first endpoint should be ejected, requests should only go to
  1808  	// endpoints[1].
  1809  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
  1810  		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
  1811  	}
  1812  
  1813  	// Shut down the unhealthy backend. The second address in the endpoint
  1814  	// should get connected, but it should not receive requests since the
  1815  	// endpoint remains ejected by outlier detection.
  1815  	hold := dialer.Hold(healthyBackends[0].Address)
  1816  	unhealthyBackend.Stop()
  1817  	if !hold.Wait(ctx) {
  1818  		t.Fatalf("Timeout waiting for second address in endpoint[0] with address %q to be contacted", healthyBackends[0].Address)
  1819  	}
  1820  	hold.Resume()
  1821  
  1822  	// Verify requests go only to healthyBackends[1] for a short time.
  1823  	shortCtx, cancel := context.WithTimeout(ctx, defaultTestShortTimeout)
  1824  	defer cancel()
  1825  	for ; shortCtx.Err() == nil; <-time.After(time.Millisecond) {
  1826  		var peer peer.Peer
  1827  		if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
  1828  			if status.Code(err) != codes.DeadlineExceeded {
  1829  				t.Fatalf("EmptyCall() returned unexpected error %v", err)
  1830  			}
  1831  			break
  1832  		}
  1833  		if got, want := peer.Addr.String(), healthyBackends[1].Address; got != want {
  1834  			t.Fatalf("EmptyCall() went to unexpected backend: got %q, want %q", got, want)
  1835  		}
  1836  	}
  1837  
  1838  	// Shut down the connected backend in endpoints[1]; requests should start
  1839  	// going to the second address in the same endpoint.
  1840  	healthyBackends[1].Stop()
  1841  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[1]}); err != nil {
  1842  		t.Fatalf("RPCs didn't go to second address in the second endpoint: %v", err)
  1843  	}
  1844  
  1845  	// Reduce the ejection times and run the interval algorithm again; it
  1846  	// should uneject endpoints[0].
  1847  	odCfg.MaxEjectionTime = 0
  1848  	odCfg.BaseEjectionTime = 0
  1849  	<-time.After(time.Millisecond)
  1850  	r.UpdateState(resolver.State{Endpoints: endpoints})
  1851  	od.intervalTimerAlgorithm()
  1852  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[1]}); err != nil {
  1853  		t.Fatalf("RPCs didn't go to the second addresses of both endpoints: %v", err)
  1854  	}
  1855  }
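
// The unejection relied on at the end of the test above follows gRFC A50's
// timing rule: an endpoint becomes eligible for unejection once the current
// time passes
//
//	ejectionTime + min(BaseEjectionTime*multiplier, max(BaseEjectionTime, MaxEjectionTime))
//
// which is why zeroing BaseEjectionTime and MaxEjectionTime makes the next
// interval run uneject everything. This helper is an illustrative sketch; the
// name and signature are assumptions, not the balancer's internals.
func unejectionDeadline(ejectionTime time.Time, base, maxEjection time.Duration, multiplier int64) time.Time {
	backoff := time.Duration(multiplier) * base
	limit := base
	if maxEjection > limit {
		limit = maxEjection
	}
	if backoff > limit {
		backoff = limit
	}
	return ejectionTime.Add(backoff)
}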
  1856  
  1857  // Tests that removing an address from an endpoint resets its ejection state.
  1858  // The test creates two endpoints, each with two addresses, where the first
  1859  // endpoint has a backend that always returns errors, and verifies that the
  1860  // first endpoint is ejected after running the intervalTimerAlgorithm. It then
  1861  // sends a resolver update that removes the first address of the ejected
  1862  // endpoint and verifies that requests start reaching the remaining address
  1863  // of the first endpoint.
  1864  func (s) TestEjectionStateResetsWhenEndpointAddressesChange(t *testing.T) {
  1865  	unhealthyBackend := &stubserver.StubServer{
  1866  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) {
  1867  			return nil, errors.New("some error")
  1868  		},
  1869  	}
  1870  	if err := unhealthyBackend.StartServer(); err != nil {
  1871  		t.Fatalf("Failed to start backend: %v", err)
  1872  	}
  1873  	defer unhealthyBackend.Stop()
  1874  	t.Logf("Started unhealthy TestService backend at: %q", unhealthyBackend.Address)
  1875  
  1876  	healthyBackends := make([]*stubserver.StubServer, 3)
  1877  	for i := 0; i < 3; i++ {
  1878  		healthyBackends[i] = stubserver.StartTestService(t, nil)
  1879  		defer healthyBackends[i].Stop()
  1880  	}
  1881  
  1882  	wrrCfg, err := balancer.Get(weightedroundrobin.Name).(balancer.ConfigParser).ParseConfig(json.RawMessage("{}"))
  1883  	if err != nil {
  1884  		t.Fatalf("Failed to parse %q config: %v", weightedroundrobin.Name, err)
  1885  	}
  1886  	// The interval is intentionally kept very large; the interval algorithm
  1887  	// will be triggered manually in the test.
  1888  	odCfg := &LBConfig{
  1889  		Interval:         iserviceconfig.Duration(300 * time.Second),
  1890  		BaseEjectionTime: iserviceconfig.Duration(300 * time.Second),
  1891  		MaxEjectionTime:  iserviceconfig.Duration(300 * time.Second),
  1892  		FailurePercentageEjection: &FailurePercentageEjection{
  1893  			Threshold:             50,
  1894  			EnforcementPercentage: 100,
  1895  			MinimumHosts:          0,
  1896  			RequestVolume:         2,
  1897  		},
  1898  		MaxEjectionPercent: 100,
  1899  		ChildPolicy: &iserviceconfig.BalancerConfig{
  1900  			Name:   weightedroundrobin.Name,
  1901  			Config: wrrCfg,
  1902  		},
  1903  	}
  1904  
  1905  	lbChan := make(chan *outlierDetectionBalancer, 1)
  1906  	bf := stub.BalancerFuncs{
  1907  		Init: func(bd *stub.BalancerData) {
  1908  			bd.Data = balancer.Get(Name).Build(bd.ClientConn, bd.BuildOptions)
  1909  			lbChan <- bd.Data.(*outlierDetectionBalancer)
  1910  		},
  1911  		Close: func(bd *stub.BalancerData) {
  1912  			bd.Data.(balancer.Balancer).Close()
  1913  		},
  1914  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
  1915  			ccs.BalancerConfig = odCfg
  1916  			return bd.Data.(balancer.Balancer).UpdateClientConnState(ccs)
  1917  		},
  1918  	}
  1919  
  1920  	stub.Register(t.Name(), bf)
  1921  	r := manual.NewBuilderWithScheme("whatever")
  1922  	endpoints := []resolver.Endpoint{
  1923  		{
  1924  			Addresses: []resolver.Address{
  1925  				{Addr: unhealthyBackend.Address},
  1926  				{Addr: healthyBackends[0].Address},
  1927  			},
  1928  		},
  1929  		{
  1930  			Addresses: []resolver.Address{
  1931  				{Addr: healthyBackends[1].Address},
  1932  				{Addr: healthyBackends[2].Address},
  1933  			},
  1934  		},
  1935  	}
  1936  
  1937  	r.InitialState(resolver.State{
  1938  		Endpoints: endpoints,
  1939  	})
  1940  	dialer := testutils.NewBlockingDialer()
  1941  	opts := []grpc.DialOption{
  1942  		grpc.WithTransportCredentials(insecure.NewCredentials()),
  1943  		grpc.WithDefaultServiceConfig(fmt.Sprintf(`{ "loadBalancingConfig": [{%q: {}}] }`, t.Name())),
  1944  		grpc.WithResolvers(r),
  1945  		grpc.WithContextDialer(dialer.DialContext),
  1946  	}
  1947  	cc, err := grpc.NewClient(r.Scheme()+":///", opts...)
  1948  	if err != nil {
  1949  		t.Fatalf("grpc.NewClient() failed: %v", err)
  1950  	}
  1951  	defer cc.Close()
  1952  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
  1953  	defer cancel()
  1954  	client := testgrpc.NewTestServiceClient(cc)
  1955  	client.EmptyCall(ctx, &testpb.Empty{})
  1956  	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
  1957  
  1958  	// Wait until both endpoints start receiving requests.
  1959  	addrsSeen := map[string]bool{}
  1960  	for ; ctx.Err() == nil && len(addrsSeen) < 2; <-time.After(time.Millisecond) {
  1961  		var peer peer.Peer
  1962  		client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
  1963  		addrsSeen[peer.String()] = true
  1964  	}
  1965  
  1966  	if len(addrsSeen) < 2 {
  1967  		t.Fatal("Context timed out waiting for requests to reach both endpoints.")
  1968  	}
  1969  
  1970  	// Make 2 requests to each endpoint and verify the first endpoint gets
  1971  	// ejected.
  1972  	for i := 0; i < 2*len(endpoints); i++ {
  1973  		client.EmptyCall(ctx, &testpb.Empty{})
  1974  	}
  1975  	var od *outlierDetectionBalancer
  1976  	select {
  1977  	case <-ctx.Done():
  1978  		t.Fatal("Timed out waiting for the outlier detection LB policy to be built.")
  1979  	case od = <-lbChan:
  1980  	}
  1981  	od.intervalTimerAlgorithm()
  1982  
  1983  	// The first endpoint should be ejected, requests should only go to
  1984  	// endpoints[1].
  1985  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[1].Addresses[0]}); err != nil {
  1986  		t.Fatalf("RPCs didn't go to the second endpoint: %v", err)
  1987  	}
  1988  
  1989  	// Remove the first address from the first endpoint. This makes the first
  1990  	// endpoint a new endpoint for outlier detection, resetting its ejection
  1991  	// status.
  1992  	r.UpdateState(resolver.State{Endpoints: []resolver.Endpoint{
  1993  		{Addresses: []resolver.Address{endpoints[0].Addresses[1]}},
  1994  		endpoints[1],
  1995  	}})
  1996  	od.intervalTimerAlgorithm()
  1997  	if err := roundrobin.CheckRoundRobinRPCs(ctx, client, []resolver.Address{endpoints[0].Addresses[1], endpoints[1].Addresses[0]}); err != nil {
  1998  		t.Fatalf("RPCs didn't go to the remaining address of the first endpoint and the first address of the second endpoint: %v", err)
  1999  	}
  2000  }
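
// Why the resolver update above resets ejection: outlier detection keys its
// per-endpoint state by the endpoint's set of addresses, so an endpoint whose
// address set changes looks like a brand-new endpoint with fresh, unejected
// state. Below is a minimal sketch of such keying, assuming a simple
// comma-joined key; the real balancer's endpoint map and key format are
// internal details.
func endpointKey(ep resolver.Endpoint) string {
	addrs := make([]string, 0, len(ep.Addresses))
	for _, a := range ep.Addresses {
		addrs = append(addrs, a.Addr)
	}
	// A production key would need to be order-insensitive; this sketch
	// assumes the resolver reports addresses in a stable order.
	return strings.Join(addrs, ",")
}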