google.golang.org/grpc@v1.72.2/balancer/weightedroundrobin/balancer_test.go (about)

     1  /*
     2   *
     3   * Copyright 2023 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package weightedroundrobin_test
    20  
    21  import (
    22  	"context"
    23  	"encoding/json"
    24  	"fmt"
    25  	"sync"
    26  	"sync/atomic"
    27  	"testing"
    28  	"time"
    29  
    30  	"google.golang.org/grpc"
    31  	"google.golang.org/grpc/internal"
    32  	"google.golang.org/grpc/internal/grpctest"
    33  	"google.golang.org/grpc/internal/stubserver"
    34  	"google.golang.org/grpc/internal/testutils/roundrobin"
    35  	"google.golang.org/grpc/internal/testutils/stats"
    36  	"google.golang.org/grpc/orca"
    37  	"google.golang.org/grpc/peer"
    38  	"google.golang.org/grpc/resolver"
    39  
    40  	wrr "google.golang.org/grpc/balancer/weightedroundrobin"
    41  	iwrr "google.golang.org/grpc/balancer/weightedroundrobin/internal"
    42  
    43  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    44  	testpb "google.golang.org/grpc/interop/grpc_testing"
    45  )
    46  
// s is the test fixture type for this file. It embeds grpctest.Tester and is
// the receiver of every (s) TestXxx method declared below.
type s struct {
	grpctest.Tester
}
    50  
// Test is the single `go test` entry point of this file. It hands an s value
// to grpctest.RunSubTests, which is expected to run each (s) TestXxx method as
// a subtest.
func Test(t *testing.T) {
	grpctest.RunSubTests(t, s{})
}
    54  
    55  const defaultTestTimeout = 10 * time.Second
    56  const weightUpdatePeriod = 50 * time.Millisecond
    57  const weightExpirationPeriod = time.Minute
    58  const oobReportingInterval = 10 * time.Millisecond
    59  
// init flips iwrr.AllowAnyWeightUpdatePeriod on for the whole test binary.
// NOTE(review): presumably this lets the balancer accept the very short weight
// update periods (e.g. ".050s") used by the configs in this file — confirm
// against the iwrr package.
func init() {
	iwrr.AllowAnyWeightUpdatePeriod = true
}
    63  
    64  func boolp(b bool) *bool          { return &b }
    65  func float64p(f float64) *float64 { return &f }
    66  func stringp(s string) *string    { return &s }
    67  
    68  var (
    69  	perCallConfig = iwrr.LBConfig{
    70  		EnableOOBLoadReport:     boolp(false),
    71  		OOBReportingPeriod:      stringp("0.005s"),
    72  		BlackoutPeriod:          stringp("0s"),
    73  		WeightExpirationPeriod:  stringp("60s"),
    74  		WeightUpdatePeriod:      stringp(".050s"),
    75  		ErrorUtilizationPenalty: float64p(0),
    76  	}
    77  	oobConfig = iwrr.LBConfig{
    78  		EnableOOBLoadReport:     boolp(true),
    79  		OOBReportingPeriod:      stringp("0.005s"),
    80  		BlackoutPeriod:          stringp("0s"),
    81  		WeightExpirationPeriod:  stringp("60s"),
    82  		WeightUpdatePeriod:      stringp(".050s"),
    83  		ErrorUtilizationPenalty: float64p(0),
    84  	}
    85  	testMetricsConfig = iwrr.LBConfig{
    86  		EnableOOBLoadReport:     boolp(false),
    87  		OOBReportingPeriod:      stringp("0.005s"),
    88  		BlackoutPeriod:          stringp("0s"),
    89  		WeightExpirationPeriod:  stringp("60s"),
    90  		WeightUpdatePeriod:      stringp("30s"),
    91  		ErrorUtilizationPenalty: float64p(0),
    92  	}
    93  )
    94  
// testServer bundles a stub test server with the two metrics recorders that
// tests use to control the load it reports. Values are created by
// startServer.
type testServer struct {
	*stubserver.StubServer

	oobMetrics  orca.ServerMetricsRecorder // Attached to the OOB stream.
	callMetrics orca.CallMetricsRecorder   // Attached to per-call metrics.
}
   101  
// reportType selects which load reporting mechanisms startServer enables on a
// test server.
type reportType int

const (
	reportNone reportType = iota // Neither per-call nor OOB reporting.
	reportOOB                    // Only the OOB ORCA service is registered.
	reportCall                   // Only per-call metrics are enabled.
	reportBoth                   // Both per-call metrics and the OOB service.
)
   110  
   111  func startServer(t *testing.T, r reportType) *testServer {
   112  	t.Helper()
   113  
   114  	smr := orca.NewServerMetricsRecorder()
   115  	cmr := orca.NewServerMetricsRecorder().(orca.CallMetricsRecorder)
   116  
   117  	ss := &stubserver.StubServer{
   118  		EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) {
   119  			if r := orca.CallMetricsRecorderFromContext(ctx); r != nil {
   120  				// Copy metrics from what the test set in cmr into r.
   121  				sm := cmr.(orca.ServerMetricsProvider).ServerMetrics()
   122  				r.SetApplicationUtilization(sm.AppUtilization)
   123  				r.SetQPS(sm.QPS)
   124  				r.SetEPS(sm.EPS)
   125  			}
   126  			return &testpb.Empty{}, nil
   127  		},
   128  	}
   129  
   130  	var sopts []grpc.ServerOption
   131  	if r == reportCall || r == reportBoth {
   132  		sopts = append(sopts, orca.CallMetricsServerOption(nil))
   133  	}
   134  
   135  	if r == reportOOB || r == reportBoth {
   136  		oso := orca.ServiceOptions{
   137  			ServerMetricsProvider: smr,
   138  			MinReportingInterval:  10 * time.Millisecond,
   139  		}
   140  		internal.ORCAAllowAnyMinReportingInterval.(func(so *orca.ServiceOptions))(&oso)
   141  		sopts = append(sopts, stubserver.RegisterServiceServerOption(func(s grpc.ServiceRegistrar) {
   142  			if err := orca.Register(s, oso); err != nil {
   143  				t.Fatalf("Failed to register orca service: %v", err)
   144  			}
   145  		}))
   146  	}
   147  
   148  	if err := ss.StartServer(sopts...); err != nil {
   149  		t.Fatalf("Error starting server: %v", err)
   150  	}
   151  	t.Cleanup(ss.Stop)
   152  
   153  	return &testServer{
   154  		StubServer:  ss,
   155  		oobMetrics:  smr,
   156  		callMetrics: cmr,
   157  	}
   158  }
   159  
   160  func svcConfig(t *testing.T, wrrCfg iwrr.LBConfig) string {
   161  	t.Helper()
   162  	m, err := json.Marshal(wrrCfg)
   163  	if err != nil {
   164  		t.Fatalf("Error marshaling JSON %v: %v", wrrCfg, err)
   165  	}
   166  	sc := fmt.Sprintf(`{"loadBalancingConfig": [ {%q:%v} ] }`, wrr.Name, string(m))
   167  	t.Logf("Marshaled service config: %v", sc)
   168  	return sc
   169  }
   170  
   171  // Tests basic functionality with one address.  With only one address, load
   172  // reporting doesn't affect routing at all.
   173  func (s) TestBalancer_OneAddress(t *testing.T) {
   174  	testCases := []struct {
   175  		rt  reportType
   176  		cfg iwrr.LBConfig
   177  	}{
   178  		{rt: reportNone, cfg: perCallConfig},
   179  		{rt: reportCall, cfg: perCallConfig},
   180  		{rt: reportOOB, cfg: oobConfig},
   181  	}
   182  
   183  	for _, tc := range testCases {
   184  		t.Run(fmt.Sprintf("reportType:%v", tc.rt), func(t *testing.T) {
   185  			ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   186  			defer cancel()
   187  
   188  			srv := startServer(t, tc.rt)
   189  
   190  			sc := svcConfig(t, tc.cfg)
   191  			if err := srv.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   192  				t.Fatalf("Error starting client: %v", err)
   193  			}
   194  
   195  			// Perform many RPCs to ensure the LB policy works with 1 address.
   196  			for i := 0; i < 100; i++ {
   197  				srv.callMetrics.SetQPS(float64(i))
   198  				srv.oobMetrics.SetQPS(float64(i))
   199  				if _, err := srv.Client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   200  					t.Fatalf("Error from EmptyCall: %v", err)
   201  				}
   202  				time.Sleep(time.Millisecond) // Delay; test will run 100ms and should perform ~10 weight updates
   203  			}
   204  		})
   205  	}
   206  }
   207  
   208  // TestWRRMetricsBasic tests metrics emitted from the WRR balancer. It
   209  // configures a weighted round robin balancer as the top level balancer of a
   210  // ClientConn, and configures a fake stats handler on the ClientConn to receive
   211  // metrics. It verifies stats emitted from the Weighted Round Robin Balancer on
   212  // balancer startup case which triggers the first picker and scheduler update
   213  // before any load reports are received.
   214  //
   215  // Note that this test and others, metrics emission assertions are a snapshot
   216  // of the most recently emitted metrics. This is due to the nondeterminism of
   217  // scheduler updates with respect to test bodies, so the assertions made are
   218  // from the most recently synced state of the system (picker/scheduler) from the
   219  // test body.
   220  func (s) TestWRRMetricsBasic(t *testing.T) {
   221  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   222  	defer cancel()
   223  
   224  	srv := startServer(t, reportCall)
   225  	sc := svcConfig(t, testMetricsConfig)
   226  
   227  	tmr := stats.NewTestMetricsRecorder()
   228  	if err := srv.StartClient(grpc.WithDefaultServiceConfig(sc), grpc.WithStatsHandler(tmr)); err != nil {
   229  		t.Fatalf("Error starting client: %v", err)
   230  	}
   231  	srv.callMetrics.SetQPS(float64(1))
   232  
   233  	if _, err := srv.Client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   234  		t.Fatalf("Error from EmptyCall: %v", err)
   235  	}
   236  
   237  	if got, _ := tmr.Metric("grpc.lb.wrr.rr_fallback"); got != 1 {
   238  		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", "grpc.lb.wrr.rr_fallback", got, 1)
   239  	}
   240  	if got, _ := tmr.Metric("grpc.lb.wrr.endpoint_weight_stale"); got != 0 {
   241  		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", "grpc.lb.wrr.endpoint_weight_stale", got, 0)
   242  	}
   243  	if got, _ := tmr.Metric("grpc.lb.wrr.endpoint_weight_not_yet_usable"); got != 1 {
   244  		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", "grpc.lb.wrr.endpoint_weight_not_yet_usable", got, 1)
   245  	}
   246  	// Unusable, so no endpoint weight. Due to only one SubConn, this will never
   247  	// update the weight. Thus, this will stay 0.
   248  	if got, _ := tmr.Metric("grpc.lb.wrr.endpoint_weight_stale"); got != 0 {
   249  		t.Fatalf("Unexpected data for metric %v, got: %v, want: %v", "grpc.lb.wrr.endpoint_weight_stale", got, 0)
   250  	}
   251  }
   252  
   253  // Tests two addresses with ORCA reporting disabled (should fall back to pure
   254  // RR).
   255  func (s) TestBalancer_TwoAddresses_ReportingDisabled(t *testing.T) {
   256  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   257  	defer cancel()
   258  
   259  	srv1 := startServer(t, reportNone)
   260  	srv2 := startServer(t, reportNone)
   261  
   262  	sc := svcConfig(t, perCallConfig)
   263  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   264  		t.Fatalf("Error starting client: %v", err)
   265  	}
   266  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   267  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   268  
   269  	// Perform many RPCs to ensure the LB policy works with 2 addresses.
   270  	for i := 0; i < 20; i++ {
   271  		roundrobin.CheckRoundRobinRPCs(ctx, srv1.Client, addrs)
   272  	}
   273  }
   274  
   275  // Tests two addresses with per-call ORCA reporting enabled.  Checks the
   276  // backends are called in the appropriate ratios.
   277  func (s) TestBalancer_TwoAddresses_ReportingEnabledPerCall(t *testing.T) {
   278  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   279  	defer cancel()
   280  
   281  	srv1 := startServer(t, reportCall)
   282  	srv2 := startServer(t, reportCall)
   283  
   284  	// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   285  	// disproportionately to srv2 (10:1).
   286  	srv1.callMetrics.SetQPS(10.0)
   287  	srv1.callMetrics.SetApplicationUtilization(1.0)
   288  
   289  	srv2.callMetrics.SetQPS(10.0)
   290  	srv2.callMetrics.SetApplicationUtilization(.1)
   291  
   292  	sc := svcConfig(t, perCallConfig)
   293  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   294  		t.Fatalf("Error starting client: %v", err)
   295  	}
   296  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   297  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   298  
   299  	// Call each backend once to ensure the weights have been received.
   300  	ensureReached(ctx, t, srv1.Client, 2)
   301  
   302  	// Wait for the weight update period to allow the new weights to be processed.
   303  	time.Sleep(weightUpdatePeriod)
   304  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   305  }
   306  
   307  // Tests two addresses with OOB ORCA reporting enabled.  Checks the backends
   308  // are called in the appropriate ratios.
   309  func (s) TestBalancer_TwoAddresses_ReportingEnabledOOB(t *testing.T) {
   310  	testCases := []struct {
   311  		name       string
   312  		utilSetter func(orca.ServerMetricsRecorder, float64)
   313  	}{{
   314  		name: "application_utilization",
   315  		utilSetter: func(smr orca.ServerMetricsRecorder, val float64) {
   316  			smr.SetApplicationUtilization(val)
   317  		},
   318  	}, {
   319  		name: "cpu_utilization",
   320  		utilSetter: func(smr orca.ServerMetricsRecorder, val float64) {
   321  			smr.SetCPUUtilization(val)
   322  		},
   323  	}, {
   324  		name: "application over cpu",
   325  		utilSetter: func(smr orca.ServerMetricsRecorder, val float64) {
   326  			smr.SetApplicationUtilization(val)
   327  			smr.SetCPUUtilization(2.0) // ignored because ApplicationUtilization is set
   328  		},
   329  	}}
   330  
   331  	for _, tc := range testCases {
   332  		t.Run(tc.name, func(t *testing.T) {
   333  			ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   334  			defer cancel()
   335  
   336  			srv1 := startServer(t, reportOOB)
   337  			srv2 := startServer(t, reportOOB)
   338  
   339  			// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   340  			// disproportionately to srv2 (10:1).
   341  			srv1.oobMetrics.SetQPS(10.0)
   342  			tc.utilSetter(srv1.oobMetrics, 1.0)
   343  
   344  			srv2.oobMetrics.SetQPS(10.0)
   345  			tc.utilSetter(srv2.oobMetrics, 0.1)
   346  
   347  			sc := svcConfig(t, oobConfig)
   348  			if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   349  				t.Fatalf("Error starting client: %v", err)
   350  			}
   351  			addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   352  			srv1.R.UpdateState(resolver.State{Addresses: addrs})
   353  
   354  			// Call each backend once to ensure the weights have been received.
   355  			ensureReached(ctx, t, srv1.Client, 2)
   356  
   357  			// Wait for the weight update period to allow the new weights to be processed.
   358  			time.Sleep(weightUpdatePeriod)
   359  			checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   360  		})
   361  	}
   362  }
   363  
   364  // Tests two addresses with OOB ORCA reporting enabled, where the reports
   365  // change over time.  Checks the backends are called in the appropriate ratios
   366  // before and after modifying the reports.
   367  func (s) TestBalancer_TwoAddresses_UpdateLoads(t *testing.T) {
   368  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   369  	defer cancel()
   370  
   371  	srv1 := startServer(t, reportOOB)
   372  	srv2 := startServer(t, reportOOB)
   373  
   374  	// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   375  	// disproportionately to srv2 (10:1).
   376  	srv1.oobMetrics.SetQPS(10.0)
   377  	srv1.oobMetrics.SetApplicationUtilization(1.0)
   378  
   379  	srv2.oobMetrics.SetQPS(10.0)
   380  	srv2.oobMetrics.SetApplicationUtilization(.1)
   381  
   382  	sc := svcConfig(t, oobConfig)
   383  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   384  		t.Fatalf("Error starting client: %v", err)
   385  	}
   386  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   387  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   388  
   389  	// Call each backend once to ensure the weights have been received.
   390  	ensureReached(ctx, t, srv1.Client, 2)
   391  
   392  	// Wait for the weight update period to allow the new weights to be processed.
   393  	time.Sleep(weightUpdatePeriod)
   394  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   395  
   396  	// Update the loads so srv2 is loaded and srv1 is not; ensure RPCs are
   397  	// routed disproportionately to srv1.
   398  	srv1.oobMetrics.SetQPS(10.0)
   399  	srv1.oobMetrics.SetApplicationUtilization(.1)
   400  
   401  	srv2.oobMetrics.SetQPS(10.0)
   402  	srv2.oobMetrics.SetApplicationUtilization(1.0)
   403  
   404  	// Wait for the weight update period to allow the new weights to be processed.
   405  	time.Sleep(weightUpdatePeriod + oobReportingInterval)
   406  	checkWeights(ctx, t, srvWeight{srv1, 10}, srvWeight{srv2, 1})
   407  }
   408  
   409  // Tests two addresses with OOB ORCA reporting enabled, then with switching to
   410  // per-call reporting.  Checks the backends are called in the appropriate
   411  // ratios before and after the change.
   412  func (s) TestBalancer_TwoAddresses_OOBThenPerCall(t *testing.T) {
   413  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   414  	defer cancel()
   415  
   416  	srv1 := startServer(t, reportBoth)
   417  	srv2 := startServer(t, reportBoth)
   418  
   419  	// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   420  	// disproportionately to srv2 (10:1).
   421  	srv1.oobMetrics.SetQPS(10.0)
   422  	srv1.oobMetrics.SetApplicationUtilization(1.0)
   423  
   424  	srv2.oobMetrics.SetQPS(10.0)
   425  	srv2.oobMetrics.SetApplicationUtilization(.1)
   426  
   427  	// For per-call metrics (not used initially), srv2 reports that it is
   428  	// loaded and srv1 reports low load.  After confirming OOB works, switch to
   429  	// per-call and confirm the new routing weights are applied.
   430  	srv1.callMetrics.SetQPS(10.0)
   431  	srv1.callMetrics.SetApplicationUtilization(.1)
   432  
   433  	srv2.callMetrics.SetQPS(10.0)
   434  	srv2.callMetrics.SetApplicationUtilization(1.0)
   435  
   436  	sc := svcConfig(t, oobConfig)
   437  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   438  		t.Fatalf("Error starting client: %v", err)
   439  	}
   440  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   441  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   442  
   443  	// Call each backend once to ensure the weights have been received.
   444  	ensureReached(ctx, t, srv1.Client, 2)
   445  
   446  	// Wait for the weight update period to allow the new weights to be processed.
   447  	time.Sleep(weightUpdatePeriod)
   448  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   449  
   450  	// Update to per-call weights.
   451  	c := svcConfig(t, perCallConfig)
   452  	parsedCfg := srv1.R.CC().ParseServiceConfig(c)
   453  	if parsedCfg.Err != nil {
   454  		panic(fmt.Sprintf("Error parsing config %q: %v", c, parsedCfg.Err))
   455  	}
   456  	srv1.R.UpdateState(resolver.State{Addresses: addrs, ServiceConfig: parsedCfg})
   457  
   458  	// Wait for the weight update period to allow the new weights to be processed.
   459  	time.Sleep(weightUpdatePeriod)
   460  	checkWeights(ctx, t, srvWeight{srv1, 10}, srvWeight{srv2, 1})
   461  }
   462  
   463  // TestEndpoints_SharedAddress tests the case where two endpoints have the same
   464  // address. The expected behavior is undefined, however the program should not
   465  // crash.
   466  func (s) TestEndpoints_SharedAddress(t *testing.T) {
   467  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   468  	defer cancel()
   469  
   470  	srv := startServer(t, reportCall)
   471  	sc := svcConfig(t, perCallConfig)
   472  	if err := srv.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   473  		t.Fatalf("Error starting client: %v", err)
   474  	}
   475  
   476  	endpointsSharedAddress := []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: srv.Address}}}, {Addresses: []resolver.Address{{Addr: srv.Address}}}}
   477  	srv.R.UpdateState(resolver.State{Endpoints: endpointsSharedAddress})
   478  
   479  	// Make some RPC's and make sure doesn't crash. It should go to one of the
   480  	// endpoints addresses, it's undefined which one it will choose and the load
   481  	// reporting might not work, but it should be able to make an RPC.
   482  	for i := 0; i < 10; i++ {
   483  		if _, err := srv.Client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   484  			t.Fatalf("EmptyCall failed with err: %v", err)
   485  		}
   486  	}
   487  }
   488  
   489  // TestEndpoints_MultipleAddresses tests WRR on endpoints with numerous
   490  // addresses. It configures WRR with two endpoints with one bad address followed
   491  // by a good address. It configures two backends that each report per call
   492  // metrics, each corresponding to the two endpoints good address. It then
   493  // asserts load is distributed as expected corresponding to the call metrics
   494  // received.
   495  func (s) TestEndpoints_MultipleAddresses(t *testing.T) {
   496  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   497  	defer cancel()
   498  	srv1 := startServer(t, reportCall)
   499  	srv2 := startServer(t, reportCall)
   500  
   501  	srv1.callMetrics.SetQPS(10.0)
   502  	srv1.callMetrics.SetApplicationUtilization(.1)
   503  
   504  	srv2.callMetrics.SetQPS(10.0)
   505  	srv2.callMetrics.SetApplicationUtilization(1.0)
   506  
   507  	sc := svcConfig(t, perCallConfig)
   508  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   509  		t.Fatalf("Error starting client: %v", err)
   510  	}
   511  
   512  	twoEndpoints := []resolver.Endpoint{{Addresses: []resolver.Address{{Addr: "bad-address-1"}, {Addr: srv1.Address}}}, {Addresses: []resolver.Address{{Addr: "bad-address-2"}, {Addr: srv2.Address}}}}
   513  	srv1.R.UpdateState(resolver.State{Endpoints: twoEndpoints})
   514  
   515  	// Call each backend once to ensure the weights have been received.
   516  	ensureReached(ctx, t, srv1.Client, 2)
   517  	// Wait for the weight update period to allow the new weights to be processed.
   518  	time.Sleep(weightUpdatePeriod)
   519  	checkWeights(ctx, t, srvWeight{srv1, 10}, srvWeight{srv2, 1})
   520  }
   521  
   522  // Tests two addresses with OOB ORCA reporting enabled and a non-zero error
   523  // penalty applied.
   524  func (s) TestBalancer_TwoAddresses_ErrorPenalty(t *testing.T) {
   525  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   526  	defer cancel()
   527  
   528  	srv1 := startServer(t, reportOOB)
   529  	srv2 := startServer(t, reportOOB)
   530  
   531  	// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   532  	// disproportionately to srv2 (10:1).  EPS values are set (but ignored
   533  	// initially due to ErrorUtilizationPenalty=0).  Later EUP will be updated
   534  	// to 0.9 which will cause the weights to be equal and RPCs to be routed
   535  	// 50/50.
   536  	srv1.oobMetrics.SetQPS(10.0)
   537  	srv1.oobMetrics.SetApplicationUtilization(1.0)
   538  	srv1.oobMetrics.SetEPS(0)
   539  	// srv1 weight before: 10.0 / 1.0 = 10.0
   540  	// srv1 weight after:  10.0 / 1.0 = 10.0
   541  
   542  	srv2.oobMetrics.SetQPS(10.0)
   543  	srv2.oobMetrics.SetApplicationUtilization(.1)
   544  	srv2.oobMetrics.SetEPS(10.0)
   545  	// srv2 weight before: 10.0 / 0.1 = 100.0
   546  	// srv2 weight after:  10.0 / 1.0 = 10.0
   547  
   548  	sc := svcConfig(t, oobConfig)
   549  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   550  		t.Fatalf("Error starting client: %v", err)
   551  	}
   552  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   553  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   554  
   555  	// Call each backend once to ensure the weights have been received.
   556  	ensureReached(ctx, t, srv1.Client, 2)
   557  
   558  	// Wait for the weight update period to allow the new weights to be processed.
   559  	time.Sleep(weightUpdatePeriod)
   560  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   561  
   562  	// Update to include an error penalty in the weights.
   563  	newCfg := oobConfig
   564  	newCfg.ErrorUtilizationPenalty = float64p(0.9)
   565  	c := svcConfig(t, newCfg)
   566  	parsedCfg := srv1.R.CC().ParseServiceConfig(c)
   567  	if parsedCfg.Err != nil {
   568  		panic(fmt.Sprintf("Error parsing config %q: %v", c, parsedCfg.Err))
   569  	}
   570  	srv1.R.UpdateState(resolver.State{Addresses: addrs, ServiceConfig: parsedCfg})
   571  
   572  	// Wait for the weight update period to allow the new weights to be processed.
   573  	time.Sleep(weightUpdatePeriod + oobReportingInterval)
   574  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 1})
   575  }
   576  
   577  // Tests that the blackout period causes backends to use 0 as their weight
   578  // (meaning to use the average weight) until the blackout period elapses.
   579  func (s) TestBalancer_TwoAddresses_BlackoutPeriod(t *testing.T) {
   580  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   581  	defer cancel()
   582  
   583  	var mu sync.Mutex
   584  	start := time.Now()
   585  	now := start
   586  	setNow := func(t time.Time) {
   587  		mu.Lock()
   588  		defer mu.Unlock()
   589  		now = t
   590  	}
   591  
   592  	setTimeNow(func() time.Time {
   593  		mu.Lock()
   594  		defer mu.Unlock()
   595  		return now
   596  	})
   597  	t.Cleanup(func() { setTimeNow(time.Now) })
   598  
   599  	testCases := []struct {
   600  		blackoutPeriodCfg *string
   601  		blackoutPeriod    time.Duration
   602  	}{{
   603  		blackoutPeriodCfg: stringp("1s"),
   604  		blackoutPeriod:    time.Second,
   605  	}, {
   606  		blackoutPeriodCfg: nil,
   607  		blackoutPeriod:    10 * time.Second, // the default
   608  	}}
   609  	for _, tc := range testCases {
   610  		setNow(start)
   611  		srv1 := startServer(t, reportOOB)
   612  		srv2 := startServer(t, reportOOB)
   613  
   614  		// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   615  		// disproportionately to srv2 (10:1).
   616  		srv1.oobMetrics.SetQPS(10.0)
   617  		srv1.oobMetrics.SetApplicationUtilization(1.0)
   618  
   619  		srv2.oobMetrics.SetQPS(10.0)
   620  		srv2.oobMetrics.SetApplicationUtilization(.1)
   621  
   622  		cfg := oobConfig
   623  		cfg.BlackoutPeriod = tc.blackoutPeriodCfg
   624  		sc := svcConfig(t, cfg)
   625  		if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   626  			t.Fatalf("Error starting client: %v", err)
   627  		}
   628  		addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   629  		srv1.R.UpdateState(resolver.State{Addresses: addrs})
   630  
   631  		// Call each backend once to ensure the weights have been received.
   632  		ensureReached(ctx, t, srv1.Client, 2)
   633  
   634  		// Wait for the weight update period to allow the new weights to be processed.
   635  		time.Sleep(weightUpdatePeriod)
   636  		// During the blackout period (1s) we should route roughly 50/50.
   637  		checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 1})
   638  
   639  		// Advance time to right before the blackout period ends and the weights
   640  		// should still be zero.
   641  		setNow(start.Add(tc.blackoutPeriod - time.Nanosecond))
   642  		// Wait for the weight update period to allow the new weights to be processed.
   643  		time.Sleep(weightUpdatePeriod)
   644  		checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 1})
   645  
   646  		// Advance time to right after the blackout period ends and the weights
   647  		// should now activate.
   648  		setNow(start.Add(tc.blackoutPeriod))
   649  		// Wait for the weight update period to allow the new weights to be processed.
   650  		time.Sleep(weightUpdatePeriod)
   651  		checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   652  	}
   653  }
   654  
   655  // Tests that the weight expiration period causes backends to use 0 as their
   656  // weight (meaning to use the average weight) once the expiration period
   657  // elapses.
   658  func (s) TestBalancer_TwoAddresses_WeightExpiration(t *testing.T) {
   659  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   660  	defer cancel()
   661  
   662  	var mu sync.Mutex
   663  	start := time.Now()
   664  	now := start
   665  	setNow := func(t time.Time) {
   666  		mu.Lock()
   667  		defer mu.Unlock()
   668  		now = t
   669  	}
   670  	setTimeNow(func() time.Time {
   671  		mu.Lock()
   672  		defer mu.Unlock()
   673  		return now
   674  	})
   675  	t.Cleanup(func() { setTimeNow(time.Now) })
   676  
   677  	srv1 := startServer(t, reportBoth)
   678  	srv2 := startServer(t, reportBoth)
   679  
   680  	// srv1 starts loaded and srv2 starts without load; ensure RPCs are routed
   681  	// disproportionately to srv2 (10:1).  Because the OOB reporting interval
   682  	// is 1 minute but the weights expire in 1 second, routing will go to 50/50
   683  	// after the weights expire.
   684  	srv1.oobMetrics.SetQPS(10.0)
   685  	srv1.oobMetrics.SetApplicationUtilization(1.0)
   686  
   687  	srv2.oobMetrics.SetQPS(10.0)
   688  	srv2.oobMetrics.SetApplicationUtilization(.1)
   689  
   690  	cfg := oobConfig
   691  	cfg.OOBReportingPeriod = stringp("60s")
   692  	sc := svcConfig(t, cfg)
   693  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   694  		t.Fatalf("Error starting client: %v", err)
   695  	}
   696  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}}
   697  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   698  
   699  	// Call each backend once to ensure the weights have been received.
   700  	ensureReached(ctx, t, srv1.Client, 2)
   701  
   702  	// Wait for the weight update period to allow the new weights to be processed.
   703  	time.Sleep(weightUpdatePeriod)
   704  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   705  
   706  	// Advance what time.Now returns to the weight expiration time minus 1s to
   707  	// ensure all weights are still honored.
   708  	setNow(start.Add(weightExpirationPeriod - time.Second))
   709  
   710  	// Wait for the weight update period to allow the new weights to be processed.
   711  	time.Sleep(weightUpdatePeriod)
   712  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10})
   713  
   714  	// Advance what time.Now returns to the weight expiration time plus 1s to
   715  	// ensure all weights expired and addresses are routed evenly.
   716  	setNow(start.Add(weightExpirationPeriod + time.Second))
   717  
   718  	// Wait for the weight expiration period so the weights have expired.
   719  	time.Sleep(weightUpdatePeriod)
   720  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 1})
   721  }
   722  
   723  // Tests logic surrounding subchannel management.
   724  func (s) TestBalancer_AddressesChanging(t *testing.T) {
   725  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   726  	defer cancel()
   727  
   728  	srv1 := startServer(t, reportBoth)
   729  	srv2 := startServer(t, reportBoth)
   730  	srv3 := startServer(t, reportBoth)
   731  	srv4 := startServer(t, reportBoth)
   732  
   733  	// srv1: weight 10
   734  	srv1.oobMetrics.SetQPS(10.0)
   735  	srv1.oobMetrics.SetApplicationUtilization(1.0)
   736  	// srv2: weight 100
   737  	srv2.oobMetrics.SetQPS(10.0)
   738  	srv2.oobMetrics.SetApplicationUtilization(.1)
   739  	// srv3: weight 20
   740  	srv3.oobMetrics.SetQPS(20.0)
   741  	srv3.oobMetrics.SetApplicationUtilization(1.0)
   742  	// srv4: weight 200
   743  	srv4.oobMetrics.SetQPS(20.0)
   744  	srv4.oobMetrics.SetApplicationUtilization(.1)
   745  
   746  	sc := svcConfig(t, oobConfig)
   747  	if err := srv1.StartClient(grpc.WithDefaultServiceConfig(sc)); err != nil {
   748  		t.Fatalf("Error starting client: %v", err)
   749  	}
   750  	srv2.Client = srv1.Client
   751  	addrs := []resolver.Address{{Addr: srv1.Address}, {Addr: srv2.Address}, {Addr: srv3.Address}}
   752  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   753  
   754  	// Call each backend once to ensure the weights have been received.
   755  	ensureReached(ctx, t, srv1.Client, 3)
   756  	time.Sleep(weightUpdatePeriod)
   757  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10}, srvWeight{srv3, 2})
   758  
   759  	// Add backend 4
   760  	addrs = append(addrs, resolver.Address{Addr: srv4.Address})
   761  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   762  	time.Sleep(weightUpdatePeriod)
   763  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10}, srvWeight{srv3, 2}, srvWeight{srv4, 20})
   764  
   765  	// Shutdown backend 3.  RPCs will no longer be routed to it.
   766  	srv3.Stop()
   767  	time.Sleep(weightUpdatePeriod)
   768  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv2, 10}, srvWeight{srv4, 20})
   769  
   770  	// Remove addresses 2 and 3.  RPCs will no longer be routed to 2 either.
   771  	addrs = []resolver.Address{{Addr: srv1.Address}, {Addr: srv4.Address}}
   772  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   773  	time.Sleep(weightUpdatePeriod)
   774  	checkWeights(ctx, t, srvWeight{srv1, 1}, srvWeight{srv4, 20})
   775  
   776  	// Re-add 2 and remove the rest.
   777  	addrs = []resolver.Address{{Addr: srv2.Address}}
   778  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   779  	time.Sleep(weightUpdatePeriod)
   780  	checkWeights(ctx, t, srvWeight{srv2, 10})
   781  
   782  	// Re-add 4.
   783  	addrs = append(addrs, resolver.Address{Addr: srv4.Address})
   784  	srv1.R.UpdateState(resolver.State{Addresses: addrs})
   785  	time.Sleep(weightUpdatePeriod)
   786  	checkWeights(ctx, t, srvWeight{srv2, 10}, srvWeight{srv4, 20})
   787  }
   788  
   789  func ensureReached(ctx context.Context, t *testing.T, c testgrpc.TestServiceClient, n int) {
   790  	t.Helper()
   791  	reached := make(map[string]struct{})
   792  	for len(reached) != n {
   793  		var peer peer.Peer
   794  		if _, err := c.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
   795  			t.Fatalf("Error from EmptyCall: %v", err)
   796  		}
   797  		reached[peer.Addr.String()] = struct{}{}
   798  	}
   799  }
   800  
// srvWeight pairs a test backend with the relative weight checkWeights should
// expect for it.
type srvWeight struct {
	// srv is the backend; checkWeights looks up its RPC count by srv.Address.
	srv *testServer
	// w is the backend's weight relative to the other entries passed to
	// checkWeights, which overwrites it with the expected number of RPCs.
	w int
}
   805  
// rrIterations is the number of RPCs checkWeights issues per attempt; expected
// per-backend counts are computed as a share of this total.
const rrIterations = 100
   807  
   808  // checkWeights does rrIterations RPCs and expects the different backends to be
   809  // routed in a ratio as determined by the srvWeights passed in.  Allows for
   810  // some variance (+/- 2 RPCs per backend).
   811  func checkWeights(ctx context.Context, t *testing.T, sws ...srvWeight) {
   812  	t.Helper()
   813  
   814  	c := sws[0].srv.Client
   815  
   816  	// Replace the weights with approximate counts of RPCs wanted given the
   817  	// iterations performed.
   818  	weightSum := 0
   819  	for _, sw := range sws {
   820  		weightSum += sw.w
   821  	}
   822  	for i := range sws {
   823  		sws[i].w = rrIterations * sws[i].w / weightSum
   824  	}
   825  
   826  	for attempts := 0; attempts < 10; attempts++ {
   827  		serverCounts := make(map[string]int)
   828  		for i := 0; i < rrIterations; i++ {
   829  			var peer peer.Peer
   830  			if _, err := c.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
   831  				t.Fatalf("Error from EmptyCall: %v; timed out waiting for weighted RR behavior?", err)
   832  			}
   833  			serverCounts[peer.Addr.String()]++
   834  		}
   835  		if len(serverCounts) != len(sws) {
   836  			continue
   837  		}
   838  		success := true
   839  		for _, sw := range sws {
   840  			c := serverCounts[sw.srv.Address]
   841  			if c < sw.w-2 || c > sw.w+2 {
   842  				success = false
   843  				break
   844  			}
   845  		}
   846  		if success {
   847  			t.Logf("Passed iteration %v; counts: %v", attempts, serverCounts)
   848  			return
   849  		}
   850  		t.Logf("Failed iteration %v; counts: %v; want %+v", attempts, serverCounts, sws)
   851  		time.Sleep(5 * time.Millisecond)
   852  	}
   853  	t.Fatalf("Failed to route RPCs with proper ratio")
   854  }
   855  
// init makes time.Now the initial clock returned by timeNow and points the
// TimeNow hook of the weightedroundrobin internal package at timeNow, so a
// later setTimeNow call changes the time that hook reports.
func init() {
	// Store the default clock first so timeNow never loads an empty value.
	setTimeNow(time.Now)
	iwrr.TimeNow = timeNow
}
   860  
   861  var timeNowFunc atomic.Value // func() time.Time
   862  
   863  func timeNow() time.Time {
   864  	return timeNowFunc.Load().(func() time.Time)()
   865  }
   866  
   867  func setTimeNow(f func() time.Time) {
   868  	timeNowFunc.Store(f)
   869  }