google.golang.org/grpc@v1.72.2/xds/internal/balancer/outlierdetection/e2e_test/outlierdetection_test.go (about)

     1  /*
     2   *
     3   * Copyright 2022 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  // Package e2e_test contains e2e test cases for the Outlier Detection LB Policy.
    20  package e2e_test
    21  
    22  import (
    23  	"context"
    24  	"errors"
    25  	"fmt"
    26  	"testing"
    27  	"time"
    28  
    29  	"github.com/google/go-cmp/cmp"
    30  	"google.golang.org/grpc"
    31  	"google.golang.org/grpc/balancer/weightedroundrobin"
    32  	"google.golang.org/grpc/credentials/insecure"
    33  	"google.golang.org/grpc/internal"
    34  	"google.golang.org/grpc/internal/envconfig"
    35  	"google.golang.org/grpc/internal/grpctest"
    36  	"google.golang.org/grpc/internal/stubserver"
    37  	"google.golang.org/grpc/peer"
    38  	"google.golang.org/grpc/resolver"
    39  	"google.golang.org/grpc/resolver/manual"
    40  	"google.golang.org/grpc/serviceconfig"
    41  
    42  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    43  	testpb "google.golang.org/grpc/interop/grpc_testing"
    44  
    45  	_ "google.golang.org/grpc/xds/internal/balancer/outlierdetection" // To register helper functions which register/unregister Outlier Detection LB Policy.
    46  )
    47  
    48  var (
    49  	defaultTestTimeout = 5 * time.Second
    50  	leafPolicyName     = "round_robin"
    51  )
    52  
    53  func init() {
    54  	// Test the health listener code path for ejection when the experimental
    55  	// pickfirst is enabled.
    56  	if envconfig.NewPickFirstEnabled {
    57  		leafPolicyName = weightedroundrobin.Name
    58  	}
    59  }
    60  
    61  type s struct {
    62  	grpctest.Tester
    63  }
    64  
    65  func Test(t *testing.T) {
    66  	grpctest.RunSubTests(t, s{})
    67  }
    68  
    69  // Setup spins up three test backends, each listening on a port on localhost.
    70  // Two of the backends are configured to always reply with an empty response and
    71  // no error and one is configured to always return an error.
    72  func setupBackends(t *testing.T) ([]string, func()) {
    73  	t.Helper()
    74  
    75  	backends := make([]*stubserver.StubServer, 3)
    76  	addresses := make([]string, 3)
    77  	// Construct and start 2 working backends.
    78  	for i := 0; i < 2; i++ {
    79  		backend := &stubserver.StubServer{
    80  			EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) {
    81  				return &testpb.Empty{}, nil
    82  			},
    83  		}
    84  		if err := backend.StartServer(); err != nil {
    85  			t.Fatalf("Failed to start backend: %v", err)
    86  		}
    87  		t.Logf("Started good TestService backend at: %q", backend.Address)
    88  		backends[i] = backend
    89  		addresses[i] = backend.Address
    90  	}
    91  
    92  	// Construct and start a failing backend.
    93  	backend := &stubserver.StubServer{
    94  		EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) {
    95  			return nil, errors.New("some error")
    96  		},
    97  	}
    98  	if err := backend.StartServer(); err != nil {
    99  		t.Fatalf("Failed to start backend: %v", err)
   100  	}
   101  	t.Logf("Started bad TestService backend at: %q", backend.Address)
   102  	backends[2] = backend
   103  	addresses[2] = backend.Address
   104  	cancel := func() {
   105  		for _, backend := range backends {
   106  			backend.Stop()
   107  		}
   108  	}
   109  	return addresses, cancel
   110  }
   111  
   112  // checkRoundRobinRPCs verifies that EmptyCall RPCs on the given ClientConn,
   113  // connected to a server exposing the test.grpc_testing.TestService, are
   114  // roundrobined across the given backend addresses.
   115  //
   116  // Returns a non-nil error if context deadline expires before RPCs start to get
   117  // roundrobined across the given backends.
   118  func checkRoundRobinRPCs(ctx context.Context, client testgrpc.TestServiceClient, addrs []resolver.Address) error {
   119  	wantAddrCount := make(map[string]int)
   120  	for _, addr := range addrs {
   121  		wantAddrCount[addr.Addr]++
   122  	}
   123  	gotAddrCount := make(map[string]int)
   124  	for ; ctx.Err() == nil; <-time.After(time.Millisecond) {
   125  		gotAddrCount = make(map[string]int)
   126  		// Perform 3 iterations.
   127  		var iterations [][]string
   128  		for i := 0; i < 3; i++ {
   129  			iteration := make([]string, len(addrs))
   130  			for c := 0; c < len(addrs); c++ {
   131  				var peer peer.Peer
   132  				client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
   133  				if peer.Addr != nil {
   134  					iteration[c] = peer.Addr.String()
   135  				}
   136  			}
   137  			iterations = append(iterations, iteration)
   138  		}
   139  		// Ensure the first iteration contains all addresses in addrs.
   140  		for _, addr := range iterations[0] {
   141  			gotAddrCount[addr]++
   142  		}
   143  		if diff := cmp.Diff(gotAddrCount, wantAddrCount); diff != "" {
   144  			continue
   145  		}
   146  		// Ensure all three iterations contain the same addresses.
   147  		if !cmp.Equal(iterations[0], iterations[1]) || !cmp.Equal(iterations[0], iterations[2]) {
   148  			continue
   149  		}
   150  		return nil
   151  	}
   152  	return fmt.Errorf("timeout when waiting for roundrobin distribution of RPCs across addresses: %v; got: %v", addrs, gotAddrCount)
   153  }
   154  
   155  // TestOutlierDetectionAlgorithmsE2E tests the Outlier Detection Success Rate
   156  // and Failure Percentage algorithms in an e2e fashion. The Outlier Detection
   157  // Balancer is configured as the top level LB Policy of the channel with a Round
   158  // Robin child, and connects to three upstreams. Two of the upstreams are healthy and
   159  // one is unhealthy. The two algorithms should at some point eject the failing
   160  // upstream, causing RPC's to not be routed to that upstream, and only be
   161  // Round Robined across the two healthy upstreams. Other than the intervals the
   162  // unhealthy upstream is ejected, RPC's should regularly round robin
   163  // across all three upstreams.
   164  func (s) TestOutlierDetectionAlgorithmsE2E(t *testing.T) {
   165  	tests := []struct {
   166  		name     string
   167  		odscJSON string
   168  	}{
   169  		{
   170  			name: "Success Rate Algorithm",
   171  			odscJSON: fmt.Sprintf(`
   172  			{
   173  			  "loadBalancingConfig": [
   174  				{
   175  				  "outlier_detection_experimental": {
   176  					"interval": "0.050s",
   177  					"baseEjectionTime": "0.100s",
   178  					"maxEjectionTime": "300s",
   179  					"maxEjectionPercent": 33,
   180  					"successRateEjection": {
   181  						"stdevFactor": 50,
   182  						"enforcementPercentage": 100,
   183  						"minimumHosts": 3,
   184  						"requestVolume": 5
   185  					},
   186  					"childPolicy": [{"%s": {}}]
   187  				  }
   188  				}
   189  			  ]
   190  			}`, leafPolicyName),
   191  		},
   192  		{
   193  			name: "Failure Percentage Algorithm",
   194  			odscJSON: fmt.Sprintf(`
   195  			{
   196  			  "loadBalancingConfig": [
   197  				{
   198  				  "outlier_detection_experimental": {
   199  					"interval": "0.050s",
   200  					"baseEjectionTime": "0.100s",
   201  					"maxEjectionTime": "300s",
   202  					"maxEjectionPercent": 33,
   203  					"failurePercentageEjection": {
   204  						"threshold": 50,
   205  						"enforcementPercentage": 100,
   206  						"minimumHosts": 3,
   207  						"requestVolume": 5
   208  					},
   209  					"childPolicy": [{"%s": {}}
   210  					]
   211  				  }
   212  				}
   213  			  ]
   214  			}`, leafPolicyName),
   215  		},
   216  	}
   217  	for _, test := range tests {
   218  		t.Run(test.name, func(t *testing.T) {
   219  			addresses, cancel := setupBackends(t)
   220  			defer cancel()
   221  
   222  			mr := manual.NewBuilderWithScheme("od-e2e")
   223  			defer mr.Close()
   224  
   225  			sc := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(test.odscJSON)
   226  			// The full list of addresses.
   227  			fullAddresses := []resolver.Address{
   228  				{Addr: addresses[0]},
   229  				{Addr: addresses[1]},
   230  				{Addr: addresses[2]},
   231  			}
   232  			mr.InitialState(resolver.State{
   233  				Addresses:     fullAddresses,
   234  				ServiceConfig: sc,
   235  			})
   236  
   237  			cc, err := grpc.NewClient(mr.Scheme()+":///", grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials()))
   238  			if err != nil {
   239  				t.Fatalf("grpc.NewClient() failed: %v", err)
   240  			}
   241  			defer cc.Close()
   242  			ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   243  			defer cancel()
   244  			testServiceClient := testgrpc.NewTestServiceClient(cc)
   245  
   246  			// At first, due to no statistics on each of the backends, the 3
   247  			// upstreams should all be round robined across.
   248  			if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil {
   249  				t.Fatalf("error in expected round robin: %v", err)
   250  			}
   251  
   252  			// The addresses which don't return errors.
   253  			okAddresses := []resolver.Address{
   254  				{Addr: addresses[0]},
   255  				{Addr: addresses[1]},
   256  			}
   257  			// After calling the three upstreams, one of them constantly error
   258  			// and should eventually be ejected for a period of time. This
   259  			// period of time should cause the RPC's to be round robined only
   260  			// across the two that are healthy.
   261  			if err = checkRoundRobinRPCs(ctx, testServiceClient, okAddresses); err != nil {
   262  				t.Fatalf("error in expected round robin: %v", err)
   263  			}
   264  
   265  			// The failing upstream isn't ejected indefinitely, and eventually
   266  			// should be unejected in subsequent iterations of the interval
   267  			// algorithm as per the spec for the two specific algorithms.
   268  			if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil {
   269  				t.Fatalf("error in expected round robin: %v", err)
   270  			}
   271  		})
   272  	}
   273  }
   274  
   275  // TestNoopConfiguration tests the Outlier Detection Balancer configured with a
   276  // noop configuration. The noop configuration should cause the Outlier Detection
   277  // Balancer to not count RPC's, and thus never eject any upstreams and continue
   278  // to route to every upstream connected to, even if they continuously error.
   279  // Once the Outlier Detection Balancer gets reconfigured with configuration
   280  // requiring counting RPC's, the Outlier Detection Balancer should start
   281  // ejecting any upstreams as specified in the configuration.
   282  func (s) TestNoopConfiguration(t *testing.T) {
   283  	addresses, cancel := setupBackends(t)
   284  	defer cancel()
   285  
   286  	mr := manual.NewBuilderWithScheme("od-e2e")
   287  	defer mr.Close()
   288  
   289  	noopODServiceConfigJSON := fmt.Sprintf(`
   290  	{
   291  	  "loadBalancingConfig": [
   292  		{
   293  		  "outlier_detection_experimental": {
   294  			"interval": "0.050s",
   295  			"baseEjectionTime": "0.100s",
   296  			"maxEjectionTime": "300s",
   297  			"maxEjectionPercent": 33,
   298  			"childPolicy": [{"%s": {}}]
   299  		  }
   300  		}
   301  	  ]
   302  	}`, leafPolicyName)
   303  	sc := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(noopODServiceConfigJSON)
   304  	// The full list of addresses.
   305  	fullAddresses := []resolver.Address{
   306  		{Addr: addresses[0]},
   307  		{Addr: addresses[1]},
   308  		{Addr: addresses[2]},
   309  	}
   310  	mr.InitialState(resolver.State{
   311  		Addresses:     fullAddresses,
   312  		ServiceConfig: sc,
   313  	})
   314  	cc, err := grpc.NewClient(mr.Scheme()+":///", grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials()))
   315  	if err != nil {
   316  		t.Fatalf("grpc.NewClient() failed: %v", err)
   317  	}
   318  	defer cc.Close()
   319  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   320  	defer cancel()
   321  	testServiceClient := testgrpc.NewTestServiceClient(cc)
   322  
   323  	for i := 0; i < 2; i++ {
   324  		// Since the Outlier Detection Balancer starts with a noop
   325  		// configuration, it shouldn't count RPCs or eject any upstreams. Thus,
   326  		// even though an upstream it connects to constantly errors, it should
   327  		// continue to Round Robin across every upstream.
   328  		if err := checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil {
   329  			t.Fatalf("error in expected round robin: %v", err)
   330  		}
   331  	}
   332  
   333  	// Reconfigure the Outlier Detection Balancer with a configuration that
   334  	// specifies to count RPC's and eject upstreams. Due to the balancer no
   335  	// longer being a noop, it should eject any unhealthy addresses as specified
   336  	// by the failure percentage portion of the configuration.
   337  	countingODServiceConfigJSON := fmt.Sprintf(`
   338  	{
   339  	  "loadBalancingConfig": [
   340  		{
   341  		  "outlier_detection_experimental": {
   342  			"interval": "0.050s",
   343  			"baseEjectionTime": "0.100s",
   344  			"maxEjectionTime": "300s",
   345  			"maxEjectionPercent": 33,
   346  			"failurePercentageEjection": {
   347  				"threshold": 50,
   348  				"enforcementPercentage": 100,
   349  				"minimumHosts": 3,
   350  				"requestVolume": 5
   351  			},
   352  			"childPolicy": [{"%s": {}}]
   353  		  }
   354  		}
   355  	  ]
   356  	}`, leafPolicyName)
   357  	sc = internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(countingODServiceConfigJSON)
   358  
   359  	mr.UpdateState(resolver.State{
   360  		Addresses:     fullAddresses,
   361  		ServiceConfig: sc,
   362  	})
   363  
   364  	// At first on the reconfigured balancer, the balancer has no stats
   365  	// collected about upstreams. Thus, it should at first route across the full
   366  	// upstream list.
   367  	if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil {
   368  		t.Fatalf("error in expected round robin: %v", err)
   369  	}
   370  
   371  	// The addresses which don't return errors.
   372  	okAddresses := []resolver.Address{
   373  		{Addr: addresses[0]},
   374  		{Addr: addresses[1]},
   375  	}
   376  	// Now that the reconfigured balancer has data about the failing upstream,
   377  	// it should eject the upstream and only route across the two healthy
   378  	// upstreams.
   379  	if err = checkRoundRobinRPCs(ctx, testServiceClient, okAddresses); err != nil {
   380  		t.Fatalf("error in expected round robin: %v", err)
   381  	}
   382  }