google.golang.org/grpc@v1.72.2/test/xds/xds_client_outlier_detection_test.go (about)

     1  /*
     2   *
     3   * Copyright 2022 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xds_test
    20  
    21  import (
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"testing"
    26  	"time"
    27  
    28  	v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
    29  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    30  	v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
    31  	v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
    32  	"github.com/google/go-cmp/cmp"
    33  	"google.golang.org/grpc"
    34  	"google.golang.org/grpc/credentials/insecure"
    35  	"google.golang.org/grpc/internal/stubserver"
    36  	"google.golang.org/grpc/internal/testutils"
    37  	"google.golang.org/grpc/internal/testutils/xds/e2e"
    38  	"google.golang.org/grpc/internal/testutils/xds/e2e/setup"
    39  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    40  	testpb "google.golang.org/grpc/interop/grpc_testing"
    41  	"google.golang.org/grpc/peer"
    42  	"google.golang.org/grpc/resolver"
    43  	"google.golang.org/protobuf/types/known/durationpb"
    44  	"google.golang.org/protobuf/types/known/wrapperspb"
    45  )
    46  
    47  // TestOutlierDetection_NoopConfig tests the scenario where the Outlier
    48  // Detection feature is enabled on the gRPC client, but it receives no Outlier
    49  // Detection configuration from the management server. This should result in a
    50  // no-op Outlier Detection configuration being used to configure the Outlier
    51  // Detection balancer. This test verifies that an RPC is able to proceed
    52  // normally with this configuration.
    53  func (s) TestOutlierDetection_NoopConfig(t *testing.T) {
    54  	managementServer, nodeID, _, xdsResolver := setup.ManagementServerAndResolver(t)
    55  
    56  	server := &stubserver.StubServer{
    57  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return &testpb.Empty{}, nil },
    58  	}
    59  	server.StartServer()
    60  	t.Logf("Started test service backend at %q", server.Address)
    61  	defer server.Stop()
    62  
    63  	const serviceName = "my-service-client-side-xds"
    64  	resources := e2e.DefaultClientResources(e2e.ResourceParams{
    65  		DialTarget: serviceName,
    66  		NodeID:     nodeID,
    67  		Host:       "localhost",
    68  		Port:       testutils.ParsePort(t, server.Address),
    69  		SecLevel:   e2e.SecurityLevelNone,
    70  	})
    71  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
    72  	defer cancel()
    73  	if err := managementServer.Update(ctx, resources); err != nil {
    74  		t.Fatal(err)
    75  	}
    76  
    77  	// Create a ClientConn and make a successful RPC.
    78  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(xdsResolver))
    79  	if err != nil {
    80  		t.Fatalf("failed to dial local test server: %v", err)
    81  	}
    82  	defer cc.Close()
    83  
    84  	client := testgrpc.NewTestServiceClient(cc)
    85  	if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.WaitForReady(true)); err != nil {
    86  		t.Fatalf("rpc EmptyCall() failed: %v", err)
    87  	}
    88  }
    89  
    90  // clientResourcesMultipleBackendsAndOD returns xDS resources which correspond
    91  // to multiple upstreams, corresponding different backends listening on
    92  // different localhost:port combinations. The resources also configure an
    93  // Outlier Detection Balancer configured through the passed in Outlier Detection
    94  // proto.
    95  func clientResourcesMultipleBackendsAndOD(params e2e.ResourceParams, ports []uint32, od *v3clusterpb.OutlierDetection) e2e.UpdateOptions {
    96  	routeConfigName := "route-" + params.DialTarget
    97  	clusterName := "cluster-" + params.DialTarget
    98  	endpointsName := "endpoints-" + params.DialTarget
    99  	return e2e.UpdateOptions{
   100  		NodeID:    params.NodeID,
   101  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(params.DialTarget, routeConfigName)},
   102  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(routeConfigName, params.DialTarget, clusterName)},
   103  		Clusters:  []*v3clusterpb.Cluster{clusterWithOutlierDetection(clusterName, endpointsName, params.SecLevel, od)},
   104  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(endpointsName, params.Host, ports)},
   105  	}
   106  }
   107  
   108  func clusterWithOutlierDetection(clusterName, edsServiceName string, secLevel e2e.SecurityLevel, od *v3clusterpb.OutlierDetection) *v3clusterpb.Cluster {
   109  	cluster := e2e.DefaultCluster(clusterName, edsServiceName, secLevel)
   110  	cluster.OutlierDetection = od
   111  	return cluster
   112  }
   113  
   114  // checkRoundRobinRPCs verifies that EmptyCall RPCs on the given ClientConn,
   115  // connected to a server exposing the test.grpc_testing.TestService, are
   116  // roundrobined across the given backend addresses.
   117  //
   118  // Returns a non-nil error if context deadline expires before RPCs start to get
   119  // roundrobined across the given backends.
   120  func checkRoundRobinRPCs(ctx context.Context, client testgrpc.TestServiceClient, addrs []resolver.Address) error {
   121  	wantAddrCount := make(map[string]int)
   122  	for _, addr := range addrs {
   123  		wantAddrCount[addr.Addr]++
   124  	}
   125  	for ; ctx.Err() == nil; <-time.After(time.Millisecond) {
   126  		// Perform 3 iterations.
   127  		var iterations [][]string
   128  		for i := 0; i < 3; i++ {
   129  			iteration := make([]string, len(addrs))
   130  			for c := 0; c < len(addrs); c++ {
   131  				var peer peer.Peer
   132  				client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer))
   133  				if peer.Addr != nil {
   134  					iteration[c] = peer.Addr.String()
   135  				}
   136  			}
   137  			iterations = append(iterations, iteration)
   138  		}
   139  		// Ensure the first iteration contains all addresses in addrs.
   140  		gotAddrCount := make(map[string]int)
   141  		for _, addr := range iterations[0] {
   142  			gotAddrCount[addr]++
   143  		}
   144  		if diff := cmp.Diff(gotAddrCount, wantAddrCount); diff != "" {
   145  			continue
   146  		}
   147  		// Ensure all three iterations contain the same addresses.
   148  		if !cmp.Equal(iterations[0], iterations[1]) || !cmp.Equal(iterations[0], iterations[2]) {
   149  			continue
   150  		}
   151  		return nil
   152  	}
   153  	return fmt.Errorf("timeout when waiting for roundrobin distribution of RPCs across addresses: %v", addrs)
   154  }
   155  
   156  // TestOutlierDetectionWithOutlier tests the Outlier Detection Balancer e2e. It
   157  // spins up three backends, one which consistently errors, and configures the
   158  // ClientConn using xDS to connect to all three of those backends. The Outlier
   159  // Detection Balancer should eject the connection to the backend which
   160  // constantly errors, causing RPC's to not be routed to that upstream, and only
   161  // be Round Robined across the two healthy upstreams. Other than the intervals
   162  // the unhealthy upstream is ejected, RPC's should regularly round robin across
   163  // all three upstreams.
   164  func (s) TestOutlierDetectionWithOutlier(t *testing.T) {
   165  	managementServer, nodeID, _, xdsResolver := setup.ManagementServerAndResolver(t)
   166  
   167  	// Working backend 1.
   168  	backend1 := stubserver.StartTestService(t, nil)
   169  	port1 := testutils.ParsePort(t, backend1.Address)
   170  	defer backend1.Stop()
   171  
   172  	// Working backend 2.
   173  	backend2 := stubserver.StartTestService(t, nil)
   174  	port2 := testutils.ParsePort(t, backend2.Address)
   175  	defer backend2.Stop()
   176  
   177  	// Backend 3 that will always return an error and eventually ejected.
   178  	backend3 := stubserver.StartTestService(t, &stubserver.StubServer{
   179  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return nil, errors.New("some error") },
   180  	})
   181  	port3 := testutils.ParsePort(t, backend3.Address)
   182  	defer backend3.Stop()
   183  
   184  	const serviceName = "my-service-client-side-xds"
   185  	resources := clientResourcesMultipleBackendsAndOD(e2e.ResourceParams{
   186  		DialTarget: serviceName,
   187  		NodeID:     nodeID,
   188  		Host:       "localhost",
   189  		SecLevel:   e2e.SecurityLevelNone,
   190  	}, []uint32{port1, port2, port3}, &v3clusterpb.OutlierDetection{
   191  		Interval:                       &durationpb.Duration{Nanos: 50000000}, // .5 seconds
   192  		BaseEjectionTime:               &durationpb.Duration{Seconds: 30},
   193  		MaxEjectionTime:                &durationpb.Duration{Seconds: 300},
   194  		MaxEjectionPercent:             &wrapperspb.UInt32Value{Value: 1},
   195  		FailurePercentageThreshold:     &wrapperspb.UInt32Value{Value: 50},
   196  		EnforcingFailurePercentage:     &wrapperspb.UInt32Value{Value: 100},
   197  		FailurePercentageRequestVolume: &wrapperspb.UInt32Value{Value: 8},
   198  		FailurePercentageMinimumHosts:  &wrapperspb.UInt32Value{Value: 3},
   199  	})
   200  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   201  	defer cancel()
   202  	if err := managementServer.Update(ctx, resources); err != nil {
   203  		t.Fatal(err)
   204  	}
   205  
   206  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(xdsResolver))
   207  	if err != nil {
   208  		t.Fatalf("failed to dial local test server: %v", err)
   209  	}
   210  	defer cc.Close()
   211  
   212  	client := testgrpc.NewTestServiceClient(cc)
   213  
   214  	fullAddresses := []resolver.Address{
   215  		{Addr: backend1.Address},
   216  		{Addr: backend2.Address},
   217  		{Addr: backend3.Address},
   218  	}
   219  	// At first, due to no statistics on each of the backends, the 3
   220  	// upstreams should all be round robined across.
   221  	if err = checkRoundRobinRPCs(ctx, client, fullAddresses); err != nil {
   222  		t.Fatalf("error in expected round robin: %v", err)
   223  	}
   224  
   225  	// The addresses which don't return errors.
   226  	okAddresses := []resolver.Address{
   227  		{Addr: backend1.Address},
   228  		{Addr: backend2.Address},
   229  	}
   230  	// After calling the three upstreams, one of them constantly error
   231  	// and should eventually be ejected for a period of time. This
   232  	// period of time should cause the RPC's to be round robined only
   233  	// across the two that are healthy.
   234  	if err = checkRoundRobinRPCs(ctx, client, okAddresses); err != nil {
   235  		t.Fatalf("error in expected round robin: %v", err)
   236  	}
   237  }
   238  
   239  // TestOutlierDetectionXDSDefaultOn tests that Outlier Detection is by default
   240  // configured on in the xDS Flow. If the Outlier Detection proto message is
   241  // present with SuccessRateEjection unset, then Outlier Detection should be
   242  // turned on. The test setups and xDS system with xDS resources with Outlier
   243  // Detection present in the CDS update, but with SuccessRateEjection unset, and
   244  // asserts that Outlier Detection is turned on and ejects upstreams.
   245  func (s) TestOutlierDetectionXDSDefaultOn(t *testing.T) {
   246  	managementServer, nodeID, _, xdsResolver := setup.ManagementServerAndResolver(t)
   247  
   248  	// Working backend 1.
   249  	backend1 := stubserver.StartTestService(t, nil)
   250  	port1 := testutils.ParsePort(t, backend1.Address)
   251  	defer backend1.Stop()
   252  
   253  	// Working backend 2.
   254  	backend2 := stubserver.StartTestService(t, nil)
   255  	port2 := testutils.ParsePort(t, backend2.Address)
   256  	defer backend2.Stop()
   257  
   258  	// Backend 3 that will always return an error and eventually ejected.
   259  	backend3 := stubserver.StartTestService(t, &stubserver.StubServer{
   260  		EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return nil, errors.New("some error") },
   261  	})
   262  	port3 := testutils.ParsePort(t, backend3.Address)
   263  	defer backend3.Stop()
   264  
   265  	// Configure CDS resources with Outlier Detection set but
   266  	// EnforcingSuccessRate unset. This should cause Outlier Detection to be
   267  	// configured with SuccessRateEjection present in configuration, which will
   268  	// eventually be populated with its default values along with the knobs set
   269  	// as SuccessRate fields in the proto, and thus Outlier Detection should be
   270  	// on and actively eject upstreams.
   271  	const serviceName = "my-service-client-side-xds"
   272  	resources := clientResourcesMultipleBackendsAndOD(e2e.ResourceParams{
   273  		DialTarget: serviceName,
   274  		NodeID:     nodeID,
   275  		Host:       "localhost",
   276  		SecLevel:   e2e.SecurityLevelNone,
   277  	}, []uint32{port1, port2, port3}, &v3clusterpb.OutlierDetection{
   278  		// Need to set knobs to trigger ejection within the test time frame.
   279  		Interval: &durationpb.Duration{Nanos: 50000000},
   280  		// EnforcingSuccessRateSet to nil, causes success rate algorithm to be
   281  		// turned on.
   282  		SuccessRateMinimumHosts:  &wrapperspb.UInt32Value{Value: 1},
   283  		SuccessRateRequestVolume: &wrapperspb.UInt32Value{Value: 8},
   284  		SuccessRateStdevFactor:   &wrapperspb.UInt32Value{Value: 1},
   285  	})
   286  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   287  	defer cancel()
   288  	if err := managementServer.Update(ctx, resources); err != nil {
   289  		t.Fatal(err)
   290  	}
   291  
   292  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(xdsResolver))
   293  	if err != nil {
   294  		t.Fatalf("failed to dial local test server: %v", err)
   295  	}
   296  	defer cc.Close()
   297  
   298  	client := testgrpc.NewTestServiceClient(cc)
   299  
   300  	fullAddresses := []resolver.Address{
   301  		{Addr: backend1.Address},
   302  		{Addr: backend2.Address},
   303  		{Addr: backend3.Address},
   304  	}
   305  	// At first, due to no statistics on each of the backends, the 3
   306  	// upstreams should all be round robined across.
   307  	if err = checkRoundRobinRPCs(ctx, client, fullAddresses); err != nil {
   308  		t.Fatalf("error in expected round robin: %v", err)
   309  	}
   310  
   311  	// The addresses which don't return errors.
   312  	okAddresses := []resolver.Address{
   313  		{Addr: backend1.Address},
   314  		{Addr: backend2.Address},
   315  	}
   316  	// After calling the three upstreams, one of them constantly error
   317  	// and should eventually be ejected for a period of time. This
   318  	// period of time should cause the RPC's to be round robined only
   319  	// across the two that are healthy.
   320  	if err = checkRoundRobinRPCs(ctx, client, okAddresses); err != nil {
   321  		t.Fatalf("error in expected round robin: %v", err)
   322  	}
   323  }