google.golang.org/grpc@v1.62.1/test/xds/xds_client_outlier_detection_test.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package xds_test 20 21 import ( 22 "context" 23 "errors" 24 "fmt" 25 "testing" 26 "time" 27 28 v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" 29 v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" 30 v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" 31 v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" 32 "github.com/google/go-cmp/cmp" 33 "google.golang.org/grpc" 34 "google.golang.org/grpc/credentials/insecure" 35 "google.golang.org/grpc/internal/stubserver" 36 "google.golang.org/grpc/internal/testutils" 37 "google.golang.org/grpc/internal/testutils/xds/e2e" 38 testgrpc "google.golang.org/grpc/interop/grpc_testing" 39 testpb "google.golang.org/grpc/interop/grpc_testing" 40 "google.golang.org/grpc/peer" 41 "google.golang.org/grpc/resolver" 42 "google.golang.org/protobuf/types/known/durationpb" 43 "google.golang.org/protobuf/types/known/wrapperspb" 44 ) 45 46 // TestOutlierDetection_NoopConfig tests the scenario where the Outlier 47 // Detection feature is enabled on the gRPC client, but it receives no Outlier 48 // Detection configuration from the management server. This should result in a 49 // no-op Outlier Detection configuration being used to configure the Outlier 50 // Detection balancer. This test verifies that an RPC is able to proceed 51 // normally with this configuration. 52 func (s) TestOutlierDetection_NoopConfig(t *testing.T) { 53 managementServer, nodeID, _, resolver, cleanup1 := e2e.SetupManagementServer(t, e2e.ManagementServerOptions{}) 54 defer cleanup1() 55 56 server := &stubserver.StubServer{ 57 EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return &testpb.Empty{}, nil }, 58 } 59 server.StartServer() 60 t.Logf("Started test service backend at %q", server.Address) 61 defer server.Stop() 62 63 const serviceName = "my-service-client-side-xds" 64 resources := e2e.DefaultClientResources(e2e.ResourceParams{ 65 DialTarget: serviceName, 66 NodeID: nodeID, 67 Host: "localhost", 68 Port: testutils.ParsePort(t, server.Address), 69 SecLevel: e2e.SecurityLevelNone, 70 }) 71 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 72 defer cancel() 73 if err := managementServer.Update(ctx, resources); err != nil { 74 t.Fatal(err) 75 } 76 77 // Create a ClientConn and make a successful RPC. 78 cc, err := grpc.Dial(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver)) 79 if err != nil { 80 t.Fatalf("failed to dial local test server: %v", err) 81 } 82 defer cc.Close() 83 84 client := testgrpc.NewTestServiceClient(cc) 85 if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.WaitForReady(true)); err != nil { 86 t.Fatalf("rpc EmptyCall() failed: %v", err) 87 } 88 } 89 90 // clientResourcesMultipleBackendsAndOD returns xDS resources which correspond 91 // to multiple upstreams, corresponding different backends listening on 92 // different localhost:port combinations. The resources also configure an 93 // Outlier Detection Balancer configured through the passed in Outlier Detection 94 // proto. 95 func clientResourcesMultipleBackendsAndOD(params e2e.ResourceParams, ports []uint32, od *v3clusterpb.OutlierDetection) e2e.UpdateOptions { 96 routeConfigName := "route-" + params.DialTarget 97 clusterName := "cluster-" + params.DialTarget 98 endpointsName := "endpoints-" + params.DialTarget 99 return e2e.UpdateOptions{ 100 NodeID: params.NodeID, 101 Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(params.DialTarget, routeConfigName)}, 102 Routes: []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(routeConfigName, params.DialTarget, clusterName)}, 103 Clusters: []*v3clusterpb.Cluster{clusterWithOutlierDetection(clusterName, endpointsName, params.SecLevel, od)}, 104 Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(endpointsName, params.Host, ports)}, 105 } 106 } 107 108 func clusterWithOutlierDetection(clusterName, edsServiceName string, secLevel e2e.SecurityLevel, od *v3clusterpb.OutlierDetection) *v3clusterpb.Cluster { 109 cluster := e2e.DefaultCluster(clusterName, edsServiceName, secLevel) 110 cluster.OutlierDetection = od 111 return cluster 112 } 113 114 // checkRoundRobinRPCs verifies that EmptyCall RPCs on the given ClientConn, 115 // connected to a server exposing the test.grpc_testing.TestService, are 116 // roundrobined across the given backend addresses. 117 // 118 // Returns a non-nil error if context deadline expires before RPCs start to get 119 // roundrobined across the given backends. 120 func checkRoundRobinRPCs(ctx context.Context, client testgrpc.TestServiceClient, addrs []resolver.Address) error { 121 wantAddrCount := make(map[string]int) 122 for _, addr := range addrs { 123 wantAddrCount[addr.Addr]++ 124 } 125 for ; ctx.Err() == nil; <-time.After(time.Millisecond) { 126 // Perform 3 iterations. 127 var iterations [][]string 128 for i := 0; i < 3; i++ { 129 iteration := make([]string, len(addrs)) 130 for c := 0; c < len(addrs); c++ { 131 var peer peer.Peer 132 client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)) 133 if peer.Addr != nil { 134 iteration[c] = peer.Addr.String() 135 } 136 } 137 iterations = append(iterations, iteration) 138 } 139 // Ensure the the first iteration contains all addresses in addrs. 140 gotAddrCount := make(map[string]int) 141 for _, addr := range iterations[0] { 142 gotAddrCount[addr]++ 143 } 144 if diff := cmp.Diff(gotAddrCount, wantAddrCount); diff != "" { 145 continue 146 } 147 // Ensure all three iterations contain the same addresses. 148 if !cmp.Equal(iterations[0], iterations[1]) || !cmp.Equal(iterations[0], iterations[2]) { 149 continue 150 } 151 return nil 152 } 153 return fmt.Errorf("timeout when waiting for roundrobin distribution of RPCs across addresses: %v", addrs) 154 } 155 156 // TestOutlierDetectionWithOutlier tests the Outlier Detection Balancer e2e. It 157 // spins up three backends, one which consistently errors, and configures the 158 // ClientConn using xDS to connect to all three of those backends. The Outlier 159 // Detection Balancer should eject the connection to the backend which 160 // constantly errors, causing RPC's to not be routed to that upstream, and only 161 // be Round Robined across the two healthy upstreams. Other than the intervals 162 // the unhealthy upstream is ejected, RPC's should regularly round robin across 163 // all three upstreams. 164 func (s) TestOutlierDetectionWithOutlier(t *testing.T) { 165 managementServer, nodeID, _, r, cleanup := e2e.SetupManagementServer(t, e2e.ManagementServerOptions{}) 166 defer cleanup() 167 168 // Working backend 1. 169 backend1 := stubserver.StartTestService(t, nil) 170 port1 := testutils.ParsePort(t, backend1.Address) 171 defer backend1.Stop() 172 173 // Working backend 2. 174 backend2 := stubserver.StartTestService(t, nil) 175 port2 := testutils.ParsePort(t, backend2.Address) 176 defer backend2.Stop() 177 178 // Backend 3 that will always return an error and eventually ejected. 179 backend3 := stubserver.StartTestService(t, &stubserver.StubServer{ 180 EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return nil, errors.New("some error") }, 181 }) 182 port3 := testutils.ParsePort(t, backend3.Address) 183 defer backend3.Stop() 184 185 const serviceName = "my-service-client-side-xds" 186 resources := clientResourcesMultipleBackendsAndOD(e2e.ResourceParams{ 187 DialTarget: serviceName, 188 NodeID: nodeID, 189 Host: "localhost", 190 SecLevel: e2e.SecurityLevelNone, 191 }, []uint32{port1, port2, port3}, &v3clusterpb.OutlierDetection{ 192 Interval: &durationpb.Duration{Nanos: 50000000}, // .5 seconds 193 BaseEjectionTime: &durationpb.Duration{Seconds: 30}, 194 MaxEjectionTime: &durationpb.Duration{Seconds: 300}, 195 MaxEjectionPercent: &wrapperspb.UInt32Value{Value: 1}, 196 FailurePercentageThreshold: &wrapperspb.UInt32Value{Value: 50}, 197 EnforcingFailurePercentage: &wrapperspb.UInt32Value{Value: 100}, 198 FailurePercentageRequestVolume: &wrapperspb.UInt32Value{Value: 8}, 199 FailurePercentageMinimumHosts: &wrapperspb.UInt32Value{Value: 3}, 200 }) 201 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 202 defer cancel() 203 if err := managementServer.Update(ctx, resources); err != nil { 204 t.Fatal(err) 205 } 206 207 cc, err := grpc.Dial(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(r)) 208 if err != nil { 209 t.Fatalf("failed to dial local test server: %v", err) 210 } 211 defer cc.Close() 212 213 client := testgrpc.NewTestServiceClient(cc) 214 215 fullAddresses := []resolver.Address{ 216 {Addr: backend1.Address}, 217 {Addr: backend2.Address}, 218 {Addr: backend3.Address}, 219 } 220 // At first, due to no statistics on each of the backends, the 3 221 // upstreams should all be round robined across. 222 if err = checkRoundRobinRPCs(ctx, client, fullAddresses); err != nil { 223 t.Fatalf("error in expected round robin: %v", err) 224 } 225 226 // The addresses which don't return errors. 227 okAddresses := []resolver.Address{ 228 {Addr: backend1.Address}, 229 {Addr: backend2.Address}, 230 } 231 // After calling the three upstreams, one of them constantly error 232 // and should eventually be ejected for a period of time. This 233 // period of time should cause the RPC's to be round robined only 234 // across the two that are healthy. 235 if err = checkRoundRobinRPCs(ctx, client, okAddresses); err != nil { 236 t.Fatalf("error in expected round robin: %v", err) 237 } 238 } 239 240 // TestOutlierDetectionXDSDefaultOn tests that Outlier Detection is by default 241 // configured on in the xDS Flow. If the Outlier Detection proto message is 242 // present with SuccessRateEjection unset, then Outlier Detection should be 243 // turned on. The test setups and xDS system with xDS resources with Outlier 244 // Detection present in the CDS update, but with SuccessRateEjection unset, and 245 // asserts that Outlier Detection is turned on and ejects upstreams. 246 func (s) TestOutlierDetectionXDSDefaultOn(t *testing.T) { 247 managementServer, nodeID, _, r, cleanup := e2e.SetupManagementServer(t, e2e.ManagementServerOptions{}) 248 defer cleanup() 249 250 // Working backend 1. 251 backend1 := stubserver.StartTestService(t, nil) 252 port1 := testutils.ParsePort(t, backend1.Address) 253 defer backend1.Stop() 254 255 // Working backend 2. 256 backend2 := stubserver.StartTestService(t, nil) 257 port2 := testutils.ParsePort(t, backend2.Address) 258 defer backend2.Stop() 259 260 // Backend 3 that will always return an error and eventually ejected. 261 backend3 := stubserver.StartTestService(t, &stubserver.StubServer{ 262 EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return nil, errors.New("some error") }, 263 }) 264 port3 := testutils.ParsePort(t, backend3.Address) 265 defer backend3.Stop() 266 267 // Configure CDS resources with Outlier Detection set but 268 // EnforcingSuccessRate unset. This should cause Outlier Detection to be 269 // configured with SuccessRateEjection present in configuration, which will 270 // eventually be populated with its default values along with the knobs set 271 // as SuccessRate fields in the proto, and thus Outlier Detection should be 272 // on and actively eject upstreams. 273 const serviceName = "my-service-client-side-xds" 274 resources := clientResourcesMultipleBackendsAndOD(e2e.ResourceParams{ 275 DialTarget: serviceName, 276 NodeID: nodeID, 277 Host: "localhost", 278 SecLevel: e2e.SecurityLevelNone, 279 }, []uint32{port1, port2, port3}, &v3clusterpb.OutlierDetection{ 280 // Need to set knobs to trigger ejection within the test time frame. 281 Interval: &durationpb.Duration{Nanos: 50000000}, 282 // EnforcingSuccessRateSet to nil, causes success rate algorithm to be 283 // turned on. 284 SuccessRateMinimumHosts: &wrapperspb.UInt32Value{Value: 1}, 285 SuccessRateRequestVolume: &wrapperspb.UInt32Value{Value: 8}, 286 SuccessRateStdevFactor: &wrapperspb.UInt32Value{Value: 1}, 287 }) 288 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 289 defer cancel() 290 if err := managementServer.Update(ctx, resources); err != nil { 291 t.Fatal(err) 292 } 293 294 cc, err := grpc.Dial(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(r)) 295 if err != nil { 296 t.Fatalf("failed to dial local test server: %v", err) 297 } 298 defer cc.Close() 299 300 client := testgrpc.NewTestServiceClient(cc) 301 302 fullAddresses := []resolver.Address{ 303 {Addr: backend1.Address}, 304 {Addr: backend2.Address}, 305 {Addr: backend3.Address}, 306 } 307 // At first, due to no statistics on each of the backends, the 3 308 // upstreams should all be round robined across. 309 if err = checkRoundRobinRPCs(ctx, client, fullAddresses); err != nil { 310 t.Fatalf("error in expected round robin: %v", err) 311 } 312 313 // The addresses which don't return errors. 314 okAddresses := []resolver.Address{ 315 {Addr: backend1.Address}, 316 {Addr: backend2.Address}, 317 } 318 // After calling the three upstreams, one of them constantly error 319 // and should eventually be ejected for a period of time. This 320 // period of time should cause the RPC's to be round robined only 321 // across the two that are healthy. 322 if err = checkRoundRobinRPCs(ctx, client, okAddresses); err != nil { 323 t.Fatalf("error in expected round robin: %v", err) 324 } 325 }