google.golang.org/grpc@v1.72.2/xds/internal/balancer/outlierdetection/e2e_test/outlierdetection_test.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 // Package e2e_test contains e2e test cases for the Outlier Detection LB Policy. 20 package e2e_test 21 22 import ( 23 "context" 24 "errors" 25 "fmt" 26 "testing" 27 "time" 28 29 "github.com/google/go-cmp/cmp" 30 "google.golang.org/grpc" 31 "google.golang.org/grpc/balancer/weightedroundrobin" 32 "google.golang.org/grpc/credentials/insecure" 33 "google.golang.org/grpc/internal" 34 "google.golang.org/grpc/internal/envconfig" 35 "google.golang.org/grpc/internal/grpctest" 36 "google.golang.org/grpc/internal/stubserver" 37 "google.golang.org/grpc/peer" 38 "google.golang.org/grpc/resolver" 39 "google.golang.org/grpc/resolver/manual" 40 "google.golang.org/grpc/serviceconfig" 41 42 testgrpc "google.golang.org/grpc/interop/grpc_testing" 43 testpb "google.golang.org/grpc/interop/grpc_testing" 44 45 _ "google.golang.org/grpc/xds/internal/balancer/outlierdetection" // To register helper functions which register/unregister Outlier Detection LB Policy. 46 ) 47 48 var ( 49 defaultTestTimeout = 5 * time.Second 50 leafPolicyName = "round_robin" 51 ) 52 53 func init() { 54 // Test the health listener code path for ejection when the experimental 55 // pickfirst is enabled. 56 if envconfig.NewPickFirstEnabled { 57 leafPolicyName = weightedroundrobin.Name 58 } 59 } 60 61 type s struct { 62 grpctest.Tester 63 } 64 65 func Test(t *testing.T) { 66 grpctest.RunSubTests(t, s{}) 67 } 68 69 // Setup spins up three test backends, each listening on a port on localhost. 70 // Two of the backends are configured to always reply with an empty response and 71 // no error and one is configured to always return an error. 72 func setupBackends(t *testing.T) ([]string, func()) { 73 t.Helper() 74 75 backends := make([]*stubserver.StubServer, 3) 76 addresses := make([]string, 3) 77 // Construct and start 2 working backends. 78 for i := 0; i < 2; i++ { 79 backend := &stubserver.StubServer{ 80 EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) { 81 return &testpb.Empty{}, nil 82 }, 83 } 84 if err := backend.StartServer(); err != nil { 85 t.Fatalf("Failed to start backend: %v", err) 86 } 87 t.Logf("Started good TestService backend at: %q", backend.Address) 88 backends[i] = backend 89 addresses[i] = backend.Address 90 } 91 92 // Construct and start a failing backend. 93 backend := &stubserver.StubServer{ 94 EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) { 95 return nil, errors.New("some error") 96 }, 97 } 98 if err := backend.StartServer(); err != nil { 99 t.Fatalf("Failed to start backend: %v", err) 100 } 101 t.Logf("Started bad TestService backend at: %q", backend.Address) 102 backends[2] = backend 103 addresses[2] = backend.Address 104 cancel := func() { 105 for _, backend := range backends { 106 backend.Stop() 107 } 108 } 109 return addresses, cancel 110 } 111 112 // checkRoundRobinRPCs verifies that EmptyCall RPCs on the given ClientConn, 113 // connected to a server exposing the test.grpc_testing.TestService, are 114 // roundrobined across the given backend addresses. 115 // 116 // Returns a non-nil error if context deadline expires before RPCs start to get 117 // roundrobined across the given backends. 118 func checkRoundRobinRPCs(ctx context.Context, client testgrpc.TestServiceClient, addrs []resolver.Address) error { 119 wantAddrCount := make(map[string]int) 120 for _, addr := range addrs { 121 wantAddrCount[addr.Addr]++ 122 } 123 gotAddrCount := make(map[string]int) 124 for ; ctx.Err() == nil; <-time.After(time.Millisecond) { 125 gotAddrCount = make(map[string]int) 126 // Perform 3 iterations. 127 var iterations [][]string 128 for i := 0; i < 3; i++ { 129 iteration := make([]string, len(addrs)) 130 for c := 0; c < len(addrs); c++ { 131 var peer peer.Peer 132 client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)) 133 if peer.Addr != nil { 134 iteration[c] = peer.Addr.String() 135 } 136 } 137 iterations = append(iterations, iteration) 138 } 139 // Ensure the first iteration contains all addresses in addrs. 140 for _, addr := range iterations[0] { 141 gotAddrCount[addr]++ 142 } 143 if diff := cmp.Diff(gotAddrCount, wantAddrCount); diff != "" { 144 continue 145 } 146 // Ensure all three iterations contain the same addresses. 147 if !cmp.Equal(iterations[0], iterations[1]) || !cmp.Equal(iterations[0], iterations[2]) { 148 continue 149 } 150 return nil 151 } 152 return fmt.Errorf("timeout when waiting for roundrobin distribution of RPCs across addresses: %v; got: %v", addrs, gotAddrCount) 153 } 154 155 // TestOutlierDetectionAlgorithmsE2E tests the Outlier Detection Success Rate 156 // and Failure Percentage algorithms in an e2e fashion. The Outlier Detection 157 // Balancer is configured as the top level LB Policy of the channel with a Round 158 // Robin child, and connects to three upstreams. Two of the upstreams are healthy and 159 // one is unhealthy. The two algorithms should at some point eject the failing 160 // upstream, causing RPC's to not be routed to that upstream, and only be 161 // Round Robined across the two healthy upstreams. Other than the intervals the 162 // unhealthy upstream is ejected, RPC's should regularly round robin 163 // across all three upstreams. 164 func (s) TestOutlierDetectionAlgorithmsE2E(t *testing.T) { 165 tests := []struct { 166 name string 167 odscJSON string 168 }{ 169 { 170 name: "Success Rate Algorithm", 171 odscJSON: fmt.Sprintf(` 172 { 173 "loadBalancingConfig": [ 174 { 175 "outlier_detection_experimental": { 176 "interval": "0.050s", 177 "baseEjectionTime": "0.100s", 178 "maxEjectionTime": "300s", 179 "maxEjectionPercent": 33, 180 "successRateEjection": { 181 "stdevFactor": 50, 182 "enforcementPercentage": 100, 183 "minimumHosts": 3, 184 "requestVolume": 5 185 }, 186 "childPolicy": [{"%s": {}}] 187 } 188 } 189 ] 190 }`, leafPolicyName), 191 }, 192 { 193 name: "Failure Percentage Algorithm", 194 odscJSON: fmt.Sprintf(` 195 { 196 "loadBalancingConfig": [ 197 { 198 "outlier_detection_experimental": { 199 "interval": "0.050s", 200 "baseEjectionTime": "0.100s", 201 "maxEjectionTime": "300s", 202 "maxEjectionPercent": 33, 203 "failurePercentageEjection": { 204 "threshold": 50, 205 "enforcementPercentage": 100, 206 "minimumHosts": 3, 207 "requestVolume": 5 208 }, 209 "childPolicy": [{"%s": {}} 210 ] 211 } 212 } 213 ] 214 }`, leafPolicyName), 215 }, 216 } 217 for _, test := range tests { 218 t.Run(test.name, func(t *testing.T) { 219 addresses, cancel := setupBackends(t) 220 defer cancel() 221 222 mr := manual.NewBuilderWithScheme("od-e2e") 223 defer mr.Close() 224 225 sc := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(test.odscJSON) 226 // The full list of addresses. 227 fullAddresses := []resolver.Address{ 228 {Addr: addresses[0]}, 229 {Addr: addresses[1]}, 230 {Addr: addresses[2]}, 231 } 232 mr.InitialState(resolver.State{ 233 Addresses: fullAddresses, 234 ServiceConfig: sc, 235 }) 236 237 cc, err := grpc.NewClient(mr.Scheme()+":///", grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials())) 238 if err != nil { 239 t.Fatalf("grpc.NewClient() failed: %v", err) 240 } 241 defer cc.Close() 242 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 243 defer cancel() 244 testServiceClient := testgrpc.NewTestServiceClient(cc) 245 246 // At first, due to no statistics on each of the backends, the 3 247 // upstreams should all be round robined across. 248 if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil { 249 t.Fatalf("error in expected round robin: %v", err) 250 } 251 252 // The addresses which don't return errors. 253 okAddresses := []resolver.Address{ 254 {Addr: addresses[0]}, 255 {Addr: addresses[1]}, 256 } 257 // After calling the three upstreams, one of them constantly error 258 // and should eventually be ejected for a period of time. This 259 // period of time should cause the RPC's to be round robined only 260 // across the two that are healthy. 261 if err = checkRoundRobinRPCs(ctx, testServiceClient, okAddresses); err != nil { 262 t.Fatalf("error in expected round robin: %v", err) 263 } 264 265 // The failing upstream isn't ejected indefinitely, and eventually 266 // should be unejected in subsequent iterations of the interval 267 // algorithm as per the spec for the two specific algorithms. 268 if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil { 269 t.Fatalf("error in expected round robin: %v", err) 270 } 271 }) 272 } 273 } 274 275 // TestNoopConfiguration tests the Outlier Detection Balancer configured with a 276 // noop configuration. The noop configuration should cause the Outlier Detection 277 // Balancer to not count RPC's, and thus never eject any upstreams and continue 278 // to route to every upstream connected to, even if they continuously error. 279 // Once the Outlier Detection Balancer gets reconfigured with configuration 280 // requiring counting RPC's, the Outlier Detection Balancer should start 281 // ejecting any upstreams as specified in the configuration. 282 func (s) TestNoopConfiguration(t *testing.T) { 283 addresses, cancel := setupBackends(t) 284 defer cancel() 285 286 mr := manual.NewBuilderWithScheme("od-e2e") 287 defer mr.Close() 288 289 noopODServiceConfigJSON := fmt.Sprintf(` 290 { 291 "loadBalancingConfig": [ 292 { 293 "outlier_detection_experimental": { 294 "interval": "0.050s", 295 "baseEjectionTime": "0.100s", 296 "maxEjectionTime": "300s", 297 "maxEjectionPercent": 33, 298 "childPolicy": [{"%s": {}}] 299 } 300 } 301 ] 302 }`, leafPolicyName) 303 sc := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(noopODServiceConfigJSON) 304 // The full list of addresses. 305 fullAddresses := []resolver.Address{ 306 {Addr: addresses[0]}, 307 {Addr: addresses[1]}, 308 {Addr: addresses[2]}, 309 } 310 mr.InitialState(resolver.State{ 311 Addresses: fullAddresses, 312 ServiceConfig: sc, 313 }) 314 cc, err := grpc.NewClient(mr.Scheme()+":///", grpc.WithResolvers(mr), grpc.WithTransportCredentials(insecure.NewCredentials())) 315 if err != nil { 316 t.Fatalf("grpc.NewClient() failed: %v", err) 317 } 318 defer cc.Close() 319 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 320 defer cancel() 321 testServiceClient := testgrpc.NewTestServiceClient(cc) 322 323 for i := 0; i < 2; i++ { 324 // Since the Outlier Detection Balancer starts with a noop 325 // configuration, it shouldn't count RPCs or eject any upstreams. Thus, 326 // even though an upstream it connects to constantly errors, it should 327 // continue to Round Robin across every upstream. 328 if err := checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil { 329 t.Fatalf("error in expected round robin: %v", err) 330 } 331 } 332 333 // Reconfigure the Outlier Detection Balancer with a configuration that 334 // specifies to count RPC's and eject upstreams. Due to the balancer no 335 // longer being a noop, it should eject any unhealthy addresses as specified 336 // by the failure percentage portion of the configuration. 337 countingODServiceConfigJSON := fmt.Sprintf(` 338 { 339 "loadBalancingConfig": [ 340 { 341 "outlier_detection_experimental": { 342 "interval": "0.050s", 343 "baseEjectionTime": "0.100s", 344 "maxEjectionTime": "300s", 345 "maxEjectionPercent": 33, 346 "failurePercentageEjection": { 347 "threshold": 50, 348 "enforcementPercentage": 100, 349 "minimumHosts": 3, 350 "requestVolume": 5 351 }, 352 "childPolicy": [{"%s": {}}] 353 } 354 } 355 ] 356 }`, leafPolicyName) 357 sc = internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(countingODServiceConfigJSON) 358 359 mr.UpdateState(resolver.State{ 360 Addresses: fullAddresses, 361 ServiceConfig: sc, 362 }) 363 364 // At first on the reconfigured balancer, the balancer has no stats 365 // collected about upstreams. Thus, it should at first route across the full 366 // upstream list. 367 if err = checkRoundRobinRPCs(ctx, testServiceClient, fullAddresses); err != nil { 368 t.Fatalf("error in expected round robin: %v", err) 369 } 370 371 // The addresses which don't return errors. 372 okAddresses := []resolver.Address{ 373 {Addr: addresses[0]}, 374 {Addr: addresses[1]}, 375 } 376 // Now that the reconfigured balancer has data about the failing upstream, 377 // it should eject the upstream and only route across the two healthy 378 // upstreams. 379 if err = checkRoundRobinRPCs(ctx, testServiceClient, okAddresses); err != nil { 380 t.Fatalf("error in expected round robin: %v", err) 381 } 382 }