google.golang.org/grpc@v1.72.2/xds/internal/balancer/ringhash/e2e/ringhash_balancer_test.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package ringhash_test 20 21 import ( 22 "context" 23 "errors" 24 "fmt" 25 "math" 26 rand "math/rand/v2" 27 "net" 28 "slices" 29 "strconv" 30 "sync" 31 "testing" 32 "time" 33 34 "github.com/google/go-cmp/cmp" 35 "github.com/google/go-cmp/cmp/cmpopts" 36 "github.com/google/uuid" 37 "google.golang.org/grpc" 38 "google.golang.org/grpc/backoff" 39 "google.golang.org/grpc/codes" 40 "google.golang.org/grpc/connectivity" 41 "google.golang.org/grpc/credentials/insecure" 42 "google.golang.org/grpc/internal" 43 "google.golang.org/grpc/internal/envconfig" 44 "google.golang.org/grpc/internal/grpctest" 45 "google.golang.org/grpc/internal/stubserver" 46 "google.golang.org/grpc/internal/testutils" 47 "google.golang.org/grpc/internal/testutils/xds/e2e" 48 "google.golang.org/grpc/metadata" 49 "google.golang.org/grpc/peer" 50 "google.golang.org/grpc/resolver" 51 "google.golang.org/grpc/resolver/manual" 52 "google.golang.org/grpc/status" 53 "google.golang.org/grpc/xds/internal/balancer/ringhash" 54 55 v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" 56 v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 57 v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" 58 v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" 59 v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3" 60 v3ringhashpb "github.com/envoyproxy/go-control-plane/envoy/extensions/load_balancing_policies/ring_hash/v3" 61 v3matcherpb "github.com/envoyproxy/go-control-plane/envoy/type/matcher/v3" 62 testgrpc "google.golang.org/grpc/interop/grpc_testing" 63 testpb "google.golang.org/grpc/interop/grpc_testing" 64 "google.golang.org/protobuf/types/known/wrapperspb" 65 66 _ "google.golang.org/grpc/xds" 67 ) 68 69 type s struct { 70 grpctest.Tester 71 } 72 73 func Test(t *testing.T) { 74 grpctest.RunSubTests(t, s{}) 75 } 76 77 const ( 78 defaultTestTimeout = 10 * time.Second 79 defaultTestShortTimeout = 10 * time.Millisecond 80 81 errorTolerance = .05 // For tests that rely on statistical significance. 82 83 virtualHostName = "test.server" 84 ) 85 86 // fastConnectParams disables connection attempts backoffs and lowers delays. 87 // This speeds up tests that rely on subchannel to move to transient failure. 88 var fastConnectParams = grpc.ConnectParams{ 89 Backoff: backoff.Config{ 90 BaseDelay: 10 * time.Millisecond, 91 }, 92 MinConnectTimeout: 100 * time.Millisecond, 93 } 94 95 // Tests the case where the ring contains a single subConn, and verifies that 96 // when the server goes down, the LB policy on the client automatically 97 // reconnects until the subChannel moves out of TRANSIENT_FAILURE. 
98 func (s) TestRingHash_ReconnectToMoveOutOfTransientFailure(t *testing.T) { 99 // Create a restartable listener to simulate server being down. 100 l, err := testutils.LocalTCPListener() 101 if err != nil { 102 t.Fatalf("testutils.LocalTCPListener() failed: %v", err) 103 } 104 lis := testutils.NewRestartableListener(l) 105 srv := stubserver.StartTestService(t, &stubserver.StubServer{ 106 Listener: lis, 107 EmptyCallF: func(context.Context, *testpb.Empty) (*testpb.Empty, error) { return &testpb.Empty{}, nil }, 108 }) 109 defer srv.Stop() 110 111 // Create a clientConn with a manual resolver (which is used to push the 112 // address of the test backend), and a default service config pointing to 113 // the use of the ring_hash_experimental LB policy. 114 const ringHashServiceConfig = `{"loadBalancingConfig": [{"ring_hash_experimental":{}}]}` 115 r := manual.NewBuilderWithScheme("whatever") 116 dopts := []grpc.DialOption{ 117 grpc.WithTransportCredentials(insecure.NewCredentials()), 118 grpc.WithResolvers(r), 119 grpc.WithDefaultServiceConfig(ringHashServiceConfig), 120 grpc.WithConnectParams(fastConnectParams), 121 } 122 cc, err := grpc.NewClient(r.Scheme()+":///test.server", dopts...) 123 if err != nil { 124 t.Fatalf("Failed to dial local test server: %v", err) 125 } 126 defer cc.Close() 127 128 // Push the address of the test backend through the manual resolver. 129 r.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: lis.Addr().String()}}}) 130 131 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 132 ctx = ringhash.SetXDSRequestHash(ctx, 0) 133 defer cancel() 134 client := testgrpc.NewTestServiceClient(cc) 135 if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil { 136 t.Fatalf("rpc EmptyCall() failed: %v", err) 137 } 138 139 // Stopping the server listener will close the transport on the client, 140 // which will lead to the channel eventually moving to IDLE. The ring_hash 141 // LB policy is not expected to reconnect by itself at this point. 142 lis.Stop() 143 144 testutils.AwaitState(ctx, t, cc, connectivity.Idle) 145 146 // Make an RPC to get the ring_hash LB policy to reconnect and thereby move 147 // to TRANSIENT_FAILURE upon connection failure. 148 client.EmptyCall(ctx, &testpb.Empty{}) 149 150 testutils.AwaitState(ctx, t, cc, connectivity.TransientFailure) 151 152 // An RPC at this point is expected to fail. 153 if _, err = client.EmptyCall(ctx, &testpb.Empty{}); err == nil { 154 t.Fatal("EmptyCall RPC succeeded when the channel is in TRANSIENT_FAILURE") 155 } 156 157 // Restart the server listener. The ring_hash LB policy is expected to 158 // attempt to reconnect on its own and come out of TRANSIENT_FAILURE, even 159 // without an RPC attempt. 160 lis.Restart() 161 testutils.AwaitState(ctx, t, cc, connectivity.Ready) 162 163 // An RPC at this point is expected to succeed. 164 if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil { 165 t.Fatalf("rpc EmptyCall() failed: %v", err) 166 } 167 } 168 169 // startTestServiceBackends starts num stub servers. It returns the list of 170 // stubservers. Servers are closed when the test is stopped. 
171 func startTestServiceBackends(t *testing.T, num int) []*stubserver.StubServer { 172 t.Helper() 173 174 servers := make([]*stubserver.StubServer, 0, num) 175 for i := 0; i < num; i++ { 176 server := stubserver.StartTestService(t, nil) 177 t.Cleanup(server.Stop) 178 servers = append(servers, server) 179 } 180 return servers 181 } 182 183 // backendAddrs returns a list of address strings for the given stubservers. 184 func backendAddrs(servers []*stubserver.StubServer) []string { 185 addrs := make([]string, 0, len(servers)) 186 for _, s := range servers { 187 addrs = append(addrs, s.Address) 188 } 189 return addrs 190 } 191 192 // backendOptions returns a slice of e2e.BackendOptions for the given server 193 // addresses. 194 func backendOptions(t *testing.T, serverAddrs []string) []e2e.BackendOptions { 195 t.Helper() 196 backendAddrs := [][]string{} 197 for _, addr := range serverAddrs { 198 backendAddrs = append(backendAddrs, []string{addr}) 199 } 200 return backendOptionsForEndpointsWithMultipleAddrs(t, backendAddrs) 201 } 202 203 // backendOptions returns a slice of e2e.BackendOptions for the given server 204 // addresses. Each endpoint can have multiple addresses. 205 func backendOptionsForEndpointsWithMultipleAddrs(t *testing.T, backendAddrs [][]string) []e2e.BackendOptions { 206 t.Helper() 207 208 var backendOpts []e2e.BackendOptions 209 for _, backend := range backendAddrs { 210 ports := []uint32{} 211 for _, addr := range backend { 212 ports = append(ports, testutils.ParsePort(t, addr)) 213 } 214 backendOpts = append(backendOpts, e2e.BackendOptions{Ports: ports}) 215 } 216 return backendOpts 217 } 218 219 // channelIDHashRoute returns a RouteConfiguration with a hash policy that 220 // hashes based on the channel ID. 221 func channelIDHashRoute(routeName, virtualHostDomain, clusterName string) *v3routepb.RouteConfiguration { 222 route := e2e.DefaultRouteConfig(routeName, virtualHostDomain, clusterName) 223 hashPolicy := v3routepb.RouteAction_HashPolicy{ 224 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_FilterState_{ 225 FilterState: &v3routepb.RouteAction_HashPolicy_FilterState{ 226 Key: "io.grpc.channel_id", 227 }, 228 }, 229 } 230 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 231 action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&hashPolicy} 232 return route 233 } 234 235 // checkRPCSendOK sends num RPCs to the client. It returns a map of backend 236 // addresses as keys and number of RPCs sent to this address as value. Abort the 237 // test if any RPC fails. 238 func checkRPCSendOK(ctx context.Context, t *testing.T, client testgrpc.TestServiceClient, num int) map[string]int { 239 t.Helper() 240 241 backendCount := make(map[string]int) 242 for i := 0; i < num; i++ { 243 var remote peer.Peer 244 if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil { 245 t.Fatalf("rpc EmptyCall() failed: %v", err) 246 } 247 backendCount[remote.Addr.String()]++ 248 } 249 return backendCount 250 } 251 252 // makeUnreachableBackends returns a slice of addresses of backends that close 253 // connections as soon as they are established. Useful to simulate servers that 254 // are unreachable. 
255 func makeUnreachableBackends(t *testing.T, num int) []string { 256 t.Helper() 257 258 addrs := make([]string, 0, num) 259 for i := 0; i < num; i++ { 260 l, err := testutils.LocalTCPListener() 261 if err != nil { 262 t.Fatalf("testutils.LocalTCPListener() failed: %v", err) 263 } 264 lis := testutils.NewRestartableListener(l) 265 addrs = append(addrs, lis.Addr().String()) 266 267 // It is enough to fail the first connection attempt to put the subchannel 268 // in TRANSIENT_FAILURE. 269 go func() { lis.Accept() }() 270 271 // We don't close these listeners here, to make sure ports are 272 // not reused across them, and across tests. 273 lis.Stop() 274 t.Cleanup(func() { lis.Close() }) 275 } 276 return addrs 277 } 278 279 // setupManagementServerAndResolver sets up an xDS management server, creates 280 // bootstrap configuration pointing to that server and creates an xDS resolver 281 // using that configuration. 282 // 283 // Registers a cleanup function on t to stop the management server. 284 // 285 // Returns the management server, node ID and the xDS resolver builder. 286 func setupManagementServerAndResolver(t *testing.T) (*e2e.ManagementServer, string, resolver.Builder) { 287 t.Helper() 288 289 // Start an xDS management server. 290 xdsServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{AllowResourceSubset: true}) 291 292 // Create bootstrap configuration pointing to the above management server. 293 nodeID := uuid.New().String() 294 bc := e2e.DefaultBootstrapContents(t, nodeID, xdsServer.Address) 295 296 // Create an xDS resolver with the above bootstrap configuration. 297 if internal.NewXDSResolverWithConfigForTesting == nil { 298 t.Fatalf("internal.NewXDSResolverWithConfigForTesting is nil") 299 } 300 r, err := internal.NewXDSResolverWithConfigForTesting.(func([]byte) (resolver.Builder, error))(bc) 301 if err != nil { 302 t.Fatalf("Failed to create xDS resolver for testing: %v", err) 303 } 304 305 return xdsServer, nodeID, r 306 } 307 308 // xdsUpdateOpts returns an e2e.UpdateOptions for the given node ID with the given xDS resources. 309 func xdsUpdateOpts(nodeID string, endpoints *v3endpointpb.ClusterLoadAssignment, cluster *v3clusterpb.Cluster, route *v3routepb.RouteConfiguration, listener *v3listenerpb.Listener) e2e.UpdateOptions { 310 return e2e.UpdateOptions{ 311 NodeID: nodeID, 312 Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoints}, 313 Clusters: []*v3clusterpb.Cluster{cluster}, 314 Routes: []*v3routepb.RouteConfiguration{route}, 315 Listeners: []*v3listenerpb.Listener{listener}, 316 } 317 } 318 319 // Tests that when an aggregate cluster is configured with ring hash policy, and 320 // the first cluster is in transient failure, all RPCs are sent to the second 321 // cluster using the ring hash policy. 
322 func (s) TestRingHash_AggregateClusterFallBackFromRingHashAtStartup(t *testing.T) { 323 addrs := backendAddrs(startTestServiceBackends(t, 2)) 324 325 const primaryClusterName = "new_cluster_1" 326 const primaryServiceName = "new_eds_service_1" 327 const secondaryClusterName = "new_cluster_2" 328 const secondaryServiceName = "new_eds_service_2" 329 const clusterName = "aggregate_cluster" 330 331 ep1 := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 332 ClusterName: primaryServiceName, 333 Localities: []e2e.LocalityOptions{{ 334 Name: "locality0", 335 Weight: 1, 336 Backends: backendOptions(t, makeUnreachableBackends(t, 2)), 337 }}, 338 }) 339 ep2 := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 340 ClusterName: secondaryServiceName, 341 Localities: []e2e.LocalityOptions{{ 342 Name: "locality0", 343 Weight: 1, 344 Backends: backendOptions(t, addrs), 345 }}, 346 }) 347 primaryCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 348 ClusterName: primaryClusterName, 349 ServiceName: primaryServiceName, 350 }) 351 secondaryCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 352 ClusterName: secondaryClusterName, 353 ServiceName: secondaryServiceName, 354 }) 355 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 356 ClusterName: clusterName, 357 Type: e2e.ClusterTypeAggregate, 358 // TODO: when "A75: xDS Aggregate Cluster Behavior Fixes" is implemented, the 359 // policy will have to be set on the child clusters. 360 Policy: e2e.LoadBalancingPolicyRingHash, 361 ChildNames: []string{primaryClusterName, secondaryClusterName}, 362 }) 363 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 364 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 365 366 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 367 defer cancel() 368 369 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 370 updateOpts := e2e.UpdateOptions{ 371 NodeID: nodeID, 372 Endpoints: []*v3endpointpb.ClusterLoadAssignment{ep1, ep2}, 373 Clusters: []*v3clusterpb.Cluster{cluster, primaryCluster, secondaryCluster}, 374 Routes: []*v3routepb.RouteConfiguration{route}, 375 Listeners: []*v3listenerpb.Listener{listener}, 376 } 377 if err := xdsServer.Update(ctx, updateOpts); err != nil { 378 t.Fatalf("Failed to update xDS resources: %v", err) 379 } 380 381 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 382 if err != nil { 383 t.Fatalf("Failed to create client: %s", err) 384 } 385 defer conn.Close() 386 client := testgrpc.NewTestServiceClient(conn) 387 388 const numRPCs = 100 389 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 390 391 // Since this is using ring hash with the channel ID as the key, all RPCs 392 // are routed to the same backend of the secondary locality. 
393 if len(gotPerBackend) != 1 { 394 t.Errorf("Got RPCs routed to %v backends, want %v", len(gotPerBackend), 1) 395 } 396 397 var backend string 398 var got int 399 for backend, got = range gotPerBackend { 400 } 401 if !slices.Contains(addrs, backend) { 402 t.Errorf("Got RPCs routed to an unexpected backend: %v, want one of %v", backend, addrs) 403 } 404 if got != numRPCs { 405 t.Errorf("Got %v RPCs routed to a backend, want %v", got, 100) 406 } 407 } 408 409 func replaceDNSResolver(t *testing.T) *manual.Resolver { 410 mr := manual.NewBuilderWithScheme("dns") 411 412 dnsResolverBuilder := resolver.Get("dns") 413 resolver.Register(mr) 414 415 t.Cleanup(func() { resolver.Register(dnsResolverBuilder) }) 416 return mr 417 } 418 419 // Tests that when an aggregate cluster is configured with ring hash policy, and 420 // the first is an EDS cluster in transient failure, and the fallback is a 421 // logical DNS cluster, all RPCs are sent to the second cluster using the ring 422 // hash policy. 423 func (s) TestRingHash_AggregateClusterFallBackFromRingHashToLogicalDnsAtStartup(t *testing.T) { 424 const edsClusterName = "eds_cluster" 425 const logicalDNSClusterName = "logical_dns_cluster" 426 const clusterName = "aggregate_cluster" 427 428 backends := backendAddrs(startTestServiceBackends(t, 1)) 429 430 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 431 ClusterName: edsClusterName, 432 Localities: []e2e.LocalityOptions{{ 433 Name: "locality0", 434 Weight: 1, 435 Backends: backendOptions(t, makeUnreachableBackends(t, 1)), 436 Priority: 0, 437 }}, 438 }) 439 edsCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 440 ClusterName: edsClusterName, 441 ServiceName: edsClusterName, 442 }) 443 444 logicalDNSCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 445 Type: e2e.ClusterTypeLogicalDNS, 446 ClusterName: logicalDNSClusterName, 447 // The DNS values are not used because we fake DNS later on, but they 448 // are required to be present for the resource to be valid. 449 DNSHostName: "server.example.com", 450 DNSPort: 443, 451 }) 452 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 453 ClusterName: clusterName, 454 Type: e2e.ClusterTypeAggregate, 455 // TODO: when "A75: xDS Aggregate Cluster Behavior Fixes" is merged, the 456 // policy will have to be set on the child clusters. 
457 Policy: e2e.LoadBalancingPolicyRingHash, 458 ChildNames: []string{edsClusterName, logicalDNSClusterName}, 459 }) 460 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 461 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 462 463 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 464 defer cancel() 465 466 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 467 updateOpts := e2e.UpdateOptions{ 468 NodeID: nodeID, 469 Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoints}, 470 Clusters: []*v3clusterpb.Cluster{cluster, edsCluster, logicalDNSCluster}, 471 Routes: []*v3routepb.RouteConfiguration{route}, 472 Listeners: []*v3listenerpb.Listener{listener}, 473 } 474 475 dnsR := replaceDNSResolver(t) 476 dnsR.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: backends[0]}}}) 477 478 if err := xdsServer.Update(ctx, updateOpts); err != nil { 479 t.Fatalf("Failed to update xDS resources: %v", err) 480 } 481 482 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 483 if err != nil { 484 t.Fatalf("Failed to create client: %s", err) 485 } 486 defer conn.Close() 487 client := testgrpc.NewTestServiceClient(conn) 488 489 gotPerBackend := checkRPCSendOK(ctx, t, client, 1) 490 var got string 491 for got = range gotPerBackend { 492 } 493 if want := backends[0]; got != want { 494 t.Errorf("Got RPCs routed to an unexpected got: %v, want %v", got, want) 495 } 496 } 497 498 // Tests that when an aggregate cluster is configured with ring hash policy, and 499 // it's first child is in transient failure, and the fallback is a logical DNS, 500 // the later recovers from transient failure when its backend becomes available. 501 func (s) TestRingHash_AggregateClusterFallBackFromRingHashToLogicalDnsAtStartupNoFailedRPCs(t *testing.T) { 502 const edsClusterName = "eds_cluster" 503 const logicalDNSClusterName = "logical_dns_cluster" 504 const clusterName = "aggregate_cluster" 505 506 backends := backendAddrs(startTestServiceBackends(t, 1)) 507 508 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 509 ClusterName: edsClusterName, 510 Localities: []e2e.LocalityOptions{{ 511 Name: "locality0", 512 Weight: 1, 513 Backends: backendOptions(t, makeUnreachableBackends(t, 1)), 514 Priority: 0, 515 }}, 516 }) 517 edsCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 518 ClusterName: edsClusterName, 519 ServiceName: edsClusterName, 520 }) 521 522 logicalDNSCluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 523 Type: e2e.ClusterTypeLogicalDNS, 524 ClusterName: logicalDNSClusterName, 525 // The DNS values are not used because we fake DNS later on, but they 526 // are required to be present for the resource to be valid. 527 DNSHostName: "server.example.com", 528 DNSPort: 443, 529 }) 530 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 531 ClusterName: clusterName, 532 Type: e2e.ClusterTypeAggregate, 533 // TODO: when "A75: xDS Aggregate Cluster Behavior Fixes" is merged, the 534 // policy will have to be set on the child clusters. 
535 Policy: e2e.LoadBalancingPolicyRingHash, 536 ChildNames: []string{edsClusterName, logicalDNSClusterName}, 537 }) 538 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 539 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 540 541 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 542 defer cancel() 543 544 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 545 updateOpts := e2e.UpdateOptions{ 546 NodeID: nodeID, 547 Endpoints: []*v3endpointpb.ClusterLoadAssignment{endpoints}, 548 Clusters: []*v3clusterpb.Cluster{cluster, edsCluster, logicalDNSCluster}, 549 Routes: []*v3routepb.RouteConfiguration{route}, 550 Listeners: []*v3listenerpb.Listener{listener}, 551 } 552 553 dnsR := replaceDNSResolver(t) 554 dnsR.UpdateState(resolver.State{Addresses: []resolver.Address{{Addr: backends[0]}}}) 555 556 if err := xdsServer.Update(ctx, updateOpts); err != nil { 557 t.Fatalf("Failed to update xDS resources: %v", err) 558 } 559 560 dialer := testutils.NewBlockingDialer() 561 cp := grpc.ConnectParams{ 562 // Increase backoff time, so that subconns stay in TRANSIENT_FAILURE 563 // for long enough to trigger potential problems. 564 Backoff: backoff.Config{ 565 BaseDelay: defaultTestTimeout, 566 }, 567 MinConnectTimeout: 0, 568 } 569 dopts := []grpc.DialOption{ 570 grpc.WithResolvers(xdsResolver), 571 grpc.WithTransportCredentials(insecure.NewCredentials()), 572 grpc.WithContextDialer(dialer.DialContext), 573 grpc.WithConnectParams(cp)} 574 conn, err := grpc.NewClient("xds:///test.server", dopts...) 575 if err != nil { 576 t.Fatalf("Failed to create client: %s", err) 577 } 578 defer conn.Close() 579 client := testgrpc.NewTestServiceClient(conn) 580 581 hold := dialer.Hold(backends[0]) 582 583 errCh := make(chan error, 2) 584 go func() { 585 if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil { 586 errCh <- fmt.Errorf("first rpc UnaryCall() failed: %v", err) 587 return 588 } 589 errCh <- nil 590 }() 591 592 testutils.AwaitState(ctx, t, conn, connectivity.Connecting) 593 594 go func() { 595 // Start a second RPC at this point, which should be queued as well. 596 // This will fail if the priority policy fails to update the picker to 597 // point to the LOGICAL_DNS child; if it leaves it pointing to the EDS 598 // priority 1, then the RPC will fail, because all subchannels are in 599 // transient failure. 600 // 601 // Note that sending only the first RPC does not catch this case, 602 // because if the priority policy fails to update the picker, then the 603 // pick for the first RPC will not be retried. 604 if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil { 605 errCh <- fmt.Errorf("second UnaryCall() failed: %v", err) 606 return 607 } 608 errCh <- nil 609 }() 610 611 // Wait for a connection attempt to backends[0]. 612 if !hold.Wait(ctx) { 613 t.Fatalf("Timeout while waiting for a connection attempt to %s", backends[0]) 614 } 615 // Allow the connection attempts to complete. 616 hold.Resume() 617 618 // RPCs should complete successfully. 619 for range []int{0, 1} { 620 select { 621 case err := <-errCh: 622 if err != nil { 623 t.Errorf("Expected 2 rpc to succeed, but at least one failed: %v", err) 624 } 625 case <-ctx.Done(): 626 t.Fatalf("Timed out waiting for RPCs to complete") 627 } 628 } 629 } 630 631 // endpointResource creates a ClusterLoadAssignment containing a single locality 632 // with the given addresses. 
633 func endpointResource(t *testing.T, clusterName string, addrs []string) *v3endpointpb.ClusterLoadAssignment { 634 t.Helper() 635 backendAddrs := [][]string{} 636 for _, addr := range addrs { 637 backendAddrs = append(backendAddrs, []string{addr}) 638 } 639 return endpointResourceForBackendsWithMultipleAddrs(t, clusterName, backendAddrs) 640 } 641 642 // endpointResourceForBackendsWithMultipleAddrs creates a ClusterLoadAssignment 643 // containing a single locality with the given addresses. 644 func endpointResourceForBackendsWithMultipleAddrs(t *testing.T, clusterName string, addrs [][]string) *v3endpointpb.ClusterLoadAssignment { 645 t.Helper() 646 647 // We must set the host name socket address in EDS, as the ring hash policy 648 // uses it to construct the ring. 649 host, _, err := net.SplitHostPort(addrs[0][0]) 650 if err != nil { 651 t.Fatalf("Failed to split host and port from stubserver: %v", err) 652 } 653 654 return e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 655 ClusterName: clusterName, 656 Host: host, 657 Localities: []e2e.LocalityOptions{{ 658 Backends: backendOptionsForEndpointsWithMultipleAddrs(t, addrs), 659 Weight: 1, 660 }}, 661 }) 662 } 663 664 // Tests that ring hash policy that hashes using channel id ensures all RPCs to 665 // go 1 particular backend. 666 func (s) TestRingHash_ChannelIdHashing(t *testing.T) { 667 backends := backendAddrs(startTestServiceBackends(t, 4)) 668 669 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 670 671 const clusterName = "cluster" 672 endpoints := endpointResource(t, clusterName, backends) 673 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 674 ClusterName: clusterName, 675 ServiceName: clusterName, 676 Policy: e2e.LoadBalancingPolicyRingHash, 677 }) 678 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 679 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 680 681 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 682 defer cancel() 683 684 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 685 t.Fatalf("Failed to update xDS resources: %v", err) 686 } 687 688 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 689 if err != nil { 690 t.Fatalf("Failed to create client: %s", err) 691 } 692 defer conn.Close() 693 client := testgrpc.NewTestServiceClient(conn) 694 695 const numRPCs = 100 696 received := checkRPCSendOK(ctx, t, client, numRPCs) 697 if len(received) != 1 { 698 t.Errorf("Got RPCs routed to %v backends, want %v", len(received), 1) 699 } 700 var got int 701 for _, got = range received { 702 } 703 if got != numRPCs { 704 t.Errorf("Got %v RPCs routed to a backend, want %v", got, numRPCs) 705 } 706 } 707 708 // headerHashRoute creates a RouteConfiguration with a hash policy that uses the 709 // provided header. 
710 func headerHashRoute(routeName, virtualHostName, clusterName, header string) *v3routepb.RouteConfiguration {
711 	route := e2e.DefaultRouteConfig(routeName, virtualHostName, clusterName)
712 	hashPolicy := v3routepb.RouteAction_HashPolicy{
713 		PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Header_{
714 			Header: &v3routepb.RouteAction_HashPolicy_Header{
715 				HeaderName: header,
716 			},
717 		},
718 	}
719 	action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route)
720 	action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&hashPolicy}
721 	return route
722 }
723 
724 // Tests that a ring hash policy that hashes using a header value can send RPCs
725 // to specific backends based on their hash.
726 func (s) TestRingHash_HeaderHashing(t *testing.T) {
727 	backends := backendAddrs(startTestServiceBackends(t, 4))
728 
729 	xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t)
730 
731 	const clusterName = "cluster"
732 	endpoints := endpointResource(t, clusterName, backends)
733 	cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{
734 		ClusterName: clusterName,
735 		ServiceName: clusterName,
736 		Policy:      e2e.LoadBalancingPolicyRingHash,
737 	})
738 	route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash")
739 	listener := e2e.DefaultClientListener(virtualHostName, route.Name)
740 
741 	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
742 	defer cancel()
743 
744 	if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil {
745 		t.Fatalf("Failed to update xDS resources: %v", err)
746 	}
747 
748 	conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials()))
749 	if err != nil {
750 		t.Fatalf("Failed to create client: %s", err)
751 	}
752 	defer conn.Close()
753 	client := testgrpc.NewTestServiceClient(conn)
754 
755 	// Note each type of RPC contains a header value that will always be hashed
756 	// to a specific backend as the header value matches the value used to
757 	// create the entry in the ring.
758 	for _, backend := range backends {
759 		ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", backend+"_0"))
760 		numRPCs := 10
761 		reqPerBackend := checkRPCSendOK(ctx, t, client, numRPCs)
762 		if reqPerBackend[backend] != numRPCs {
763 			t.Errorf("Got RPCs routed to addresses %v, want all RPCs routed to %v", reqPerBackend, backend)
764 		}
765 	}
766 }
767 
768 // Tests that a ring hash policy that hashes using a header value with a regex
769 // rewrite aggregates all RPCs onto one backend.
770 func (s) TestRingHash_HeaderHashingWithRegexRewrite(t *testing.T) { 771 backends := backendAddrs(startTestServiceBackends(t, 4)) 772 773 clusterName := "cluster" 774 endpoints := endpointResource(t, clusterName, backends) 775 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 776 ClusterName: clusterName, 777 ServiceName: clusterName, 778 Policy: e2e.LoadBalancingPolicyRingHash, 779 }) 780 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 781 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 782 action.Route.HashPolicy[0].GetHeader().RegexRewrite = &v3matcherpb.RegexMatchAndSubstitute{ 783 Pattern: &v3matcherpb.RegexMatcher{ 784 EngineType: &v3matcherpb.RegexMatcher_GoogleRe2{}, 785 Regex: "[0-9]+", 786 }, 787 Substitution: "foo", 788 } 789 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 790 791 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 792 defer cancel() 793 794 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 795 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 796 t.Fatalf("Failed to update xDS resources: %v", err) 797 } 798 799 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 800 if err != nil { 801 t.Fatalf("Failed to create client: %s", err) 802 } 803 defer conn.Close() 804 client := testgrpc.NewTestServiceClient(conn) 805 806 // Note each type of RPC contains a header value that would always be hashed 807 // to a specific backend as the header value matches the value used to 808 // create the entry in the ring. However, the regex rewrites all numbers to 809 // "foo", and header values only differ by numbers, so they all end up 810 // hashing to the same value. 811 gotPerBackend := make(map[string]int) 812 for _, backend := range backends { 813 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", backend+"_0")) 814 res := checkRPCSendOK(ctx, t, client, 100) 815 for addr, count := range res { 816 gotPerBackend[addr] += count 817 } 818 } 819 if want := 1; len(gotPerBackend) != want { 820 t.Errorf("Got RPCs routed to %v backends, want %v", len(gotPerBackend), want) 821 } 822 var got int 823 for _, got = range gotPerBackend { 824 } 825 if want := 400; got != want { 826 t.Errorf("Got %v RPCs routed to a backend, want %v", got, want) 827 } 828 } 829 830 // computeIdealNumberOfRPCs computes the ideal number of RPCs to send so that 831 // we can observe an event happening with probability p, and the result will 832 // have value p with the given error tolerance. 833 // 834 // See https://github.com/grpc/grpc/blob/4f6e13bdda9e8c26d6027af97db4b368ca2b3069/test/cpp/end2end/xds/xds_end2end_test_lib.h#L941 835 // for an explanation of the formula. 836 func computeIdealNumberOfRPCs(t *testing.T, p, errorTolerance float64) int { 837 if p < 0 || p > 1 { 838 t.Fatal("p must be in (0, 1)") 839 } 840 numRPCs := math.Ceil(p * (1 - p) * 5. * 5. / errorTolerance / errorTolerance) 841 return int(numRPCs + 1000.) // add 1k as a buffer to avoid flakiness. 842 } 843 844 // setRingHashLBPolicyWithHighMinRingSize sets the ring hash policy with a high 845 // minimum ring size to ensure that the ring is large enough to distribute 846 // requests more uniformly across endpoints when a random hash is used. 
847 func setRingHashLBPolicyWithHighMinRingSize(t *testing.T, cluster *v3clusterpb.Cluster) {
848 	const minRingSize = 100000
849 	oldVal := envconfig.RingHashCap
850 	envconfig.RingHashCap = minRingSize
851 	t.Cleanup(func() {
852 		envconfig.RingHashCap = oldVal
853 	})
854 	// Increasing min ring size for random distribution.
855 	config := testutils.MarshalAny(t, &v3ringhashpb.RingHash{
856 		HashFunction:    v3ringhashpb.RingHash_XX_HASH,
857 		MinimumRingSize: &wrapperspb.UInt64Value{Value: minRingSize},
858 	})
859 	cluster.LoadBalancingPolicy = &v3clusterpb.LoadBalancingPolicy{
860 		Policies: []*v3clusterpb.LoadBalancingPolicy_Policy{{
861 			TypedExtensionConfig: &v3corepb.TypedExtensionConfig{
862 				Name:        "envoy.load_balancing_policies.ring_hash",
863 				TypedConfig: config,
864 			},
865 		}},
866 	}
867 }
868 
869 // Tests that the ring hash policy hashes using a random value when no hash policy is configured on the route.
870 func (s) TestRingHash_NoHashPolicy(t *testing.T) {
871 	backends := backendAddrs(startTestServiceBackends(t, 2))
872 	numRPCs := computeIdealNumberOfRPCs(t, .5, errorTolerance)
873 
874 	const clusterName = "cluster"
875 	endpoints := endpointResource(t, clusterName, backends)
876 	cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{
877 		ClusterName: clusterName,
878 		ServiceName: clusterName,
879 	})
880 	setRingHashLBPolicyWithHighMinRingSize(t, cluster)
881 	route := e2e.DefaultRouteConfig("new_route", virtualHostName, clusterName)
882 	listener := e2e.DefaultClientListener(virtualHostName, route.Name)
883 
884 	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
885 	defer cancel()
886 
887 	xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t)
888 	if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil {
889 		t.Fatalf("Failed to update xDS resources: %v", err)
890 	}
891 
892 	conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials()))
893 	if err != nil {
894 		t.Fatalf("Failed to create client: %s", err)
895 	}
896 	defer conn.Close()
897 	client := testgrpc.NewTestServiceClient(conn)
898 
899 	// Send a large number of RPCs and check that they are distributed randomly.
900 	gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs)
901 	for _, backend := range backends {
902 		got := float64(gotPerBackend[backend]) / float64(numRPCs)
903 		want := .5
904 		if !cmp.Equal(got, want, cmpopts.EquateApprox(0, errorTolerance)) {
905 			t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backend, got, want, errorTolerance)
906 		}
907 	}
908 }
909 
910 // Tests that we observe endpoint weights.
911 func (s) TestRingHash_EndpointWeights(t *testing.T) { 912 backends := backendAddrs(startTestServiceBackends(t, 3)) 913 914 const clusterName = "cluster" 915 backendOpts := []e2e.BackendOptions{ 916 {Ports: []uint32{testutils.ParsePort(t, backends[0])}}, 917 {Ports: []uint32{testutils.ParsePort(t, backends[1])}}, 918 {Ports: []uint32{testutils.ParsePort(t, backends[2])}, Weight: 2}, 919 } 920 921 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 922 ClusterName: clusterName, 923 Localities: []e2e.LocalityOptions{{ 924 Backends: backendOpts, 925 Weight: 1, 926 }}, 927 }) 928 endpoints.Endpoints[0].LbEndpoints[0].LoadBalancingWeight = wrapperspb.UInt32(uint32(1)) 929 endpoints.Endpoints[0].LbEndpoints[1].LoadBalancingWeight = wrapperspb.UInt32(uint32(1)) 930 endpoints.Endpoints[0].LbEndpoints[2].LoadBalancingWeight = wrapperspb.UInt32(uint32(2)) 931 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 932 ClusterName: clusterName, 933 ServiceName: clusterName, 934 }) 935 // Increasing min ring size for random distribution. 936 setRingHashLBPolicyWithHighMinRingSize(t, cluster) 937 route := e2e.DefaultRouteConfig("new_route", virtualHostName, clusterName) 938 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 939 940 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 941 defer cancel() 942 943 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 944 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 945 t.Fatalf("Failed to update xDS resources: %v", err) 946 } 947 948 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 949 if err != nil { 950 t.Fatalf("Failed to create client: %s", err) 951 } 952 defer conn.Close() 953 client := testgrpc.NewTestServiceClient(conn) 954 955 // Send a large number of RPCs and check that they are distributed randomly. 956 numRPCs := computeIdealNumberOfRPCs(t, .25, errorTolerance) 957 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 958 959 got := float64(gotPerBackend[backends[0]]) / float64(numRPCs) 960 want := .25 961 if !cmp.Equal(got, want, cmpopts.EquateApprox(0, errorTolerance)) { 962 t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backends[0], got, want, errorTolerance) 963 } 964 got = float64(gotPerBackend[backends[1]]) / float64(numRPCs) 965 if !cmp.Equal(got, want, cmpopts.EquateApprox(0, errorTolerance)) { 966 t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backends[1], got, want, errorTolerance) 967 } 968 got = float64(gotPerBackend[backends[2]]) / float64(numRPCs) 969 want = .50 970 if !cmp.Equal(got, want, cmpopts.EquateApprox(0, errorTolerance)) { 971 t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backends[2], got, want, errorTolerance) 972 } 973 } 974 975 // Tests that ring hash policy evaluation will continue past the terminal hash 976 // policy if no results are produced yet. 
977 func (s) TestRingHash_ContinuesPastTerminalPolicyThatDoesNotProduceResult(t *testing.T) { 978 backends := backendAddrs(startTestServiceBackends(t, 2)) 979 980 const clusterName = "cluster" 981 endpoints := endpointResource(t, clusterName, backends) 982 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 983 ClusterName: clusterName, 984 ServiceName: clusterName, 985 Policy: e2e.LoadBalancingPolicyRingHash, 986 }) 987 988 route := e2e.DefaultRouteConfig("new_route", "test.server", clusterName) 989 990 // Even though this hash policy is terminal, since it produces no result, we 991 // continue past it to find a policy that produces results. 992 hashPolicy := v3routepb.RouteAction_HashPolicy{ 993 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Header_{ 994 Header: &v3routepb.RouteAction_HashPolicy_Header{ 995 HeaderName: "header_not_present", 996 }, 997 }, 998 Terminal: true, 999 } 1000 hashPolicy2 := v3routepb.RouteAction_HashPolicy{ 1001 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Header_{ 1002 Header: &v3routepb.RouteAction_HashPolicy_Header{ 1003 HeaderName: "address_hash", 1004 }, 1005 }, 1006 } 1007 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 1008 action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&hashPolicy, &hashPolicy2} 1009 1010 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1011 1012 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1013 defer cancel() 1014 1015 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1016 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1017 t.Fatalf("Failed to update xDS resources: %v", err) 1018 } 1019 1020 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1021 if err != nil { 1022 t.Fatalf("Failed to create client: %s", err) 1023 } 1024 defer conn.Close() 1025 client := testgrpc.NewTestServiceClient(conn) 1026 1027 // - The first hash policy does not match because the header is not present. 1028 // If this hash policy was applied, it would spread the load across 1029 // backend 0 and 1, since a random hash would be used. 1030 // - In the second hash policy, each type of RPC contains a header 1031 // value that always hashes to backend 0, as the header value 1032 // matches the value used to create the entry in the ring. 1033 // We verify that the second hash policy is used by checking that all RPCs 1034 // are being routed to backend 0. 1035 wantBackend := backends[0] 1036 ctx = metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", wantBackend+"_0")) 1037 const numRPCs = 100 1038 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 1039 if got := gotPerBackend[wantBackend]; got != numRPCs { 1040 t.Errorf("Got %v RPCs routed to backend %v, want %v", got, wantBackend, numRPCs) 1041 } 1042 } 1043 1044 // Tests that a random hash is used when header hashing policy specified a 1045 // header field that the RPC did not have. 
1046 func (s) TestRingHash_HashOnHeaderThatIsNotPresent(t *testing.T) { 1047 backends := backendAddrs(startTestServiceBackends(t, 2)) 1048 wantFractionPerBackend := .5 1049 numRPCs := computeIdealNumberOfRPCs(t, wantFractionPerBackend, errorTolerance) 1050 1051 const clusterName = "cluster" 1052 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1053 ClusterName: clusterName, 1054 Localities: []e2e.LocalityOptions{{ 1055 Backends: backendOptions(t, backends), 1056 Weight: 1, 1057 }}, 1058 }) 1059 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1060 ClusterName: clusterName, 1061 ServiceName: clusterName, 1062 }) 1063 setRingHashLBPolicyWithHighMinRingSize(t, cluster) 1064 route := headerHashRoute("new_route", virtualHostName, clusterName, "header_not_present") 1065 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1066 1067 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1068 defer cancel() 1069 1070 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1071 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1072 t.Fatalf("Failed to update xDS resources: %v", err) 1073 } 1074 1075 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1076 if err != nil { 1077 t.Fatalf("Failed to create client: %s", err) 1078 } 1079 defer conn.Close() 1080 client := testgrpc.NewTestServiceClient(conn) 1081 1082 // The first hash policy does not apply because the header is not present in 1083 // the RPCs that we are about to send. As a result, a random hash should be 1084 // used instead, resulting in a random request distribution. 1085 // We verify this by checking that the RPCs are distributed randomly. 1086 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 1087 for _, backend := range backends { 1088 got := float64(gotPerBackend[backend]) / float64(numRPCs) 1089 if !cmp.Equal(got, wantFractionPerBackend, cmpopts.EquateApprox(0, errorTolerance)) { 1090 t.Errorf("fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backend, got, wantFractionPerBackend, errorTolerance) 1091 } 1092 } 1093 } 1094 1095 // Tests that a random hash is used when only unsupported hash policies are 1096 // configured. 
1097 func (s) TestRingHash_UnsupportedHashPolicyDefaultToRandomHashing(t *testing.T) { 1098 backends := backendAddrs(startTestServiceBackends(t, 2)) 1099 wantFractionPerBackend := .5 1100 numRPCs := computeIdealNumberOfRPCs(t, wantFractionPerBackend, errorTolerance) 1101 1102 const clusterName = "cluster" 1103 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1104 ClusterName: clusterName, 1105 Localities: []e2e.LocalityOptions{{ 1106 Backends: backendOptions(t, backends), 1107 Weight: 1, 1108 }}, 1109 }) 1110 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1111 ClusterName: clusterName, 1112 ServiceName: clusterName, 1113 }) 1114 setRingHashLBPolicyWithHighMinRingSize(t, cluster) 1115 route := e2e.DefaultRouteConfig("new_route", "test.server", clusterName) 1116 unsupportedHashPolicy1 := v3routepb.RouteAction_HashPolicy{ 1117 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Cookie_{ 1118 Cookie: &v3routepb.RouteAction_HashPolicy_Cookie{Name: "cookie"}, 1119 }, 1120 } 1121 unsupportedHashPolicy2 := v3routepb.RouteAction_HashPolicy{ 1122 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_ConnectionProperties_{ 1123 ConnectionProperties: &v3routepb.RouteAction_HashPolicy_ConnectionProperties{SourceIp: true}, 1124 }, 1125 } 1126 unsupportedHashPolicy3 := v3routepb.RouteAction_HashPolicy{ 1127 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_QueryParameter_{ 1128 QueryParameter: &v3routepb.RouteAction_HashPolicy_QueryParameter{Name: "query_parameter"}, 1129 }, 1130 } 1131 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 1132 action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&unsupportedHashPolicy1, &unsupportedHashPolicy2, &unsupportedHashPolicy3} 1133 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1134 1135 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1136 defer cancel() 1137 1138 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1139 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1140 t.Fatalf("Failed to update xDS resources: %v", err) 1141 } 1142 1143 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1144 if err != nil { 1145 t.Fatalf("Failed to create client: %s", err) 1146 } 1147 defer conn.Close() 1148 client := testgrpc.NewTestServiceClient(conn) 1149 1150 // Since none of the hash policy are supported, a random hash should be 1151 // generated for every request. 1152 // We verify this by checking that the RPCs are distributed randomly. 1153 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 1154 for _, backend := range backends { 1155 got := float64(gotPerBackend[backend]) / float64(numRPCs) 1156 if !cmp.Equal(got, wantFractionPerBackend, cmpopts.EquateApprox(0, errorTolerance)) { 1157 t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backend, got, wantFractionPerBackend, errorTolerance) 1158 } 1159 } 1160 } 1161 1162 // Tests that unsupported hash policy types are all ignored before a supported 1163 // hash policy. 
1164 func (s) TestRingHash_UnsupportedHashPolicyUntilChannelIdHashing(t *testing.T) { 1165 backends := backendAddrs(startTestServiceBackends(t, 2)) 1166 1167 const clusterName = "cluster" 1168 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1169 ClusterName: clusterName, 1170 Localities: []e2e.LocalityOptions{{ 1171 Backends: backendOptions(t, backends), 1172 Weight: 1, 1173 }}, 1174 }) 1175 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1176 ClusterName: clusterName, 1177 ServiceName: clusterName, 1178 }) 1179 setRingHashLBPolicyWithHighMinRingSize(t, cluster) 1180 route := e2e.DefaultRouteConfig("new_route", "test.server", clusterName) 1181 unsupportedHashPolicy1 := v3routepb.RouteAction_HashPolicy{ 1182 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Cookie_{ 1183 Cookie: &v3routepb.RouteAction_HashPolicy_Cookie{Name: "cookie"}, 1184 }, 1185 } 1186 unsupportedHashPolicy2 := v3routepb.RouteAction_HashPolicy{ 1187 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_ConnectionProperties_{ 1188 ConnectionProperties: &v3routepb.RouteAction_HashPolicy_ConnectionProperties{SourceIp: true}, 1189 }, 1190 } 1191 unsupportedHashPolicy3 := v3routepb.RouteAction_HashPolicy{ 1192 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_QueryParameter_{ 1193 QueryParameter: &v3routepb.RouteAction_HashPolicy_QueryParameter{Name: "query_parameter"}, 1194 }, 1195 } 1196 channelIDhashPolicy := v3routepb.RouteAction_HashPolicy{ 1197 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_FilterState_{ 1198 FilterState: &v3routepb.RouteAction_HashPolicy_FilterState{ 1199 Key: "io.grpc.channel_id", 1200 }, 1201 }, 1202 } 1203 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 1204 action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&unsupportedHashPolicy1, &unsupportedHashPolicy2, &unsupportedHashPolicy3, &channelIDhashPolicy} 1205 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1206 1207 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1208 defer cancel() 1209 1210 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1211 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1212 t.Fatalf("Failed to update xDS resources: %v", err) 1213 } 1214 1215 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1216 if err != nil { 1217 t.Fatalf("Failed to create client: %s", err) 1218 } 1219 defer conn.Close() 1220 client := testgrpc.NewTestServiceClient(conn) 1221 1222 // Since only unsupported policies are present except for the last one 1223 // which is using the channel ID hashing policy, all requests should be 1224 // routed to the same backend. 1225 const numRPCs = 100 1226 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 1227 if len(gotPerBackend) != 1 { 1228 t.Errorf("Got RPCs routed to %v backends, want 1", len(gotPerBackend)) 1229 } 1230 var got int 1231 for _, got = range gotPerBackend { 1232 } 1233 if got != numRPCs { 1234 t.Errorf("Got %v RPCs routed to a backend, want %v", got, numRPCs) 1235 } 1236 } 1237 1238 // Tests that ring hash policy that hashes using a random value can spread RPCs 1239 // across all the backends according to locality weight. 
1240 func (s) TestRingHash_RandomHashingDistributionAccordingToLocalityAndEndpointWeight(t *testing.T) {
1241 	backends := backendAddrs(startTestServiceBackends(t, 2))
1242 
1243 	const clusterName = "cluster"
1244 	const locality1Weight = uint32(1)
1245 	const endpoint1Weight = uint32(1)
1246 	const locality2Weight = uint32(2)
1247 	const endpoint2Weight = uint32(2)
1248 	endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{
1249 		ClusterName: clusterName,
1250 		Localities: []e2e.LocalityOptions{
1251 			{
1252 				Backends: []e2e.BackendOptions{{
1253 					Ports:  []uint32{testutils.ParsePort(t, backends[0])},
1254 					Weight: endpoint1Weight,
1255 				}},
1256 				Weight: locality1Weight,
1257 			},
1258 			{
1259 				Backends: []e2e.BackendOptions{{
1260 					Ports:  []uint32{testutils.ParsePort(t, backends[1])},
1261 					Weight: endpoint2Weight,
1262 				}},
1263 				Weight: locality2Weight,
1264 			},
1265 		},
1266 	})
1267 	cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{
1268 		ClusterName: clusterName,
1269 		ServiceName: clusterName,
1270 	})
1271 	setRingHashLBPolicyWithHighMinRingSize(t, cluster)
1272 	route := e2e.DefaultRouteConfig("new_route", "test.server", clusterName)
1273 	listener := e2e.DefaultClientListener(virtualHostName, route.Name)
1274 
1275 	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
1276 	defer cancel()
1277 
1278 	xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t)
1279 	if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil {
1280 		t.Fatalf("Failed to update xDS resources: %v", err)
1281 	}
1282 
1283 	conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials()))
1284 	if err != nil {
1285 		t.Fatalf("Failed to create client: %s", err)
1286 	}
1287 	defer conn.Close()
1288 	client := testgrpc.NewTestServiceClient(conn)
1289 
1290 	const weight1 = endpoint1Weight * locality1Weight
1291 	const weight2 = endpoint2Weight * locality2Weight
1292 	const wantRPCs1 = float64(weight1) / float64(weight1+weight2)
1293 	const wantRPCs2 = float64(weight2) / float64(weight1+weight2)
1294 	numRPCs := computeIdealNumberOfRPCs(t, math.Min(wantRPCs1, wantRPCs2), errorTolerance)
1295 
1296 	// Send a large number of RPCs and check that they are distributed according to the weights.
1297 	gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs)
1298 	got := float64(gotPerBackend[backends[0]]) / float64(numRPCs)
1299 	if !cmp.Equal(got, wantRPCs1, cmpopts.EquateApprox(0, errorTolerance)) {
1300 		t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backends[0], got, wantRPCs1, errorTolerance)
1301 	}
1302 	got = float64(gotPerBackend[backends[1]]) / float64(numRPCs)
1303 	if !cmp.Equal(got, wantRPCs2, cmpopts.EquateApprox(0, errorTolerance)) {
1304 		t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backends[1], got, wantRPCs2, errorTolerance)
1305 	}
1306 }
1307 
1308 // Tests that a ring hash policy that hashes using a fixed string ensures all RPCs
1309 // go to one particular backend, and that subsequent hash policies are ignored
1310 // because the first hash policy is marked terminal.
1311 func (s) TestRingHash_FixedHashingTerminalPolicy(t *testing.T) { 1312 backends := backendAddrs(startTestServiceBackends(t, 2)) 1313 const clusterName = "cluster" 1314 endpoints := endpointResource(t, clusterName, backends) 1315 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1316 ClusterName: clusterName, 1317 ServiceName: clusterName, 1318 Policy: e2e.LoadBalancingPolicyRingHash, 1319 }) 1320 1321 route := e2e.DefaultRouteConfig("new_route", "test.server", clusterName) 1322 1323 hashPolicy := v3routepb.RouteAction_HashPolicy{ 1324 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Header_{ 1325 Header: &v3routepb.RouteAction_HashPolicy_Header{ 1326 HeaderName: "fixed_string", 1327 }, 1328 }, 1329 Terminal: true, 1330 } 1331 hashPolicy2 := v3routepb.RouteAction_HashPolicy{ 1332 PolicySpecifier: &v3routepb.RouteAction_HashPolicy_Header_{ 1333 Header: &v3routepb.RouteAction_HashPolicy_Header{ 1334 HeaderName: "random_string", 1335 }, 1336 }, 1337 } 1338 action := route.VirtualHosts[0].Routes[0].Action.(*v3routepb.Route_Route) 1339 action.Route.HashPolicy = []*v3routepb.RouteAction_HashPolicy{&hashPolicy, &hashPolicy2} 1340 1341 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1342 1343 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1344 defer cancel() 1345 1346 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1347 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1348 t.Fatalf("Failed to update xDS resources: %v", err) 1349 } 1350 1351 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1352 if err != nil { 1353 t.Fatalf("Failed to create client: %s", err) 1354 } 1355 defer conn.Close() 1356 client := testgrpc.NewTestServiceClient(conn) 1357 1358 // Check that despite the matching random string header, since the fixed 1359 // string hash policy is terminal, only the fixed string hash policy applies 1360 // and requests all get routed to the same host. 1361 gotPerBackend := make(map[string]int) 1362 const numRPCs = 100 1363 for i := 0; i < numRPCs; i++ { 1364 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs( 1365 "fixed_string", backends[0]+"_0", 1366 "random_string", fmt.Sprintf("%d", rand.Int())), 1367 ) 1368 var remote peer.Peer 1369 _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&remote)) 1370 if err != nil { 1371 t.Fatalf("rpc EmptyCall() failed: %v", err) 1372 } 1373 gotPerBackend[remote.Addr.String()]++ 1374 } 1375 1376 if len(gotPerBackend) != 1 { 1377 t.Error("Got RPCs routed to multiple backends, want a single backend") 1378 } 1379 if got := gotPerBackend[backends[0]]; got != numRPCs { 1380 t.Errorf("Got %v RPCs routed to %v, want %v", got, backends[0], numRPCs) 1381 } 1382 } 1383 1384 // TestRingHash_IdleToReady tests that the channel will go from idle to ready 1385 // via connecting; (though it is not possible to catch the connecting state 1386 // before moving to ready via the public API). 1387 // TODO: we should be able to catch all state transitions by using the internal.SubscribeToConnectivityStateChanges API. 
1388 func (s) TestRingHash_IdleToReady(t *testing.T) { 1389 backends := backendAddrs(startTestServiceBackends(t, 1)) 1390 1391 const clusterName = "cluster" 1392 endpoints := endpointResource(t, clusterName, backends) 1393 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1394 ClusterName: clusterName, 1395 ServiceName: clusterName, 1396 Policy: e2e.LoadBalancingPolicyRingHash, 1397 }) 1398 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 1399 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1400 1401 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1402 defer cancel() 1403 1404 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1405 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1406 t.Fatalf("Failed to update xDS resources: %v", err) 1407 } 1408 1409 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1410 if err != nil { 1411 t.Fatalf("Failed to create client: %s", err) 1412 } 1413 defer conn.Close() 1414 testutils.AwaitState(ctx, t, conn, connectivity.Idle) 1415 1416 client := testgrpc.NewTestServiceClient(conn) 1417 checkRPCSendOK(ctx, t, client, 1) 1418 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1419 } 1420 1421 // Test that the channel will transition to READY once it starts 1422 // connecting even if there are no RPCs being sent to the picker. 1423 func (s) TestRingHash_ContinuesConnectingWithoutPicks(t *testing.T) { 1424 backend := stubserver.StartTestService(t, &stubserver.StubServer{ 1425 // We expect the server EmptyCall to not be call here because the 1426 // aggregated channel state is never READY when the call is pending. 1427 EmptyCallF: func(ctx context.Context, _ *testpb.Empty) (*testpb.Empty, error) { 1428 t.Errorf("EmptyCall() should not have been called") 1429 return &testpb.Empty{}, nil 1430 }, 1431 }) 1432 defer backend.Stop() 1433 1434 unReachableServerAddr := makeUnreachableBackends(t, 1)[0] 1435 1436 const clusterName = "cluster" 1437 endpoints := endpointResource(t, clusterName, []string{backend.Address, unReachableServerAddr}) 1438 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1439 ClusterName: clusterName, 1440 ServiceName: clusterName, 1441 Policy: e2e.LoadBalancingPolicyRingHash, 1442 }) 1443 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1444 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1445 1446 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1447 defer cancel() 1448 1449 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1450 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1451 t.Fatalf("Failed to update xDS resources: %v", err) 1452 } 1453 1454 dialer := testutils.NewBlockingDialer() 1455 dopts := []grpc.DialOption{ 1456 grpc.WithResolvers(xdsResolver), 1457 grpc.WithTransportCredentials(insecure.NewCredentials()), 1458 grpc.WithContextDialer(dialer.DialContext), 1459 } 1460 conn, err := grpc.NewClient("xds:///test.server", dopts...) 
1461 if err != nil { 1462 t.Fatalf("Failed to create client: %s", err) 1463 } 1464 defer conn.Close() 1465 client := testgrpc.NewTestServiceClient(conn) 1466 1467 hold := dialer.Hold(backend.Address) 1468 1469 rpcCtx, rpcCancel := context.WithCancel(ctx) 1470 go func() { 1471 rpcCtx = metadata.NewOutgoingContext(rpcCtx, metadata.Pairs("address_hash", unReachableServerAddr+"_0")) 1472 _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}) 1473 if status.Code(err) != codes.Canceled { 1474 t.Errorf("Expected RPC to be canceled, got error: %v", err) 1475 } 1476 }() 1477 1478 // Wait for the connection attempt to the real backend. 1479 if !hold.Wait(ctx) { 1480 t.Fatalf("Timeout waiting for connection attempt to backend %v.", backend.Address) 1481 } 1482 // Now cancel the RPC while we are still connecting. 1483 rpcCancel() 1484 1485 // This allows the connection attempts to continue. The RPC was cancelled 1486 // before the backend was connected, but the backend is up. The conn 1487 // becomes Ready due to the connection attempt to the existing backend 1488 // succeeding, despite no new RPC being sent. 1489 hold.Resume() 1490 1491 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1492 } 1493 1494 // Tests that when the first pick is down leading to a transient failure, we 1495 // will move on to the next ring hash entry. 1496 func (s) TestRingHash_TransientFailureCheckNextOne(t *testing.T) { 1497 backends := backendAddrs(startTestServiceBackends(t, 1)) 1498 unReachableBackends := makeUnreachableBackends(t, 1) 1499 1500 const clusterName = "cluster" 1501 endpoints := endpointResource(t, clusterName, append(unReachableBackends, backends...)) 1502 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1503 ClusterName: clusterName, 1504 ServiceName: clusterName, 1505 Policy: e2e.LoadBalancingPolicyRingHash, 1506 }) 1507 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1508 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1509 1510 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1511 defer cancel() 1512 1513 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1514 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1515 t.Fatalf("Failed to update xDS resources: %v", err) 1516 } 1517 1518 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1519 if err != nil { 1520 t.Fatalf("Failed to create client: %s", err) 1521 } 1522 defer conn.Close() 1523 client := testgrpc.NewTestServiceClient(conn) 1524 1525 // Note each type of RPC contains a header value that will always be hashed 1526 // the value that was used to place the non-existent endpoint on the ring, 1527 // but it still gets routed to the backend that is up. 1528 ctx = metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", unReachableBackends[0]+"_0")) 1529 reqPerBackend := checkRPCSendOK(ctx, t, client, 1) 1530 var got string 1531 for got = range reqPerBackend { 1532 } 1533 if want := backends[0]; got != want { 1534 t.Errorf("Got RPC routed to addr %v, want %v", got, want) 1535 } 1536 } 1537 1538 // Tests for a bug seen in the wild in c-core, where ring_hash started with no 1539 // endpoints and reported TRANSIENT_FAILURE, then got an update with endpoints 1540 // and reported IDLE, but the picker update was squelched, so it failed to ever 1541 // get reconnected. 
1542 func (s) TestRingHash_ReattemptWhenGoingFromTransientFailureToIdle(t *testing.T) { 1543 const clusterName = "cluster" 1544 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1545 ClusterName: clusterName, 1546 Localities: []e2e.LocalityOptions{{}}, // note the empty locality (no endpoint). 1547 }) 1548 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1549 ClusterName: clusterName, 1550 ServiceName: clusterName, 1551 Policy: e2e.LoadBalancingPolicyRingHash, 1552 }) 1553 route := e2e.DefaultRouteConfig("new_route", virtualHostName, clusterName) 1554 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1555 1556 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1557 defer cancel() 1558 1559 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1560 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1561 t.Fatalf("Failed to update xDS resources: %v", err) 1562 } 1563 1564 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 1565 if err != nil { 1566 t.Fatalf("Failed to create client: %s", err) 1567 } 1568 defer conn.Close() 1569 testutils.AwaitState(ctx, t, conn, connectivity.Idle) 1570 1571 // There are no endpoints in EDS. RPCs should fail and the channel should 1572 // transition to transient failure. 1573 client := testgrpc.NewTestServiceClient(conn) 1574 if _, err = client.EmptyCall(ctx, &testpb.Empty{}); err == nil { 1575 t.Errorf("rpc EmptyCall() succeeded, want error") 1576 } 1577 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 1578 1579 t.Log("Updating EDS with a new backend endpoint.") 1580 backends := backendAddrs(startTestServiceBackends(t, 1)) 1581 endpoints = e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1582 ClusterName: clusterName, 1583 Localities: []e2e.LocalityOptions{{ 1584 Backends: backendOptions(t, backends), 1585 Weight: 1, 1586 }}, 1587 }) 1588 if err = xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1589 t.Fatalf("Failed to update xDS resources: %v", err) 1590 } 1591 1592 // A WaitForReady RPC should succeed, and the channel should report READY. 1593 if _, err = client.EmptyCall(ctx, &testpb.Empty{}, grpc.WaitForReady(true)); err != nil { 1594 t.Errorf("rpc EmptyCall() failed: %v", err) 1595 } 1596 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1597 } 1598 1599 // Tests that when all backends are down and then up, we may pick a TF backend 1600 // and we will then jump to ready backend. 
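// This relies on the ring hash pick behavior from gRFC A42: the picker starts
// at the ring entry the request hash maps to and, roughly speaking, walks the
// ring past entries that are not READY (triggering connection attempts as
// needed) until it finds one that is.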
1601 func (s) TestRingHash_TransientFailureSkipToAvailableReady(t *testing.T) { 1602 emptyCallF := func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) { 1603 return &testpb.Empty{}, nil 1604 } 1605 lis, err := testutils.LocalTCPListener() 1606 if err != nil { 1607 t.Fatalf("Failed to create listener: %v", err) 1608 } 1609 restartableListener1 := testutils.NewRestartableListener(lis) 1610 restartableServer1 := stubserver.StartTestService(t, &stubserver.StubServer{ 1611 Listener: restartableListener1, 1612 EmptyCallF: emptyCallF, 1613 }) 1614 defer restartableServer1.Stop() 1615 1616 lis, err = testutils.LocalTCPListener() 1617 if err != nil { 1618 t.Fatalf("Failed to create listener: %v", err) 1619 } 1620 restartableListener2 := testutils.NewRestartableListener(lis) 1621 restartableServer2 := stubserver.StartTestService(t, &stubserver.StubServer{ 1622 Listener: restartableListener2, 1623 EmptyCallF: emptyCallF, 1624 }) 1625 defer restartableServer2.Stop() 1626 1627 unReachableBackends := makeUnreachableBackends(t, 2) 1628 1629 const clusterName = "cluster" 1630 backends := []string{restartableServer1.Address, restartableServer2.Address} 1631 backends = append(backends, unReachableBackends...) 1632 endpoints := endpointResource(t, clusterName, backends) 1633 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1634 ClusterName: clusterName, 1635 ServiceName: clusterName, 1636 Policy: e2e.LoadBalancingPolicyRingHash, 1637 }) 1638 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1639 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1640 1641 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1642 defer cancel() 1643 1644 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1645 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1646 t.Fatalf("Failed to update xDS resources: %v", err) 1647 } 1648 opts := []grpc.DialOption{ 1649 grpc.WithConnectParams(grpc.ConnectParams{ 1650 // Disable backoff to speed up the test. 1651 MinConnectTimeout: 100 * time.Millisecond, 1652 }), 1653 grpc.WithResolvers(xdsResolver), 1654 grpc.WithTransportCredentials(insecure.NewCredentials()), 1655 } 1656 conn, err := grpc.NewClient("xds:///test.server", opts...) 1657 if err != nil { 1658 t.Fatalf("Failed to create client: %s", err) 1659 } 1660 defer conn.Close() 1661 client := testgrpc.NewTestServiceClient(conn) 1662 1663 testutils.AwaitState(ctx, t, conn, connectivity.Idle) 1664 1665 // Test starts with backends not listening. 1666 restartableListener1.Stop() 1667 restartableListener2.Stop() 1668 1669 // Send a request with a hash that should go to restartableServer1. 1670 // Because it is not accepting connections, and no other backend is 1671 // listening, the RPC fails. 1672 ctx = metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", restartableServer1.Address+"_0")) 1673 if _, err = client.EmptyCall(ctx, &testpb.Empty{}); err == nil { 1674 t.Fatalf("rpc EmptyCall() succeeded, want error") 1675 } 1676 1677 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 1678 1679 // Bring up first backend. The channel should become Ready without any 1680 // picks, because in TF, we are always trying to connect to at least one 1681 // backend at all times. 1682 restartableListener1.Restart() 1683 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1684 1685 // Bring down backend 1 and bring up backend 2. 
1686 // Note the RPC contains a header value that will always be hashed to 1687 // backend 1. So by purposely bringing down backend 1 and bringing up 1688 // another backend, this will ensure Picker's first choice of backend 1 1689 // fails and it will go through the remaining subchannels to find one in 1690 // READY. Since the entries in the ring are pretty distributed and we have 1691 // unused ports to fill the ring, it is almost guaranteed that the Picker 1692 // will go through some non-READY entries and skip them as per design. 1693 t.Logf("bringing down backend 1") 1694 restartableListener1.Stop() 1695 1696 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 1697 if _, err = client.EmptyCall(ctx, &testpb.Empty{}); err == nil { 1698 t.Fatalf("rpc EmptyCall() succeeded, want error") 1699 } 1700 1701 t.Logf("bringing up backend 2") 1702 restartableListener2.Restart() 1703 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1704 1705 wantPeerAddr := "" 1706 for wantPeerAddr != restartableServer2.Address { 1707 p := peer.Peer{} 1708 if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&p)); errors.Is(err, context.DeadlineExceeded) { 1709 t.Fatalf("Timed out waiting for rpc EmptyCall() to be routed to the expected backend") 1710 } 1711 wantPeerAddr = p.Addr.String() 1712 } 1713 } 1714 1715 // Tests that when all backends are down, we keep reattempting. 1716 func (s) TestRingHash_ReattemptWhenAllEndpointsUnreachable(t *testing.T) { 1717 lis, err := testutils.LocalTCPListener() 1718 if err != nil { 1719 t.Fatalf("Failed to create listener: %v", err) 1720 } 1721 restartableListener := testutils.NewRestartableListener(lis) 1722 restartableServer := stubserver.StartTestService(t, &stubserver.StubServer{ 1723 Listener: restartableListener, 1724 EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) { 1725 return &testpb.Empty{}, nil 1726 }, 1727 }) 1728 defer restartableServer.Stop() 1729 1730 const clusterName = "cluster" 1731 endpoints := endpointResource(t, clusterName, []string{restartableServer.Address}) 1732 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1733 ClusterName: clusterName, 1734 ServiceName: clusterName, 1735 Policy: e2e.LoadBalancingPolicyRingHash, 1736 }) 1737 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1738 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1739 1740 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1741 defer cancel() 1742 1743 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1744 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1745 t.Fatalf("Failed to update xDS resources: %v", err) 1746 } 1747 1748 dopts := []grpc.DialOption{ 1749 grpc.WithResolvers(xdsResolver), 1750 grpc.WithTransportCredentials(insecure.NewCredentials()), 1751 grpc.WithConnectParams(fastConnectParams), 1752 } 1753 conn, err := grpc.NewClient("xds:///test.server", dopts...) 
1754 if err != nil { 1755 t.Fatalf("Failed to create client: %s", err) 1756 } 1757 defer conn.Close() 1758 client := testgrpc.NewTestServiceClient(conn) 1759 1760 testutils.AwaitState(ctx, t, conn, connectivity.Idle) 1761 1762 t.Log("Stopping the backend server") 1763 restartableListener.Stop() 1764 1765 if _, err = client.EmptyCall(ctx, &testpb.Empty{}); status.Code(err) != codes.Unavailable { 1766 t.Fatalf("rpc EmptyCall() succeeded, want Unavailable error") 1767 } 1768 1769 // Wait for channel to fail. 1770 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 1771 1772 t.Log("Restarting the backend server") 1773 restartableListener.Restart() 1774 1775 // Wait for channel to become READY without any pending RPC. 1776 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 1777 } 1778 1779 // Tests that when a backend goes down, we will move on to the next subchannel 1780 // (with a lower priority). When the backend comes back up, traffic will move 1781 // back. 1782 func (s) TestRingHash_SwitchToLowerPriorityAndThenBack(t *testing.T) { 1783 lis, err := testutils.LocalTCPListener() 1784 if err != nil { 1785 t.Fatalf("Failed to create listener: %v", err) 1786 } 1787 restartableListener := testutils.NewRestartableListener(lis) 1788 restartableServer := stubserver.StartTestService(t, &stubserver.StubServer{ 1789 Listener: restartableListener, 1790 EmptyCallF: func(ctx context.Context, in *testpb.Empty) (*testpb.Empty, error) { 1791 return &testpb.Empty{}, nil 1792 }, 1793 }) 1794 defer restartableServer.Stop() 1795 1796 otherBackend := backendAddrs(startTestServiceBackends(t, 1))[0] 1797 1798 // We must set the host name socket address in EDS, as the ring hash policy 1799 // uses it to construct the ring. 1800 host, _, err := net.SplitHostPort(otherBackend) 1801 if err != nil { 1802 t.Fatalf("Failed to split host and port from stubserver: %v", err) 1803 } 1804 1805 const clusterName = "cluster" 1806 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 1807 ClusterName: clusterName, 1808 Host: host, 1809 Localities: []e2e.LocalityOptions{{ 1810 Backends: backendOptions(t, []string{restartableServer.Address}), 1811 Weight: 1, 1812 }, { 1813 Backends: backendOptions(t, []string{otherBackend}), 1814 Weight: 1, 1815 Priority: 1, 1816 }}}) 1817 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1818 ClusterName: clusterName, 1819 ServiceName: clusterName, 1820 Policy: e2e.LoadBalancingPolicyRingHash, 1821 }) 1822 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1823 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1824 1825 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1826 defer cancel() 1827 1828 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1829 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1830 t.Fatalf("Failed to update xDS resources: %v", err) 1831 } 1832 1833 dopts := []grpc.DialOption{ 1834 grpc.WithResolvers(xdsResolver), 1835 grpc.WithTransportCredentials(insecure.NewCredentials()), 1836 grpc.WithConnectParams(fastConnectParams), 1837 } 1838 conn, err := grpc.NewClient("xds:///test.server", dopts...) 
1839 if err != nil { 1840 t.Fatalf("Failed to create client: %s", err) 1841 } 1842 defer conn.Close() 1843 client := testgrpc.NewTestServiceClient(conn) 1844 1845 // Note each type of RPC contains a header value that will always be hashed 1846 // to the value that was used to place the non-existent endpoint on the ring. 1847 ctx = metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", restartableServer.Address+"_0")) 1848 var got string 1849 for got = range checkRPCSendOK(ctx, t, client, 1) { 1850 } 1851 if want := restartableServer.Address; got != want { 1852 t.Fatalf("Got RPC routed to addr %v, want %v", got, want) 1853 } 1854 1855 // Trigger failure with the existing backend, which should cause the 1856 // balancer to go in transient failure and the priority balancer to move 1857 // to the lower priority. 1858 restartableListener.Stop() 1859 1860 for { 1861 p := peer.Peer{} 1862 _, err = client.EmptyCall(ctx, &testpb.Empty{}, grpc.WaitForReady(true), grpc.Peer(&p)) 1863 1864 // Ignore errors: we may need to attempt to send an RPC to detect the 1865 // failure (the next write on connection fails). 1866 if err == nil { 1867 if got, want := p.Addr.String(), otherBackend; got != want { 1868 t.Fatalf("Got RPC routed to addr %v, want %v", got, want) 1869 } 1870 break 1871 } 1872 } 1873 1874 // Now we start the backend with the address hash that is used in the 1875 // metadata, so eventually RPCs should be routed to it, since it is in a 1876 // locality with higher priority. 1877 peerAddr := "" 1878 restartableListener.Restart() 1879 for peerAddr != restartableServer.Address { 1880 p := peer.Peer{} 1881 _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&p)) 1882 if errors.Is(err, context.DeadlineExceeded) { 1883 t.Fatalf("Timed out waiting for rpc EmptyCall() to be routed to the expected backend") 1884 } 1885 peerAddr = p.Addr.String() 1886 } 1887 } 1888 1889 // Tests that when we trigger internal connection attempts without picks, we 1890 // keep retrying all the SubConns that have reported TF previously. 1891 func (s) TestRingHash_ContinuesConnectingWithoutPicksToMultipleSubConnsConcurrently(t *testing.T) { 1892 const backendsCount = 4 1893 backends := backendAddrs(startTestServiceBackends(t, backendsCount)) 1894 1895 const clusterName = "cluster" 1896 1897 endpoints := endpointResource(t, clusterName, backends) 1898 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 1899 ClusterName: clusterName, 1900 ServiceName: clusterName, 1901 Policy: e2e.LoadBalancingPolicyRingHash, 1902 }) 1903 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 1904 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 1905 1906 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 1907 defer cancel() 1908 1909 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 1910 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 1911 t.Fatalf("Failed to update xDS resources: %v", err) 1912 } 1913 1914 dialer := testutils.NewBlockingDialer() 1915 dialOpts := []grpc.DialOption{ 1916 grpc.WithResolvers(xdsResolver), 1917 grpc.WithTransportCredentials(insecure.NewCredentials()), 1918 grpc.WithContextDialer(dialer.DialContext), 1919 grpc.WithConnectParams(fastConnectParams), 1920 } 1921 conn, err := grpc.NewClient("xds:///test.server", dialOpts...) 
1922 if err != nil { 1923 t.Fatalf("Failed to create client: %s", err) 1924 } 1925 defer conn.Close() 1926 1927 // Create holds for each backend address to delay a successful connection 1928 // until the end of the test. 1929 holds := make([]*testutils.Hold, backendsCount) 1930 for i := 0; i < len(backends); i++ { 1931 holds[i] = dialer.Hold(backends[i]) 1932 } 1933 1934 client := testgrpc.NewTestServiceClient(conn) 1935 1936 rpcCtx, rpcCancel := context.WithCancel(ctx) 1937 errCh := make(chan error, 1) 1938 go func() { 1939 rpcCtx = metadata.NewOutgoingContext(rpcCtx, metadata.Pairs("address_hash", backends[0]+"_0")) 1940 _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}) 1941 if status.Code(err) == codes.Canceled { 1942 errCh <- nil 1943 return 1944 } 1945 errCh <- err 1946 }() 1947 1948 // Wait for the RPC to trigger a connection attempt to the first address, 1949 // then cancel the RPC. No other connection attempts should be started yet. 1950 if !holds[0].Wait(ctx) { 1951 t.Fatalf("Timeout waiting for connection attempt to backend 0") 1952 } 1953 rpcCancel() 1954 if err := <-errCh; err != nil { 1955 t.Fatalf("Expected RPC to fail be canceled, got %v", err) 1956 } 1957 1958 // In every iteration of the following loop, we count the number of backends 1959 // that are dialed. After counting, we fail all the connection attempts. 1960 // This should cause the number of dialed backends to increase by 1 in every 1961 // iteration of the loop as ringhash tries to exit TRANSIENT_FAILURE. 1962 activeAddrs := map[string]bool{} 1963 for wantBackendCount := 1; wantBackendCount <= backendsCount; wantBackendCount++ { 1964 newAddrIdx := -1 1965 for ; ctx.Err() == nil; <-time.After(time.Millisecond) { 1966 for i, hold := range holds { 1967 if !hold.IsStarted() { 1968 continue 1969 } 1970 if _, ok := activeAddrs[backends[i]]; ok { 1971 continue 1972 } 1973 activeAddrs[backends[i]] = true 1974 newAddrIdx = i 1975 } 1976 if len(activeAddrs) > wantBackendCount { 1977 t.Fatalf("More backends dialed than expected: got %d, want %d", len(activeAddrs), wantBackendCount) 1978 } 1979 if len(activeAddrs) == wantBackendCount { 1980 break 1981 } 1982 } 1983 1984 // Wait for a short time and verify no more backends are contacted. 1985 <-time.After(defaultTestShortTimeout) 1986 for i, hold := range holds { 1987 if !hold.IsStarted() { 1988 continue 1989 } 1990 activeAddrs[backends[i]] = true 1991 } 1992 if len(activeAddrs) != wantBackendCount { 1993 t.Fatalf("Unexpected number of backends dialed: got %d, want %d", len(activeAddrs), wantBackendCount) 1994 } 1995 1996 // Create a new hold for the address dialed in this iteration and fail 1997 // the existing hold. 1998 hold := holds[newAddrIdx] 1999 holds[newAddrIdx] = dialer.Hold(backends[newAddrIdx]) 2000 hold.Fail(errors.New("Test error")) 2001 } 2002 2003 // Allow the request to a backend to succeed. 2004 if !holds[1].Wait(ctx) { 2005 t.Fatalf("Context timed out waiting %q to be dialed again.", backends[1]) 2006 } 2007 holds[1].Resume() 2008 2009 // Wait for channel to become READY without any pending RPC. 2010 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 2011 } 2012 2013 // Tests that first address of an endpoint is used to generate the ring. The 2014 // test sends a request to a random endpoint. The test then reverses the 2015 // addresses of every endpoint and verifies that an RPC with header pointing to 2016 // the second address of the endpoint is sent to the initial address. 
The test 2017 // then swaps the second and third address of the endpoint and verifies that an 2018 // RPC with the header used earlier still reaches the same backend. 2019 func (s) TestRingHash_ReorderAddressessWithinEndpoint(t *testing.T) { 2020 origDualstackEndpointsEnabled := envconfig.XDSDualstackEndpointsEnabled 2021 defer func() { 2022 envconfig.XDSDualstackEndpointsEnabled = origDualstackEndpointsEnabled 2023 }() 2024 envconfig.XDSDualstackEndpointsEnabled = true 2025 backends := backendAddrs(startTestServiceBackends(t, 6)) 2026 2027 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 2028 2029 const clusterName = "cluster" 2030 addrGroups := [][]string{ 2031 {backends[0], backends[1], backends[2]}, 2032 {backends[3], backends[4], backends[5]}, 2033 } 2034 endpoints := endpointResourceForBackendsWithMultipleAddrs(t, clusterName, addrGroups) 2035 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 2036 ClusterName: clusterName, 2037 ServiceName: clusterName, 2038 Policy: e2e.LoadBalancingPolicyRingHash, 2039 }) 2040 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 2041 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 2042 2043 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2044 defer cancel() 2045 2046 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2047 t.Fatalf("Failed to update xDS resources: %v", err) 2048 } 2049 2050 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 2051 if err != nil { 2052 t.Fatalf("Failed to create client: %s", err) 2053 } 2054 defer conn.Close() 2055 client := testgrpc.NewTestServiceClient(conn) 2056 2057 rpcCtx := metadata.NewOutgoingContext(ctx, metadata.Pairs( 2058 "address_hash", fmt.Sprintf("%d", rand.Int()), 2059 )) 2060 var remote peer.Peer 2061 if _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil { 2062 t.Fatalf("rpc EmptyCall() failed: %v", err) 2063 } 2064 2065 initialFirstAddr := "" 2066 newFirstAddr := "" 2067 switch remote.Addr.String() { 2068 case addrGroups[0][0]: 2069 initialFirstAddr = addrGroups[0][0] 2070 newFirstAddr = addrGroups[0][2] 2071 case addrGroups[1][0]: 2072 initialFirstAddr = addrGroups[1][0] 2073 newFirstAddr = addrGroups[1][2] 2074 default: 2075 t.Fatalf("Request went to unexpected address: %q", remote.Addr) 2076 } 2077 2078 t.Log("Reversing addresses within each endpoint.") 2079 addrGroups1 := [][]string{ 2080 {addrGroups[0][2], addrGroups[0][1], addrGroups[0][0]}, 2081 {addrGroups[1][2], addrGroups[1][1], addrGroups[1][0]}, 2082 } 2083 endpoints = endpointResourceForBackendsWithMultipleAddrs(t, clusterName, addrGroups1) 2084 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2085 t.Fatalf("Failed to update xDS resources: %v", err) 2086 } 2087 2088 // The first address of an endpoint is used to create the ring. This means 2089 // that requests should continue to go to the first address, but the hash 2090 // should be computed based on the last address in the original list. 
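	// In other words, RPCs whose header hashes to newFirstAddr should pick this
	// endpoint's ring entry and be delivered over the existing connection to
	// initialFirstAddr.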
2091     for ; ctx.Err() == nil; <-time.After(time.Millisecond) {
2092         rpcCtx := metadata.NewOutgoingContext(ctx, metadata.Pairs(
2093             "address_hash", newFirstAddr+"_0",
2094         ))
2095         if _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil {
2096             t.Fatalf("rpc EmptyCall() failed: %v", err)
2097         }
2098         if remote.Addr.String() == initialFirstAddr {
2099             break
2100         }
2101     }
2102 
2103     if ctx.Err() != nil {
2104         t.Fatalf("Context timed out waiting for request to be sent to %q, last request went to %q", initialFirstAddr, remote.Addr)
2105     }
2106 
2107     t.Log("Swapping the second and third addresses within each endpoint.")
2108     // This should not affect the ring, since only the first address is used
2109     // by the ring.
2110     addrGroups2 := [][]string{
2111         {addrGroups1[0][0], addrGroups1[0][2], addrGroups1[0][1]},
2112         {addrGroups1[1][0], addrGroups1[1][2], addrGroups1[1][1]},
2113     }
2114     endpoints = endpointResourceForBackendsWithMultipleAddrs(t, clusterName, addrGroups2)
2115     if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil {
2116         t.Fatalf("Failed to update xDS resources: %v", err)
2117     }
2118 
2119     // Verify that requests hashed to newFirstAddr continue to reach
2120     // initialFirstAddr, the originally-first (and still connected) address.
2121     shortCtx, cancel := context.WithTimeout(ctx, defaultTestShortTimeout)
2122     defer cancel()
2123     for ; shortCtx.Err() == nil; <-time.After(time.Millisecond) {
2124         rpcCtx := metadata.NewOutgoingContext(ctx, metadata.Pairs(
2125             "address_hash", newFirstAddr+"_0",
2126         ))
2127         if _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil {
2128             t.Fatalf("rpc EmptyCall() failed: %v", err)
2129         }
2130         if remote.Addr.String() == initialFirstAddr {
2131             continue
2132         }
2133         t.Fatalf("Request went to unexpected backend %q, want backend %q", remote.Addr, initialFirstAddr)
2134     }
2135 }
2136 
// Tests that requests are sent to the next address within the same endpoint
// after the first address becomes unreachable.
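// Each endpoint below has two addresses (dual-stack endpoint support); the
// addresses within an endpoint are tried in order, so when the first one is
// stopped the endpoint is expected to reconnect on its second address while
// keeping the same position on the ring.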
2139 func (s) TestRingHash_FallBackWithinEndpoint(t *testing.T) { 2140 origDualstackEndpointsEnabled := envconfig.XDSDualstackEndpointsEnabled 2141 defer func() { 2142 envconfig.XDSDualstackEndpointsEnabled = origDualstackEndpointsEnabled 2143 }() 2144 envconfig.XDSDualstackEndpointsEnabled = true 2145 backends := startTestServiceBackends(t, 4) 2146 backendAddrs := backendAddrs(backends) 2147 2148 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 2149 2150 const clusterName = "cluster" 2151 endpoints := endpointResourceForBackendsWithMultipleAddrs(t, clusterName, [][]string{{backendAddrs[0], backendAddrs[1]}, {backendAddrs[2], backendAddrs[3]}}) 2152 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 2153 ClusterName: clusterName, 2154 ServiceName: clusterName, 2155 Policy: e2e.LoadBalancingPolicyRingHash, 2156 }) 2157 route := channelIDHashRoute("new_route", virtualHostName, clusterName) 2158 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 2159 2160 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2161 defer cancel() 2162 2163 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2164 t.Fatalf("Failed to update xDS resources: %v", err) 2165 } 2166 2167 conn, err := grpc.NewClient("xds:///test.server", grpc.WithResolvers(xdsResolver), grpc.WithTransportCredentials(insecure.NewCredentials())) 2168 if err != nil { 2169 t.Fatalf("Failed to create client: %s", err) 2170 } 2171 defer conn.Close() 2172 client := testgrpc.NewTestServiceClient(conn) 2173 2174 const numRPCs = 5 2175 received := checkRPCSendOK(ctx, t, client, numRPCs) 2176 if len(received) != 1 { 2177 t.Errorf("Got RPCs routed to %v backends, want %v", len(received), 1) 2178 } 2179 var got int 2180 var initialAddr string 2181 for initialAddr, got = range received { 2182 } 2183 if got != numRPCs { 2184 t.Errorf("Got %v RPCs routed to a backend, want %v", got, numRPCs) 2185 } 2186 2187 // Due to the channel ID hashing policy, the request could go to the first 2188 // address of either endpoint. 2189 var backendIdx int 2190 switch initialAddr { 2191 case backendAddrs[0]: 2192 backendIdx = 0 2193 case backendAddrs[2]: 2194 backendIdx = 2 2195 default: 2196 t.Fatalf("Request sent to unexpected backend: %q", initialAddr) 2197 } 2198 otherEndpointAddr := backendAddrs[backendIdx+1] 2199 2200 // Shut down the previously used backend. 2201 backends[backendIdx].Stop() 2202 testutils.AwaitState(ctx, t, conn, connectivity.Idle) 2203 2204 // Verify that the requests go to the remaining address in the same 2205 // endpoint. 2206 received = checkRPCSendOK(ctx, t, client, numRPCs) 2207 if len(received) != 1 { 2208 t.Errorf("Got RPCs routed to %v backends, want %v", len(received), 1) 2209 } 2210 var newAddr string 2211 for newAddr, got = range received { 2212 } 2213 if got != numRPCs { 2214 t.Errorf("Got %v RPCs routed to a backend, want %v", got, numRPCs) 2215 } 2216 2217 if newAddr != otherEndpointAddr { 2218 t.Errorf("Requests went to unexpected address, got=%q, want=%q", newAddr, otherEndpointAddr) 2219 } 2220 } 2221 2222 // Tests that ringhash is able to recover automatically in situations when a 2223 // READY endpoint enters IDLE making the aggregated state TRANSIENT_FAILURE. The 2224 // test creates 4 endpoints in the following connectivity states: [TF, TF, 2225 // READY, IDLE]. The test fails the READY backend and verifies that the last 2226 // IDLE endopint is dialed and the channel enters READY. 
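// The blocking dialer below is what sets these states up: the first two
// connection attempts are failed (TF), the third is allowed to complete
// (READY), and the fourth endpoint is never dialed (IDLE).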
2227 func (s) TestRingHash_RecoverWhenEndpointEntersIdle(t *testing.T) { 2228 const backendsCount = 4 2229 backends := startTestServiceBackends(t, backendsCount) 2230 backendAddrs := backendAddrs(backends) 2231 2232 const clusterName = "cluster" 2233 2234 endpoints := endpointResource(t, clusterName, backendAddrs) 2235 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 2236 ClusterName: clusterName, 2237 ServiceName: clusterName, 2238 Policy: e2e.LoadBalancingPolicyRingHash, 2239 }) 2240 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 2241 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 2242 2243 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2244 defer cancel() 2245 2246 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 2247 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2248 t.Fatalf("Failed to update xDS resources: %v", err) 2249 } 2250 2251 dialer := testutils.NewBlockingDialer() 2252 dialOpts := []grpc.DialOption{ 2253 grpc.WithResolvers(xdsResolver), 2254 grpc.WithTransportCredentials(insecure.NewCredentials()), 2255 grpc.WithContextDialer(dialer.DialContext), 2256 grpc.WithConnectParams(fastConnectParams), 2257 } 2258 conn, err := grpc.NewClient("xds:///test.server", dialOpts...) 2259 if err != nil { 2260 t.Fatalf("Failed to create client: %s", err) 2261 } 2262 defer conn.Close() 2263 2264 // Create holds for each backend address to delay a successful connection 2265 // until the end of the test. 2266 holds := make([]*testutils.Hold, backendsCount) 2267 for i := 0; i < len(backendAddrs); i++ { 2268 holds[i] = dialer.Hold(backendAddrs[i]) 2269 } 2270 2271 client := testgrpc.NewTestServiceClient(conn) 2272 2273 rpcCtx, rpcCancel := context.WithCancel(ctx) 2274 errCh := make(chan error, 1) 2275 go func() { 2276 rpcCtx = metadata.NewOutgoingContext(rpcCtx, metadata.Pairs("address_hash", backendAddrs[0]+"_0")) 2277 _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}) 2278 if status.Code(err) == codes.Canceled { 2279 errCh <- nil 2280 return 2281 } 2282 errCh <- err 2283 }() 2284 2285 // Wait for the RPC to trigger a connection attempt to the first address, 2286 // then cancel the RPC. No other connection attempts should be started yet. 2287 if !holds[0].Wait(ctx) { 2288 t.Fatalf("Timeout waiting for connection attempt to backend 0") 2289 } 2290 rpcCancel() 2291 if err := <-errCh; err != nil { 2292 t.Fatalf("Expected RPC to fail be canceled, got %v", err) 2293 } 2294 2295 // The number of dialed backends increases by 1 in every iteration of the 2296 // loop as ringhash tries to exit TRANSIENT_FAILURE. Run the loop twice to 2297 // get two endpoints in TRANSIENT_FAILURE. 
2298 activeAddrs := map[string]bool{} 2299 for wantFailingBackendCount := 1; wantFailingBackendCount <= 2; wantFailingBackendCount++ { 2300 newAddrIdx := -1 2301 for ; ctx.Err() == nil && len(activeAddrs) < wantFailingBackendCount; <-time.After(time.Millisecond) { 2302 for i, hold := range holds { 2303 if !hold.IsStarted() { 2304 continue 2305 } 2306 if _, ok := activeAddrs[backendAddrs[i]]; ok { 2307 continue 2308 } 2309 activeAddrs[backendAddrs[i]] = true 2310 newAddrIdx = i 2311 } 2312 } 2313 2314 if ctx.Err() != nil { 2315 t.Fatal("Context timed out waiting for new backneds to be dialed.") 2316 } 2317 if len(activeAddrs) > wantFailingBackendCount { 2318 t.Fatalf("More backends dialed than expected: got %d, want %d", len(activeAddrs), wantFailingBackendCount) 2319 } 2320 2321 // Create a new hold for the address dialed in this iteration and fail 2322 // the existing hold. 2323 hold := holds[newAddrIdx] 2324 holds[newAddrIdx] = dialer.Hold(backendAddrs[newAddrIdx]) 2325 hold.Fail(errors.New("Test error")) 2326 } 2327 2328 // Current state of endpoints: [TF, TF, READY, IDLE]. 2329 // Two endpoints failing should cause the channel to enter 2330 // TRANSIENT_FAILURE. 2331 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 2332 2333 // Allow the request to the backend dialed next to succeed. 2334 readyBackendIdx := -1 2335 for ; ctx.Err() == nil && readyBackendIdx == -1; <-time.After(time.Millisecond) { 2336 for i, addr := range backendAddrs { 2337 if _, ok := activeAddrs[addr]; ok || !holds[i].IsStarted() { 2338 continue 2339 } 2340 readyBackendIdx = i 2341 activeAddrs[addr] = true 2342 holds[i].Resume() 2343 break 2344 } 2345 } 2346 2347 if ctx.Err() != nil { 2348 t.Fatal("Context timed out waiting for the next backend to be contacted.") 2349 } 2350 2351 // Wait for channel to become READY without any pending RPC. 2352 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 2353 2354 // Current state of endpoints: [TF, TF, READY, IDLE]. 2355 // Stopping the READY backend should cause the channel to re-enter 2356 // TRANSIENT_FAILURE. 2357 backends[readyBackendIdx].Stop() 2358 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 2359 2360 // To recover from TRANSIENT_FAILURE, ringhash should automatically try to 2361 // connect to the final endpoint. 2362 readyBackendIdx = -1 2363 for ; ctx.Err() == nil && readyBackendIdx == -1; <-time.After(time.Millisecond) { 2364 for i, addr := range backendAddrs { 2365 if _, ok := activeAddrs[addr]; ok || !holds[i].IsStarted() { 2366 continue 2367 } 2368 readyBackendIdx = i 2369 activeAddrs[addr] = true 2370 holds[i].Resume() 2371 break 2372 } 2373 } 2374 2375 if ctx.Err() != nil { 2376 t.Fatal("Context timed out waiting for next backend to be contacted.") 2377 } 2378 2379 // Wait for channel to become READY without any pending RPC. 2380 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 2381 } 2382 2383 // Tests that ringhash is able to recover automatically in situations when a 2384 // READY endpoint is removed by the resolver making the aggregated state 2385 // TRANSIENT_FAILURE. The test creates 4 endpoints in the following 2386 // connectivity states: [TF, TF, READY, IDLE]. The test removes the 2387 // READY endpoint and verifies that the last IDLE endopint is dialed and the 2388 // channel enters READY. 
2389 func (s) TestRingHash_RecoverWhenResolverRemovesEndpoint(t *testing.T) { 2390 const backendsCount = 4 2391 backends := startTestServiceBackends(t, backendsCount) 2392 backendAddrs := backendAddrs(backends) 2393 2394 const clusterName = "cluster" 2395 2396 endpoints := endpointResource(t, clusterName, backendAddrs) 2397 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 2398 ClusterName: clusterName, 2399 ServiceName: clusterName, 2400 Policy: e2e.LoadBalancingPolicyRingHash, 2401 }) 2402 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 2403 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 2404 2405 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2406 defer cancel() 2407 2408 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 2409 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2410 t.Fatalf("Failed to update xDS resources: %v", err) 2411 } 2412 2413 dialer := testutils.NewBlockingDialer() 2414 dialOpts := []grpc.DialOption{ 2415 grpc.WithResolvers(xdsResolver), 2416 grpc.WithTransportCredentials(insecure.NewCredentials()), 2417 grpc.WithContextDialer(dialer.DialContext), 2418 grpc.WithConnectParams(fastConnectParams), 2419 } 2420 conn, err := grpc.NewClient("xds:///test.server", dialOpts...) 2421 if err != nil { 2422 t.Fatalf("Failed to create client: %s", err) 2423 } 2424 defer conn.Close() 2425 2426 // Create holds for each backend address to delay a successful connection 2427 // until the end of the test. 2428 holds := make([]*testutils.Hold, backendsCount) 2429 for i := 0; i < len(backendAddrs); i++ { 2430 holds[i] = dialer.Hold(backendAddrs[i]) 2431 } 2432 2433 client := testgrpc.NewTestServiceClient(conn) 2434 2435 rpcCtx, rpcCancel := context.WithCancel(ctx) 2436 errCh := make(chan error, 1) 2437 go func() { 2438 rpcCtx = metadata.NewOutgoingContext(rpcCtx, metadata.Pairs("address_hash", backendAddrs[0]+"_0")) 2439 _, err := client.EmptyCall(rpcCtx, &testpb.Empty{}) 2440 if status.Code(err) == codes.Canceled { 2441 errCh <- nil 2442 return 2443 } 2444 errCh <- err 2445 }() 2446 2447 // Wait for the RPC to trigger a connection attempt to the first address, 2448 // then cancel the RPC. No other connection attempts should be started yet. 2449 if !holds[0].Wait(ctx) { 2450 t.Fatalf("Timeout waiting for connection attempt to backend 0") 2451 } 2452 rpcCancel() 2453 if err := <-errCh; err != nil { 2454 t.Fatalf("Expected RPC to fail be canceled, got %v", err) 2455 } 2456 2457 // The number of dialed backends increases by 1 in every iteration of the 2458 // loop as ringhash tries to exit TRANSIENT_FAILURE. Run the loop twice to 2459 // get two endpoints in TRANSIENT_FAILURE. 
2460 activeAddrs := map[string]bool{} 2461 for wantFailingBackendCount := 1; wantFailingBackendCount <= 2; wantFailingBackendCount++ { 2462 newAddrIdx := -1 2463 for ; ctx.Err() == nil && len(activeAddrs) < wantFailingBackendCount; <-time.After(time.Millisecond) { 2464 for i, hold := range holds { 2465 if !hold.IsStarted() { 2466 continue 2467 } 2468 if _, ok := activeAddrs[backendAddrs[i]]; ok { 2469 continue 2470 } 2471 activeAddrs[backendAddrs[i]] = true 2472 newAddrIdx = i 2473 } 2474 } 2475 2476 if ctx.Err() != nil { 2477 t.Fatal("Context timed out waiting for new backneds to be dialed.") 2478 } 2479 if len(activeAddrs) > wantFailingBackendCount { 2480 t.Fatalf("More backends dialed than expected: got %d, want %d", len(activeAddrs), wantFailingBackendCount) 2481 } 2482 2483 // Create a new hold for the address dialed in this iteration and fail 2484 // the existing hold. 2485 hold := holds[newAddrIdx] 2486 holds[newAddrIdx] = dialer.Hold(backendAddrs[newAddrIdx]) 2487 hold.Fail(errors.New("Test error")) 2488 } 2489 2490 // Current state of endpoints: [TF, TF, READY, IDLE]. 2491 // Two endpoints failing should cause the channel to enter 2492 // TRANSIENT_FAILURE. 2493 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 2494 2495 // Allow the request to the backend dialed next to succeed. 2496 readyBackendIdx := -1 2497 for ; ctx.Err() == nil && readyBackendIdx == -1; <-time.After(time.Millisecond) { 2498 for i, addr := range backendAddrs { 2499 if _, ok := activeAddrs[addr]; ok || !holds[i].IsStarted() { 2500 continue 2501 } 2502 readyBackendIdx = i 2503 activeAddrs[addr] = true 2504 holds[i].Resume() 2505 break 2506 } 2507 } 2508 2509 if ctx.Err() != nil { 2510 t.Fatal("Context timed out waiting for the next backend to be contacted.") 2511 } 2512 2513 // Wait for channel to become READY without any pending RPC. 2514 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 2515 2516 // Current state of endpoints: [TF, TF, READY, IDLE]. 2517 // Removing the READY backend should cause the channel to re-enter 2518 // TRANSIENT_FAILURE. 2519 updatedAddrs := append([]string{}, backendAddrs[:readyBackendIdx]...) 2520 updatedAddrs = append(updatedAddrs, backendAddrs[readyBackendIdx+1:]...) 2521 updatedEndpoints := endpointResource(t, clusterName, updatedAddrs) 2522 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, updatedEndpoints, cluster, route, listener)); err != nil { 2523 t.Fatalf("Failed to update xDS resources: %v", err) 2524 } 2525 testutils.AwaitState(ctx, t, conn, connectivity.TransientFailure) 2526 2527 // To recover from TRANSIENT_FAILURE, ringhash should automatically try to 2528 // connect to the final endpoint. 2529 readyBackendIdx = -1 2530 for ; ctx.Err() == nil && readyBackendIdx == -1; <-time.After(time.Millisecond) { 2531 for i, addr := range backendAddrs { 2532 if _, ok := activeAddrs[addr]; ok || !holds[i].IsStarted() { 2533 continue 2534 } 2535 readyBackendIdx = i 2536 activeAddrs[addr] = true 2537 holds[i].Resume() 2538 break 2539 } 2540 } 2541 2542 if ctx.Err() != nil { 2543 t.Fatal("Context timed out waiting for next backend to be contacted.") 2544 } 2545 2546 // Wait for channel to become READY without any pending RPC. 2547 testutils.AwaitState(ctx, t, conn, connectivity.Ready) 2548 } 2549 2550 // Tests that RPCs are routed according to endpoint hash key rather than 2551 // endpoint first address if it is set in EDS endpoint metadata. 
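// The hash key is read from the EDS endpoint metadata. A rough sketch of the
// LbEndpoint shape this test assumes (the e2e helper is expected to nest
// BackendOptions.Metadata under the "envoy.lb" filter metadata):
//
//	metadata:
//	  filter_metadata:
//	    envoy.lb:
//	      hash_key: "0"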
2552 func (s) TestRingHash_EndpointHashKey(t *testing.T) { 2553 testutils.SetEnvConfig(t, &envconfig.XDSEndpointHashKeyBackwardCompat, false) 2554 2555 backends := backendAddrs(startTestServiceBackends(t, 4)) 2556 2557 const clusterName = "cluster" 2558 var backendOpts []e2e.BackendOptions 2559 for i, addr := range backends { 2560 var ports []uint32 2561 ports = append(ports, testutils.ParsePort(t, addr)) 2562 backendOpts = append(backendOpts, e2e.BackendOptions{ 2563 Ports: ports, 2564 Metadata: map[string]any{"hash_key": strconv.Itoa(i)}, 2565 }) 2566 } 2567 endpoints := e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 2568 ClusterName: clusterName, 2569 Host: "localhost", 2570 Localities: []e2e.LocalityOptions{{ 2571 Backends: backendOpts, 2572 Weight: 1, 2573 }}, 2574 }) 2575 cluster := e2e.ClusterResourceWithOptions(e2e.ClusterOptions{ 2576 ClusterName: clusterName, 2577 ServiceName: clusterName, 2578 Policy: e2e.LoadBalancingPolicyRingHash, 2579 }) 2580 route := headerHashRoute("new_route", virtualHostName, clusterName, "address_hash") 2581 listener := e2e.DefaultClientListener(virtualHostName, route.Name) 2582 2583 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2584 defer cancel() 2585 2586 xdsServer, nodeID, xdsResolver := setupManagementServerAndResolver(t) 2587 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2588 t.Fatalf("Failed to update xDS resources: %v", err) 2589 } 2590 2591 opts := []grpc.DialOption{ 2592 grpc.WithResolvers(xdsResolver), 2593 grpc.WithTransportCredentials(insecure.NewCredentials()), 2594 } 2595 conn, err := grpc.NewClient("xds:///test.server", opts...) 2596 if err != nil { 2597 t.Fatalf("Failed to create client: %s", err) 2598 } 2599 defer conn.Close() 2600 client := testgrpc.NewTestServiceClient(conn) 2601 2602 // Make sure RPCs are routed to backends according to the endpoint metadata 2603 // rather than their address. Note each type of RPC contains a header value 2604 // that will always be hashed to a specific backend as the header value 2605 // matches the endpoint metadata hash key. 2606 for i, backend := range backends { 2607 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", strconv.Itoa(i)+"_0")) 2608 numRPCs := 10 2609 reqPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 2610 if reqPerBackend[backend] != numRPCs { 2611 t.Errorf("Got RPC routed to addresses %v, want all RPCs routed to %v", reqPerBackend, backend) 2612 } 2613 } 2614 2615 // Update the endpoints to swap the metadata hash key. 2616 for i := range backendOpts { 2617 backendOpts[i].Metadata = map[string]any{"hash_key": strconv.Itoa(len(backends) - i - 1)} 2618 } 2619 endpoints = e2e.EndpointResourceWithOptions(e2e.EndpointOptions{ 2620 ClusterName: clusterName, 2621 Host: "localhost", 2622 Localities: []e2e.LocalityOptions{{ 2623 Backends: backendOpts, 2624 Weight: 1, 2625 }}, 2626 }) 2627 if err := xdsServer.Update(ctx, xdsUpdateOpts(nodeID, endpoints, cluster, route, listener)); err != nil { 2628 t.Fatalf("Failed to update xDS resources: %v", err) 2629 } 2630 2631 // Wait for the resolver update to make it to the balancer. This RPC should 2632 // be routed to backend 3 with the reverse numbering of the hash_key 2633 // attribute delivered above. 
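	// With 4 backends, backends[3] is the endpoint whose new hash_key is
	// strconv.Itoa(4-3-1) == "0", so a request hashed with "0_0" should land
	// on it once the update has propagated.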
2634 for { 2635 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", "0_0")) 2636 var remote peer.Peer 2637 if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil { 2638 t.Fatalf("Unexpected RPC error waiting for EDS update propagation: %s", err) 2639 } 2640 if remote.Addr.String() == backends[3] { 2641 break 2642 } 2643 } 2644 2645 // Now that the balancer has the new endpoint attributes, make sure RPCs are 2646 // routed to backends according to the new endpoint metadata. 2647 for i, backend := range backends { 2648 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", strconv.Itoa(len(backends)-i-1)+"_0")) 2649 numRPCs := 10 2650 reqPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 2651 if reqPerBackend[backend] != numRPCs { 2652 t.Errorf("Got RPC routed to addresses %v, want all RPCs routed to %v", reqPerBackend, backend) 2653 } 2654 } 2655 } 2656 2657 // Tests that when a request hash key is set in the balancer configuration via 2658 // service config, this header is used to route to a specific backend. 2659 func (s) TestRingHash_RequestHashKey(t *testing.T) { 2660 testutils.SetEnvConfig(t, &envconfig.RingHashSetRequestHashKey, true) 2661 2662 backends := backendAddrs(startTestServiceBackends(t, 4)) 2663 2664 // Create a clientConn with a manual resolver (which is used to push the 2665 // address of the test backend), and a default service config pointing to 2666 // the use of the ring_hash_experimental LB policy with an explicit hash 2667 // header. 2668 const ringHashServiceConfig = `{"loadBalancingConfig": [{"ring_hash_experimental":{"requestHashHeader":"address_hash"}}]}` 2669 r := manual.NewBuilderWithScheme("whatever") 2670 dopts := []grpc.DialOption{ 2671 grpc.WithTransportCredentials(insecure.NewCredentials()), 2672 grpc.WithResolvers(r), 2673 grpc.WithDefaultServiceConfig(ringHashServiceConfig), 2674 grpc.WithConnectParams(fastConnectParams), 2675 } 2676 cc, err := grpc.NewClient(r.Scheme()+":///test.server", dopts...) 2677 if err != nil { 2678 t.Fatalf("Failed to dial local test server: %v", err) 2679 } 2680 defer cc.Close() 2681 var endpoints []resolver.Endpoint 2682 for _, backend := range backends { 2683 endpoints = append(endpoints, resolver.Endpoint{ 2684 Addresses: []resolver.Address{{Addr: backend}}, 2685 }) 2686 } 2687 r.UpdateState(resolver.State{ 2688 Endpoints: endpoints, 2689 }) 2690 client := testgrpc.NewTestServiceClient(cc) 2691 2692 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2693 defer cancel() 2694 2695 // Note each type of RPC contains a header value that will always be hashed 2696 // to a specific backend as the header value matches the value used to 2697 // create the entry in the ring. 2698 for _, backend := range backends { 2699 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("address_hash", backend+"_0")) 2700 numRPCs := 10 2701 reqPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 2702 if reqPerBackend[backend] != numRPCs { 2703 t.Errorf("Got RPC routed to addresses %v, want all RPCs routed to %v", reqPerBackend, backend) 2704 } 2705 } 2706 2707 const ringHashServiceConfigUpdate = `{"loadBalancingConfig": [{"ring_hash_experimental":{"requestHashHeader":"other_header"}}]}` 2708 r.UpdateState(resolver.State{ 2709 Endpoints: endpoints, 2710 ServiceConfig: (&testutils.ResolverClientConn{}).ParseServiceConfig(ringHashServiceConfigUpdate), 2711 }) 2712 2713 // Make sure that requests with the new hash are sent to the right backend. 
2714 for _, backend := range backends { 2715 ctx := metadata.NewOutgoingContext(ctx, metadata.Pairs("other_header", backend+"_0")) 2716 numRPCs := 10 2717 reqPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 2718 if reqPerBackend[backend] != numRPCs { 2719 t.Errorf("Got RPC routed to addresses %v, want all RPCs routed to %v", reqPerBackend, backend) 2720 } 2721 } 2722 } 2723 2724 // Tests that when a request hash key is set in the balancer configuration via 2725 // service config, and the header is not set in the outgoing request, then it 2726 // is sent to a random backend. 2727 func (s) TestRingHash_RequestHashKeyRandom(t *testing.T) { 2728 testutils.SetEnvConfig(t, &envconfig.RingHashSetRequestHashKey, true) 2729 2730 backends := backendAddrs(startTestServiceBackends(t, 4)) 2731 2732 // Create a clientConn with a manual resolver (which is used to push the 2733 // address of the test backend), and a default service config pointing to 2734 // the use of the ring_hash_experimental LB policy with an explicit hash 2735 // header. 2736 const ringHashServiceConfig = `{"loadBalancingConfig": [{"ring_hash_experimental":{"requestHashHeader":"address_hash"}}]}` 2737 r := manual.NewBuilderWithScheme("whatever") 2738 dopts := []grpc.DialOption{ 2739 grpc.WithTransportCredentials(insecure.NewCredentials()), 2740 grpc.WithResolvers(r), 2741 grpc.WithDefaultServiceConfig(ringHashServiceConfig), 2742 grpc.WithConnectParams(fastConnectParams), 2743 } 2744 cc, err := grpc.NewClient(r.Scheme()+":///test.server", dopts...) 2745 if err != nil { 2746 t.Fatalf("Failed to dial local test server: %v", err) 2747 } 2748 defer cc.Close() 2749 var endpoints []resolver.Endpoint 2750 for _, backend := range backends { 2751 endpoints = append(endpoints, resolver.Endpoint{ 2752 Addresses: []resolver.Address{{Addr: backend}}, 2753 }) 2754 } 2755 r.UpdateState(resolver.State{ 2756 Endpoints: endpoints, 2757 }) 2758 client := testgrpc.NewTestServiceClient(cc) 2759 2760 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 2761 defer cancel() 2762 2763 // Due to the way that ring hash lazily establishes connections when using a 2764 // random hash, request distribution is skewed towards the order in which we 2765 // connected. The test send RPCs until we are connected to all backends, so 2766 // we can later assert that the distribution is uniform. 2767 seen := make(map[string]bool) 2768 for len(seen) != 4 { 2769 var remote peer.Peer 2770 if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&remote)); err != nil { 2771 t.Fatalf("rpc EmptyCall() failed: %v", err) 2772 } 2773 seen[remote.String()] = true 2774 } 2775 2776 // Make sure that requests with the old hash are sent to random backends. 2777 numRPCs := computeIdealNumberOfRPCs(t, .25, errorTolerance) 2778 gotPerBackend := checkRPCSendOK(ctx, t, client, numRPCs) 2779 for _, backend := range backends { 2780 got := float64(gotPerBackend[backend]) / float64(numRPCs) 2781 want := .25 2782 if !cmp.Equal(got, want, cmpopts.EquateApprox(0, errorTolerance)) { 2783 t.Errorf("Fraction of RPCs to backend %s: got %v, want %v (margin: +-%v)", backend, got, want, errorTolerance) 2784 } 2785 } 2786 } 2787 2788 // Tests that when a request hash key is set in the balancer configuration via 2789 // service config, and the header is not set in the outgoing request (random 2790 // behavior), then each RPC wakes up at most one SubChannel, and, if there are 2791 // SubChannels in Ready state, RPCs are routed to them. 
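// A blocking dialer is used below so the test can hold every connection
// attempt and count exactly how many subchannels each pick wakes up.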
2792 func (s) TestRingHash_RequestHashKeyConnecting(t *testing.T) {
2793 	testutils.SetEnvConfig(t, &envconfig.RingHashSetRequestHashKey, true)
2794 
2795 	backends := backendAddrs(startTestServiceBackends(t, 20))
2796 
2797 	// Create a clientConn with a manual resolver (which is used to push the
2798 	// addresses of the test backends), and a default service config pointing to
2799 	// the use of the ring_hash_experimental LB policy with an explicit hash
2800 	// header. Use a blocking dialer to control connection attempts.
2801 	const ringHashServiceConfig = `{"loadBalancingConfig": [
2802 		{"ring_hash_experimental":{"requestHashHeader":"address_hash"}}
2803 	]}`
2804 	r := manual.NewBuilderWithScheme("whatever")
2805 	blockingDialer := testutils.NewBlockingDialer()
2806 	dopts := []grpc.DialOption{
2807 		grpc.WithTransportCredentials(insecure.NewCredentials()),
2808 		grpc.WithResolvers(r),
2809 		grpc.WithDefaultServiceConfig(ringHashServiceConfig),
2810 		grpc.WithConnectParams(fastConnectParams),
2811 		grpc.WithContextDialer(blockingDialer.DialContext),
2812 	}
2813 	cc, err := grpc.NewClient(r.Scheme()+":///test.server", dopts...)
2814 	if err != nil {
2815 		t.Fatalf("Failed to dial local test server: %v", err)
2816 	}
2817 	defer cc.Close()
2818 	var endpoints []resolver.Endpoint
2819 	for _, backend := range backends {
2820 		endpoints = append(endpoints, resolver.Endpoint{
2821 			Addresses: []resolver.Address{{Addr: backend}},
2822 		})
2823 	}
2824 	r.UpdateState(resolver.State{
2825 		Endpoints: endpoints,
2826 	})
2827 	client := testgrpc.NewTestServiceClient(cc)
2828 
2829 	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
2830 	defer cancel()
2831 
2832 	// Intercept all connection attempts to the backends.
2833 	var holds []*testutils.Hold
2834 	for i := 0; i < len(backends); i++ {
2835 		holds = append(holds, blockingDialer.Hold(backends[i]))
2836 	}
2837 
2838 	wg := sync.WaitGroup{}
2839 	wg.Add(1)
2840 	go func() {
2841 		// Send 1 RPC and make sure this triggers at most 1 connection attempt.
2842 		_, err := client.EmptyCall(ctx, &testpb.Empty{})
2843 		if err != nil {
2844 			t.Errorf("EmptyCall(): got %v, want success", err)
2845 		}
2846 		wg.Done()
2847 	}()
2848 	testutils.AwaitState(ctx, t, cc, connectivity.Connecting)
2849 
2850 	// Check that at most one connection attempt was started.
2851 	nConn := 0
2852 	for _, hold := range holds {
2853 		if hold.IsStarted() {
2854 			nConn++
2855 		}
2856 	}
2857 	if wantMaxConn := 1; nConn > wantMaxConn {
2858 		t.Fatalf("Got %d connection attempts, want at most %d", nConn, wantMaxConn)
2859 	}
2860 
2861 	// Do a second RPC. Since there should already be a SubChannel in
2862 	// Connecting state, this should not trigger a connection attempt.
2863 	wg.Add(1)
2864 	go func() {
2865 		_, err := client.EmptyCall(ctx, &testpb.Empty{})
2866 		if err != nil {
2867 			t.Errorf("EmptyCall(): got %v, want success", err)
2868 		}
2869 		wg.Done()
2870 	}()
2871 
2872 	// Give extra time for more connections to be attempted.
2873 	time.Sleep(defaultTestShortTimeout)
2874 
2875 	var firstConnectedBackend string
2876 	nConn = 0
2877 	for i, hold := range holds {
2878 		if hold.IsStarted() {
2879 			// Unblock the connection attempt. The SubChannel (and hence the
2880 			// channel) should transition to Ready. RPCs should succeed and
2881 			// be routed to this backend.
2882 			hold.Resume()
2883 			holds[i] = nil
2884 			firstConnectedBackend = backends[i]
2885 			nConn++
2886 		}
2887 	}
2888 	if wantMaxConn := 1; nConn > wantMaxConn {
2889 		t.Fatalf("Got %d connection attempts, want at most %d", nConn, wantMaxConn)
2890 	}
2891 	testutils.AwaitState(ctx, t, cc, connectivity.Ready)
2892 	wg.Wait() // Make sure we're done with the 2 previous RPCs.
2893 
2894 	// Now send RPCs until we have at least one more connection attempt, that
2895 	// is, the random hash did not land on the same backend on every pick (the
2896 	// chances are low, but we don't want this to be flaky). Make sure no RPC
2897 	// fails and that we route all of them to the only subchannel in ready
2898 	// state.
2899 	nConn = 0
2900 	for nConn == 0 {
2901 		p := peer.Peer{}
2902 		_, err = client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&p))
2903 		if status.Code(err) == codes.DeadlineExceeded {
2904 			t.Fatal("EmptyCall(): test timed out while waiting for more connection attempts")
2905 		}
2906 		if err != nil {
2907 			t.Fatalf("EmptyCall(): got %v, want success", err)
2908 		}
2909 		if p.Addr.String() != firstConnectedBackend {
2910 			t.Errorf("RPC sent to backend %q, want %q", p.Addr.String(), firstConnectedBackend)
2911 		}
2912 		for _, hold := range holds {
2913 			if hold != nil && hold.IsStarted() {
2914 				nConn++
2915 			}
2916 		}
2917 	}
2918 }
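
The three tests above exercise the requestHashHeader ring_hash option end to end through a manual resolver. For reference, the following is a minimal sketch of how a client could use the same option outside the test harness. It is illustrative only and not part of the test file: the target address, the "session-id" header name, and the session key are hypothetical, and the experimental request-hash-key behavior must be enabled, as the tests do via envconfig.RingHashSetRequestHashKey.

// Illustrative sketch only (not part of the test file above).
package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/metadata"

	testgrpc "google.golang.org/grpc/interop/grpc_testing"
	testpb "google.golang.org/grpc/interop/grpc_testing"

	_ "google.golang.org/grpc/xds" // Registers the ring_hash_experimental LB policy.
)

func main() {
	// RPCs carrying the same "session-id" header value hash to the same ring
	// entry and therefore reach the same backend; RPCs without the header get
	// a random hash.
	const sc = `{"loadBalancingConfig": [{"ring_hash_experimental":{"requestHashHeader":"session-id"}}]}`
	cc, err := grpc.NewClient(
		"dns:///example.test:50051", // Hypothetical target.
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultServiceConfig(sc),
	)
	if err != nil {
		log.Fatalf("grpc.NewClient() failed: %v", err)
	}
	defer cc.Close()

	client := testgrpc.NewTestServiceClient(cc)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Pin this RPC to whichever backend the hash of "user-1234" lands on.
	ctx = metadata.AppendToOutgoingContext(ctx, "session-id", "user-1234")
	if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
		log.Fatalf("EmptyCall() failed: %v", err)
	}
}

With this configuration, RPCs that carry the same session key are routed to the same backend, while RPCs that omit the header are spread across backends, matching the behavior asserted by TestRingHash_RequestHashKey and TestRingHash_RequestHashKeyRandom above.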