google.golang.org/grpc@v1.72.2/xds/internal/balancer/clusterresolver/e2e_test/balancer_test.go (about)

     1  /*
     2   * Copyright 2023 gRPC authors.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package e2e_test
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"strings"
    24  	"testing"
    25  	"time"
    26  
    27  	"github.com/google/go-cmp/cmp"
    28  	"github.com/google/uuid"
    29  	"google.golang.org/grpc"
    30  	"google.golang.org/grpc/balancer"
    31  	"google.golang.org/grpc/balancer/roundrobin"
    32  	"google.golang.org/grpc/codes"
    33  	"google.golang.org/grpc/connectivity"
    34  	"google.golang.org/grpc/credentials/insecure"
    35  	"google.golang.org/grpc/internal"
    36  	"google.golang.org/grpc/internal/balancer/stub"
    37  	iserviceconfig "google.golang.org/grpc/internal/serviceconfig"
    38  	"google.golang.org/grpc/internal/stubserver"
    39  	"google.golang.org/grpc/internal/testutils"
    40  	"google.golang.org/grpc/internal/testutils/xds/e2e"
    41  	"google.golang.org/grpc/internal/xds/bootstrap"
    42  	"google.golang.org/grpc/resolver"
    43  	"google.golang.org/grpc/resolver/manual"
    44  	"google.golang.org/grpc/serviceconfig"
    45  	"google.golang.org/grpc/status"
    46  	xdsinternal "google.golang.org/grpc/xds/internal"
    47  	"google.golang.org/grpc/xds/internal/balancer/clusterimpl"
    48  	"google.golang.org/grpc/xds/internal/balancer/outlierdetection"
    49  	"google.golang.org/grpc/xds/internal/balancer/priority"
    50  	"google.golang.org/grpc/xds/internal/balancer/wrrlocality"
    51  	"google.golang.org/grpc/xds/internal/xdsclient"
    52  	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version"
    53  	"google.golang.org/protobuf/types/known/durationpb"
    54  	"google.golang.org/protobuf/types/known/wrapperspb"
    55  
    56  	v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
    57  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    58  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    59  	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    60  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    61  	testpb "google.golang.org/grpc/interop/grpc_testing"
    62  
    63  	_ "google.golang.org/grpc/xds/internal/balancer/cdsbalancer" // Register the "cds_experimental" LB policy.
    64  )
    65  
    66  // setupAndDial performs common setup across all tests
    67  //
    68  //   - creates an xDS client with the passed in bootstrap contents
    69  //   - creates a  manual resolver that configures `cds_experimental` as the
    70  //     top-level LB policy.
    71  //   - creates a ClientConn to talk to the test backends
    72  //
    73  // Returns a function to close the ClientConn and the xDS client.
    74  func setupAndDial(t *testing.T, bootstrapContents []byte) (*grpc.ClientConn, func()) {
    75  	t.Helper()
    76  
    77  	// Create an xDS client for use by the cluster_resolver LB policy.
    78  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
    79  	if err != nil {
    80  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
    81  	}
    82  	pool := xdsclient.NewPool(config)
    83  	xdsC, xdsClose, err := pool.NewClientForTesting(xdsclient.OptionsForTesting{
    84  		Name: t.Name(),
    85  	})
    86  	if err != nil {
    87  		t.Fatalf("Failed to create xDS client: %v", err)
    88  	}
    89  
    90  	// Create a manual resolver and push a service config specifying the use of
    91  	// the cds LB policy as the top-level LB policy, and a corresponding config
    92  	// with a single cluster.
    93  	r := manual.NewBuilderWithScheme("whatever")
    94  	jsonSC := fmt.Sprintf(`{
    95  			"loadBalancingConfig":[{
    96  				"cds_experimental":{
    97  					"cluster": "%s"
    98  				}
    99  			}]
   100  		}`, clusterName)
   101  	scpr := internal.ParseServiceConfig.(func(string) *serviceconfig.ParseResult)(jsonSC)
   102  	r.InitialState(xdsclient.SetClient(resolver.State{ServiceConfig: scpr}, xdsC))
   103  
   104  	// Create a ClientConn and make a successful RPC.
   105  	cc, err := grpc.NewClient(r.Scheme()+":///test.service", grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(r))
   106  	if err != nil {
   107  		xdsClose()
   108  		t.Fatalf("grpc.NewClient() failed: %v", err)
   109  	}
   110  	cc.Connect()
   111  	return cc, func() {
   112  		xdsClose()
   113  		cc.Close()
   114  	}
   115  }
   116  
   117  // TestErrorFromParentLB_ConnectionError tests the case where the parent of the
   118  // clusterresolver LB policy sends it a connection error. The parent policy,
   119  // CDS LB policy, sends a connection error when the ADS stream to the management
   120  // server breaks. The test verifies that there is no perceivable effect because
   121  // of this connection error, and that RPCs continue to work (because the LB
   122  // policies are expected to use previously received xDS resources).
   123  func (s) TestErrorFromParentLB_ConnectionError(t *testing.T) {
   124  	// Create a listener to be used by the management server. The test will
   125  	// close this listener to simulate ADS stream breakage.
   126  	lis, err := testutils.LocalTCPListener()
   127  	if err != nil {
   128  		t.Fatalf("testutils.LocalTCPListener() failed: %v", err)
   129  	}
   130  
   131  	// Start an xDS management server with the above restartable listener, and
   132  	// push a channel when the stream is closed.
   133  	streamClosedCh := make(chan struct{}, 1)
   134  	managementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   135  		Listener: lis,
   136  		OnStreamClosed: func(int64, *v3corepb.Node) {
   137  			select {
   138  			case streamClosedCh <- struct{}{}:
   139  			default:
   140  			}
   141  		},
   142  	})
   143  
   144  	// Create bootstrap configuration pointing to the above management server.
   145  	nodeID := uuid.New().String()
   146  	bootstrapContents := e2e.DefaultBootstrapContents(t, nodeID, managementServer.Address)
   147  
   148  	server := stubserver.StartTestService(t, nil)
   149  	defer server.Stop()
   150  
   151  	// Configure cluster and endpoints resources in the management server.
   152  	resources := e2e.UpdateOptions{
   153  		NodeID:         nodeID,
   154  		Clusters:       []*v3clusterpb.Cluster{e2e.DefaultCluster(clusterName, edsServiceName, e2e.SecurityLevelNone)},
   155  		Endpoints:      []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(edsServiceName, "localhost", []uint32{testutils.ParsePort(t, server.Address)})},
   156  		SkipValidation: true,
   157  	}
   158  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   159  	defer cancel()
   160  	if err := managementServer.Update(ctx, resources); err != nil {
   161  		t.Fatal(err)
   162  	}
   163  
   164  	// Create xDS client, configure cds_experimental LB policy with a manual
   165  	// resolver, and dial the test backends.
   166  	cc, cleanup := setupAndDial(t, bootstrapContents)
   167  	defer cleanup()
   168  
   169  	client := testgrpc.NewTestServiceClient(cc)
   170  	if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   171  		t.Fatalf("EmptyCall() failed: %v", err)
   172  	}
   173  
   174  	// Close the listener and ensure that the ADS stream breaks.
   175  	lis.Close()
   176  	select {
   177  	case <-ctx.Done():
   178  		t.Fatal("Timeout when waiting for ADS stream to close")
   179  	default:
   180  	}
   181  
   182  	// Ensure that RPCs continue to succeed for the next second.
   183  	for end := time.Now().Add(time.Second); time.Now().Before(end); <-time.After(defaultTestShortTimeout) {
   184  		if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   185  			t.Fatalf("EmptyCall() failed: %v", err)
   186  		}
   187  	}
   188  }
   189  
   190  // TestErrorFromParentLB_ResourceNotFound tests the case where the parent of the
   191  // clusterresolver LB policy sends it a resource-not-found error. The parent
   192  // policy, CDS LB policy, sends a resource-not-found error when the cluster
   193  // resource associated with these LB policies is removed by the management
   194  // server. The test verifies that the associated EDS is canceled and RPCs fail.
   195  // It also ensures that when the Cluster resource is added back, the EDS
   196  // resource is re-requested and RPCs being to succeed.
   197  func (s) TestErrorFromParentLB_ResourceNotFound(t *testing.T) {
   198  	// Start an xDS management server that uses a couple of channels to
   199  	// notify the test about the following events:
   200  	// - an EDS requested with the expected resource name is requested
   201  	// - EDS resource is unrequested, i.e, an EDS request with no resource name
   202  	//   is received, which indicates that we are no longer interested in that
   203  	//   resource.
   204  	edsResourceRequestedCh := make(chan struct{}, 1)
   205  	edsResourceCanceledCh := make(chan struct{}, 1)
   206  	managementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   207  		OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error {
   208  			if req.GetTypeUrl() == version.V3EndpointsURL {
   209  				switch len(req.GetResourceNames()) {
   210  				case 0:
   211  					select {
   212  					case edsResourceCanceledCh <- struct{}{}:
   213  					default:
   214  					}
   215  				case 1:
   216  					if req.GetResourceNames()[0] == edsServiceName {
   217  						select {
   218  						case edsResourceRequestedCh <- struct{}{}:
   219  						default:
   220  						}
   221  					}
   222  				default:
   223  					t.Errorf("Unexpected number of resources, %d, in an EDS request", len(req.GetResourceNames()))
   224  				}
   225  			}
   226  			return nil
   227  		},
   228  	})
   229  
   230  	// Create bootstrap configuration pointing to the above management server.
   231  	nodeID := uuid.New().String()
   232  	bootstrapContents := e2e.DefaultBootstrapContents(t, nodeID, managementServer.Address)
   233  
   234  	server := stubserver.StartTestService(t, nil)
   235  	defer server.Stop()
   236  
   237  	// Configure cluster and endpoints resources in the management server.
   238  	resources := e2e.UpdateOptions{
   239  		NodeID:         nodeID,
   240  		Clusters:       []*v3clusterpb.Cluster{e2e.DefaultCluster(clusterName, edsServiceName, e2e.SecurityLevelNone)},
   241  		Endpoints:      []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(edsServiceName, "localhost", []uint32{testutils.ParsePort(t, server.Address)})},
   242  		SkipValidation: true,
   243  	}
   244  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   245  	defer cancel()
   246  	if err := managementServer.Update(ctx, resources); err != nil {
   247  		t.Fatal(err)
   248  	}
   249  
   250  	// Create xDS client, configure cds_experimental LB policy with a manual
   251  	// resolver, and dial the test backends.
   252  	cc, cleanup := setupAndDial(t, bootstrapContents)
   253  	defer cleanup()
   254  
   255  	// Wait for the EDS resource to be requested.
   256  	select {
   257  	case <-ctx.Done():
   258  		t.Fatal("Timeout when waiting for EDS resource to be requested")
   259  	case <-edsResourceRequestedCh:
   260  	}
   261  
   262  	// Ensure that a successful RPC can be made.
   263  	client := testgrpc.NewTestServiceClient(cc)
   264  	if _, err := client.EmptyCall(ctx, &testpb.Empty{}); err != nil {
   265  		t.Fatalf("EmptyCall() failed: %v", err)
   266  	}
   267  
   268  	// Delete the cluster resource from the management server.
   269  	resources.Clusters = nil
   270  	if err := managementServer.Update(ctx, resources); err != nil {
   271  		t.Fatal(err)
   272  	}
   273  
   274  	// Wait for the EDS resource to be not requested anymore.
   275  	select {
   276  	case <-ctx.Done():
   277  		t.Fatal("Timeout when waiting for EDS resource to not requested")
   278  	case <-edsResourceCanceledCh:
   279  	}
   280  
   281  	// Ensure that RPCs start to fail with expected error.
   282  	wantErr := fmt.Sprintf("cluster %q not found", clusterName)
   283  	for ; ctx.Err() == nil; <-time.After(defaultTestShortTimeout) {
   284  		sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   285  		defer sCancel()
   286  		_, err := client.EmptyCall(sCtx, &testpb.Empty{})
   287  		if status.Code(err) == codes.Unavailable && strings.Contains(err.Error(), wantErr) {
   288  			break
   289  		}
   290  		if err != nil {
   291  			t.Logf("EmptyCall failed: %v", err)
   292  		}
   293  	}
   294  	if ctx.Err() != nil {
   295  		t.Fatalf("RPCs did not fail after removal of Cluster resource")
   296  	}
   297  
   298  	testutils.AwaitState(ctx, t, cc, connectivity.TransientFailure)
   299  
   300  	// Configure cluster and endpoints resources in the management server.
   301  	resources = e2e.UpdateOptions{
   302  		NodeID:         nodeID,
   303  		Clusters:       []*v3clusterpb.Cluster{e2e.DefaultCluster(clusterName, edsServiceName, e2e.SecurityLevelNone)},
   304  		Endpoints:      []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(edsServiceName, "localhost", []uint32{testutils.ParsePort(t, server.Address)})},
   305  		SkipValidation: true,
   306  	}
   307  	if err := managementServer.Update(ctx, resources); err != nil {
   308  		t.Fatal(err)
   309  	}
   310  
   311  	// Wait for the EDS resource to be requested again.
   312  	select {
   313  	case <-ctx.Done():
   314  		t.Fatal("Timeout when waiting for EDS resource to be requested")
   315  	case <-edsResourceRequestedCh:
   316  	}
   317  
   318  	// Ensure that a successful RPC can be made.
   319  	for ; ctx.Err() == nil; <-time.After(defaultTestShortTimeout) {
   320  		sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   321  		defer sCancel()
   322  		if _, err := client.EmptyCall(sCtx, &testpb.Empty{}); err != nil {
   323  			t.Logf("EmptyCall failed: %v", err)
   324  			continue
   325  		}
   326  		break
   327  	}
   328  	if ctx.Err() != nil {
   329  		t.Fatalf("RPCs did not fail after removal of Cluster resource")
   330  	}
   331  }
   332  
   333  // Test verifies that when the received Cluster resource contains outlier
   334  // detection configuration, the LB config pushed to the child policy contains
   335  // the appropriate configuration for the outlier detection LB policy.
   336  func (s) TestOutlierDetectionConfigPropagationToChildPolicy(t *testing.T) {
   337  	// Unregister the priority balancer builder for the duration of this test,
   338  	// and register a policy under the same name that makes the LB config
   339  	// pushed to it available to the test.
   340  	priorityBuilder := balancer.Get(priority.Name)
   341  	internal.BalancerUnregister(priorityBuilder.Name())
   342  	lbCfgCh := make(chan serviceconfig.LoadBalancingConfig, 1)
   343  	stub.Register(priority.Name, stub.BalancerFuncs{
   344  		Init: func(bd *stub.BalancerData) {
   345  			bd.Data = priorityBuilder.Build(bd.ClientConn, bd.BuildOptions)
   346  		},
   347  		ParseConfig: func(lbCfg json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
   348  			return priorityBuilder.(balancer.ConfigParser).ParseConfig(lbCfg)
   349  		},
   350  		UpdateClientConnState: func(bd *stub.BalancerData, ccs balancer.ClientConnState) error {
   351  			select {
   352  			case lbCfgCh <- ccs.BalancerConfig:
   353  			default:
   354  			}
   355  			bal := bd.Data.(balancer.Balancer)
   356  			return bal.UpdateClientConnState(ccs)
   357  		},
   358  		Close: func(bd *stub.BalancerData) {
   359  			bal := bd.Data.(balancer.Balancer)
   360  			bal.Close()
   361  		},
   362  	})
   363  	defer balancer.Register(priorityBuilder)
   364  
   365  	managementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{})
   366  
   367  	// Create bootstrap configuration pointing to the above management server.
   368  	nodeID := uuid.New().String()
   369  	bootstrapContents := e2e.DefaultBootstrapContents(t, nodeID, managementServer.Address)
   370  
   371  	server := stubserver.StartTestService(t, nil)
   372  	defer server.Stop()
   373  
   374  	// Configure cluster and endpoints resources in the management server.
   375  	cluster := e2e.DefaultCluster(clusterName, edsServiceName, e2e.SecurityLevelNone)
   376  	cluster.OutlierDetection = &v3clusterpb.OutlierDetection{
   377  		Interval:                 durationpb.New(10 * time.Second),
   378  		BaseEjectionTime:         durationpb.New(30 * time.Second),
   379  		MaxEjectionTime:          durationpb.New(300 * time.Second),
   380  		MaxEjectionPercent:       wrapperspb.UInt32(10),
   381  		SuccessRateStdevFactor:   wrapperspb.UInt32(2000),
   382  		EnforcingSuccessRate:     wrapperspb.UInt32(50),
   383  		SuccessRateMinimumHosts:  wrapperspb.UInt32(10),
   384  		SuccessRateRequestVolume: wrapperspb.UInt32(50),
   385  	}
   386  	resources := e2e.UpdateOptions{
   387  		NodeID:         nodeID,
   388  		Clusters:       []*v3clusterpb.Cluster{cluster},
   389  		Endpoints:      []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(edsServiceName, "localhost", []uint32{testutils.ParsePort(t, server.Address)})},
   390  		SkipValidation: true,
   391  	}
   392  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   393  	defer cancel()
   394  	if err := managementServer.Update(ctx, resources); err != nil {
   395  		t.Fatal(err)
   396  	}
   397  
   398  	// Create xDS client, configure cds_experimental LB policy with a manual
   399  	// resolver, and dial the test backends.
   400  	_, cleanup := setupAndDial(t, bootstrapContents)
   401  	defer cleanup()
   402  
   403  	// The priority configuration generated should have Outlier Detection as a
   404  	// direct child due to Outlier Detection being turned on.
   405  	wantCfg := &priority.LBConfig{
   406  		Children: map[string]*priority.Child{
   407  			"priority-0-0": {
   408  				Config: &iserviceconfig.BalancerConfig{
   409  					Name: outlierdetection.Name,
   410  					Config: &outlierdetection.LBConfig{
   411  						Interval:           iserviceconfig.Duration(10 * time.Second), // default interval
   412  						BaseEjectionTime:   iserviceconfig.Duration(30 * time.Second),
   413  						MaxEjectionTime:    iserviceconfig.Duration(300 * time.Second),
   414  						MaxEjectionPercent: 10,
   415  						SuccessRateEjection: &outlierdetection.SuccessRateEjection{
   416  							StdevFactor:           2000,
   417  							EnforcementPercentage: 50,
   418  							MinimumHosts:          10,
   419  							RequestVolume:         50,
   420  						},
   421  						ChildPolicy: &iserviceconfig.BalancerConfig{
   422  							Name: clusterimpl.Name,
   423  							Config: &clusterimpl.LBConfig{
   424  								Cluster:         clusterName,
   425  								EDSServiceName:  edsServiceName,
   426  								TelemetryLabels: xdsinternal.UnknownCSMLabels,
   427  								ChildPolicy: &iserviceconfig.BalancerConfig{
   428  									Name: wrrlocality.Name,
   429  									Config: &wrrlocality.LBConfig{
   430  										ChildPolicy: &iserviceconfig.BalancerConfig{
   431  											Name: roundrobin.Name,
   432  										},
   433  									},
   434  								},
   435  							},
   436  						},
   437  					},
   438  				},
   439  				IgnoreReresolutionRequests: true,
   440  			},
   441  		},
   442  		Priorities: []string{"priority-0-0"},
   443  	}
   444  
   445  	select {
   446  	case lbCfg := <-lbCfgCh:
   447  		gotCfg := lbCfg.(*priority.LBConfig)
   448  		if diff := cmp.Diff(wantCfg, gotCfg); diff != "" {
   449  			t.Fatalf("Child policy received unexpected diff in config (-want +got):\n%s", diff)
   450  		}
   451  	case <-ctx.Done():
   452  		t.Fatalf("Timeout when waiting for child policy to receive its configuration")
   453  	}
   454  }