google.golang.org/grpc@v1.72.2/xds/internal/xdsclient/tests/fallback_test.go (about)

     1  /*
     2   *
     3   * Copyright 2024 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xdsclient_test
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"sync/atomic"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/google/uuid"
    29  	"google.golang.org/grpc"
    30  	"google.golang.org/grpc/codes"
    31  	"google.golang.org/grpc/credentials/insecure"
    32  	"google.golang.org/grpc/internal"
    33  	"google.golang.org/grpc/internal/stubserver"
    34  	"google.golang.org/grpc/internal/testutils"
    35  	"google.golang.org/grpc/internal/testutils/xds/e2e"
    36  	"google.golang.org/grpc/internal/xds/bootstrap"
    37  	"google.golang.org/grpc/peer"
    38  	"google.golang.org/grpc/resolver"
    39  	"google.golang.org/grpc/status"
    40  	"google.golang.org/grpc/xds/internal/xdsclient"
    41  	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version"
    42  
    43  	v3clusterpb "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
    44  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    45  	v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
    46  	v3routepb "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
    47  	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    48  	testgrpc "google.golang.org/grpc/interop/grpc_testing"
    49  	testpb "google.golang.org/grpc/interop/grpc_testing"
    50  )
    51  
// defaultFallbackTestTimeout is the overall deadline used by the fallback
// tests in this file. It is twice defaultTestTimeout because these tests need
// to first identify failed connections before establishing new ones.
const defaultFallbackTestTimeout = 2 * defaultTestTimeout
    55  
    56  func waitForRPCsToReachBackend(ctx context.Context, client testgrpc.TestServiceClient, backend string) error {
    57  	var lastErr error
    58  	for ; ctx.Err() == nil; <-time.After(defaultTestShortTimeout) {
    59  		var peer peer.Peer
    60  		if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
    61  			lastErr = err
    62  			continue
    63  		}
    64  		// Veirfy the peer when the RPC succeeds.
    65  		if peer.Addr.String() == backend {
    66  			break
    67  		}
    68  	}
    69  	if ctx.Err() != nil {
    70  		return fmt.Errorf("timeout when waiting for RPCs to reach expected backend. Last error: %v", lastErr)
    71  	}
    72  	return nil
    73  }
    74  
    75  // Tests fallback on startup where the xDS client is unable to establish a
    76  // connection to the primary server. The test verifies that the xDS client falls
    77  // back to the secondary server, and when the primary comes back up, it reverts
    78  // to it. The test also verifies that when all requested resources are cached
    79  // from the primary, fallback is not triggered when the connection goes down.
    80  func (s) TestFallback_OnStartup(t *testing.T) {
    81  	ctx, cancel := context.WithTimeout(context.Background(), defaultFallbackTestTimeout)
    82  	defer cancel()
    83  
    84  	// Create two listeners for the two management servers. The test can
    85  	// start/stop these listeners and can also get notified when the listener
    86  	// receives a connection request.
    87  	primaryWrappedLis := testutils.NewListenerWrapper(t, nil)
    88  	primaryLis := testutils.NewRestartableListener(primaryWrappedLis)
    89  	fallbackWrappedLis := testutils.NewListenerWrapper(t, nil)
    90  	fallbackLis := testutils.NewRestartableListener(fallbackWrappedLis)
    91  
    92  	// Start two management servers, primary and fallback, with the above
    93  	// listeners.
    94  	primaryManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: primaryLis})
    95  	fallbackManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: fallbackLis})
    96  
    97  	// Start two test service backends.
    98  	backend1 := stubserver.StartTestService(t, nil)
    99  	defer backend1.Stop()
   100  	backend2 := stubserver.StartTestService(t, nil)
   101  	defer backend2.Stop()
   102  
   103  	// Configure xDS resource on the primary management server, with a cluster
   104  	// resource that contains an endpoint for backend1.
   105  	nodeID := uuid.New().String()
   106  	const serviceName = "my-service-fallback-xds"
   107  	resources := e2e.DefaultClientResources(e2e.ResourceParams{
   108  		DialTarget: serviceName,
   109  		NodeID:     nodeID,
   110  		Host:       "localhost",
   111  		Port:       testutils.ParsePort(t, backend1.Address),
   112  		SecLevel:   e2e.SecurityLevelNone,
   113  	})
   114  	if err := primaryManagementServer.Update(ctx, resources); err != nil {
   115  		t.Fatal(err)
   116  	}
   117  
   118  	// Configure xDS resource on the secondary management server, with a cluster
   119  	// resource that contains an endpoint for backend2. Only the listener
   120  	// resource has the same name on both servers.
   121  	fallbackRouteConfigName := "fallback-route-" + serviceName
   122  	fallbackClusterName := "fallback-cluster-" + serviceName
   123  	fallbackEndpointsName := "fallback-endpoints-" + serviceName
   124  	resources = e2e.UpdateOptions{
   125  		NodeID:    nodeID,
   126  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, fallbackRouteConfigName)},
   127  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(fallbackRouteConfigName, serviceName, fallbackClusterName)},
   128  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(fallbackClusterName, fallbackEndpointsName, e2e.SecurityLevelNone)},
   129  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(fallbackEndpointsName, "localhost", []uint32{testutils.ParsePort(t, backend2.Address)})},
   130  	}
   131  	if err := fallbackManagementServer.Update(ctx, resources); err != nil {
   132  		t.Fatal(err)
   133  	}
   134  
   135  	// Shut both management servers down before starting the gRPC client to
   136  	// trigger fallback on startup.
   137  	primaryLis.Stop()
   138  	fallbackLis.Stop()
   139  
   140  	// Generate bootstrap configuration with the above two servers.
   141  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   142  		Servers: []byte(fmt.Sprintf(`[
   143  		{
   144  			"server_uri": %q,
   145  			"channel_creds": [{"type": "insecure"}]
   146  		},
   147  		{
   148  			"server_uri": %q,
   149  			"channel_creds": [{"type": "insecure"}]
   150  		}]`, primaryManagementServer.Address, fallbackManagementServer.Address)),
   151  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   152  	})
   153  	if err != nil {
   154  		t.Fatalf("Failed to create bootstrap file: %v", err)
   155  	}
   156  
   157  	// Create an xDS client with the above bootstrap configuration.
   158  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   159  	if err != nil {
   160  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   161  	}
   162  	pool := xdsclient.NewPool(config)
   163  	if err != nil {
   164  		t.Fatalf("Failed to create xDS client: %v", err)
   165  	}
   166  
   167  	// Get the xDS resolver to use the above xDS client.
   168  	resolverBuilder := internal.NewXDSResolverWithPoolForTesting.(func(*xdsclient.Pool) (resolver.Builder, error))
   169  	resolver, err := resolverBuilder(pool)
   170  	if err != nil {
   171  		t.Fatalf("Failed to create xDS resolver for testing: %v", err)
   172  	}
   173  
   174  	// Start a gRPC client that uses the above xDS resolver.
   175  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver))
   176  	if err != nil {
   177  		t.Fatalf("Failed to create gRPC client: %v", err)
   178  	}
   179  	defer cc.Close()
   180  	cc.Connect()
   181  
   182  	// Ensure that a connection is attempted to the primary.
   183  	if _, err := primaryWrappedLis.NewConnCh.Receive(ctx); err != nil {
   184  		t.Fatalf("Failure when waiting for a connection to be opened to the primary management server: %v", err)
   185  	}
   186  
   187  	// Ensure that a connection is attempted to the fallback.
   188  	if _, err := fallbackWrappedLis.NewConnCh.Receive(ctx); err != nil {
   189  		t.Fatalf("Failure when waiting for a connection to be opened to the primary management server: %v", err)
   190  	}
   191  
   192  	// Make an RPC with a shortish deadline and expect it to fail.
   193  	sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   194  	defer sCancel()
   195  	client := testgrpc.NewTestServiceClient(cc)
   196  	if _, err := client.EmptyCall(sCtx, &testpb.Empty{}, grpc.WaitForReady(true)); err == nil || status.Code(err) != codes.DeadlineExceeded {
   197  		t.Fatalf("EmptyCall() = %v, want DeadlineExceeded", err)
   198  	}
   199  
   200  	// Start the fallback server. Ensure that an RPC can succeed, and that it
   201  	// reaches backend2.
   202  	fallbackLis.Restart()
   203  	if err := waitForRPCsToReachBackend(ctx, client, backend2.Address); err != nil {
   204  		t.Fatal(err)
   205  	}
   206  
   207  	// Start the primary server. It can take a while before the xDS client
   208  	// notices this, since the ADS stream implementation uses a backoff before
   209  	// retrying the stream.
   210  	primaryLis.Restart()
   211  
   212  	// Wait for the connection to the secondary to be closed and ensure that an
   213  	// RPC can succeed, and that it reaches backend1.
   214  	c, err := fallbackWrappedLis.NewConnCh.Receive(ctx)
   215  	if err != nil {
   216  		t.Fatalf("Failure when retrieving the most recent connection to the fallback management server: %v", err)
   217  	}
   218  	conn := c.(*testutils.ConnWrapper)
   219  	if _, err := conn.CloseCh.Receive(ctx); err != nil {
   220  		t.Fatalf("Connection to fallback server not closed once primary becomes ready: %v", err)
   221  	}
   222  	if err := waitForRPCsToReachBackend(ctx, client, backend1.Address); err != nil {
   223  		t.Fatal(err)
   224  	}
   225  
   226  	// Stop the primary servers. Since all xDS resources were received from the
   227  	// primary (and RPCs were succeeding to the clusters returned by the
   228  	// primary), we will not trigger fallback.
   229  	primaryLis.Stop()
   230  	sCtx, sCancel = context.WithTimeout(ctx, defaultTestShortTimeout)
   231  	defer sCancel()
   232  	if _, err := fallbackWrappedLis.NewConnCh.Receive(sCtx); err == nil {
   233  		t.Fatalf("Fallback attempted when not expected to. There are no uncached resources from the primary server at this point.")
   234  	}
   235  
   236  	// Ensure that RPCs still succeed, and that they use the configuration
   237  	// received from the primary.
   238  	if err := waitForRPCsToReachBackend(ctx, client, backend1.Address); err != nil {
   239  		t.Fatal(err)
   240  	}
   241  }
   242  
   243  // Tests fallback when the primary management server fails during an update.
   244  func (s) TestFallback_MidUpdate(t *testing.T) {
   245  	ctx, cancel := context.WithTimeout(context.Background(), defaultFallbackTestTimeout)
   246  	defer cancel()
   247  
   248  	// Create two listeners for the two management servers. The test can
   249  	// start/stop these listeners and can also get notified when the listener
   250  	// receives a connection request.
   251  	primaryWrappedLis := testutils.NewListenerWrapper(t, nil)
   252  	primaryLis := testutils.NewRestartableListener(primaryWrappedLis)
   253  	fallbackWrappedLis := testutils.NewListenerWrapper(t, nil)
   254  	fallbackLis := testutils.NewRestartableListener(fallbackWrappedLis)
   255  
   256  	// This boolean helps with triggering fallback mid update. When this boolean
   257  	// is set and the below defined cluster resource is requested, the primary
   258  	// management server shuts down the connection, forcing the client to
   259  	// fallback to the secondary server.
   260  	var closeConnOnMidUpdateClusterResource atomic.Bool
   261  	const (
   262  		serviceName              = "my-service-fallback-xds"
   263  		routeConfigName          = "route-" + serviceName
   264  		clusterName              = "cluster-" + serviceName
   265  		endpointsName            = "endpoints-" + serviceName
   266  		midUpdateRouteConfigName = "mid-update-route-" + serviceName
   267  		midUpdateClusterName     = "mid-update-cluster-" + serviceName
   268  		midUpdateEndpointsName   = "mid-update-endpoints-" + serviceName
   269  	)
   270  
   271  	// Start two management servers, primary and fallback, with the above
   272  	// listeners.
   273  	primaryManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   274  		Listener: primaryLis,
   275  		OnStreamRequest: func(id int64, req *v3discoverypb.DiscoveryRequest) error {
   276  			if closeConnOnMidUpdateClusterResource.Load() == false {
   277  				return nil
   278  			}
   279  			if req.GetTypeUrl() != version.V3ClusterURL {
   280  				return nil
   281  			}
   282  			for _, name := range req.GetResourceNames() {
   283  				if name == midUpdateClusterName {
   284  					primaryLis.Stop()
   285  					return fmt.Errorf("closing ADS stream because %q resource was requested", midUpdateClusterName)
   286  				}
   287  			}
   288  			return nil
   289  		},
   290  		AllowResourceSubset: true,
   291  	})
   292  	fallbackManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: fallbackLis})
   293  
   294  	// Start three test service backends.
   295  	backend1 := stubserver.StartTestService(t, nil)
   296  	defer backend1.Stop()
   297  	backend2 := stubserver.StartTestService(t, nil)
   298  	defer backend2.Stop()
   299  	backend3 := stubserver.StartTestService(t, nil)
   300  	defer backend3.Stop()
   301  
   302  	// Configure xDS resource on the primary management server, with a cluster
   303  	// resource that contains an endpoint for backend1.
   304  	nodeID := uuid.New().String()
   305  	primaryResources := e2e.UpdateOptions{
   306  		NodeID:    nodeID,
   307  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, routeConfigName)},
   308  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(routeConfigName, serviceName, clusterName)},
   309  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(clusterName, endpointsName, e2e.SecurityLevelNone)},
   310  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(endpointsName, "localhost", []uint32{testutils.ParsePort(t, backend1.Address)})},
   311  	}
   312  	if err := primaryManagementServer.Update(ctx, primaryResources); err != nil {
   313  		t.Fatal(err)
   314  	}
   315  
   316  	// Configure xDS resource on the secondary management server, with a cluster
   317  	// resource that contains an endpoint for backend2. Only the listener
   318  	// resource has the same name on both servers.
   319  	const (
   320  		fallbackRouteConfigName = "fallback-route-" + serviceName
   321  		fallbackClusterName     = "fallback-cluster-" + serviceName
   322  		fallbackEndpointsName   = "fallback-endpoints-" + serviceName
   323  	)
   324  	fallbackResources := e2e.UpdateOptions{
   325  		NodeID:    nodeID,
   326  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, fallbackRouteConfigName)},
   327  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(fallbackRouteConfigName, serviceName, fallbackClusterName)},
   328  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(fallbackClusterName, fallbackEndpointsName, e2e.SecurityLevelNone)},
   329  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(fallbackEndpointsName, "localhost", []uint32{testutils.ParsePort(t, backend2.Address)})},
   330  	}
   331  	if err := fallbackManagementServer.Update(ctx, fallbackResources); err != nil {
   332  		t.Fatal(err)
   333  	}
   334  
   335  	// Generate bootstrap configuration with the above two servers.
   336  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   337  		Servers: []byte(fmt.Sprintf(`[
   338  		{
   339  			"server_uri": %q,
   340  			"channel_creds": [{"type": "insecure"}]
   341  		},
   342  		{
   343  			"server_uri": %q,
   344  			"channel_creds": [{"type": "insecure"}]
   345  		}]`, primaryManagementServer.Address, fallbackManagementServer.Address)),
   346  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   347  	})
   348  	if err != nil {
   349  		t.Fatalf("Failed to create bootstrap file: %v", err)
   350  	}
   351  
   352  	// Create an xDS client with the above bootstrap configuration.
   353  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   354  	if err != nil {
   355  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   356  	}
   357  	pool := xdsclient.NewPool(config)
   358  	if err != nil {
   359  		t.Fatalf("Failed to create xDS client: %v", err)
   360  	}
   361  
   362  	// Get the xDS resolver to use the above xDS client.
   363  	resolverBuilder := internal.NewXDSResolverWithPoolForTesting.(func(*xdsclient.Pool) (resolver.Builder, error))
   364  	resolver, err := resolverBuilder(pool)
   365  	if err != nil {
   366  		t.Fatalf("Failed to create xDS resolver for testing: %v", err)
   367  	}
   368  
   369  	// Start a gRPC client that uses the above xDS resolver.
   370  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver))
   371  	if err != nil {
   372  		t.Fatalf("Failed to create gRPC client: %v", err)
   373  	}
   374  	defer cc.Close()
   375  	cc.Connect()
   376  
   377  	// Ensure that RPCs reach the cluster specified by the primary server and
   378  	// that no connection is attempted to the fallback server.
   379  	client := testgrpc.NewTestServiceClient(cc)
   380  	if err := waitForRPCsToReachBackend(ctx, client, backend1.Address); err != nil {
   381  		t.Fatal(err)
   382  	}
   383  	sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   384  	defer sCancel()
   385  	if _, err := fallbackWrappedLis.NewConnCh.Receive(sCtx); err != context.DeadlineExceeded {
   386  		t.Fatalf("Connection attempt made to fallback server when none expected: %v", err)
   387  	}
   388  
   389  	// Instruct the primary server to close the connection if below defined
   390  	// cluster resource is requested.
   391  	closeConnOnMidUpdateClusterResource.Store(true)
   392  
   393  	// Update the listener resource on the primary server to point to a new
   394  	// route configuration that points to a new cluster that points to a new
   395  	// endpoints resource that contains backend3.
   396  	primaryResources = e2e.UpdateOptions{
   397  		NodeID:    nodeID,
   398  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, midUpdateRouteConfigName)},
   399  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(midUpdateRouteConfigName, serviceName, midUpdateClusterName)},
   400  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(midUpdateClusterName, midUpdateEndpointsName, e2e.SecurityLevelNone)},
   401  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(midUpdateEndpointsName, "localhost", []uint32{testutils.ParsePort(t, backend3.Address)})},
   402  	}
   403  	if err := primaryManagementServer.Update(ctx, primaryResources); err != nil {
   404  		t.Fatal(err)
   405  	}
   406  
   407  	// Ensure that a connection is attempted to the fallback (because both
   408  	// conditions mentioned for fallback in A71 are satisfied: connectivity
   409  	// failure and a watcher for an uncached resource), and that RPCs are
   410  	// routed to the cluster returned by the fallback server.
   411  	c, err := fallbackWrappedLis.NewConnCh.Receive(ctx)
   412  	if err != nil {
   413  		t.Fatalf("Failure when waiting for a connection to be opened to the fallback management server: %v", err)
   414  	}
   415  	fallbackConn := c.(*testutils.ConnWrapper)
   416  	if err := waitForRPCsToReachBackend(ctx, client, backend2.Address); err != nil {
   417  		t.Fatal(err)
   418  	}
   419  
   420  	// Set the primary management server to not close the connection anymore if
   421  	// the mid-update cluster resource is requested, and get it to start serving
   422  	// again.
   423  	closeConnOnMidUpdateClusterResource.Store(false)
   424  	primaryLis.Restart()
   425  
   426  	// A new snapshot, with the same resources, is pushed to the management
   427  	// server to get it to respond for already requested resource names.
   428  	if err := primaryManagementServer.Update(ctx, primaryResources); err != nil {
   429  		t.Fatal(err)
   430  	}
   431  
   432  	// Ensure that RPCs reach the backend pointed to by the new cluster.
   433  	if err := waitForRPCsToReachBackend(ctx, client, backend3.Address); err != nil {
   434  		t.Fatal(err)
   435  	}
   436  
   437  	// Wait for the connection to the secondary to be closed since we have
   438  	// reverted back to the primary.
   439  	if _, err := fallbackConn.CloseCh.Receive(ctx); err != nil {
   440  		t.Fatalf("Connection to fallback server not closed once primary becomes ready: %v", err)
   441  	}
   442  }
   443  
   444  // Tests fallback when the primary management server fails during startup.
   445  func (s) TestFallback_MidStartup(t *testing.T) {
   446  	ctx, cancel := context.WithTimeout(context.Background(), defaultFallbackTestTimeout)
   447  	defer cancel()
   448  
   449  	// Create two listeners for the two management servers. The test can
   450  	// start/stop these listeners and can also get notified when the listener
   451  	// receives a connection request.
   452  	primaryWrappedLis := testutils.NewListenerWrapper(t, nil)
   453  	primaryLis := testutils.NewRestartableListener(primaryWrappedLis)
   454  	fallbackWrappedLis := testutils.NewListenerWrapper(t, nil)
   455  	fallbackLis := testutils.NewRestartableListener(fallbackWrappedLis)
   456  
   457  	// This boolean helps with triggering fallback during startup. When this
   458  	// boolean is set and a cluster resource is requested, the primary
   459  	// management server shuts down the connection, forcing the client to
   460  	// fallback to the secondary server.
   461  	var closeConnOnClusterResource atomic.Bool
   462  	closeConnOnClusterResource.Store(true)
   463  
   464  	// Start two management servers, primary and fallback, with the above
   465  	// listeners.
   466  	primaryManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   467  		Listener: primaryLis,
   468  		OnStreamRequest: func(id int64, req *v3discoverypb.DiscoveryRequest) error {
   469  			if closeConnOnClusterResource.Load() == false {
   470  				return nil
   471  			}
   472  			if req.GetTypeUrl() != version.V3ClusterURL {
   473  				return nil
   474  			}
   475  			primaryLis.Stop()
   476  			return fmt.Errorf("closing ADS stream because cluster resource was requested")
   477  		},
   478  		AllowResourceSubset: true,
   479  	})
   480  	fallbackManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: fallbackLis})
   481  
   482  	// Start two test service backends.
   483  	backend1 := stubserver.StartTestService(t, nil)
   484  	defer backend1.Stop()
   485  	backend2 := stubserver.StartTestService(t, nil)
   486  	defer backend2.Stop()
   487  
   488  	// Configure xDS resource on the primary management server, with a cluster
   489  	// resource that contains an endpoint for backend1.
   490  	nodeID := uuid.New().String()
   491  	const serviceName = "my-service-fallback-xds"
   492  	primaryResources := e2e.DefaultClientResources(e2e.ResourceParams{
   493  		DialTarget: serviceName,
   494  		NodeID:     nodeID,
   495  		Host:       "localhost",
   496  		Port:       testutils.ParsePort(t, backend1.Address),
   497  		SecLevel:   e2e.SecurityLevelNone,
   498  	})
   499  	if err := primaryManagementServer.Update(ctx, primaryResources); err != nil {
   500  		t.Fatal(err)
   501  	}
   502  
   503  	// Configure xDS resource on the secondary management server, with a cluster
   504  	// resource that contains an endpoint for backend2. Only the listener
   505  	// resource has the same name on both servers.
   506  	fallbackRouteConfigName := "fallback-route-" + serviceName
   507  	fallbackClusterName := "fallback-cluster-" + serviceName
   508  	fallbackEndpointsName := "fallback-endpoints-" + serviceName
   509  	fallbackResources := e2e.UpdateOptions{
   510  		NodeID:    nodeID,
   511  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, fallbackRouteConfigName)},
   512  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(fallbackRouteConfigName, serviceName, fallbackClusterName)},
   513  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(fallbackClusterName, fallbackEndpointsName, e2e.SecurityLevelNone)},
   514  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(fallbackEndpointsName, "localhost", []uint32{testutils.ParsePort(t, backend2.Address)})},
   515  	}
   516  	if err := fallbackManagementServer.Update(ctx, fallbackResources); err != nil {
   517  		t.Fatal(err)
   518  	}
   519  
   520  	// Generate bootstrap configuration with the above two servers.
   521  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   522  		Servers: []byte(fmt.Sprintf(`[
   523  		{
   524  			"server_uri": %q,
   525  			"channel_creds": [{"type": "insecure"}]
   526  		},
   527  		{
   528  			"server_uri": %q,
   529  			"channel_creds": [{"type": "insecure"}]
   530  		}]`, primaryManagementServer.Address, fallbackManagementServer.Address)),
   531  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   532  	})
   533  	if err != nil {
   534  		t.Fatalf("Failed to create bootstrap file: %v", err)
   535  	}
   536  
   537  	// Create an xDS client with the above bootstrap configuration.
   538  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   539  	if err != nil {
   540  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   541  	}
   542  	pool := xdsclient.NewPool(config)
   543  	if err != nil {
   544  		t.Fatalf("Failed to create xDS client: %v", err)
   545  	}
   546  
   547  	// Get the xDS resolver to use the above xDS client.
   548  	resolverBuilder := internal.NewXDSResolverWithPoolForTesting.(func(*xdsclient.Pool) (resolver.Builder, error))
   549  	resolver, err := resolverBuilder(pool)
   550  	if err != nil {
   551  		t.Fatalf("Failed to create xDS resolver for testing: %v", err)
   552  	}
   553  
   554  	// Start a gRPC client that uses the above xDS resolver.
   555  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver))
   556  	if err != nil {
   557  		t.Fatalf("Failed to create gRPC client: %v", err)
   558  	}
   559  	defer cc.Close()
   560  	cc.Connect()
   561  
   562  	// Ensure that a connection is attempted to the primary.
   563  	if _, err := primaryWrappedLis.NewConnCh.Receive(ctx); err != nil {
   564  		t.Fatalf("Failure when waiting for a connection to be opened to the primary management server: %v", err)
   565  	}
   566  
   567  	// Ensure that a connection is attempted to the fallback.
   568  	c, err := fallbackWrappedLis.NewConnCh.Receive(ctx)
   569  	if err != nil {
   570  		t.Fatalf("Failure when waiting for a connection to be opened to the secondary management server: %v", err)
   571  	}
   572  	fallbackConn := c.(*testutils.ConnWrapper)
   573  
   574  	// Ensure that RPCs are routed to the cluster returned by the fallback
   575  	// management server.
   576  	client := testgrpc.NewTestServiceClient(cc)
   577  	if err := waitForRPCsToReachBackend(ctx, client, backend2.Address); err != nil {
   578  		t.Fatal(err)
   579  	}
   580  
   581  	// Get the primary management server to no longer close the connection when
   582  	// the cluster resource is requested.
   583  	closeConnOnClusterResource.Store(false)
   584  	primaryLis.Restart()
   585  
   586  	// A new snapshot, with the same resources, is pushed to the management
   587  	// server to get it to respond for already requested resource names.
   588  	if err := primaryManagementServer.Update(ctx, primaryResources); err != nil {
   589  		t.Fatal(err)
   590  	}
   591  
   592  	// Ensure that RPCs are routed to the cluster returned by the primary
   593  	// management server.
   594  	if err := waitForRPCsToReachBackend(ctx, client, backend1.Address); err != nil {
   595  		t.Fatal(err)
   596  	}
   597  
   598  	// Wait for the connection to the secondary to be closed since we have
   599  	// reverted back to the primary.
   600  	if _, err := fallbackConn.CloseCh.Receive(ctx); err != nil {
   601  		t.Fatalf("Connection to fallback server not closed once primary becomes ready: %v", err)
   602  	}
   603  }
   604  
   605  // Tests that RPCs succeed at startup when the primary management server is
   606  // down, but the secondary is available.
   607  func (s) TestFallback_OnStartup_RPCSuccess(t *testing.T) {
   608  	ctx, cancel := context.WithTimeout(context.Background(), defaultFallbackTestTimeout)
   609  	defer cancel()
   610  
   611  	// Create two listeners for the two management servers. The test can
   612  	// start/stop these listeners.
   613  	l, err := testutils.LocalTCPListener()
   614  	if err != nil {
   615  		t.Fatalf("Failed to create listener: %v", err)
   616  	}
   617  	primaryLis := testutils.NewRestartableListener(l)
   618  	l, err = testutils.LocalTCPListener()
   619  	if err != nil {
   620  		t.Fatalf("Failed to create listener: %v", err)
   621  	}
   622  	fallbackLis := testutils.NewRestartableListener(l)
   623  
   624  	// Start two management servers, primary and fallback, with the above
   625  	// listeners.
   626  	primaryManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: primaryLis})
   627  	fallbackManagementServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: fallbackLis})
   628  
   629  	// Start two test service backends.
   630  	backend1 := stubserver.StartTestService(t, nil)
   631  	defer backend1.Stop()
   632  	backend2 := stubserver.StartTestService(t, nil)
   633  	defer backend2.Stop()
   634  
   635  	// Configure xDS resource on the primary management server, with a cluster
   636  	// resource that contains an endpoint for backend1.
   637  	nodeID := uuid.New().String()
   638  	const serviceName = "my-service-fallback-xds"
   639  	resources := e2e.DefaultClientResources(e2e.ResourceParams{
   640  		DialTarget: serviceName,
   641  		NodeID:     nodeID,
   642  		Host:       "localhost",
   643  		Port:       testutils.ParsePort(t, backend1.Address),
   644  		SecLevel:   e2e.SecurityLevelNone,
   645  	})
   646  	if err := primaryManagementServer.Update(ctx, resources); err != nil {
   647  		t.Fatal(err)
   648  	}
   649  
   650  	// Configure xDS resource on the secondary management server, with a cluster
   651  	// resource that contains an endpoint for backend2. Only the listener
   652  	// resource has the same name on both servers.
   653  	fallbackRouteConfigName := "fallback-route-" + serviceName
   654  	fallbackClusterName := "fallback-cluster-" + serviceName
   655  	fallbackEndpointsName := "fallback-endpoints-" + serviceName
   656  	resources = e2e.UpdateOptions{
   657  		NodeID:    nodeID,
   658  		Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(serviceName, fallbackRouteConfigName)},
   659  		Routes:    []*v3routepb.RouteConfiguration{e2e.DefaultRouteConfig(fallbackRouteConfigName, serviceName, fallbackClusterName)},
   660  		Clusters:  []*v3clusterpb.Cluster{e2e.DefaultCluster(fallbackClusterName, fallbackEndpointsName, e2e.SecurityLevelNone)},
   661  		Endpoints: []*v3endpointpb.ClusterLoadAssignment{e2e.DefaultEndpoint(fallbackEndpointsName, "localhost", []uint32{testutils.ParsePort(t, backend2.Address)})},
   662  	}
   663  	if err := fallbackManagementServer.Update(ctx, resources); err != nil {
   664  		t.Fatal(err)
   665  	}
   666  
   667  	// Shutdown the primary management server before starting the gRPC client to
   668  	// trigger fallback on startup.
   669  	primaryLis.Stop()
   670  
   671  	// Generate bootstrap configuration with the above two servers.
   672  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   673  		Servers: []byte(fmt.Sprintf(`[
   674  		{
   675  			"server_uri": %q,
   676  			"channel_creds": [{"type": "insecure"}]
   677  		},
   678  		{
   679  			"server_uri": %q,
   680  			"channel_creds": [{"type": "insecure"}]
   681  		}]`, primaryManagementServer.Address, fallbackManagementServer.Address)),
   682  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   683  	})
   684  	if err != nil {
   685  		t.Fatalf("Failed to create bootstrap file: %v", err)
   686  	}
   687  
   688  	// Create an xDS client with the above bootstrap configuration.
   689  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   690  	if err != nil {
   691  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   692  	}
   693  	pool := xdsclient.NewPool(config)
   694  	if err != nil {
   695  		t.Fatalf("Failed to create xDS client: %v", err)
   696  	}
   697  
   698  	// Get the xDS resolver to use the above xDS client.
   699  	resolverBuilder := internal.NewXDSResolverWithPoolForTesting.(func(*xdsclient.Pool) (resolver.Builder, error))
   700  	resolver, err := resolverBuilder(pool)
   701  	if err != nil {
   702  		t.Fatalf("Failed to create xDS resolver for testing: %v", err)
   703  	}
   704  
   705  	// Start a gRPC client that uses the above xDS resolver.
   706  	cc, err := grpc.NewClient(fmt.Sprintf("xds:///%s", serviceName), grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithResolvers(resolver))
   707  	if err != nil {
   708  		t.Fatalf("Failed to create gRPC client: %v", err)
   709  	}
   710  	defer cc.Close()
   711  
   712  	// Make an RPC (without the `wait_for_ready` call option) and expect it to
   713  	// succeed since the fallback management server is up and running.
   714  	client := testgrpc.NewTestServiceClient(cc)
   715  	var peer peer.Peer
   716  	if _, err := client.EmptyCall(ctx, &testpb.Empty{}, grpc.Peer(&peer)); err != nil {
   717  		t.Fatalf("EmptyCall() failed: %v", err)
   718  	}
   719  	if got, want := peer.Addr.String(), backend2.Address; got != want {
   720  		t.Fatalf("Unexpected peer address: got %q, want %q", got, want)
   721  	}
   722  
   723  	// Start the primary server. It can take a while before the xDS client
   724  	// notices this, since the ADS stream implementation uses a backoff before
   725  	// retrying the stream.
   726  	primaryLis.Restart()
   727  	if err := waitForRPCsToReachBackend(ctx, client, backend1.Address); err != nil {
   728  		t.Fatal(err)
   729  	}
   730  }