google.golang.org/grpc@v1.74.2/xds/internal/clients/xdsclient/test/ads_stream_backoff_test.go

google.golang.org/grpc@v1.74.2/xds/internal/clients/xdsclient/test/ads_stream_backoff_test.go (about)

     1  /*
     2   *
     3   * Copyright 2024 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xdsclient_test
    20  
    21  import (
    22  	"context"
    23  	"errors"
    24  	"fmt"
    25  	"net"
    26  	"testing"
    27  	"time"
    28  
    29  	"google.golang.org/grpc/credentials/insecure"
    30  	"google.golang.org/grpc/xds/internal/clients/grpctransport"
    31  	"google.golang.org/grpc/xds/internal/clients/internal/testutils"
    32  	"google.golang.org/grpc/xds/internal/clients/internal/testutils/e2e"
    33  	"google.golang.org/grpc/xds/internal/clients/xdsclient"
    34  	xdsclientinternal "google.golang.org/grpc/xds/internal/clients/xdsclient/internal"
    35  	"google.golang.org/grpc/xds/internal/clients/xdsclient/internal/xdsresource"
    36  	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource/version"
    37  	"google.golang.org/protobuf/testing/protocmp"
    38  
    39  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    40  	v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
    41  	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    42  	"github.com/google/go-cmp/cmp"
    43  	"github.com/google/go-cmp/cmp/cmpopts"
    44  	"github.com/google/uuid"
    45  )
    46  
    47  func overrideStreamBackOff(t *testing.T, streamBackOff func(int) time.Duration) {
    48  	originalStreamBackoff := xdsclientinternal.StreamBackoff
    49  	xdsclientinternal.StreamBackoff = streamBackOff
    50  	t.Cleanup(func() { xdsclientinternal.StreamBackoff = originalStreamBackoff })
    51  }
    52  
    53  // Creates an xDS client with the given management server address, nodeID and backoff function.
    54  func createXDSClientWithBackoff(t *testing.T, mgmtServerAddress string, nodeID string, streamBackoff func(int) time.Duration) *xdsclient.XDSClient {
    55  	t.Helper()
    56  	overrideStreamBackOff(t, streamBackoff)
    57  	configs := map[string]grpctransport.Config{"insecure": {Credentials: insecure.NewBundle()}}
    58  	return createXDSClient(t, mgmtServerAddress, nodeID, grpctransport.NewBuilder(configs))
    59  }
    60  
    61  // Tests the case where the management server returns an error in the ADS
    62  // streaming RPC. Verifies that the ADS stream is restarted after a backoff
    63  // period, and that the previously requested resources are re-requested on the
    64  // new stream.
    65  func (s) TestADS_BackoffAfterStreamFailure(t *testing.T) {
    66  	// Channels used for verifying different events in the test.
    67  	streamCloseCh := make(chan struct{}, 1)  // ADS stream is closed.
    68  	ldsResourcesCh := make(chan []string, 1) // Listener resource names in the discovery request.
    69  	backoffCh := make(chan struct{}, 1)      // Backoff after stream failure.
    70  
    71  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
    72  	defer cancel()
    73  
    74  	// Create an xDS management server that returns RPC errors.
    75  	streamErr := errors.New("ADS stream error")
    76  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
    77  		OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error {
    78  			// Push the requested resource names on to a channel.
    79  			if req.GetTypeUrl() == version.V3ListenerURL {
    80  				t.Logf("Received LDS request for resources: %v", req.GetResourceNames())
    81  				select {
    82  				case ldsResourcesCh <- req.GetResourceNames():
    83  				case <-ctx.Done():
    84  				}
    85  			}
    86  			// Return an error everytime a request is sent on the stream. This
    87  			// should cause the transport to backoff before attempting to
    88  			// recreate the stream.
    89  			return streamErr
    90  		},
    91  		// Push on a channel whenever the stream is closed.
    92  		OnStreamClosed: func(int64, *v3corepb.Node) {
    93  			select {
    94  			case streamCloseCh <- struct{}{}:
    95  			case <-ctx.Done():
    96  			}
    97  		},
    98  	})
    99  
   100  	// Override the backoff implementation to push on a channel that is read by
   101  	// the test goroutine.
   102  	backoffCtx, backoffCancel := context.WithCancel(ctx)
   103  	streamBackoff := func(int) time.Duration {
   104  		select {
   105  		case backoffCh <- struct{}{}:
   106  		case <-backoffCtx.Done():
   107  		}
   108  		return 0
   109  	}
   110  	defer backoffCancel()
   111  
   112  	// Create an xDS client with bootstrap pointing to the above server.
   113  	nodeID := uuid.New().String()
   114  	client := createXDSClientWithBackoff(t, mgmtServer.Address, nodeID, streamBackoff)
   115  
   116  	// Register a watch for a listener resource.
   117  	const listenerName = "listener"
   118  	lw := newListenerWatcher()
   119  	ldsCancel := client.WatchResource(xdsresource.V3ListenerURL, listenerName, lw)
   120  	defer ldsCancel()
   121  
   122  	// Verify that an ADS stream is created and an LDS request with the above
   123  	// resource name is sent.
   124  	if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerName}); err != nil {
   125  		t.Fatal(err)
   126  	}
   127  
   128  	// Verify that the received stream error is reported to the watcher.
   129  	if err := verifyListenerResourceError(ctx, lw.resourceErrCh, streamErr.Error(), nodeID); err != nil {
   130  		t.Fatal(err)
   131  	}
   132  
   133  	// Verify that the stream is closed.
   134  	select {
   135  	case <-streamCloseCh:
   136  	case <-ctx.Done():
   137  		t.Fatalf("Timeout waiting for stream to be closed after an error")
   138  	}
   139  
   140  	// Verify that the ADS stream backs off before recreating the stream.
   141  	select {
   142  	case <-backoffCh:
   143  	case <-ctx.Done():
   144  		t.Fatalf("Timeout waiting for ADS stream to backoff after stream failure")
   145  	}
   146  
   147  	// Verify that the same resource name is re-requested on the new stream.
   148  	if err := waitForResourceNames(ctx, t, ldsResourcesCh, []string{listenerName}); err != nil {
   149  		t.Fatal(err)
   150  	}
   151  
   152  	// To prevent indefinite blocking during xDS client close, which is caused
   153  	// by a blocking backoff channel write, cancel the backoff context early
   154  	// given that the test is complete.
   155  	backoffCancel()
   156  
   157  }
   158  
   159  // Tests the case where a stream breaks because the server goes down. Verifies
   160  // that when the server comes back up, the same resources are re-requested, this
   161  // time with the previously acked version and an empty nonce.
   162  func (s) TestADS_RetriesAfterBrokenStream(t *testing.T) {
   163  	// Channels used for verifying different events in the test.
   164  	streamRequestCh := make(chan *v3discoverypb.DiscoveryRequest, 1)   // Discovery request is received.
   165  	streamResponseCh := make(chan *v3discoverypb.DiscoveryResponse, 1) // Discovery response is received.
   166  
   167  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   168  	defer cancel()
   169  
   170  	// Create an xDS management server listening on a local port.
   171  	l, err := net.Listen("tcp", "localhost:0")
   172  	if err != nil {
   173  		t.Fatalf("net.Listen() failed: %v", err)
   174  	}
   175  	lis := testutils.NewRestartableListener(l)
   176  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   177  		Listener: lis,
   178  		// Push the received request on to a channel for the test goroutine to
   179  		// verify that it matches expectations.
   180  		OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error {
   181  			select {
   182  			case streamRequestCh <- req:
   183  			case <-ctx.Done():
   184  			}
   185  			return nil
   186  		},
   187  		// Push the response that the management server is about to send on to a
   188  		// channel. The test goroutine to uses this to extract the version and
   189  		// nonce, expected on subsequent requests.
   190  		OnStreamResponse: func(_ context.Context, _ int64, _ *v3discoverypb.DiscoveryRequest, resp *v3discoverypb.DiscoveryResponse) {
   191  			select {
   192  			case streamResponseCh <- resp:
   193  			case <-ctx.Done():
   194  			}
   195  		},
   196  	})
   197  
   198  	// Create a listener resource on the management server.
   199  	const listenerName = "listener"
   200  	const routeConfigName = "route-config"
   201  	nodeID := uuid.New().String()
   202  	resources := e2e.UpdateOptions{
   203  		NodeID:         nodeID,
   204  		Listeners:      []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerName, routeConfigName)},
   205  		SkipValidation: true,
   206  	}
   207  	if err := mgmtServer.Update(ctx, resources); err != nil {
   208  		t.Fatal(err)
   209  	}
   210  
   211  	// Override the backoff implementation to always return 0, to reduce test
   212  	// run time. Instead control when the backoff returns by blocking on a
   213  	// channel, that the test closes.
   214  	backoffCh := make(chan struct{})
   215  	streamBackoff := func(int) time.Duration {
   216  		select {
   217  		case backoffCh <- struct{}{}:
   218  		case <-ctx.Done():
   219  		}
   220  		return 0
   221  	}
   222  
   223  	// Create an xDS client pointing to the above server.
   224  	client := createXDSClientWithBackoff(t, mgmtServer.Address, nodeID, streamBackoff)
   225  
   226  	// Register a watch for a listener resource.
   227  	lw := newListenerWatcher()
   228  	ldsCancel := client.WatchResource(xdsresource.V3ListenerURL, listenerName, lw)
   229  	defer ldsCancel()
   230  
   231  	// Verify that the initial discovery request matches expectation.
   232  	var gotReq *v3discoverypb.DiscoveryRequest
   233  	select {
   234  	case gotReq = <-streamRequestCh:
   235  	case <-ctx.Done():
   236  		t.Fatalf("Timeout waiting for discovery request on the stream")
   237  	}
   238  	wantReq := &v3discoverypb.DiscoveryRequest{
   239  		VersionInfo: "",
   240  		Node: &v3corepb.Node{
   241  			Id:                   nodeID,
   242  			UserAgentName:        "user-agent",
   243  			UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: "0.0.0.0"},
   244  			ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw"},
   245  		},
   246  		ResourceNames: []string{listenerName},
   247  		TypeUrl:       "type.googleapis.com/envoy.config.listener.v3.Listener",
   248  		ResponseNonce: "",
   249  	}
   250  	if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" {
   251  		t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff)
   252  	}
   253  
   254  	// Capture the version and nonce from the response.
   255  	var gotResp *v3discoverypb.DiscoveryResponse
   256  	select {
   257  	case gotResp = <-streamResponseCh:
   258  	case <-ctx.Done():
   259  		t.Fatalf("Timeout waiting for discovery response on the stream")
   260  	}
   261  	version := gotResp.GetVersionInfo()
   262  	nonce := gotResp.GetNonce()
   263  
   264  	// Verify that the ACK contains the appropriate version and nonce.
   265  	wantReq.VersionInfo = version
   266  	wantReq.ResponseNonce = nonce
   267  	select {
   268  	case gotReq = <-streamRequestCh:
   269  	case <-ctx.Done():
   270  		t.Fatalf("Timeout waiting for the discovery request ACK on the stream")
   271  	}
   272  	if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" {
   273  		t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff)
   274  	}
   275  
   276  	// Verify the update received by the watcher.
   277  	wantUpdate := listenerUpdateErrTuple{
   278  		update: listenerUpdate{
   279  			RouteConfigName: routeConfigName},
   280  	}
   281  	if err := verifyListenerUpdate(ctx, lw.updateCh, wantUpdate); err != nil {
   282  		t.Fatal(err)
   283  	}
   284  
   285  	// Bring down the management server to simulate a broken stream.
   286  	lis.Stop()
   287  
   288  	// Verify that the error callback on the watcher is not invoked.
   289  	verifyNoListenerUpdate(ctx, lw.updateCh)
   290  
   291  	// Wait for backoff to kick in, and unblock the first backoff attempt.
   292  	select {
   293  	case <-backoffCh:
   294  	case <-ctx.Done():
   295  		t.Fatal("Timeout waiting for stream backoff")
   296  	}
   297  
   298  	// Bring up the management server. The test does not have prcecise control
   299  	// over when new streams to the management server will start succeeding. The
   300  	// ADS stream implementation will backoff as many times as required before
   301  	// it can successfully create a new stream. Therefore, we need to receive on
   302  	// the backoffCh as many times as required, and unblock the backoff
   303  	// implementation.
   304  	lis.Restart()
   305  	go func() {
   306  		for {
   307  			select {
   308  			case <-backoffCh:
   309  			case <-ctx.Done():
   310  				return
   311  			}
   312  		}
   313  	}()
   314  
   315  	// Verify that the transport creates a new stream and sends out a new
   316  	// request which contains the previously acked version, but an empty nonce.
   317  	wantReq.ResponseNonce = ""
   318  	select {
   319  	case gotReq = <-streamRequestCh:
   320  	case <-ctx.Done():
   321  		t.Fatalf("Timeout waiting for the discovery request ACK on the stream")
   322  	}
   323  	if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" {
   324  		t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff)
   325  	}
   326  }
   327  
   328  // Tests the case where a resource is requested before the a valid ADS stream
   329  // exists. Verifies that the a discovery request is sent out for the previously
   330  // requested resource once a valid stream is created.
   331  func (s) TestADS_ResourceRequestedBeforeStreamCreation(t *testing.T) {
   332  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   333  	defer cancel()
   334  
   335  	// Channels used for verifying different events in the test.
   336  	streamRequestCh := make(chan *v3discoverypb.DiscoveryRequest, 1) // Discovery request is received.
   337  
   338  	// Create an xDS management server listening on a local port.
   339  	l, err := net.Listen("tcp", "localhost:0")
   340  	if err != nil {
   341  		t.Fatalf("net.Listen() failed: %v", err)
   342  	}
   343  	lis := testutils.NewRestartableListener(l)
   344  	streamErr := errors.New("ADS stream error")
   345  
   346  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   347  		Listener: lis,
   348  
   349  		// Return an error everytime a request is sent on the stream. This
   350  		// should cause the transport to backoff before attempting to recreate
   351  		// the stream.
   352  		OnStreamRequest: func(_ int64, req *v3discoverypb.DiscoveryRequest) error {
   353  			select {
   354  			case streamRequestCh <- req:
   355  			default:
   356  			}
   357  			return streamErr
   358  		},
   359  	})
   360  
   361  	// Bring down the management server before creating the transport. This
   362  	// allows us to test the case where SendRequest() is called when there is no
   363  	// stream to the management server.
   364  	lis.Stop()
   365  
   366  	// Override the backoff implementation to always return 0, to reduce test
   367  	// run time. Instead control when the backoff returns by blocking on a
   368  	// channel, that the test closes.
   369  	backoffCh := make(chan struct{}, 1)
   370  	unblockBackoffCh := make(chan struct{})
   371  	streamBackoff := func(int) time.Duration {
   372  		select {
   373  		case backoffCh <- struct{}{}:
   374  		default:
   375  		}
   376  		<-unblockBackoffCh
   377  		return 0
   378  	}
   379  
   380  	// Create an xDS client with bootstrap pointing to the above server.
   381  	nodeID := uuid.New().String()
   382  	client := createXDSClientWithBackoff(t, mgmtServer.Address, nodeID, streamBackoff)
   383  
   384  	// Register a watch for a listener resource.
   385  	const listenerName = "listener"
   386  	lw := newListenerWatcher()
   387  	ldsCancel := client.WatchResource(xdsresource.V3ListenerURL, listenerName, lw)
   388  	defer ldsCancel()
   389  
   390  	// The above watch results in an attempt to create a new stream, which will
   391  	// fail, and will result in backoff. Wait for backoff to kick in.
   392  	select {
   393  	case <-backoffCh:
   394  	case <-ctx.Done():
   395  		t.Fatal("Timeout waiting for stream backoff")
   396  	}
   397  
   398  	// Bring up the connection to the management server, and unblock the backoff
   399  	// implementation.
   400  	lis.Restart()
   401  	close(unblockBackoffCh)
   402  
   403  	// Verify that the initial discovery request matches expectation.
   404  	var gotReq *v3discoverypb.DiscoveryRequest
   405  	select {
   406  	case gotReq = <-streamRequestCh:
   407  	case <-ctx.Done():
   408  		t.Fatalf("Timeout waiting for discovery request on the stream")
   409  	}
   410  	wantReq := &v3discoverypb.DiscoveryRequest{
   411  		VersionInfo: "",
   412  		Node: &v3corepb.Node{
   413  			Id:                   nodeID,
   414  			UserAgentName:        "user-agent",
   415  			UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: "0.0.0.0"},
   416  			ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw"},
   417  		},
   418  		ResourceNames: []string{listenerName},
   419  		TypeUrl:       "type.googleapis.com/envoy.config.listener.v3.Listener",
   420  		ResponseNonce: "",
   421  	}
   422  	if diff := cmp.Diff(gotReq, wantReq, protocmp.Transform()); diff != "" {
   423  		t.Fatalf("Unexpected diff in received discovery request, diff (-got, +want):\n%s", diff)
   424  	}
   425  }
   426  
   427  // waitForResourceNames waits for the wantNames to be received on namesCh.
   428  // Returns a non-nil error if the context expires before that.
   429  func waitForResourceNames(ctx context.Context, t *testing.T, namesCh chan []string, wantNames []string) error {
   430  	t.Helper()
   431  
   432  	var lastRequestedNames []string
   433  	for ; ; <-time.After(defaultTestShortTimeout) {
   434  		select {
   435  		case <-ctx.Done():
   436  			return fmt.Errorf("timeout waiting for resources %v to be requested from the management server. Last requested resources: %v", wantNames, lastRequestedNames)
   437  		case gotNames := <-namesCh:
   438  			if cmp.Equal(gotNames, wantNames, cmpopts.EquateEmpty(), cmpopts.SortSlices(func(s1, s2 string) bool { return s1 < s2 })) {
   439  				return nil
   440  			}
   441  			lastRequestedNames = gotNames
   442  		}
   443  	}
   444  }