google.golang.org/grpc@v1.74.2/xds/internal/xdsclient/tests/loadreport_test.go (about)

     1  /*
     2   *
     3   * Copyright 2024 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xdsclient_test
    20  
    21  import (
    22  	"context"
    23  	"encoding/json"
    24  	"fmt"
    25  	"net"
    26  	"testing"
    27  
    28  	"github.com/google/go-cmp/cmp"
    29  	"github.com/google/go-cmp/cmp/cmpopts"
    30  	"github.com/google/uuid"
    31  	"google.golang.org/grpc"
    32  	"google.golang.org/grpc/codes"
    33  	"google.golang.org/grpc/internal/testutils"
    34  	"google.golang.org/grpc/internal/testutils/xds/e2e"
    35  	"google.golang.org/grpc/internal/testutils/xds/fakeserver"
    36  	"google.golang.org/grpc/internal/xds/bootstrap"
    37  	"google.golang.org/grpc/status"
    38  	"google.golang.org/grpc/xds/internal/clients"
    39  	"google.golang.org/protobuf/testing/protocmp"
    40  
    41  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    42  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    43  	v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
    44  	"google.golang.org/protobuf/types/known/durationpb"
    45  )
    46  
        // Metric keys used when pushing named server loads in these tests.
    47  const (
    48  	testKey1 = "test-key1"
    49  	testKey2 = "test-key2"
    50  )
    51  
        // Fixtures shared by the load reporting tests below.
    52  var (
    53  	testLocality1     = clients.Locality{Region: "test-region1"}
    54  	testLocality2     = clients.Locality{Region: "test-region2"}
        	// Tolerate tiny floating point differences when comparing load metric
        	// values (e.g. aggregated TotalMetricValue).
    55  	toleranceCmpOpt   = cmpopts.EquateApprox(0, 1e-5)
        	// Compare the upstream_locality_stats field of ClusterStats without
        	// regard to element order, by sorting entries on their string form.
    56  	ignoreOrderCmpOpt = protocmp.FilterField(&v3endpointpb.ClusterStats{}, "upstream_locality_stats",
    57  		cmpopts.SortSlices(func(a, b protocmp.Message) bool {
    58  			return a.String() < b.String()
    59  		}),
    60  	)
    61  )
    62  
        // wrappedListener wraps a net.Listener and signals every accepted
        // connection on newConnChan, letting tests observe connection attempts.
    63  type wrappedListener struct {
    64  	net.Listener
    65  	newConnChan *testutils.Channel // Connection attempts are pushed here.
    66  }
    67  
    68  func (wl *wrappedListener) Accept() (net.Conn, error) {
    69  	c, err := wl.Listener.Accept()
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	wl.newConnChan.Send(struct{}{})
    74  	return c, err
    75  }
    76  
    77  // Tests a load reporting scenario where the xDS client is reporting loads to
    78  // multiple servers. Verifies the following:
    79  //   - calling the load reporting API with different server configuration
    80  //     results in connections being created to those corresponding servers
    81  //   - the same load.Store is not returned when the load reporting API called
    82  //     with different server configurations
    83  //   - canceling the load reporting from the client results in the LRS stream
    84  //     being canceled on the server
    85  func (s) TestReportLoad_ConnectionCreation(t *testing.T) {
    86  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
    87  	defer cancel()
    88  
    89  	// Create two management servers that also serve LRS.
    90  	l, err := testutils.LocalTCPListener()
    91  	if err != nil {
    92  		t.Fatalf("Failed to create a local TCP listener: %v", err)
    93  	}
    94  	newConnChan1 := testutils.NewChannel()
    95  	lis1 := &wrappedListener{
    96  		Listener:    l,
    97  		newConnChan: newConnChan1,
    98  	}
    99  	mgmtServer1 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   100  		Listener:                    lis1,
   101  		SupportLoadReportingService: true,
   102  	})
   103  	l, err = testutils.LocalTCPListener()
   104  	if err != nil {
   105  		t.Fatalf("Failed to create a local TCP listener: %v", err)
   106  	}
   107  	newConnChan2 := testutils.NewChannel()
   108  	lis2 := &wrappedListener{
   109  		Listener:    l,
   110  		newConnChan: newConnChan2,
   111  	}
   112  	mgmtServer2 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   113  		Listener:                    lis2,
   114  		SupportLoadReportingService: true,
   115  	})
   116  
   117  	// Create an xDS client with a bootstrap configuration that contains both of
   118  	// the above two servers. The authority name is immaterial here since load
   119  	// reporting is per-server and not per-authority.
   120  	nodeID := uuid.New().String()
   121  	bc, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   122  		Servers: []byte(fmt.Sprintf(`[{
   123  			"server_uri": %q,
   124  			"channel_creds": [{"type": "insecure"}]
   125  		}]`, mgmtServer1.Address)),
   126  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   127  		Authorities: map[string]json.RawMessage{
   128  			"test-authority": []byte(fmt.Sprintf(`{
   129  				"xds_servers": [{
   130  					"server_uri": %q,
   131  					"channel_creds": [{"type": "insecure"}]
   132  				}]}`, mgmtServer2.Address)),
   133  		},
   134  	})
   135  	if err != nil {
   136  		t.Fatalf("Failed to create bootstrap configuration: %v", err)
   137  	}
   138  	client := createXDSClient(t, bc)
   139  
   140  	serverCfg1, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer1.Address})
   141  	if err != nil {
   142  		t.Fatalf("Failed to create server config for testing: %v", err)
   143  	}
   144  	// Call the load reporting API to report load to the first management
   145  	// server, and ensure that a connection to the server is created.
   146  	store1, lrsCancel1 := client.ReportLoad(serverCfg1)
        	// The deferred stream cancellation is bounded by a short deadline so
        	// that test cleanup cannot hang.
   147  	sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   148  	defer sCancel()
   149  	defer lrsCancel1(sCtx)
   150  	if _, err := newConnChan1.Receive(ctx); err != nil {
   151  		t.Fatal("Timeout when waiting for a connection to the first management server, after starting load reporting")
   152  	}
   153  	if _, err := mgmtServer1.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil {
   154  		t.Fatal("Timeout when waiting for LRS stream to be created")
   155  	}
   156  
   157  	serverCfg2, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer2.Address})
   158  	if err != nil {
   159  		t.Fatalf("Failed to create server config for testing: %v", err)
   160  	}
   161  	// Call the load reporting API to report load to the second management
   162  	// server, and ensure that a connection to the server is created.
   163  	store2, lrsCancel2 := client.ReportLoad(serverCfg2)
   164  	sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout)
   165  	defer sCancel2()
   166  	defer lrsCancel2(sCtx2)
   167  	if _, err := newConnChan2.Receive(ctx); err != nil {
   168  		t.Fatal("Timeout when waiting for a connection to the second management server, after starting load reporting")
   169  	}
   170  	if _, err := mgmtServer2.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil {
   171  		t.Fatal("Timeout when waiting for LRS stream to be created")
   172  	}
   173  
        	// Different server configurations must yield independent load stores.
   174  	if store1 == store2 {
   175  		t.Fatalf("Got same store for different servers, want different")
   176  	}
   177  
   178  	// Push some loads on the received store.
   179  	store2.ReporterForCluster("cluster", "eds").CallDropped("test")
   180  
   181  	// Ensure the initial load reporting request is received at the server.
   182  	lrsServer := mgmtServer2.LRSServer
   183  	req, err := lrsServer.LRSRequestChan.Receive(ctx)
   184  	if err != nil {
   185  		t.Fatalf("Timeout when waiting for initial LRS request: %v", err)
   186  	}
   187  	gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest)
        	// The initial LRS request is expected to carry the node proto,
        	// including the client features advertised by the xDS client.
   188  	nodeProto := &v3corepb.Node{
   189  		Id:                   nodeID,
   190  		UserAgentName:        "gRPC Go",
   191  		UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version},
   192  		ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"},
   193  	}
   194  	wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto}
   195  	if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" {
   196  		t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff)
   197  	}
   198  
   199  	// Send a response from the server with a small deadline.
   200  	lrsServer.LRSResponseChan <- &fakeserver.Response{
   201  		Resp: &v3lrspb.LoadStatsResponse{
   202  			SendAllClusters:       true,
   203  			LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms
   204  		},
   205  	}
   206  
   207  	// Ensure that loads are seen on the server.
   208  	req, err = lrsServer.LRSRequestChan.Receive(ctx)
   209  	if err != nil {
   210  		t.Fatalf("Timeout when waiting for LRS request with loads: %v", err)
   211  	}
   212  	gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
   213  	if l := len(gotLoad); l != 1 {
   214  		t.Fatalf("Received load for %d clusters, want 1", l)
   215  	}
   216  
   217  	// This field is set by the client to indicate the actual time elapsed since
   218  	// the last report was sent. We cannot deterministically compare this, and
   219  	// we cannot use the cmpopts.IgnoreFields() option on proto structs, since
   220  	// we already use the protocmp.Transform() which marshals the struct into
   221  	// another message. Hence setting this field to nil is the best option here.
   222  	gotLoad[0].LoadReportInterval = nil
   223  	wantLoad := &v3endpointpb.ClusterStats{
   224  		ClusterName:          "cluster",
   225  		ClusterServiceName:   "eds",
   226  		TotalDroppedRequests: 1,
   227  		DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
   228  	}
   229  	if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" {
   230  		t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
   231  	}
   232  
   233  	// Cancel this load reporting stream, server should see error canceled.
   234  	sCtx2, sCancel2 = context.WithTimeout(ctx, defaultTestShortTimeout)
   235  	defer sCancel2()
   236  	lrsCancel2(sCtx2)
   237  
   238  	// Server should receive a stream canceled error. There may be additional
   239  	// load reports from the client in the channel.
   240  	for {
   241  		if ctx.Err() != nil {
   242  			t.Fatal("Timeout when waiting for the LRS stream to be canceled on the server")
   243  		}
   244  		u, err := lrsServer.LRSRequestChan.Receive(ctx)
   245  		if err != nil {
   246  			continue
   247  		}
   248  		// Ignore load reports sent before the stream was cancelled.
   249  		if u.(*fakeserver.Request).Err == nil {
   250  			continue
   251  		}
   252  		if status.Code(u.(*fakeserver.Request).Err) != codes.Canceled {
   253  			t.Fatalf("Unexpected LRS request: %v, want error canceled", u)
   254  		}
   255  		break
   256  	}
   257  }
   258  
   259  // Tests a load reporting scenario where the load reporting API is called
   260  // multiple times for the same server. The test verifies the following:
   261  //   - calling the load reporting API the second time for the same server
   262  //     configuration does not create a new LRS stream
   263  //   - the LRS stream is closed *only* after all the API calls invoke their
   264  //     cancel functions
   265  //   - creating new streams after the previous one was closed works
   266  func (s) TestReportLoad_StreamCreation(t *testing.T) {
   267  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   268  	defer cancel()
   269  
   270  	// Create a management server that serves LRS.
   271  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{SupportLoadReportingService: true})
   272  
   273  	// Create an xDS client with bootstrap pointing to the above server.
   274  	nodeID := uuid.New().String()
   275  	bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address)
   276  	client := createXDSClient(t, bc)
   277  
   278  	// Call the load reporting API, and ensure that an LRS stream is created.
   279  	serverConfig, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer.Address})
   280  	if err != nil {
   281  		t.Fatalf("Failed to create server config for testing: %v", err)
   282  	}
   283  	store1, cancel1 := client.ReportLoad(serverConfig)
   284  	lrsServer := mgmtServer.LRSServer
   285  	if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil {
   286  		t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err)
   287  	}
   288  
   289  	// Push some loads on the received store.
        	// Covers dropped calls, started/finished calls, and named server loads
        	// across two localities.
   290  	store1.ReporterForCluster("cluster1", "eds1").CallDropped("test")
   291  	store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality1)
   292  	store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 3.14)
   293  	store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 2.718)
   294  	store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality1, nil)
   295  	store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality2)
   296  	store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality2, testKey2, 1.618)
   297  	store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality2, nil)
   298  
   299  	// Ensure the initial load reporting request is received at the server.
   300  	req, err := lrsServer.LRSRequestChan.Receive(ctx)
   301  	if err != nil {
   302  		t.Fatalf("Timeout when waiting for initial LRS request: %v", err)
   303  	}
   304  	gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest)
        	// The initial LRS request is expected to carry the node proto,
        	// including the client features advertised by the xDS client.
   305  	nodeProto := &v3corepb.Node{
   306  		Id:                   nodeID,
   307  		UserAgentName:        "gRPC Go",
   308  		UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version},
   309  		ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"},
   310  	}
   311  	wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto}
   312  	if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" {
   313  		t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff)
   314  	}
   315  
   316  	// Send a response from the server with a small deadline.
   317  	lrsServer.LRSResponseChan <- &fakeserver.Response{
   318  		Resp: &v3lrspb.LoadStatsResponse{
   319  			SendAllClusters:       true,
   320  			LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms
   321  		},
   322  	}
   323  
   324  	// Ensure that loads are seen on the server.
   325  	req, err = lrsServer.LRSRequestChan.Receive(ctx)
   326  	if err != nil {
   327  		t.Fatal("Timeout when waiting for LRS request with loads")
   328  	}
   329  	gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
   330  	if l := len(gotLoad); l != 1 {
   331  		t.Fatalf("Received load for %d clusters, want 1", l)
   332  	}
   333  
   334  	// This field is set by the client to indicate the actual time elapsed since
   335  	// the last report was sent. We cannot deterministically compare this, and
   336  	// we cannot use the cmpopts.IgnoreFields() option on proto structs, since
   337  	// we already use the protocmp.Transform() which marshals the struct into
   338  	// another message. Hence setting this field to nil is the best option here.
   339  	gotLoad[0].LoadReportInterval = nil
   340  	wantLoad := &v3endpointpb.ClusterStats{
   341  		ClusterName:          "cluster1",
   342  		ClusterServiceName:   "eds1",
   343  		TotalDroppedRequests: 1,
   344  		DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
   345  		UpstreamLocalityStats: []*v3endpointpb.UpstreamLocalityStats{
   346  			{
   347  				Locality: &v3corepb.Locality{Region: "test-region1"},
   348  				LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{
   349  					// TotalMetricValue is the aggregation of 3.14 + 2.718 = 5.858
   350  					{MetricName: testKey1, NumRequestsFinishedWithMetric: 2, TotalMetricValue: 5.858}},
   351  				TotalSuccessfulRequests: 1,
   352  				TotalIssuedRequests:     1,
   353  			},
   354  			{
   355  				Locality: &v3corepb.Locality{Region: "test-region2"},
   356  				LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{
   357  					{MetricName: testKey2, NumRequestsFinishedWithMetric: 1, TotalMetricValue: 1.618}},
   358  				TotalSuccessfulRequests: 1,
   359  				TotalIssuedRequests:     1,
   360  			},
   361  		},
   362  	}
   363  	if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" {
   364  		t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
   365  	}
   366  
   367  	// Make another call to the load reporting API, and ensure that a new LRS
   368  	// stream is not created.
   369  	store2, cancel2 := client.ReportLoad(serverConfig)
   370  	sCtx, sCancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
   371  	defer sCancel()
   372  	if _, err := lrsServer.LRSStreamOpenChan.Receive(sCtx); err != context.DeadlineExceeded {
   373  		t.Fatal("New LRS stream created when expected to use an existing one")
   374  	}
   375  
   376  	// Push more loads.
   377  	store2.ReporterForCluster("cluster2", "eds2").CallDropped("test")
   378  
   379  	// Ensure that loads are seen on the server. We need a loop here because
   380  	// there could have been some requests from the client in the time between
   381  	// us reading the first request and now. Those would have been queued in the
   382  	// request channel that we read out of.
   383  	for {
   384  		if ctx.Err() != nil {
   385  			t.Fatalf("Timeout when waiting for new loads to be seen on the server")
   386  		}
   387  
   388  		req, err = lrsServer.LRSRequestChan.Receive(ctx)
   389  		if err != nil {
   390  			continue
   391  		}
   392  		gotLoad = req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
   393  		if l := len(gotLoad); l != 1 {
   394  			continue
   395  		}
   396  		gotLoad[0].LoadReportInterval = nil
   397  		wantLoad := &v3endpointpb.ClusterStats{
   398  			ClusterName:          "cluster2",
   399  			ClusterServiceName:   "eds2",
   400  			TotalDroppedRequests: 1,
   401  			DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
   402  		}
   403  		if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform()); diff != "" {
   404  			t.Logf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
   405  			continue
   406  		}
   407  		break
   408  	}
   409  
   410  	// Cancel the first load reporting call, and ensure that the stream does not
   411  	// close (because we have another call open).
   412  	sCtx1, sCancel1 := context.WithTimeout(ctx, defaultTestShortTimeout)
   413  	defer sCancel1()
   414  	cancel1(sCtx1)
   415  	sCtx, sCancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
   416  	defer sCancel()
   417  	if _, err := lrsServer.LRSStreamCloseChan.Receive(sCtx); err != context.DeadlineExceeded {
   418  		t.Fatal("LRS stream closed when expected to stay open")
   419  	}
   420  
   421  	// Cancel the second load reporting call, and ensure the stream is closed.
   422  	sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout)
   423  	defer sCancel2()
   424  	cancel2(sCtx2)
   425  	if _, err := lrsServer.LRSStreamCloseChan.Receive(ctx); err != nil {
   426  		t.Fatal("Timeout waiting for LRS stream to close")
   427  	}
   428  
   429  	// Calling the load reporting API again should result in the creation of a
   430  	// new LRS stream. This ensures that creating and closing multiple streams
   431  	// works smoothly.
   432  	_, cancel3 := client.ReportLoad(serverConfig)
   433  	if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil {
   434  		t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err)
   435  	}
   436  	sCtx3, sCancel3 := context.WithTimeout(ctx, defaultTestShortTimeout)
   437  	defer sCancel3()
   438  	cancel3(sCtx3)
   439  }