google.golang.org/grpc@v1.74.2/xds/internal/xdsclient/metrics_test.go (about)

     1  /*
     2   *
     3   * Copyright 2025 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xdsclient
    20  
    21  import (
    22  	"context"
    23  	"encoding/json"
    24  	"errors"
    25  	"fmt"
    26  	"testing"
    27  
    28  	"github.com/google/uuid"
    29  	"google.golang.org/grpc/internal/testutils"
    30  	"google.golang.org/grpc/internal/testutils/stats"
    31  	"google.golang.org/grpc/internal/testutils/xds/e2e"
    32  	"google.golang.org/grpc/internal/xds/bootstrap"
    33  	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource"
    34  
    35  	v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
    36  	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    37  
    38  	_ "google.golang.org/grpc/xds/internal/httpfilter/router" // Register the router filter.
    39  )
    40  
    41  type noopListenerWatcher struct{}
    42  
    43  func (noopListenerWatcher) ResourceChanged(_ *xdsresource.ListenerResourceData, onDone func()) {
    44  	onDone()
    45  }
    46  
    47  func (noopListenerWatcher) ResourceError(_ error, onDone func()) {
    48  	onDone()
    49  }
    50  
    51  func (noopListenerWatcher) AmbientError(_ error, onDone func()) {
    52  	onDone()
    53  }
    54  
    55  // TestResourceUpdateMetrics configures an xDS client, and a management server
    56  // to send valid and invalid LDS updates, and verifies that the expected metrics
    57  // for both good and bad updates are emitted.
    58  func (s) TestResourceUpdateMetrics(t *testing.T) {
    59  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
    60  	defer cancel()
    61  
    62  	tmr := stats.NewTestMetricsRecorder()
    63  	l, err := testutils.LocalTCPListener()
    64  	if err != nil {
    65  		t.Fatalf("net.Listen() failed: %v", err)
    66  	}
    67  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: l})
    68  	const listenerResourceName = "test-listener-resource"
    69  	const routeConfigurationName = "test-route-configuration-resource"
    70  	nodeID := uuid.New().String()
    71  	resources := e2e.UpdateOptions{
    72  		NodeID:         nodeID,
    73  		Listeners:      []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
    74  		SkipValidation: true,
    75  	}
    76  	if err := mgmtServer.Update(ctx, resources); err != nil {
    77  		t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
    78  	}
    79  
    80  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
    81  		Servers: []byte(fmt.Sprintf(`[{
    82  			"server_uri": %q,
    83  			"channel_creds": [{"type": "insecure"}]
    84  		}]`, mgmtServer.Address)),
    85  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
    86  		Authorities: map[string]json.RawMessage{
    87  			"authority": []byte("{}"),
    88  		},
    89  	})
    90  	if err != nil {
    91  		t.Fatalf("Failed to create bootstrap configuration: %v", err)
    92  	}
    93  
    94  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
    95  	if err != nil {
    96  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
    97  	}
    98  	pool := NewPool(config)
    99  	client, close, err := pool.NewClientForTesting(OptionsForTesting{
   100  		Name:               t.Name(),
   101  		WatchExpiryTimeout: defaultTestWatchExpiryTimeout,
   102  		MetricsRecorder:    tmr,
   103  	})
   104  	if err != nil {
   105  		t.Fatalf("Failed to create an xDS client: %v", err)
   106  	}
   107  	defer close()
   108  
   109  	// Watch the valid listener configured on the management server. This should
   110  	// cause a resource updates valid count to emit eventually.
   111  	xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
   112  	mdWant := stats.MetricsData{
   113  		Handle:    xdsClientResourceUpdatesValidMetric.Descriptor(),
   114  		IntIncr:   1,
   115  		LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
   116  		LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"},
   117  	}
   118  	if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
   119  		t.Fatal(err.Error())
   120  	}
   121  	// Invalid should have no recording point.
   122  	if got, _ := tmr.Metric("grpc.xds_client.resource_updates_invalid"); got != 0 {
   123  		t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 0)
   124  	}
   125  
   126  	// Update management server with a bad update. Eventually, tmr should
   127  	// receive an invalid count received metric. The successful metric should
   128  	// stay the same.
   129  	resources = e2e.UpdateOptions{
   130  		NodeID:         nodeID,
   131  		Listeners:      []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
   132  		SkipValidation: true,
   133  	}
   134  	resources.Listeners[0].ApiListener = nil
   135  	if err := mgmtServer.Update(ctx, resources); err != nil {
   136  		t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
   137  	}
   138  
   139  	mdWant = stats.MetricsData{
   140  		Handle:    xdsClientResourceUpdatesInvalidMetric.Descriptor(),
   141  		IntIncr:   1,
   142  		LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
   143  		LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"},
   144  	}
   145  	if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
   146  		t.Fatal(err.Error())
   147  	}
   148  	// Valid should stay the same at 1.
   149  	if got, _ := tmr.Metric("grpc.xds_client.resource_updates_valid"); got != 1 {
   150  		t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 1)
   151  	}
   152  }
   153  
   154  // TestServerFailureMetrics_BeforeResponseRecv configures an xDS client, and a
   155  // management server. It then register a watcher and stops the management
   156  // server before sending a resource update, and verifies that the expected
   157  // metrics for server failure are emitted.
   158  func (s) TestServerFailureMetrics_BeforeResponseRecv(t *testing.T) {
   159  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   160  	defer cancel()
   161  
   162  	tmr := stats.NewTestMetricsRecorder()
   163  	l, err := testutils.LocalTCPListener()
   164  	if err != nil {
   165  		t.Fatalf("net.Listen() failed: %v", err)
   166  	}
   167  	lis := testutils.NewRestartableListener(l)
   168  	streamOpened := make(chan struct{}, 1)
   169  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   170  		Listener: lis,
   171  		OnStreamOpen: func(context.Context, int64, string) error {
   172  			select {
   173  			case streamOpened <- struct{}{}:
   174  			default:
   175  			}
   176  			return nil
   177  		},
   178  	})
   179  
   180  	nodeID := uuid.New().String()
   181  
   182  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   183  		Servers: []byte(fmt.Sprintf(`[{
   184  			"server_uri": %q,
   185  			"channel_creds": [{"type": "insecure"}]
   186  		}]`, mgmtServer.Address)),
   187  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   188  		Authorities: map[string]json.RawMessage{
   189  			"authority": []byte("{}"),
   190  		},
   191  	})
   192  	if err != nil {
   193  		t.Fatalf("Failed to create bootstrap configuration: %v", err)
   194  	}
   195  
   196  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   197  	if err != nil {
   198  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   199  	}
   200  	pool := NewPool(config)
   201  	client, close, err := pool.NewClientForTesting(OptionsForTesting{
   202  		Name:               t.Name(),
   203  		WatchExpiryTimeout: defaultTestWatchExpiryTimeout,
   204  		MetricsRecorder:    tmr,
   205  	})
   206  	if err != nil {
   207  		t.Fatalf("Failed to create an xDS client: %v", err)
   208  	}
   209  	defer close()
   210  
   211  	const listenerResourceName = "test-listener-resource"
   212  
   213  	// Watch for the listener on the above management server.
   214  	xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
   215  	// Verify that an ADS stream is opened and an LDS request with the above
   216  	// resource name is sent.
   217  	select {
   218  	case <-streamOpened:
   219  	case <-ctx.Done():
   220  		t.Fatal("Timeout when waiting for ADS stream to open")
   221  	}
   222  
   223  	// Close the listener and ensure that the ADS stream breaks. This should
   224  	// cause a server failure count to emit eventually.
   225  	lis.Stop()
   226  
   227  	// Restart to prevent the attempt to create a new ADS stream after back off.
   228  	lis.Restart()
   229  
   230  	mdWant := stats.MetricsData{
   231  		Handle:    xdsClientServerFailureMetric.Descriptor(),
   232  		IntIncr:   1,
   233  		LabelKeys: []string{"grpc.target", "grpc.xds.server"},
   234  		LabelVals: []string{"Test/ServerFailureMetrics_BeforeResponseRecv", mgmtServer.Address},
   235  	}
   236  	if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil {
   237  		t.Fatal(err.Error())
   238  	}
   239  }
   240  
   241  // TestServerFailureMetrics_AfterResponseRecv configures an xDS client and a
   242  // management server to send a valid LDS update, and verifies that the
   243  // successful update metric is emitted. When the client ACKs the update, the
   244  // server returns an error, breaking the stream. The test then verifies that the
   245  // server failure metric is not emitted, because the ADS stream was closed after
   246  // a response was received on the stream. Finally, the test waits for the client
   247  // to establish a new stream and verifies that the client emits a metric after
   248  // receiving a successful update.
   249  func (s) TestServerFailureMetrics_AfterResponseRecv(t *testing.T) {
   250  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   251  	defer cancel()
   252  
   253  	tmr := stats.NewTestMetricsRecorder()
   254  	l, err := testutils.LocalTCPListener()
   255  	if err != nil {
   256  		t.Fatalf("net.Listen() failed: %v", err)
   257  	}
   258  	lis := testutils.NewRestartableListener(l)
   259  	streamCreationQuota := make(chan struct{}, 1)
   260  	streamCreationQuota <- struct{}{}
   261  
   262  	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
   263  		Listener: lis,
   264  		OnStreamOpen: func(context.Context, int64, string) error {
   265  			// The following select block is used to block stream creation after
   266  			// the first stream has failed, but while we are waiting to verify
   267  			// that the failure metric is not reported.
   268  			select {
   269  			case <-streamCreationQuota:
   270  			case <-ctx.Done():
   271  			}
   272  			return nil
   273  		},
   274  		OnStreamRequest: func(streamID int64, req *v3discoverypb.DiscoveryRequest) error {
   275  			// We only want the ACK on the first stream to return an error
   276  			// (leading to stream closure), without effecting subsequent stream
   277  			// attempts.
   278  			if streamID == 1 && req.GetVersionInfo() != "" {
   279  				return errors.New("test configured error")
   280  			}
   281  			return nil
   282  		}},
   283  	)
   284  	const listenerResourceName = "test-listener-resource"
   285  	const routeConfigurationName = "test-route-configuration-resource"
   286  	nodeID := uuid.New().String()
   287  	resources := e2e.UpdateOptions{
   288  		NodeID:         nodeID,
   289  		Listeners:      []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)},
   290  		SkipValidation: true,
   291  	}
   292  	if err := mgmtServer.Update(ctx, resources); err != nil {
   293  		t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err)
   294  	}
   295  
   296  	bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
   297  		Servers: []byte(fmt.Sprintf(`[{
   298  			"server_uri": %q,
   299  			"channel_creds": [{"type": "insecure"}]
   300  		}]`, mgmtServer.Address)),
   301  		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
   302  		Authorities: map[string]json.RawMessage{
   303  			"authority": []byte("{}"),
   304  		},
   305  	})
   306  	if err != nil {
   307  		t.Fatalf("Failed to create bootstrap configuration: %v", err)
   308  	}
   309  
   310  	config, err := bootstrap.NewConfigFromContents(bootstrapContents)
   311  	if err != nil {
   312  		t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err)
   313  	}
   314  	pool := NewPool(config)
   315  	client, closePool, err := pool.NewClientForTesting(OptionsForTesting{
   316  		Name:            t.Name(),
   317  		MetricsRecorder: tmr,
   318  	})
   319  	if err != nil {
   320  		t.Fatalf("Failed to create an xDS client: %v", err)
   321  	}
   322  	defer closePool()
   323  
   324  	// Watch the valid listener configured on the management server. This should
   325  	// cause a resource updates valid count to emit eventually.
   326  	xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{})
   327  	mdSuccess := stats.MetricsData{
   328  		Handle:    xdsClientResourceUpdatesValidMetric.Descriptor(),
   329  		IntIncr:   1,
   330  		LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"},
   331  		LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address, "ListenerResource"},
   332  	}
   333  	if err := tmr.WaitForInt64Count(ctx, mdSuccess); err != nil {
   334  		t.Fatal(err.Error())
   335  	}
   336  
   337  	// When the client sends an ACK, the management server would reply with an
   338  	// error, breaking the stream.
   339  	mdFailure := stats.MetricsData{
   340  		Handle:    xdsClientServerFailureMetric.Descriptor(),
   341  		IntIncr:   1,
   342  		LabelKeys: []string{"grpc.target", "grpc.xds.server"},
   343  		LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address},
   344  	}
   345  
   346  	// Server failure should still have no recording point.
   347  	sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout)
   348  	defer sCancel()
   349  	if err := tmr.WaitForInt64Count(sCtx, mdFailure); err == nil {
   350  		t.Fatalf("tmr.WaitForInt64Count(%v) succeeded when expected to timeout.", mdFailure)
   351  	} else if sCtx.Err() == nil {
   352  		t.Fatalf("tmr.WaitForInt64Count(%v) = %v, want context deadline exceeded", mdFailure, err)
   353  	}
   354  
   355  	// Unblock stream creation and verify that an update is received
   356  	// successfully.
   357  	close(streamCreationQuota)
   358  	if err := tmr.WaitForInt64Count(ctx, mdSuccess); err != nil {
   359  		t.Fatal(err.Error())
   360  	}
   361  }