/*
 *
 * Copyright 2025 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package xdsclient

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"testing"

	"github.com/google/uuid"
	"google.golang.org/grpc/internal/testutils"
	"google.golang.org/grpc/internal/testutils/stats"
	"google.golang.org/grpc/internal/testutils/xds/e2e"
	"google.golang.org/grpc/internal/xds/bootstrap"
	"google.golang.org/grpc/xds/internal/xdsclient/xdsresource"

	v3listenerpb "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"

	_ "google.golang.org/grpc/xds/internal/httpfilter/router" // Register the router filter.
)

// noopListenerWatcher is the watcher the tests in this file hand to
// xdsresource.WatchListener. It discards everything it is given; each callback
// does nothing except invoke onDone.
type noopListenerWatcher struct{}

// ResourceChanged ignores the listener update and immediately invokes onDone.
func (noopListenerWatcher) ResourceChanged(_ *xdsresource.ListenerResourceData, onDone func()) {
	onDone()
}

// ResourceError ignores the error and immediately invokes onDone.
func (noopListenerWatcher) ResourceError(_ error, onDone func()) {
	onDone()
}

// AmbientError ignores the error and immediately invokes onDone.
func (noopListenerWatcher) AmbientError(_ error, onDone func()) {
	onDone()
}

// TestResourceUpdateMetrics configures an xDS client, and a management server
// to send valid and invalid LDS updates, and verifies that the expected metrics
// for both good and bad updates are emitted.
58 func (s) TestResourceUpdateMetrics(t *testing.T) { 59 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 60 defer cancel() 61 62 tmr := stats.NewTestMetricsRecorder() 63 l, err := testutils.LocalTCPListener() 64 if err != nil { 65 t.Fatalf("net.Listen() failed: %v", err) 66 } 67 mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{Listener: l}) 68 const listenerResourceName = "test-listener-resource" 69 const routeConfigurationName = "test-route-configuration-resource" 70 nodeID := uuid.New().String() 71 resources := e2e.UpdateOptions{ 72 NodeID: nodeID, 73 Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)}, 74 SkipValidation: true, 75 } 76 if err := mgmtServer.Update(ctx, resources); err != nil { 77 t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) 78 } 79 80 bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ 81 Servers: []byte(fmt.Sprintf(`[{ 82 "server_uri": %q, 83 "channel_creds": [{"type": "insecure"}] 84 }]`, mgmtServer.Address)), 85 Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), 86 Authorities: map[string]json.RawMessage{ 87 "authority": []byte("{}"), 88 }, 89 }) 90 if err != nil { 91 t.Fatalf("Failed to create bootstrap configuration: %v", err) 92 } 93 94 config, err := bootstrap.NewConfigFromContents(bootstrapContents) 95 if err != nil { 96 t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) 97 } 98 pool := NewPool(config) 99 client, close, err := pool.NewClientForTesting(OptionsForTesting{ 100 Name: t.Name(), 101 WatchExpiryTimeout: defaultTestWatchExpiryTimeout, 102 MetricsRecorder: tmr, 103 }) 104 if err != nil { 105 t.Fatalf("Failed to create an xDS client: %v", err) 106 } 107 defer close() 108 109 // Watch the valid listener configured on the management server. 
This should 110 // cause a resource updates valid count to emit eventually. 111 xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{}) 112 mdWant := stats.MetricsData{ 113 Handle: xdsClientResourceUpdatesValidMetric.Descriptor(), 114 IntIncr: 1, 115 LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"}, 116 LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"}, 117 } 118 if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil { 119 t.Fatal(err.Error()) 120 } 121 // Invalid should have no recording point. 122 if got, _ := tmr.Metric("grpc.xds_client.resource_updates_invalid"); got != 0 { 123 t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 0) 124 } 125 126 // Update management server with a bad update. Eventually, tmr should 127 // receive an invalid count received metric. The successful metric should 128 // stay the same. 129 resources = e2e.UpdateOptions{ 130 NodeID: nodeID, 131 Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)}, 132 SkipValidation: true, 133 } 134 resources.Listeners[0].ApiListener = nil 135 if err := mgmtServer.Update(ctx, resources); err != nil { 136 t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) 137 } 138 139 mdWant = stats.MetricsData{ 140 Handle: xdsClientResourceUpdatesInvalidMetric.Descriptor(), 141 IntIncr: 1, 142 LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"}, 143 LabelVals: []string{"Test/ResourceUpdateMetrics", mgmtServer.Address, "ListenerResource"}, 144 } 145 if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil { 146 t.Fatal(err.Error()) 147 } 148 // Valid should stay the same at 1. 
149 if got, _ := tmr.Metric("grpc.xds_client.resource_updates_valid"); got != 1 { 150 t.Fatalf("Unexpected data for metric \"grpc.xds_client.resource_updates_invalid\", got: %v, want: %v", got, 1) 151 } 152 } 153 154 // TestServerFailureMetrics_BeforeResponseRecv configures an xDS client, and a 155 // management server. It then register a watcher and stops the management 156 // server before sending a resource update, and verifies that the expected 157 // metrics for server failure are emitted. 158 func (s) TestServerFailureMetrics_BeforeResponseRecv(t *testing.T) { 159 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 160 defer cancel() 161 162 tmr := stats.NewTestMetricsRecorder() 163 l, err := testutils.LocalTCPListener() 164 if err != nil { 165 t.Fatalf("net.Listen() failed: %v", err) 166 } 167 lis := testutils.NewRestartableListener(l) 168 streamOpened := make(chan struct{}, 1) 169 mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ 170 Listener: lis, 171 OnStreamOpen: func(context.Context, int64, string) error { 172 select { 173 case streamOpened <- struct{}{}: 174 default: 175 } 176 return nil 177 }, 178 }) 179 180 nodeID := uuid.New().String() 181 182 bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ 183 Servers: []byte(fmt.Sprintf(`[{ 184 "server_uri": %q, 185 "channel_creds": [{"type": "insecure"}] 186 }]`, mgmtServer.Address)), 187 Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), 188 Authorities: map[string]json.RawMessage{ 189 "authority": []byte("{}"), 190 }, 191 }) 192 if err != nil { 193 t.Fatalf("Failed to create bootstrap configuration: %v", err) 194 } 195 196 config, err := bootstrap.NewConfigFromContents(bootstrapContents) 197 if err != nil { 198 t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) 199 } 200 pool := NewPool(config) 201 client, close, err := pool.NewClientForTesting(OptionsForTesting{ 202 Name: t.Name(), 
203 WatchExpiryTimeout: defaultTestWatchExpiryTimeout, 204 MetricsRecorder: tmr, 205 }) 206 if err != nil { 207 t.Fatalf("Failed to create an xDS client: %v", err) 208 } 209 defer close() 210 211 const listenerResourceName = "test-listener-resource" 212 213 // Watch for the listener on the above management server. 214 xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{}) 215 // Verify that an ADS stream is opened and an LDS request with the above 216 // resource name is sent. 217 select { 218 case <-streamOpened: 219 case <-ctx.Done(): 220 t.Fatal("Timeout when waiting for ADS stream to open") 221 } 222 223 // Close the listener and ensure that the ADS stream breaks. This should 224 // cause a server failure count to emit eventually. 225 lis.Stop() 226 227 // Restart to prevent the attempt to create a new ADS stream after back off. 228 lis.Restart() 229 230 mdWant := stats.MetricsData{ 231 Handle: xdsClientServerFailureMetric.Descriptor(), 232 IntIncr: 1, 233 LabelKeys: []string{"grpc.target", "grpc.xds.server"}, 234 LabelVals: []string{"Test/ServerFailureMetrics_BeforeResponseRecv", mgmtServer.Address}, 235 } 236 if err := tmr.WaitForInt64Count(ctx, mdWant); err != nil { 237 t.Fatal(err.Error()) 238 } 239 } 240 241 // TestServerFailureMetrics_AfterResponseRecv configures an xDS client and a 242 // management server to send a valid LDS update, and verifies that the 243 // successful update metric is emitted. When the client ACKs the update, the 244 // server returns an error, breaking the stream. The test then verifies that the 245 // server failure metric is not emitted, because the ADS stream was closed after 246 // a response was received on the stream. Finally, the test waits for the client 247 // to establish a new stream and verifies that the client emits a metric after 248 // receiving a successful update. 
249 func (s) TestServerFailureMetrics_AfterResponseRecv(t *testing.T) { 250 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 251 defer cancel() 252 253 tmr := stats.NewTestMetricsRecorder() 254 l, err := testutils.LocalTCPListener() 255 if err != nil { 256 t.Fatalf("net.Listen() failed: %v", err) 257 } 258 lis := testutils.NewRestartableListener(l) 259 streamCreationQuota := make(chan struct{}, 1) 260 streamCreationQuota <- struct{}{} 261 262 mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ 263 Listener: lis, 264 OnStreamOpen: func(context.Context, int64, string) error { 265 // The following select block is used to block stream creation after 266 // the first stream has failed, but while we are waiting to verify 267 // that the failure metric is not reported. 268 select { 269 case <-streamCreationQuota: 270 case <-ctx.Done(): 271 } 272 return nil 273 }, 274 OnStreamRequest: func(streamID int64, req *v3discoverypb.DiscoveryRequest) error { 275 // We only want the ACK on the first stream to return an error 276 // (leading to stream closure), without effecting subsequent stream 277 // attempts. 
278 if streamID == 1 && req.GetVersionInfo() != "" { 279 return errors.New("test configured error") 280 } 281 return nil 282 }}, 283 ) 284 const listenerResourceName = "test-listener-resource" 285 const routeConfigurationName = "test-route-configuration-resource" 286 nodeID := uuid.New().String() 287 resources := e2e.UpdateOptions{ 288 NodeID: nodeID, 289 Listeners: []*v3listenerpb.Listener{e2e.DefaultClientListener(listenerResourceName, routeConfigurationName)}, 290 SkipValidation: true, 291 } 292 if err := mgmtServer.Update(ctx, resources); err != nil { 293 t.Fatalf("Failed to update management server with resources: %v, err: %v", resources, err) 294 } 295 296 bootstrapContents, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ 297 Servers: []byte(fmt.Sprintf(`[{ 298 "server_uri": %q, 299 "channel_creds": [{"type": "insecure"}] 300 }]`, mgmtServer.Address)), 301 Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), 302 Authorities: map[string]json.RawMessage{ 303 "authority": []byte("{}"), 304 }, 305 }) 306 if err != nil { 307 t.Fatalf("Failed to create bootstrap configuration: %v", err) 308 } 309 310 config, err := bootstrap.NewConfigFromContents(bootstrapContents) 311 if err != nil { 312 t.Fatalf("Failed to parse bootstrap contents: %s, %v", string(bootstrapContents), err) 313 } 314 pool := NewPool(config) 315 client, closePool, err := pool.NewClientForTesting(OptionsForTesting{ 316 Name: t.Name(), 317 MetricsRecorder: tmr, 318 }) 319 if err != nil { 320 t.Fatalf("Failed to create an xDS client: %v", err) 321 } 322 defer closePool() 323 324 // Watch the valid listener configured on the management server. This should 325 // cause a resource updates valid count to emit eventually. 
326 xdsresource.WatchListener(client, listenerResourceName, noopListenerWatcher{}) 327 mdSuccess := stats.MetricsData{ 328 Handle: xdsClientResourceUpdatesValidMetric.Descriptor(), 329 IntIncr: 1, 330 LabelKeys: []string{"grpc.target", "grpc.xds.server", "grpc.xds.resource_type"}, 331 LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address, "ListenerResource"}, 332 } 333 if err := tmr.WaitForInt64Count(ctx, mdSuccess); err != nil { 334 t.Fatal(err.Error()) 335 } 336 337 // When the client sends an ACK, the management server would reply with an 338 // error, breaking the stream. 339 mdFailure := stats.MetricsData{ 340 Handle: xdsClientServerFailureMetric.Descriptor(), 341 IntIncr: 1, 342 LabelKeys: []string{"grpc.target", "grpc.xds.server"}, 343 LabelVals: []string{"Test/ServerFailureMetrics_AfterResponseRecv", mgmtServer.Address}, 344 } 345 346 // Server failure should still have no recording point. 347 sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout) 348 defer sCancel() 349 if err := tmr.WaitForInt64Count(sCtx, mdFailure); err == nil { 350 t.Fatalf("tmr.WaitForInt64Count(%v) succeeded when expected to timeout.", mdFailure) 351 } else if sCtx.Err() == nil { 352 t.Fatalf("tmr.WaitForInt64Count(%v) = %v, want context deadline exceeded", mdFailure, err) 353 } 354 355 // Unblock stream creation and verify that an update is received 356 // successfully. 357 close(streamCreationQuota) 358 if err := tmr.WaitForInt64Count(ctx, mdSuccess); err != nil { 359 t.Fatal(err.Error()) 360 } 361 }