google.golang.org/grpc@v1.74.2/xds/internal/xdsclient/tests/loadreport_test.go (about) 1 /* 2 * 3 * Copyright 2024 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package xdsclient_test 20 21 import ( 22 "context" 23 "encoding/json" 24 "fmt" 25 "net" 26 "testing" 27 28 "github.com/google/go-cmp/cmp" 29 "github.com/google/go-cmp/cmp/cmpopts" 30 "github.com/google/uuid" 31 "google.golang.org/grpc" 32 "google.golang.org/grpc/codes" 33 "google.golang.org/grpc/internal/testutils" 34 "google.golang.org/grpc/internal/testutils/xds/e2e" 35 "google.golang.org/grpc/internal/testutils/xds/fakeserver" 36 "google.golang.org/grpc/internal/xds/bootstrap" 37 "google.golang.org/grpc/status" 38 "google.golang.org/grpc/xds/internal/clients" 39 "google.golang.org/protobuf/testing/protocmp" 40 41 v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 42 v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" 43 v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" 44 "google.golang.org/protobuf/types/known/durationpb" 45 ) 46 47 const ( 48 testKey1 = "test-key1" 49 testKey2 = "test-key2" 50 ) 51 52 var ( 53 testLocality1 = clients.Locality{Region: "test-region1"} 54 testLocality2 = clients.Locality{Region: "test-region2"} 55 toleranceCmpOpt = cmpopts.EquateApprox(0, 1e-5) 56 ignoreOrderCmpOpt = protocmp.FilterField(&v3endpointpb.ClusterStats{}, "upstream_locality_stats", 57 cmpopts.SortSlices(func(a, b protocmp.Message) bool { 58 return a.String() < b.String() 59 }), 60 ) 61 ) 62 63 type wrappedListener struct { 64 net.Listener 65 newConnChan *testutils.Channel // Connection attempts are pushed here. 66 } 67 68 func (wl *wrappedListener) Accept() (net.Conn, error) { 69 c, err := wl.Listener.Accept() 70 if err != nil { 71 return nil, err 72 } 73 wl.newConnChan.Send(struct{}{}) 74 return c, err 75 } 76 77 // Tests a load reporting scenario where the xDS client is reporting loads to 78 // multiple servers. Verifies the following: 79 // - calling the load reporting API with different server configuration 80 // results in connections being created to those corresponding servers 81 // - the same load.Store is not returned when the load reporting API called 82 // with different server configurations 83 // - canceling the load reporting from the client results in the LRS stream 84 // being canceled on the server 85 func (s) TestReportLoad_ConnectionCreation(t *testing.T) { 86 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 87 defer cancel() 88 89 // Create two management servers that also serve LRS. 90 l, err := testutils.LocalTCPListener() 91 if err != nil { 92 t.Fatalf("Failed to create a local TCP listener: %v", err) 93 } 94 newConnChan1 := testutils.NewChannel() 95 lis1 := &wrappedListener{ 96 Listener: l, 97 newConnChan: newConnChan1, 98 } 99 mgmtServer1 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ 100 Listener: lis1, 101 SupportLoadReportingService: true, 102 }) 103 l, err = testutils.LocalTCPListener() 104 if err != nil { 105 t.Fatalf("Failed to create a local TCP listener: %v", err) 106 } 107 newConnChan2 := testutils.NewChannel() 108 lis2 := &wrappedListener{ 109 Listener: l, 110 newConnChan: newConnChan2, 111 } 112 mgmtServer2 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{ 113 Listener: lis2, 114 SupportLoadReportingService: true, 115 }) 116 117 // Create an xDS client with a bootstrap configuration that contains both of 118 // the above two servers. The authority name is immaterial here since load 119 // reporting is per-server and not per-authority. 120 nodeID := uuid.New().String() 121 bc, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{ 122 Servers: []byte(fmt.Sprintf(`[{ 123 "server_uri": %q, 124 "channel_creds": [{"type": "insecure"}] 125 }]`, mgmtServer1.Address)), 126 Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)), 127 Authorities: map[string]json.RawMessage{ 128 "test-authority": []byte(fmt.Sprintf(`{ 129 "xds_servers": [{ 130 "server_uri": %q, 131 "channel_creds": [{"type": "insecure"}] 132 }]}`, mgmtServer2.Address)), 133 }, 134 }) 135 if err != nil { 136 t.Fatalf("Failed to create bootstrap configuration: %v", err) 137 } 138 client := createXDSClient(t, bc) 139 140 serverCfg1, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer1.Address}) 141 if err != nil { 142 t.Fatalf("Failed to create server config for testing: %v", err) 143 } 144 // Call the load reporting API to report load to the first management 145 // server, and ensure that a connection to the server is created. 146 store1, lrsCancel1 := client.ReportLoad(serverCfg1) 147 sCtx, sCancel := context.WithTimeout(ctx, defaultTestShortTimeout) 148 defer sCancel() 149 defer lrsCancel1(sCtx) 150 if _, err := newConnChan1.Receive(ctx); err != nil { 151 t.Fatal("Timeout when waiting for a connection to the first management server, after starting load reporting") 152 } 153 if _, err := mgmtServer1.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil { 154 t.Fatal("Timeout when waiting for LRS stream to be created") 155 } 156 157 serverCfg2, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer2.Address}) 158 if err != nil { 159 t.Fatalf("Failed to create server config for testing: %v", err) 160 } 161 // Call the load reporting API to report load to the second management 162 // server, and ensure that a connection to the server is created. 163 store2, lrsCancel2 := client.ReportLoad(serverCfg2) 164 sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout) 165 defer sCancel2() 166 defer lrsCancel2(sCtx2) 167 if _, err := newConnChan2.Receive(ctx); err != nil { 168 t.Fatal("Timeout when waiting for a connection to the second management server, after starting load reporting") 169 } 170 if _, err := mgmtServer2.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil { 171 t.Fatal("Timeout when waiting for LRS stream to be created") 172 } 173 174 if store1 == store2 { 175 t.Fatalf("Got same store for different servers, want different") 176 } 177 178 // Push some loads on the received store. 179 store2.ReporterForCluster("cluster", "eds").CallDropped("test") 180 181 // Ensure the initial load reporting request is received at the server. 182 lrsServer := mgmtServer2.LRSServer 183 req, err := lrsServer.LRSRequestChan.Receive(ctx) 184 if err != nil { 185 t.Fatalf("Timeout when waiting for initial LRS request: %v", err) 186 } 187 gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest) 188 nodeProto := &v3corepb.Node{ 189 Id: nodeID, 190 UserAgentName: "gRPC Go", 191 UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version}, 192 ClientFeatures: []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"}, 193 } 194 wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto} 195 if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" { 196 t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff) 197 } 198 199 // Send a response from the server with a small deadline. 200 lrsServer.LRSResponseChan <- &fakeserver.Response{ 201 Resp: &v3lrspb.LoadStatsResponse{ 202 SendAllClusters: true, 203 LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms 204 }, 205 } 206 207 // Ensure that loads are seen on the server. 208 req, err = lrsServer.LRSRequestChan.Receive(ctx) 209 if err != nil { 210 t.Fatalf("Timeout when waiting for LRS request with loads: %v", err) 211 } 212 gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats 213 if l := len(gotLoad); l != 1 { 214 t.Fatalf("Received load for %d clusters, want 1", l) 215 } 216 217 // This field is set by the client to indicate the actual time elapsed since 218 // the last report was sent. We cannot deterministically compare this, and 219 // we cannot use the cmpopts.IgnoreFields() option on proto structs, since 220 // we already use the protocmp.Transform() which marshals the struct into 221 // another message. Hence setting this field to nil is the best option here. 222 gotLoad[0].LoadReportInterval = nil 223 wantLoad := &v3endpointpb.ClusterStats{ 224 ClusterName: "cluster", 225 ClusterServiceName: "eds", 226 TotalDroppedRequests: 1, 227 DroppedRequests: []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}}, 228 } 229 if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" { 230 t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff) 231 } 232 233 // Cancel this load reporting stream, server should see error canceled. 234 sCtx2, sCancel2 = context.WithTimeout(ctx, defaultTestShortTimeout) 235 defer sCancel2() 236 lrsCancel2(sCtx2) 237 238 // Server should receive a stream canceled error. There may be additional 239 // load reports from the client in the channel. 240 for { 241 if ctx.Err() != nil { 242 t.Fatal("Timeout when waiting for the LRS stream to be canceled on the server") 243 } 244 u, err := lrsServer.LRSRequestChan.Receive(ctx) 245 if err != nil { 246 continue 247 } 248 // Ignore load reports sent before the stream was cancelled. 249 if u.(*fakeserver.Request).Err == nil { 250 continue 251 } 252 if status.Code(u.(*fakeserver.Request).Err) != codes.Canceled { 253 t.Fatalf("Unexpected LRS request: %v, want error canceled", u) 254 } 255 break 256 } 257 } 258 259 // Tests a load reporting scenario where the load reporting API is called 260 // multiple times for the same server. The test verifies the following: 261 // - calling the load reporting API the second time for the same server 262 // configuration does not create a new LRS stream 263 // - the LRS stream is closed *only* after all the API calls invoke their 264 // cancel functions 265 // - creating new streams after the previous one was closed works 266 func (s) TestReportLoad_StreamCreation(t *testing.T) { 267 ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) 268 defer cancel() 269 270 // Create a management server that serves LRS. 271 mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{SupportLoadReportingService: true}) 272 273 // Create an xDS client with bootstrap pointing to the above server. 274 nodeID := uuid.New().String() 275 bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address) 276 client := createXDSClient(t, bc) 277 278 // Call the load reporting API, and ensure that an LRS stream is created. 279 serverConfig, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer.Address}) 280 if err != nil { 281 t.Fatalf("Failed to create server config for testing: %v", err) 282 } 283 store1, cancel1 := client.ReportLoad(serverConfig) 284 lrsServer := mgmtServer.LRSServer 285 if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil { 286 t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err) 287 } 288 289 // Push some loads on the received store. 290 store1.ReporterForCluster("cluster1", "eds1").CallDropped("test") 291 store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality1) 292 store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 3.14) 293 store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 2.718) 294 store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality1, nil) 295 store1.ReporterForCluster("cluster1", "eds1").CallStarted(testLocality2) 296 store1.ReporterForCluster("cluster1", "eds1").CallServerLoad(testLocality2, testKey2, 1.618) 297 store1.ReporterForCluster("cluster1", "eds1").CallFinished(testLocality2, nil) 298 299 // Ensure the initial load reporting request is received at the server. 300 req, err := lrsServer.LRSRequestChan.Receive(ctx) 301 if err != nil { 302 t.Fatalf("Timeout when waiting for initial LRS request: %v", err) 303 } 304 gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest) 305 nodeProto := &v3corepb.Node{ 306 Id: nodeID, 307 UserAgentName: "gRPC Go", 308 UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version}, 309 ClientFeatures: []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"}, 310 } 311 wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto} 312 if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" { 313 t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff) 314 } 315 316 // Send a response from the server with a small deadline. 317 lrsServer.LRSResponseChan <- &fakeserver.Response{ 318 Resp: &v3lrspb.LoadStatsResponse{ 319 SendAllClusters: true, 320 LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms 321 }, 322 } 323 324 // Ensure that loads are seen on the server. 325 req, err = lrsServer.LRSRequestChan.Receive(ctx) 326 if err != nil { 327 t.Fatal("Timeout when waiting for LRS request with loads") 328 } 329 gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats 330 if l := len(gotLoad); l != 1 { 331 t.Fatalf("Received load for %d clusters, want 1", l) 332 } 333 334 // This field is set by the client to indicate the actual time elapsed since 335 // the last report was sent. We cannot deterministically compare this, and 336 // we cannot use the cmpopts.IgnoreFields() option on proto structs, since 337 // we already use the protocmp.Transform() which marshals the struct into 338 // another message. Hence setting this field to nil is the best option here. 339 gotLoad[0].LoadReportInterval = nil 340 wantLoad := &v3endpointpb.ClusterStats{ 341 ClusterName: "cluster1", 342 ClusterServiceName: "eds1", 343 TotalDroppedRequests: 1, 344 DroppedRequests: []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}}, 345 UpstreamLocalityStats: []*v3endpointpb.UpstreamLocalityStats{ 346 { 347 Locality: &v3corepb.Locality{Region: "test-region1"}, 348 LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{ 349 // TotalMetricValue is the aggregation of 3.14 + 2.718 = 5.858 350 {MetricName: testKey1, NumRequestsFinishedWithMetric: 2, TotalMetricValue: 5.858}}, 351 TotalSuccessfulRequests: 1, 352 TotalIssuedRequests: 1, 353 }, 354 { 355 Locality: &v3corepb.Locality{Region: "test-region2"}, 356 LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{ 357 {MetricName: testKey2, NumRequestsFinishedWithMetric: 1, TotalMetricValue: 1.618}}, 358 TotalSuccessfulRequests: 1, 359 TotalIssuedRequests: 1, 360 }, 361 }, 362 } 363 if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" { 364 t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff) 365 } 366 367 // Make another call to the load reporting API, and ensure that a new LRS 368 // stream is not created. 369 store2, cancel2 := client.ReportLoad(serverConfig) 370 sCtx, sCancel := context.WithTimeout(context.Background(), defaultTestShortTimeout) 371 defer sCancel() 372 if _, err := lrsServer.LRSStreamOpenChan.Receive(sCtx); err != context.DeadlineExceeded { 373 t.Fatal("New LRS stream created when expected to use an existing one") 374 } 375 376 // Push more loads. 377 store2.ReporterForCluster("cluster2", "eds2").CallDropped("test") 378 379 // Ensure that loads are seen on the server. We need a loop here because 380 // there could have been some requests from the client in the time between 381 // us reading the first request and now. Those would have been queued in the 382 // request channel that we read out of. 383 for { 384 if ctx.Err() != nil { 385 t.Fatalf("Timeout when waiting for new loads to be seen on the server") 386 } 387 388 req, err = lrsServer.LRSRequestChan.Receive(ctx) 389 if err != nil { 390 continue 391 } 392 gotLoad = req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats 393 if l := len(gotLoad); l != 1 { 394 continue 395 } 396 gotLoad[0].LoadReportInterval = nil 397 wantLoad := &v3endpointpb.ClusterStats{ 398 ClusterName: "cluster2", 399 ClusterServiceName: "eds2", 400 TotalDroppedRequests: 1, 401 DroppedRequests: []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}}, 402 } 403 if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform()); diff != "" { 404 t.Logf("Unexpected diff in LRS request (-got, +want):\n%s", diff) 405 continue 406 } 407 break 408 } 409 410 // Cancel the first load reporting call, and ensure that the stream does not 411 // close (because we have another call open). 412 sCtx1, sCancel1 := context.WithTimeout(ctx, defaultTestShortTimeout) 413 defer sCancel1() 414 cancel1(sCtx1) 415 sCtx, sCancel = context.WithTimeout(context.Background(), defaultTestShortTimeout) 416 defer sCancel() 417 if _, err := lrsServer.LRSStreamCloseChan.Receive(sCtx); err != context.DeadlineExceeded { 418 t.Fatal("LRS stream closed when expected to stay open") 419 } 420 421 // Cancel the second load reporting call, and ensure the stream is closed. 422 sCtx2, sCancel2 := context.WithTimeout(ctx, defaultTestShortTimeout) 423 defer sCancel2() 424 cancel2(sCtx2) 425 if _, err := lrsServer.LRSStreamCloseChan.Receive(ctx); err != nil { 426 t.Fatal("Timeout waiting for LRS stream to close") 427 } 428 429 // Calling the load reporting API again should result in the creation of a 430 // new LRS stream. This ensures that creating and closing multiple streams 431 // works smoothly. 432 _, cancel3 := client.ReportLoad(serverConfig) 433 if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil { 434 t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err) 435 } 436 sCtx3, sCancel3 := context.WithTimeout(ctx, defaultTestShortTimeout) 437 defer sCancel3() 438 cancel3(sCtx3) 439 }