google.golang.org/grpc@v1.72.2/xds/internal/xdsclient/tests/loadreport_test.go

/*
 *
 * Copyright 2024 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package xdsclient_test

import (
	"context"
	"encoding/json"
	"fmt"
	"net"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/google/uuid"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/internal/testutils"
	"google.golang.org/grpc/internal/testutils/xds/e2e"
	"google.golang.org/grpc/internal/testutils/xds/fakeserver"
	"google.golang.org/grpc/internal/xds/bootstrap"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/testing/protocmp"

	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
	v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
	"google.golang.org/protobuf/types/known/durationpb"
)

const (
	testLocality1 = `{"region":"test-region1"}`
	testLocality2 = `{"region":"test-region2"}`
	testKey1      = "test-key1"
	testKey2      = "test-key2"
)

var (
	toleranceCmpOpt   = cmpopts.EquateApprox(0, 1e-5)
	ignoreOrderCmpOpt = protocmp.FilterField(&v3endpointpb.ClusterStats{}, "upstream_locality_stats",
		cmpopts.SortSlices(func(a, b protocmp.Message) bool {
			return a.String() < b.String()
		}),
	)
)

type wrappedListener struct {
	net.Listener
	newConnChan *testutils.Channel // Connection attempts are pushed here.
}

func (wl *wrappedListener) Accept() (net.Conn, error) {
	c, err := wl.Listener.Accept()
	if err != nil {
		return nil, err
	}
	wl.newConnChan.Send(struct{}{})
	return c, err
}

// Tests a load reporting scenario where the xDS client is reporting loads to
// multiple servers. Verifies the following:
//   - calling the load reporting API with different server configurations
//     results in connections being created to those corresponding servers
//   - the same load.Store is not returned when the load reporting API is
//     called with different server configurations
//   - canceling the load reporting from the client results in the LRS stream
//     being canceled on the server
func (s) TestReportLoad_ConnectionCreation(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Create two management servers that also serve LRS.
	l, err := testutils.LocalTCPListener()
	if err != nil {
		t.Fatalf("Failed to create a local TCP listener: %v", err)
	}
	newConnChan1 := testutils.NewChannel()
	lis1 := &wrappedListener{
		Listener:    l,
		newConnChan: newConnChan1,
	}
	mgmtServer1 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
		Listener:                    lis1,
		SupportLoadReportingService: true,
	})
	l, err = testutils.LocalTCPListener()
	if err != nil {
		t.Fatalf("Failed to create a local TCP listener: %v", err)
	}
	newConnChan2 := testutils.NewChannel()
	lis2 := &wrappedListener{
		Listener:    l,
		newConnChan: newConnChan2,
	}
	mgmtServer2 := e2e.StartManagementServer(t, e2e.ManagementServerOptions{
		Listener:                    lis2,
		SupportLoadReportingService: true,
	})

	// Create an xDS client with a bootstrap configuration that contains both of
	// the above two servers. The authority name is immaterial here since load
	// reporting is per-server and not per-authority.
	nodeID := uuid.New().String()
	bc, err := bootstrap.NewContentsForTesting(bootstrap.ConfigOptionsForTesting{
		Servers: []byte(fmt.Sprintf(`[{
			"server_uri": %q,
			"channel_creds": [{"type": "insecure"}]
		}]`, mgmtServer1.Address)),
		Node: []byte(fmt.Sprintf(`{"id": "%s"}`, nodeID)),
		Authorities: map[string]json.RawMessage{
			"test-authority": []byte(fmt.Sprintf(`{
				"xds_servers": [{
					"server_uri": %q,
					"channel_creds": [{"type": "insecure"}]
				}]}`, mgmtServer2.Address)),
		},
	})
	if err != nil {
		t.Fatalf("Failed to create bootstrap configuration: %v", err)
	}
	client := createXDSClient(t, bc)

	serverCfg1, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer1.Address})
	if err != nil {
		t.Fatalf("Failed to create server config for testing: %v", err)
	}
	// Call the load reporting API to report load to the first management
	// server, and ensure that a connection to the server is created.
	store1, lrsCancel1 := client.ReportLoad(serverCfg1)
	defer lrsCancel1()
	if _, err := newConnChan1.Receive(ctx); err != nil {
		t.Fatal("Timeout when waiting for a connection to the first management server, after starting load reporting")
	}
	if _, err := mgmtServer1.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil {
		t.Fatal("Timeout when waiting for LRS stream to be created")
	}

	serverCfg2, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer2.Address})
	if err != nil {
		t.Fatalf("Failed to create server config for testing: %v", err)
	}
	// Call the load reporting API to report load to the second management
	// server, and ensure that a connection to the server is created.
	store2, lrsCancel2 := client.ReportLoad(serverCfg2)
	defer lrsCancel2()
	if _, err := newConnChan2.Receive(ctx); err != nil {
		t.Fatal("Timeout when waiting for a connection to the second management server, after starting load reporting")
	}
	if _, err := mgmtServer2.LRSServer.LRSStreamOpenChan.Receive(ctx); err != nil {
		t.Fatal("Timeout when waiting for LRS stream to be created")
	}

	if store1 == store2 {
		t.Fatalf("Got same store for different servers, want different")
	}

	// Push some loads on the received store.
	store2.PerCluster("cluster", "eds").CallDropped("test")

	// Ensure the initial load reporting request is received at the server.
	lrsServer := mgmtServer2.LRSServer
	req, err := lrsServer.LRSRequestChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Timeout when waiting for initial LRS request: %v", err)
	}
	gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest)
	nodeProto := &v3corepb.Node{
		Id:                   nodeID,
		UserAgentName:        "gRPC Go",
		UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version},
		ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"},
	}
	wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto}
	if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" {
		t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff)
	}

	// Send a response from the server with a small deadline.
	lrsServer.LRSResponseChan <- &fakeserver.Response{
		Resp: &v3lrspb.LoadStatsResponse{
			SendAllClusters:       true,
			LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms
		},
	}

	// Ensure that loads are seen on the server.
	req, err = lrsServer.LRSRequestChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Timeout when waiting for LRS request with loads: %v", err)
	}
	gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
	if l := len(gotLoad); l != 1 {
		t.Fatalf("Received load for %d clusters, want 1", l)
	}

	// This field is set by the client to indicate the actual time elapsed since
	// the last report was sent. We cannot deterministically compare this, and
	// we cannot use the cmpopts.IgnoreFields() option on proto structs, since
	// we already use the protocmp.Transform() which marshals the struct into
	// another message. Hence setting this field to nil is the best option here.
	gotLoad[0].LoadReportInterval = nil
	wantLoad := &v3endpointpb.ClusterStats{
		ClusterName:          "cluster",
		ClusterServiceName:   "eds",
		TotalDroppedRequests: 1,
		DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
	}
	if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" {
		t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
	}

	// Cancel this load reporting stream; the server should see a Canceled error.
	lrsCancel2()

	// Server should receive a stream canceled error. There may be additional
	// load reports from the client in the channel.
	for {
		if ctx.Err() != nil {
			t.Fatal("Timeout when waiting for the LRS stream to be canceled on the server")
		}
		u, err := lrsServer.LRSRequestChan.Receive(ctx)
		if err != nil {
			continue
		}
		// Ignore load reports sent before the stream was canceled.
		if u.(*fakeserver.Request).Err == nil {
			continue
		}
		if status.Code(u.(*fakeserver.Request).Err) != codes.Canceled {
			t.Fatalf("Unexpected LRS request: %v, want error canceled", u)
		}
		break
	}
}

// Tests a load reporting scenario where the load reporting API is called
// multiple times for the same server. The test verifies the following:
//   - calling the load reporting API a second time with the same server
//     configuration does not create a new LRS stream
//   - the LRS stream is closed *only* after all the API calls invoke their
//     cancel functions
//   - creating new streams after the previous one was closed works
func (s) TestReportLoad_StreamCreation(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// Create a management server that serves LRS.
	mgmtServer := e2e.StartManagementServer(t, e2e.ManagementServerOptions{SupportLoadReportingService: true})

	// Create an xDS client with bootstrap pointing to the above server.
	nodeID := uuid.New().String()
	bc := e2e.DefaultBootstrapContents(t, nodeID, mgmtServer.Address)
	client := createXDSClient(t, bc)

	// Call the load reporting API, and ensure that an LRS stream is created.
	serverConfig, err := bootstrap.ServerConfigForTesting(bootstrap.ServerConfigTestingOptions{URI: mgmtServer.Address})
	if err != nil {
		t.Fatalf("Failed to create server config for testing: %v", err)
	}
	store1, cancel1 := client.ReportLoad(serverConfig)
	lrsServer := mgmtServer.LRSServer
	if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil {
		t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err)
	}

	// Push some loads on the received store.
	store1.PerCluster("cluster1", "eds1").CallDropped("test")
	store1.PerCluster("cluster1", "eds1").CallStarted(testLocality1)
	store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 3.14)
	store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality1, testKey1, 2.718)
	store1.PerCluster("cluster1", "eds1").CallFinished(testLocality1, nil)
	store1.PerCluster("cluster1", "eds1").CallStarted(testLocality2)
	store1.PerCluster("cluster1", "eds1").CallServerLoad(testLocality2, testKey2, 1.618)
	store1.PerCluster("cluster1", "eds1").CallFinished(testLocality2, nil)

	// Ensure the initial load reporting request is received at the server.
	req, err := lrsServer.LRSRequestChan.Receive(ctx)
	if err != nil {
		t.Fatalf("Timeout when waiting for initial LRS request: %v", err)
	}
	gotInitialReq := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest)
	nodeProto := &v3corepb.Node{
		Id:                   nodeID,
		UserAgentName:        "gRPC Go",
		UserAgentVersionType: &v3corepb.Node_UserAgentVersion{UserAgentVersion: grpc.Version},
		ClientFeatures:       []string{"envoy.lb.does_not_support_overprovisioning", "xds.config.resource-in-sotw", "envoy.lrs.supports_send_all_clusters"},
	}
	wantInitialReq := &v3lrspb.LoadStatsRequest{Node: nodeProto}
	if diff := cmp.Diff(gotInitialReq, wantInitialReq, protocmp.Transform()); diff != "" {
		t.Fatalf("Unexpected diff in initial LRS request (-got, +want):\n%s", diff)
	}

	// Send a response from the server with a small deadline.
	lrsServer.LRSResponseChan <- &fakeserver.Response{
		Resp: &v3lrspb.LoadStatsResponse{
			SendAllClusters:       true,
			LoadReportingInterval: &durationpb.Duration{Nanos: 50000000}, // 50ms
		},
	}

	// Ensure that loads are seen on the server.
	req, err = lrsServer.LRSRequestChan.Receive(ctx)
	if err != nil {
		t.Fatal("Timeout when waiting for LRS request with loads")
	}
	gotLoad := req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
	if l := len(gotLoad); l != 1 {
		t.Fatalf("Received load for %d clusters, want 1", l)
	}

	// This field is set by the client to indicate the actual time elapsed since
	// the last report was sent. We cannot deterministically compare this, and
	// we cannot use the cmpopts.IgnoreFields() option on proto structs, since
	// we already use the protocmp.Transform() which marshals the struct into
	// another message. Hence setting this field to nil is the best option here.
	gotLoad[0].LoadReportInterval = nil
	wantLoad := &v3endpointpb.ClusterStats{
		ClusterName:          "cluster1",
		ClusterServiceName:   "eds1",
		TotalDroppedRequests: 1,
		DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
		UpstreamLocalityStats: []*v3endpointpb.UpstreamLocalityStats{
			{
				Locality: &v3corepb.Locality{Region: "test-region1"},
				LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{
					// TotalMetricValue is the aggregation of 3.14 + 2.718 = 5.858.
					{MetricName: testKey1, NumRequestsFinishedWithMetric: 2, TotalMetricValue: 5.858}},
				TotalSuccessfulRequests: 1,
				TotalIssuedRequests:     1,
			},
			{
				Locality: &v3corepb.Locality{Region: "test-region2"},
				LoadMetricStats: []*v3endpointpb.EndpointLoadMetricStats{
					{MetricName: testKey2, NumRequestsFinishedWithMetric: 1, TotalMetricValue: 1.618}},
				TotalSuccessfulRequests: 1,
				TotalIssuedRequests:     1,
			},
		},
	}
	if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform(), toleranceCmpOpt, ignoreOrderCmpOpt); diff != "" {
		t.Fatalf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
	}

	// Make another call to the load reporting API, and ensure that a new LRS
	// stream is not created.
	store2, cancel2 := client.ReportLoad(serverConfig)
	sCtx, sCancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer sCancel()
	if _, err := lrsServer.LRSStreamOpenChan.Receive(sCtx); err != context.DeadlineExceeded {
		t.Fatal("New LRS stream created when expected to use an existing one")
	}

	// Push more loads.
	store2.PerCluster("cluster2", "eds2").CallDropped("test")

	// Ensure that loads are seen on the server. We need a loop here because
	// there could have been some requests from the client in the time between
	// us reading the first request and now. Those would have been queued in the
	// request channel that we read out of.
	for {
		if ctx.Err() != nil {
			t.Fatalf("Timeout when waiting for new loads to be seen on the server")
		}

		req, err = lrsServer.LRSRequestChan.Receive(ctx)
		if err != nil {
			continue
		}
		gotLoad = req.(*fakeserver.Request).Req.(*v3lrspb.LoadStatsRequest).ClusterStats
		if l := len(gotLoad); l != 1 {
			continue
		}
		gotLoad[0].LoadReportInterval = nil
		wantLoad := &v3endpointpb.ClusterStats{
			ClusterName:          "cluster2",
			ClusterServiceName:   "eds2",
			TotalDroppedRequests: 1,
			DroppedRequests:      []*v3endpointpb.ClusterStats_DroppedRequests{{Category: "test", DroppedCount: 1}},
		}
		if diff := cmp.Diff(wantLoad, gotLoad[0], protocmp.Transform()); diff != "" {
			t.Logf("Unexpected diff in LRS request (-got, +want):\n%s", diff)
			continue
		}
		break
	}

	// Cancel the first load reporting call, and ensure that the stream does not
	// close (because we have another call open).
	cancel1()
	sCtx, sCancel = context.WithTimeout(context.Background(), defaultTestShortTimeout)
	defer sCancel()
	if _, err := lrsServer.LRSStreamCloseChan.Receive(sCtx); err != context.DeadlineExceeded {
		t.Fatal("LRS stream closed when expected to stay open")
	}

	// Cancel the second load reporting call, and ensure the stream is closed.
	cancel2()
	if _, err := lrsServer.LRSStreamCloseChan.Receive(ctx); err != nil {
		t.Fatal("Timeout waiting for LRS stream to close")
	}

	// Calling the load reporting API again should result in the creation of a
	// new LRS stream. This ensures that creating and closing multiple streams
	// works smoothly.
	_, cancel3 := client.ReportLoad(serverConfig)
	if _, err := lrsServer.LRSStreamOpenChan.Receive(ctx); err != nil {
		t.Fatalf("Timeout when waiting for LRS stream to be created: %v", err)
	}
	cancel3()
}