istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/telemetry/api/stats_test.go (about) 1 //go:build integ 2 // +build integ 3 4 // Copyright Istio Authors. All Rights Reserved. 5 // 6 // Licensed under the Apache License, Version 2.0 (the "License"); 7 // you may not use this file except in compliance with the License. 8 // You may obtain a copy of the License at 9 // 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 18 package api 19 20 import ( 21 "context" 22 "fmt" 23 "path/filepath" 24 "testing" 25 "time" 26 27 "golang.org/x/sync/errgroup" 28 29 "istio.io/istio/pkg/config/constants" 30 "istio.io/istio/pkg/test" 31 "istio.io/istio/pkg/test/echo/common/scheme" 32 "istio.io/istio/pkg/test/env" 33 "istio.io/istio/pkg/test/framework" 34 "istio.io/istio/pkg/test/framework/components/cluster" 35 "istio.io/istio/pkg/test/framework/components/echo" 36 "istio.io/istio/pkg/test/framework/components/echo/check" 37 cdeployment "istio.io/istio/pkg/test/framework/components/echo/common/deployment" 38 "istio.io/istio/pkg/test/framework/components/echo/common/ports" 39 "istio.io/istio/pkg/test/framework/components/echo/match" 40 "istio.io/istio/pkg/test/framework/components/prometheus" 41 "istio.io/istio/pkg/test/framework/label" 42 "istio.io/istio/pkg/test/framework/resource/config/apply" 43 "istio.io/istio/pkg/test/util/retry" 44 util "istio.io/istio/tests/integration/telemetry" 45 ) 46 47 var PeerAuthenticationConfig = ` 48 apiVersion: security.istio.io/v1beta1 49 kind: PeerAuthentication 50 metadata: 51 name: default 52 spec: 53 mtls: 54 mode: STRICT 55 ` 56 57 func GetClientInstances() echo.Instances { 58 return apps.A 59 } 60 61 func GetTarget() echo.Target { 62 return apps.B 63 } 64 65 // TestStatsFilter verifies the stats filter could emit expected client and server side 66 // metrics when configured with the Telemetry API (with EnvoyFilters disabled). 67 // This test focuses on stats filter and metadata exchange filter could work coherently with 68 // proxy bootstrap config with Wasm runtime. To avoid flake, it does not verify correctness 69 // of metrics, which should be covered by integration test in proxy repo. 70 func TestStatsFilter(t *testing.T) { 71 expectedBuckets := DefaultBucketCount 72 framework.NewTest(t). 73 Run(func(t framework.TestContext) { 74 // Enable strict mTLS. This is needed for mock secured prometheus scraping test. 75 t.ConfigIstio().YAML(ist.Settings().SystemNamespace, PeerAuthenticationConfig).ApplyOrFail(t) 76 g, _ := errgroup.WithContext(context.Background()) 77 for _, cltInstance := range GetClientInstances() { 78 cltInstance := cltInstance 79 g.Go(func() error { 80 err := retry.UntilSuccess(func() error { 81 if err := SendTraffic(cltInstance); err != nil { 82 return err 83 } 84 c := cltInstance.Config().Cluster 85 sourceCluster := constants.DefaultClusterName 86 if len(t.AllClusters()) > 1 { 87 sourceCluster = c.Name() 88 } 89 sourceQuery, destinationQuery, appQuery := buildQuery(sourceCluster) 90 // Query client side metrics 91 prom := promInst 92 if _, err := prom.QuerySum(c, sourceQuery); err != nil { 93 util.PromDiff(t, prom, c, sourceQuery) 94 return err 95 } 96 // Query client side metrics for non-injected server 97 outOfMeshServerQuery := buildOutOfMeshServerQuery(sourceCluster) 98 if _, err := prom.QuerySum(c, outOfMeshServerQuery); err != nil { 99 util.PromDiff(t, prom, c, outOfMeshServerQuery) 100 return err 101 } 102 // Query server side metrics. 103 if _, err := prom.QuerySum(c, destinationQuery); err != nil { 104 util.PromDiff(t, prom, c, destinationQuery) 105 return err 106 } 107 // This query will continue to increase due to readiness probe; don't wait for it to converge 108 if _, err := prom.QuerySum(c, appQuery); err != nil { 109 util.PromDiff(t, prom, c, appQuery) 110 return err 111 } 112 113 if err := ValidateBucket(c, prom, cltInstance.Config().Service, "source", expectedBuckets); err != nil { 114 return err 115 } 116 117 return nil 118 }, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout)) 119 if err != nil { 120 return err 121 } 122 return nil 123 }) 124 } 125 if err := g.Wait(); err != nil { 126 t.Fatalf("test failed: %v", err) 127 } 128 129 // In addition, verifies that mocked prometheus could call metrics endpoint with proxy provisioned certs 130 t.NewSubTest("mockprom-to-metrics").Run( 131 func(t framework.TestContext) { 132 for _, prom := range mockProm { 133 st := match.Cluster(prom.Config().Cluster).FirstOrFail(t, GetTarget().Instances()) 134 prom.CallOrFail(t, echo.CallOptions{ 135 ToWorkload: st, 136 Scheme: scheme.HTTPS, 137 Port: echo.Port{ServicePort: 15014}, 138 HTTP: echo.HTTP{ 139 Path: "/metrics", 140 }, 141 TLS: echo.TLS{ 142 CertFile: "/etc/certs/custom/cert-chain.pem", 143 KeyFile: "/etc/certs/custom/key.pem", 144 CaCertFile: "/etc/certs/custom/root-cert.pem", 145 InsecureSkipVerify: true, 146 }, 147 }) 148 } 149 }) 150 }) 151 } 152 153 // TestStatsTCPFilter includes common test logic for stats and metadataexchange filters running 154 // with nullvm and wasm runtime for TCP. 155 func TestStatsTCPFilter(t *testing.T) { 156 framework.NewTest(t). 157 Run(func(t framework.TestContext) { 158 g, _ := errgroup.WithContext(context.Background()) 159 for _, cltInstance := range GetClientInstances() { 160 cltInstance := cltInstance 161 g.Go(func() error { 162 err := retry.UntilSuccess(func() error { 163 if err := SendTCPTraffic(cltInstance); err != nil { 164 return err 165 } 166 c := cltInstance.Config().Cluster 167 sourceCluster := constants.DefaultClusterName 168 if len(t.AllClusters()) > 1 { 169 sourceCluster = c.Name() 170 } 171 destinationQuery := buildTCPQuery(sourceCluster) 172 if _, err := promInst.Query(c, destinationQuery); err != nil { 173 util.PromDiff(t, promInst, c, destinationQuery) 174 return err 175 } 176 177 return nil 178 }, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout)) 179 if err != nil { 180 return err 181 } 182 return nil 183 }) 184 } 185 if err := g.Wait(); err != nil { 186 t.Fatalf("test failed: %v", err) 187 } 188 }) 189 } 190 191 func TestStatsGatewayServerTCPFilter(t *testing.T) { 192 framework.NewTest(t). 193 Run(func(t framework.TestContext) { 194 base := filepath.Join(env.IstioSrc, "tests/integration/telemetry/testdata/") 195 // Following resources are being deployed to test sidecar->gateway communication. With following resources, 196 // routing is being setup from sidecar to external site, via egress gateway. 197 // clt(https:443) -> sidecar(tls:443) -> istio-mtls -> (TLS:443)egress-gateway-> vs(tcp:443) -> cnn.com 198 t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-dest-rule.yaml")).ApplyOrFail(t) 199 t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-gateway.yaml")).ApplyOrFail(t) 200 t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-vs.yaml")).ApplyOrFail(t) 201 202 // The main SE is available only to app namespace, make one the egress can access. 203 t.ConfigIstio().Eval(ist.Settings().SystemNamespace, map[string]any{ 204 "Namespace": apps.External.Namespace.Name(), 205 "Hostname": cdeployment.ExternalHostname, 206 }, `apiVersion: networking.istio.io/v1alpha3 207 kind: ServiceEntry 208 metadata: 209 name: external-service 210 spec: 211 exportTo: [.] 212 hosts: 213 - {{.Hostname}} 214 location: MESH_EXTERNAL 215 resolution: DNS 216 endpoints: 217 - address: external.{{.Namespace}}.svc.cluster.local 218 ports: 219 - name: https 220 number: 443 221 protocol: HTTPS 222 `).ApplyOrFail(t, apply.NoCleanup) 223 g, _ := errgroup.WithContext(context.Background()) 224 for _, cltInstance := range GetClientInstances() { 225 cltInstance := cltInstance 226 g.Go(func() error { 227 err := retry.UntilSuccess(func() error { 228 if _, err := cltInstance.Call(echo.CallOptions{ 229 Address: "fake.external.com", 230 Scheme: scheme.HTTPS, 231 Port: ports.HTTPS, 232 Count: 1, 233 Retry: echo.Retry{NoRetry: true}, // we do retry in outer loop 234 Check: check.OK(), 235 }); err != nil { 236 return err 237 } 238 239 c := cltInstance.Config().Cluster 240 sourceCluster := constants.DefaultClusterName 241 if len(t.AllClusters()) > 1 { 242 sourceCluster = c.Name() 243 } 244 destinationQuery := buildGatewayTCPServerQuery(sourceCluster) 245 if _, err := promInst.Query(c, destinationQuery); err != nil { 246 util.PromDiff(t, promInst, c, destinationQuery) 247 return err 248 } 249 return nil 250 }, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout)) 251 if err != nil { 252 t.Fatalf("test failed: %v", err) 253 } 254 return nil 255 }) 256 } 257 if err := g.Wait(); err != nil { 258 t.Fatalf("test failed: %v", err) 259 } 260 }) 261 } 262 263 // SendTraffic makes a client call to the "server" service on the http port. 264 func SendTraffic(from echo.Instance) error { 265 _, err := from.Call(echo.CallOptions{ 266 To: GetTarget(), 267 Port: echo.Port{ 268 Name: "http", 269 }, 270 Check: check.OK(), 271 Retry: echo.Retry{ 272 NoRetry: true, 273 }, 274 }) 275 if err != nil { 276 return err 277 } 278 _, err = from.Call(echo.CallOptions{ 279 To: apps.Naked, 280 Port: echo.Port{ 281 Name: "http", 282 }, 283 Retry: echo.Retry{ 284 NoRetry: true, 285 }, 286 }) 287 if err != nil { 288 return err 289 } 290 return nil 291 } 292 293 func SendTrafficOrFail(t test.Failer, from echo.Instance) { 294 from.CallOrFail(t, echo.CallOptions{ 295 To: GetTarget(), 296 Port: echo.Port{ 297 Name: "http", 298 }, 299 Check: check.OK(), 300 }) 301 from.CallOrFail(t, echo.CallOptions{ 302 To: apps.Naked, 303 Port: echo.Port{ 304 Name: "http", 305 }, 306 Retry: echo.Retry{ 307 NoRetry: true, 308 }, 309 }) 310 } 311 312 // SendTCPTraffic makes a client call to the "server" service on the tcp port. 313 func SendTCPTraffic(from echo.Instance) error { 314 _, err := from.Call(echo.CallOptions{ 315 To: GetTarget(), 316 Port: echo.Port{ 317 Name: "tcp", 318 }, 319 Retry: echo.Retry{ 320 NoRetry: true, 321 }, 322 }) 323 if err != nil { 324 return err 325 } 326 return nil 327 } 328 329 // BuildQueryCommon is the shared function to construct prom query for istio_request_total metric. 330 func BuildQueryCommon(labels map[string]string, ns string) (sourceQuery, destinationQuery, appQuery prometheus.Query) { 331 sourceQuery.Metric = "istio_requests_total" 332 sourceQuery.Labels = clone(labels) 333 sourceQuery.Labels["reporter"] = "source" 334 335 destinationQuery.Metric = "istio_requests_total" 336 destinationQuery.Labels = clone(labels) 337 destinationQuery.Labels["reporter"] = "destination" 338 339 appQuery.Metric = "istio_echo_http_requests_total" 340 appQuery.Labels = map[string]string{"namespace": ns} 341 342 return 343 } 344 345 func clone(labels map[string]string) map[string]string { 346 ret := map[string]string{} 347 for k, v := range labels { 348 ret[k] = v 349 } 350 return ret 351 } 352 353 func buildQuery(sourceCluster string) (sourceQuery, destinationQuery, appQuery prometheus.Query) { 354 ns := apps.Namespace 355 labels := map[string]string{ 356 "request_protocol": "http", 357 "response_code": "200", 358 "destination_app": "b", 359 "destination_version": "v1", 360 "destination_service": "b." + ns.Name() + ".svc.cluster.local", 361 "destination_service_name": "b", 362 "destination_workload_namespace": ns.Name(), 363 "destination_service_namespace": ns.Name(), 364 "source_app": "a", 365 "source_version": "v1", 366 "source_workload": "a-v1", 367 "source_workload_namespace": ns.Name(), 368 "source_cluster": sourceCluster, 369 } 370 371 return BuildQueryCommon(labels, ns.Name()) 372 } 373 374 func buildOutOfMeshServerQuery(sourceCluster string) prometheus.Query { 375 ns := apps.Namespace 376 labels := map[string]string{ 377 "request_protocol": "http", 378 "response_code": "200", 379 // For out of mesh server, client side metrics rely on endpoint resource metadata 380 // to fill in workload labels. To limit size of endpoint resource, we only populate 381 // workload name and namespace, canonical service name and version in endpoint metadata. 382 // Thus destination_app and destination_version labels are unknown. 383 // However, they are known with WDS, so we can relax this check. 384 // "destination_app": "unknown", 385 // "destination_version": "unknown", 386 "destination_service": "naked." + ns.Name() + ".svc.cluster.local", 387 "destination_service_name": "naked", 388 "destination_workload_namespace": ns.Name(), 389 "destination_service_namespace": ns.Name(), 390 "source_app": "a", 391 "source_version": "v1", 392 "source_workload": "a-v1", 393 "source_workload_namespace": ns.Name(), 394 "source_cluster": sourceCluster, 395 } 396 397 source, _, _ := BuildQueryCommon(labels, ns.Name()) 398 return source 399 } 400 401 func buildTCPQuery(sourceCluster string) (destinationQuery prometheus.Query) { 402 ns := apps.Namespace 403 labels := map[string]string{ 404 "request_protocol": "tcp", 405 "destination_service_name": "b", 406 "destination_canonical_revision": "v1", 407 "destination_canonical_service": "b", 408 "destination_app": "b", 409 "destination_version": "v1", 410 "destination_workload_namespace": ns.Name(), 411 "destination_service_namespace": ns.Name(), 412 "source_app": "a", 413 "source_version": "v1", 414 "source_workload": "a-v1", 415 "source_workload_namespace": ns.Name(), 416 "source_cluster": sourceCluster, 417 "reporter": "destination", 418 } 419 return prometheus.Query{ 420 Metric: "istio_tcp_connections_opened_total", 421 Labels: labels, 422 } 423 } 424 425 func buildGatewayTCPServerQuery(sourceCluster string) (destinationQuery prometheus.Query) { 426 ns := apps.Namespace 427 labels := map[string]string{ 428 "request_protocol": "tcp", 429 "destination_service_name": "istio-egressgateway", 430 "destination_canonical_revision": "latest", 431 "destination_canonical_service": "istio-egressgateway", 432 "destination_app": "istio-egressgateway", 433 // Does not play well with canonical revision which defaults to "latest". 434 // "destination_version": "unknown", 435 "destination_workload_namespace": "istio-system", 436 "destination_service_namespace": "istio-system", 437 "source_app": "a", 438 "source_version": "v1", 439 "source_workload": "a-v1", 440 "source_workload_namespace": ns.Name(), 441 "source_cluster": sourceCluster, 442 "reporter": "source", 443 } 444 return prometheus.Query{ 445 Metric: "istio_tcp_connections_opened_total", 446 Labels: labels, 447 } 448 } 449 450 func ValidateBucket(cluster cluster.Cluster, prom prometheus.Instance, sourceApp string, reporter string, expectedBuckets int) error { 451 return retry.UntilSuccess(func() error { 452 promQL := fmt.Sprintf(`count(sum by(le) (rate(istio_request_duration_milliseconds_bucket{source_app="%s",reporter="%s",response_code="200"}[24h])))`, 453 sourceApp, reporter) 454 v, err := prom.RawQuery(cluster, promQL) 455 if err != nil { 456 return err 457 } 458 totalBuckets, err := prometheus.Sum(v) 459 if err != nil { 460 return err 461 } 462 if int(totalBuckets) != expectedBuckets { 463 return fmt.Errorf("expected %d buckets, got %v", expectedBuckets, totalBuckets) 464 } 465 return nil 466 }, retry.Delay(time.Second), retry.Timeout(time.Second*20)) 467 } 468 469 // TestGRPCCountMetrics tests that istio_[request/response]_messages_total are present https://github.com/istio/istio/issues/44144 470 // Kiali depends on these metrics 471 func TestGRPCCountMetrics(t *testing.T) { 472 framework.NewTest(t). 473 Label(label.IPv4). // https://github.com/istio/istio/issues/35835 474 Run(func(t framework.TestContext) { 475 // Metrics to be queried and tested 476 metrics := []string{"istio_request_messages_total", "istio_response_messages_total"} 477 for _, metric := range metrics { 478 t.NewSubTestf(metric).Run(func(t framework.TestContext) { 479 t.Cleanup(func() { 480 if t.Failed() { 481 util.PromDump(t.Clusters().Default(), promInst, prometheus.Query{Metric: metric}) 482 } 483 grpcSourceQuery := buildGRPCQuery(metric) 484 cluster := t.Clusters().Default() 485 retry.UntilSuccessOrFail(t, func() error { 486 if err := SendGRPCTraffic(); err != nil { 487 t.Log("failed to send grpc traffic") 488 return err 489 } 490 if _, err := util.QueryPrometheus(t, cluster, grpcSourceQuery, promInst); err != nil { 491 util.PromDiff(t, promInst, cluster, grpcSourceQuery) 492 return err 493 } 494 return nil 495 }, retry.Delay(1*time.Second), retry.Timeout(300*time.Second)) 496 util.ValidateMetric(t, cluster, promInst, grpcSourceQuery, 1) 497 }) 498 }) 499 } 500 }) 501 } 502 503 func buildGRPCQuery(metric string) (destinationQuery prometheus.Query) { 504 ns := apps.Namespace 505 506 labels := map[string]string{ 507 "destination_app": "b", 508 "destination_version": "v1", 509 "destination_service": "b." + ns.Name() + ".svc.cluster.local", 510 "destination_service_name": "b", 511 "destination_workload_namespace": ns.Name(), 512 "destination_service_namespace": ns.Name(), 513 } 514 sourceQuery := prometheus.Query{} 515 sourceQuery.Metric = metric 516 sourceQuery.Labels = labels 517 518 return sourceQuery 519 } 520 521 func SendGRPCTraffic() error { 522 for _, cltInstance := range GetClientInstances() { 523 cltInstance := cltInstance 524 525 _, err := cltInstance.Call(echo.CallOptions{ 526 To: GetTarget(), 527 Port: echo.Port{ 528 Name: "grpc", 529 }, 530 }) 531 if err != nil { 532 return err 533 } 534 } 535 return nil 536 }