istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/telemetry/api/stats_test.go (about)

     1  //go:build integ
     2  // +build integ
     3  
     4  // Copyright Istio Authors. All Rights Reserved.
     5  //
     6  // Licensed under the Apache License, Version 2.0 (the "License");
     7  // you may not use this file except in compliance with the License.
     8  // You may obtain a copy of the License at
     9  //
    10  //     http://www.apache.org/licenses/LICENSE-2.0
    11  //
    12  // Unless required by applicable law or agreed to in writing, software
    13  // distributed under the License is distributed on an "AS IS" BASIS,
    14  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  // See the License for the specific language governing permissions and
    16  // limitations under the License.
    17  
    18  package api
    19  
    20  import (
    21  	"context"
    22  	"fmt"
    23  	"path/filepath"
    24  	"testing"
    25  	"time"
    26  
    27  	"golang.org/x/sync/errgroup"
    28  
    29  	"istio.io/istio/pkg/config/constants"
    30  	"istio.io/istio/pkg/test"
    31  	"istio.io/istio/pkg/test/echo/common/scheme"
    32  	"istio.io/istio/pkg/test/env"
    33  	"istio.io/istio/pkg/test/framework"
    34  	"istio.io/istio/pkg/test/framework/components/cluster"
    35  	"istio.io/istio/pkg/test/framework/components/echo"
    36  	"istio.io/istio/pkg/test/framework/components/echo/check"
    37  	cdeployment "istio.io/istio/pkg/test/framework/components/echo/common/deployment"
    38  	"istio.io/istio/pkg/test/framework/components/echo/common/ports"
    39  	"istio.io/istio/pkg/test/framework/components/echo/match"
    40  	"istio.io/istio/pkg/test/framework/components/prometheus"
    41  	"istio.io/istio/pkg/test/framework/label"
    42  	"istio.io/istio/pkg/test/framework/resource/config/apply"
    43  	"istio.io/istio/pkg/test/util/retry"
    44  	util "istio.io/istio/tests/integration/telemetry"
    45  )
    46  
    47  var PeerAuthenticationConfig = `
    48  apiVersion: security.istio.io/v1beta1
    49  kind: PeerAuthentication
    50  metadata:
    51    name: default
    52  spec:
    53    mtls:
    54      mode: STRICT
    55  `
    56  
    57  func GetClientInstances() echo.Instances {
    58  	return apps.A
    59  }
    60  
    61  func GetTarget() echo.Target {
    62  	return apps.B
    63  }
    64  
    65  // TestStatsFilter verifies the stats filter could emit expected client and server side
    66  // metrics when configured with the Telemetry API (with EnvoyFilters disabled).
    67  // This test focuses on stats filter and metadata exchange filter could work coherently with
    68  // proxy bootstrap config with Wasm runtime. To avoid flake, it does not verify correctness
    69  // of metrics, which should be covered by integration test in proxy repo.
    70  func TestStatsFilter(t *testing.T) {
    71  	expectedBuckets := DefaultBucketCount
    72  	framework.NewTest(t).
    73  		Run(func(t framework.TestContext) {
    74  			// Enable strict mTLS. This is needed for mock secured prometheus scraping test.
    75  			t.ConfigIstio().YAML(ist.Settings().SystemNamespace, PeerAuthenticationConfig).ApplyOrFail(t)
    76  			g, _ := errgroup.WithContext(context.Background())
    77  			for _, cltInstance := range GetClientInstances() {
    78  				cltInstance := cltInstance
    79  				g.Go(func() error {
    80  					err := retry.UntilSuccess(func() error {
    81  						if err := SendTraffic(cltInstance); err != nil {
    82  							return err
    83  						}
    84  						c := cltInstance.Config().Cluster
    85  						sourceCluster := constants.DefaultClusterName
    86  						if len(t.AllClusters()) > 1 {
    87  							sourceCluster = c.Name()
    88  						}
    89  						sourceQuery, destinationQuery, appQuery := buildQuery(sourceCluster)
    90  						// Query client side metrics
    91  						prom := promInst
    92  						if _, err := prom.QuerySum(c, sourceQuery); err != nil {
    93  							util.PromDiff(t, prom, c, sourceQuery)
    94  							return err
    95  						}
    96  						// Query client side metrics for non-injected server
    97  						outOfMeshServerQuery := buildOutOfMeshServerQuery(sourceCluster)
    98  						if _, err := prom.QuerySum(c, outOfMeshServerQuery); err != nil {
    99  							util.PromDiff(t, prom, c, outOfMeshServerQuery)
   100  							return err
   101  						}
   102  						// Query server side metrics.
   103  						if _, err := prom.QuerySum(c, destinationQuery); err != nil {
   104  							util.PromDiff(t, prom, c, destinationQuery)
   105  							return err
   106  						}
   107  						// This query will continue to increase due to readiness probe; don't wait for it to converge
   108  						if _, err := prom.QuerySum(c, appQuery); err != nil {
   109  							util.PromDiff(t, prom, c, appQuery)
   110  							return err
   111  						}
   112  
   113  						if err := ValidateBucket(c, prom, cltInstance.Config().Service, "source", expectedBuckets); err != nil {
   114  							return err
   115  						}
   116  
   117  						return nil
   118  					}, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout))
   119  					if err != nil {
   120  						return err
   121  					}
   122  					return nil
   123  				})
   124  			}
   125  			if err := g.Wait(); err != nil {
   126  				t.Fatalf("test failed: %v", err)
   127  			}
   128  
   129  			// In addition, verifies that mocked prometheus could call metrics endpoint with proxy provisioned certs
   130  			t.NewSubTest("mockprom-to-metrics").Run(
   131  				func(t framework.TestContext) {
   132  					for _, prom := range mockProm {
   133  						st := match.Cluster(prom.Config().Cluster).FirstOrFail(t, GetTarget().Instances())
   134  						prom.CallOrFail(t, echo.CallOptions{
   135  							ToWorkload: st,
   136  							Scheme:     scheme.HTTPS,
   137  							Port:       echo.Port{ServicePort: 15014},
   138  							HTTP: echo.HTTP{
   139  								Path: "/metrics",
   140  							},
   141  							TLS: echo.TLS{
   142  								CertFile:           "/etc/certs/custom/cert-chain.pem",
   143  								KeyFile:            "/etc/certs/custom/key.pem",
   144  								CaCertFile:         "/etc/certs/custom/root-cert.pem",
   145  								InsecureSkipVerify: true,
   146  							},
   147  						})
   148  					}
   149  				})
   150  		})
   151  }
   152  
   153  // TestStatsTCPFilter includes common test logic for stats and metadataexchange filters running
   154  // with nullvm and wasm runtime for TCP.
   155  func TestStatsTCPFilter(t *testing.T) {
   156  	framework.NewTest(t).
   157  		Run(func(t framework.TestContext) {
   158  			g, _ := errgroup.WithContext(context.Background())
   159  			for _, cltInstance := range GetClientInstances() {
   160  				cltInstance := cltInstance
   161  				g.Go(func() error {
   162  					err := retry.UntilSuccess(func() error {
   163  						if err := SendTCPTraffic(cltInstance); err != nil {
   164  							return err
   165  						}
   166  						c := cltInstance.Config().Cluster
   167  						sourceCluster := constants.DefaultClusterName
   168  						if len(t.AllClusters()) > 1 {
   169  							sourceCluster = c.Name()
   170  						}
   171  						destinationQuery := buildTCPQuery(sourceCluster)
   172  						if _, err := promInst.Query(c, destinationQuery); err != nil {
   173  							util.PromDiff(t, promInst, c, destinationQuery)
   174  							return err
   175  						}
   176  
   177  						return nil
   178  					}, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout))
   179  					if err != nil {
   180  						return err
   181  					}
   182  					return nil
   183  				})
   184  			}
   185  			if err := g.Wait(); err != nil {
   186  				t.Fatalf("test failed: %v", err)
   187  			}
   188  		})
   189  }
   190  
   191  func TestStatsGatewayServerTCPFilter(t *testing.T) {
   192  	framework.NewTest(t).
   193  		Run(func(t framework.TestContext) {
   194  			base := filepath.Join(env.IstioSrc, "tests/integration/telemetry/testdata/")
   195  			// Following resources are being deployed to test sidecar->gateway communication. With following resources,
   196  			// routing is being setup from sidecar to external site, via egress gateway.
   197  			// clt(https:443) -> sidecar(tls:443) -> istio-mtls -> (TLS:443)egress-gateway-> vs(tcp:443) -> cnn.com
   198  			t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-dest-rule.yaml")).ApplyOrFail(t)
   199  			t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-gateway.yaml")).ApplyOrFail(t)
   200  			t.ConfigIstio().File(apps.Namespace.Name(), filepath.Join(base, "istio-mtls-vs.yaml")).ApplyOrFail(t)
   201  
   202  			// The main SE is available only to app namespace, make one the egress can access.
   203  			t.ConfigIstio().Eval(ist.Settings().SystemNamespace, map[string]any{
   204  				"Namespace": apps.External.Namespace.Name(),
   205  				"Hostname":  cdeployment.ExternalHostname,
   206  			}, `apiVersion: networking.istio.io/v1alpha3
   207  kind: ServiceEntry
   208  metadata:
   209    name: external-service
   210  spec:
   211    exportTo: [.]
   212    hosts:
   213    - {{.Hostname}}
   214    location: MESH_EXTERNAL
   215    resolution: DNS
   216    endpoints:
   217    - address: external.{{.Namespace}}.svc.cluster.local
   218    ports:
   219    - name: https
   220      number: 443
   221      protocol: HTTPS
   222  `).ApplyOrFail(t, apply.NoCleanup)
   223  			g, _ := errgroup.WithContext(context.Background())
   224  			for _, cltInstance := range GetClientInstances() {
   225  				cltInstance := cltInstance
   226  				g.Go(func() error {
   227  					err := retry.UntilSuccess(func() error {
   228  						if _, err := cltInstance.Call(echo.CallOptions{
   229  							Address: "fake.external.com",
   230  							Scheme:  scheme.HTTPS,
   231  							Port:    ports.HTTPS,
   232  							Count:   1,
   233  							Retry:   echo.Retry{NoRetry: true}, // we do retry in outer loop
   234  							Check:   check.OK(),
   235  						}); err != nil {
   236  							return err
   237  						}
   238  
   239  						c := cltInstance.Config().Cluster
   240  						sourceCluster := constants.DefaultClusterName
   241  						if len(t.AllClusters()) > 1 {
   242  							sourceCluster = c.Name()
   243  						}
   244  						destinationQuery := buildGatewayTCPServerQuery(sourceCluster)
   245  						if _, err := promInst.Query(c, destinationQuery); err != nil {
   246  							util.PromDiff(t, promInst, c, destinationQuery)
   247  							return err
   248  						}
   249  						return nil
   250  					}, retry.Delay(framework.TelemetryRetryDelay), retry.Timeout(framework.TelemetryRetryTimeout))
   251  					if err != nil {
   252  						t.Fatalf("test failed: %v", err)
   253  					}
   254  					return nil
   255  				})
   256  			}
   257  			if err := g.Wait(); err != nil {
   258  				t.Fatalf("test failed: %v", err)
   259  			}
   260  		})
   261  }
   262  
   263  // SendTraffic makes a client call to the "server" service on the http port.
   264  func SendTraffic(from echo.Instance) error {
   265  	_, err := from.Call(echo.CallOptions{
   266  		To: GetTarget(),
   267  		Port: echo.Port{
   268  			Name: "http",
   269  		},
   270  		Check: check.OK(),
   271  		Retry: echo.Retry{
   272  			NoRetry: true,
   273  		},
   274  	})
   275  	if err != nil {
   276  		return err
   277  	}
   278  	_, err = from.Call(echo.CallOptions{
   279  		To: apps.Naked,
   280  		Port: echo.Port{
   281  			Name: "http",
   282  		},
   283  		Retry: echo.Retry{
   284  			NoRetry: true,
   285  		},
   286  	})
   287  	if err != nil {
   288  		return err
   289  	}
   290  	return nil
   291  }
   292  
   293  func SendTrafficOrFail(t test.Failer, from echo.Instance) {
   294  	from.CallOrFail(t, echo.CallOptions{
   295  		To: GetTarget(),
   296  		Port: echo.Port{
   297  			Name: "http",
   298  		},
   299  		Check: check.OK(),
   300  	})
   301  	from.CallOrFail(t, echo.CallOptions{
   302  		To: apps.Naked,
   303  		Port: echo.Port{
   304  			Name: "http",
   305  		},
   306  		Retry: echo.Retry{
   307  			NoRetry: true,
   308  		},
   309  	})
   310  }
   311  
   312  // SendTCPTraffic makes a client call to the "server" service on the tcp port.
   313  func SendTCPTraffic(from echo.Instance) error {
   314  	_, err := from.Call(echo.CallOptions{
   315  		To: GetTarget(),
   316  		Port: echo.Port{
   317  			Name: "tcp",
   318  		},
   319  		Retry: echo.Retry{
   320  			NoRetry: true,
   321  		},
   322  	})
   323  	if err != nil {
   324  		return err
   325  	}
   326  	return nil
   327  }
   328  
   329  // BuildQueryCommon is the shared function to construct prom query for istio_request_total metric.
   330  func BuildQueryCommon(labels map[string]string, ns string) (sourceQuery, destinationQuery, appQuery prometheus.Query) {
   331  	sourceQuery.Metric = "istio_requests_total"
   332  	sourceQuery.Labels = clone(labels)
   333  	sourceQuery.Labels["reporter"] = "source"
   334  
   335  	destinationQuery.Metric = "istio_requests_total"
   336  	destinationQuery.Labels = clone(labels)
   337  	destinationQuery.Labels["reporter"] = "destination"
   338  
   339  	appQuery.Metric = "istio_echo_http_requests_total"
   340  	appQuery.Labels = map[string]string{"namespace": ns}
   341  
   342  	return
   343  }
   344  
   345  func clone(labels map[string]string) map[string]string {
   346  	ret := map[string]string{}
   347  	for k, v := range labels {
   348  		ret[k] = v
   349  	}
   350  	return ret
   351  }
   352  
   353  func buildQuery(sourceCluster string) (sourceQuery, destinationQuery, appQuery prometheus.Query) {
   354  	ns := apps.Namespace
   355  	labels := map[string]string{
   356  		"request_protocol":               "http",
   357  		"response_code":                  "200",
   358  		"destination_app":                "b",
   359  		"destination_version":            "v1",
   360  		"destination_service":            "b." + ns.Name() + ".svc.cluster.local",
   361  		"destination_service_name":       "b",
   362  		"destination_workload_namespace": ns.Name(),
   363  		"destination_service_namespace":  ns.Name(),
   364  		"source_app":                     "a",
   365  		"source_version":                 "v1",
   366  		"source_workload":                "a-v1",
   367  		"source_workload_namespace":      ns.Name(),
   368  		"source_cluster":                 sourceCluster,
   369  	}
   370  
   371  	return BuildQueryCommon(labels, ns.Name())
   372  }
   373  
   374  func buildOutOfMeshServerQuery(sourceCluster string) prometheus.Query {
   375  	ns := apps.Namespace
   376  	labels := map[string]string{
   377  		"request_protocol": "http",
   378  		"response_code":    "200",
   379  		// For out of mesh server, client side metrics rely on endpoint resource metadata
   380  		// to fill in workload labels. To limit size of endpoint resource, we only populate
   381  		// workload name and namespace, canonical service name and version in endpoint metadata.
   382  		// Thus destination_app and destination_version labels are unknown.
   383  		// However, they are known with WDS, so we can relax this check.
   384  		// "destination_app":                "unknown",
   385  		// "destination_version":            "unknown",
   386  		"destination_service":            "naked." + ns.Name() + ".svc.cluster.local",
   387  		"destination_service_name":       "naked",
   388  		"destination_workload_namespace": ns.Name(),
   389  		"destination_service_namespace":  ns.Name(),
   390  		"source_app":                     "a",
   391  		"source_version":                 "v1",
   392  		"source_workload":                "a-v1",
   393  		"source_workload_namespace":      ns.Name(),
   394  		"source_cluster":                 sourceCluster,
   395  	}
   396  
   397  	source, _, _ := BuildQueryCommon(labels, ns.Name())
   398  	return source
   399  }
   400  
   401  func buildTCPQuery(sourceCluster string) (destinationQuery prometheus.Query) {
   402  	ns := apps.Namespace
   403  	labels := map[string]string{
   404  		"request_protocol":               "tcp",
   405  		"destination_service_name":       "b",
   406  		"destination_canonical_revision": "v1",
   407  		"destination_canonical_service":  "b",
   408  		"destination_app":                "b",
   409  		"destination_version":            "v1",
   410  		"destination_workload_namespace": ns.Name(),
   411  		"destination_service_namespace":  ns.Name(),
   412  		"source_app":                     "a",
   413  		"source_version":                 "v1",
   414  		"source_workload":                "a-v1",
   415  		"source_workload_namespace":      ns.Name(),
   416  		"source_cluster":                 sourceCluster,
   417  		"reporter":                       "destination",
   418  	}
   419  	return prometheus.Query{
   420  		Metric: "istio_tcp_connections_opened_total",
   421  		Labels: labels,
   422  	}
   423  }
   424  
   425  func buildGatewayTCPServerQuery(sourceCluster string) (destinationQuery prometheus.Query) {
   426  	ns := apps.Namespace
   427  	labels := map[string]string{
   428  		"request_protocol":               "tcp",
   429  		"destination_service_name":       "istio-egressgateway",
   430  		"destination_canonical_revision": "latest",
   431  		"destination_canonical_service":  "istio-egressgateway",
   432  		"destination_app":                "istio-egressgateway",
   433  		// Does not play well with canonical revision which defaults to "latest".
   434  		// "destination_version":            "unknown",
   435  		"destination_workload_namespace": "istio-system",
   436  		"destination_service_namespace":  "istio-system",
   437  		"source_app":                     "a",
   438  		"source_version":                 "v1",
   439  		"source_workload":                "a-v1",
   440  		"source_workload_namespace":      ns.Name(),
   441  		"source_cluster":                 sourceCluster,
   442  		"reporter":                       "source",
   443  	}
   444  	return prometheus.Query{
   445  		Metric: "istio_tcp_connections_opened_total",
   446  		Labels: labels,
   447  	}
   448  }
   449  
   450  func ValidateBucket(cluster cluster.Cluster, prom prometheus.Instance, sourceApp string, reporter string, expectedBuckets int) error {
   451  	return retry.UntilSuccess(func() error {
   452  		promQL := fmt.Sprintf(`count(sum by(le) (rate(istio_request_duration_milliseconds_bucket{source_app="%s",reporter="%s",response_code="200"}[24h])))`,
   453  			sourceApp, reporter)
   454  		v, err := prom.RawQuery(cluster, promQL)
   455  		if err != nil {
   456  			return err
   457  		}
   458  		totalBuckets, err := prometheus.Sum(v)
   459  		if err != nil {
   460  			return err
   461  		}
   462  		if int(totalBuckets) != expectedBuckets {
   463  			return fmt.Errorf("expected %d buckets, got %v", expectedBuckets, totalBuckets)
   464  		}
   465  		return nil
   466  	}, retry.Delay(time.Second), retry.Timeout(time.Second*20))
   467  }
   468  
   469  // TestGRPCCountMetrics tests that istio_[request/response]_messages_total are present https://github.com/istio/istio/issues/44144
   470  // Kiali depends on these metrics
   471  func TestGRPCCountMetrics(t *testing.T) {
   472  	framework.NewTest(t).
   473  		Label(label.IPv4). // https://github.com/istio/istio/issues/35835
   474  		Run(func(t framework.TestContext) {
   475  			// Metrics to be queried and tested
   476  			metrics := []string{"istio_request_messages_total", "istio_response_messages_total"}
   477  			for _, metric := range metrics {
   478  				t.NewSubTestf(metric).Run(func(t framework.TestContext) {
   479  					t.Cleanup(func() {
   480  						if t.Failed() {
   481  							util.PromDump(t.Clusters().Default(), promInst, prometheus.Query{Metric: metric})
   482  						}
   483  						grpcSourceQuery := buildGRPCQuery(metric)
   484  						cluster := t.Clusters().Default()
   485  						retry.UntilSuccessOrFail(t, func() error {
   486  							if err := SendGRPCTraffic(); err != nil {
   487  								t.Log("failed to send grpc traffic")
   488  								return err
   489  							}
   490  							if _, err := util.QueryPrometheus(t, cluster, grpcSourceQuery, promInst); err != nil {
   491  								util.PromDiff(t, promInst, cluster, grpcSourceQuery)
   492  								return err
   493  							}
   494  							return nil
   495  						}, retry.Delay(1*time.Second), retry.Timeout(300*time.Second))
   496  						util.ValidateMetric(t, cluster, promInst, grpcSourceQuery, 1)
   497  					})
   498  				})
   499  			}
   500  		})
   501  }
   502  
   503  func buildGRPCQuery(metric string) (destinationQuery prometheus.Query) {
   504  	ns := apps.Namespace
   505  
   506  	labels := map[string]string{
   507  		"destination_app":                "b",
   508  		"destination_version":            "v1",
   509  		"destination_service":            "b." + ns.Name() + ".svc.cluster.local",
   510  		"destination_service_name":       "b",
   511  		"destination_workload_namespace": ns.Name(),
   512  		"destination_service_namespace":  ns.Name(),
   513  	}
   514  	sourceQuery := prometheus.Query{}
   515  	sourceQuery.Metric = metric
   516  	sourceQuery.Labels = labels
   517  
   518  	return sourceQuery
   519  }
   520  
   521  func SendGRPCTraffic() error {
   522  	for _, cltInstance := range GetClientInstances() {
   523  		cltInstance := cltInstance
   524  
   525  		_, err := cltInstance.Call(echo.CallOptions{
   526  			To: GetTarget(),
   527  			Port: echo.Port{
   528  				Name: "grpc",
   529  			},
   530  		})
   531  		if err != nil {
   532  			return err
   533  		}
   534  	}
   535  	return nil
   536  }