istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/telemetry/api/dashboard_test.go (about)

     1  //go:build integ
     2  // +build integ
     3  
     4  // Copyright Istio Authors
     5  //
     6  // Licensed under the Apache License, Version 2.0 (the "License");
     7  // you may not use this file except in compliance with the License.
     8  // You may obtain a copy of the License at
     9  //
    10  //     http://www.apache.org/licenses/LICENSE-2.0
    11  //
    12  // Unless required by applicable law or agreed to in writing, software
    13  // distributed under the License is distributed on an "AS IS" BASIS,
    14  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  // See the License for the specific language governing permissions and
    16  // limitations under the License.
    17  
    18  package api
    19  
    20  import (
    21  	"context"
    22  	"encoding/json"
    23  	"fmt"
    24  	"os"
    25  	"path/filepath"
    26  	"strings"
    27  	"testing"
    28  	"time"
    29  
    30  	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
    31  	"github.com/prometheus/common/model"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  
    34  	"istio.io/istio/pkg/config/protocol"
    35  	"istio.io/istio/pkg/http/headers"
    36  	"istio.io/istio/pkg/log"
    37  	"istio.io/istio/pkg/test/env"
    38  	"istio.io/istio/pkg/test/framework"
    39  	"istio.io/istio/pkg/test/framework/components/cluster"
    40  	"istio.io/istio/pkg/test/framework/components/echo"
    41  	"istio.io/istio/pkg/test/framework/components/echo/check"
    42  	"istio.io/istio/pkg/test/framework/components/prometheus"
    43  	"istio.io/istio/pkg/test/scopes"
    44  	"istio.io/istio/pkg/test/util/retry"
    45  	"istio.io/istio/pkg/test/util/yml"
    46  )
    47  
    48  var dashboards = []struct {
    49  	configmap      string
    50  	name           string
    51  	excluded       []string
    52  	requirePrimary bool
    53  }{
    54  	{
    55  		"istio-grafana-dashboards",
    56  		"pilot-dashboard.json",
    57  		[]string{
    58  			"pilot_xds_push_errors",
    59  			"pilot_total_xds_internal_errors",
    60  			"pilot_xds_push_context_errors",
    61  			`pilot_xds_pushes{type!~"lds|cds|rds|eds"}`,
    62  			// We do not push credentials in this test
    63  			`pilot_xds_pushes{type="sds"}`,
    64  			"_timeout",
    65  			"_rejects",
    66  			// We do not simulate injection errors
    67  			"sidecar_injection_failure_total",
    68  			// In default install, we have no proxy
    69  			"istio-proxy",
    70  			// We do not simulate validation failed
    71  			"galley_validation_failed",
    72  			// cAdvisor does not expose this metrics, and we don't have kubelet in kind
    73  			"container_fs_usage_bytes",
    74  			// flakes: https://github.com/istio/istio/issues/29871
    75  			"container_memory_working_set_bytes",
    76  			"container_cpu_usage_seconds_total",
    77  		},
    78  		// Pilot is installed only on Primary cluster, hence validate for primary clusters only.
    79  		true,
    80  	},
    81  	{
    82  		"istio-services-grafana-dashboards",
    83  		"istio-mesh-dashboard.json",
    84  		[]string{
    85  			"galley_",
    86  			"istio_tcp_",
    87  			"max(pilot_k8s_cfg_events{",
    88  		},
    89  		false,
    90  	},
    91  	{
    92  		"istio-services-grafana-dashboards",
    93  		"istio-service-dashboard.json",
    94  		[]string{
    95  			"istio_tcp_",
    96  		},
    97  		false,
    98  	},
    99  	{
   100  		"istio-services-grafana-dashboards",
   101  		"istio-workload-dashboard.json",
   102  		[]string{
   103  			"istio_tcp_",
   104  			// there is no non-mtls traffic generated so the test flakes for the split query on
   105  			// "Outgoing Requests By Destination And Response Code"
   106  			"spiffe.*",
   107  		},
   108  		false,
   109  	},
   110  	{
   111  		"istio-grafana-dashboards",
   112  		"istio-performance-dashboard.json",
   113  		[]string{
   114  			// cAdvisor does not expose this metrics, and we don't have kubelet in kind
   115  			"container_fs_usage_bytes",
   116  			// flakes: https://github.com/istio/istio/issues/29871
   117  			"container_memory_working_set_bytes",
   118  			"container_cpu_usage_seconds_total",
   119  		},
   120  		true,
   121  	},
   122  	/* No longer using Wasm by default.
   123  	{
   124  		"istio-services-grafana-dashboards",
   125  		"istio-extension-dashboard.json",
   126  		[]string{
   127  			"avg(envoy_wasm_envoy_wasm_runtime_v8_",
   128  			// flakes: https://github.com/istio/istio/issues/29871
   129  			"container_memory_working_set_bytes",
   130  			"container_cpu_usage_seconds_total",
   131  		},
   132  		false,
   133  	},
   134  	*/
   135  }
   136  
   137  func TestDashboard(t *testing.T) {
   138  	c, cancel := context.WithCancel(context.Background())
   139  	defer cancel()
   140  	framework.NewTest(t).
   141  		Run(func(t framework.TestContext) {
   142  			p := promInst
   143  
   144  			t.ConfigIstio().YAML(apps.Namespace.Name(), fmt.Sprintf(gatewayConfig, apps.Namespace.Name())).
   145  				ApplyOrFail(t)
   146  
   147  			// Apply just the grafana dashboards
   148  			cfg, err := os.ReadFile(filepath.Join(env.IstioSrc, "samples/addons/grafana.yaml"))
   149  			if err != nil {
   150  				t.Fatal(err)
   151  			}
   152  			t.ConfigKube().YAML("istio-system", yml.SplitYamlByKind(string(cfg))["ConfigMap"]).ApplyOrFail(t)
   153  
   154  			// We will send a bunch of requests until the test exits. This ensures we are continuously
   155  			// getting new metrics ingested. If we just send a bunch at once, Prometheus may scrape them
   156  			// all in a single scrape which can lead to `rate()` not behaving correctly.
   157  			go setupDashboardTest(c.Done())
   158  			for _, d := range dashboards {
   159  				d := d
   160  				t.NewSubTest(d.name).Run(func(t framework.TestContext) {
   161  					for _, cl := range t.Clusters() {
   162  						if !cl.IsPrimary() && d.requirePrimary {
   163  							// Skip verification of dashboards that won't be present on non primary(remote) clusters.
   164  							continue
   165  						}
   166  						t.Logf("Verifying %s for cluster %s", d.name, cl.Name())
   167  						cm, err := cl.Kube().CoreV1().ConfigMaps(ist.Settings().TelemetryNamespace).Get(
   168  							context.TODO(), d.configmap, metav1.GetOptions{})
   169  						if err != nil {
   170  							t.Fatalf("Failed to find dashboard %v: %v", d.configmap, err)
   171  						}
   172  
   173  						config, f := cm.Data[d.name]
   174  						if !f {
   175  							t.Fatalf("Failed to find expected dashboard: %v", d.name)
   176  						}
   177  
   178  						queries, err := extractQueries(config)
   179  						if err != nil {
   180  							t.Fatalf("Failed to extract queries: %v", err)
   181  						}
   182  
   183  						for _, query := range queries {
   184  							retry.UntilSuccessOrFail(t, func() error {
   185  								return checkMetric(cl, p, query, d.excluded)
   186  							}, retry.Timeout(time.Minute))
   187  						}
   188  					}
   189  				})
   190  			}
   191  		})
   192  }
   193  
// replacer rewrites a dashboard query so that it can be executed directly against Prometheus.
// checkMetric applies it to every query before running it.
//
// Some templates use replacement variables (such as $namespace or $workload). Those are replaced
// with the wildcard regex ".*", except for $qrep which becomes "destination". Label matchers that
// would restrict results to a particular mTLS mode, or that exclude the istio-system namespace,
// are widened to match everything.
var replacer = strings.NewReplacer(
	"$dstns", ".*",
	"$dstwl", ".*",
	"$service", ".*",
	"$srcns", ".*",
	"$srcwl", ".*",
	"$namespace", ".*",
	"$workload", ".*",
	"$dstsvc", ".*",
	"$adapter", ".*",
	"$qrep", "destination",
	// Just allow all mTLS settings rather than trying to send mtls and plaintext
	`connection_security_policy="unknown"`, `connection_security_policy=~".*"`,
	`connection_security_policy="mutual_tls"`, `connection_security_policy=~".*"`,
	`connection_security_policy!="mutual_tls"`, `connection_security_policy=~".*"`,
	// Test runs in istio-system
	`destination_workload_namespace!="istio-system"`, `destination_workload_namespace=~".*"`,
	`source_workload_namespace!="istio-system"`, `source_workload_namespace=~".*"`,
)
   214  
   215  func checkMetric(cl cluster.Cluster, p prometheus.Instance, query string, excluded []string) error {
   216  	query = replacer.Replace(query)
   217  	value, _, err := p.APIForCluster(cl).QueryRange(context.Background(), query, promv1.Range{
   218  		Start: time.Now().Add(-time.Minute),
   219  		End:   time.Now(),
   220  		Step:  time.Second,
   221  	})
   222  	if err != nil {
   223  		return fmt.Errorf("failure executing query (%s): %v", query, err)
   224  	}
   225  	if value == nil {
   226  		return fmt.Errorf("returned value should not be nil for '%s'", query)
   227  	}
   228  	numSamples := 0
   229  	switch v := value.(type) {
   230  	case model.Vector:
   231  		numSamples = v.Len()
   232  	case model.Matrix:
   233  		numSamples = v.Len()
   234  	case *model.Scalar:
   235  		numSamples = 1
   236  	default:
   237  		return fmt.Errorf("unknown metric value type: %T", v)
   238  	}
   239  	if includeQuery(query, excluded) {
   240  		if numSamples == 0 {
   241  			return fmt.Errorf("expected a metric value for '%s', found no samples: %#v", query, value)
   242  		}
   243  	} else {
   244  		if numSamples != 0 {
   245  			scopes.Framework.Infof("Filtered out metric '%v', but got samples: %v", query, numSamples)
   246  		}
   247  	}
   248  	return nil
   249  }
   250  
// gatewayConfig is a YAML template holding two resources: a Gateway named "echo-gateway"
// (selector "istio: ingressgateway") exposing HTTP on port 80 and TCP on port 31400, and a
// VirtualService named "echo" bound to that gateway. The VirtualService routes HTTP requests whose
// path is exactly /echo-%s to host "b" port 80, and TCP connections on port 31400 to host "b"
// port 9090. The single %s verb is meant to be filled in with fmt.Sprintf; TestDashboard passes
// the apps namespace name.
const gatewayConfig = `
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
  name: echo-gateway
spec:
  selector:
    istio: ingressgateway
  servers:
  - port:
      number: 80
      name: http
      protocol: HTTP
    hosts:
    - "*"
  - port:
      number: 31400
      name: tcp
      protocol: TCP
    hosts:
    - "*"
---
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: echo
spec:
  hosts:
  - "*"
  gateways:
  - echo-gateway
  http:
  - match:
    - uri:
        exact: /echo-%s
    route:
    - destination:
        host: b
        port:
          number: 80
  tcp:
  - match:
    - port: 31400
    route:
    - destination:
        host: b
        port:
          number: 9090
`
   300  
   301  func setupDashboardTest(done <-chan struct{}) {
   302  	// Send 200 http requests, 20 tcp requests across goroutines, generating a variety of error codes.
   303  	// Spread out over 20s so rate() queries will behave correctly
   304  	ticker := time.NewTicker(time.Second)
   305  	times := 0
   306  	for {
   307  		select {
   308  		case <-ticker.C:
   309  			times++
   310  			scopes.Framework.Infof("sending traffic %v", times)
   311  			for _, ing := range ingr {
   312  				hosts, ports := ing.TCPAddresses()
   313  				host := hosts[0]
   314  				port := ports[0]
   315  				_, err := ing.Call(echo.CallOptions{
   316  					Port: echo.Port{
   317  						Protocol: protocol.HTTP,
   318  					},
   319  					Count: 10,
   320  					HTTP: echo.HTTP{
   321  						Path:    fmt.Sprintf("/echo-%s?codes=418:10,520:15,200:75", apps.Namespace.Name()),
   322  						Headers: headers.New().WithHost("server").Build(),
   323  					},
   324  					Check: check.NoError(), // Do not use check.OK since we expect non-200
   325  					Retry: echo.Retry{
   326  						NoRetry: true,
   327  					},
   328  				})
   329  				if err != nil {
   330  					// Do not fail on errors since there may be initial startup errors
   331  					// These calls are not under tests, the dashboards are, so we can be leniant here
   332  					log.Warnf("requests failed: %v", err)
   333  				}
   334  				_, err = ing.Call(echo.CallOptions{
   335  					Port: echo.Port{
   336  						Protocol:    protocol.TCP,
   337  						ServicePort: port,
   338  					},
   339  					Address: host,
   340  					HTTP: echo.HTTP{
   341  						Path:    fmt.Sprintf("/echo-%s", apps.Namespace.Name()),
   342  						Headers: headers.New().WithHost("server").Build(),
   343  					},
   344  					Check: check.OK(),
   345  					Retry: echo.Retry{
   346  						NoRetry: true,
   347  					},
   348  				})
   349  				if err != nil {
   350  					// Do not fail on errors since there may be initial startup errors
   351  					// These calls are not under tests, the dashboards are, so we can be leniant here
   352  					log.Warnf("requests failed: %v", err)
   353  				}
   354  			}
   355  		case <-done:
   356  			scopes.Framework.Infof("done sending traffic after %v rounds", times)
   357  			return
   358  		}
   359  	}
   360  }
   361  
   362  // extractQueries pulls all prometheus queries out of a grafana dashboard
   363  // Rather than importing the entire grafana API just for this test, do some shoddy json parsing
   364  // Equivalent to the union of the jq commands:
   365  // '.panels[].targets[]?.expr' and '.panels[].panels[]?.targets[]?.expr'
   366  func extractQueries(dash string) ([]string, error) {
   367  	var queries []string
   368  	js := map[string]any{}
   369  	if err := json.Unmarshal([]byte(dash), &js); err != nil {
   370  		return nil, err
   371  	}
   372  	panels, f := js["panels"]
   373  	if !f {
   374  		return nil, fmt.Errorf("failed to find panels in %v", dash)
   375  	}
   376  	panelsList, f := panels.([]any)
   377  	if !f {
   378  		return nil, fmt.Errorf("failed to find panelsList in type %T: %v", panels, panels)
   379  	}
   380  	for _, p := range panelsList {
   381  		pm := p.(map[string]any)
   382  		if pm["type"] == "row" {
   383  			continue
   384  		}
   385  		subPanels, exist := pm["panels"]
   386  		var targets any
   387  		var f bool
   388  		if exist {
   389  			subpm := subPanels.(map[string]any)
   390  			targets, f = subpm["targets"]
   391  		} else {
   392  			targets, f = pm["targets"]
   393  		}
   394  		if !f {
   395  			continue
   396  		}
   397  		targetsList, f := targets.([]any)
   398  		if !f {
   399  			return nil, fmt.Errorf("failed to find targetsList in type %T: %v", targets, targets)
   400  		}
   401  		for _, t := range targetsList {
   402  			tm := t.(map[string]any)
   403  			expr, f := tm["expr"]
   404  			if !f {
   405  				continue
   406  			}
   407  			queries = append(queries, expr.(string))
   408  		}
   409  	}
   410  	return queries, nil
   411  }
   412  
   413  func includeQuery(query string, excluded []string) bool {
   414  	for _, f := range excluded {
   415  		if strings.Contains(query, f) {
   416  			return false
   417  		}
   418  	}
   419  	return true
   420  }