istio.io/istio@v0.0.0-20240520182934-d79c90f27776/tests/integration/telemetry/api/dashboard_test.go (about) 1 //go:build integ 2 // +build integ 3 4 // Copyright Istio Authors 5 // 6 // Licensed under the Apache License, Version 2.0 (the "License"); 7 // you may not use this file except in compliance with the License. 8 // You may obtain a copy of the License at 9 // 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 18 package api 19 20 import ( 21 "context" 22 "encoding/json" 23 "fmt" 24 "os" 25 "path/filepath" 26 "strings" 27 "testing" 28 "time" 29 30 promv1 "github.com/prometheus/client_golang/api/prometheus/v1" 31 "github.com/prometheus/common/model" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 34 "istio.io/istio/pkg/config/protocol" 35 "istio.io/istio/pkg/http/headers" 36 "istio.io/istio/pkg/log" 37 "istio.io/istio/pkg/test/env" 38 "istio.io/istio/pkg/test/framework" 39 "istio.io/istio/pkg/test/framework/components/cluster" 40 "istio.io/istio/pkg/test/framework/components/echo" 41 "istio.io/istio/pkg/test/framework/components/echo/check" 42 "istio.io/istio/pkg/test/framework/components/prometheus" 43 "istio.io/istio/pkg/test/scopes" 44 "istio.io/istio/pkg/test/util/retry" 45 "istio.io/istio/pkg/test/util/yml" 46 ) 47 48 var dashboards = []struct { 49 configmap string 50 name string 51 excluded []string 52 requirePrimary bool 53 }{ 54 { 55 "istio-grafana-dashboards", 56 "pilot-dashboard.json", 57 []string{ 58 "pilot_xds_push_errors", 59 "pilot_total_xds_internal_errors", 60 "pilot_xds_push_context_errors", 61 `pilot_xds_pushes{type!~"lds|cds|rds|eds"}`, 62 // We do not push credentials in this test 63 
`pilot_xds_pushes{type="sds"}`, 64 "_timeout", 65 "_rejects", 66 // We do not simulate injection errors 67 "sidecar_injection_failure_total", 68 // In default install, we have no proxy 69 "istio-proxy", 70 // We do not simulate validation failed 71 "galley_validation_failed", 72 // cAdvisor does not expose this metrics, and we don't have kubelet in kind 73 "container_fs_usage_bytes", 74 // flakes: https://github.com/istio/istio/issues/29871 75 "container_memory_working_set_bytes", 76 "container_cpu_usage_seconds_total", 77 }, 78 // Pilot is installed only on Primary cluster, hence validate for primary clusters only. 79 true, 80 }, 81 { 82 "istio-services-grafana-dashboards", 83 "istio-mesh-dashboard.json", 84 []string{ 85 "galley_", 86 "istio_tcp_", 87 "max(pilot_k8s_cfg_events{", 88 }, 89 false, 90 }, 91 { 92 "istio-services-grafana-dashboards", 93 "istio-service-dashboard.json", 94 []string{ 95 "istio_tcp_", 96 }, 97 false, 98 }, 99 { 100 "istio-services-grafana-dashboards", 101 "istio-workload-dashboard.json", 102 []string{ 103 "istio_tcp_", 104 // there is no non-mtls traffic generated so the test flakes for the split query on 105 // "Outgoing Requests By Destination And Response Code" 106 "spiffe.*", 107 }, 108 false, 109 }, 110 { 111 "istio-grafana-dashboards", 112 "istio-performance-dashboard.json", 113 []string{ 114 // cAdvisor does not expose this metrics, and we don't have kubelet in kind 115 "container_fs_usage_bytes", 116 // flakes: https://github.com/istio/istio/issues/29871 117 "container_memory_working_set_bytes", 118 "container_cpu_usage_seconds_total", 119 }, 120 true, 121 }, 122 /* No longer using Wasm by default. 
123 { 124 "istio-services-grafana-dashboards", 125 "istio-extension-dashboard.json", 126 []string{ 127 "avg(envoy_wasm_envoy_wasm_runtime_v8_", 128 // flakes: https://github.com/istio/istio/issues/29871 129 "container_memory_working_set_bytes", 130 "container_cpu_usage_seconds_total", 131 }, 132 false, 133 }, 134 */ 135 } 136 137 func TestDashboard(t *testing.T) { 138 c, cancel := context.WithCancel(context.Background()) 139 defer cancel() 140 framework.NewTest(t). 141 Run(func(t framework.TestContext) { 142 p := promInst 143 144 t.ConfigIstio().YAML(apps.Namespace.Name(), fmt.Sprintf(gatewayConfig, apps.Namespace.Name())). 145 ApplyOrFail(t) 146 147 // Apply just the grafana dashboards 148 cfg, err := os.ReadFile(filepath.Join(env.IstioSrc, "samples/addons/grafana.yaml")) 149 if err != nil { 150 t.Fatal(err) 151 } 152 t.ConfigKube().YAML("istio-system", yml.SplitYamlByKind(string(cfg))["ConfigMap"]).ApplyOrFail(t) 153 154 // We will send a bunch of requests until the test exits. This ensures we are continuously 155 // getting new metrics ingested. If we just send a bunch at once, Prometheus may scrape them 156 // all in a single scrape which can lead to `rate()` not behaving correctly. 157 go setupDashboardTest(c.Done()) 158 for _, d := range dashboards { 159 d := d 160 t.NewSubTest(d.name).Run(func(t framework.TestContext) { 161 for _, cl := range t.Clusters() { 162 if !cl.IsPrimary() && d.requirePrimary { 163 // Skip verification of dashboards that won't be present on non primary(remote) clusters. 
164 continue 165 } 166 t.Logf("Verifying %s for cluster %s", d.name, cl.Name()) 167 cm, err := cl.Kube().CoreV1().ConfigMaps(ist.Settings().TelemetryNamespace).Get( 168 context.TODO(), d.configmap, metav1.GetOptions{}) 169 if err != nil { 170 t.Fatalf("Failed to find dashboard %v: %v", d.configmap, err) 171 } 172 173 config, f := cm.Data[d.name] 174 if !f { 175 t.Fatalf("Failed to find expected dashboard: %v", d.name) 176 } 177 178 queries, err := extractQueries(config) 179 if err != nil { 180 t.Fatalf("Failed to extract queries: %v", err) 181 } 182 183 for _, query := range queries { 184 retry.UntilSuccessOrFail(t, func() error { 185 return checkMetric(cl, p, query, d.excluded) 186 }, retry.Timeout(time.Minute)) 187 } 188 } 189 }) 190 } 191 }) 192 } 193 194 // Some templates use replacement variables. Instead, replace those with wildcard 195 var replacer = strings.NewReplacer( 196 "$dstns", ".*", 197 "$dstwl", ".*", 198 "$service", ".*", 199 "$srcns", ".*", 200 "$srcwl", ".*", 201 "$namespace", ".*", 202 "$workload", ".*", 203 "$dstsvc", ".*", 204 "$adapter", ".*", 205 "$qrep", "destination", 206 // Just allow all mTLS settings rather than trying to send mtls and plaintext 207 `connection_security_policy="unknown"`, `connection_security_policy=~".*"`, 208 `connection_security_policy="mutual_tls"`, `connection_security_policy=~".*"`, 209 `connection_security_policy!="mutual_tls"`, `connection_security_policy=~".*"`, 210 // Test runs in istio-system 211 `destination_workload_namespace!="istio-system"`, `destination_workload_namespace=~".*"`, 212 `source_workload_namespace!="istio-system"`, `source_workload_namespace=~".*"`, 213 ) 214 215 func checkMetric(cl cluster.Cluster, p prometheus.Instance, query string, excluded []string) error { 216 query = replacer.Replace(query) 217 value, _, err := p.APIForCluster(cl).QueryRange(context.Background(), query, promv1.Range{ 218 Start: time.Now().Add(-time.Minute), 219 End: time.Now(), 220 Step: time.Second, 221 }) 222 if err 
!= nil { 223 return fmt.Errorf("failure executing query (%s): %v", query, err) 224 } 225 if value == nil { 226 return fmt.Errorf("returned value should not be nil for '%s'", query) 227 } 228 numSamples := 0 229 switch v := value.(type) { 230 case model.Vector: 231 numSamples = v.Len() 232 case model.Matrix: 233 numSamples = v.Len() 234 case *model.Scalar: 235 numSamples = 1 236 default: 237 return fmt.Errorf("unknown metric value type: %T", v) 238 } 239 if includeQuery(query, excluded) { 240 if numSamples == 0 { 241 return fmt.Errorf("expected a metric value for '%s', found no samples: %#v", query, value) 242 } 243 } else { 244 if numSamples != 0 { 245 scopes.Framework.Infof("Filtered out metric '%v', but got samples: %v", query, numSamples) 246 } 247 } 248 return nil 249 } 250 251 const gatewayConfig = ` 252 apiVersion: networking.istio.io/v1alpha3 253 kind: Gateway 254 metadata: 255 name: echo-gateway 256 spec: 257 selector: 258 istio: ingressgateway 259 servers: 260 - port: 261 number: 80 262 name: http 263 protocol: HTTP 264 hosts: 265 - "*" 266 - port: 267 number: 31400 268 name: tcp 269 protocol: TCP 270 hosts: 271 - "*" 272 --- 273 apiVersion: networking.istio.io/v1alpha3 274 kind: VirtualService 275 metadata: 276 name: echo 277 spec: 278 hosts: 279 - "*" 280 gateways: 281 - echo-gateway 282 http: 283 - match: 284 - uri: 285 exact: /echo-%s 286 route: 287 - destination: 288 host: b 289 port: 290 number: 80 291 tcp: 292 - match: 293 - port: 31400 294 route: 295 - destination: 296 host: b 297 port: 298 number: 9090 299 ` 300 301 func setupDashboardTest(done <-chan struct{}) { 302 // Send 200 http requests, 20 tcp requests across goroutines, generating a variety of error codes. 
303 // Spread out over 20s so rate() queries will behave correctly 304 ticker := time.NewTicker(time.Second) 305 times := 0 306 for { 307 select { 308 case <-ticker.C: 309 times++ 310 scopes.Framework.Infof("sending traffic %v", times) 311 for _, ing := range ingr { 312 hosts, ports := ing.TCPAddresses() 313 host := hosts[0] 314 port := ports[0] 315 _, err := ing.Call(echo.CallOptions{ 316 Port: echo.Port{ 317 Protocol: protocol.HTTP, 318 }, 319 Count: 10, 320 HTTP: echo.HTTP{ 321 Path: fmt.Sprintf("/echo-%s?codes=418:10,520:15,200:75", apps.Namespace.Name()), 322 Headers: headers.New().WithHost("server").Build(), 323 }, 324 Check: check.NoError(), // Do not use check.OK since we expect non-200 325 Retry: echo.Retry{ 326 NoRetry: true, 327 }, 328 }) 329 if err != nil { 330 // Do not fail on errors since there may be initial startup errors 331 // These calls are not under tests, the dashboards are, so we can be leniant here 332 log.Warnf("requests failed: %v", err) 333 } 334 _, err = ing.Call(echo.CallOptions{ 335 Port: echo.Port{ 336 Protocol: protocol.TCP, 337 ServicePort: port, 338 }, 339 Address: host, 340 HTTP: echo.HTTP{ 341 Path: fmt.Sprintf("/echo-%s", apps.Namespace.Name()), 342 Headers: headers.New().WithHost("server").Build(), 343 }, 344 Check: check.OK(), 345 Retry: echo.Retry{ 346 NoRetry: true, 347 }, 348 }) 349 if err != nil { 350 // Do not fail on errors since there may be initial startup errors 351 // These calls are not under tests, the dashboards are, so we can be leniant here 352 log.Warnf("requests failed: %v", err) 353 } 354 } 355 case <-done: 356 scopes.Framework.Infof("done sending traffic after %v rounds", times) 357 return 358 } 359 } 360 } 361 362 // extractQueries pulls all prometheus queries out of a grafana dashboard 363 // Rather than importing the entire grafana API just for this test, do some shoddy json parsing 364 // Equivalent to the union of the jq commands: 365 // '.panels[].targets[]?.expr' and 
// '.panels[].panels[]?.targets[]?.expr'
//
// Panels whose "type" is "row" are skipped. When a panel carries a nested
// "panels" entry, targets are read from the nested panel(s) instead of the
// panel itself; the nested entry may be a single JSON object or an array of
// objects. Targets without an "expr" field are ignored.
//
// An error is returned when dash is not a JSON object, has no "panels" array,
// or contains a panel, target list, target or expr of an unexpected JSON type.
func extractQueries(dash string) ([]string, error) {
	js := map[string]any{}
	if err := json.Unmarshal([]byte(dash), &js); err != nil {
		return nil, err
	}
	panels, f := js["panels"]
	if !f {
		return nil, fmt.Errorf("failed to find panels in %v", dash)
	}
	panelsList, f := panels.([]any)
	if !f {
		return nil, fmt.Errorf("failed to find panelsList in type %T: %v", panels, panels)
	}

	var queries []string
	for _, p := range panelsList {
		pm, ok := p.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("failed to parse panel in type %T: %v", p, p)
		}
		if pm["type"] == "row" {
			continue
		}
		holders, err := dashboardTargetHolders(pm)
		if err != nil {
			return nil, err
		}
		for _, holder := range holders {
			exprs, err := dashboardTargetExprs(holder)
			if err != nil {
				return nil, err
			}
			queries = append(queries, exprs...)
		}
	}
	return queries, nil
}

// dashboardTargetHolders returns the JSON objects whose "targets" should be
// read for panel: the panel itself when it has no "panels" key, otherwise the
// nested panel object or every object in the nested panel array.
func dashboardTargetHolders(panel map[string]any) ([]map[string]any, error) {
	nested, ok := panel["panels"]
	if !ok {
		return []map[string]any{panel}, nil
	}
	switch v := nested.(type) {
	case map[string]any:
		return []map[string]any{v}, nil
	case []any:
		holders := make([]map[string]any, 0, len(v))
		for _, sp := range v {
			spm, ok := sp.(map[string]any)
			if !ok {
				return nil, fmt.Errorf("failed to parse sub-panel in type %T: %v", sp, sp)
			}
			holders = append(holders, spm)
		}
		return holders, nil
	default:
		return nil, fmt.Errorf("failed to find sub-panels in type %T: %v", nested, nested)
	}
}

// dashboardTargetExprs returns the "expr" string of every entry in
// holder["targets"]. A holder without "targets" yields no expressions.
func dashboardTargetExprs(holder map[string]any) ([]string, error) {
	targets, ok := holder["targets"]
	if !ok {
		return nil, nil
	}
	targetsList, ok := targets.([]any)
	if !ok {
		return nil, fmt.Errorf("failed to find targetsList in type %T: %v", targets, targets)
	}
	exprs := make([]string, 0, len(targetsList))
	for _, t := range targetsList {
		tm, ok := t.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("failed to parse target in type %T: %v", t, t)
		}
		expr, ok := tm["expr"]
		if !ok {
			continue
		}
		s, ok := expr.(string)
		if !ok {
			return nil, fmt.Errorf("failed to parse expr in type %T: %v", expr, expr)
		}
		exprs = append(exprs, s)
	}
	return exprs, nil
}

// includeQuery reports whether query must return samples: it is false as soon
// as query contains any of the substrings in excluded, and true otherwise.
func includeQuery(query string, excluded []string) bool {
	for _, f := range excluded {
		if strings.Contains(query, f) {
			return false
		}
	}
	return true
}