istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/model/telemetry.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package model 16 17 import ( 18 "fmt" 19 "sort" 20 "strings" 21 "sync" 22 "time" 23 24 udpa "github.com/cncf/xds/go/udpa/type/v1" 25 accesslog "github.com/envoyproxy/go-control-plane/envoy/config/accesslog/v3" 26 listener "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3" 27 hcm "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/network/http_connection_manager/v3" 28 "google.golang.org/protobuf/types/known/anypb" 29 "google.golang.org/protobuf/types/known/durationpb" 30 "google.golang.org/protobuf/types/known/structpb" 31 wrappers "google.golang.org/protobuf/types/known/wrapperspb" 32 "k8s.io/apimachinery/pkg/types" 33 34 "istio.io/api/envoy/extensions/stats" 35 meshconfig "istio.io/api/mesh/v1alpha1" 36 tpb "istio.io/api/telemetry/v1alpha1" 37 "istio.io/istio/pilot/pkg/features" 38 "istio.io/istio/pilot/pkg/networking" 39 "istio.io/istio/pilot/pkg/util/protoconv" 40 "istio.io/istio/pkg/config/schema/gvk" 41 "istio.io/istio/pkg/config/xds" 42 "istio.io/istio/pkg/ptr" 43 "istio.io/istio/pkg/util/sets" 44 ) 45 46 // Telemetry holds configuration for Telemetry API resources. 
type Telemetry struct {
	Name      string         `json:"name"`
	Namespace string         `json:"namespace"`
	Spec      *tpb.Telemetry `json:"spec"`
}

// NamespacedName returns the namespace/name pair identifying this Telemetry resource.
func (t *Telemetry) NamespacedName() types.NamespacedName {
	return types.NamespacedName{Name: t.Name, Namespace: t.Namespace}
}

// Telemetries organizes Telemetry configuration by namespace.
type Telemetries struct {
	// Maps from namespace to the Telemetry configs.
	NamespaceToTelemetries map[string][]Telemetry `json:"namespace_to_telemetries"`

	// The name of the root namespace.
	RootNamespace string `json:"root_namespace"`

	// Computed meshConfig
	meshConfig *meshconfig.MeshConfig

	// computedMetricsFilters contains the set of cached HCM/listener filters for the metrics portion.
	// These filters are extremely costly, as we insert them into every listener on every proxy, and to
	// generate them we need to merge many telemetry specs and perform 2 Any marshals.
	// To improve performance, we store a cache based on the Telemetries that impacted the filter, as well as
	// its class and protocol. This is protected by mu.
	// Currently, this only applies to metrics, but a similar concept can likely be applied to logging and
	// tracing for performance.
	// The computedMetricsFilters lifetime is bound to the Telemetries object. During a push context
	// creation, we will preserve the Telemetries (and thus the cache) if no Telemetries are modified.
	// As a result, this cache will live until any Telemetry is modified.
	computedMetricsFilters map[metricsKey]any
	// computedLoggingConfig caches the access logging configuration computed by AccessLogging,
	// keyed by loggingKey. It is read and written while holding mu.
	computedLoggingConfig map[loggingKey][]LoggingConfig
	// mu guards computedMetricsFilters and computedLoggingConfig.
	mu sync.Mutex
}

// telemetryKey defines a key into the computedMetricsFilters cache.
84 type telemetryKey struct { 85 // Root stores the Telemetry in the root namespace, if any 86 Root types.NamespacedName 87 // Namespace stores the Telemetry in the root namespace, if any 88 Namespace types.NamespacedName 89 // Workload stores the Telemetry in the root namespace, if any 90 Workload types.NamespacedName 91 } 92 93 // loggingKey defines a key into the computedLoggingConfig cache. 94 type loggingKey struct { 95 telemetryKey 96 Class networking.ListenerClass 97 Protocol networking.ListenerProtocol 98 } 99 100 // metricsKey defines a key into the computedMetricsFilters cache. 101 type metricsKey struct { 102 telemetryKey 103 Class networking.ListenerClass 104 Protocol networking.ListenerProtocol 105 ProxyType NodeType 106 } 107 108 // getTelemetries returns the Telemetry configurations for the given environment. 109 func getTelemetries(env *Environment) *Telemetries { 110 telemetries := &Telemetries{ 111 NamespaceToTelemetries: map[string][]Telemetry{}, 112 RootNamespace: env.Mesh().GetRootNamespace(), 113 meshConfig: env.Mesh(), 114 computedMetricsFilters: map[metricsKey]any{}, 115 computedLoggingConfig: map[loggingKey][]LoggingConfig{}, 116 } 117 118 fromEnv := env.List(gvk.Telemetry, NamespaceAll) 119 sortConfigByCreationTime(fromEnv) 120 for _, config := range fromEnv { 121 telemetry := Telemetry{ 122 Name: config.Name, 123 Namespace: config.Namespace, 124 Spec: config.Spec.(*tpb.Telemetry), 125 } 126 telemetries.NamespaceToTelemetries[config.Namespace] = append(telemetries.NamespaceToTelemetries[config.Namespace], telemetry) 127 } 128 129 return telemetries 130 } 131 132 type metricsConfig struct { 133 ClientMetrics metricConfig 134 ServerMetrics metricConfig 135 ReportingInterval *durationpb.Duration 136 RotationInterval *durationpb.Duration 137 GracefulDeletionInterval *durationpb.Duration 138 } 139 140 type metricConfig struct { 141 // if true, do not add filter to chain 142 Disabled bool 143 Overrides []metricsOverride 144 } 145 146 type 
telemetryFilterConfig struct { 147 metricsConfig 148 Provider *meshconfig.MeshConfig_ExtensionProvider 149 Metrics bool 150 AccessLogging bool 151 LogsFilter *tpb.AccessLogging_Filter 152 NodeType NodeType 153 } 154 155 func (t telemetryFilterConfig) MetricsForClass(c networking.ListenerClass) metricConfig { 156 switch c { 157 case networking.ListenerClassGateway: 158 return t.ClientMetrics 159 case networking.ListenerClassSidecarInbound: 160 return t.ServerMetrics 161 case networking.ListenerClassSidecarOutbound: 162 return t.ClientMetrics 163 default: 164 return t.ClientMetrics 165 } 166 } 167 168 type metricsOverride struct { 169 Name string 170 Disabled bool 171 Tags []tagOverride 172 } 173 174 type tagOverride struct { 175 Name string 176 Remove bool 177 Value string 178 } 179 180 // computedTelemetries contains the various Telemetry configurations in scope for a given proxy. 181 // This can include the root namespace, namespace, and workload Telemetries combined 182 type computedTelemetries struct { 183 telemetryKey 184 Metrics []*tpb.Metrics 185 Logging []*computedAccessLogging 186 Tracing []*tpb.Tracing 187 } 188 189 // computedAccessLogging contains the various AccessLogging configurations in scope for a given proxy, 190 // include combined configurations for one of the following levels: 1. the root namespace level 191 // 2. namespace level 3. workload level combined. 
192 type computedAccessLogging struct { 193 telemetryKey 194 Logging []*tpb.AccessLogging 195 } 196 197 type TracingConfig struct { 198 ServerSpec TracingSpec 199 ClientSpec TracingSpec 200 } 201 202 type TracingSpec struct { 203 Provider *meshconfig.MeshConfig_ExtensionProvider 204 Disabled bool 205 RandomSamplingPercentage *float64 206 CustomTags map[string]*tpb.Tracing_CustomTag 207 UseRequestIDForTraceSampling bool 208 } 209 210 type LoggingConfig struct { 211 Disabled bool 212 AccessLog *accesslog.AccessLog 213 Provider *meshconfig.MeshConfig_ExtensionProvider 214 Filter *tpb.AccessLogging_Filter 215 } 216 217 type loggingSpec struct { 218 Disabled bool 219 Filter *tpb.AccessLogging_Filter 220 } 221 222 func workloadMode(class networking.ListenerClass) tpb.WorkloadMode { 223 switch class { 224 case networking.ListenerClassGateway: 225 return tpb.WorkloadMode_CLIENT 226 case networking.ListenerClassSidecarInbound: 227 return tpb.WorkloadMode_SERVER 228 case networking.ListenerClassSidecarOutbound: 229 return tpb.WorkloadMode_CLIENT 230 case networking.ListenerClassUndefined: 231 // this should not happen, just in case 232 return tpb.WorkloadMode_CLIENT 233 } 234 235 return tpb.WorkloadMode_CLIENT 236 } 237 238 // AccessLogging returns the logging configuration for a given proxy and listener class. 239 // If nil or empty configuration is returned, access logs are not configured via Telemetry and should use fallback mechanisms. 240 // If access logging is explicitly disabled, a configuration with disabled set to true is returned. 
241 func (t *Telemetries) AccessLogging(push *PushContext, proxy *Proxy, class networking.ListenerClass, svc *Service) []LoggingConfig { 242 ct := t.applicableTelemetries(proxy, nil) 243 if len(ct.Logging) == 0 && len(t.meshConfig.GetDefaultProviders().GetAccessLogging()) == 0 { 244 // No Telemetry API configured, fall back to legacy mesh config setting 245 return nil 246 } 247 248 key := loggingKey{ 249 telemetryKey: ct.telemetryKey, 250 Class: class, 251 } 252 t.mu.Lock() 253 defer t.mu.Unlock() 254 precomputed, ok := t.computedLoggingConfig[key] 255 if ok { 256 return precomputed 257 } 258 259 providers := mergeLogs(ct.Logging, t.meshConfig, workloadMode(class)) 260 cfgs := make([]LoggingConfig, 0, len(providers)) 261 for p, v := range providers { 262 fp := t.fetchProvider(p) 263 if fp == nil { 264 log.Debugf("fail to fetch provider %s", p) 265 continue 266 } 267 cfg := LoggingConfig{ 268 Provider: fp, 269 Filter: v.Filter, 270 Disabled: v.Disabled, 271 } 272 273 al := telemetryAccessLog(push, fp) 274 if al == nil { 275 // stackdriver will be handled in HTTPFilters/TCPFilters 276 continue 277 } 278 cfg.AccessLog = al 279 cfgs = append(cfgs, cfg) 280 } 281 282 t.computedLoggingConfig[key] = cfgs 283 return cfgs 284 } 285 286 // Tracing returns the logging tracing for a given proxy. If nil is returned, tracing 287 // are not configured via Telemetry and should use fallback mechanisms. If a non-nil but disabled is set, 288 // then tracing is explicitly disabled. 289 // A service can optionally be provided to include service-attached Telemetry config. 
290 func (t *Telemetries) Tracing(proxy *Proxy, svc *Service) *TracingConfig { 291 ct := t.applicableTelemetries(proxy, svc) 292 293 providerNames := t.meshConfig.GetDefaultProviders().GetTracing() 294 hasDefaultProvider := len(providerNames) > 0 295 296 if len(ct.Tracing) == 0 && !hasDefaultProvider { 297 return nil 298 } 299 300 clientSpec := TracingSpec{UseRequestIDForTraceSampling: true} 301 serverSpec := TracingSpec{UseRequestIDForTraceSampling: true} 302 303 if hasDefaultProvider { 304 // todo: what do we want to do with more than one default provider? 305 // for now, use only the first provider. 306 fetched := t.fetchProvider(providerNames[0]) 307 clientSpec.Provider = fetched 308 serverSpec.Provider = fetched 309 } 310 311 for _, m := range ct.Tracing { 312 names := getProviderNames(m.Providers) 313 314 specs := []*TracingSpec{&clientSpec, &serverSpec} 315 if m.Match != nil { 316 switch m.Match.Mode { 317 case tpb.WorkloadMode_CLIENT: 318 specs = []*TracingSpec{&clientSpec} 319 case tpb.WorkloadMode_SERVER: 320 specs = []*TracingSpec{&serverSpec} 321 } 322 } 323 324 if len(names) > 0 { 325 // NOTE: we only support a single provider per mode 326 // so, choosing the first provider returned in the list 327 // is the "safest" 328 fetched := t.fetchProvider(names[0]) 329 for _, spec := range specs { 330 spec.Provider = fetched 331 } 332 } 333 334 // Now merge in any overrides 335 if m.DisableSpanReporting != nil { 336 for _, spec := range specs { 337 spec.Disabled = m.DisableSpanReporting.GetValue() 338 } 339 } 340 // TODO: metrics overrides do a deep merge, but here we do a shallow merge. 341 // We should consider if we want to reconcile the two. 
342 if m.CustomTags != nil { 343 for _, spec := range specs { 344 spec.CustomTags = m.CustomTags 345 } 346 } 347 if m.RandomSamplingPercentage != nil { 348 for _, spec := range specs { 349 spec.RandomSamplingPercentage = ptr.Of(m.RandomSamplingPercentage.GetValue()) 350 } 351 } 352 if m.UseRequestIdForTraceSampling != nil { 353 for _, spec := range specs { 354 spec.UseRequestIDForTraceSampling = m.UseRequestIdForTraceSampling.Value 355 } 356 } 357 } 358 359 // If no provider is configured (and retrieved) for the tracing specs, 360 // then we will disable the configuration. 361 if clientSpec.Provider == nil { 362 clientSpec.Disabled = true 363 } 364 if serverSpec.Provider == nil { 365 serverSpec.Disabled = true 366 } 367 368 cfg := TracingConfig{ 369 ClientSpec: clientSpec, 370 ServerSpec: serverSpec, 371 } 372 return &cfg 373 } 374 375 // HTTPFilters computes the HttpFilter for a given proxy/class 376 func (t *Telemetries) HTTPFilters(proxy *Proxy, class networking.ListenerClass, svc *Service) []*hcm.HttpFilter { 377 if res := t.telemetryFilters(proxy, class, networking.ListenerProtocolHTTP, svc); res != nil { 378 return res.([]*hcm.HttpFilter) 379 } 380 return nil 381 } 382 383 // TCPFilters computes the TCPFilters for a given proxy/class 384 func (t *Telemetries) TCPFilters(proxy *Proxy, class networking.ListenerClass, svc *Service) []*listener.Filter { 385 if res := t.telemetryFilters(proxy, class, networking.ListenerProtocolTCP, svc); res != nil { 386 return res.([]*listener.Filter) 387 } 388 return nil 389 } 390 391 // applicableTelemetries fetches the relevant telemetry configurations for a given proxy 392 func (t *Telemetries) applicableTelemetries(proxy *Proxy, svc *Service) computedTelemetries { 393 if t == nil { 394 return computedTelemetries{} 395 } 396 397 namespace := proxy.ConfigNamespace 398 // Order here matters. 
The latter elements will override the first elements 399 ms := []*tpb.Metrics{} 400 ls := []*computedAccessLogging{} 401 ts := []*tpb.Tracing{} 402 key := telemetryKey{} 403 if t.RootNamespace != "" { 404 telemetry := t.namespaceWideTelemetryConfig(t.RootNamespace) 405 if telemetry != (Telemetry{}) { 406 key.Root = types.NamespacedName{Name: telemetry.Name, Namespace: telemetry.Namespace} 407 ms = append(ms, telemetry.Spec.GetMetrics()...) 408 if len(telemetry.Spec.GetAccessLogging()) != 0 { 409 ls = append(ls, &computedAccessLogging{ 410 telemetryKey: telemetryKey{ 411 Root: key.Root, 412 }, 413 Logging: telemetry.Spec.GetAccessLogging(), 414 }) 415 } 416 ts = append(ts, telemetry.Spec.GetTracing()...) 417 } 418 } 419 420 if namespace != t.RootNamespace { 421 telemetry := t.namespaceWideTelemetryConfig(namespace) 422 if telemetry != (Telemetry{}) { 423 key.Namespace = types.NamespacedName{Name: telemetry.Name, Namespace: telemetry.Namespace} 424 ms = append(ms, telemetry.Spec.GetMetrics()...) 425 if len(telemetry.Spec.GetAccessLogging()) != 0 { 426 ls = append(ls, &computedAccessLogging{ 427 telemetryKey: telemetryKey{ 428 Namespace: key.Namespace, 429 }, 430 Logging: telemetry.Spec.GetAccessLogging(), 431 }) 432 } 433 ts = append(ts, telemetry.Spec.GetTracing()...) 434 } 435 } 436 437 ct := &computedTelemetries{ 438 telemetryKey: key, 439 Metrics: ms, 440 Logging: ls, 441 Tracing: ts, 442 } 443 444 matcher := PolicyMatcherForProxy(proxy).WithService(svc) 445 for _, telemetry := range t.NamespaceToTelemetries[namespace] { 446 spec := telemetry.Spec 447 // TODO in many other places, empty selector matches all policy 448 if len(spec.GetSelector().GetMatchLabels()) == 0 { 449 continue 450 } 451 if matcher.ShouldAttachPolicy(gvk.Telemetry, telemetry.NamespacedName(), spec) { 452 ct = appendApplicableTelemetries(ct, telemetry, spec) 453 } else { 454 log.Debug("There isn't a match between the workload and the policy. 
Policy is ignored.") 455 } 456 } 457 458 return *ct 459 } 460 461 func appendApplicableTelemetries(ct *computedTelemetries, tel Telemetry, spec *tpb.Telemetry) *computedTelemetries { 462 ct.telemetryKey.Workload = types.NamespacedName{Name: tel.Name, Namespace: tel.Namespace} 463 ct.Metrics = append(ct.Metrics, spec.GetMetrics()...) 464 if len(tel.Spec.GetAccessLogging()) != 0 { 465 ct.Logging = append(ct.Logging, &computedAccessLogging{ 466 telemetryKey: telemetryKey{ 467 Workload: types.NamespacedName{Name: tel.Name, Namespace: tel.Namespace}, 468 }, 469 Logging: tel.Spec.GetAccessLogging(), 470 }) 471 } 472 ct.Tracing = append(ct.Tracing, spec.GetTracing()...) 473 474 return ct 475 } 476 477 // telemetryFilters computes the filters for the given proxy/class and protocol. This computes the 478 // set of applicable Telemetries, merges them, then translates to the appropriate filters based on the 479 // extension providers in the mesh config. Where possible, the result is cached. 480 // Currently, this includes metrics and access logging, as some providers are implemented in filters. 
481 func (t *Telemetries) telemetryFilters(proxy *Proxy, class networking.ListenerClass, protocol networking.ListenerProtocol, svc *Service) any { 482 if t == nil { 483 return nil 484 } 485 486 c := t.applicableTelemetries(proxy, svc) 487 488 key := metricsKey{ 489 telemetryKey: c.telemetryKey, 490 Class: class, 491 Protocol: protocol, 492 ProxyType: proxy.Type, 493 } 494 t.mu.Lock() 495 defer t.mu.Unlock() 496 precomputed, f := t.computedMetricsFilters[key] 497 if f { 498 return precomputed 499 } 500 501 // First, take all the metrics configs and transform them into a normalized form 502 tmm := mergeMetrics(c.Metrics, t.meshConfig) 503 log.Debugf("merged metrics, proxyID: %s metrics: %+v", proxy.ID, tmm) 504 // Additionally, fetch relevant access logging configurations 505 tml := mergeLogs(c.Logging, t.meshConfig, workloadMode(class)) 506 507 // The above result is in a nested map to deduplicate responses. This loses ordering, so we convert to 508 // a list to retain stable naming 509 allKeys := sets.New[string]() 510 for k, v := range tml { 511 if v.Disabled { 512 continue 513 } 514 allKeys.Insert(k) 515 } 516 for k := range tmm { 517 allKeys.Insert(k) 518 } 519 520 rotationInterval := getInterval(features.MetricRotationInterval, defaultMetricRotationInterval) 521 gracefulDeletionInterval := getInterval(features.MetricGracefulDeletionInterval, defaultMetricGracefulDeletionInterval) 522 523 m := make([]telemetryFilterConfig, 0, allKeys.Len()) 524 for _, k := range sets.SortedList(allKeys) { 525 p := t.fetchProvider(k) 526 if p == nil { 527 continue 528 } 529 loggingCfg, logging := tml[k] 530 mertricCfg, metrics := tmm[k] 531 532 mertricCfg.RotationInterval = rotationInterval 533 mertricCfg.GracefulDeletionInterval = gracefulDeletionInterval 534 535 cfg := telemetryFilterConfig{ 536 Provider: p, 537 metricsConfig: mertricCfg, 538 AccessLogging: logging && !loggingCfg.Disabled, 539 Metrics: metrics, 540 LogsFilter: tml[p.Name].Filter, 541 NodeType: proxy.Type, 542 } 
543 m = append(m, cfg) 544 } 545 546 var res any 547 // Finally, compute the actual filters based on the protoc 548 switch protocol { 549 case networking.ListenerProtocolHTTP: 550 res = buildHTTPTelemetryFilter(class, m) 551 default: 552 res = buildTCPTelemetryFilter(class, m) 553 } 554 555 // Update cache 556 t.computedMetricsFilters[key] = res 557 return res 558 } 559 560 // default value for metric rotation interval and graceful deletion interval, 561 // more details can be found in here: https://github.com/istio/proxy/blob/master/source/extensions/filters/http/istio_stats/config.proto#L116 562 var ( 563 defaultMetricRotationInterval = 0 * time.Second 564 defaultMetricGracefulDeletionInterval = 5 * time.Minute 565 ) 566 567 // getInterval return nil to reduce the size of the config, when equal to the default. 568 func getInterval(input, defaultValue time.Duration) *durationpb.Duration { 569 if input == defaultValue { 570 return nil 571 } 572 573 return durationpb.New(input) 574 } 575 576 // mergeLogs returns the set of providers for the given logging configuration. 577 // The provider names are mapped to any applicable access logging filter that has been applied in provider configuration. 578 func mergeLogs(logs []*computedAccessLogging, mesh *meshconfig.MeshConfig, mode tpb.WorkloadMode) map[string]loggingSpec { 579 providers := map[string]loggingSpec{} 580 581 if len(logs) == 0 { 582 for _, dp := range mesh.GetDefaultProviders().GetAccessLogging() { 583 // Insert the default provider. 584 providers[dp] = loggingSpec{} 585 } 586 return providers 587 } 588 providerNames := mesh.GetDefaultProviders().GetAccessLogging() 589 filters := map[string]loggingSpec{} 590 for _, m := range logs { 591 names := sets.New[string]() 592 for _, p := range m.Logging { 593 if !matchWorkloadMode(p.Match, mode) { 594 continue 595 } 596 subProviders := getProviderNames(p.Providers) 597 names.InsertAll(subProviders...) 
598 599 for _, prov := range subProviders { 600 filters[prov] = loggingSpec{ 601 Filter: p.Filter, 602 } 603 } 604 } 605 606 if names.Len() > 0 { 607 providerNames = names.UnsortedList() 608 } 609 } 610 inScopeProviders := sets.New(providerNames...) 611 612 parentProviders := mesh.GetDefaultProviders().GetAccessLogging() 613 for _, l := range logs { 614 for _, m := range l.Logging { 615 providerNames := getProviderNames(m.Providers) 616 if len(providerNames) == 0 { 617 providerNames = parentProviders 618 } 619 parentProviders = providerNames 620 for _, provider := range providerNames { 621 if !inScopeProviders.Contains(provider) { 622 // We don't care about this, remove it 623 // This occurs when a top level provider is later disabled by a lower level 624 continue 625 } 626 627 if !matchWorkloadMode(m.Match, mode) { 628 continue 629 } 630 631 // see UT: server - multi filters disabled 632 if m.GetDisabled().GetValue() { 633 providers[provider] = loggingSpec{Disabled: true} 634 continue 635 } 636 637 providers[provider] = filters[provider] 638 } 639 } 640 } 641 642 return providers 643 } 644 645 func matchWorkloadMode(selector *tpb.AccessLogging_LogSelector, mode tpb.WorkloadMode) bool { 646 if selector == nil { 647 return true 648 } 649 650 if selector.Mode == tpb.WorkloadMode_CLIENT_AND_SERVER { 651 return true 652 } 653 654 return selector.Mode == mode 655 } 656 657 func (t *Telemetries) namespaceWideTelemetryConfig(namespace string) Telemetry { 658 for _, tel := range t.NamespaceToTelemetries[namespace] { 659 if len(tel.Spec.GetSelector().GetMatchLabels()) == 0 { 660 return tel 661 } 662 } 663 return Telemetry{} 664 } 665 666 // fetchProvider finds the matching ExtensionProviders from the mesh config 667 func (t *Telemetries) fetchProvider(m string) *meshconfig.MeshConfig_ExtensionProvider { 668 for _, p := range t.meshConfig.ExtensionProviders { 669 if strings.EqualFold(m, p.Name) { 670 return p 671 } 672 } 673 return nil 674 } 675 676 func (t *Telemetries) 
Debug(proxy *Proxy) any { 677 // TODO we could use service targets + ambient index to include service-attached here 678 at := t.applicableTelemetries(proxy, nil) 679 return at 680 } 681 682 var allMetrics = func() []string { 683 r := make([]string, 0, len(tpb.MetricSelector_IstioMetric_value)) 684 for k := range tpb.MetricSelector_IstioMetric_value { 685 if k != tpb.MetricSelector_IstioMetric_name[int32(tpb.MetricSelector_ALL_METRICS)] { 686 r = append(r, k) 687 } 688 } 689 sort.Strings(r) 690 return r 691 }() 692 693 // mergeMetrics merges many Metrics objects into a normalized configuration 694 func mergeMetrics(metrics []*tpb.Metrics, mesh *meshconfig.MeshConfig) map[string]metricsConfig { 695 type metricOverride struct { 696 Disabled *wrappers.BoolValue 697 TagOverrides map[string]*tpb.MetricsOverrides_TagOverride 698 } 699 // provider -> mode -> metric -> overrides 700 providers := map[string]map[tpb.WorkloadMode]map[string]metricOverride{} 701 702 if len(metrics) == 0 { 703 for _, dp := range mesh.GetDefaultProviders().GetMetrics() { 704 // Insert the default provider. It has no overrides; presence of the key is sufficient to 705 // get the filter created. 706 providers[dp] = map[tpb.WorkloadMode]map[string]metricOverride{} 707 } 708 } 709 710 providerNames := mesh.GetDefaultProviders().GetMetrics() 711 for _, m := range metrics { 712 names := getProviderNames(m.Providers) 713 // If providers is set, it overrides the parent. If not, inherent from the parent. It is not a deep merge. 714 if len(names) > 0 { 715 providerNames = names 716 } 717 } 718 // Record the names of all providers we should configure. Anything else we will ignore 719 inScopeProviders := sets.New(providerNames...) 
720 721 parentProviders := mesh.GetDefaultProviders().GetMetrics() 722 disabledAllMetricsProviders := sets.New[string]() 723 reportingIntervals := map[string]*durationpb.Duration{} 724 for _, m := range metrics { 725 providerNames := getProviderNames(m.Providers) 726 // If providers is not set, use parent's 727 if len(providerNames) == 0 { 728 providerNames = parentProviders 729 } 730 731 reportInterval := m.GetReportingInterval() 732 parentProviders = providerNames 733 for _, provider := range providerNames { 734 if !inScopeProviders.Contains(provider) { 735 // We don't care about this, remove it 736 // This occurs when a top level provider is later disabled by a lower level 737 continue 738 } 739 740 if reportInterval != nil { 741 reportingIntervals[provider] = reportInterval 742 } 743 744 if _, f := providers[provider]; !f { 745 providers[provider] = map[tpb.WorkloadMode]map[string]metricOverride{ 746 tpb.WorkloadMode_CLIENT: {}, 747 tpb.WorkloadMode_SERVER: {}, 748 } 749 } 750 751 mp := providers[provider] 752 // For each override, we normalize the configuration. The metrics list is an ordered list - latter 753 // elements have precedence. As a result, we will apply updates on top of previous entries. 754 for _, o := range m.Overrides { 755 // if we disable all metrics, we should drop the entire filter 756 if isAllMetrics(o.GetMatch()) && o.Disabled.GetValue() { 757 for _, mode := range getModes(o.GetMatch().GetMode()) { 758 key := metricProviderModeKey(provider, mode) 759 disabledAllMetricsProviders.Insert(key) 760 } 761 762 continue 763 } 764 765 metricsNames := getMatches(o.GetMatch()) 766 // If client or server is set explicitly, only apply there. Otherwise, we will apply to both. 
767 // Note: client and server keys may end up the same, which is fine 768 for _, mode := range getModes(o.GetMatch().GetMode()) { 769 // root namespace disables all, but then enables them by namespace scoped 770 key := metricProviderModeKey(provider, mode) 771 disabledAllMetricsProviders.Delete(key) 772 // Next, get all matches. 773 // This is a bit funky because the matches are oneof of ENUM and customer metric. We normalize 774 // these to strings, so we may end up with a list like [REQUEST_COUNT, my-customer-metric]. 775 // TODO: we always flatten ALL_METRICS into each metric mode. For some stats providers (prometheus), 776 // we are able to apply overrides to all metrics directly rather than duplicating the config. 777 // We should tweak this to collapse to this mode where possible 778 for _, metricName := range metricsNames { 779 if _, f := mp[mode]; !f { 780 mp[mode] = map[string]metricOverride{} 781 } 782 override := mp[mode][metricName] 783 if o.Disabled != nil { 784 override.Disabled = o.Disabled 785 } 786 for k, v := range o.TagOverrides { 787 if override.TagOverrides == nil { 788 override.TagOverrides = map[string]*tpb.MetricsOverrides_TagOverride{} 789 } 790 override.TagOverrides[k] = v 791 } 792 mp[mode][metricName] = override 793 } 794 } 795 } 796 } 797 } 798 799 processed := map[string]metricsConfig{} 800 for provider, modeMap := range providers { 801 tmm := processed[provider] 802 tmm.ReportingInterval = reportingIntervals[provider] 803 804 for mode, metricMap := range modeMap { 805 key := metricProviderModeKey(provider, mode) 806 if disabledAllMetricsProviders.Contains(key) { 807 switch mode { 808 case tpb.WorkloadMode_CLIENT: 809 tmm.ClientMetrics.Disabled = true 810 case tpb.WorkloadMode_SERVER: 811 tmm.ServerMetrics.Disabled = true 812 } 813 continue 814 } 815 816 for metric, override := range metricMap { 817 tags := []tagOverride{} 818 for k, v := range override.TagOverrides { 819 o := tagOverride{Name: k} 820 switch v.Operation { 821 case 
tpb.MetricsOverrides_TagOverride_REMOVE: 822 o.Remove = true 823 o.Value = "" 824 case tpb.MetricsOverrides_TagOverride_UPSERT: 825 o.Value = v.GetValue() 826 o.Remove = false 827 } 828 tags = append(tags, o) 829 } 830 // Keep order deterministic 831 sort.Slice(tags, func(i, j int) bool { 832 return tags[i].Name < tags[j].Name 833 }) 834 mo := metricsOverride{ 835 Name: metric, 836 Disabled: override.Disabled.GetValue(), 837 Tags: tags, 838 } 839 840 switch mode { 841 case tpb.WorkloadMode_CLIENT: 842 tmm.ClientMetrics.Overrides = append(tmm.ClientMetrics.Overrides, mo) 843 default: 844 tmm.ServerMetrics.Overrides = append(tmm.ServerMetrics.Overrides, mo) 845 } 846 } 847 } 848 849 // Keep order deterministic 850 sort.Slice(tmm.ServerMetrics.Overrides, func(i, j int) bool { 851 return tmm.ServerMetrics.Overrides[i].Name < tmm.ServerMetrics.Overrides[j].Name 852 }) 853 sort.Slice(tmm.ClientMetrics.Overrides, func(i, j int) bool { 854 return tmm.ClientMetrics.Overrides[i].Name < tmm.ClientMetrics.Overrides[j].Name 855 }) 856 processed[provider] = tmm 857 } 858 return processed 859 } 860 861 func metricProviderModeKey(provider string, mode tpb.WorkloadMode) string { 862 return fmt.Sprintf("%s/%s", provider, mode) 863 } 864 865 func getProviderNames(providers []*tpb.ProviderRef) []string { 866 res := make([]string, 0, len(providers)) 867 for _, p := range providers { 868 res = append(res, p.GetName()) 869 } 870 return res 871 } 872 873 func getModes(mode tpb.WorkloadMode) []tpb.WorkloadMode { 874 switch mode { 875 case tpb.WorkloadMode_CLIENT, tpb.WorkloadMode_SERVER: 876 return []tpb.WorkloadMode{mode} 877 default: 878 return []tpb.WorkloadMode{tpb.WorkloadMode_CLIENT, tpb.WorkloadMode_SERVER} 879 } 880 } 881 882 func isAllMetrics(match *tpb.MetricSelector) bool { 883 switch m := match.GetMetricMatch().(type) { 884 case *tpb.MetricSelector_CustomMetric: 885 return false 886 case *tpb.MetricSelector_Metric: 887 return m.Metric == tpb.MetricSelector_ALL_METRICS 888 
default: 889 return true 890 } 891 } 892 893 func getMatches(match *tpb.MetricSelector) []string { 894 switch m := match.GetMetricMatch().(type) { 895 case *tpb.MetricSelector_CustomMetric: 896 return []string{m.CustomMetric} 897 case *tpb.MetricSelector_Metric: 898 if m.Metric == tpb.MetricSelector_ALL_METRICS { 899 return allMetrics 900 } 901 return []string{m.Metric.String()} 902 default: 903 return allMetrics 904 } 905 } 906 907 var waypointStatsConfig = protoconv.MessageToAny(&udpa.TypedStruct{ 908 TypeUrl: "type.googleapis.com/stats.PluginConfig", 909 Value: &structpb.Struct{ 910 Fields: map[string]*structpb.Value{ 911 "reporter": { 912 Kind: &structpb.Value_StringValue{ 913 StringValue: "SERVER_GATEWAY", 914 }, 915 }, 916 }, 917 }, 918 }) 919 920 // telemetryFilterHandled contains the number of providers we handle below. 921 // This is to ensure this stays in sync as new handlers are added 922 // STOP. DO NOT UPDATE THIS WITHOUT UPDATING buildHTTPTelemetryFilter and buildTCPTelemetryFilter. 
923 const telemetryFilterHandled = 14 924 925 func buildHTTPTelemetryFilter(class networking.ListenerClass, metricsCfg []telemetryFilterConfig) []*hcm.HttpFilter { 926 res := make([]*hcm.HttpFilter, 0, len(metricsCfg)) 927 for _, cfg := range metricsCfg { 928 switch cfg.Provider.GetProvider().(type) { 929 case *meshconfig.MeshConfig_ExtensionProvider_Prometheus: 930 if cfg.NodeType == Waypoint { 931 f := &hcm.HttpFilter{ 932 Name: xds.StatsFilterName, 933 ConfigType: &hcm.HttpFilter_TypedConfig{TypedConfig: waypointStatsConfig}, 934 } 935 res = append(res, f) 936 } else { 937 if statsCfg := generateStatsConfig(class, cfg); statsCfg != nil { 938 f := &hcm.HttpFilter{ 939 Name: xds.StatsFilterName, 940 ConfigType: &hcm.HttpFilter_TypedConfig{TypedConfig: statsCfg}, 941 } 942 res = append(res, f) 943 } 944 } 945 default: 946 // Only prometheus and SD supported currently 947 continue 948 } 949 } 950 return res 951 } 952 953 func buildTCPTelemetryFilter(class networking.ListenerClass, telemetryConfigs []telemetryFilterConfig) []*listener.Filter { 954 res := []*listener.Filter{} 955 for _, telemetryCfg := range telemetryConfigs { 956 switch telemetryCfg.Provider.GetProvider().(type) { 957 case *meshconfig.MeshConfig_ExtensionProvider_Prometheus: 958 if telemetryCfg.NodeType == Waypoint { 959 f := &listener.Filter{ 960 Name: xds.StatsFilterName, 961 ConfigType: &listener.Filter_TypedConfig{TypedConfig: waypointStatsConfig}, 962 } 963 res = append(res, f) 964 } else { 965 if cfg := generateStatsConfig(class, telemetryCfg); cfg != nil { 966 f := &listener.Filter{ 967 Name: xds.StatsFilterName, 968 ConfigType: &listener.Filter_TypedConfig{TypedConfig: cfg}, 969 } 970 res = append(res, f) 971 } 972 } 973 default: 974 // Only prometheus and SD supported currently 975 continue 976 } 977 } 978 return res 979 } 980 981 var metricToPrometheusMetric = map[string]string{ 982 "REQUEST_COUNT": "requests_total", 983 "REQUEST_DURATION": "request_duration_milliseconds", 984 
"REQUEST_SIZE": "request_bytes", 985 "RESPONSE_SIZE": "response_bytes", 986 "TCP_OPENED_CONNECTIONS": "tcp_connections_opened_total", 987 "TCP_CLOSED_CONNECTIONS": "tcp_connections_closed_total", 988 "TCP_SENT_BYTES": "tcp_sent_bytes_total", 989 "TCP_RECEIVED_BYTES": "tcp_received_bytes_total", 990 "GRPC_REQUEST_MESSAGES": "request_messages_total", 991 "GRPC_RESPONSE_MESSAGES": "response_messages_total", 992 } 993 994 func generateStatsConfig(class networking.ListenerClass, filterConfig telemetryFilterConfig) *anypb.Any { 995 if !filterConfig.Metrics { 996 // No metric for prometheus 997 return nil 998 } 999 1000 listenerCfg := filterConfig.MetricsForClass(class) 1001 if listenerCfg.Disabled { 1002 // no metrics for this listener 1003 return nil 1004 } 1005 1006 cfg := stats.PluginConfig{ 1007 DisableHostHeaderFallback: disableHostHeaderFallback(class), 1008 TcpReportingDuration: filterConfig.ReportingInterval, 1009 RotationInterval: filterConfig.RotationInterval, 1010 GracefulDeletionInterval: filterConfig.GracefulDeletionInterval, 1011 } 1012 1013 for _, override := range listenerCfg.Overrides { 1014 metricName, f := metricToPrometheusMetric[override.Name] 1015 if !f { 1016 // Not a predefined metric, must be a custom one 1017 metricName = override.Name 1018 } 1019 mc := &stats.MetricConfig{ 1020 Dimensions: map[string]string{}, 1021 Name: metricName, 1022 Drop: override.Disabled, 1023 } 1024 for _, t := range override.Tags { 1025 if t.Remove { 1026 mc.TagsToRemove = append(mc.TagsToRemove, t.Name) 1027 } else { 1028 mc.Dimensions[t.Name] = t.Value 1029 } 1030 } 1031 cfg.Metrics = append(cfg.Metrics, mc) 1032 } 1033 1034 return protoconv.MessageToAny(&cfg) 1035 } 1036 1037 func disableHostHeaderFallback(class networking.ListenerClass) bool { 1038 return class == networking.ListenerClassSidecarInbound || class == networking.ListenerClassGateway 1039 } 1040 1041 // Equal compares two computedTelemetries for equality. This was created to help with testing. 
Because of the nature of the structs being compared, 1042 // it is safer to use cmp.Equal as opposed to reflect.DeepEqual. Also, because of the way the structs are generated, it is not possible to use 1043 // cmpopts.IgnoreUnexported without risking flakiness if those third party types that are relied on change. Next best thing is to use a custom 1044 // comparer as defined below. When cmp.Equal is called on this type, this will be leveraged by cmp.Equal to do the comparison see 1045 // https://godoc.org/github.com/google/go-cmp/cmp#Equal for more info. 1046 func (ct *computedTelemetries) Equal(other *computedTelemetries) bool { 1047 if ct == nil && other == nil { 1048 return true 1049 } 1050 if ct != nil && other == nil || ct == nil && other != nil { 1051 return false 1052 } 1053 if len(ct.Metrics) != len(other.Metrics) || len(ct.Logging) != len(other.Logging) || len(ct.Tracing) != len(other.Tracing) { 1054 return false 1055 } 1056 // Sort each slice so that we can compare them in order. Comparison is on the fields that are used in the test cases. 
1057 sort.SliceStable(ct.Metrics, func(i, j int) bool { 1058 return ct.Metrics[i].Providers[0].Name < ct.Metrics[j].Providers[0].Name 1059 }) 1060 sort.SliceStable(other.Metrics, func(i, j int) bool { 1061 return other.Metrics[i].Providers[0].Name < other.Metrics[j].Providers[0].Name 1062 }) 1063 for i := range ct.Metrics { 1064 if ct.Metrics[i].ReportingInterval != nil && other.Metrics[i].ReportingInterval != nil { 1065 if ct.Metrics[i].ReportingInterval.AsDuration() != other.Metrics[i].ReportingInterval.AsDuration() { 1066 return false 1067 } 1068 } 1069 if ct.Metrics[i].Providers != nil && other.Metrics[i].Providers != nil { 1070 if ct.Metrics[i].Providers[0].Name != other.Metrics[i].Providers[0].Name { 1071 return false 1072 } 1073 } 1074 } 1075 sort.SliceStable(ct.Logging, func(i, j int) bool { 1076 return ct.Logging[i].telemetryKey.Root.Name < ct.Logging[j].telemetryKey.Root.Name 1077 }) 1078 sort.SliceStable(other.Logging, func(i, j int) bool { 1079 return other.Logging[i].telemetryKey.Root.Name < other.Logging[j].telemetryKey.Root.Name 1080 }) 1081 for i := range ct.Logging { 1082 if ct.Logging[i].telemetryKey != other.Logging[i].telemetryKey { 1083 return false 1084 } 1085 if ct.Logging[i].Logging != nil && other.Logging[i].Logging != nil { 1086 if ct.Logging[i].Logging[0].Providers[0].Name != other.Logging[i].Logging[0].Providers[0].Name { 1087 return false 1088 } 1089 } 1090 } 1091 sort.SliceStable(ct.Tracing, func(i, j int) bool { 1092 return ct.Tracing[i].Providers[0].Name < ct.Tracing[j].Providers[0].Name 1093 }) 1094 sort.SliceStable(other.Tracing, func(i, j int) bool { 1095 return other.Tracing[i].Providers[0].Name < other.Tracing[j].Providers[0].Name 1096 }) 1097 for i := range ct.Tracing { 1098 if ct.Tracing[i].Match != nil && other.Tracing[i].Match != nil { 1099 if ct.Tracing[i].Match.Mode != other.Tracing[i].Match.Mode { 1100 return false 1101 } 1102 } 1103 if ct.Tracing[i].Providers != nil && other.Tracing[i].Providers != nil { 1104 if 
ct.Tracing[i].Providers[0].Name != other.Tracing[i].Providers[0].Name { 1105 return false 1106 } 1107 } 1108 } 1109 return true 1110 }