istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/model/telemetry.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package model
    16  
    17  import (
    18  	"fmt"
    19  	"sort"
    20  	"strings"
    21  	"sync"
    22  	"time"
    23  
    24  	udpa "github.com/cncf/xds/go/udpa/type/v1"
    25  	accesslog "github.com/envoyproxy/go-control-plane/envoy/config/accesslog/v3"
    26  	listener "github.com/envoyproxy/go-control-plane/envoy/config/listener/v3"
    27  	hcm "github.com/envoyproxy/go-control-plane/envoy/extensions/filters/network/http_connection_manager/v3"
    28  	"google.golang.org/protobuf/types/known/anypb"
    29  	"google.golang.org/protobuf/types/known/durationpb"
    30  	"google.golang.org/protobuf/types/known/structpb"
    31  	wrappers "google.golang.org/protobuf/types/known/wrapperspb"
    32  	"k8s.io/apimachinery/pkg/types"
    33  
    34  	"istio.io/api/envoy/extensions/stats"
    35  	meshconfig "istio.io/api/mesh/v1alpha1"
    36  	tpb "istio.io/api/telemetry/v1alpha1"
    37  	"istio.io/istio/pilot/pkg/features"
    38  	"istio.io/istio/pilot/pkg/networking"
    39  	"istio.io/istio/pilot/pkg/util/protoconv"
    40  	"istio.io/istio/pkg/config/schema/gvk"
    41  	"istio.io/istio/pkg/config/xds"
    42  	"istio.io/istio/pkg/ptr"
    43  	"istio.io/istio/pkg/util/sets"
    44  )
    45  
// Telemetry holds the configuration of a single Telemetry API resource:
// the resource's name and namespace together with its decoded spec.
// The zero value is used in this file as the "no Telemetry found" sentinel.
type Telemetry struct {
	Name      string         `json:"name"`
	Namespace string         `json:"namespace"`
	Spec      *tpb.Telemetry `json:"spec"`
}
    52  
    53  func (t *Telemetry) NamespacedName() types.NamespacedName {
    54  	return types.NamespacedName{Name: t.Name, Namespace: t.Namespace}
    55  }
    56  
// Telemetries organizes Telemetry configuration by namespace and caches the
// filters and logging configuration derived from it.
type Telemetries struct {
	// Maps from namespace to the Telemetry configs.
	NamespaceToTelemetries map[string][]Telemetry `json:"namespace_to_telemetries"`

	// The name of the root namespace.
	RootNamespace string `json:"root_namespace"`

	// Computed meshConfig
	meshConfig *meshconfig.MeshConfig

	// computedMetricsFilters contains the set of cached HCM/listener filters for the metrics portion.
	// These filters are extremely costly, as we insert them into every listener on every proxy, and to
	// generate them we need to merge many telemetry specs and perform 2 Any marshals.
	// To improve performance, we store a cache based on the Telemetries that impacted the filter, as well as
	// its class and protocol.
	// The computedMetricsFilters lifetime is bound to the Telemetries object. During a push context
	// creation, we will preserve the Telemetries (and thus the cache) if no Telemetries are modified.
	// As a result, this cache will live until any Telemetry is modified.
	//
	// computedLoggingConfig is the equivalent cache for access logging configuration, keyed by loggingKey.
	// A similar concept can likely be applied to tracing for performance.
	//
	// mu protects both computedMetricsFilters and computedLoggingConfig.
	computedMetricsFilters map[metricsKey]any
	computedLoggingConfig  map[loggingKey][]LoggingConfig
	mu                     sync.Mutex
}
    82  
// telemetryKey identifies the set of Telemetry resources that apply to a proxy.
// It is the common part of the keys into the computedMetricsFilters and
// computedLoggingConfig caches. A zero-valued field means no Telemetry applied
// at that level.
type telemetryKey struct {
	// Root stores the namespace-wide (selector-less) Telemetry in the root namespace, if any
	Root types.NamespacedName
	// Namespace stores the namespace-wide (selector-less) Telemetry in the proxy's config namespace, if any
	Namespace types.NamespacedName
	// Workload stores the Telemetry whose selector matched the workload, if any.
	// When several match, the last one appended wins.
	Workload types.NamespacedName
}
    92  
// loggingKey defines a key into the computedLoggingConfig cache: the applicable
// Telemetry resources plus the listener class and protocol.
type loggingKey struct {
	telemetryKey
	Class    networking.ListenerClass
	Protocol networking.ListenerProtocol
}
    99  
// metricsKey defines a key into the computedMetricsFilters cache: the applicable
// Telemetry resources plus the listener class, the listener protocol and the
// type of the proxy the filters are generated for.
type metricsKey struct {
	telemetryKey
	Class     networking.ListenerClass
	Protocol  networking.ListenerProtocol
	ProxyType NodeType
}
   107  
   108  // getTelemetries returns the Telemetry configurations for the given environment.
   109  func getTelemetries(env *Environment) *Telemetries {
   110  	telemetries := &Telemetries{
   111  		NamespaceToTelemetries: map[string][]Telemetry{},
   112  		RootNamespace:          env.Mesh().GetRootNamespace(),
   113  		meshConfig:             env.Mesh(),
   114  		computedMetricsFilters: map[metricsKey]any{},
   115  		computedLoggingConfig:  map[loggingKey][]LoggingConfig{},
   116  	}
   117  
   118  	fromEnv := env.List(gvk.Telemetry, NamespaceAll)
   119  	sortConfigByCreationTime(fromEnv)
   120  	for _, config := range fromEnv {
   121  		telemetry := Telemetry{
   122  			Name:      config.Name,
   123  			Namespace: config.Namespace,
   124  			Spec:      config.Spec.(*tpb.Telemetry),
   125  		}
   126  		telemetries.NamespaceToTelemetries[config.Namespace] = append(telemetries.NamespaceToTelemetries[config.Namespace], telemetry)
   127  	}
   128  
   129  	return telemetries
   130  }
   131  
// metricsConfig is the normalized metrics configuration for a single provider.
// ClientMetrics and ServerMetrics hold the per-mode settings; the interval
// fields are optional and left nil when unset.
type metricsConfig struct {
	ClientMetrics            metricConfig
	ServerMetrics            metricConfig
	ReportingInterval        *durationpb.Duration
	RotationInterval         *durationpb.Duration
	GracefulDeletionInterval *durationpb.Duration
}
   139  
// metricConfig is the metrics configuration for one workload mode (client or
// server) of a provider.
type metricConfig struct {
	// if true, do not add filter to chain
	Disabled  bool
	// Overrides lists the per-metric overrides to apply.
	Overrides []metricsOverride
}
   145  
// telemetryFilterConfig is the per-provider input used to build telemetry
// filters. It combines the merged metrics configuration with the access
// logging state for the same provider.
//
// Metrics and AccessLogging report whether the provider is in use for metrics
// and for (non-disabled) access logging respectively; LogsFilter is the access
// logging filter configured for the provider, if any; NodeType is the type of
// the proxy the filter is generated for.
type telemetryFilterConfig struct {
	metricsConfig
	Provider      *meshconfig.MeshConfig_ExtensionProvider
	Metrics       bool
	AccessLogging bool
	LogsFilter    *tpb.AccessLogging_Filter
	NodeType      NodeType
}
   154  
   155  func (t telemetryFilterConfig) MetricsForClass(c networking.ListenerClass) metricConfig {
   156  	switch c {
   157  	case networking.ListenerClassGateway:
   158  		return t.ClientMetrics
   159  	case networking.ListenerClassSidecarInbound:
   160  		return t.ServerMetrics
   161  	case networking.ListenerClassSidecarOutbound:
   162  		return t.ClientMetrics
   163  	default:
   164  		return t.ClientMetrics
   165  	}
   166  }
   167  
// metricsOverride is the normalized override for a single metric, identified
// by Name: whether the metric is disabled and which tag overrides apply to it.
type metricsOverride struct {
	Name     string
	Disabled bool
	Tags     []tagOverride
}
   173  
// tagOverride is the normalized override for a single metric tag, identified
// by Name. Remove is set for a removal, in which case Value is empty;
// otherwise Value carries the value to upsert.
type tagOverride struct {
	Name   string
	Remove bool
	Value  string
}
   179  
// computedTelemetries contains the various Telemetry configurations in scope for a given proxy.
// This can include the root namespace, namespace, and workload Telemetries combined.
// The embedded telemetryKey records which Telemetry resources contributed. The slices are
// ordered from least to most specific, so later elements override earlier ones.
type computedTelemetries struct {
	telemetryKey
	Metrics []*tpb.Metrics
	Logging []*computedAccessLogging
	Tracing []*tpb.Tracing
}
   188  
// computedAccessLogging contains the AccessLogging configurations contributed by a single
// Telemetry resource in scope for a given proxy, at one of the following levels:
// 1. the root namespace level 2. the namespace level 3. the workload level.
// The embedded telemetryKey has only the field for that level populated.
type computedAccessLogging struct {
	telemetryKey
	Logging []*tpb.AccessLogging
}
   196  
// TracingConfig is the merged tracing configuration for a proxy, split into
// the spec applying in server mode and the spec applying in client mode.
type TracingConfig struct {
	ServerSpec TracingSpec
	ClientSpec TracingSpec
}
   201  
// TracingSpec is the merged tracing configuration for a single workload mode.
//
// Provider is the resolved extension provider; it is nil when none is
// configured or when the configured name cannot be found in the mesh config.
// RandomSamplingPercentage and CustomTags are nil unless a Telemetry set them.
type TracingSpec struct {
	Provider                     *meshconfig.MeshConfig_ExtensionProvider
	Disabled                     bool
	RandomSamplingPercentage     *float64
	CustomTags                   map[string]*tpb.Tracing_CustomTag
	UseRequestIDForTraceSampling bool
}
   209  
// LoggingConfig is the access logging configuration for a single provider:
// whether logging to the provider is disabled, the access log built for it,
// the resolved extension provider, and the optional filter expression.
type LoggingConfig struct {
	Disabled  bool
	AccessLog *accesslog.AccessLog
	Provider  *meshconfig.MeshConfig_ExtensionProvider
	Filter    *tpb.AccessLogging_Filter
}
   216  
// loggingSpec is the merged access logging state for one provider, as produced
// by mergeLogs: either explicitly disabled, or enabled with an optional filter.
type loggingSpec struct {
	Disabled bool
	Filter   *tpb.AccessLogging_Filter
}
   221  
   222  func workloadMode(class networking.ListenerClass) tpb.WorkloadMode {
   223  	switch class {
   224  	case networking.ListenerClassGateway:
   225  		return tpb.WorkloadMode_CLIENT
   226  	case networking.ListenerClassSidecarInbound:
   227  		return tpb.WorkloadMode_SERVER
   228  	case networking.ListenerClassSidecarOutbound:
   229  		return tpb.WorkloadMode_CLIENT
   230  	case networking.ListenerClassUndefined:
   231  		// this should not happen, just in case
   232  		return tpb.WorkloadMode_CLIENT
   233  	}
   234  
   235  	return tpb.WorkloadMode_CLIENT
   236  }
   237  
   238  // AccessLogging returns the logging configuration for a given proxy and listener class.
   239  // If nil or empty configuration is returned, access logs are not configured via Telemetry and should use fallback mechanisms.
   240  // If access logging is explicitly disabled, a configuration with disabled set to true is returned.
   241  func (t *Telemetries) AccessLogging(push *PushContext, proxy *Proxy, class networking.ListenerClass, svc *Service) []LoggingConfig {
   242  	ct := t.applicableTelemetries(proxy, nil)
   243  	if len(ct.Logging) == 0 && len(t.meshConfig.GetDefaultProviders().GetAccessLogging()) == 0 {
   244  		// No Telemetry API configured, fall back to legacy mesh config setting
   245  		return nil
   246  	}
   247  
   248  	key := loggingKey{
   249  		telemetryKey: ct.telemetryKey,
   250  		Class:        class,
   251  	}
   252  	t.mu.Lock()
   253  	defer t.mu.Unlock()
   254  	precomputed, ok := t.computedLoggingConfig[key]
   255  	if ok {
   256  		return precomputed
   257  	}
   258  
   259  	providers := mergeLogs(ct.Logging, t.meshConfig, workloadMode(class))
   260  	cfgs := make([]LoggingConfig, 0, len(providers))
   261  	for p, v := range providers {
   262  		fp := t.fetchProvider(p)
   263  		if fp == nil {
   264  			log.Debugf("fail to fetch provider %s", p)
   265  			continue
   266  		}
   267  		cfg := LoggingConfig{
   268  			Provider: fp,
   269  			Filter:   v.Filter,
   270  			Disabled: v.Disabled,
   271  		}
   272  
   273  		al := telemetryAccessLog(push, fp)
   274  		if al == nil {
   275  			// stackdriver will be handled in HTTPFilters/TCPFilters
   276  			continue
   277  		}
   278  		cfg.AccessLog = al
   279  		cfgs = append(cfgs, cfg)
   280  	}
   281  
   282  	t.computedLoggingConfig[key] = cfgs
   283  	return cfgs
   284  }
   285  
   286  // Tracing returns the logging tracing for a given proxy. If nil is returned, tracing
   287  // are not configured via Telemetry and should use fallback mechanisms. If a non-nil but disabled is set,
   288  // then tracing is explicitly disabled.
   289  // A service can optionally be provided to include service-attached Telemetry config.
   290  func (t *Telemetries) Tracing(proxy *Proxy, svc *Service) *TracingConfig {
   291  	ct := t.applicableTelemetries(proxy, svc)
   292  
   293  	providerNames := t.meshConfig.GetDefaultProviders().GetTracing()
   294  	hasDefaultProvider := len(providerNames) > 0
   295  
   296  	if len(ct.Tracing) == 0 && !hasDefaultProvider {
   297  		return nil
   298  	}
   299  
   300  	clientSpec := TracingSpec{UseRequestIDForTraceSampling: true}
   301  	serverSpec := TracingSpec{UseRequestIDForTraceSampling: true}
   302  
   303  	if hasDefaultProvider {
   304  		// todo: what do we want to do with more than one default provider?
   305  		// for now, use only the first provider.
   306  		fetched := t.fetchProvider(providerNames[0])
   307  		clientSpec.Provider = fetched
   308  		serverSpec.Provider = fetched
   309  	}
   310  
   311  	for _, m := range ct.Tracing {
   312  		names := getProviderNames(m.Providers)
   313  
   314  		specs := []*TracingSpec{&clientSpec, &serverSpec}
   315  		if m.Match != nil {
   316  			switch m.Match.Mode {
   317  			case tpb.WorkloadMode_CLIENT:
   318  				specs = []*TracingSpec{&clientSpec}
   319  			case tpb.WorkloadMode_SERVER:
   320  				specs = []*TracingSpec{&serverSpec}
   321  			}
   322  		}
   323  
   324  		if len(names) > 0 {
   325  			// NOTE: we only support a single provider per mode
   326  			// so, choosing the first provider returned in the list
   327  			// is the "safest"
   328  			fetched := t.fetchProvider(names[0])
   329  			for _, spec := range specs {
   330  				spec.Provider = fetched
   331  			}
   332  		}
   333  
   334  		// Now merge in any overrides
   335  		if m.DisableSpanReporting != nil {
   336  			for _, spec := range specs {
   337  				spec.Disabled = m.DisableSpanReporting.GetValue()
   338  			}
   339  		}
   340  		// TODO: metrics overrides do a deep merge, but here we do a shallow merge.
   341  		// We should consider if we want to reconcile the two.
   342  		if m.CustomTags != nil {
   343  			for _, spec := range specs {
   344  				spec.CustomTags = m.CustomTags
   345  			}
   346  		}
   347  		if m.RandomSamplingPercentage != nil {
   348  			for _, spec := range specs {
   349  				spec.RandomSamplingPercentage = ptr.Of(m.RandomSamplingPercentage.GetValue())
   350  			}
   351  		}
   352  		if m.UseRequestIdForTraceSampling != nil {
   353  			for _, spec := range specs {
   354  				spec.UseRequestIDForTraceSampling = m.UseRequestIdForTraceSampling.Value
   355  			}
   356  		}
   357  	}
   358  
   359  	// If no provider is configured (and retrieved) for the tracing specs,
   360  	// then we will disable the configuration.
   361  	if clientSpec.Provider == nil {
   362  		clientSpec.Disabled = true
   363  	}
   364  	if serverSpec.Provider == nil {
   365  		serverSpec.Disabled = true
   366  	}
   367  
   368  	cfg := TracingConfig{
   369  		ClientSpec: clientSpec,
   370  		ServerSpec: serverSpec,
   371  	}
   372  	return &cfg
   373  }
   374  
   375  // HTTPFilters computes the HttpFilter for a given proxy/class
   376  func (t *Telemetries) HTTPFilters(proxy *Proxy, class networking.ListenerClass, svc *Service) []*hcm.HttpFilter {
   377  	if res := t.telemetryFilters(proxy, class, networking.ListenerProtocolHTTP, svc); res != nil {
   378  		return res.([]*hcm.HttpFilter)
   379  	}
   380  	return nil
   381  }
   382  
   383  // TCPFilters computes the TCPFilters for a given proxy/class
   384  func (t *Telemetries) TCPFilters(proxy *Proxy, class networking.ListenerClass, svc *Service) []*listener.Filter {
   385  	if res := t.telemetryFilters(proxy, class, networking.ListenerProtocolTCP, svc); res != nil {
   386  		return res.([]*listener.Filter)
   387  	}
   388  	return nil
   389  }
   390  
   391  // applicableTelemetries fetches the relevant telemetry configurations for a given proxy
   392  func (t *Telemetries) applicableTelemetries(proxy *Proxy, svc *Service) computedTelemetries {
   393  	if t == nil {
   394  		return computedTelemetries{}
   395  	}
   396  
   397  	namespace := proxy.ConfigNamespace
   398  	// Order here matters. The latter elements will override the first elements
   399  	ms := []*tpb.Metrics{}
   400  	ls := []*computedAccessLogging{}
   401  	ts := []*tpb.Tracing{}
   402  	key := telemetryKey{}
   403  	if t.RootNamespace != "" {
   404  		telemetry := t.namespaceWideTelemetryConfig(t.RootNamespace)
   405  		if telemetry != (Telemetry{}) {
   406  			key.Root = types.NamespacedName{Name: telemetry.Name, Namespace: telemetry.Namespace}
   407  			ms = append(ms, telemetry.Spec.GetMetrics()...)
   408  			if len(telemetry.Spec.GetAccessLogging()) != 0 {
   409  				ls = append(ls, &computedAccessLogging{
   410  					telemetryKey: telemetryKey{
   411  						Root: key.Root,
   412  					},
   413  					Logging: telemetry.Spec.GetAccessLogging(),
   414  				})
   415  			}
   416  			ts = append(ts, telemetry.Spec.GetTracing()...)
   417  		}
   418  	}
   419  
   420  	if namespace != t.RootNamespace {
   421  		telemetry := t.namespaceWideTelemetryConfig(namespace)
   422  		if telemetry != (Telemetry{}) {
   423  			key.Namespace = types.NamespacedName{Name: telemetry.Name, Namespace: telemetry.Namespace}
   424  			ms = append(ms, telemetry.Spec.GetMetrics()...)
   425  			if len(telemetry.Spec.GetAccessLogging()) != 0 {
   426  				ls = append(ls, &computedAccessLogging{
   427  					telemetryKey: telemetryKey{
   428  						Namespace: key.Namespace,
   429  					},
   430  					Logging: telemetry.Spec.GetAccessLogging(),
   431  				})
   432  			}
   433  			ts = append(ts, telemetry.Spec.GetTracing()...)
   434  		}
   435  	}
   436  
   437  	ct := &computedTelemetries{
   438  		telemetryKey: key,
   439  		Metrics:      ms,
   440  		Logging:      ls,
   441  		Tracing:      ts,
   442  	}
   443  
   444  	matcher := PolicyMatcherForProxy(proxy).WithService(svc)
   445  	for _, telemetry := range t.NamespaceToTelemetries[namespace] {
   446  		spec := telemetry.Spec
   447  		// TODO in many other places, empty selector matches all policy
   448  		if len(spec.GetSelector().GetMatchLabels()) == 0 {
   449  			continue
   450  		}
   451  		if matcher.ShouldAttachPolicy(gvk.Telemetry, telemetry.NamespacedName(), spec) {
   452  			ct = appendApplicableTelemetries(ct, telemetry, spec)
   453  		} else {
   454  			log.Debug("There isn't a match between the workload and the policy. Policy is ignored.")
   455  		}
   456  	}
   457  
   458  	return *ct
   459  }
   460  
   461  func appendApplicableTelemetries(ct *computedTelemetries, tel Telemetry, spec *tpb.Telemetry) *computedTelemetries {
   462  	ct.telemetryKey.Workload = types.NamespacedName{Name: tel.Name, Namespace: tel.Namespace}
   463  	ct.Metrics = append(ct.Metrics, spec.GetMetrics()...)
   464  	if len(tel.Spec.GetAccessLogging()) != 0 {
   465  		ct.Logging = append(ct.Logging, &computedAccessLogging{
   466  			telemetryKey: telemetryKey{
   467  				Workload: types.NamespacedName{Name: tel.Name, Namespace: tel.Namespace},
   468  			},
   469  			Logging: tel.Spec.GetAccessLogging(),
   470  		})
   471  	}
   472  	ct.Tracing = append(ct.Tracing, spec.GetTracing()...)
   473  
   474  	return ct
   475  }
   476  
   477  // telemetryFilters computes the filters for the given proxy/class and protocol. This computes the
   478  // set of applicable Telemetries, merges them, then translates to the appropriate filters based on the
   479  // extension providers in the mesh config. Where possible, the result is cached.
   480  // Currently, this includes metrics and access logging, as some providers are implemented in filters.
   481  func (t *Telemetries) telemetryFilters(proxy *Proxy, class networking.ListenerClass, protocol networking.ListenerProtocol, svc *Service) any {
   482  	if t == nil {
   483  		return nil
   484  	}
   485  
   486  	c := t.applicableTelemetries(proxy, svc)
   487  
   488  	key := metricsKey{
   489  		telemetryKey: c.telemetryKey,
   490  		Class:        class,
   491  		Protocol:     protocol,
   492  		ProxyType:    proxy.Type,
   493  	}
   494  	t.mu.Lock()
   495  	defer t.mu.Unlock()
   496  	precomputed, f := t.computedMetricsFilters[key]
   497  	if f {
   498  		return precomputed
   499  	}
   500  
   501  	// First, take all the metrics configs and transform them into a normalized form
   502  	tmm := mergeMetrics(c.Metrics, t.meshConfig)
   503  	log.Debugf("merged metrics, proxyID: %s metrics: %+v", proxy.ID, tmm)
   504  	// Additionally, fetch relevant access logging configurations
   505  	tml := mergeLogs(c.Logging, t.meshConfig, workloadMode(class))
   506  
   507  	// The above result is in a nested map to deduplicate responses. This loses ordering, so we convert to
   508  	// a list to retain stable naming
   509  	allKeys := sets.New[string]()
   510  	for k, v := range tml {
   511  		if v.Disabled {
   512  			continue
   513  		}
   514  		allKeys.Insert(k)
   515  	}
   516  	for k := range tmm {
   517  		allKeys.Insert(k)
   518  	}
   519  
   520  	rotationInterval := getInterval(features.MetricRotationInterval, defaultMetricRotationInterval)
   521  	gracefulDeletionInterval := getInterval(features.MetricGracefulDeletionInterval, defaultMetricGracefulDeletionInterval)
   522  
   523  	m := make([]telemetryFilterConfig, 0, allKeys.Len())
   524  	for _, k := range sets.SortedList(allKeys) {
   525  		p := t.fetchProvider(k)
   526  		if p == nil {
   527  			continue
   528  		}
   529  		loggingCfg, logging := tml[k]
   530  		mertricCfg, metrics := tmm[k]
   531  
   532  		mertricCfg.RotationInterval = rotationInterval
   533  		mertricCfg.GracefulDeletionInterval = gracefulDeletionInterval
   534  
   535  		cfg := telemetryFilterConfig{
   536  			Provider:      p,
   537  			metricsConfig: mertricCfg,
   538  			AccessLogging: logging && !loggingCfg.Disabled,
   539  			Metrics:       metrics,
   540  			LogsFilter:    tml[p.Name].Filter,
   541  			NodeType:      proxy.Type,
   542  		}
   543  		m = append(m, cfg)
   544  	}
   545  
   546  	var res any
   547  	// Finally, compute the actual filters based on the protoc
   548  	switch protocol {
   549  	case networking.ListenerProtocolHTTP:
   550  		res = buildHTTPTelemetryFilter(class, m)
   551  	default:
   552  		res = buildTCPTelemetryFilter(class, m)
   553  	}
   554  
   555  	// Update cache
   556  	t.computedMetricsFilters[key] = res
   557  	return res
   558  }
   559  
// Default values for the metric rotation interval and the graceful deletion interval.
// They are compared against the configured feature values so that values equal to the
// default can be omitted from the generated config.
// More details can be found here: https://github.com/istio/proxy/blob/master/source/extensions/filters/http/istio_stats/config.proto#L116
var (
	defaultMetricRotationInterval         = 0 * time.Second
	defaultMetricGracefulDeletionInterval = 5 * time.Minute
)
   566  
   567  // getInterval return nil to reduce the size of the config, when equal to the default.
   568  func getInterval(input, defaultValue time.Duration) *durationpb.Duration {
   569  	if input == defaultValue {
   570  		return nil
   571  	}
   572  
   573  	return durationpb.New(input)
   574  }
   575  
// mergeLogs returns the set of providers for the given logging configuration.
// The provider names are mapped to any applicable access logging filter that has been applied in provider configuration.
//
// logs is ordered from least to most specific level; mesh supplies the default
// access logging providers; mode is the workload mode of the listener being
// configured. Entries whose match selector does not cover mode are ignored.
// A provider explicitly disabled by an entry is returned with Disabled set.
func mergeLogs(logs []*computedAccessLogging, mesh *meshconfig.MeshConfig, mode tpb.WorkloadMode) map[string]loggingSpec {
	providers := map[string]loggingSpec{}

	// No Telemetry configures access logging: use the mesh default providers as-is.
	if len(logs) == 0 {
		for _, dp := range mesh.GetDefaultProviders().GetAccessLogging() {
			// Insert the default provider.
			providers[dp] = loggingSpec{}
		}
		return providers
	}

	// First pass: determine which providers are in scope and remember the
	// filter for each provider. A level that names providers (for this mode)
	// replaces the provider set inherited from the previous levels; for
	// filters, the last entry naming a provider wins.
	providerNames := mesh.GetDefaultProviders().GetAccessLogging()
	filters := map[string]loggingSpec{}
	for _, m := range logs {
		names := sets.New[string]()
		for _, p := range m.Logging {
			if !matchWorkloadMode(p.Match, mode) {
				continue
			}
			subProviders := getProviderNames(p.Providers)
			names.InsertAll(subProviders...)

			for _, prov := range subProviders {
				filters[prov] = loggingSpec{
					Filter: p.Filter,
				}
			}
		}

		if names.Len() > 0 {
			providerNames = names.UnsortedList()
		}
	}
	inScopeProviders := sets.New(providerNames...)

	// Second pass: walk every entry in order and record the final state of each
	// in-scope provider. An entry without providers inherits the providers of
	// the previous entry (initially the mesh defaults).
	parentProviders := mesh.GetDefaultProviders().GetAccessLogging()
	for _, l := range logs {
		for _, m := range l.Logging {
			providerNames := getProviderNames(m.Providers)
			if len(providerNames) == 0 {
				providerNames = parentProviders
			}
			// Note: inheritance is updated even when this entry does not match mode.
			parentProviders = providerNames
			for _, provider := range providerNames {
				if !inScopeProviders.Contains(provider) {
					// We don't care about this, remove it
					// This occurs when a top level provider is later disabled by a lower level
					continue
				}

				if !matchWorkloadMode(m.Match, mode) {
					continue
				}

				// see UT: server - multi filters disabled
				if m.GetDisabled().GetValue() {
					providers[provider] = loggingSpec{Disabled: true}
					continue
				}

				// Enabled: use the filter recorded in the first pass (zero value if none).
				providers[provider] = filters[provider]
			}
		}
	}

	return providers
}
   644  
   645  func matchWorkloadMode(selector *tpb.AccessLogging_LogSelector, mode tpb.WorkloadMode) bool {
   646  	if selector == nil {
   647  		return true
   648  	}
   649  
   650  	if selector.Mode == tpb.WorkloadMode_CLIENT_AND_SERVER {
   651  		return true
   652  	}
   653  
   654  	return selector.Mode == mode
   655  }
   656  
   657  func (t *Telemetries) namespaceWideTelemetryConfig(namespace string) Telemetry {
   658  	for _, tel := range t.NamespaceToTelemetries[namespace] {
   659  		if len(tel.Spec.GetSelector().GetMatchLabels()) == 0 {
   660  			return tel
   661  		}
   662  	}
   663  	return Telemetry{}
   664  }
   665  
   666  // fetchProvider finds the matching ExtensionProviders from the mesh config
   667  func (t *Telemetries) fetchProvider(m string) *meshconfig.MeshConfig_ExtensionProvider {
   668  	for _, p := range t.meshConfig.ExtensionProviders {
   669  		if strings.EqualFold(m, p.Name) {
   670  			return p
   671  		}
   672  	}
   673  	return nil
   674  }
   675  
   676  func (t *Telemetries) Debug(proxy *Proxy) any {
   677  	// TODO we could use service targets + ambient index to include service-attached here
   678  	at := t.applicableTelemetries(proxy, nil)
   679  	return at
   680  }
   681  
   682  var allMetrics = func() []string {
   683  	r := make([]string, 0, len(tpb.MetricSelector_IstioMetric_value))
   684  	for k := range tpb.MetricSelector_IstioMetric_value {
   685  		if k != tpb.MetricSelector_IstioMetric_name[int32(tpb.MetricSelector_ALL_METRICS)] {
   686  			r = append(r, k)
   687  		}
   688  	}
   689  	sort.Strings(r)
   690  	return r
   691  }()
   692  
   693  // mergeMetrics merges many Metrics objects into a normalized configuration
   694  func mergeMetrics(metrics []*tpb.Metrics, mesh *meshconfig.MeshConfig) map[string]metricsConfig {
   695  	type metricOverride struct {
   696  		Disabled     *wrappers.BoolValue
   697  		TagOverrides map[string]*tpb.MetricsOverrides_TagOverride
   698  	}
   699  	// provider -> mode -> metric -> overrides
   700  	providers := map[string]map[tpb.WorkloadMode]map[string]metricOverride{}
   701  
   702  	if len(metrics) == 0 {
   703  		for _, dp := range mesh.GetDefaultProviders().GetMetrics() {
   704  			// Insert the default provider. It has no overrides; presence of the key is sufficient to
   705  			// get the filter created.
   706  			providers[dp] = map[tpb.WorkloadMode]map[string]metricOverride{}
   707  		}
   708  	}
   709  
   710  	providerNames := mesh.GetDefaultProviders().GetMetrics()
   711  	for _, m := range metrics {
   712  		names := getProviderNames(m.Providers)
   713  		// If providers is set, it overrides the parent. If not, inherent from the parent. It is not a deep merge.
   714  		if len(names) > 0 {
   715  			providerNames = names
   716  		}
   717  	}
   718  	// Record the names of all providers we should configure. Anything else we will ignore
   719  	inScopeProviders := sets.New(providerNames...)
   720  
   721  	parentProviders := mesh.GetDefaultProviders().GetMetrics()
   722  	disabledAllMetricsProviders := sets.New[string]()
   723  	reportingIntervals := map[string]*durationpb.Duration{}
   724  	for _, m := range metrics {
   725  		providerNames := getProviderNames(m.Providers)
   726  		// If providers is not set, use parent's
   727  		if len(providerNames) == 0 {
   728  			providerNames = parentProviders
   729  		}
   730  
   731  		reportInterval := m.GetReportingInterval()
   732  		parentProviders = providerNames
   733  		for _, provider := range providerNames {
   734  			if !inScopeProviders.Contains(provider) {
   735  				// We don't care about this, remove it
   736  				// This occurs when a top level provider is later disabled by a lower level
   737  				continue
   738  			}
   739  
   740  			if reportInterval != nil {
   741  				reportingIntervals[provider] = reportInterval
   742  			}
   743  
   744  			if _, f := providers[provider]; !f {
   745  				providers[provider] = map[tpb.WorkloadMode]map[string]metricOverride{
   746  					tpb.WorkloadMode_CLIENT: {},
   747  					tpb.WorkloadMode_SERVER: {},
   748  				}
   749  			}
   750  
   751  			mp := providers[provider]
   752  			// For each override, we normalize the configuration. The metrics list is an ordered list - latter
   753  			// elements have precedence. As a result, we will apply updates on top of previous entries.
   754  			for _, o := range m.Overrides {
   755  				// if we disable all metrics, we should drop the entire filter
   756  				if isAllMetrics(o.GetMatch()) && o.Disabled.GetValue() {
   757  					for _, mode := range getModes(o.GetMatch().GetMode()) {
   758  						key := metricProviderModeKey(provider, mode)
   759  						disabledAllMetricsProviders.Insert(key)
   760  					}
   761  
   762  					continue
   763  				}
   764  
   765  				metricsNames := getMatches(o.GetMatch())
   766  				// If client or server is set explicitly, only apply there. Otherwise, we will apply to both.
   767  				// Note: client and server keys may end up the same, which is fine
   768  				for _, mode := range getModes(o.GetMatch().GetMode()) {
   769  					// root namespace disables all, but then enables them by namespace scoped
   770  					key := metricProviderModeKey(provider, mode)
   771  					disabledAllMetricsProviders.Delete(key)
   772  					// Next, get all matches.
   773  					// This is a bit funky because the matches are oneof of ENUM and customer metric. We normalize
   774  					// these to strings, so we may end up with a list like [REQUEST_COUNT, my-customer-metric].
   775  					// TODO: we always flatten ALL_METRICS into each metric mode. For some stats providers (prometheus),
   776  					// we are able to apply overrides to all metrics directly rather than duplicating the config.
   777  					// We should tweak this to collapse to this mode where possible
   778  					for _, metricName := range metricsNames {
   779  						if _, f := mp[mode]; !f {
   780  							mp[mode] = map[string]metricOverride{}
   781  						}
   782  						override := mp[mode][metricName]
   783  						if o.Disabled != nil {
   784  							override.Disabled = o.Disabled
   785  						}
   786  						for k, v := range o.TagOverrides {
   787  							if override.TagOverrides == nil {
   788  								override.TagOverrides = map[string]*tpb.MetricsOverrides_TagOverride{}
   789  							}
   790  							override.TagOverrides[k] = v
   791  						}
   792  						mp[mode][metricName] = override
   793  					}
   794  				}
   795  			}
   796  		}
   797  	}
   798  
   799  	processed := map[string]metricsConfig{}
   800  	for provider, modeMap := range providers {
   801  		tmm := processed[provider]
   802  		tmm.ReportingInterval = reportingIntervals[provider]
   803  
   804  		for mode, metricMap := range modeMap {
   805  			key := metricProviderModeKey(provider, mode)
   806  			if disabledAllMetricsProviders.Contains(key) {
   807  				switch mode {
   808  				case tpb.WorkloadMode_CLIENT:
   809  					tmm.ClientMetrics.Disabled = true
   810  				case tpb.WorkloadMode_SERVER:
   811  					tmm.ServerMetrics.Disabled = true
   812  				}
   813  				continue
   814  			}
   815  
   816  			for metric, override := range metricMap {
   817  				tags := []tagOverride{}
   818  				for k, v := range override.TagOverrides {
   819  					o := tagOverride{Name: k}
   820  					switch v.Operation {
   821  					case tpb.MetricsOverrides_TagOverride_REMOVE:
   822  						o.Remove = true
   823  						o.Value = ""
   824  					case tpb.MetricsOverrides_TagOverride_UPSERT:
   825  						o.Value = v.GetValue()
   826  						o.Remove = false
   827  					}
   828  					tags = append(tags, o)
   829  				}
   830  				// Keep order deterministic
   831  				sort.Slice(tags, func(i, j int) bool {
   832  					return tags[i].Name < tags[j].Name
   833  				})
   834  				mo := metricsOverride{
   835  					Name:     metric,
   836  					Disabled: override.Disabled.GetValue(),
   837  					Tags:     tags,
   838  				}
   839  
   840  				switch mode {
   841  				case tpb.WorkloadMode_CLIENT:
   842  					tmm.ClientMetrics.Overrides = append(tmm.ClientMetrics.Overrides, mo)
   843  				default:
   844  					tmm.ServerMetrics.Overrides = append(tmm.ServerMetrics.Overrides, mo)
   845  				}
   846  			}
   847  		}
   848  
   849  		// Keep order deterministic
   850  		sort.Slice(tmm.ServerMetrics.Overrides, func(i, j int) bool {
   851  			return tmm.ServerMetrics.Overrides[i].Name < tmm.ServerMetrics.Overrides[j].Name
   852  		})
   853  		sort.Slice(tmm.ClientMetrics.Overrides, func(i, j int) bool {
   854  			return tmm.ClientMetrics.Overrides[i].Name < tmm.ClientMetrics.Overrides[j].Name
   855  		})
   856  		processed[provider] = tmm
   857  	}
   858  	return processed
   859  }
   860  
   861  func metricProviderModeKey(provider string, mode tpb.WorkloadMode) string {
   862  	return fmt.Sprintf("%s/%s", provider, mode)
   863  }
   864  
   865  func getProviderNames(providers []*tpb.ProviderRef) []string {
   866  	res := make([]string, 0, len(providers))
   867  	for _, p := range providers {
   868  		res = append(res, p.GetName())
   869  	}
   870  	return res
   871  }
   872  
   873  func getModes(mode tpb.WorkloadMode) []tpb.WorkloadMode {
   874  	switch mode {
   875  	case tpb.WorkloadMode_CLIENT, tpb.WorkloadMode_SERVER:
   876  		return []tpb.WorkloadMode{mode}
   877  	default:
   878  		return []tpb.WorkloadMode{tpb.WorkloadMode_CLIENT, tpb.WorkloadMode_SERVER}
   879  	}
   880  }
   881  
   882  func isAllMetrics(match *tpb.MetricSelector) bool {
   883  	switch m := match.GetMetricMatch().(type) {
   884  	case *tpb.MetricSelector_CustomMetric:
   885  		return false
   886  	case *tpb.MetricSelector_Metric:
   887  		return m.Metric == tpb.MetricSelector_ALL_METRICS
   888  	default:
   889  		return true
   890  	}
   891  }
   892  
   893  func getMatches(match *tpb.MetricSelector) []string {
   894  	switch m := match.GetMetricMatch().(type) {
   895  	case *tpb.MetricSelector_CustomMetric:
   896  		return []string{m.CustomMetric}
   897  	case *tpb.MetricSelector_Metric:
   898  		if m.Metric == tpb.MetricSelector_ALL_METRICS {
   899  			return allMetrics
   900  		}
   901  		return []string{m.Metric.String()}
   902  	default:
   903  		return allMetrics
   904  	}
   905  }
   906  
   907  var waypointStatsConfig = protoconv.MessageToAny(&udpa.TypedStruct{
   908  	TypeUrl: "type.googleapis.com/stats.PluginConfig",
   909  	Value: &structpb.Struct{
   910  		Fields: map[string]*structpb.Value{
   911  			"reporter": {
   912  				Kind: &structpb.Value_StringValue{
   913  					StringValue: "SERVER_GATEWAY",
   914  				},
   915  			},
   916  		},
   917  	},
   918  })
   919  
// telemetryFilterHandled is the number of providers handled by the filter
// builders below. It exists so that this file stays in sync as new handlers
// are added.
// NOTE(review): presumably a test compares this value against the number of
// extension provider types in the mesh config API — confirm.
// STOP. DO NOT UPDATE THIS WITHOUT UPDATING buildHTTPTelemetryFilter and buildTCPTelemetryFilter.
const telemetryFilterHandled = 14
   924  
   925  func buildHTTPTelemetryFilter(class networking.ListenerClass, metricsCfg []telemetryFilterConfig) []*hcm.HttpFilter {
   926  	res := make([]*hcm.HttpFilter, 0, len(metricsCfg))
   927  	for _, cfg := range metricsCfg {
   928  		switch cfg.Provider.GetProvider().(type) {
   929  		case *meshconfig.MeshConfig_ExtensionProvider_Prometheus:
   930  			if cfg.NodeType == Waypoint {
   931  				f := &hcm.HttpFilter{
   932  					Name:       xds.StatsFilterName,
   933  					ConfigType: &hcm.HttpFilter_TypedConfig{TypedConfig: waypointStatsConfig},
   934  				}
   935  				res = append(res, f)
   936  			} else {
   937  				if statsCfg := generateStatsConfig(class, cfg); statsCfg != nil {
   938  					f := &hcm.HttpFilter{
   939  						Name:       xds.StatsFilterName,
   940  						ConfigType: &hcm.HttpFilter_TypedConfig{TypedConfig: statsCfg},
   941  					}
   942  					res = append(res, f)
   943  				}
   944  			}
   945  		default:
   946  			// Only prometheus and SD supported currently
   947  			continue
   948  		}
   949  	}
   950  	return res
   951  }
   952  
   953  func buildTCPTelemetryFilter(class networking.ListenerClass, telemetryConfigs []telemetryFilterConfig) []*listener.Filter {
   954  	res := []*listener.Filter{}
   955  	for _, telemetryCfg := range telemetryConfigs {
   956  		switch telemetryCfg.Provider.GetProvider().(type) {
   957  		case *meshconfig.MeshConfig_ExtensionProvider_Prometheus:
   958  			if telemetryCfg.NodeType == Waypoint {
   959  				f := &listener.Filter{
   960  					Name:       xds.StatsFilterName,
   961  					ConfigType: &listener.Filter_TypedConfig{TypedConfig: waypointStatsConfig},
   962  				}
   963  				res = append(res, f)
   964  			} else {
   965  				if cfg := generateStatsConfig(class, telemetryCfg); cfg != nil {
   966  					f := &listener.Filter{
   967  						Name:       xds.StatsFilterName,
   968  						ConfigType: &listener.Filter_TypedConfig{TypedConfig: cfg},
   969  					}
   970  					res = append(res, f)
   971  				}
   972  			}
   973  		default:
   974  			// Only prometheus and SD supported currently
   975  			continue
   976  		}
   977  	}
   978  	return res
   979  }
   980  
// metricToPrometheusMetric maps predefined metric names (as they appear in
// metric overrides) to the metric names written into the stats plugin config.
// generateStatsConfig treats any override name missing from this table as a
// custom metric and passes it through unchanged.
var metricToPrometheusMetric = map[string]string{
	"REQUEST_COUNT":          "requests_total",
	"REQUEST_DURATION":       "request_duration_milliseconds",
	"REQUEST_SIZE":           "request_bytes",
	"RESPONSE_SIZE":          "response_bytes",
	"TCP_OPENED_CONNECTIONS": "tcp_connections_opened_total",
	"TCP_CLOSED_CONNECTIONS": "tcp_connections_closed_total",
	"TCP_SENT_BYTES":         "tcp_sent_bytes_total",
	"TCP_RECEIVED_BYTES":     "tcp_received_bytes_total",
	"GRPC_REQUEST_MESSAGES":  "request_messages_total",
	"GRPC_RESPONSE_MESSAGES": "response_messages_total",
}
   993  
   994  func generateStatsConfig(class networking.ListenerClass, filterConfig telemetryFilterConfig) *anypb.Any {
   995  	if !filterConfig.Metrics {
   996  		// No metric for prometheus
   997  		return nil
   998  	}
   999  
  1000  	listenerCfg := filterConfig.MetricsForClass(class)
  1001  	if listenerCfg.Disabled {
  1002  		// no metrics for this listener
  1003  		return nil
  1004  	}
  1005  
  1006  	cfg := stats.PluginConfig{
  1007  		DisableHostHeaderFallback: disableHostHeaderFallback(class),
  1008  		TcpReportingDuration:      filterConfig.ReportingInterval,
  1009  		RotationInterval:          filterConfig.RotationInterval,
  1010  		GracefulDeletionInterval:  filterConfig.GracefulDeletionInterval,
  1011  	}
  1012  
  1013  	for _, override := range listenerCfg.Overrides {
  1014  		metricName, f := metricToPrometheusMetric[override.Name]
  1015  		if !f {
  1016  			// Not a predefined metric, must be a custom one
  1017  			metricName = override.Name
  1018  		}
  1019  		mc := &stats.MetricConfig{
  1020  			Dimensions: map[string]string{},
  1021  			Name:       metricName,
  1022  			Drop:       override.Disabled,
  1023  		}
  1024  		for _, t := range override.Tags {
  1025  			if t.Remove {
  1026  				mc.TagsToRemove = append(mc.TagsToRemove, t.Name)
  1027  			} else {
  1028  				mc.Dimensions[t.Name] = t.Value
  1029  			}
  1030  		}
  1031  		cfg.Metrics = append(cfg.Metrics, mc)
  1032  	}
  1033  
  1034  	return protoconv.MessageToAny(&cfg)
  1035  }
  1036  
  1037  func disableHostHeaderFallback(class networking.ListenerClass) bool {
  1038  	return class == networking.ListenerClassSidecarInbound || class == networking.ListenerClassGateway
  1039  }
  1040  
  1041  // Equal compares two computedTelemetries for equality. This was created to help with testing. Because of the nature of the structs being compared,
  1042  // it is safer to use cmp.Equal as opposed to reflect.DeepEqual. Also, because of the way the structs are generated, it is not possible to use
  1043  // cmpopts.IgnoreUnexported without risking flakiness if those third party types that are relied on change. Next best thing is to use a custom
  1044  // comparer as defined below. When cmp.Equal is called on this type, this will be leveraged by cmp.Equal to do the comparison see
  1045  // https://godoc.org/github.com/google/go-cmp/cmp#Equal for more info.
  1046  func (ct *computedTelemetries) Equal(other *computedTelemetries) bool {
  1047  	if ct == nil && other == nil {
  1048  		return true
  1049  	}
  1050  	if ct != nil && other == nil || ct == nil && other != nil {
  1051  		return false
  1052  	}
  1053  	if len(ct.Metrics) != len(other.Metrics) || len(ct.Logging) != len(other.Logging) || len(ct.Tracing) != len(other.Tracing) {
  1054  		return false
  1055  	}
  1056  	// Sort each slice so that we can compare them in order. Comparison is on the fields that are used in the test cases.
  1057  	sort.SliceStable(ct.Metrics, func(i, j int) bool {
  1058  		return ct.Metrics[i].Providers[0].Name < ct.Metrics[j].Providers[0].Name
  1059  	})
  1060  	sort.SliceStable(other.Metrics, func(i, j int) bool {
  1061  		return other.Metrics[i].Providers[0].Name < other.Metrics[j].Providers[0].Name
  1062  	})
  1063  	for i := range ct.Metrics {
  1064  		if ct.Metrics[i].ReportingInterval != nil && other.Metrics[i].ReportingInterval != nil {
  1065  			if ct.Metrics[i].ReportingInterval.AsDuration() != other.Metrics[i].ReportingInterval.AsDuration() {
  1066  				return false
  1067  			}
  1068  		}
  1069  		if ct.Metrics[i].Providers != nil && other.Metrics[i].Providers != nil {
  1070  			if ct.Metrics[i].Providers[0].Name != other.Metrics[i].Providers[0].Name {
  1071  				return false
  1072  			}
  1073  		}
  1074  	}
  1075  	sort.SliceStable(ct.Logging, func(i, j int) bool {
  1076  		return ct.Logging[i].telemetryKey.Root.Name < ct.Logging[j].telemetryKey.Root.Name
  1077  	})
  1078  	sort.SliceStable(other.Logging, func(i, j int) bool {
  1079  		return other.Logging[i].telemetryKey.Root.Name < other.Logging[j].telemetryKey.Root.Name
  1080  	})
  1081  	for i := range ct.Logging {
  1082  		if ct.Logging[i].telemetryKey != other.Logging[i].telemetryKey {
  1083  			return false
  1084  		}
  1085  		if ct.Logging[i].Logging != nil && other.Logging[i].Logging != nil {
  1086  			if ct.Logging[i].Logging[0].Providers[0].Name != other.Logging[i].Logging[0].Providers[0].Name {
  1087  				return false
  1088  			}
  1089  		}
  1090  	}
  1091  	sort.SliceStable(ct.Tracing, func(i, j int) bool {
  1092  		return ct.Tracing[i].Providers[0].Name < ct.Tracing[j].Providers[0].Name
  1093  	})
  1094  	sort.SliceStable(other.Tracing, func(i, j int) bool {
  1095  		return other.Tracing[i].Providers[0].Name < other.Tracing[j].Providers[0].Name
  1096  	})
  1097  	for i := range ct.Tracing {
  1098  		if ct.Tracing[i].Match != nil && other.Tracing[i].Match != nil {
  1099  			if ct.Tracing[i].Match.Mode != other.Tracing[i].Match.Mode {
  1100  				return false
  1101  			}
  1102  		}
  1103  		if ct.Tracing[i].Providers != nil && other.Tracing[i].Providers != nil {
  1104  			if ct.Tracing[i].Providers[0].Name != other.Tracing[i].Providers[0].Name {
  1105  				return false
  1106  			}
  1107  		}
  1108  	}
  1109  	return true
  1110  }