github.com/kiali/kiali@v1.84.0/business/metrics.go (about)

     1  package business
     2  
     3  import (
     4  	"math"
     5  	"sort"
     6  	"strings"
     7  	"sync"
     8  
     9  	"github.com/kiali/kiali/models"
    10  	"github.com/kiali/kiali/prometheus"
    11  )
    12  
// MetricsService deals with fetching metrics from prometheus
type MetricsService struct {
	prom prometheus.ClientInterface // Prometheus client used for all range/histogram queries
}
    17  
    18  // NewMetricsService initializes this business service
    19  func NewMetricsService(prom prometheus.ClientInterface) *MetricsService {
    20  	return &MetricsService{prom: prom}
    21  }
    22  
    23  func (in *MetricsService) GetMetrics(q models.IstioMetricsQuery, scaler func(n string) float64) (models.MetricsMap, error) {
    24  	lb := createMetricsLabelsBuilder(&q)
    25  	grouping := strings.Join(q.ByLabels, ",")
    26  	return in.fetchAllMetrics(q, lb, grouping, scaler)
    27  }
    28  
    29  func createMetricsLabelsBuilder(q *models.IstioMetricsQuery) *MetricsLabelsBuilder {
    30  	lb := NewMetricsLabelsBuilder(q.Direction)
    31  	if q.Reporter != "both" {
    32  		lb.Reporter(q.Reporter)
    33  	}
    34  
    35  	namespaceSet := false
    36  
    37  	// add custom labels from config if custom labels are configured
    38  	lb.QueryScope()
    39  
    40  	if q.Service != "" {
    41  		lb.Service(q.Service, q.Namespace)
    42  		namespaceSet = true
    43  	}
    44  	if q.Workload != "" {
    45  		lb.Workload(q.Workload, q.Namespace)
    46  		namespaceSet = true
    47  	}
    48  	if q.App != "" {
    49  		lb.App(q.App, q.Namespace)
    50  		namespaceSet = true
    51  	}
    52  	if !namespaceSet && q.Namespace != "" {
    53  		lb.Namespace(q.Namespace)
    54  	}
    55  	if q.RequestProtocol != "" {
    56  		lb.Protocol(q.RequestProtocol)
    57  	}
    58  	if q.Aggregate != "" {
    59  		lb.Aggregate(q.Aggregate, q.AggregateValue)
    60  	}
    61  	if q.Cluster != "" {
    62  		lb.Cluster(q.Cluster)
    63  	}
    64  
    65  	return lb
    66  }
    67  
    68  func (in *MetricsService) fetchAllMetrics(q models.IstioMetricsQuery, lb *MetricsLabelsBuilder, grouping string, scaler func(n string) float64) (models.MetricsMap, error) {
    69  	labels := lb.Build()
    70  	labelsError := lb.BuildForErrors()
    71  
    72  	var wg sync.WaitGroup
    73  	fetchRate := func(p8sFamilyName string, metric *prometheus.Metric, lbl []string) {
    74  		defer wg.Done()
    75  		m := in.prom.FetchRateRange(p8sFamilyName, lbl, grouping, &q.RangeQuery)
    76  		*metric = m
    77  	}
    78  
    79  	fetchHisto := func(p8sFamilyName string, histo *prometheus.Histogram) {
    80  		defer wg.Done()
    81  		h := in.prom.FetchHistogramRange(p8sFamilyName, labels, grouping, &q.RangeQuery)
    82  		*histo = h
    83  	}
    84  
    85  	type resultHolder struct {
    86  		metric     prometheus.Metric
    87  		histo      prometheus.Histogram
    88  		definition istioMetric
    89  	}
    90  	maxResults := len(istioMetrics)
    91  	if len(q.Filters) != 0 {
    92  		maxResults = len(q.Filters)
    93  	}
    94  	results := make([]*resultHolder, maxResults)
    95  
    96  	for _, istioMetric := range istioMetrics {
    97  		// if filters is empty, fetch all anyway
    98  		doFetch := len(q.Filters) == 0
    99  		if !doFetch {
   100  			for _, filter := range q.Filters {
   101  				if filter == istioMetric.kialiName {
   102  					doFetch = true
   103  					break
   104  				}
   105  			}
   106  		}
   107  		if doFetch {
   108  			wg.Add(1)
   109  			result := resultHolder{definition: istioMetric}
   110  			results = append(results, &result)
   111  			if istioMetric.isHisto {
   112  				go fetchHisto(istioMetric.istioName, &result.histo)
   113  			} else {
   114  				labelsToUse := istioMetric.labelsToUse(labels, labelsError)
   115  				go fetchRate(istioMetric.istioName, &result.metric, labelsToUse)
   116  			}
   117  		}
   118  	}
   119  	wg.Wait()
   120  
   121  	// Return results as two maps per reporter
   122  	metrics := make(models.MetricsMap)
   123  	for _, result := range results {
   124  		if result != nil {
   125  			conversionParams := models.ConversionParams{Scale: 1.0}
   126  			if scaler != nil {
   127  				scale := scaler(result.definition.kialiName)
   128  				if scale != 0.0 {
   129  					conversionParams.Scale = scale
   130  				}
   131  			}
   132  			var converted []models.Metric
   133  			var err error
   134  			if result.definition.isHisto {
   135  				converted, err = models.ConvertHistogram(result.definition.kialiName, result.histo, conversionParams)
   136  				if err != nil {
   137  					return nil, err
   138  				}
   139  			} else {
   140  				converted, err = models.ConvertMetric(result.definition.kialiName, result.metric, conversionParams)
   141  				if err != nil {
   142  					return nil, err
   143  				}
   144  			}
   145  			metrics[result.definition.kialiName] = append(metrics[result.definition.kialiName], converted...)
   146  		}
   147  	}
   148  	return metrics, nil
   149  }
   150  
   151  // GetStats computes metrics stats, currently response times, for a set of queries
   152  func (in *MetricsService) GetStats(queries []models.MetricsStatsQuery) (map[string]models.MetricsStats, error) {
   153  	type statsChanResult struct {
   154  		key   string
   155  		stats *models.MetricsStats
   156  		err   error
   157  	}
   158  
   159  	// The number of queries could be high, limit concurrent requests to 10 at a time (see https://github.com/kiali/kiali/issues/5584)
   160  	// Note that the default prometheus_engine_queries_concurrent_max = 20, so by limiting here to 10 we leave some room for
   161  	// other users hitting prom while still allowing a decent amount of concurrency.  Prom also has a default query timeout
   162  	// of 2 minutes, and any queries pending execution (so any number > 20 by default) are still subject to that timer.
   163  	chunkSize := 10
   164  	numQueries := len(queries)
   165  	var queryChunks [][]models.MetricsStatsQuery
   166  	for i := 0; i < numQueries; i += chunkSize {
   167  		end := i + chunkSize
   168  		if end > numQueries {
   169  			end = numQueries
   170  		}
   171  		queryChunks = append(queryChunks, queries[i:end])
   172  	}
   173  
   174  	result := make(map[string]models.MetricsStats)
   175  
   176  	for i, queryChunk := range queryChunks {
   177  		statsChan := make(chan statsChanResult, len(queryChunk))
   178  		var wg sync.WaitGroup
   179  
   180  		for _, q := range queryChunks[i] {
   181  			wg.Add(1)
   182  			go func(q models.MetricsStatsQuery) {
   183  				defer wg.Done()
   184  				stats, err := in.getSingleQueryStats(&q)
   185  				statsChan <- statsChanResult{key: q.GenKey(), stats: stats, err: err}
   186  			}(q)
   187  		}
   188  		wg.Wait()
   189  		// All chunk stats are fetched, close channel
   190  		close(statsChan)
   191  		// Read channel
   192  		for r := range statsChan {
   193  			if r.err != nil {
   194  				return nil, r.err
   195  			}
   196  			if r.stats != nil {
   197  				result[r.key] = *r.stats
   198  			}
   199  		}
   200  	}
   201  	return result, nil
   202  }
   203  
   204  func (in *MetricsService) getSingleQueryStats(q *models.MetricsStatsQuery) (*models.MetricsStats, error) {
   205  	lb := createStatsMetricsLabelsBuilder(q)
   206  	labels := lb.Build()
   207  	stats, err := in.prom.FetchHistogramValues("istio_request_duration_milliseconds", labels, "", q.Interval, q.Avg, q.Quantiles, q.QueryTime)
   208  	if err != nil {
   209  		return nil, err
   210  	}
   211  	metricsStats := models.MetricsStats{
   212  		ResponseTimes: []models.Stat{},
   213  	}
   214  	for stat, vec := range stats {
   215  		for _, sample := range vec {
   216  			value := float64(sample.Value)
   217  			if math.IsNaN(value) {
   218  				continue
   219  			}
   220  			metricsStats.ResponseTimes = append(metricsStats.ResponseTimes, models.Stat{Name: stat, Value: value})
   221  		}
   222  	}
   223  	sort.Slice(metricsStats.ResponseTimes, func(i, j int) bool {
   224  		return metricsStats.ResponseTimes[i].Name < metricsStats.ResponseTimes[j].Name
   225  	})
   226  	return &metricsStats, nil
   227  }
   228  
   229  func createStatsMetricsLabelsBuilder(q *models.MetricsStatsQuery) *MetricsLabelsBuilder {
   230  	lb := NewMetricsLabelsBuilder(q.Direction)
   231  	lb.SelfReporter()
   232  	if q.Target.Kind == "app" {
   233  		lb.App(q.Target.Name, q.Target.Namespace)
   234  	} else if q.Target.Kind == "workload" {
   235  		lb.Workload(q.Target.Name, q.Target.Namespace)
   236  	} else if q.Target.Kind == "service" {
   237  		lb.Service(q.Target.Name, q.Target.Namespace)
   238  	}
   239  	if q.PeerTarget != nil {
   240  		if q.PeerTarget.Kind == "app" {
   241  			lb.PeerApp(q.PeerTarget.Name, q.PeerTarget.Namespace)
   242  		} else if q.PeerTarget.Kind == "workload" {
   243  			lb.PeerWorkload(q.PeerTarget.Name, q.PeerTarget.Namespace)
   244  		} else if q.PeerTarget.Kind == "service" {
   245  			lb.PeerService(q.PeerTarget.Name, q.PeerTarget.Namespace)
   246  		}
   247  	}
   248  	if q.Target.Cluster != "" {
   249  		lb.Cluster(q.Target.Cluster)
   250  	}
   251  	return lb
   252  }
   253  
   254  func (in *MetricsService) GetControlPlaneMetrics(q models.IstioMetricsQuery, scaler func(n string) float64) (models.MetricsMap, error) {
   255  	metrics := make(models.MetricsMap)
   256  
   257  	h := in.prom.FetchHistogramRange("pilot_proxy_convergence_time", "", "", &q.RangeQuery)
   258  	var err error
   259  	converted, err := models.ConvertHistogram("pilot_proxy_convergence_time", h, models.ConversionParams{Scale: 1})
   260  	if err != nil {
   261  		return nil, err
   262  	}
   263  	metrics["pilot_proxy_convergence_time"] = append(metrics["pilot_proxy_convergence_time"], converted...)
   264  
   265  	metric := in.prom.FetchRateRange("container_cpu_usage_seconds_total", []string{`{pod=~"istiod-.*|istio-pilot-.*"}`}, "", &q.RangeQuery)
   266  	converted, err = models.ConvertMetric("container_cpu_usage_seconds_total", metric, models.ConversionParams{Scale: 1})
   267  	if err != nil {
   268  		return nil, err
   269  	}
   270  	metrics["container_cpu_usage_seconds_total"] = append(metrics["container_cpu_usage_seconds_total"], converted...)
   271  
   272  	metric = in.prom.FetchRateRange("process_cpu_seconds_total", []string{`{app="istiod"}`}, "", &q.RangeQuery)
   273  	converted, err = models.ConvertMetric("process_cpu_seconds_total", metric, models.ConversionParams{Scale: 1})
   274  	if err != nil {
   275  		return nil, err
   276  	}
   277  	metrics["process_cpu_seconds_total"] = append(metrics["process_cpu_seconds_total"], converted...)
   278  
   279  	metric = in.prom.FetchRange("container_memory_working_set_bytes", `{container="discovery", pod=~"istiod-.*|istio-pilot-.*"}`, "", "", &q.RangeQuery)
   280  	converted, err = models.ConvertMetric("container_memory_working_set_bytes", metric, models.ConversionParams{Scale: 0.000001})
   281  	if err != nil {
   282  		return nil, err
   283  	}
   284  	metrics["container_memory_working_set_bytes"] = append(metrics["container_memory_working_set_bytes"], converted...)
   285  
   286  	metric = in.prom.FetchRange("process_resident_memory_bytes", `{app="istiod"}`, "", "", &q.RangeQuery)
   287  	converted, err = models.ConvertMetric("process_resident_memory_bytes", metric, models.ConversionParams{Scale: 0.000001})
   288  	if err != nil {
   289  		return nil, err
   290  	}
   291  	metrics["process_resident_memory_bytes"] = append(metrics["process_resident_memory_bytes"], converted...)
   292  
   293  	return metrics, nil
   294  }