github.com/kiali/kiali@v1.84.0/business/metrics.go (about) 1 package business 2 3 import ( 4 "math" 5 "sort" 6 "strings" 7 "sync" 8 9 "github.com/kiali/kiali/models" 10 "github.com/kiali/kiali/prometheus" 11 ) 12 13 // MetricsService deals with fetching metrics from prometheus 14 type MetricsService struct { 15 prom prometheus.ClientInterface 16 } 17 18 // NewMetricsService initializes this business service 19 func NewMetricsService(prom prometheus.ClientInterface) *MetricsService { 20 return &MetricsService{prom: prom} 21 } 22 23 func (in *MetricsService) GetMetrics(q models.IstioMetricsQuery, scaler func(n string) float64) (models.MetricsMap, error) { 24 lb := createMetricsLabelsBuilder(&q) 25 grouping := strings.Join(q.ByLabels, ",") 26 return in.fetchAllMetrics(q, lb, grouping, scaler) 27 } 28 29 func createMetricsLabelsBuilder(q *models.IstioMetricsQuery) *MetricsLabelsBuilder { 30 lb := NewMetricsLabelsBuilder(q.Direction) 31 if q.Reporter != "both" { 32 lb.Reporter(q.Reporter) 33 } 34 35 namespaceSet := false 36 37 // add custom labels from config if custom labels are configured 38 lb.QueryScope() 39 40 if q.Service != "" { 41 lb.Service(q.Service, q.Namespace) 42 namespaceSet = true 43 } 44 if q.Workload != "" { 45 lb.Workload(q.Workload, q.Namespace) 46 namespaceSet = true 47 } 48 if q.App != "" { 49 lb.App(q.App, q.Namespace) 50 namespaceSet = true 51 } 52 if !namespaceSet && q.Namespace != "" { 53 lb.Namespace(q.Namespace) 54 } 55 if q.RequestProtocol != "" { 56 lb.Protocol(q.RequestProtocol) 57 } 58 if q.Aggregate != "" { 59 lb.Aggregate(q.Aggregate, q.AggregateValue) 60 } 61 if q.Cluster != "" { 62 lb.Cluster(q.Cluster) 63 } 64 65 return lb 66 } 67 68 func (in *MetricsService) fetchAllMetrics(q models.IstioMetricsQuery, lb *MetricsLabelsBuilder, grouping string, scaler func(n string) float64) (models.MetricsMap, error) { 69 labels := lb.Build() 70 labelsError := lb.BuildForErrors() 71 72 var wg sync.WaitGroup 73 fetchRate := func(p8sFamilyName string, metric *prometheus.Metric, lbl []string) { 74 defer wg.Done() 75 m := in.prom.FetchRateRange(p8sFamilyName, lbl, grouping, &q.RangeQuery) 76 *metric = m 77 } 78 79 fetchHisto := func(p8sFamilyName string, histo *prometheus.Histogram) { 80 defer wg.Done() 81 h := in.prom.FetchHistogramRange(p8sFamilyName, labels, grouping, &q.RangeQuery) 82 *histo = h 83 } 84 85 type resultHolder struct { 86 metric prometheus.Metric 87 histo prometheus.Histogram 88 definition istioMetric 89 } 90 maxResults := len(istioMetrics) 91 if len(q.Filters) != 0 { 92 maxResults = len(q.Filters) 93 } 94 results := make([]*resultHolder, maxResults) 95 96 for _, istioMetric := range istioMetrics { 97 // if filters is empty, fetch all anyway 98 doFetch := len(q.Filters) == 0 99 if !doFetch { 100 for _, filter := range q.Filters { 101 if filter == istioMetric.kialiName { 102 doFetch = true 103 break 104 } 105 } 106 } 107 if doFetch { 108 wg.Add(1) 109 result := resultHolder{definition: istioMetric} 110 results = append(results, &result) 111 if istioMetric.isHisto { 112 go fetchHisto(istioMetric.istioName, &result.histo) 113 } else { 114 labelsToUse := istioMetric.labelsToUse(labels, labelsError) 115 go fetchRate(istioMetric.istioName, &result.metric, labelsToUse) 116 } 117 } 118 } 119 wg.Wait() 120 121 // Return results as two maps per reporter 122 metrics := make(models.MetricsMap) 123 for _, result := range results { 124 if result != nil { 125 conversionParams := models.ConversionParams{Scale: 1.0} 126 if scaler != nil { 127 scale := scaler(result.definition.kialiName) 128 if scale != 0.0 { 129 conversionParams.Scale = scale 130 } 131 } 132 var converted []models.Metric 133 var err error 134 if result.definition.isHisto { 135 converted, err = models.ConvertHistogram(result.definition.kialiName, result.histo, conversionParams) 136 if err != nil { 137 return nil, err 138 } 139 } else { 140 converted, err = models.ConvertMetric(result.definition.kialiName, result.metric, conversionParams) 141 if err != nil { 142 return nil, err 143 } 144 } 145 metrics[result.definition.kialiName] = append(metrics[result.definition.kialiName], converted...) 146 } 147 } 148 return metrics, nil 149 } 150 151 // GetStats computes metrics stats, currently response times, for a set of queries 152 func (in *MetricsService) GetStats(queries []models.MetricsStatsQuery) (map[string]models.MetricsStats, error) { 153 type statsChanResult struct { 154 key string 155 stats *models.MetricsStats 156 err error 157 } 158 159 // The number of queries could be high, limit concurrent requests to 10 at a time (see https://github.com/kiali/kiali/issues/5584) 160 // Note that the default prometheus_engine_queries_concurrent_max = 20, so by limiting here to 10 we leave some room for 161 // other users hitting prom while still allowing a decent amount of concurrency. Prom also has a default query timeout 162 // of 2 minutes, and any queries pending execution (so any number > 20 by default) are still subject to that timer. 163 chunkSize := 10 164 numQueries := len(queries) 165 var queryChunks [][]models.MetricsStatsQuery 166 for i := 0; i < numQueries; i += chunkSize { 167 end := i + chunkSize 168 if end > numQueries { 169 end = numQueries 170 } 171 queryChunks = append(queryChunks, queries[i:end]) 172 } 173 174 result := make(map[string]models.MetricsStats) 175 176 for i, queryChunk := range queryChunks { 177 statsChan := make(chan statsChanResult, len(queryChunk)) 178 var wg sync.WaitGroup 179 180 for _, q := range queryChunks[i] { 181 wg.Add(1) 182 go func(q models.MetricsStatsQuery) { 183 defer wg.Done() 184 stats, err := in.getSingleQueryStats(&q) 185 statsChan <- statsChanResult{key: q.GenKey(), stats: stats, err: err} 186 }(q) 187 } 188 wg.Wait() 189 // All chunk stats are fetched, close channel 190 close(statsChan) 191 // Read channel 192 for r := range statsChan { 193 if r.err != nil { 194 return nil, r.err 195 } 196 if r.stats != nil { 197 result[r.key] = *r.stats 198 } 199 } 200 } 201 return result, nil 202 } 203 204 func (in *MetricsService) getSingleQueryStats(q *models.MetricsStatsQuery) (*models.MetricsStats, error) { 205 lb := createStatsMetricsLabelsBuilder(q) 206 labels := lb.Build() 207 stats, err := in.prom.FetchHistogramValues("istio_request_duration_milliseconds", labels, "", q.Interval, q.Avg, q.Quantiles, q.QueryTime) 208 if err != nil { 209 return nil, err 210 } 211 metricsStats := models.MetricsStats{ 212 ResponseTimes: []models.Stat{}, 213 } 214 for stat, vec := range stats { 215 for _, sample := range vec { 216 value := float64(sample.Value) 217 if math.IsNaN(value) { 218 continue 219 } 220 metricsStats.ResponseTimes = append(metricsStats.ResponseTimes, models.Stat{Name: stat, Value: value}) 221 } 222 } 223 sort.Slice(metricsStats.ResponseTimes, func(i, j int) bool { 224 return metricsStats.ResponseTimes[i].Name < metricsStats.ResponseTimes[j].Name 225 }) 226 return &metricsStats, nil 227 } 228 229 func createStatsMetricsLabelsBuilder(q *models.MetricsStatsQuery) *MetricsLabelsBuilder { 230 lb := NewMetricsLabelsBuilder(q.Direction) 231 lb.SelfReporter() 232 if q.Target.Kind == "app" { 233 lb.App(q.Target.Name, q.Target.Namespace) 234 } else if q.Target.Kind == "workload" { 235 lb.Workload(q.Target.Name, q.Target.Namespace) 236 } else if q.Target.Kind == "service" { 237 lb.Service(q.Target.Name, q.Target.Namespace) 238 } 239 if q.PeerTarget != nil { 240 if q.PeerTarget.Kind == "app" { 241 lb.PeerApp(q.PeerTarget.Name, q.PeerTarget.Namespace) 242 } else if q.PeerTarget.Kind == "workload" { 243 lb.PeerWorkload(q.PeerTarget.Name, q.PeerTarget.Namespace) 244 } else if q.PeerTarget.Kind == "service" { 245 lb.PeerService(q.PeerTarget.Name, q.PeerTarget.Namespace) 246 } 247 } 248 if q.Target.Cluster != "" { 249 lb.Cluster(q.Target.Cluster) 250 } 251 return lb 252 } 253 254 func (in *MetricsService) GetControlPlaneMetrics(q models.IstioMetricsQuery, scaler func(n string) float64) (models.MetricsMap, error) { 255 metrics := make(models.MetricsMap) 256 257 h := in.prom.FetchHistogramRange("pilot_proxy_convergence_time", "", "", &q.RangeQuery) 258 var err error 259 converted, err := models.ConvertHistogram("pilot_proxy_convergence_time", h, models.ConversionParams{Scale: 1}) 260 if err != nil { 261 return nil, err 262 } 263 metrics["pilot_proxy_convergence_time"] = append(metrics["pilot_proxy_convergence_time"], converted...) 264 265 metric := in.prom.FetchRateRange("container_cpu_usage_seconds_total", []string{`{pod=~"istiod-.*|istio-pilot-.*"}`}, "", &q.RangeQuery) 266 converted, err = models.ConvertMetric("container_cpu_usage_seconds_total", metric, models.ConversionParams{Scale: 1}) 267 if err != nil { 268 return nil, err 269 } 270 metrics["container_cpu_usage_seconds_total"] = append(metrics["container_cpu_usage_seconds_total"], converted...) 271 272 metric = in.prom.FetchRateRange("process_cpu_seconds_total", []string{`{app="istiod"}`}, "", &q.RangeQuery) 273 converted, err = models.ConvertMetric("process_cpu_seconds_total", metric, models.ConversionParams{Scale: 1}) 274 if err != nil { 275 return nil, err 276 } 277 metrics["process_cpu_seconds_total"] = append(metrics["process_cpu_seconds_total"], converted...) 278 279 metric = in.prom.FetchRange("container_memory_working_set_bytes", `{container="discovery", pod=~"istiod-.*|istio-pilot-.*"}`, "", "", &q.RangeQuery) 280 converted, err = models.ConvertMetric("container_memory_working_set_bytes", metric, models.ConversionParams{Scale: 0.000001}) 281 if err != nil { 282 return nil, err 283 } 284 metrics["container_memory_working_set_bytes"] = append(metrics["container_memory_working_set_bytes"], converted...) 285 286 metric = in.prom.FetchRange("process_resident_memory_bytes", `{app="istiod"}`, "", "", &q.RangeQuery) 287 converted, err = models.ConvertMetric("process_resident_memory_bytes", metric, models.ConversionParams{Scale: 0.000001}) 288 if err != nil { 289 return nil, err 290 } 291 metrics["process_resident_memory_bytes"] = append(metrics["process_resident_memory_bytes"], converted...) 292 293 return metrics, nil 294 }