k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/slos/api_responsiveness_prometheus.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package slos 18 19 import ( 20 "fmt" 21 "math" 22 "sort" 23 "strconv" 24 "time" 25 26 "github.com/prometheus/common/model" 27 "gopkg.in/yaml.v2" 28 "k8s.io/klog/v2" 29 30 "k8s.io/perf-tests/clusterloader2/pkg/errors" 31 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 32 "k8s.io/perf-tests/clusterloader2/pkg/measurement/common" 33 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 34 "k8s.io/perf-tests/clusterloader2/pkg/util" 35 ) 36 37 const ( 38 apiResponsivenessPrometheusMeasurementName = "APIResponsivenessPrometheus" 39 40 // Thresholds for API call latency as defined in the official K8s SLO 41 // https://github.com/kubernetes/community/blob/master/sig-scalability/slos/api_call_latency.md 42 singleResourceThreshold time.Duration = 1 * time.Second 43 multipleResourcesThreshold time.Duration = 30 * time.Second 44 45 currentAPICallMetricsVersion = "v1" 46 47 filters = `verb!="WATCH", subresource!~"log|exec|portforward|attach|proxy"` 48 49 // latencyQuery matches description of the API call latency SLI and measure 99th percentaile over 5m windows 50 // 51 // latencyQuery: %v should be replaced with (1) filters and (2) query window size.. 52 latencyQuery = "quantile_over_time(0.99, %v{%v}[%v])" 53 54 // simpleLatencyQuery measures 99th percentile of API call latency over given period of time 55 // it doesn't match SLI, but is useful in shorter tests, where we don't have enough number of windows to use latencyQuery meaningfully. 56 // 57 // simpleLatencyQuery: placeholders should be replaced with (1) quantile (2) filters and (3) query window size. 58 simpleLatencyQuery = "histogram_quantile(%.2f, sum(rate(%v_bucket{%v}[%v])) by (resource, subresource, verb, scope, le))" 59 60 // countQuery %v should be replaced with (1) filters and (2) query window size. 61 countQuery = "sum(increase(%v_count{%v}[%v])) by (resource, subresource, scope, verb)" 62 63 countFastQuery = "sum(increase(%v_bucket{%v}[%v])) by (resource, subresource, scope, verb)" 64 65 // exclude all buckets of 1s and shorter 66 filterGetAndMutating = `verb!~"WATCH|LIST", subresource!="proxy", le="1"` 67 // exclude all buckets below or equal 5s 68 filterNamespaceList = `scope!="cluster", verb="LIST", le="5"` 69 // exclude all buckets below or equal 30s 70 filterClusterList = `scope="cluster", verb="LIST", le="30"` 71 72 latencyWindowSize = 5 * time.Minute 73 74 // Number of metrics with highest latency to print. If the latency exceeds SLO threshold, a metric is printed regardless. 75 topToPrint = 5 76 ) 77 78 func init() { 79 create := func() measurement.Measurement { 80 return common.CreatePrometheusMeasurement(&apiResponsivenessGatherer{}) 81 } 82 if err := measurement.Register(apiResponsivenessPrometheusMeasurementName, create); err != nil { 83 klog.Fatalf("Cannot register %s: %v", apiResponsivenessPrometheusMeasurementName, err) 84 } 85 } 86 87 type apiCallMetric struct { 88 Resource string `json:"resource"` 89 Subresource string `json:"subresource"` 90 Verb string `json:"verb"` 91 Scope string `json:"scope"` 92 Latency measurementutil.LatencyMetric `json:"latency"` 93 Count int `json:"count"` 94 SlowCount int `json:"slowCount"` 95 } 96 97 type apiCallMetrics struct { 98 metrics map[string]*apiCallMetric 99 } 100 101 type customThresholdEntry struct { 102 Resource string `json:"resource"` 103 Subresource string `json:"subresource"` 104 Verb string `json:"verb"` 105 Scope string `json:"scope"` 106 Threshold time.Duration `json:"threshold"` 107 } 108 109 type customThresholds map[string]time.Duration 110 111 func (cte *customThresholdEntry) getKey() string { 112 return buildKey(cte.Resource, cte.Subresource, cte.Verb, cte.Scope) 113 } 114 115 type apiResponsivenessGatherer struct{} 116 117 func (a *apiResponsivenessGatherer) Gather(executor common.QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) { 118 apiCalls, err := a.gatherAPICalls(executor, startTime, endTime, config) 119 if err != nil { 120 return nil, err 121 } 122 123 content, err := util.PrettyPrintJSON(apiCalls.ToPerfData()) 124 if err != nil { 125 return nil, err 126 } 127 summaryName, err := util.GetStringOrDefault(config.Params, "summaryName", a.String()) 128 if err != nil { 129 return nil, err 130 } 131 summaries := []measurement.Summary{ 132 measurement.CreateSummary(summaryName, "json", content), 133 } 134 135 allowedSlowCalls, err := util.GetIntOrDefault(config.Params, "allowedSlowCalls", 0) 136 if err != nil { 137 return nil, err 138 } 139 140 customThresholds, err := getCustomThresholds(config, apiCalls) 141 if err != nil { 142 return nil, err 143 } 144 145 badMetrics := a.validateAPICalls(config.Identifier, allowedSlowCalls, apiCalls, customThresholds) 146 if len(badMetrics) > 0 { 147 err = errors.NewMetricViolationError("top latency metric", fmt.Sprintf("there should be no high-latency requests, but: %v", badMetrics)) 148 } 149 return summaries, err 150 } 151 152 func (a *apiResponsivenessGatherer) String() string { 153 return apiResponsivenessPrometheusMeasurementName 154 } 155 156 func (a *apiResponsivenessGatherer) Configure(config *measurement.Config) error { 157 return nil 158 } 159 160 func (a *apiResponsivenessGatherer) IsEnabled(config *measurement.Config) bool { 161 return true 162 } 163 164 func (a *apiResponsivenessGatherer) gatherAPICalls(executor common.QueryExecutor, startTime, endTime time.Time, config *measurement.Config) (*apiCallMetrics, error) { 165 measurementDuration := endTime.Sub(startTime) 166 promDuration := measurementutil.ToPrometheusTime(measurementDuration) 167 apiserverSLI := measurementutil.GetApiserverSLI(config.ClusterVersion) 168 apiserverLatency := measurementutil.GetApiserverLatency(config.ClusterVersion) 169 170 useSimple, err := util.GetBoolOrDefault(config.Params, "useSimpleLatencyQuery", false) 171 if err != nil { 172 return nil, err 173 } 174 175 var latencySamples []*model.Sample 176 if useSimple { 177 quantiles := []float64{0.5, 0.9, 0.99} 178 for _, q := range quantiles { 179 query := fmt.Sprintf(simpleLatencyQuery, q, apiserverSLI, filters, promDuration) 180 samples, err := executor.Query(query, endTime) 181 if err != nil { 182 return nil, err 183 } 184 // Underlying code assumes presence of 'quantile' label, so adding it manually. 185 for _, sample := range samples { 186 sample.Metric["quantile"] = model.LabelValue(fmt.Sprintf("%.2f", q)) 187 } 188 latencySamples = append(latencySamples, samples...) 189 } 190 } else { 191 // Latency measurement is based on 5m window aggregation, 192 // therefore first 5 minutes of the test should be skipped. 193 latencyMeasurementDuration := measurementDuration - latencyWindowSize 194 if latencyMeasurementDuration < time.Minute { 195 latencyMeasurementDuration = time.Minute 196 } 197 duration := measurementutil.ToPrometheusTime(latencyMeasurementDuration) 198 199 query := fmt.Sprintf(latencyQuery, apiserverLatency, filters, duration) 200 latencySamples, err = executor.Query(query, endTime) 201 if err != nil { 202 return nil, err 203 } 204 } 205 206 query := fmt.Sprintf(countQuery, apiserverSLI, filters, promDuration) 207 countSamples, err := executor.Query(query, endTime) 208 if err != nil { 209 return nil, err 210 } 211 212 countFastSamples := make([]*model.Sample, 0) 213 filters := []string{filterGetAndMutating, filterNamespaceList, filterClusterList} 214 for _, filter := range filters { 215 query := fmt.Sprintf(countFastQuery, apiserverSLI, filter, promDuration) 216 samples, err := executor.Query(query, endTime) 217 if err != nil { 218 return nil, err 219 } 220 countFastSamples = append(countFastSamples, samples...) 221 } 222 223 return newFromSamples(latencySamples, countSamples, countFastSamples) 224 } 225 226 func getCustomThresholds(config *measurement.Config, metrics *apiCallMetrics) (customThresholds, error) { 227 thresholdsString, err := util.GetStringOrDefault(config.Params, "customThresholds", "") 228 if err != nil { 229 return nil, err 230 } 231 var thresholds []customThresholdEntry 232 if err := yaml.Unmarshal([]byte(thresholdsString), &thresholds); err != nil { 233 return nil, err 234 } 235 236 customThresholds := customThresholds{} 237 for _, entry := range thresholds { 238 if entry.Threshold == 0 { 239 return nil, fmt.Errorf("custom threshold must be set to a positive time duration") 240 } 241 key := entry.getKey() 242 if _, ok := metrics.metrics[key]; !ok { 243 klog.V(1).Infof("WARNING: unrecognized custom threshold API call key: %v", key) 244 } else { 245 customThresholds[key] = entry.Threshold 246 } 247 } 248 return customThresholds, nil 249 } 250 251 func (a *apiResponsivenessGatherer) validateAPICalls(identifier string, allowedSlowCalls int, metrics *apiCallMetrics, customThresholds customThresholds) []error { 252 badMetrics := make([]error, 0) 253 top := topToPrint 254 255 for _, apiCall := range metrics.sorted() { 256 var threshold time.Duration 257 if customThreshold, ok := customThresholds[apiCall.getKey()]; ok { 258 threshold = customThreshold 259 } else { 260 threshold = apiCall.getSLOThreshold() 261 } 262 var err error 263 if err = apiCall.Validate(allowedSlowCalls, threshold); err != nil { 264 badMetrics = append(badMetrics, err) 265 } 266 if top > 0 || err != nil { 267 top-- 268 prefix := "" 269 if err != nil { 270 prefix = "WARNING " 271 } 272 klog.V(2).Infof("%s: %vTop latency metric: %+v; threshold: %v", identifier, prefix, *apiCall, threshold) 273 } 274 } 275 return badMetrics 276 } 277 278 func newFromSamples(latencySamples, countSamples, countFastSamples []*model.Sample) (*apiCallMetrics, error) { 279 extractCommon := func(sample *model.Sample) (string, string, string, string) { 280 return string(sample.Metric["resource"]), string(sample.Metric["subresource"]), string(sample.Metric["verb"]), string(sample.Metric["scope"]) 281 } 282 283 m := &apiCallMetrics{metrics: make(map[string]*apiCallMetric)} 284 285 for _, sample := range latencySamples { 286 resource, subresource, verb, scope := extractCommon(sample) 287 quantile, err := strconv.ParseFloat(string(sample.Metric["quantile"]), 64) 288 if err != nil { 289 return nil, err 290 } 291 292 latency := time.Duration(float64(sample.Value) * float64(time.Second)) 293 m.SetLatency(resource, subresource, verb, scope, quantile, latency) 294 } 295 296 for _, sample := range countSamples { 297 resource, subresource, verb, scope := extractCommon(sample) 298 count := int(math.Round(float64(sample.Value))) 299 m.SetCount(resource, subresource, verb, scope, count) 300 } 301 302 for _, sample := range countFastSamples { 303 resource, subresource, verb, scope := extractCommon(sample) 304 fastCount := int(math.Round(float64(sample.Value))) 305 count := m.GetCount(resource, subresource, verb, scope) 306 slowCount := count - fastCount 307 m.SetSlowCount(resource, subresource, verb, scope, slowCount) 308 } 309 310 return m, nil 311 } 312 313 func (m *apiCallMetrics) getAPICall(resource, subresource, verb, scope string) *apiCallMetric { 314 key := buildKey(resource, subresource, verb, scope) 315 call, exists := m.metrics[key] 316 if !exists { 317 call = &apiCallMetric{ 318 Resource: resource, 319 Subresource: subresource, 320 Verb: verb, 321 Scope: scope, 322 } 323 m.metrics[key] = call 324 } 325 return call 326 } 327 328 func (m *apiCallMetrics) SetLatency(resource, subresource, verb, scope string, quantile float64, latency time.Duration) { 329 call := m.getAPICall(resource, subresource, verb, scope) 330 call.Latency.SetQuantile(quantile, latency) 331 } 332 333 func (m *apiCallMetrics) GetCount(resource, subresource, verb, scope string) int { 334 call := m.getAPICall(resource, subresource, verb, scope) 335 return call.Count 336 } 337 338 func (m *apiCallMetrics) SetCount(resource, subresource, verb, scope string, count int) { 339 if count == 0 { 340 return 341 } 342 call := m.getAPICall(resource, subresource, verb, scope) 343 call.Count = count 344 } 345 346 func (m *apiCallMetrics) SetSlowCount(resource, subresource, verb, scope string, count int) { 347 if count == 0 { 348 return 349 } 350 call := m.getAPICall(resource, subresource, verb, scope) 351 call.SlowCount = count 352 } 353 354 func (m *apiCallMetrics) ToPerfData() *measurementutil.PerfData { 355 perfData := &measurementutil.PerfData{Version: currentAPICallMetricsVersion} 356 for _, apicall := range m.sorted() { 357 item := measurementutil.DataItem{ 358 Data: map[string]float64{ 359 "Perc50": float64(apicall.Latency.Perc50) / 1000000, // us -> ms 360 "Perc90": float64(apicall.Latency.Perc90) / 1000000, 361 "Perc99": float64(apicall.Latency.Perc99) / 1000000, 362 }, 363 Unit: "ms", 364 Labels: map[string]string{ 365 "Verb": apicall.Verb, 366 "Resource": apicall.Resource, 367 "Subresource": apicall.Subresource, 368 "Scope": apicall.Scope, 369 "Count": fmt.Sprintf("%v", apicall.Count), 370 "SlowCount": fmt.Sprintf("%v", apicall.SlowCount), 371 }, 372 } 373 perfData.DataItems = append(perfData.DataItems, item) 374 } 375 return perfData 376 } 377 378 func (m *apiCallMetrics) sorted() []*apiCallMetric { 379 all := make([]*apiCallMetric, 0) 380 for _, v := range m.metrics { 381 all = append(all, v) 382 } 383 sort.Slice(all, func(i, j int) bool { 384 return all[i].Latency.Perc99 > all[j].Latency.Perc99 385 }) 386 return all 387 } 388 389 func buildKey(resource, subresource, verb, scope string) string { 390 return fmt.Sprintf("%s|%s|%s|%s", resource, subresource, verb, scope) 391 } 392 393 func (ap *apiCallMetric) getKey() string { 394 return buildKey(ap.Resource, ap.Subresource, ap.Verb, ap.Scope) 395 } 396 397 func (ap *apiCallMetric) Validate(allowedSlowCalls int, threshold time.Duration) error { 398 // TODO(oxddr): remove allowedSlowCalls guard once it's stable 399 if allowedSlowCalls > 0 && ap.SlowCount <= allowedSlowCalls { 400 return nil 401 } 402 if err := ap.Latency.VerifyThreshold(threshold); err != nil { 403 return fmt.Errorf("got: %+v; expected perc99 <= %v", ap, threshold) 404 } 405 return nil 406 } 407 408 func (ap *apiCallMetric) getSLOThreshold() time.Duration { 409 if ap.Scope == "resource" { 410 return singleResourceThreshold 411 } 412 return multipleResourcesThreshold 413 }