k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/slos/api_responsiveness_prometheus.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package slos
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"sort"
    23  	"strconv"
    24  	"time"
    25  
    26  	"github.com/prometheus/common/model"
    27  	"gopkg.in/yaml.v2"
    28  	"k8s.io/klog/v2"
    29  
    30  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    31  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    32  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/common"
    33  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    34  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    35  )
    36  
const (
	// apiResponsivenessPrometheusMeasurementName is the name under which the
	// measurement is registered and the default name of its summary.
	apiResponsivenessPrometheusMeasurementName = "APIResponsivenessPrometheus"

	// Thresholds for API call latency as defined in the official K8s SLO
	// https://github.com/kubernetes/community/blob/master/sig-scalability/slos/api_call_latency.md
	singleResourceThreshold    time.Duration = 1 * time.Second
	multipleResourcesThreshold time.Duration = 30 * time.Second

	// currentAPICallMetricsVersion is the version stamped on the produced perf data.
	currentAPICallMetricsVersion = "v1"

	// filters is the label selector shared by the latency and count queries.
	// It drops WATCH calls and the log/exec/portforward/attach/proxy subresources.
	filters = `verb!="WATCH", subresource!~"log|exec|portforward|attach|proxy"`

	// latencyQuery matches the description of the API call latency SLI and measures the 99th percentile over 5m windows.
	//
	// latencyQuery: placeholders should be replaced with (1) metric name, (2) filters and (3) query window size.
	latencyQuery = "quantile_over_time(0.99, %v{%v}[%v])"

	// simpleLatencyQuery measures a given percentile of API call latency over a given period of time.
	// It doesn't match the SLI, but is useful in shorter tests, where there are not enough windows to use latencyQuery meaningfully.
	//
	// simpleLatencyQuery: placeholders should be replaced with (1) quantile, (2) metric name, (3) filters and (4) query window size.
	simpleLatencyQuery = "histogram_quantile(%.2f, sum(rate(%v_bucket{%v}[%v])) by (resource,  subresource, verb, scope, le))"

	// countQuery counts all calls per (resource, subresource, scope, verb).
	//
	// countQuery: placeholders should be replaced with (1) metric name, (2) filters and (3) query window size.
	countQuery = "sum(increase(%v_count{%v}[%v])) by (resource, subresource, scope, verb)"

	// countFastQuery counts the calls that landed in a single histogram bucket;
	// it is used with the le-bounded filters below to count "fast" calls, which
	// are then subtracted from the total count to obtain the slow count.
	//
	// countFastQuery: placeholders should be replaced with (1) metric name, (2) filters and (3) query window size.
	countFastQuery = "sum(increase(%v_bucket{%v}[%v])) by (resource, subresource, scope, verb)"

	// Selects the le="1" bucket of non-WATCH, non-LIST calls (proxy subresource
	// excluded): calls in this bucket count as fast and are excluded from the slow count.
	filterGetAndMutating = `verb!~"WATCH|LIST", subresource!="proxy", le="1"`
	// Selects the le="5" bucket of LIST calls whose scope is not "cluster":
	// calls in this bucket count as fast and are excluded from the slow count.
	filterNamespaceList = `scope!="cluster", verb="LIST", le="5"`
	// Selects the le="30" bucket of cluster-scoped LIST calls:
	// calls in this bucket count as fast and are excluded from the slow count.
	filterClusterList = `scope="cluster", verb="LIST", le="30"`

	// latencyWindowSize is the aggregation window latencyQuery is based on; that
	// much time is cut from the measured period before querying.
	latencyWindowSize = 5 * time.Minute

	// Number of metrics with highest latency to print. If the latency exceeds SLO threshold, a metric is printed regardless.
	topToPrint = 5
)
    77  
    78  func init() {
    79  	create := func() measurement.Measurement {
    80  		return common.CreatePrometheusMeasurement(&apiResponsivenessGatherer{})
    81  	}
    82  	if err := measurement.Register(apiResponsivenessPrometheusMeasurementName, create); err != nil {
    83  		klog.Fatalf("Cannot register %s: %v", apiResponsivenessPrometheusMeasurementName, err)
    84  	}
    85  }
    86  
// apiCallMetric aggregates the latency quantiles and request counts gathered
// for one API call type, identified by resource, subresource, verb and scope.
type apiCallMetric struct {
	Resource    string                        `json:"resource"`
	Subresource string                        `json:"subresource"`
	Verb        string                        `json:"verb"`
	Scope       string                        `json:"scope"`
	// Latency holds the quantiles recorded through apiCallMetrics.SetLatency.
	Latency     measurementutil.LatencyMetric `json:"latency"`
	// Count is the rounded total number of calls reported by the count query.
	Count       int                           `json:"count"`
	// SlowCount is Count minus the number of calls found in the "fast" bucket.
	SlowCount   int                           `json:"slowCount"`
}
    96  
// apiCallMetrics is the collection of per-call metrics built from Prometheus samples.
type apiCallMetrics struct {
	// metrics is keyed by buildKey(resource, subresource, verb, scope).
	metrics map[string]*apiCallMetric
}
   100  
// customThresholdEntry is one element of the "customThresholds" measurement
// param: a latency threshold overriding the default SLO threshold for the API
// call identified by resource, subresource, verb and scope.
//
// NOTE(review): entries are decoded with gopkg.in/yaml.v2 although the fields
// carry json tags; presumably this works because yaml.v2 falls back to the
// lowercased field name — confirm before renaming any field.
type customThresholdEntry struct {
	Resource    string        `json:"resource"`
	Subresource string        `json:"subresource"`
	Verb        string        `json:"verb"`
	Scope       string        `json:"scope"`
	// Threshold must be non-zero; getCustomThresholds rejects zero values.
	Threshold   time.Duration `json:"threshold"`
}
   108  
// customThresholds maps an API call key (see buildKey) to the latency
// threshold that replaces the default SLO threshold for that call.
type customThresholds map[string]time.Duration
   110  
// getKey returns the buildKey-formatted key of the API call this entry refers
// to, so it can be matched against the keys of apiCallMetrics.metrics.
func (cte *customThresholdEntry) getKey() string {
	return buildKey(cte.Resource, cte.Subresource, cte.Verb, cte.Scope)
}
   114  
// apiResponsivenessGatherer is the stateless gatherer handed to
// common.CreatePrometheusMeasurement for the API responsiveness measurement.
type apiResponsivenessGatherer struct{}
   116  
   117  func (a *apiResponsivenessGatherer) Gather(executor common.QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
   118  	apiCalls, err := a.gatherAPICalls(executor, startTime, endTime, config)
   119  	if err != nil {
   120  		return nil, err
   121  	}
   122  
   123  	content, err := util.PrettyPrintJSON(apiCalls.ToPerfData())
   124  	if err != nil {
   125  		return nil, err
   126  	}
   127  	summaryName, err := util.GetStringOrDefault(config.Params, "summaryName", a.String())
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  	summaries := []measurement.Summary{
   132  		measurement.CreateSummary(summaryName, "json", content),
   133  	}
   134  
   135  	allowedSlowCalls, err := util.GetIntOrDefault(config.Params, "allowedSlowCalls", 0)
   136  	if err != nil {
   137  		return nil, err
   138  	}
   139  
   140  	customThresholds, err := getCustomThresholds(config, apiCalls)
   141  	if err != nil {
   142  		return nil, err
   143  	}
   144  
   145  	badMetrics := a.validateAPICalls(config.Identifier, allowedSlowCalls, apiCalls, customThresholds)
   146  	if len(badMetrics) > 0 {
   147  		err = errors.NewMetricViolationError("top latency metric", fmt.Sprintf("there should be no high-latency requests, but: %v", badMetrics))
   148  	}
   149  	return summaries, err
   150  }
   151  
// String returns the name of the measurement; Gather also uses it as the
// default summary name.
func (a *apiResponsivenessGatherer) String() string {
	return apiResponsivenessPrometheusMeasurementName
}
   155  
// Configure is a no-op for this gatherer: it ignores config and always returns nil.
func (a *apiResponsivenessGatherer) Configure(config *measurement.Config) error {
	return nil
}
   159  
// IsEnabled reports that the gatherer is enabled regardless of config.
func (a *apiResponsivenessGatherer) IsEnabled(config *measurement.Config) bool {
	return true
}
   163  
   164  func (a *apiResponsivenessGatherer) gatherAPICalls(executor common.QueryExecutor, startTime, endTime time.Time, config *measurement.Config) (*apiCallMetrics, error) {
   165  	measurementDuration := endTime.Sub(startTime)
   166  	promDuration := measurementutil.ToPrometheusTime(measurementDuration)
   167  	apiserverSLI := measurementutil.GetApiserverSLI(config.ClusterVersion)
   168  	apiserverLatency := measurementutil.GetApiserverLatency(config.ClusterVersion)
   169  
   170  	useSimple, err := util.GetBoolOrDefault(config.Params, "useSimpleLatencyQuery", false)
   171  	if err != nil {
   172  		return nil, err
   173  	}
   174  
   175  	var latencySamples []*model.Sample
   176  	if useSimple {
   177  		quantiles := []float64{0.5, 0.9, 0.99}
   178  		for _, q := range quantiles {
   179  			query := fmt.Sprintf(simpleLatencyQuery, q, apiserverSLI, filters, promDuration)
   180  			samples, err := executor.Query(query, endTime)
   181  			if err != nil {
   182  				return nil, err
   183  			}
   184  			// Underlying code assumes presence of 'quantile' label, so adding it manually.
   185  			for _, sample := range samples {
   186  				sample.Metric["quantile"] = model.LabelValue(fmt.Sprintf("%.2f", q))
   187  			}
   188  			latencySamples = append(latencySamples, samples...)
   189  		}
   190  	} else {
   191  		// Latency measurement is based on 5m window aggregation,
   192  		// therefore first 5 minutes of the test should be skipped.
   193  		latencyMeasurementDuration := measurementDuration - latencyWindowSize
   194  		if latencyMeasurementDuration < time.Minute {
   195  			latencyMeasurementDuration = time.Minute
   196  		}
   197  		duration := measurementutil.ToPrometheusTime(latencyMeasurementDuration)
   198  
   199  		query := fmt.Sprintf(latencyQuery, apiserverLatency, filters, duration)
   200  		latencySamples, err = executor.Query(query, endTime)
   201  		if err != nil {
   202  			return nil, err
   203  		}
   204  	}
   205  
   206  	query := fmt.Sprintf(countQuery, apiserverSLI, filters, promDuration)
   207  	countSamples, err := executor.Query(query, endTime)
   208  	if err != nil {
   209  		return nil, err
   210  	}
   211  
   212  	countFastSamples := make([]*model.Sample, 0)
   213  	filters := []string{filterGetAndMutating, filterNamespaceList, filterClusterList}
   214  	for _, filter := range filters {
   215  		query := fmt.Sprintf(countFastQuery, apiserverSLI, filter, promDuration)
   216  		samples, err := executor.Query(query, endTime)
   217  		if err != nil {
   218  			return nil, err
   219  		}
   220  		countFastSamples = append(countFastSamples, samples...)
   221  	}
   222  
   223  	return newFromSamples(latencySamples, countSamples, countFastSamples)
   224  }
   225  
   226  func getCustomThresholds(config *measurement.Config, metrics *apiCallMetrics) (customThresholds, error) {
   227  	thresholdsString, err := util.GetStringOrDefault(config.Params, "customThresholds", "")
   228  	if err != nil {
   229  		return nil, err
   230  	}
   231  	var thresholds []customThresholdEntry
   232  	if err := yaml.Unmarshal([]byte(thresholdsString), &thresholds); err != nil {
   233  		return nil, err
   234  	}
   235  
   236  	customThresholds := customThresholds{}
   237  	for _, entry := range thresholds {
   238  		if entry.Threshold == 0 {
   239  			return nil, fmt.Errorf("custom threshold must be set to a positive time duration")
   240  		}
   241  		key := entry.getKey()
   242  		if _, ok := metrics.metrics[key]; !ok {
   243  			klog.V(1).Infof("WARNING: unrecognized custom threshold API call key: %v", key)
   244  		} else {
   245  			customThresholds[key] = entry.Threshold
   246  		}
   247  	}
   248  	return customThresholds, nil
   249  }
   250  
   251  func (a *apiResponsivenessGatherer) validateAPICalls(identifier string, allowedSlowCalls int, metrics *apiCallMetrics, customThresholds customThresholds) []error {
   252  	badMetrics := make([]error, 0)
   253  	top := topToPrint
   254  
   255  	for _, apiCall := range metrics.sorted() {
   256  		var threshold time.Duration
   257  		if customThreshold, ok := customThresholds[apiCall.getKey()]; ok {
   258  			threshold = customThreshold
   259  		} else {
   260  			threshold = apiCall.getSLOThreshold()
   261  		}
   262  		var err error
   263  		if err = apiCall.Validate(allowedSlowCalls, threshold); err != nil {
   264  			badMetrics = append(badMetrics, err)
   265  		}
   266  		if top > 0 || err != nil {
   267  			top--
   268  			prefix := ""
   269  			if err != nil {
   270  				prefix = "WARNING "
   271  			}
   272  			klog.V(2).Infof("%s: %vTop latency metric: %+v; threshold: %v", identifier, prefix, *apiCall, threshold)
   273  		}
   274  	}
   275  	return badMetrics
   276  }
   277  
   278  func newFromSamples(latencySamples, countSamples, countFastSamples []*model.Sample) (*apiCallMetrics, error) {
   279  	extractCommon := func(sample *model.Sample) (string, string, string, string) {
   280  		return string(sample.Metric["resource"]), string(sample.Metric["subresource"]), string(sample.Metric["verb"]), string(sample.Metric["scope"])
   281  	}
   282  
   283  	m := &apiCallMetrics{metrics: make(map[string]*apiCallMetric)}
   284  
   285  	for _, sample := range latencySamples {
   286  		resource, subresource, verb, scope := extractCommon(sample)
   287  		quantile, err := strconv.ParseFloat(string(sample.Metric["quantile"]), 64)
   288  		if err != nil {
   289  			return nil, err
   290  		}
   291  
   292  		latency := time.Duration(float64(sample.Value) * float64(time.Second))
   293  		m.SetLatency(resource, subresource, verb, scope, quantile, latency)
   294  	}
   295  
   296  	for _, sample := range countSamples {
   297  		resource, subresource, verb, scope := extractCommon(sample)
   298  		count := int(math.Round(float64(sample.Value)))
   299  		m.SetCount(resource, subresource, verb, scope, count)
   300  	}
   301  
   302  	for _, sample := range countFastSamples {
   303  		resource, subresource, verb, scope := extractCommon(sample)
   304  		fastCount := int(math.Round(float64(sample.Value)))
   305  		count := m.GetCount(resource, subresource, verb, scope)
   306  		slowCount := count - fastCount
   307  		m.SetSlowCount(resource, subresource, verb, scope, slowCount)
   308  	}
   309  
   310  	return m, nil
   311  }
   312  
   313  func (m *apiCallMetrics) getAPICall(resource, subresource, verb, scope string) *apiCallMetric {
   314  	key := buildKey(resource, subresource, verb, scope)
   315  	call, exists := m.metrics[key]
   316  	if !exists {
   317  		call = &apiCallMetric{
   318  			Resource:    resource,
   319  			Subresource: subresource,
   320  			Verb:        verb,
   321  			Scope:       scope,
   322  		}
   323  		m.metrics[key] = call
   324  	}
   325  	return call
   326  }
   327  
   328  func (m *apiCallMetrics) SetLatency(resource, subresource, verb, scope string, quantile float64, latency time.Duration) {
   329  	call := m.getAPICall(resource, subresource, verb, scope)
   330  	call.Latency.SetQuantile(quantile, latency)
   331  }
   332  
   333  func (m *apiCallMetrics) GetCount(resource, subresource, verb, scope string) int {
   334  	call := m.getAPICall(resource, subresource, verb, scope)
   335  	return call.Count
   336  }
   337  
   338  func (m *apiCallMetrics) SetCount(resource, subresource, verb, scope string, count int) {
   339  	if count == 0 {
   340  		return
   341  	}
   342  	call := m.getAPICall(resource, subresource, verb, scope)
   343  	call.Count = count
   344  }
   345  
   346  func (m *apiCallMetrics) SetSlowCount(resource, subresource, verb, scope string, count int) {
   347  	if count == 0 {
   348  		return
   349  	}
   350  	call := m.getAPICall(resource, subresource, verb, scope)
   351  	call.SlowCount = count
   352  }
   353  
   354  func (m *apiCallMetrics) ToPerfData() *measurementutil.PerfData {
   355  	perfData := &measurementutil.PerfData{Version: currentAPICallMetricsVersion}
   356  	for _, apicall := range m.sorted() {
   357  		item := measurementutil.DataItem{
   358  			Data: map[string]float64{
   359  				"Perc50": float64(apicall.Latency.Perc50) / 1000000, // us -> ms
   360  				"Perc90": float64(apicall.Latency.Perc90) / 1000000,
   361  				"Perc99": float64(apicall.Latency.Perc99) / 1000000,
   362  			},
   363  			Unit: "ms",
   364  			Labels: map[string]string{
   365  				"Verb":        apicall.Verb,
   366  				"Resource":    apicall.Resource,
   367  				"Subresource": apicall.Subresource,
   368  				"Scope":       apicall.Scope,
   369  				"Count":       fmt.Sprintf("%v", apicall.Count),
   370  				"SlowCount":   fmt.Sprintf("%v", apicall.SlowCount),
   371  			},
   372  		}
   373  		perfData.DataItems = append(perfData.DataItems, item)
   374  	}
   375  	return perfData
   376  }
   377  
   378  func (m *apiCallMetrics) sorted() []*apiCallMetric {
   379  	all := make([]*apiCallMetric, 0)
   380  	for _, v := range m.metrics {
   381  		all = append(all, v)
   382  	}
   383  	sort.Slice(all, func(i, j int) bool {
   384  		return all[i].Latency.Perc99 > all[j].Latency.Perc99
   385  	})
   386  	return all
   387  }
   388  
   389  func buildKey(resource, subresource, verb, scope string) string {
   390  	return fmt.Sprintf("%s|%s|%s|%s", resource, subresource, verb, scope)
   391  }
   392  
// getKey returns the buildKey-formatted key identifying this API call; it is
// the same key under which the call is stored in apiCallMetrics.metrics.
func (ap *apiCallMetric) getKey() string {
	return buildKey(ap.Resource, ap.Subresource, ap.Verb, ap.Scope)
}
   396  
   397  func (ap *apiCallMetric) Validate(allowedSlowCalls int, threshold time.Duration) error {
   398  	// TODO(oxddr): remove allowedSlowCalls guard once it's stable
   399  	if allowedSlowCalls > 0 && ap.SlowCount <= allowedSlowCalls {
   400  		return nil
   401  	}
   402  	if err := ap.Latency.VerifyThreshold(threshold); err != nil {
   403  		return fmt.Errorf("got: %+v; expected perc99 <= %v", ap, threshold)
   404  	}
   405  	return nil
   406  }
   407  
   408  func (ap *apiCallMetric) getSLOThreshold() time.Duration {
   409  	if ap.Scope == "resource" {
   410  		return singleResourceThreshold
   411  	}
   412  	return multipleResourcesThreshold
   413  }