k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/slos/api_responsiveness_prometheus_test.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package slos
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"strings"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/prometheus/common/model"
    29  	"github.com/stretchr/testify/assert"
    30  	"k8s.io/klog/v2"
    31  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    32  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    33  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/common/executors"
    34  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    35  
    36  	_ "k8s.io/perf-tests/clusterloader2/pkg/flags" // init klog
    37  )
    38  
var (
	// klogv1 allows users to turn on/off logging to stderr only through
	// the use of flag. This prevents us from having control over which
	// of the test functions have that mechanism turned off when we run
	// go test command.
	// TODO(#1286): refactor api_responsiveness_prometheus.go to make
	// testing of logging easier and remove this hack in the end.
	//
	// klogLogToStderr tracks whether klog is still writing to stderr;
	// turnOffLoggingToStderrInKlog flips it to false (at most once per
	// test binary) so tests can capture log output via klog.SetOutput.
	klogLogToStderr = true
)
    48  
    49  func turnOffLoggingToStderrInKlog(t *testing.T) {
    50  	if klogLogToStderr {
    51  		err := flag.Set("logtostderr", "false")
    52  		if err != nil {
    53  			t.Errorf("Unable to set flag %v", err)
    54  			return
    55  		}
    56  		err = flag.Set("v", "2")
    57  		if err != nil {
    58  			t.Errorf("Unable to set flag %v", err)
    59  			return
    60  		}
    61  		flag.Parse()
    62  		klogLogToStderr = false
    63  	}
    64  }
    65  
// sample is one fake apiserver request-latency data point served by
// fakeQueryExecutor. The string fields become Prometheus metric labels;
// the numeric fields feed the values returned for the various queries.
type sample struct {
	resource    string
	subresource string
	verb        string
	scope       string
	latency     float64 // latency in seconds (Gather reports milliseconds)
	count       int     // total number of calls
	slowCount   int     // number of calls counted as slow
}
// summaryEntry describes the expected content of a single data item in the
// gathered summary; p50/p90/p99 are in milliseconds and the counts are the
// stringified label values checked by checkSummary.
type summaryEntry struct {
	resource    string
	subresource string
	verb        string
	scope       string
	p50         float64
	p90         float64
	p99         float64
	count       string
	slowCount   string
}
    86  
// fakeQueryExecutor is a stub Prometheus query executor that fabricates one
// result per configured sample instead of talking to a real Prometheus.
type fakeQueryExecutor struct {
	samples []*sample
}
    90  
    91  func (ex *fakeQueryExecutor) Query(query string, queryTime time.Time) ([]*model.Sample, error) {
    92  	samples := make([]*model.Sample, 0)
    93  	for _, s := range ex.samples {
    94  		sample := &model.Sample{
    95  			Metric: model.Metric{
    96  				"resource":    model.LabelValue(s.resource),
    97  				"subresource": model.LabelValue(s.subresource),
    98  				"verb":        model.LabelValue(s.verb),
    99  				"scope":       model.LabelValue(s.scope),
   100  			},
   101  		}
   102  
   103  		if strings.HasPrefix(query, "sum(increase") {
   104  			if strings.Contains(query, "_count") {
   105  				// countQuery
   106  				sample.Value = model.SampleValue(s.count)
   107  			} else {
   108  				// countFastQuery
   109  				// This is query is called 3 times, but to avoid complex fake
   110  				// the same value is returned every time. The logic can handle
   111  				// duplicates well, so this shouldn't be an issue.
   112  				sample.Value = model.SampleValue(s.count - s.slowCount)
   113  			}
   114  		} else if strings.HasPrefix(query, "histogram_quantile") {
   115  			// simpleLatencyQuery
   116  			sample.Value = model.SampleValue(s.latency)
   117  		} else if strings.HasPrefix(query, "quantile_over_time") {
   118  			// latencyQuery
   119  			sample.Metric["quantile"] = ".99"
   120  			sample.Value = model.SampleValue(s.latency)
   121  		}
   122  		samples = append(samples, sample)
   123  	}
   124  	return samples, nil
   125  }
   126  
   127  func TestAPIResponsivenessSLOFailures(t *testing.T) {
   128  	cases := []struct {
   129  		name               string
   130  		useSimple          bool
   131  		allowedSlow        int
   132  		hasError           bool
   133  		testSeriesFile     string
   134  		testSeriesDuration time.Duration
   135  	}{
   136  		{
   137  			name:               "slo_pass",
   138  			hasError:           false,
   139  			testSeriesFile:     "slo_pass.yaml",
   140  			testSeriesDuration: 10 * time.Minute,
   141  		},
   142  		{
   143  			name:               "below_slow_count_pass",
   144  			hasError:           false,
   145  			allowedSlow:        1,
   146  			testSeriesFile:     "below_slow_count_pass.yaml",
   147  			testSeriesDuration: 10 * time.Minute,
   148  		},
   149  		{
   150  			name:               "above_slow_count_failure",
   151  			hasError:           true,
   152  			allowedSlow:        1,
   153  			testSeriesFile:     "above_slow_count_failure.yaml",
   154  			testSeriesDuration: 10 * time.Minute,
   155  		},
   156  		{
   157  			name:               "mutating_slo_failure",
   158  			hasError:           true,
   159  			testSeriesFile:     "mutating_slo_failure.yaml",
   160  			testSeriesDuration: 10 * time.Minute,
   161  		},
   162  		{
   163  			name:               "get_slo_failure",
   164  			hasError:           true,
   165  			testSeriesFile:     "get_slo_failure.yaml",
   166  			testSeriesDuration: 10 * time.Minute,
   167  		},
   168  		{
   169  			name:               "namespace_list_slo_failure",
   170  			hasError:           true,
   171  			testSeriesFile:     "namespace_list_slo_failure.yaml",
   172  			testSeriesDuration: 10 * time.Minute,
   173  		},
   174  		{
   175  			name:               "cluster_list_slo_failure",
   176  			hasError:           true,
   177  			testSeriesFile:     "cluster_list_slo_failure.yaml",
   178  			testSeriesDuration: 10 * time.Minute,
   179  		},
   180  		{
   181  			name:               "slo_pass_simple",
   182  			useSimple:          true,
   183  			hasError:           false,
   184  			testSeriesFile:     "slo_pass.yaml",
   185  			testSeriesDuration: 10 * time.Minute,
   186  		},
   187  		{
   188  			name:               "mutating_slo_failure_simple",
   189  			useSimple:          true,
   190  			hasError:           true,
   191  			testSeriesFile:     "mutating_slo_failure.yaml",
   192  			testSeriesDuration: 10 * time.Minute,
   193  		},
   194  		{
   195  			name:               "get_slo_failure_simple",
   196  			useSimple:          true,
   197  			hasError:           true,
   198  			testSeriesFile:     "get_slo_failure.yaml",
   199  			testSeriesDuration: 10 * time.Minute,
   200  		},
   201  		{
   202  			name:               "namespace_list_slo_failure_simple",
   203  			useSimple:          true,
   204  			hasError:           true,
   205  			testSeriesFile:     "namespace_list_slo_failure.yaml",
   206  			testSeriesDuration: 10 * time.Minute,
   207  		},
   208  		{
   209  			name:               "cluster_list_slo_failure_simple",
   210  			useSimple:          true,
   211  			hasError:           true,
   212  			testSeriesFile:     "cluster_list_slo_failure.yaml",
   213  			testSeriesDuration: 10 * time.Minute,
   214  		},
   215  	}
   216  
   217  	for _, tc := range cases {
   218  		t.Run(tc.name, func(t *testing.T) {
   219  			executor, err := executors.NewPromqlExecutor(fmt.Sprintf("../testdata/api_responsiveness_prometheus/%s", tc.testSeriesFile))
   220  			if err != nil {
   221  				t.Fatalf("failed to create PromQL executor: %v", err)
   222  			}
   223  			defer executor.Close()
   224  			gatherer := &apiResponsivenessGatherer{}
   225  			config := &measurement.Config{
   226  				Params: map[string]interface{}{
   227  					"useSimpleLatencyQuery": tc.useSimple,
   228  					"allowedSlowCalls":      tc.allowedSlow,
   229  				},
   230  			}
   231  			start := time.Unix(0, 0).UTC()
   232  			end := start.Add(tc.testSeriesDuration)
   233  			_, err = gatherer.Gather(executor, start, end, config)
   234  			if tc.hasError {
   235  				assert.NotNil(t, err, "wanted error, but got none")
   236  			} else {
   237  				assert.Nil(t, err, "wanted no error, but got %v", err)
   238  			}
   239  		})
   240  	}
   241  }
   242  
   243  func TestAPIResponsivenessSummary(t *testing.T) {
   244  	cases := []struct {
   245  		name        string
   246  		samples     []*sample
   247  		summary     []*summaryEntry
   248  		allowedSlow int
   249  	}{
   250  		{
   251  			name:        "single_entry",
   252  			allowedSlow: 0,
   253  			samples: []*sample{
   254  				{
   255  					resource:  "pod",
   256  					verb:      "POST",
   257  					scope:     "resource",
   258  					latency:   1.2,
   259  					count:     123,
   260  					slowCount: 5,
   261  				},
   262  			},
   263  			summary: []*summaryEntry{
   264  				{
   265  					resource:  "pod",
   266  					verb:      "POST",
   267  					scope:     "resource",
   268  					p99:       1200.,
   269  					count:     "123",
   270  					slowCount: "5",
   271  				},
   272  			},
   273  		},
   274  		{
   275  			name:        "single_entry_with_slow_calls_enabled",
   276  			allowedSlow: 1,
   277  			samples: []*sample{
   278  				{
   279  					resource:  "pod",
   280  					verb:      "POST",
   281  					scope:     "resource",
   282  					latency:   1.2,
   283  					count:     123,
   284  					slowCount: 5,
   285  				},
   286  			},
   287  			summary: []*summaryEntry{
   288  				{
   289  					resource:  "pod",
   290  					verb:      "POST",
   291  					scope:     "resource",
   292  					p99:       1200.,
   293  					count:     "123",
   294  					slowCount: "5",
   295  				},
   296  			},
   297  		},
   298  	}
   299  
   300  	for _, tc := range cases {
   301  		t.Run(tc.name, func(t *testing.T) {
   302  			executor := &fakeQueryExecutor{samples: tc.samples}
   303  			gatherer := &apiResponsivenessGatherer{}
   304  			config := &measurement.Config{
   305  				Params: map[string]interface{}{
   306  					"allowedSlowCalls": tc.allowedSlow,
   307  				},
   308  			}
   309  
   310  			summaries, err := gatherer.Gather(executor, time.Now(), time.Now(), config)
   311  			if !errors.IsMetricViolationError(err) {
   312  				t.Fatal("unexpected error: ", err)
   313  			}
   314  			checkSummary(t, summaries, tc.summary)
   315  		})
   316  	}
   317  }
   318  
   319  func checkSummary(t *testing.T, got []measurement.Summary, wanted []*summaryEntry) {
   320  	assert.Lenf(t, got, 1, "wanted single summary, got %d", len(got))
   321  	var perfData measurementutil.PerfData
   322  	if err := json.Unmarshal([]byte(got[0].SummaryContent()), &perfData); err != nil {
   323  		t.Errorf("unable to unmarshal summary: %v", err)
   324  		return
   325  	}
   326  	assert.Equal(t, currentAPICallMetricsVersion, perfData.Version)
   327  	assert.Len(t, perfData.DataItems, len(wanted))
   328  
   329  	toKey := func(resource, subresource, verb, scope string) string {
   330  		return fmt.Sprintf("%s-%s-%s-%s", resource, subresource, verb, scope)
   331  	}
   332  
   333  	items := make(map[string]*measurementutil.DataItem)
   334  	for _, item := range perfData.DataItems {
   335  		items[toKey(
   336  			item.Labels["Resource"],
   337  			item.Labels["Subresource"],
   338  			item.Labels["Verb"],
   339  			item.Labels["Scope"])] = &item
   340  	}
   341  
   342  	for _, entry := range wanted {
   343  		item, ok := items[toKey(entry.resource, entry.subresource, entry.verb, entry.scope)]
   344  		if !ok {
   345  			t.Errorf("%s in %s: %s %s wanted, but not found", entry.verb, entry.scope, entry.resource, entry.subresource)
   346  			continue
   347  		}
   348  		assert.Equal(t, "ms", item.Unit)
   349  		assert.Equal(t, entry.p50, item.Data["Perc50"])
   350  		assert.Equal(t, entry.p90, item.Data["Perc90"])
   351  		assert.Equal(t, entry.p99, item.Data["Perc99"])
   352  		assert.Equal(t, entry.count, item.Labels["Count"])
   353  		assert.Equal(t, entry.slowCount, item.Labels["SlowCount"])
   354  	}
   355  }
   356  
   357  func TestLogging(t *testing.T) {
   358  	cases := []struct {
   359  		name               string
   360  		samples            []*sample
   361  		expectedMessages   []string
   362  		unexpectedMessages []string
   363  	}{
   364  		{
   365  			name: "print_5_warnings",
   366  			samples: []*sample{
   367  				{
   368  					resource: "r1",
   369  					verb:     "POST",
   370  					scope:    "resource",
   371  					latency:  1.2,
   372  				},
   373  				{
   374  					resource: "r2",
   375  					verb:     "POST",
   376  					scope:    "resource",
   377  					latency:  .9,
   378  				},
   379  				{
   380  					resource: "r3",
   381  					verb:     "POST",
   382  					scope:    "resource",
   383  					latency:  .8,
   384  				},
   385  				{
   386  					resource: "r4",
   387  					verb:     "POST",
   388  					scope:    "resource",
   389  					latency:  .7,
   390  				},
   391  				{
   392  					resource: "r5",
   393  					verb:     "POST",
   394  					scope:    "resource",
   395  					latency:  .6,
   396  				},
   397  				{
   398  					resource: "r6",
   399  					verb:     "POST",
   400  					scope:    "resource",
   401  					latency:  .5,
   402  				},
   403  			},
   404  			expectedMessages: []string{
   405  				": WARNING Top latency metric: {Resource:r1",
   406  				": Top latency metric: {Resource:r2",
   407  				": Top latency metric: {Resource:r3",
   408  				": Top latency metric: {Resource:r4",
   409  				": Top latency metric: {Resource:r5",
   410  			},
   411  			unexpectedMessages: []string{
   412  				"Resource:r6",
   413  			},
   414  		},
   415  		{
   416  			name: "print_all_violations",
   417  			samples: []*sample{
   418  				{
   419  					resource: "r1",
   420  					verb:     "POST",
   421  					scope:    "resource",
   422  					latency:  1.2,
   423  				},
   424  				{
   425  					resource: "r2",
   426  					verb:     "POST",
   427  					scope:    "resource",
   428  					latency:  1.9,
   429  				},
   430  				{
   431  					resource: "r3",
   432  					verb:     "POST",
   433  					scope:    "resource",
   434  					latency:  1.8,
   435  				},
   436  				{
   437  					resource: "r4",
   438  					verb:     "POST",
   439  					scope:    "resource",
   440  					latency:  1.7,
   441  				},
   442  				{
   443  					resource: "r5",
   444  					verb:     "POST",
   445  					scope:    "resource",
   446  					latency:  1.6,
   447  				},
   448  				{
   449  					resource: "r6",
   450  					verb:     "POST",
   451  					scope:    "resource",
   452  					latency:  1.5,
   453  				},
   454  				{
   455  					resource: "r7",
   456  					verb:     "POST",
   457  					scope:    "resource",
   458  					latency:  .5,
   459  				},
   460  			},
   461  			expectedMessages: []string{
   462  				": WARNING Top latency metric: {Resource:r1",
   463  				": WARNING Top latency metric: {Resource:r2",
   464  				": WARNING Top latency metric: {Resource:r3",
   465  				": WARNING Top latency metric: {Resource:r4",
   466  				": WARNING Top latency metric: {Resource:r5",
   467  				": WARNING Top latency metric: {Resource:r6",
   468  			},
   469  			unexpectedMessages: []string{
   470  				"Resource:r7",
   471  			},
   472  		},
   473  	}
   474  
   475  	turnOffLoggingToStderrInKlog(t)
   476  
   477  	for _, tc := range cases {
   478  		t.Run(tc.name, func(t *testing.T) {
   479  			buf := bytes.NewBuffer(nil)
   480  			klog.SetOutput(buf)
   481  
   482  			executor := &fakeQueryExecutor{samples: tc.samples}
   483  			gatherer := &apiResponsivenessGatherer{}
   484  			config := &measurement.Config{}
   485  
   486  			_, err := gatherer.Gather(executor, time.Now(), time.Now(), config)
   487  			if err != nil && !errors.IsMetricViolationError(err) {
   488  				t.Errorf("error while gathering results: %v", err)
   489  			}
   490  			klog.Flush()
   491  
   492  			for _, msg := range tc.expectedMessages {
   493  				assert.Contains(t, buf.String(), msg)
   494  			}
   495  			for _, msg := range tc.unexpectedMessages {
   496  				assert.NotContains(t, buf.String(), msg)
   497  			}
   498  		})
   499  	}
   500  }
   501  
   502  func TestAPIResponsivenessCustomThresholds(t *testing.T) {
   503  	splitter := func(yamlLines []string) string {
   504  		return strings.Join(yamlLines, "\n")
   505  	}
   506  
   507  	cases := []struct {
   508  		name             string
   509  		config           *measurement.Config
   510  		samples          []*sample
   511  		hasError         bool
   512  		expectedMessages []string
   513  	}{
   514  		{
   515  			name: "simple_slo_threshold_override_success",
   516  			config: &measurement.Config{
   517  				Params: map[string]interface{}{
   518  					"customThresholds": splitter([]string{
   519  						"- verb: PUT",
   520  						"  resource: leases",
   521  						"  scope: namespace",
   522  						"  threshold: 600ms",
   523  					}),
   524  				},
   525  			},
   526  			samples: []*sample{
   527  				{
   528  					resource: "leases",
   529  					verb:     "PUT",
   530  					scope:    "namespace",
   531  					latency:  0.5,
   532  				},
   533  			},
   534  			hasError: false,
   535  		},
   536  		{
   537  			name: "simple_slo_threshold_override_failure",
   538  			config: &measurement.Config{
   539  				Params: map[string]interface{}{
   540  					"customThresholds": splitter([]string{
   541  						"- verb: PUT",
   542  						"  resource: leases",
   543  						"  scope: namespace",
   544  						"  threshold: 400ms",
   545  					}),
   546  				},
   547  			},
   548  			samples: []*sample{
   549  				{
   550  					resource: "leases",
   551  					verb:     "PUT",
   552  					scope:    "namespace",
   553  					latency:  0.5,
   554  				},
   555  			},
   556  			hasError: true,
   557  			expectedMessages: []string{
   558  				"WARNING Top latency metric",
   559  			},
   560  		},
   561  		{
   562  			name: "empty_custom_thresholds_field",
   563  			config: &measurement.Config{
   564  				Params: map[string]interface{}{
   565  					"customThresholds": "",
   566  				},
   567  			},
   568  			samples: []*sample{
   569  				{
   570  					resource: "leases",
   571  					verb:     "PUT",
   572  					scope:    "namespace",
   573  					latency:  0.5,
   574  				},
   575  			},
   576  			hasError: false,
   577  		},
   578  		{
   579  			name: "no_custom_thresholds_field",
   580  			config: &measurement.Config{
   581  				Params: map[string]interface{}{},
   582  			},
   583  			samples: []*sample{
   584  				{
   585  					resource: "leases",
   586  					verb:     "PUT",
   587  					scope:    "namespace",
   588  					latency:  0.5,
   589  				},
   590  			},
   591  			hasError: false,
   592  		},
   593  		{
   594  			name: "unrecognized_metric",
   595  			config: &measurement.Config{
   596  				Params: map[string]interface{}{
   597  					"customThresholds": splitter([]string{
   598  						"- verb: POST",
   599  						"  resource: pod",
   600  						"  scope: namespace",
   601  						"  threshold: 500ms",
   602  					}),
   603  				},
   604  			},
   605  			samples: []*sample{
   606  				{
   607  					resource: "leases",
   608  					verb:     "PUT",
   609  					scope:    "namespace",
   610  					latency:  0.2,
   611  				},
   612  			},
   613  			hasError: false,
   614  			expectedMessages: []string{
   615  				"unrecognized custom threshold API call key",
   616  			},
   617  		},
   618  		{
   619  			name: "non_unmarshallable_custom_thresholds",
   620  			config: &measurement.Config{
   621  				Params: map[string]interface{}{
   622  					"customThresholds": splitter([]string{
   623  						"im: not",
   624  						"a: good",
   625  						"yaml: array",
   626  					}),
   627  				},
   628  			},
   629  			samples: []*sample{
   630  				{
   631  					resource: "pod",
   632  					verb:     "POST",
   633  					scope:    "namespace",
   634  					latency:  0.2,
   635  				},
   636  			},
   637  			hasError: true,
   638  		},
   639  	}
   640  
   641  	turnOffLoggingToStderrInKlog(t)
   642  
   643  	for _, tc := range cases {
   644  		t.Run(tc.name, func(t *testing.T) {
   645  			buf := bytes.NewBuffer(nil)
   646  			klog.SetOutput(buf)
   647  
   648  			executor := &fakeQueryExecutor{samples: tc.samples}
   649  			gatherer := &apiResponsivenessGatherer{}
   650  
   651  			_, err := gatherer.Gather(executor, time.Now(), time.Now(), tc.config)
   652  			klog.Flush()
   653  			if tc.hasError {
   654  				assert.NotNil(t, err, "expected an error, but got none")
   655  			} else {
   656  				assert.Nil(t, err, "expected no error, but got %v", err)
   657  			}
   658  
   659  			for _, msg := range tc.expectedMessages {
   660  				assert.Contains(t, buf.String(), msg)
   661  			}
   662  		})
   663  	}
   664  }