/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package slos

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"strings"
	"testing"
	"time"

	"github.com/prometheus/common/model"
	"github.com/stretchr/testify/assert"
	"k8s.io/klog/v2"
	"k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/common/executors"
	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"

	_ "k8s.io/perf-tests/clusterloader2/pkg/flags" // init klog
)

var (
	// klogv1 allows users to turn logging to stderr on or off only
	// through a flag. This prevents us from controlling which test
	// functions have that mechanism turned off when we run the go test
	// command.
	// TODO(#1286): refactor api_responsiveness_prometheus.go to make
	// testing of logging easier and remove this hack in the end.
	klogLogToStderr = true
)

func turnOffLoggingToStderrInKlog(t *testing.T) {
	if klogLogToStderr {
		if err := flag.Set("logtostderr", "false"); err != nil {
			t.Errorf("Unable to set flag: %v", err)
			return
		}
		if err := flag.Set("v", "2"); err != nil {
			t.Errorf("Unable to set flag: %v", err)
			return
		}
		flag.Parse()
		klogLogToStderr = false
	}
}

// sample describes a single synthetic apiserver metric series served by the
// fake query executor below.
type sample struct {
	resource    string
	subresource string
	verb        string
	scope       string
	latency     float64
	count       int
	slowCount   int
}

// summaryEntry is the perf-data item expected to be derived from a sample.
type summaryEntry struct {
	resource    string
	subresource string
	verb        string
	scope       string
	p50         float64
	p90         float64
	p99         float64
	count       string
	slowCount   string
}

// fakeQueryExecutor serves canned samples, dispatching on the shape of the
// PromQL query it receives.
type fakeQueryExecutor struct {
	samples []*sample
}

func (ex *fakeQueryExecutor) Query(query string, queryTime time.Time) ([]*model.Sample, error) {
	samples := make([]*model.Sample, 0)
	for _, s := range ex.samples {
		sample := &model.Sample{
			Metric: model.Metric{
				"resource":    model.LabelValue(s.resource),
				"subresource": model.LabelValue(s.subresource),
				"verb":        model.LabelValue(s.verb),
				"scope":       model.LabelValue(s.scope),
			},
		}

		if strings.HasPrefix(query, "sum(increase") {
			if strings.Contains(query, "_count") {
				// countQuery
				sample.Value = model.SampleValue(s.count)
			} else {
				// countFastQuery
				// This query is called 3 times, but to avoid a complex
				// fake the same value is returned every time. The logic
				// handles duplicates well, so this shouldn't be an issue.
				sample.Value = model.SampleValue(s.count - s.slowCount)
			}
		} else if strings.HasPrefix(query, "histogram_quantile") {
			// simpleLatencyQuery
			sample.Value = model.SampleValue(s.latency)
		} else if strings.HasPrefix(query, "quantile_over_time") {
			// latencyQuery
			sample.Metric["quantile"] = ".99"
			sample.Value = model.SampleValue(s.latency)
		}
		samples = append(samples, sample)
	}
	return samples, nil
}
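
// TestFakeQueryExecutorDispatchSketch is an illustrative sketch, not part
// of the upstream test suite: it pins down how fakeQueryExecutor above
// dispatches on the query prefix. It uses only identifiers defined in this
// file; the query strings are arbitrary, since the fake matches on prefix
// and the "_count" substring rather than on real query text.
func TestFakeQueryExecutorDispatchSketch(t *testing.T) {
	ex := &fakeQueryExecutor{samples: []*sample{
		{resource: "pod", verb: "GET", scope: "resource", latency: 0.3, count: 10, slowCount: 2},
	}}

	// A "sum(increase(...))" query containing "_count" returns the raw call count.
	got, err := ex.Query("sum(increase(apiserver_request_duration_seconds_count[10m]))", time.Now())
	assert.Nil(t, err)
	assert.Equal(t, model.SampleValue(10), got[0].Value)

	// The fast-call variant (no "_count" in the query) returns count - slowCount.
	got, err = ex.Query("sum(increase(apiserver_request_slo_duration_seconds_bucket[10m]))", time.Now())
	assert.Nil(t, err)
	assert.Equal(t, model.SampleValue(8), got[0].Value)

	// quantile_over_time queries report the latency and attach a ".99" quantile label.
	got, err = ex.Query("quantile_over_time(0.99, some_recording_rule[10m])", time.Now())
	assert.Nil(t, err)
	assert.Equal(t, model.SampleValue(0.3), got[0].Value)
	assert.Equal(t, model.LabelValue(".99"), got[0].Metric["quantile"])
}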

func TestAPIResponsivenessSLOFailures(t *testing.T) {
	cases := []struct {
		name               string
		useSimple          bool
		allowedSlow        int
		hasError           bool
		testSeriesFile     string
		testSeriesDuration time.Duration
	}{
		{
			name:               "slo_pass",
			hasError:           false,
			testSeriesFile:     "slo_pass.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "below_slow_count_pass",
			hasError:           false,
			allowedSlow:        1,
			testSeriesFile:     "below_slow_count_pass.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "above_slow_count_failure",
			hasError:           true,
			allowedSlow:        1,
			testSeriesFile:     "above_slow_count_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "mutating_slo_failure",
			hasError:           true,
			testSeriesFile:     "mutating_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "get_slo_failure",
			hasError:           true,
			testSeriesFile:     "get_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "namespace_list_slo_failure",
			hasError:           true,
			testSeriesFile:     "namespace_list_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "cluster_list_slo_failure",
			hasError:           true,
			testSeriesFile:     "cluster_list_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "slo_pass_simple",
			useSimple:          true,
			hasError:           false,
			testSeriesFile:     "slo_pass.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "mutating_slo_failure_simple",
			useSimple:          true,
			hasError:           true,
			testSeriesFile:     "mutating_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "get_slo_failure_simple",
			useSimple:          true,
			hasError:           true,
			testSeriesFile:     "get_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "namespace_list_slo_failure_simple",
			useSimple:          true,
			hasError:           true,
			testSeriesFile:     "namespace_list_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
		{
			name:               "cluster_list_slo_failure_simple",
			useSimple:          true,
			hasError:           true,
			testSeriesFile:     "cluster_list_slo_failure.yaml",
			testSeriesDuration: 10 * time.Minute,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			executor, err := executors.NewPromqlExecutor(fmt.Sprintf("../testdata/api_responsiveness_prometheus/%s", tc.testSeriesFile))
			if err != nil {
				t.Fatalf("failed to create PromQL executor: %v", err)
			}
			defer executor.Close()
			gatherer := &apiResponsivenessGatherer{}
			config := &measurement.Config{
				Params: map[string]interface{}{
					"useSimpleLatencyQuery": tc.useSimple,
					"allowedSlowCalls":      tc.allowedSlow,
				},
			}
			start := time.Unix(0, 0).UTC()
			end := start.Add(tc.testSeriesDuration)
			_, err = gatherer.Gather(executor, start, end, config)
			if tc.hasError {
				assert.NotNil(t, err, "wanted error, but got none")
			} else {
				assert.Nil(t, err, "wanted no error, but got %v", err)
			}
		})
	}
}
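
// Note on TestAPIResponsivenessSLOFailures above: unlike the remaining
// tests in this file, which stub Prometheus with fakeQueryExecutor, it
// evaluates the real latency queries against the series fixtures under
// ../testdata/api_responsiveness_prometheus/ loaded through
// executors.NewPromqlExecutor.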
%v", err) 238 } 239 }) 240 } 241 } 242 243 func TestAPIResponsivenessSummary(t *testing.T) { 244 cases := []struct { 245 name string 246 samples []*sample 247 summary []*summaryEntry 248 allowedSlow int 249 }{ 250 { 251 name: "single_entry", 252 allowedSlow: 0, 253 samples: []*sample{ 254 { 255 resource: "pod", 256 verb: "POST", 257 scope: "resource", 258 latency: 1.2, 259 count: 123, 260 slowCount: 5, 261 }, 262 }, 263 summary: []*summaryEntry{ 264 { 265 resource: "pod", 266 verb: "POST", 267 scope: "resource", 268 p99: 1200., 269 count: "123", 270 slowCount: "5", 271 }, 272 }, 273 }, 274 { 275 name: "single_entry_with_slow_calls_enabled", 276 allowedSlow: 1, 277 samples: []*sample{ 278 { 279 resource: "pod", 280 verb: "POST", 281 scope: "resource", 282 latency: 1.2, 283 count: 123, 284 slowCount: 5, 285 }, 286 }, 287 summary: []*summaryEntry{ 288 { 289 resource: "pod", 290 verb: "POST", 291 scope: "resource", 292 p99: 1200., 293 count: "123", 294 slowCount: "5", 295 }, 296 }, 297 }, 298 } 299 300 for _, tc := range cases { 301 t.Run(tc.name, func(t *testing.T) { 302 executor := &fakeQueryExecutor{samples: tc.samples} 303 gatherer := &apiResponsivenessGatherer{} 304 config := &measurement.Config{ 305 Params: map[string]interface{}{ 306 "allowedSlowCalls": tc.allowedSlow, 307 }, 308 } 309 310 summaries, err := gatherer.Gather(executor, time.Now(), time.Now(), config) 311 if !errors.IsMetricViolationError(err) { 312 t.Fatal("unexpected error: ", err) 313 } 314 checkSummary(t, summaries, tc.summary) 315 }) 316 } 317 } 318 319 func checkSummary(t *testing.T, got []measurement.Summary, wanted []*summaryEntry) { 320 assert.Lenf(t, got, 1, "wanted single summary, got %d", len(got)) 321 var perfData measurementutil.PerfData 322 if err := json.Unmarshal([]byte(got[0].SummaryContent()), &perfData); err != nil { 323 t.Errorf("unable to unmarshal summary: %v", err) 324 return 325 } 326 assert.Equal(t, currentAPICallMetricsVersion, perfData.Version) 327 assert.Len(t, perfData.DataItems, len(wanted)) 328 329 toKey := func(resource, subresource, verb, scope string) string { 330 return fmt.Sprintf("%s-%s-%s-%s", resource, subresource, verb, scope) 331 } 332 333 items := make(map[string]*measurementutil.DataItem) 334 for _, item := range perfData.DataItems { 335 items[toKey( 336 item.Labels["Resource"], 337 item.Labels["Subresource"], 338 item.Labels["Verb"], 339 item.Labels["Scope"])] = &item 340 } 341 342 for _, entry := range wanted { 343 item, ok := items[toKey(entry.resource, entry.subresource, entry.verb, entry.scope)] 344 if !ok { 345 t.Errorf("%s in %s: %s %s wanted, but not found", entry.verb, entry.scope, entry.resource, entry.subresource) 346 continue 347 } 348 assert.Equal(t, "ms", item.Unit) 349 assert.Equal(t, entry.p50, item.Data["Perc50"]) 350 assert.Equal(t, entry.p90, item.Data["Perc90"]) 351 assert.Equal(t, entry.p99, item.Data["Perc99"]) 352 assert.Equal(t, entry.count, item.Labels["Count"]) 353 assert.Equal(t, entry.slowCount, item.Labels["SlowCount"]) 354 } 355 } 356 357 func TestLogging(t *testing.T) { 358 cases := []struct { 359 name string 360 samples []*sample 361 expectedMessages []string 362 unexpectedMessages []string 363 }{ 364 { 365 name: "print_5_warnings", 366 samples: []*sample{ 367 { 368 resource: "r1", 369 verb: "POST", 370 scope: "resource", 371 latency: 1.2, 372 }, 373 { 374 resource: "r2", 375 verb: "POST", 376 scope: "resource", 377 latency: .9, 378 }, 379 { 380 resource: "r3", 381 verb: "POST", 382 scope: "resource", 383 latency: .8, 384 }, 385 { 386 resource: 
"r4", 387 verb: "POST", 388 scope: "resource", 389 latency: .7, 390 }, 391 { 392 resource: "r5", 393 verb: "POST", 394 scope: "resource", 395 latency: .6, 396 }, 397 { 398 resource: "r6", 399 verb: "POST", 400 scope: "resource", 401 latency: .5, 402 }, 403 }, 404 expectedMessages: []string{ 405 ": WARNING Top latency metric: {Resource:r1", 406 ": Top latency metric: {Resource:r2", 407 ": Top latency metric: {Resource:r3", 408 ": Top latency metric: {Resource:r4", 409 ": Top latency metric: {Resource:r5", 410 }, 411 unexpectedMessages: []string{ 412 "Resource:r6", 413 }, 414 }, 415 { 416 name: "print_all_violations", 417 samples: []*sample{ 418 { 419 resource: "r1", 420 verb: "POST", 421 scope: "resource", 422 latency: 1.2, 423 }, 424 { 425 resource: "r2", 426 verb: "POST", 427 scope: "resource", 428 latency: 1.9, 429 }, 430 { 431 resource: "r3", 432 verb: "POST", 433 scope: "resource", 434 latency: 1.8, 435 }, 436 { 437 resource: "r4", 438 verb: "POST", 439 scope: "resource", 440 latency: 1.7, 441 }, 442 { 443 resource: "r5", 444 verb: "POST", 445 scope: "resource", 446 latency: 1.6, 447 }, 448 { 449 resource: "r6", 450 verb: "POST", 451 scope: "resource", 452 latency: 1.5, 453 }, 454 { 455 resource: "r7", 456 verb: "POST", 457 scope: "resource", 458 latency: .5, 459 }, 460 }, 461 expectedMessages: []string{ 462 ": WARNING Top latency metric: {Resource:r1", 463 ": WARNING Top latency metric: {Resource:r2", 464 ": WARNING Top latency metric: {Resource:r3", 465 ": WARNING Top latency metric: {Resource:r4", 466 ": WARNING Top latency metric: {Resource:r5", 467 ": WARNING Top latency metric: {Resource:r6", 468 }, 469 unexpectedMessages: []string{ 470 "Resource:r7", 471 }, 472 }, 473 } 474 475 turnOffLoggingToStderrInKlog(t) 476 477 for _, tc := range cases { 478 t.Run(tc.name, func(t *testing.T) { 479 buf := bytes.NewBuffer(nil) 480 klog.SetOutput(buf) 481 482 executor := &fakeQueryExecutor{samples: tc.samples} 483 gatherer := &apiResponsivenessGatherer{} 484 config := &measurement.Config{} 485 486 _, err := gatherer.Gather(executor, time.Now(), time.Now(), config) 487 if err != nil && !errors.IsMetricViolationError(err) { 488 t.Errorf("error while gathering results: %v", err) 489 } 490 klog.Flush() 491 492 for _, msg := range tc.expectedMessages { 493 assert.Contains(t, buf.String(), msg) 494 } 495 for _, msg := range tc.unexpectedMessages { 496 assert.NotContains(t, buf.String(), msg) 497 } 498 }) 499 } 500 } 501 502 func TestAPIResponsivenessCustomThresholds(t *testing.T) { 503 splitter := func(yamlLines []string) string { 504 return strings.Join(yamlLines, "\n") 505 } 506 507 cases := []struct { 508 name string 509 config *measurement.Config 510 samples []*sample 511 hasError bool 512 expectedMessages []string 513 }{ 514 { 515 name: "simple_slo_threshold_override_success", 516 config: &measurement.Config{ 517 Params: map[string]interface{}{ 518 "customThresholds": splitter([]string{ 519 "- verb: PUT", 520 " resource: leases", 521 " scope: namespace", 522 " threshold: 600ms", 523 }), 524 }, 525 }, 526 samples: []*sample{ 527 { 528 resource: "leases", 529 verb: "PUT", 530 scope: "namespace", 531 latency: 0.5, 532 }, 533 }, 534 hasError: false, 535 }, 536 { 537 name: "simple_slo_threshold_override_failure", 538 config: &measurement.Config{ 539 Params: map[string]interface{}{ 540 "customThresholds": splitter([]string{ 541 "- verb: PUT", 542 " resource: leases", 543 " scope: namespace", 544 " threshold: 400ms", 545 }), 546 }, 547 }, 548 samples: []*sample{ 549 { 550 resource: "leases", 551 
verb: "PUT", 552 scope: "namespace", 553 latency: 0.5, 554 }, 555 }, 556 hasError: true, 557 expectedMessages: []string{ 558 "WARNING Top latency metric", 559 }, 560 }, 561 { 562 name: "empty_custom_thresholds_field", 563 config: &measurement.Config{ 564 Params: map[string]interface{}{ 565 "customThresholds": "", 566 }, 567 }, 568 samples: []*sample{ 569 { 570 resource: "leases", 571 verb: "PUT", 572 scope: "namespace", 573 latency: 0.5, 574 }, 575 }, 576 hasError: false, 577 }, 578 { 579 name: "no_custom_thresholds_field", 580 config: &measurement.Config{ 581 Params: map[string]interface{}{}, 582 }, 583 samples: []*sample{ 584 { 585 resource: "leases", 586 verb: "PUT", 587 scope: "namespace", 588 latency: 0.5, 589 }, 590 }, 591 hasError: false, 592 }, 593 { 594 name: "unrecognized_metric", 595 config: &measurement.Config{ 596 Params: map[string]interface{}{ 597 "customThresholds": splitter([]string{ 598 "- verb: POST", 599 " resource: pod", 600 " scope: namespace", 601 " threshold: 500ms", 602 }), 603 }, 604 }, 605 samples: []*sample{ 606 { 607 resource: "leases", 608 verb: "PUT", 609 scope: "namespace", 610 latency: 0.2, 611 }, 612 }, 613 hasError: false, 614 expectedMessages: []string{ 615 "unrecognized custom threshold API call key", 616 }, 617 }, 618 { 619 name: "non_unmarshallable_custom_thresholds", 620 config: &measurement.Config{ 621 Params: map[string]interface{}{ 622 "customThresholds": splitter([]string{ 623 "im: not", 624 "a: good", 625 "yaml: array", 626 }), 627 }, 628 }, 629 samples: []*sample{ 630 { 631 resource: "pod", 632 verb: "POST", 633 scope: "namespace", 634 latency: 0.2, 635 }, 636 }, 637 hasError: true, 638 }, 639 } 640 641 turnOffLoggingToStderrInKlog(t) 642 643 for _, tc := range cases { 644 t.Run(tc.name, func(t *testing.T) { 645 buf := bytes.NewBuffer(nil) 646 klog.SetOutput(buf) 647 648 executor := &fakeQueryExecutor{samples: tc.samples} 649 gatherer := &apiResponsivenessGatherer{} 650 651 _, err := gatherer.Gather(executor, time.Now(), time.Now(), tc.config) 652 klog.Flush() 653 if tc.hasError { 654 assert.NotNil(t, err, "expected an error, but got none") 655 } else { 656 assert.Nil(t, err, "expected no error, but got %v", err) 657 } 658 659 for _, msg := range tc.expectedMessages { 660 assert.Contains(t, buf.String(), msg) 661 } 662 }) 663 } 664 }