k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/pod_command.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "strings" 24 "sync" 25 "time" 26 27 v1 "k8s.io/api/core/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/labels" 30 "k8s.io/apimachinery/pkg/runtime" 31 "k8s.io/apimachinery/pkg/watch" 32 "k8s.io/client-go/kubernetes" 33 "k8s.io/client-go/kubernetes/scheme" 34 "k8s.io/client-go/rest" 35 "k8s.io/client-go/tools/cache" 36 "k8s.io/client-go/tools/remotecommand" 37 "k8s.io/client-go/util/exec" 38 "k8s.io/klog/v2" 39 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 40 "k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer" 41 "k8s.io/perf-tests/clusterloader2/pkg/util" 42 ) 43 44 const ( 45 podPeriodicCommandMeasurementName = "PodPeriodicCommand" 46 ) 47 48 type podPeriodicCommandMeasurementCommandParams struct { 49 // Name is an identifier for the command. 50 Name string 51 // Command is the actual Command to execute in a pod. 52 Command []string 53 // Timeout is the maximum amount of time the command will have to finish. 54 Timeout time.Duration 55 } 56 57 type podPeriodicCommandMeasurementParams struct { 58 // LabelSelector is used to select applicable pods to run commands on. 59 LabelSelector *labels.Selector 60 // Interval is the time between sequential command executions. 61 Interval time.Duration 62 // Container is the name of the Container to run the command in. 63 Container string 64 // Limit is the maximum number of pods that will have the commands executed in on every interval. 65 Limit int 66 // FailOnCommandError controls if the measurement will fail if a command has a non-zero RC during the life of the measurement. 67 FailOnCommandError bool 68 // FailOnExecError controls if the measurement will fail if an error occurs while trying to execute a command. 69 // For example, this would include any error returned from the k8s client-go library. 70 FailOnExecError bool 71 // FailOnTimeout controls if the measurement will fail if a command times out. 72 FailOnTimeout bool 73 // Commands is the list of Commands that will be executed in each pod on each interval. 74 Commands []*podPeriodicCommandMeasurementCommandParams 75 } 76 77 func newPodPeriodCommandMeasurementParams( 78 params map[string]interface{}, 79 ) (p *podPeriodicCommandMeasurementParams, err error) { 80 p = &podPeriodicCommandMeasurementParams{} 81 82 p.LabelSelector, err = util.GetLabelSelector(params, "labelSelector") 83 if err != nil { 84 return 85 } 86 p.Interval, err = util.GetDuration(params, "interval") 87 if err != nil { 88 return 89 } 90 91 p.Container, err = util.GetString(params, "container") 92 if err != nil { 93 return 94 } 95 96 p.Limit, err = util.GetInt(params, "limit") 97 if err != nil { 98 return 99 } 100 101 p.FailOnCommandError, err = util.GetBool(params, "failOnCommandError") 102 if err != nil { 103 return 104 } 105 106 p.FailOnExecError, err = util.GetBool(params, "failOnExecError") 107 if err != nil { 108 return 109 } 110 111 p.FailOnTimeout, err = util.GetBool(params, "failOnTimeout") 112 if err != nil { 113 return 114 } 115 116 var commandMaps []map[string]interface{} 117 commandMaps, err = util.GetMapArray(params, "commands") 118 if err != nil { 119 return 120 } 121 122 p.Commands = []*podPeriodicCommandMeasurementCommandParams{} 123 for _, commandMap := range commandMaps { 124 c := &podPeriodicCommandMeasurementCommandParams{} 125 126 c.Name, err = util.GetString(commandMap, "name") 127 if err != nil { 128 return 129 } 130 131 c.Command, err = util.GetStringArray(commandMap, "command") 132 if err != nil { 133 return 134 } 135 136 c.Timeout, err = util.GetDuration(commandMap, "timeout") 137 if err != nil { 138 return 139 } 140 141 p.Commands = append(p.Commands, c) 142 } 143 144 return p, nil 145 } 146 147 type runCommandResult struct { 148 // stdout is the saved stdout from the command. Will be stored as its own measurement summary. 149 stdout string 150 // stderr is the saved stderr from the command. Will be stored as its own measurement summary. 151 stderr string 152 // ExitCode is the RC from the command. Defaults to zero and will not be set if the command times 153 // out or fails to run. 154 ExitCode int `json:"exitCode"` 155 // Name is the name of the command that was run, set in the config. 156 Name string `json:"name"` 157 // Command is the actual command which was executed. 158 Command []string `json:"command"` 159 // Timeout is the configured timeout duration. 160 Timeout string `json:"timeout"` 161 // HitTimeout is set to true if the command did not finish before the timeout. 162 HitTimeout bool `json:"hitTimeout"` 163 // StartTime is the time the command began executing. Isn't super precise. 164 StartTime time.Time `json:"startTime"` 165 // EndTime is the time the command finished executing. Isn't super precise. 166 EndTime time.Time `json:"endTime"` 167 // ExecError is set to any go error raised while executing the command. 168 ExecError string `json:"execError"` 169 } 170 171 type runAllCommandsResult struct { 172 Pod string `json:"pod"` 173 Namespace string `json:"namespace"` 174 Container string `json:"container"` 175 Commands []*runCommandResult `json:"commands"` 176 } 177 178 type stats struct { 179 // Execs is the total number of times a command was executed in a pod. 180 Execs int `json:"execs"` 181 // ExecErrors is the total number of errors that were observed, not including errors from the executed commands. 182 // For example, this includes any errors that are returned by the k8s client-go library. 183 ExecErrors int `json:"execErrors"` 184 // Timeouts is the number of commands which hit a timeout. 185 Timeouts int `json:"timeouts"` 186 // NonZeroRCs is the total number of non-zero return codes that were collected from the commands executed. 187 NonZeroRCs int `json:"nonZeroRCs"` 188 // Measurements is the total number of measurements gathered. 189 Measurements int `json:"measurements"` 190 // Ticks is the total number of intervals that were executed. 191 Ticks int `json:"ticks"` 192 // TicksNoPods is the total number of intervals that were skipped because no applicable pods could be found. 193 TicksNoPods int `json:"ticksNoPods"` 194 } 195 196 // podPeriodicCommandMeasurement can be used to continually run commands within pods at an interval. 197 // 198 // It works by performing the following on each tick: 199 // 200 // 1. Creating a list of pods, with maximum size `params.Limit`, which will execute the configured commands. 201 // Pods are selected using `params.LabelSelector`, must contain `params.Container`, and must be in a running 202 // state. If no applicable pods are available, then no step is performed for the tick. 203 // 2. For each pod, spin a goroutine which will run all configured commands in the pod. 204 // 3. For each command, spin a goroutine to handle running the command. 205 // 4. If the command returns non-zero, this will be reflected in the associated measurement. 206 // 5. If a go error occurred while trying to execute the command, this will be reflected in the associated measurement. 207 // 208 // The following measurements are produced during the gather step: 209 // 210 // 1. One summary measurement, which includes information on all executed commands, such as if the command 211 // took longer than `params.Timeout`, the command's RC, and the pod the command was executed on. 212 // 2. One measurement for each command's non-empty stdout and stderr. 213 // 3. One measurement containing statistics, such as the number of commands executed, the number of errors observed, 214 // and the number of non-zero RCs. 215 // 216 // The measurement fails in the following scenarios: 217 // 218 // 1. `params.FailOnCommandError` is set to true and a command has a non-zero RC. 219 // 2. `params.FailOnExecError` is set to true and an error occurs while trying to execute a command. 220 // 3. `params.FailOnTimeout` is set to true and a command takes longer than its configured timeout to execute. 221 type podPeriodicCommandMeasurement struct { 222 clientset kubernetes.Interface 223 restConfig *rest.Config 224 params *podPeriodicCommandMeasurementParams 225 isRunning bool 226 // skipGather signals if the gather step should be skipped, mainly used to bail if param parsing failed. 227 skipGather bool 228 // stopCh is closed when stop() is called. 229 stopCh chan struct{} 230 // doneCh is closed after stopCh is closed and all in progress commands have finished. 231 doneCh chan struct{} 232 results []*runAllCommandsResult 233 informer cache.SharedInformer 234 stats *stats 235 // statsLock needs to be held to modify and read the stats field. 236 statsLock *sync.Mutex 237 } 238 239 // isApplicablePod checks if a pod is a viable candidate for running a command on. 240 func (p *podPeriodicCommandMeasurement) isApplicablePod(pod *v1.Pod) bool { 241 if pod.Status.Phase != v1.PodRunning { 242 return false 243 } 244 245 hasContainer := false 246 for _, c := range pod.Spec.Containers { 247 if c.Name == p.params.Container { 248 hasContainer = true 249 250 break 251 } 252 } 253 254 if !hasContainer { 255 return false 256 } 257 258 for _, c := range pod.Status.Conditions { 259 if c.Type == v1.PodReady && c.Status == v1.ConditionTrue { 260 return true 261 } 262 } 263 264 return false 265 } 266 267 // getMaxNPods gets at most N pods from the internal informer's store. 268 // The informer uses a ThreadSafeStore, which stores objects in a map. When List is called, the map is 269 // iterated over using range, which ensures a random order. 270 func (p *podPeriodicCommandMeasurement) getMaxNPods(n int) []*v1.Pod { 271 store := p.informer.GetStore() 272 pods := []*v1.Pod{} 273 274 podList := store.List() 275 if len(podList) == 0 { 276 return pods 277 } 278 279 for _, podInterface := range podList { 280 pod := podInterface.(*v1.Pod) 281 if !p.isApplicablePod(pod) { 282 continue 283 } 284 285 pods = append(pods, pod) 286 287 if len(pods) >= n { 288 return pods 289 } 290 } 291 292 return pods 293 } 294 295 // runCommandInPod runs a specific given command in the specific given pod. 296 func (p *podPeriodicCommandMeasurement) runCommandInPod( 297 pod *v1.Pod, params *podPeriodicCommandMeasurementCommandParams, 298 ) *runCommandResult { 299 klog.V(4).Infof( 300 "%s: running named command %s in pod %s/%s", 301 podPeriodicCommandMeasurementName, params.Name, pod.Namespace, pod.Name, 302 ) 303 304 p.statsLock.Lock() 305 p.stats.Execs++ 306 p.statsLock.Unlock() 307 308 result := &runCommandResult{ 309 Name: params.Name, 310 Command: params.Command, 311 Timeout: params.Timeout.String(), 312 ExitCode: 0, 313 HitTimeout: false, 314 } 315 316 req := p.clientset.CoreV1().RESTClient(). 317 Post(). 318 Namespace(pod.Namespace). 319 Resource("pods"). 320 Name(pod.Name). 321 SubResource("exec"). 322 VersionedParams(&v1.PodExecOptions{ 323 Container: p.params.Container, 324 Command: params.Command, 325 Stdin: false, 326 Stdout: true, 327 Stderr: true, 328 TTY: false, 329 }, scheme.ParameterCodec) 330 331 executor, err := remotecommand.NewSPDYExecutor(p.restConfig, "POST", req.URL()) 332 if err != nil { 333 result.ExecError = err.Error() 334 335 return result 336 } 337 338 stdoutBuf := &bytes.Buffer{} 339 stderrBuf := &bytes.Buffer{} 340 // Holds error returned from executor.Stream. 341 execErrChan := make(chan error, 1) 342 343 // The logic used here to start the executor and the timeout timer isn't super precise, but 344 // it is good enough for this use case. It is ok that the timeout timer is started after the 345 // executor, since we still guarantee that the timeout is at least the configured value. 346 result.StartTime = time.Now() 347 348 go func() { 349 err := executor.Stream(remotecommand.StreamOptions{ 350 Stdout: stdoutBuf, 351 Stderr: stderrBuf, 352 }) 353 execErrChan <- err 354 }() 355 356 // Two different cases: (1) if the command returns before the timeout, and (2) if the timeout 357 // triggers before the command is done. 358 // The value result.EndTime is set in both cases. 359 // If the timeout triggers, then the command isn't actually cancelled. This logic isn't available until 360 // client-go version 0.26 (see Executor.StreamWithContext). 361 select { 362 case err = <-execErrChan: 363 result.EndTime = time.Now() 364 365 if err == nil { 366 break 367 } 368 369 switch e := err.(type) { 370 case exec.CodeExitError: 371 result.ExitCode = e.ExitStatus() 372 373 p.statsLock.Lock() 374 p.stats.NonZeroRCs++ 375 p.statsLock.Unlock() 376 377 klog.V(2).Infof( 378 "%s: warning: non-zero exit code %d for named command %s in pod %s/%s", 379 podPeriodicCommandMeasurementName, result.ExitCode, params.Name, pod.Namespace, pod.Name, 380 ) 381 default: 382 result.ExecError = err.Error() 383 return result 384 } 385 case <-time.After(params.Timeout): 386 result.EndTime = time.Now() 387 result.HitTimeout = true 388 389 p.statsLock.Lock() 390 p.stats.Timeouts++ 391 p.statsLock.Unlock() 392 393 klog.V(2).Infof( 394 "%s: warning: hit timeout of %s for named command %s in pod %s/%s", 395 podPeriodicCommandMeasurementName, params.Timeout.String(), params.Name, pod.Namespace, pod.Name, 396 ) 397 } 398 399 klog.V(4).Infof( 400 "%s: finished running named command %s in pod %s/%s", 401 podPeriodicCommandMeasurementName, params.Name, pod.Namespace, pod.Name, 402 ) 403 404 result.stdout = stdoutBuf.String() 405 result.stderr = stderrBuf.String() 406 407 return result 408 } 409 410 // runAllCommandsInPod runs all of the configured commands in the given specific pod. 411 func (p *podPeriodicCommandMeasurement) runAllCommandsInPod(pod *v1.Pod) *runAllCommandsResult { 412 wg := &sync.WaitGroup{} 413 commandResultCh := make(chan *runCommandResult, len(p.params.Commands)) 414 415 getRunCommandFunc := func(c *podPeriodicCommandMeasurementCommandParams) func() { 416 return func() { 417 defer wg.Done() 418 419 if c := p.runCommandInPod(pod, c); c != nil { 420 if c.ExecError != "" { 421 p.statsLock.Lock() 422 p.stats.ExecErrors++ 423 p.statsLock.Unlock() 424 425 klog.V(2).Infof( 426 "%s: error while running named command %s on pod %s/%s: %v", 427 podPeriodicCommandMeasurementName, c.Name, pod.Namespace, pod.Name, c.ExecError, 428 ) 429 } 430 431 commandResultCh <- c 432 } 433 } 434 } 435 436 klog.V(4).Infof( 437 "%s: running commands on pod %s/%s", podPeriodicCommandMeasurementName, pod.Namespace, pod.Name, 438 ) 439 440 for _, command := range p.params.Commands { 441 wg.Add(1) 442 443 go getRunCommandFunc(command)() 444 } 445 446 wg.Wait() 447 close(commandResultCh) 448 449 klog.V(4).Infof( 450 "%s: finished running commands on pod %s/%s", podPeriodicCommandMeasurementName, pod.Namespace, pod.Name, 451 ) 452 453 results := &runAllCommandsResult{ 454 Pod: pod.Name, 455 Namespace: pod.Namespace, 456 Container: p.params.Container, 457 Commands: []*runCommandResult{}, 458 } 459 460 for c := range commandResultCh { 461 results.Commands = append(results.Commands, c) 462 } 463 464 klog.V(8).Infof("%s: %#v", podPeriodicCommandMeasurementName, results) 465 466 return results 467 } 468 469 // commandWorker runs the configured commands in applicable pods on the configured interval. 470 func (p *podPeriodicCommandMeasurement) commandWorker() { 471 ticker := time.NewTicker(p.params.Interval) 472 defer func() { 473 ticker.Stop() 474 // Close doneCh to signal the worker has exited. 475 close(p.doneCh) 476 }() 477 478 doTick := func() { 479 p.statsLock.Lock() 480 p.stats.Ticks++ 481 p.statsLock.Unlock() 482 483 targetPods := p.getMaxNPods(p.params.Limit) 484 if len(targetPods) == 0 { 485 klog.V(2).Infof("%s: warning: no pods available to run commands on", podPeriodicCommandMeasurementName) 486 487 p.statsLock.Lock() 488 p.stats.TicksNoPods++ 489 p.statsLock.Unlock() 490 491 return 492 } 493 494 wg := &sync.WaitGroup{} 495 resultsChan := make(chan *runAllCommandsResult, len(targetPods)) 496 497 for _, pod := range targetPods { 498 wg.Add(1) 499 go func(targetPod *v1.Pod) { 500 defer wg.Done() 501 resultsChan <- p.runAllCommandsInPod(targetPod) 502 }(pod) 503 } 504 505 wg.Wait() 506 close(resultsChan) 507 508 for r := range resultsChan { 509 p.results = append(p.results, r) 510 } 511 } 512 513 // Do an initial tick 514 doTick() 515 for { 516 select { 517 case <-p.stopCh: 518 return 519 case <-ticker.C: 520 doTick() 521 } 522 } 523 } 524 525 func (p *podPeriodicCommandMeasurement) start( 526 clientset kubernetes.Interface, restConfig *rest.Config, params *podPeriodicCommandMeasurementParams, 527 ) error { 528 if p.isRunning { 529 return fmt.Errorf("%s: measurement already running", podPeriodicCommandMeasurementName) 530 } 531 532 klog.V(2).Infof("%s: starting pod periodic command measurement...", podPeriodicCommandMeasurementName) 533 534 p.clientset = clientset 535 p.restConfig = restConfig 536 p.params = params 537 p.isRunning = true 538 p.skipGather = false 539 p.stopCh = make(chan struct{}) 540 p.doneCh = make(chan struct{}) 541 p.results = []*runAllCommandsResult{} 542 p.stats = &stats{} 543 p.statsLock = &sync.Mutex{} 544 545 labelSelectorString := (*params.LabelSelector).String() 546 p.informer = informer.NewInformer( 547 &cache.ListWatch{ 548 ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { 549 options.LabelSelector = labelSelectorString 550 return clientset.CoreV1().Pods("").List(context.TODO(), options) 551 }, 552 WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { 553 options.LabelSelector = labelSelectorString 554 return clientset.CoreV1().Pods("").Watch(context.TODO(), options) 555 }, 556 }, 557 // Use the informer's internal cache to handle listing pods, no need to handle events. 558 func(_, _ interface{}) {}, 559 ) 560 561 if err := informer.StartAndSync(p.informer, p.stopCh, informerSyncTimeout); err != nil { 562 return err 563 } 564 565 go p.commandWorker() 566 567 return nil 568 } 569 570 func (p *podPeriodicCommandMeasurement) stop() { 571 if p.isRunning { 572 p.isRunning = false 573 close(p.stopCh) 574 // Wait for the commandWorker function to stop. 575 <-p.doneCh 576 } 577 } 578 579 func (p *podPeriodicCommandMeasurement) gather() ([]measurement.Summary, error) { 580 p.stop() 581 582 klog.V(2).Infof("%s: gathered %d command results", podPeriodicCommandMeasurementName, len(p.results)) 583 584 // Create summary for all results. 585 content, err := util.PrettyPrintJSON(p.results) 586 if err != nil { 587 // Ignore p.params.FailOnError here, since this is fatal. 588 return nil, fmt.Errorf("unable to convert results to JSON: %w", err) 589 } 590 591 measurements := []measurement.Summary{ 592 measurement.CreateSummary(podPeriodicCommandMeasurementName, "json", content), 593 } 594 595 // Hold error to be returned to signal that the measurement failed, or nil. 596 // Should only be non-nil if one of the FailOnXYZ params is set. 597 var resultErr error 598 599 // Create individual results for stdout and stderr. 600 // Saving these as a value in a json document can lead to weird issues in reading the data 601 // properly, especially if the data is binary, such as for profiling results. 602 // Additionally, check for any errors or timeouts that may have occurred. 603 for _, r := range p.results { 604 getSummaryName := func(c *runCommandResult, suffix string) string { 605 return strings.Join( 606 []string{ 607 podPeriodicCommandMeasurementName, c.StartTime.Format(time.RFC3339), r.Namespace, r.Pod, c.Name, suffix, 608 }, "-", 609 ) 610 } 611 612 for _, c := range r.Commands { 613 if c.stdout != "" { 614 measurements = append(measurements, measurement.CreateSummary(getSummaryName(c, "stdout"), "txt", c.stdout)) 615 } 616 if c.stderr != "" { 617 measurements = append(measurements, measurement.CreateSummary(getSummaryName(c, "stderr"), "txt", c.stderr)) 618 } 619 620 // If the result error has already been set, we don't need to set it again. 621 if resultErr != nil { 622 continue 623 } 624 625 if p.params.FailOnCommandError && c.ExitCode != 0 { 626 resultErr = fmt.Errorf( 627 "unexpected non-zero RC while executing command %s in pod %s/%s: got RC %d", 628 c.Name, r.Namespace, r.Pod, c.ExitCode, 629 ) 630 continue 631 } 632 633 if p.params.FailOnExecError && c.ExecError != "" { 634 resultErr = fmt.Errorf( 635 "unexpected error while executing command %s in pod %s/%s: %s", c.Name, r.Namespace, r.Pod, c.ExecError, 636 ) 637 continue 638 } 639 640 if p.params.FailOnTimeout && c.HitTimeout { 641 resultErr = fmt.Errorf( 642 "hit timeout of %s while executing command %s in pod %s/%s", 643 c.Timeout, c.Name, r.Namespace, r.Pod, 644 ) 645 } 646 } 647 } 648 649 // Create summary for stats. 650 p.stats.Measurements = len(measurements) + 1 // Adding another measurement for the stats. 651 content, err = util.PrettyPrintJSON(p.stats) 652 if err != nil { 653 // Ignore p.params.FailOnError here, since this is fatal. 654 return nil, fmt.Errorf("unable to convert stats to JSON: %w", err) 655 } 656 657 measurements = append( 658 measurements, 659 measurement.CreateSummary( 660 strings.Join([]string{podPeriodicCommandMeasurementName, "stats"}, "-"), "json", content, 661 ), 662 ) 663 664 // resultErr can only be set if one of the FailOnXYZ params is set. 665 if resultErr != nil { 666 return measurements, resultErr 667 } 668 669 return measurements, nil 670 } 671 672 func (*podPeriodicCommandMeasurement) String() string { 673 return podPeriodicCommandMeasurementName 674 } 675 676 func (p *podPeriodicCommandMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 677 action, err := util.GetString(config.Params, "action") 678 if err != nil { 679 return nil, err 680 } 681 682 switch action { 683 case "start": 684 params, err := newPodPeriodCommandMeasurementParams(config.Params) 685 if err != nil { 686 p.skipGather = true 687 return nil, err 688 } 689 690 return nil, p.start( 691 config.ClusterFramework.GetClientSets().GetClient(), config.ClusterFramework.GetRestClient(), params, 692 ) 693 case "gather": 694 if p.skipGather { 695 return nil, nil 696 } 697 698 return p.gather() 699 default: 700 return nil, fmt.Errorf("unknown action %s", action) 701 } 702 } 703 704 func (p *podPeriodicCommandMeasurement) Dispose() { 705 p.stop() 706 } 707 708 func createPodPeriodicCommandMeasurement() measurement.Measurement { 709 return &podPeriodicCommandMeasurement{} 710 } 711 712 func init() { 713 if err := measurement.Register(podPeriodicCommandMeasurementName, createPodPeriodicCommandMeasurement); err != nil { 714 klog.Fatalf("Cannot register %s: %v", podPeriodicCommandMeasurementName, err) 715 } 716 }