k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/system_pod_metrics.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  
    24  	"gopkg.in/yaml.v2"
    25  	v1 "k8s.io/api/core/v1"
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/client-go/kubernetes"
    28  	"k8s.io/klog/v2"
    29  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    31  )
    32  
    33  const (
    34  	systemPodMetricsName              = "SystemPodMetrics"
    35  	systemNamespace                   = "kube-system"
    36  	systemPodMetricsEnabledFlagName   = "systemPodMetricsEnabled"
    37  	restartThresholdOverridesFlagName = "restartCountThresholdOverrides"
    38  	enableRestartCountCheckFlagName   = "enableRestartCountCheck"
    39  	defaultRestartCountThresholdKey   = "default"
    40  )
    41  
    42  func init() {
    43  	if err := measurement.Register(systemPodMetricsName, createSystemPodMetricsMeasurement); err != nil {
    44  		klog.Fatalf("Cannot register %s: %v", systemPodMetricsName, err)
    45  	}
    46  }
    47  
    48  func createSystemPodMetricsMeasurement() measurement.Measurement {
    49  	return &systemPodMetricsMeasurement{}
    50  }
    51  
    52  // Gathers metrics for system pods, right now it only gathers container restart counts.
    53  // System pods are listed twice: first time for "start" action, second time for "gather" action.
    54  // When executing "gather", initial restart counts are subtracted from the current
    55  // restart counts. In effect, only restarts that happened during test execution
    56  // (between "start" and "gather") are visible in the final summary.
    57  type systemPodMetricsMeasurement struct {
    58  	initSnapshot *systemPodsMetrics
    59  }
    60  
    61  type containerMetrics struct {
    62  	Name              string `json:"name"`
    63  	RestartCount      int32  `json:"restartCount"`
    64  	LastRestartReason string `json:"lastRestartReason"`
    65  }
    66  
    67  type podMetrics struct {
    68  	Name       string             `json:"name"`
    69  	Containers []containerMetrics `json:"containers"`
    70  }
    71  
    72  type systemPodsMetrics struct {
    73  	Pods []podMetrics `json:"pods"`
    74  }
    75  
    76  // Execute gathers and prints system pod metrics.
    77  func (m *systemPodMetricsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    78  	systemPodMetricsEnabled, err := util.GetBoolOrDefault(config.Params, systemPodMetricsEnabledFlagName, false)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  	if !systemPodMetricsEnabled {
    83  		klog.V(2).Info("skipping collection of system pod metrics")
    84  		return []measurement.Summary{}, nil
    85  	}
    86  
    87  	metrics, err := getPodMetrics(config)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  
    92  	action, err := util.GetString(config.Params, "action")
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  
    97  	overrides, err := getThresholdOverrides(config)
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  
   102  	switch action {
   103  	case "start":
   104  		m.initSnapshot = metrics
   105  		return nil, nil
   106  	case "gather":
   107  		if m.initSnapshot == nil {
   108  			return nil, fmt.Errorf("start needs to be executed before gather")
   109  		}
   110  		subtractInitialRestartCounts(metrics, m.initSnapshot)
   111  		summary, err := buildSummary(metrics)
   112  		if err != nil {
   113  			return nil, err
   114  		}
   115  		if err = validateRestartCounts(metrics, config, overrides); err != nil {
   116  			return summary, err
   117  		}
   118  		return summary, nil
   119  	default:
   120  		return nil, fmt.Errorf("unknown action %v", action)
   121  	}
   122  }
   123  
   124  func getPodMetrics(config *measurement.Config) (*systemPodsMetrics, error) {
   125  	klog.V(2).Info("collecting system pod metrics...")
   126  	lst, err := getPodList(config.ClusterFramework.GetClientSets().GetClient())
   127  	if err != nil {
   128  		return &systemPodsMetrics{}, err
   129  	}
   130  	return extractMetrics(lst), nil
   131  }
   132  
   133  func getPodList(client kubernetes.Interface) (*v1.PodList, error) {
   134  	lst, err := client.CoreV1().Pods(systemNamespace).List(context.TODO(), metav1.ListOptions{
   135  		ResourceVersion: "0", // to read from cache
   136  	})
   137  	if err != nil {
   138  		return nil, err
   139  	}
   140  	return lst, nil
   141  }
   142  
   143  func subtractInitialRestartCounts(metrics *systemPodsMetrics, initMetrics *systemPodsMetrics) {
   144  	// podName -> containerName -> restartCount
   145  	initRestarts := make(map[string]map[string]int32)
   146  
   147  	for _, initPod := range initMetrics.Pods {
   148  		initRestarts[initPod.Name] = make(map[string]int32)
   149  		for _, initContainer := range initPod.Containers {
   150  			initRestarts[initPod.Name][initContainer.Name] = initContainer.RestartCount
   151  		}
   152  	}
   153  
   154  	for _, pod := range metrics.Pods {
   155  		for i, container := range pod.Containers {
   156  			initPod, ok := initRestarts[pod.Name]
   157  			if !ok {
   158  				continue
   159  			}
   160  			initRestartCount, ok := initPod[container.Name]
   161  			if !ok {
   162  				continue
   163  			}
   164  			pod.Containers[i].RestartCount -= initRestartCount
   165  		}
   166  	}
   167  }
   168  
   169  func validateRestartCounts(metrics *systemPodsMetrics, config *measurement.Config, overrides map[string]int) error {
   170  	enabled, err := util.GetBoolOrDefault(config.Params, enableRestartCountCheckFlagName, false)
   171  	if err != nil {
   172  		return err
   173  	}
   174  	if !enabled {
   175  		return nil
   176  	}
   177  
   178  	violations := make([]string, 0)
   179  	for _, p := range metrics.Pods {
   180  		for _, c := range p.Containers {
   181  			maxAllowedRestarts := getMaxAllowedRestarts(c.Name, overrides)
   182  			if c.RestartCount > int32(maxAllowedRestarts) {
   183  				violation := fmt.Sprintf("RestartCount(%v, %v)=%v, want <= %v",
   184  					p.Name, c.Name, c.RestartCount, maxAllowedRestarts)
   185  				violations = append(violations, violation)
   186  			}
   187  		}
   188  	}
   189  
   190  	if len(violations) == 0 {
   191  		return nil
   192  	}
   193  	violationsJoined := strings.Join(violations, "; ")
   194  	return fmt.Errorf("restart counts violation: %v", violationsJoined)
   195  }
   196  
   197  func getMaxAllowedRestarts(containerName string, thresholdOverrides map[string]int) int {
   198  	if override, ok := thresholdOverrides[containerName]; ok {
   199  		return override
   200  	}
   201  	// This allows setting default threshold, which will be used for containers
   202  	// not present in the thresholdOverrides map.
   203  	if override, ok := thresholdOverrides[defaultRestartCountThresholdKey]; ok {
   204  		return override
   205  	}
   206  	return 0 // do not allow any restarts if no override and no default specified
   207  }
   208  
   209  /*
   210  getThresholdOverrides deserializes restart count override flag value. The value of
   211  this flag is a map[string]int serialized using yaml format. Note that YamlQuote is used to ensure
   212  proper indentation after gotemplate execution.
   213  
   214  Alternatively, we could use yaml map as flag value, but then go templates executor would serialize it
   215  using golang map format (for example "map[c1:4 c2:8]"), but it would require implementation of a parser
   216  for such format. It would also introduce a dependency on golang map serialization format, which might break
   217  clusterloader if format ever changes.
   218  */
   219  func getThresholdOverrides(config *measurement.Config) (map[string]int, error) {
   220  	serialized, err := util.GetStringOrDefault(config.Params, restartThresholdOverridesFlagName, "")
   221  	if err != nil {
   222  		return make(map[string]int), nil
   223  	}
   224  	var parsed map[string]int
   225  	err = yaml.Unmarshal([]byte(serialized), &parsed)
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  	klog.V(2).Infof("Loaded restart count threshold overrides: %v", parsed)
   230  	return parsed, nil
   231  }
   232  
   233  func extractMetrics(lst *v1.PodList) *systemPodsMetrics {
   234  	metrics := systemPodsMetrics{
   235  		Pods: []podMetrics{},
   236  	}
   237  	for _, pod := range lst.Items {
   238  		podMetrics := podMetrics{
   239  			Containers: []containerMetrics{},
   240  			Name:       pod.Name,
   241  		}
   242  		for _, container := range pod.Status.ContainerStatuses {
   243  			metrics := containerMetrics{
   244  				Name:         container.Name,
   245  				RestartCount: container.RestartCount,
   246  			}
   247  			if container.LastTerminationState.Terminated != nil {
   248  				metrics.LastRestartReason = container.LastTerminationState.Terminated.String()
   249  			}
   250  			podMetrics.Containers = append(podMetrics.Containers, metrics)
   251  		}
   252  		metrics.Pods = append(metrics.Pods, podMetrics)
   253  	}
   254  	return &metrics
   255  }
   256  
   257  func buildSummary(podMetrics *systemPodsMetrics) ([]measurement.Summary, error) {
   258  	content, err := util.PrettyPrintJSON(podMetrics)
   259  	if err != nil {
   260  		return nil, err
   261  	}
   262  
   263  	summary := measurement.CreateSummary(systemPodMetricsName, "json", content)
   264  	return []measurement.Summary{summary}, nil
   265  }
   266  
   267  // Dispose cleans up after the measurement.
   268  func (m *systemPodMetricsMeasurement) Dispose() {}
   269  
   270  // String returns string representation of this measurement.
   271  func (*systemPodMetricsMeasurement) String() string {
   272  	return systemPodMetricsName
   273  }