k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/system_pod_metrics.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 24 "gopkg.in/yaml.v2" 25 v1 "k8s.io/api/core/v1" 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/client-go/kubernetes" 28 "k8s.io/klog/v2" 29 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 30 "k8s.io/perf-tests/clusterloader2/pkg/util" 31 ) 32 33 const ( 34 systemPodMetricsName = "SystemPodMetrics" 35 systemNamespace = "kube-system" 36 systemPodMetricsEnabledFlagName = "systemPodMetricsEnabled" 37 restartThresholdOverridesFlagName = "restartCountThresholdOverrides" 38 enableRestartCountCheckFlagName = "enableRestartCountCheck" 39 defaultRestartCountThresholdKey = "default" 40 ) 41 42 func init() { 43 if err := measurement.Register(systemPodMetricsName, createSystemPodMetricsMeasurement); err != nil { 44 klog.Fatalf("Cannot register %s: %v", systemPodMetricsName, err) 45 } 46 } 47 48 func createSystemPodMetricsMeasurement() measurement.Measurement { 49 return &systemPodMetricsMeasurement{} 50 } 51 52 // Gathers metrics for system pods, right now it only gathers container restart counts. 53 // System pods are listed twice: first time for "start" action, second time for "gather" action. 54 // When executing "gather", initial restart counts are subtracted from the current 55 // restart counts. In effect, only restarts that happened during test execution 56 // (between "start" and "gather") are visible in the final summary. 57 type systemPodMetricsMeasurement struct { 58 initSnapshot *systemPodsMetrics 59 } 60 61 type containerMetrics struct { 62 Name string `json:"name"` 63 RestartCount int32 `json:"restartCount"` 64 LastRestartReason string `json:"lastRestartReason"` 65 } 66 67 type podMetrics struct { 68 Name string `json:"name"` 69 Containers []containerMetrics `json:"containers"` 70 } 71 72 type systemPodsMetrics struct { 73 Pods []podMetrics `json:"pods"` 74 } 75 76 // Execute gathers and prints system pod metrics. 77 func (m *systemPodMetricsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 78 systemPodMetricsEnabled, err := util.GetBoolOrDefault(config.Params, systemPodMetricsEnabledFlagName, false) 79 if err != nil { 80 return nil, err 81 } 82 if !systemPodMetricsEnabled { 83 klog.V(2).Info("skipping collection of system pod metrics") 84 return []measurement.Summary{}, nil 85 } 86 87 metrics, err := getPodMetrics(config) 88 if err != nil { 89 return nil, err 90 } 91 92 action, err := util.GetString(config.Params, "action") 93 if err != nil { 94 return nil, err 95 } 96 97 overrides, err := getThresholdOverrides(config) 98 if err != nil { 99 return nil, err 100 } 101 102 switch action { 103 case "start": 104 m.initSnapshot = metrics 105 return nil, nil 106 case "gather": 107 if m.initSnapshot == nil { 108 return nil, fmt.Errorf("start needs to be executed before gather") 109 } 110 subtractInitialRestartCounts(metrics, m.initSnapshot) 111 summary, err := buildSummary(metrics) 112 if err != nil { 113 return nil, err 114 } 115 if err = validateRestartCounts(metrics, config, overrides); err != nil { 116 return summary, err 117 } 118 return summary, nil 119 default: 120 return nil, fmt.Errorf("unknown action %v", action) 121 } 122 } 123 124 func getPodMetrics(config *measurement.Config) (*systemPodsMetrics, error) { 125 klog.V(2).Info("collecting system pod metrics...") 126 lst, err := getPodList(config.ClusterFramework.GetClientSets().GetClient()) 127 if err != nil { 128 return &systemPodsMetrics{}, err 129 } 130 return extractMetrics(lst), nil 131 } 132 133 func getPodList(client kubernetes.Interface) (*v1.PodList, error) { 134 lst, err := client.CoreV1().Pods(systemNamespace).List(context.TODO(), metav1.ListOptions{ 135 ResourceVersion: "0", // to read from cache 136 }) 137 if err != nil { 138 return nil, err 139 } 140 return lst, nil 141 } 142 143 func subtractInitialRestartCounts(metrics *systemPodsMetrics, initMetrics *systemPodsMetrics) { 144 // podName -> containerName -> restartCount 145 initRestarts := make(map[string]map[string]int32) 146 147 for _, initPod := range initMetrics.Pods { 148 initRestarts[initPod.Name] = make(map[string]int32) 149 for _, initContainer := range initPod.Containers { 150 initRestarts[initPod.Name][initContainer.Name] = initContainer.RestartCount 151 } 152 } 153 154 for _, pod := range metrics.Pods { 155 for i, container := range pod.Containers { 156 initPod, ok := initRestarts[pod.Name] 157 if !ok { 158 continue 159 } 160 initRestartCount, ok := initPod[container.Name] 161 if !ok { 162 continue 163 } 164 pod.Containers[i].RestartCount -= initRestartCount 165 } 166 } 167 } 168 169 func validateRestartCounts(metrics *systemPodsMetrics, config *measurement.Config, overrides map[string]int) error { 170 enabled, err := util.GetBoolOrDefault(config.Params, enableRestartCountCheckFlagName, false) 171 if err != nil { 172 return err 173 } 174 if !enabled { 175 return nil 176 } 177 178 violations := make([]string, 0) 179 for _, p := range metrics.Pods { 180 for _, c := range p.Containers { 181 maxAllowedRestarts := getMaxAllowedRestarts(c.Name, overrides) 182 if c.RestartCount > int32(maxAllowedRestarts) { 183 violation := fmt.Sprintf("RestartCount(%v, %v)=%v, want <= %v", 184 p.Name, c.Name, c.RestartCount, maxAllowedRestarts) 185 violations = append(violations, violation) 186 } 187 } 188 } 189 190 if len(violations) == 0 { 191 return nil 192 } 193 violationsJoined := strings.Join(violations, "; ") 194 return fmt.Errorf("restart counts violation: %v", violationsJoined) 195 } 196 197 func getMaxAllowedRestarts(containerName string, thresholdOverrides map[string]int) int { 198 if override, ok := thresholdOverrides[containerName]; ok { 199 return override 200 } 201 // This allows setting default threshold, which will be used for containers 202 // not present in the thresholdOverrides map. 203 if override, ok := thresholdOverrides[defaultRestartCountThresholdKey]; ok { 204 return override 205 } 206 return 0 // do not allow any restarts if no override and no default specified 207 } 208 209 /* 210 getThresholdOverrides deserializes restart count override flag value. The value of 211 this flag is a map[string]int serialized using yaml format. Note that YamlQuote is used to ensure 212 proper indentation after gotemplate execution. 213 214 Alternatively, we could use yaml map as flag value, but then go templates executor would serialize it 215 using golang map format (for example "map[c1:4 c2:8]"), but it would require implementation of a parser 216 for such format. It would also introduce a dependency on golang map serialization format, which might break 217 clusterloader if format ever changes. 218 */ 219 func getThresholdOverrides(config *measurement.Config) (map[string]int, error) { 220 serialized, err := util.GetStringOrDefault(config.Params, restartThresholdOverridesFlagName, "") 221 if err != nil { 222 return make(map[string]int), nil 223 } 224 var parsed map[string]int 225 err = yaml.Unmarshal([]byte(serialized), &parsed) 226 if err != nil { 227 return nil, err 228 } 229 klog.V(2).Infof("Loaded restart count threshold overrides: %v", parsed) 230 return parsed, nil 231 } 232 233 func extractMetrics(lst *v1.PodList) *systemPodsMetrics { 234 metrics := systemPodsMetrics{ 235 Pods: []podMetrics{}, 236 } 237 for _, pod := range lst.Items { 238 podMetrics := podMetrics{ 239 Containers: []containerMetrics{}, 240 Name: pod.Name, 241 } 242 for _, container := range pod.Status.ContainerStatuses { 243 metrics := containerMetrics{ 244 Name: container.Name, 245 RestartCount: container.RestartCount, 246 } 247 if container.LastTerminationState.Terminated != nil { 248 metrics.LastRestartReason = container.LastTerminationState.Terminated.String() 249 } 250 podMetrics.Containers = append(podMetrics.Containers, metrics) 251 } 252 metrics.Pods = append(metrics.Pods, podMetrics) 253 } 254 return &metrics 255 } 256 257 func buildSummary(podMetrics *systemPodsMetrics) ([]measurement.Summary, error) { 258 content, err := util.PrettyPrintJSON(podMetrics) 259 if err != nil { 260 return nil, err 261 } 262 263 summary := measurement.CreateSummary(systemPodMetricsName, "json", content) 264 return []measurement.Summary{summary}, nil 265 } 266 267 // Dispose cleans up after the measurement. 268 func (m *systemPodMetricsMeasurement) Dispose() {} 269 270 // String returns string representation of this measurement. 271 func (*systemPodMetricsMeasurement) String() string { 272 return systemPodMetricsName 273 }