k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/container_restarts.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "fmt" 21 "math" 22 "regexp" 23 "strings" 24 "time" 25 26 "github.com/prometheus/common/model" 27 "gopkg.in/yaml.v2" 28 "k8s.io/klog/v2" 29 "k8s.io/perf-tests/clusterloader2/pkg/errors" 30 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 31 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 32 "k8s.io/perf-tests/clusterloader2/pkg/util" 33 ) 34 35 const ( 36 containerRestartsMeasurementName = "ContainerRestarts" 37 38 containerRestartCountQuery = `changes(container_start_time_seconds[%v])` 39 ) 40 41 func init() { 42 create := func() measurement.Measurement { 43 return CreatePrometheusMeasurement(&containerRestartsGatherer{}) 44 } 45 if err := measurement.Register(containerRestartsMeasurementName, create); err != nil { 46 klog.Fatalf("Cannot register %s: %v", containerRestartsMeasurementName, err) 47 } 48 } 49 50 type containerRestartsGatherer struct{} 51 52 type ContainerInfo struct { 53 Container string `yaml:"container"` 54 Pod string `yaml:"pod"` 55 Namespace string `yaml:"namespace"` 56 } 57 58 type ContainerRestartsInfo struct { 59 ContainerInfo `yaml:",inline"` 60 RestartCount int `yaml:"restartCount"` 61 } 62 63 type restartCountOverride struct { 64 ContainerInfo `yaml:",inline"` 65 AllowedRestarts int `yaml:"allowedRestarts"` 66 podNameRegex *regexp.Regexp 67 } 68 69 func (a *containerRestartsGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) { 70 restartCountOverrides, err := a.getOverrides(config) 71 if err != nil { 72 return nil, err 73 } 74 75 defaultAllowedRestarts, err := util.GetIntOrDefault(config.Params, "defaultAllowedRestarts", 0) 76 if err != nil { 77 return nil, err 78 } 79 80 containerRestarts, err := a.gatherContainerRestarts(executor, startTime, endTime) 81 if err != nil { 82 return nil, err 83 } 84 85 content, err := util.PrettyPrintJSON(containerRestarts) 86 if err != nil { 87 return nil, err 88 } 89 90 summaries := []measurement.Summary{measurement.CreateSummary(containerRestartsMeasurementName, "json", content)} 91 if badContainers := a.validateRestarts(containerRestarts, defaultAllowedRestarts, restartCountOverrides); len(badContainers) > 0 { 92 return summaries, errors.NewMetricViolationError("container restarts", fmt.Sprintf("container restart count validation: %v", badContainers)) 93 } 94 return summaries, nil 95 } 96 97 func (a *containerRestartsGatherer) getOverrides(config *measurement.Config) ([]*restartCountOverride, error) { 98 restartCountOverridesString, err := util.GetStringOrDefault(config.Params, "customAllowedRestarts", "") 99 if err != nil { 100 return nil, err 101 } 102 103 var restartCountOverrides []*restartCountOverride 104 if err := yaml.Unmarshal([]byte(restartCountOverridesString), &restartCountOverrides); err != nil { 105 return nil, err 106 } 107 108 for _, car := range restartCountOverrides { 109 podNamePattern := strings.ReplaceAll(strings.ReplaceAll(car.Pod, ".", "\\."), "*", ".*") 110 car.podNameRegex = regexp.MustCompile("^" + podNamePattern + "$") 111 } 112 return restartCountOverrides, nil 113 } 114 115 func (a *containerRestartsGatherer) gatherContainerRestarts(executor QueryExecutor, startTime, endTime time.Time) ([]ContainerRestartsInfo, error) { 116 measurementDuration := endTime.Sub(startTime) 117 promDuration := measurementutil.ToPrometheusTime(measurementDuration) 118 query := fmt.Sprintf(containerRestartCountQuery, promDuration) 119 samples, err := executor.Query(query, endTime) 120 if err != nil { 121 return nil, err 122 } 123 124 extractCommon := func(sample *model.Sample) (string, string, string) { 125 return string(sample.Metric["container"]), string(sample.Metric["pod"]), string(sample.Metric["namespace"]) 126 } 127 128 result := []ContainerRestartsInfo{} 129 for _, sample := range samples { 130 container, pod, namespace := extractCommon(sample) 131 count := int(math.Round(float64(sample.Value))) 132 cri := ContainerRestartsInfo{ 133 ContainerInfo: ContainerInfo{ 134 Container: container, 135 Pod: pod, 136 Namespace: namespace, 137 }, 138 RestartCount: count, 139 } 140 result = append(result, cri) 141 } 142 return result, nil 143 } 144 145 func (a *containerRestartsGatherer) validateRestarts(restartsInfos []ContainerRestartsInfo, defaultAllowedRestarts int, restartCountOverrides []*restartCountOverride) []error { 146 badContainers := make([]error, 0) 147 for _, ri := range restartsInfos { 148 allowedRestarts := defaultAllowedRestarts 149 for _, override := range restartCountOverrides { 150 if override.podNameRegex.MatchString(ri.Pod) && allowedRestarts < override.AllowedRestarts { 151 allowedRestarts = override.AllowedRestarts 152 } 153 } 154 if ri.RestartCount > allowedRestarts { 155 badContainers = append(badContainers, fmt.Errorf("restartCount(%+v) = %v, expected <= %v", ri.ContainerInfo, ri.RestartCount, allowedRestarts)) 156 } 157 } 158 return badContainers 159 } 160 161 func (a *containerRestartsGatherer) Configure(config *measurement.Config) error { 162 return nil 163 } 164 165 func (a *containerRestartsGatherer) IsEnabled(config *measurement.Config) bool { 166 return true 167 } 168 169 func (*containerRestartsGatherer) String() string { 170 return containerRestartsMeasurementName 171 }