k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/container_restarts.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"regexp"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/prometheus/common/model"
    27  	"gopkg.in/yaml.v2"
    28  	"k8s.io/klog/v2"
    29  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    31  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    32  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    33  )
    34  
const (
	// containerRestartsMeasurementName is the name this measurement is
	// registered under; it is also used as the summary name and as the
	// gatherer's String() value.
	containerRestartsMeasurementName = "ContainerRestarts"

	// containerRestartCountQuery is a PromQL template. The %v verb is filled
	// with the range duration (in Prometheus duration syntax) over which
	// changes of container_start_time_seconds are counted.
	containerRestartCountQuery = `changes(container_start_time_seconds[%v])`
)
    40  
    41  func init() {
    42  	create := func() measurement.Measurement {
    43  		return CreatePrometheusMeasurement(&containerRestartsGatherer{})
    44  	}
    45  	if err := measurement.Register(containerRestartsMeasurementName, create); err != nil {
    46  		klog.Fatalf("Cannot register %s: %v", containerRestartsMeasurementName, err)
    47  	}
    48  }
    49  
// containerRestartsGatherer is the stateless gatherer handed to
// CreatePrometheusMeasurement for the ContainerRestarts measurement.
type containerRestartsGatherer struct{}
    51  
// ContainerInfo identifies a container by its name, the pod it runs in and
// the pod's namespace. When gathered, the fields are filled from the
// "container", "pod" and "namespace" labels of a Prometheus sample.
type ContainerInfo struct {
	Container string `yaml:"container"`
	Pod       string `yaml:"pod"`
	Namespace string `yaml:"namespace"`
}
    57  
// ContainerRestartsInfo pairs a container identity with the number of
// restarts observed for it during the measurement window.
type ContainerRestartsInfo struct {
	ContainerInfo `yaml:",inline"`
	// RestartCount is the sample value of the restart query, rounded to the
	// nearest integer.
	RestartCount int `yaml:"restartCount"`
}
    62  
// restartCountOverride is one entry of the "customAllowedRestarts" parameter:
// a per-container exception to the default allowed restart count.
type restartCountOverride struct {
	ContainerInfo `yaml:",inline"`
	// AllowedRestarts is the restart limit applied to matching containers.
	AllowedRestarts int `yaml:"allowedRestarts"`
	// podNameRegex is compiled from Pod by getOverrides ("*" acts as a
	// wildcard). It is not read from YAML. Validation currently matches
	// overrides on this pod pattern only.
	podNameRegex *regexp.Regexp
}
    68  
    69  func (a *containerRestartsGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
    70  	restartCountOverrides, err := a.getOverrides(config)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  
    75  	defaultAllowedRestarts, err := util.GetIntOrDefault(config.Params, "defaultAllowedRestarts", 0)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  
    80  	containerRestarts, err := a.gatherContainerRestarts(executor, startTime, endTime)
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  
    85  	content, err := util.PrettyPrintJSON(containerRestarts)
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  
    90  	summaries := []measurement.Summary{measurement.CreateSummary(containerRestartsMeasurementName, "json", content)}
    91  	if badContainers := a.validateRestarts(containerRestarts, defaultAllowedRestarts, restartCountOverrides); len(badContainers) > 0 {
    92  		return summaries, errors.NewMetricViolationError("container restarts", fmt.Sprintf("container restart count validation: %v", badContainers))
    93  	}
    94  	return summaries, nil
    95  }
    96  
    97  func (a *containerRestartsGatherer) getOverrides(config *measurement.Config) ([]*restartCountOverride, error) {
    98  	restartCountOverridesString, err := util.GetStringOrDefault(config.Params, "customAllowedRestarts", "")
    99  	if err != nil {
   100  		return nil, err
   101  	}
   102  
   103  	var restartCountOverrides []*restartCountOverride
   104  	if err := yaml.Unmarshal([]byte(restartCountOverridesString), &restartCountOverrides); err != nil {
   105  		return nil, err
   106  	}
   107  
   108  	for _, car := range restartCountOverrides {
   109  		podNamePattern := strings.ReplaceAll(strings.ReplaceAll(car.Pod, ".", "\\."), "*", ".*")
   110  		car.podNameRegex = regexp.MustCompile("^" + podNamePattern + "$")
   111  	}
   112  	return restartCountOverrides, nil
   113  }
   114  
   115  func (a *containerRestartsGatherer) gatherContainerRestarts(executor QueryExecutor, startTime, endTime time.Time) ([]ContainerRestartsInfo, error) {
   116  	measurementDuration := endTime.Sub(startTime)
   117  	promDuration := measurementutil.ToPrometheusTime(measurementDuration)
   118  	query := fmt.Sprintf(containerRestartCountQuery, promDuration)
   119  	samples, err := executor.Query(query, endTime)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	extractCommon := func(sample *model.Sample) (string, string, string) {
   125  		return string(sample.Metric["container"]), string(sample.Metric["pod"]), string(sample.Metric["namespace"])
   126  	}
   127  
   128  	result := []ContainerRestartsInfo{}
   129  	for _, sample := range samples {
   130  		container, pod, namespace := extractCommon(sample)
   131  		count := int(math.Round(float64(sample.Value)))
   132  		cri := ContainerRestartsInfo{
   133  			ContainerInfo: ContainerInfo{
   134  				Container: container,
   135  				Pod:       pod,
   136  				Namespace: namespace,
   137  			},
   138  			RestartCount: count,
   139  		}
   140  		result = append(result, cri)
   141  	}
   142  	return result, nil
   143  }
   144  
   145  func (a *containerRestartsGatherer) validateRestarts(restartsInfos []ContainerRestartsInfo, defaultAllowedRestarts int, restartCountOverrides []*restartCountOverride) []error {
   146  	badContainers := make([]error, 0)
   147  	for _, ri := range restartsInfos {
   148  		allowedRestarts := defaultAllowedRestarts
   149  		for _, override := range restartCountOverrides {
   150  			if override.podNameRegex.MatchString(ri.Pod) && allowedRestarts < override.AllowedRestarts {
   151  				allowedRestarts = override.AllowedRestarts
   152  			}
   153  		}
   154  		if ri.RestartCount > allowedRestarts {
   155  			badContainers = append(badContainers, fmt.Errorf("restartCount(%+v) = %v, expected <= %v", ri.ContainerInfo, ri.RestartCount, allowedRestarts))
   156  		}
   157  	}
   158  	return badContainers
   159  }
   160  
   161  func (a *containerRestartsGatherer) Configure(config *measurement.Config) error {
   162  	return nil
   163  }
   164  
   165  func (a *containerRestartsGatherer) IsEnabled(config *measurement.Config) bool {
   166  	return true
   167  }
   168  
// String returns the measurement name, "ContainerRestarts".
func (*containerRestartsGatherer) String() string {
	return containerRestartsMeasurementName
}