k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/cilium_endpoint_propagation_delay.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "fmt" 21 "math" 22 "strconv" 23 "time" 24 25 "github.com/prometheus/common/model" 26 "k8s.io/klog/v2" 27 "k8s.io/perf-tests/clusterloader2/pkg/errors" 28 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 29 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 30 "k8s.io/perf-tests/clusterloader2/pkg/util" 31 ) 32 33 const ( 34 cepPropagationDelayMeasurementName = "CiliumEndpointPropagationDelay" 35 // The metric definition and bucket sizes for cilium_endpoint_propagation_delay_seconds: 36 // https://github.com/cilium/cilium/blob/v1.11/pkg/metrics/metrics.go#L1263 37 cepPropagationDelayQuery = `sum(sum_over_time(cilium_endpoint_propagation_delay_seconds_bucket[%v])) by (le)` 38 queryInterval = 10 * time.Minute 39 40 // bucketAllEntries is the default Prometheus bucket that 41 // includes all entries for the histogram snapshot. 42 bucketAllEntries = "+Inf" 43 // defaultBucketSLO and defaultPercentileSLO are used together to 44 // determine if the test should pass, when not specified otherwise 45 // in the CL2 parameters. The test should pass only if the size of 46 // the defaultBucketSLO is at least within the defined percentile 47 // in comparison to the size of the bucketAllEntries. 48 defaultBucketSLO = 600 49 defaultPercentileSLO float64 = 95 50 ) 51 52 func init() { 53 create := func() measurement.Measurement { 54 return CreatePrometheusMeasurement(&cepPropagationDelayGatherer{}) 55 } 56 if err := measurement.Register(cepPropagationDelayMeasurementName, create); err != nil { 57 klog.Fatalf("Cannot register %s: %v", cepPropagationDelayMeasurementName, err) 58 } 59 } 60 61 type cepPropagationDelayGatherer struct{} 62 63 // cepPropagationDelayMetricMap contains timestamps in the outer map, 64 // and buckets and their sizes in the inner map. 65 type cepPropagationDelayMetricMap map[string]map[string]int 66 67 func (c *cepPropagationDelayGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) { 68 cepPropagationDelay, err := c.gatherCepPropagationDelay(executor, startTime, endTime) 69 if err != nil { 70 return nil, err 71 } 72 73 content, err := util.PrettyPrintJSON(cepPropagationDelay) 74 if err != nil { 75 return nil, err 76 } 77 78 summaries := []measurement.Summary{measurement.CreateSummary(cepPropagationDelayMeasurementName, "json", content)} 79 return summaries, validateCEPPropagationDelay(cepPropagationDelay, config) 80 } 81 82 func (c *cepPropagationDelayGatherer) gatherCepPropagationDelay(executor QueryExecutor, startTime, endTime time.Time) (cepPropagationDelayMetricMap, error) { 83 // Query the data between start and end time on fixed intervals 84 // to get accurate data from multiple snapshots. 85 var samples []*model.Sample 86 prevQueryTime := startTime 87 currQueryTime := startTime.Add(queryInterval) 88 for { 89 if currQueryTime.After(endTime) { 90 currQueryTime = endTime 91 } 92 queryDuration := currQueryTime.Sub(prevQueryTime) 93 promDuration := measurementutil.ToPrometheusTime(queryDuration) 94 query := fmt.Sprintf(cepPropagationDelayQuery, promDuration) 95 newSamples, err := executor.Query(query, currQueryTime) 96 if err == nil { 97 samples = append(samples, newSamples...) 98 } else { 99 klog.V(2).Infof("Got error querying Prometheus: %v", err) 100 } 101 if currQueryTime == endTime { 102 break 103 } 104 prevQueryTime = currQueryTime 105 currQueryTime = currQueryTime.Add(queryInterval) 106 } 107 108 extractSampleData := func(sample *model.Sample) (string, string, int) { 109 return sample.Timestamp.String(), string(sample.Metric["le"]), int(math.Round(float64(sample.Value))) 110 } 111 112 result := make(cepPropagationDelayMetricMap) 113 for _, sample := range samples { 114 timestamp, bucket, value := extractSampleData(sample) 115 if _, ok := result[timestamp]; !ok { 116 result[timestamp] = make(map[string]int) 117 } 118 result[timestamp][bucket] = value 119 } 120 return result, nil 121 } 122 123 func validateCEPPropagationDelay(result cepPropagationDelayMetricMap, config *measurement.Config) error { 124 bucketNumSLO, err := util.GetFloat64OrDefault(config.Params, "bucketSLO", defaultBucketSLO) 125 if err != nil || bucketNumSLO == 0 { 126 klog.V(2).Infof("Using defaultBucketSLO: %d, because bucketSLO param is invalid: %v", int(math.Floor(defaultBucketSLO)), err) 127 bucketNumSLO = defaultBucketSLO 128 } 129 bucketSLO := strconv.FormatFloat(bucketNumSLO, 'g', -1, 64) 130 131 percentileSLO, err := util.GetFloat64OrDefault(config.Params, "percentileSLO", defaultPercentileSLO) 132 if err != nil || percentileSLO == 0 { 133 klog.V(2).Infof("Using defaultPercentileSLO: %f, because percentileSLO param is invalid: %v", percentileSLO, err) 134 percentileSLO = defaultPercentileSLO 135 } 136 137 for timestamp, buckets := range result { 138 totalEvents := buckets[bucketAllEntries] 139 if totalEvents == 0 { 140 continue 141 } 142 143 acceptedDelayEvents := buckets[bucketSLO] 144 perc := (float64(acceptedDelayEvents) / float64(totalEvents)) * 100 145 if perc < percentileSLO { 146 return errors.NewMetricViolationError( 147 "Cilium endpoint propagation delay", 148 fmt.Sprintf("%s: updates for %ss delay is within %d%%, expected %d%%, buckets:\n%v", 149 timestamp, 150 bucketSLO, 151 int(math.Floor(perc)), 152 int(math.Floor(percentileSLO)), 153 buckets, 154 ), 155 ) 156 } 157 } 158 return nil 159 } 160 161 func (c *cepPropagationDelayGatherer) Configure(config *measurement.Config) error { 162 return nil 163 } 164 165 func (c *cepPropagationDelayGatherer) IsEnabled(config *measurement.Config) bool { 166 return true 167 } 168 169 func (*cepPropagationDelayGatherer) String() string { 170 return cepPropagationDelayMeasurementName 171 }