k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/cilium_endpoint_propagation_delay.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"strconv"
    23  	"time"
    24  
    25  	"github.com/prometheus/common/model"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    28  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    29  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    31  )
    32  
const (
	// cepPropagationDelayMeasurementName is the name this measurement is
	// registered under and the name used for its summary.
	cepPropagationDelayMeasurementName = "CiliumEndpointPropagationDelay"
	// cepPropagationDelayQuery is a format string: its %v verb is filled with
	// the Prometheus range duration of one query window. queryInterval is the
	// longest window covered by a single query.
	// The metric definition and bucket sizes for cilium_endpoint_propagation_delay_seconds:
	// https://github.com/cilium/cilium/blob/v1.11/pkg/metrics/metrics.go#L1263
	cepPropagationDelayQuery = `sum(sum_over_time(cilium_endpoint_propagation_delay_seconds_bucket[%v])) by (le)`
	queryInterval            = 10 * time.Minute

	// bucketAllEntries is the default Prometheus bucket that
	// includes all entries for the histogram snapshot.
	bucketAllEntries = "+Inf"
	// defaultBucketSLO and defaultPercentileSLO are used together to
	// decide whether the test passes when the CL2 parameters
	// "bucketSLO" and "percentileSLO" are not provided (or are invalid).
	// The test passes only if, in every snapshot that recorded events,
	// the count in the defaultBucketSLO bucket is at least
	// defaultPercentileSLO percent of the count in bucketAllEntries.
	defaultBucketSLO             = 600
	defaultPercentileSLO float64 = 95
)
    51  
    52  func init() {
    53  	create := func() measurement.Measurement {
    54  		return CreatePrometheusMeasurement(&cepPropagationDelayGatherer{})
    55  	}
    56  	if err := measurement.Register(cepPropagationDelayMeasurementName, create); err != nil {
    57  		klog.Fatalf("Cannot register %s: %v", cepPropagationDelayMeasurementName, err)
    58  	}
    59  }
    60  
// cepPropagationDelayGatherer gathers and validates the Cilium endpoint
// propagation delay histogram. It carries no state of its own.
type cepPropagationDelayGatherer struct{}
    62  
// cepPropagationDelayMetricMap holds histogram snapshots. The outer map is
// keyed by the sample timestamp in string form; the inner map is keyed by
// the bucket's "le" label and stores that bucket's size, rounded to the
// nearest integer.
type cepPropagationDelayMetricMap map[string]map[string]int
    66  
    67  func (c *cepPropagationDelayGatherer) Gather(executor QueryExecutor, startTime, endTime time.Time, config *measurement.Config) ([]measurement.Summary, error) {
    68  	cepPropagationDelay, err := c.gatherCepPropagationDelay(executor, startTime, endTime)
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  
    73  	content, err := util.PrettyPrintJSON(cepPropagationDelay)
    74  	if err != nil {
    75  		return nil, err
    76  	}
    77  
    78  	summaries := []measurement.Summary{measurement.CreateSummary(cepPropagationDelayMeasurementName, "json", content)}
    79  	return summaries, validateCEPPropagationDelay(cepPropagationDelay, config)
    80  }
    81  
    82  func (c *cepPropagationDelayGatherer) gatherCepPropagationDelay(executor QueryExecutor, startTime, endTime time.Time) (cepPropagationDelayMetricMap, error) {
    83  	// Query the data between start and end time on fixed intervals
    84  	// to get accurate data from multiple snapshots.
    85  	var samples []*model.Sample
    86  	prevQueryTime := startTime
    87  	currQueryTime := startTime.Add(queryInterval)
    88  	for {
    89  		if currQueryTime.After(endTime) {
    90  			currQueryTime = endTime
    91  		}
    92  		queryDuration := currQueryTime.Sub(prevQueryTime)
    93  		promDuration := measurementutil.ToPrometheusTime(queryDuration)
    94  		query := fmt.Sprintf(cepPropagationDelayQuery, promDuration)
    95  		newSamples, err := executor.Query(query, currQueryTime)
    96  		if err == nil {
    97  			samples = append(samples, newSamples...)
    98  		} else {
    99  			klog.V(2).Infof("Got error querying Prometheus: %v", err)
   100  		}
   101  		if currQueryTime == endTime {
   102  			break
   103  		}
   104  		prevQueryTime = currQueryTime
   105  		currQueryTime = currQueryTime.Add(queryInterval)
   106  	}
   107  
   108  	extractSampleData := func(sample *model.Sample) (string, string, int) {
   109  		return sample.Timestamp.String(), string(sample.Metric["le"]), int(math.Round(float64(sample.Value)))
   110  	}
   111  
   112  	result := make(cepPropagationDelayMetricMap)
   113  	for _, sample := range samples {
   114  		timestamp, bucket, value := extractSampleData(sample)
   115  		if _, ok := result[timestamp]; !ok {
   116  			result[timestamp] = make(map[string]int)
   117  		}
   118  		result[timestamp][bucket] = value
   119  	}
   120  	return result, nil
   121  }
   122  
   123  func validateCEPPropagationDelay(result cepPropagationDelayMetricMap, config *measurement.Config) error {
   124  	bucketNumSLO, err := util.GetFloat64OrDefault(config.Params, "bucketSLO", defaultBucketSLO)
   125  	if err != nil || bucketNumSLO == 0 {
   126  		klog.V(2).Infof("Using defaultBucketSLO: %d, because bucketSLO param is invalid: %v", int(math.Floor(defaultBucketSLO)), err)
   127  		bucketNumSLO = defaultBucketSLO
   128  	}
   129  	bucketSLO := strconv.FormatFloat(bucketNumSLO, 'g', -1, 64)
   130  
   131  	percentileSLO, err := util.GetFloat64OrDefault(config.Params, "percentileSLO", defaultPercentileSLO)
   132  	if err != nil || percentileSLO == 0 {
   133  		klog.V(2).Infof("Using defaultPercentileSLO: %f, because percentileSLO param is invalid: %v", percentileSLO, err)
   134  		percentileSLO = defaultPercentileSLO
   135  	}
   136  
   137  	for timestamp, buckets := range result {
   138  		totalEvents := buckets[bucketAllEntries]
   139  		if totalEvents == 0 {
   140  			continue
   141  		}
   142  
   143  		acceptedDelayEvents := buckets[bucketSLO]
   144  		perc := (float64(acceptedDelayEvents) / float64(totalEvents)) * 100
   145  		if perc < percentileSLO {
   146  			return errors.NewMetricViolationError(
   147  				"Cilium endpoint propagation delay",
   148  				fmt.Sprintf("%s: updates for %ss delay is within %d%%, expected %d%%, buckets:\n%v",
   149  					timestamp,
   150  					bucketSLO,
   151  					int(math.Floor(perc)),
   152  					int(math.Floor(percentileSLO)),
   153  					buckets,
   154  				),
   155  			)
   156  		}
   157  	}
   158  	return nil
   159  }
   160  
// Configure is a no-op for this gatherer: it ignores config and always
// returns nil.
func (c *cepPropagationDelayGatherer) Configure(config *measurement.Config) error {
	return nil
}
   164  
// IsEnabled always reports true; the config is not consulted.
func (c *cepPropagationDelayGatherer) IsEnabled(config *measurement.Config) bool {
	return true
}
   168  
// String returns the measurement name, cepPropagationDelayMeasurementName.
func (*cepPropagationDelayGatherer) String() string {
	return cepPropagationDelayMeasurementName
}