k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/probes/probes.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package probes
    18  
    19  import (
    20  	"embed"
    21  	"fmt"
    22  	"path"
    23  	"time"
    24  
    25  	"k8s.io/apimachinery/pkg/util/wait"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    28  	"k8s.io/perf-tests/clusterloader2/pkg/framework"
    29  	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    31  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    32  	"k8s.io/perf-tests/clusterloader2/pkg/prometheus"
    33  	prom "k8s.io/perf-tests/clusterloader2/pkg/prometheus/clients"
    34  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    35  )
    36  
const (
	// probesNamespace is the namespace that start creates for the probe
	// objects and that Dispose deletes.
	probesNamespace = "probes"

	// manifestsPathPrefix is the directory inside manifestsFS that is joined
	// with a prober's Manifests pattern to locate its manifests.
	manifestsPathPrefix = "manifests/"

	// checkProbesReadyInterval is the polling interval used while waiting for
	// the probes to become ready.
	checkProbesReadyInterval = 15 * time.Second

	// defaultCheckProbesReadyTimeout bounds the readiness wait when the
	// "checkProbesReadyTimeout" param is not provided.
	defaultCheckProbesReadyTimeout = 15 * time.Minute

	// defaultPingSleepDuration is the fallback for the "pingSleepDuration"
	// param, which ends up as the "PingSleepDuration" template value.
	defaultPingSleepDuration = time.Second
)
    48  
var (
	// networkLatencyConfig describes the InClusterNetworkLatency prober. Its
	// manifests sit directly under manifestsPathPrefix.
	networkLatencyConfig = proberConfig{
		Name:             "InClusterNetworkLatency",
		MetricVersion:    "v1",
		Query:            "quantile_over_time(0.99, probes:in_cluster_network_latency:histogram_quantile[%v])",
		Manifests:        "*.yaml",
		ProbeLabelValues: []string{"ping-client", "ping-server"},
	}

	// dnsLookupConfig describes the DnsLookupLatency prober.
	dnsLookupConfig = proberConfig{
		Name:             "DnsLookupLatency",
		MetricVersion:    "v1",
		Query:            "quantile_over_time(0.99, probes:dns_lookup_latency:histogram_quantile[%v])",
		Manifests:        "dnsLookup/*yaml",
		ProbeLabelValues: []string{"dns"},
	}

	// metricsServerLatencyConfig describes the InClusterAPIServerRequestLatency
	// prober.
	metricsServerLatencyConfig = proberConfig{
		Name:             "InClusterAPIServerRequestLatency",
		MetricVersion:    "v1",
		Query:            "quantile_over_time(0.99, probes:in_cluster_apiserver_request_latency:histogram_quantile[%v])",
		Manifests:        "metricsServer/*.yaml",
		ProbeLabelValues: []string{"metrics-server-prober"},
	}

	// manifestsFS embeds the "manifests" directory; createProbesObjects hands
	// it to the framework together with the prober's manifest pattern.
	//go:embed manifests
	manifestsFS embed.FS
)
    77  
    78  func init() {
    79  	create := func() measurement.Measurement { return createProber(networkLatencyConfig) }
    80  	if err := measurement.Register(networkLatencyConfig.Name, create); err != nil {
    81  		klog.Errorf("cannot register %s: %v", networkLatencyConfig.Name, err)
    82  	}
    83  	create = func() measurement.Measurement { return createProber(dnsLookupConfig) }
    84  	if err := measurement.Register(dnsLookupConfig.Name, create); err != nil {
    85  		klog.Errorf("cannot register %s: %v", dnsLookupConfig.Name, err)
    86  	}
    87  	create = func() measurement.Measurement { return createProber(metricsServerLatencyConfig) }
    88  	if err := measurement.Register(metricsServerLatencyConfig.Name, create); err != nil {
    89  		klog.Errorf("cannot register %s: %v", metricsServerLatencyConfig.Name, err)
    90  	}
    91  }
    92  
// proberConfig is the static description of a single prober measurement.
type proberConfig struct {
	// Name is the name the measurement is registered under; it is also what
	// probesMeasurement.String returns.
	Name string
	// MetricVersion is written to the Version field of the produced PerfData.
	MetricVersion string
	// Query is a format string with a single verb that prepareQuery fills
	// with the measurement duration.
	Query string
	// Manifests is the manifest pattern, relative to manifestsPathPrefix,
	// that is applied when the prober is started.
	Manifests string
	// ProbeLabelValues are the values matched against the "job" label of
	// Prometheus targets when checking that the probes are ready.
	ProbeLabelValues []string
}
   100  
   101  func createProber(config proberConfig) measurement.Measurement {
   102  	return &probesMeasurement{
   103  		config: config,
   104  	}
   105  }
   106  
// probesMeasurement is the measurement.Measurement implementation shared by
// all probers; config selects which prober an instance represents.
type probesMeasurement struct {
	// config is the static prober description supplied to createProber.
	config proberConfig

	// framework is set by initialize; Dispose treats a nil framework as
	// "probe was never started".
	framework *framework.Framework
	// replicasPerProbe is read from the "replicasPerProbe" param and is used
	// to compute the number of expected Prometheus targets.
	replicasPerProbe int
	// templateMapping holds the values passed to the manifest templates.
	templateMapping map[string]interface{}
	// startTime is set once start succeeds; a zero value means the
	// measurement has not been started.
	startTime time.Time
}
   115  
   116  // Execute supports two actions:
   117  // - start - starts probes and sets up monitoring
   118  // - gather - Gathers and prints metrics.
   119  func (p *probesMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
   120  	if !config.CloudProvider.Features().SupportProbe {
   121  		klog.V(1).Infof("%s: Probes cannot work in %s, skipping the measurement!", p, config.CloudProvider.Name())
   122  		return nil, nil
   123  	}
   124  	if config.PrometheusFramework == nil {
   125  		klog.Warningf("%s: Prometheus is disabled, skipping the measurement!", p)
   126  		return nil, nil
   127  	}
   128  
   129  	action, err := util.GetString(config.Params, "action")
   130  	if err != nil {
   131  		return nil, err
   132  	}
   133  	switch action {
   134  	case "start":
   135  		return nil, p.start(config)
   136  	case "gather":
   137  		summary, err := p.gather(config.Params)
   138  		if err != nil && !errors.IsMetricViolationError(err) {
   139  			return nil, err
   140  		}
   141  		return []measurement.Summary{summary}, err
   142  	default:
   143  		return nil, fmt.Errorf("unknown action %v", action)
   144  	}
   145  }
   146  
   147  // Dispose cleans up after the measurement.
   148  func (p *probesMeasurement) Dispose() {
   149  	if p.framework == nil {
   150  		klog.V(1).Infof("Probe %s wasn't started, skipping the Dispose() step", p)
   151  		return
   152  	}
   153  	klog.V(2).Infof("Stopping %s probe...", p)
   154  	k8sClient := p.framework.GetClientSets().GetClient()
   155  	if err := client.DeleteNamespace(k8sClient, probesNamespace); err != nil {
   156  		klog.Errorf("error while deleting %s namespace: %v", probesNamespace, err)
   157  	}
   158  	if err := client.WaitForDeleteNamespace(k8sClient, probesNamespace, client.DefaultNamespaceDeletionTimeout); err != nil {
   159  		klog.Errorf("error while waiting for %s namespace to be deleted: %v", probesNamespace, err)
   160  	}
   161  }
   162  
   163  // String returns string representation of this measurement.
   164  func (p *probesMeasurement) String() string {
   165  	return p.config.Name
   166  }
   167  
   168  func (p *probesMeasurement) initialize(config *measurement.Config) error {
   169  	replicasPerProbe, err := util.GetInt(config.Params, "replicasPerProbe")
   170  	if err != nil {
   171  		return err
   172  	}
   173  	pingSleepDuration, err := util.GetDuration(config.Params, "pingSleepDuration")
   174  	if err != nil {
   175  		pingSleepDuration = defaultPingSleepDuration
   176  	}
   177  	p.framework = config.ClusterFramework
   178  	p.replicasPerProbe = replicasPerProbe
   179  	p.templateMapping = map[string]interface{}{"Replicas": replicasPerProbe, "PingSleepDuration": pingSleepDuration}
   180  	return nil
   181  }
   182  
   183  func (p *probesMeasurement) start(config *measurement.Config) error {
   184  	klog.V(2).Infof("Starting %s probe...", p)
   185  	if !p.startTime.IsZero() {
   186  		return fmt.Errorf("measurement %s cannot be started twice", p)
   187  	}
   188  	if err := p.initialize(config); err != nil {
   189  		return err
   190  	}
   191  	k8sClient := p.framework.GetClientSets().GetClient()
   192  	if err := client.CreateNamespace(k8sClient, probesNamespace); err != nil {
   193  		return err
   194  	}
   195  	if err := p.createProbesObjects(); err != nil {
   196  		return err
   197  	}
   198  	if err := p.waitForProbesReady(config); err != nil {
   199  		return err
   200  	}
   201  	p.startTime = time.Now()
   202  	return nil
   203  }
   204  
   205  func (p *probesMeasurement) gather(params map[string]interface{}) (measurement.Summary, error) {
   206  	klog.V(2).Info("Gathering metrics from probes...")
   207  	if p.startTime.IsZero() {
   208  		return nil, fmt.Errorf("measurement %s has not been started", p)
   209  	}
   210  	threshold, err := util.GetDurationOrDefault(params, "threshold", 0)
   211  	if err != nil {
   212  		return nil, err
   213  	}
   214  	measurementEnd := time.Now()
   215  
   216  	query := prepareQuery(p.config.Query, p.startTime, measurementEnd)
   217  	pc := prom.NewInClusterPrometheusClient(p.framework.GetClientSets().GetClient())
   218  	executor := measurementutil.NewQueryExecutor(pc)
   219  	samples, err := executor.Query(query, measurementEnd)
   220  	if err != nil {
   221  		return nil, err
   222  	}
   223  
   224  	latency, err := measurementutil.NewLatencyMetricPrometheus(samples)
   225  	if err != nil {
   226  		return nil, err
   227  	}
   228  
   229  	var violation error
   230  	prefix, suffix := "", ""
   231  	if threshold > 0 {
   232  		suffix = fmt.Sprintf(", expected perc99 <= %v", threshold)
   233  		if err := latency.VerifyThreshold(threshold); err != nil {
   234  			violation = errors.NewMetricViolationError(p.String(), err.Error())
   235  			prefix = " WARNING"
   236  		}
   237  	}
   238  	klog.V(2).Infof("%s:%s got %v%s", p, prefix, latency, suffix)
   239  
   240  	summary, err := p.createSummary(*latency)
   241  	if err != nil {
   242  		return nil, err
   243  	}
   244  	return summary, violation
   245  }
   246  
   247  func (p *probesMeasurement) createProbesObjects() error {
   248  	return p.framework.ApplyTemplatedManifests(manifestsFS, path.Join(manifestsPathPrefix, p.config.Manifests), p.templateMapping)
   249  }
   250  
   251  func (p *probesMeasurement) waitForProbesReady(config *measurement.Config) error {
   252  	klog.V(2).Infof("Waiting for Probe %s to become ready...", p)
   253  	checkProbesReadyTimeout, err := util.GetDurationOrDefault(config.Params, "checkProbesReadyTimeout", defaultCheckProbesReadyTimeout)
   254  	if err != nil {
   255  		return err
   256  	}
   257  	return wait.Poll(checkProbesReadyInterval, checkProbesReadyTimeout, p.checkProbesReady)
   258  }
   259  
   260  func (p *probesMeasurement) checkProbesReady() (bool, error) {
   261  	// TODO(mm4tt): Using prometheus targets to check whether probes are up is a bit hacky.
   262  	//              Consider rewriting this to something more intuitive.
   263  	selector := func(t prometheus.Target) bool {
   264  		for _, value := range p.config.ProbeLabelValues {
   265  			// NOTE(oxddr): Prometheus does some translation of labels. Labels here are not the same as labels defined on a monitored pod.
   266  			if t.Labels["job"] == value && t.Labels["namespace"] == probesNamespace {
   267  				return true
   268  			}
   269  		}
   270  		return false
   271  	}
   272  	expectedTargets := p.replicasPerProbe * len(p.config.ProbeLabelValues)
   273  	return prometheus.CheckAllTargetsReady(
   274  		p.framework.GetClientSets().GetClient(), selector, expectedTargets)
   275  }
   276  
   277  func (p *probesMeasurement) createSummary(latency measurementutil.LatencyMetric) (measurement.Summary, error) {
   278  	content, err := util.PrettyPrintJSON(&measurementutil.PerfData{
   279  		Version:   p.config.MetricVersion,
   280  		DataItems: []measurementutil.DataItem{latency.ToPerfData(p.String())},
   281  	})
   282  	if err != nil {
   283  		return nil, err
   284  	}
   285  	return measurement.CreateSummary(p.String(), "json", content), nil
   286  }
   287  
   288  func prepareQuery(queryTemplate string, startTime, endTime time.Time) string {
   289  	measurementDuration := endTime.Sub(startTime)
   290  	return fmt.Sprintf(queryTemplate, measurementutil.ToPrometheusTime(measurementDuration))
   291  }