k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/probes/probes.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package probes 18 19 import ( 20 "embed" 21 "fmt" 22 "path" 23 "time" 24 25 "k8s.io/apimachinery/pkg/util/wait" 26 "k8s.io/klog/v2" 27 "k8s.io/perf-tests/clusterloader2/pkg/errors" 28 "k8s.io/perf-tests/clusterloader2/pkg/framework" 29 "k8s.io/perf-tests/clusterloader2/pkg/framework/client" 30 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 31 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 32 "k8s.io/perf-tests/clusterloader2/pkg/prometheus" 33 prom "k8s.io/perf-tests/clusterloader2/pkg/prometheus/clients" 34 "k8s.io/perf-tests/clusterloader2/pkg/util" 35 ) 36 37 const ( 38 probesNamespace = "probes" 39 40 manifestsPathPrefix = "manifests/" 41 42 checkProbesReadyInterval = 15 * time.Second 43 44 defaultCheckProbesReadyTimeout = 15 * time.Minute 45 46 defaultPingSleepDuration = time.Second 47 ) 48 49 var ( 50 networkLatencyConfig = proberConfig{ 51 Name: "InClusterNetworkLatency", 52 MetricVersion: "v1", 53 Query: "quantile_over_time(0.99, probes:in_cluster_network_latency:histogram_quantile[%v])", 54 Manifests: "*.yaml", 55 ProbeLabelValues: []string{"ping-client", "ping-server"}, 56 } 57 58 dnsLookupConfig = proberConfig{ 59 Name: "DnsLookupLatency", 60 MetricVersion: "v1", 61 Query: "quantile_over_time(0.99, probes:dns_lookup_latency:histogram_quantile[%v])", 62 Manifests: "dnsLookup/*yaml", 63 ProbeLabelValues: []string{"dns"}, 64 } 65 66 metricsServerLatencyConfig = proberConfig{ 67 Name: "InClusterAPIServerRequestLatency", 68 MetricVersion: "v1", 69 Query: "quantile_over_time(0.99, probes:in_cluster_apiserver_request_latency:histogram_quantile[%v])", 70 Manifests: "metricsServer/*.yaml", 71 ProbeLabelValues: []string{"metrics-server-prober"}, 72 } 73 74 //go:embed manifests 75 manifestsFS embed.FS 76 ) 77 78 func init() { 79 create := func() measurement.Measurement { return createProber(networkLatencyConfig) } 80 if err := measurement.Register(networkLatencyConfig.Name, create); err != nil { 81 klog.Errorf("cannot register %s: %v", networkLatencyConfig.Name, err) 82 } 83 create = func() measurement.Measurement { return createProber(dnsLookupConfig) } 84 if err := measurement.Register(dnsLookupConfig.Name, create); err != nil { 85 klog.Errorf("cannot register %s: %v", dnsLookupConfig.Name, err) 86 } 87 create = func() measurement.Measurement { return createProber(metricsServerLatencyConfig) } 88 if err := measurement.Register(metricsServerLatencyConfig.Name, create); err != nil { 89 klog.Errorf("cannot register %s: %v", metricsServerLatencyConfig.Name, err) 90 } 91 } 92 93 type proberConfig struct { 94 Name string 95 MetricVersion string 96 Query string 97 Manifests string 98 ProbeLabelValues []string 99 } 100 101 func createProber(config proberConfig) measurement.Measurement { 102 return &probesMeasurement{ 103 config: config, 104 } 105 } 106 107 type probesMeasurement struct { 108 config proberConfig 109 110 framework *framework.Framework 111 replicasPerProbe int 112 templateMapping map[string]interface{} 113 startTime time.Time 114 } 115 116 // Execute supports two actions: 117 // - start - starts probes and sets up monitoring 118 // - gather - Gathers and prints metrics. 119 func (p *probesMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 120 if !config.CloudProvider.Features().SupportProbe { 121 klog.V(1).Infof("%s: Probes cannot work in %s, skipping the measurement!", p, config.CloudProvider.Name()) 122 return nil, nil 123 } 124 if config.PrometheusFramework == nil { 125 klog.Warningf("%s: Prometheus is disabled, skipping the measurement!", p) 126 return nil, nil 127 } 128 129 action, err := util.GetString(config.Params, "action") 130 if err != nil { 131 return nil, err 132 } 133 switch action { 134 case "start": 135 return nil, p.start(config) 136 case "gather": 137 summary, err := p.gather(config.Params) 138 if err != nil && !errors.IsMetricViolationError(err) { 139 return nil, err 140 } 141 return []measurement.Summary{summary}, err 142 default: 143 return nil, fmt.Errorf("unknown action %v", action) 144 } 145 } 146 147 // Dispose cleans up after the measurement. 148 func (p *probesMeasurement) Dispose() { 149 if p.framework == nil { 150 klog.V(1).Infof("Probe %s wasn't started, skipping the Dispose() step", p) 151 return 152 } 153 klog.V(2).Infof("Stopping %s probe...", p) 154 k8sClient := p.framework.GetClientSets().GetClient() 155 if err := client.DeleteNamespace(k8sClient, probesNamespace); err != nil { 156 klog.Errorf("error while deleting %s namespace: %v", probesNamespace, err) 157 } 158 if err := client.WaitForDeleteNamespace(k8sClient, probesNamespace, client.DefaultNamespaceDeletionTimeout); err != nil { 159 klog.Errorf("error while waiting for %s namespace to be deleted: %v", probesNamespace, err) 160 } 161 } 162 163 // String returns string representation of this measurement. 164 func (p *probesMeasurement) String() string { 165 return p.config.Name 166 } 167 168 func (p *probesMeasurement) initialize(config *measurement.Config) error { 169 replicasPerProbe, err := util.GetInt(config.Params, "replicasPerProbe") 170 if err != nil { 171 return err 172 } 173 pingSleepDuration, err := util.GetDuration(config.Params, "pingSleepDuration") 174 if err != nil { 175 pingSleepDuration = defaultPingSleepDuration 176 } 177 p.framework = config.ClusterFramework 178 p.replicasPerProbe = replicasPerProbe 179 p.templateMapping = map[string]interface{}{"Replicas": replicasPerProbe, "PingSleepDuration": pingSleepDuration} 180 return nil 181 } 182 183 func (p *probesMeasurement) start(config *measurement.Config) error { 184 klog.V(2).Infof("Starting %s probe...", p) 185 if !p.startTime.IsZero() { 186 return fmt.Errorf("measurement %s cannot be started twice", p) 187 } 188 if err := p.initialize(config); err != nil { 189 return err 190 } 191 k8sClient := p.framework.GetClientSets().GetClient() 192 if err := client.CreateNamespace(k8sClient, probesNamespace); err != nil { 193 return err 194 } 195 if err := p.createProbesObjects(); err != nil { 196 return err 197 } 198 if err := p.waitForProbesReady(config); err != nil { 199 return err 200 } 201 p.startTime = time.Now() 202 return nil 203 } 204 205 func (p *probesMeasurement) gather(params map[string]interface{}) (measurement.Summary, error) { 206 klog.V(2).Info("Gathering metrics from probes...") 207 if p.startTime.IsZero() { 208 return nil, fmt.Errorf("measurement %s has not been started", p) 209 } 210 threshold, err := util.GetDurationOrDefault(params, "threshold", 0) 211 if err != nil { 212 return nil, err 213 } 214 measurementEnd := time.Now() 215 216 query := prepareQuery(p.config.Query, p.startTime, measurementEnd) 217 pc := prom.NewInClusterPrometheusClient(p.framework.GetClientSets().GetClient()) 218 executor := measurementutil.NewQueryExecutor(pc) 219 samples, err := executor.Query(query, measurementEnd) 220 if err != nil { 221 return nil, err 222 } 223 224 latency, err := measurementutil.NewLatencyMetricPrometheus(samples) 225 if err != nil { 226 return nil, err 227 } 228 229 var violation error 230 prefix, suffix := "", "" 231 if threshold > 0 { 232 suffix = fmt.Sprintf(", expected perc99 <= %v", threshold) 233 if err := latency.VerifyThreshold(threshold); err != nil { 234 violation = errors.NewMetricViolationError(p.String(), err.Error()) 235 prefix = " WARNING" 236 } 237 } 238 klog.V(2).Infof("%s:%s got %v%s", p, prefix, latency, suffix) 239 240 summary, err := p.createSummary(*latency) 241 if err != nil { 242 return nil, err 243 } 244 return summary, violation 245 } 246 247 func (p *probesMeasurement) createProbesObjects() error { 248 return p.framework.ApplyTemplatedManifests(manifestsFS, path.Join(manifestsPathPrefix, p.config.Manifests), p.templateMapping) 249 } 250 251 func (p *probesMeasurement) waitForProbesReady(config *measurement.Config) error { 252 klog.V(2).Infof("Waiting for Probe %s to become ready...", p) 253 checkProbesReadyTimeout, err := util.GetDurationOrDefault(config.Params, "checkProbesReadyTimeout", defaultCheckProbesReadyTimeout) 254 if err != nil { 255 return err 256 } 257 return wait.Poll(checkProbesReadyInterval, checkProbesReadyTimeout, p.checkProbesReady) 258 } 259 260 func (p *probesMeasurement) checkProbesReady() (bool, error) { 261 // TODO(mm4tt): Using prometheus targets to check whether probes are up is a bit hacky. 262 // Consider rewriting this to something more intuitive. 263 selector := func(t prometheus.Target) bool { 264 for _, value := range p.config.ProbeLabelValues { 265 // NOTE(oxddr): Prometheus does some translation of labels. Labels here are not the same as labels defined on a monitored pod. 266 if t.Labels["job"] == value && t.Labels["namespace"] == probesNamespace { 267 return true 268 } 269 } 270 return false 271 } 272 expectedTargets := p.replicasPerProbe * len(p.config.ProbeLabelValues) 273 return prometheus.CheckAllTargetsReady( 274 p.framework.GetClientSets().GetClient(), selector, expectedTargets) 275 } 276 277 func (p *probesMeasurement) createSummary(latency measurementutil.LatencyMetric) (measurement.Summary, error) { 278 content, err := util.PrettyPrintJSON(&measurementutil.PerfData{ 279 Version: p.config.MetricVersion, 280 DataItems: []measurementutil.DataItem{latency.ToPerfData(p.String())}, 281 }) 282 if err != nil { 283 return nil, err 284 } 285 return measurement.CreateSummary(p.String(), "json", content), nil 286 } 287 288 func prepareQuery(queryTemplate string, startTime, endTime time.Time) string { 289 measurementDuration := endTime.Sub(startTime) 290 return fmt.Sprintf(queryTemplate, measurementutil.ToPrometheusTime(measurementDuration)) 291 }