k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/scheduler_latency.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "context" 21 "fmt" 22 "math" 23 "strings" 24 "time" 25 26 "github.com/prometheus/common/model" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 clientset "k8s.io/client-go/kubernetes" 29 "k8s.io/klog/v2" 30 schedulermetric "k8s.io/kubernetes/pkg/scheduler/metrics" 31 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 32 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 33 "k8s.io/perf-tests/clusterloader2/pkg/provider" 34 "k8s.io/perf-tests/clusterloader2/pkg/util" 35 ) 36 37 const ( 38 schedulerLatencyMetricName = "SchedulingMetrics" 39 40 e2eSchedulingDurationMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_e2e_scheduling_duration_seconds_bucket") 41 schedulingAlgorithmDurationMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_scheduling_algorithm_duration_seconds_bucket") 42 frameworkExtensionPointDurationMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_framework_extension_point_duration_seconds_bucket") 43 preemptionEvaluationMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_scheduling_algorithm_preemption_evaluation_seconds_bucket") 44 45 singleRestCallTimeout = 5 * time.Minute 46 47 // kubeSchedulerPort is the default port for the scheduler status server. 48 kubeSchedulerPort = 10259 49 ) 50 51 var ( 52 extentionsPoints = []string{ 53 "PreFilter", 54 "Filter", 55 "PostFilter", 56 "PreScore", 57 "Score", 58 "PreBind", 59 "Bind", 60 "PostBind", 61 "Reserve", 62 "Unreserve", 63 "Permit", 64 } 65 ) 66 67 func init() { 68 if err := measurement.Register(schedulerLatencyMetricName, createSchedulerLatencyMeasurement); err != nil { 69 klog.Fatalf("Cannot register %s: %v", schedulerLatencyMetricName, err) 70 } 71 } 72 73 func createSchedulerLatencyMeasurement() measurement.Measurement { 74 return &schedulerLatencyMeasurement{} 75 } 76 77 type schedulerLatencyMeasurement struct { 78 initialLatency schedulerLatencyMetrics 79 } 80 81 type schedulerLatencyMetrics struct { 82 e2eSchedulingDurationHist *measurementutil.Histogram 83 schedulingAlgorithmDurationHist *measurementutil.Histogram 84 preemptionEvaluationHist *measurementutil.Histogram 85 frameworkExtensionPointDurationHist map[string]*measurementutil.Histogram 86 } 87 88 // Execute supports two actions: 89 // - reset - Resets latency data on api scheduler side. 90 // - gather - Gathers and prints current scheduler latency data. 91 func (s *schedulerLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 92 provider := config.ClusterFramework.GetClusterConfig().Provider 93 SSHToMasterSupported := provider.Features().SupportSSHToMaster 94 95 c := config.ClusterFramework.GetClientSets().GetClient() 96 nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) 97 if err != nil { 98 return nil, err 99 } 100 101 var masterRegistered = false 102 for _, node := range nodes.Items { 103 if util.LegacyIsMasterNode(&node) || util.IsControlPlaneNode(&node) { 104 masterRegistered = true 105 } 106 } 107 108 if provider.Features().SchedulerInsecurePortDisabled || (!SSHToMasterSupported && !masterRegistered) { 109 klog.Warningf("unable to fetch scheduler metrics for provider: %s", provider.Name()) 110 return nil, nil 111 } 112 113 action, err := util.GetString(config.Params, "action") 114 if err != nil { 115 return nil, err 116 } 117 masterIP, err := util.GetStringOrDefault(config.Params, "masterIP", config.ClusterFramework.GetClusterConfig().GetMasterIP()) 118 if err != nil { 119 return nil, err 120 } 121 masterName, err := util.GetStringOrDefault(config.Params, "masterName", config.ClusterFramework.GetClusterConfig().MasterName) 122 if err != nil { 123 return nil, err 124 } 125 126 switch action { 127 case "reset": 128 klog.V(2).Infof("%s: start collecting latency initial metrics in scheduler...", s) 129 return nil, s.getSchedulingInitialLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered) 130 case "start": 131 klog.V(2).Infof("%s: start collecting latency metrics in scheduler...", s) 132 return nil, s.getSchedulingInitialLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered) 133 case "gather": 134 klog.V(2).Infof("%s: gathering latency metrics in scheduler...", s) 135 return s.getSchedulingLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered) 136 default: 137 return nil, fmt.Errorf("unknown action %v", action) 138 } 139 } 140 141 // Dispose cleans up after the measurement. 142 func (*schedulerLatencyMeasurement) Dispose() {} 143 144 // String returns string representation of this measurement. 145 func (*schedulerLatencyMeasurement) String() string { 146 return schedulerLatencyMetricName 147 } 148 149 // HistogramSub is a helper function to substract two histograms 150 func HistogramSub(finalHist, initialHist *measurementutil.Histogram) *measurementutil.Histogram { 151 for k := range finalHist.Buckets { 152 finalHist.Buckets[k] = finalHist.Buckets[k] - initialHist.Buckets[k] 153 } 154 return finalHist 155 } 156 157 func (m *schedulerLatencyMetrics) substract(sub schedulerLatencyMetrics) { 158 if sub.preemptionEvaluationHist != nil { 159 m.preemptionEvaluationHist = HistogramSub(m.preemptionEvaluationHist, sub.preemptionEvaluationHist) 160 } 161 if sub.schedulingAlgorithmDurationHist != nil { 162 m.schedulingAlgorithmDurationHist = HistogramSub(m.schedulingAlgorithmDurationHist, sub.schedulingAlgorithmDurationHist) 163 } 164 if sub.e2eSchedulingDurationHist != nil { 165 m.e2eSchedulingDurationHist = HistogramSub(m.e2eSchedulingDurationHist, sub.e2eSchedulingDurationHist) 166 } 167 for _, ep := range extentionsPoints { 168 if sub.frameworkExtensionPointDurationHist[ep] != nil { 169 m.frameworkExtensionPointDurationHist[ep] = HistogramSub(m.frameworkExtensionPointDurationHist[ep], sub.frameworkExtensionPointDurationHist[ep]) 170 } 171 } 172 } 173 174 func (s *schedulerLatencyMeasurement) setQuantiles(metrics schedulerLatencyMetrics) (schedulingMetrics, error) { 175 result := schedulingMetrics{ 176 FrameworkExtensionPointDuration: make(map[string]*measurementutil.LatencyMetric), 177 } 178 for _, ePoint := range extentionsPoints { 179 result.FrameworkExtensionPointDuration[ePoint] = &measurementutil.LatencyMetric{} 180 } 181 182 if err := SetQuantileFromHistogram(&result.E2eSchedulingLatency, metrics.e2eSchedulingDurationHist); err != nil { 183 return result, err 184 } 185 if err := SetQuantileFromHistogram(&result.SchedulingLatency, metrics.schedulingAlgorithmDurationHist); err != nil { 186 return result, err 187 } 188 189 for _, ePoint := range extentionsPoints { 190 if err := SetQuantileFromHistogram(result.FrameworkExtensionPointDuration[ePoint], metrics.frameworkExtensionPointDurationHist[ePoint]); err != nil { 191 return result, err 192 } 193 } 194 195 if err := SetQuantileFromHistogram(&result.PreemptionEvaluationLatency, metrics.preemptionEvaluationHist); err != nil { 196 return result, err 197 } 198 return result, nil 199 } 200 201 // getSchedulingLatency retrieves scheduler latency metrics. 202 func (s *schedulerLatencyMeasurement) getSchedulingLatency(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) ([]measurement.Summary, error) { 203 schedulerMetrics, err := s.getSchedulingMetrics(c, host, provider, masterName, masterRegistered) 204 if err != nil { 205 return nil, err 206 } 207 schedulerMetrics.substract(s.initialLatency) 208 result, err := s.setQuantiles(schedulerMetrics) 209 if err != nil { 210 return nil, err 211 } 212 content, err := util.PrettyPrintJSON(result) 213 if err != nil { 214 return nil, err 215 } 216 summary := measurement.CreateSummary(schedulerLatencyMetricName, "json", content) 217 return []measurement.Summary{summary}, nil 218 } 219 220 // getSchedulingInitialLatency retrieves initial values of scheduler latency metrics 221 func (s *schedulerLatencyMeasurement) getSchedulingInitialLatency(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) error { 222 var err error 223 s.initialLatency, err = s.getSchedulingMetrics(c, host, provider, masterName, masterRegistered) 224 if err != nil { 225 return err 226 } 227 return nil 228 } 229 230 // getSchedulingMetrics gets scheduler latency metrics 231 func (s *schedulerLatencyMeasurement) getSchedulingMetrics(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) (schedulerLatencyMetrics, error) { 232 e2eSchedulingDurationHist := measurementutil.NewHistogram(nil) 233 schedulingAlgorithmDurationHist := measurementutil.NewHistogram(nil) 234 preemptionEvaluationHist := measurementutil.NewHistogram(nil) 235 frameworkExtensionPointDurationHist := make(map[string]*measurementutil.Histogram) 236 latencyMetrics := schedulerLatencyMetrics{ 237 e2eSchedulingDurationHist, 238 schedulingAlgorithmDurationHist, 239 preemptionEvaluationHist, 240 frameworkExtensionPointDurationHist} 241 242 for _, ePoint := range extentionsPoints { 243 frameworkExtensionPointDurationHist[ePoint] = measurementutil.NewHistogram(nil) 244 } 245 246 data, err := s.sendRequestToScheduler(c, "GET", host, provider, masterName, masterRegistered) 247 if err != nil { 248 return latencyMetrics, err 249 } 250 samples, err := measurementutil.ExtractMetricSamples(data) 251 if err != nil { 252 return latencyMetrics, err 253 } 254 255 for _, sample := range samples { 256 switch sample.Metric[model.MetricNameLabel] { 257 case e2eSchedulingDurationMetricName: 258 measurementutil.ConvertSampleToHistogram(sample, e2eSchedulingDurationHist) 259 case schedulingAlgorithmDurationMetricName: 260 measurementutil.ConvertSampleToHistogram(sample, schedulingAlgorithmDurationHist) 261 case frameworkExtensionPointDurationMetricName: 262 ePoint := string(sample.Metric["extension_point"]) 263 if _, exists := frameworkExtensionPointDurationHist[ePoint]; exists { 264 measurementutil.ConvertSampleToHistogram(sample, frameworkExtensionPointDurationHist[ePoint]) 265 } 266 case preemptionEvaluationMetricName: 267 measurementutil.ConvertSampleToHistogram(sample, preemptionEvaluationHist) 268 } 269 } 270 return latencyMetrics, nil 271 } 272 273 // SetQuantileFromHistogram sets quantile of LatencyMetric from Histogram 274 func SetQuantileFromHistogram(metric *measurementutil.LatencyMetric, hist *measurementutil.Histogram) error { 275 quantiles := []float64{0.5, 0.9, 0.99} 276 for _, quantile := range quantiles { 277 histQuantile, err := hist.Quantile(quantile) 278 if err != nil { 279 return err 280 } 281 // NaN is returned only when there are less than two buckets. 282 // In which case all quantiles are NaN and all latency metrics are untouched. 283 if !math.IsNaN(histQuantile) { 284 metric.SetQuantile(quantile, time.Duration(int64(histQuantile*float64(time.Second)))) 285 } 286 } 287 288 return nil 289 } 290 291 // sendRequestToScheduler sends request to kube scheduler metrics 292 func (s *schedulerLatencyMeasurement) sendRequestToScheduler(c clientset.Interface, op, host string, provider provider.Provider, masterName string, masterRegistered bool) (string, error) { 293 opUpper := strings.ToUpper(op) 294 if opUpper != "GET" && opUpper != "DELETE" { 295 return "", fmt.Errorf("unknown REST request") 296 } 297 298 var responseText string 299 if masterRegistered { 300 ctx, cancel := context.WithTimeout(context.Background(), singleRestCallTimeout) 301 defer cancel() 302 303 body, err := c.CoreV1().RESTClient().Verb(opUpper). 304 Namespace(metav1.NamespaceSystem). 305 Resource("pods"). 306 Name(fmt.Sprintf("https:kube-scheduler-%v:%v", masterName, kubeSchedulerPort)). 307 SubResource("proxy"). 308 Suffix("metrics"). 309 Do(ctx).Raw() 310 311 if err != nil { 312 klog.Errorf("Send request to scheduler failed with err: %v", err) 313 return "", err 314 } 315 responseText = string(body) 316 } else { 317 cmd := "curl -X " + opUpper + " -k https://localhost:10259/metrics" 318 sshResult, err := measurementutil.SSH(cmd, host+":22", provider) 319 if err != nil || sshResult.Code != 0 { 320 return "", fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err) 321 } 322 responseText = sshResult.Stdout 323 } 324 return responseText, nil 325 } 326 327 type schedulingMetrics struct { 328 FrameworkExtensionPointDuration map[string]*measurementutil.LatencyMetric `json:"frameworkExtensionPointDuration"` 329 PreemptionEvaluationLatency measurementutil.LatencyMetric `json:"preemptionEvaluationLatency"` 330 E2eSchedulingLatency measurementutil.LatencyMetric `json:"e2eSchedulingLatency"` 331 332 // To track scheduling latency without binding, this allows to easier present the ceiling of the scheduler throughput. 333 SchedulingLatency measurementutil.LatencyMetric `json:"schedulingLatency"` 334 }