k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/scheduler_latency.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/prometheus/common/model"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	clientset "k8s.io/client-go/kubernetes"
    29  	"k8s.io/klog/v2"
    30  	schedulermetric "k8s.io/kubernetes/pkg/scheduler/metrics"
    31  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    32  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    33  	"k8s.io/perf-tests/clusterloader2/pkg/provider"
    34  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    35  )
    36  
const (
	// schedulerLatencyMetricName is the name this measurement is registered
	// under; it is also used as the name of the summary it produces.
	schedulerLatencyMetricName = "SchedulingMetrics"

	// Names of the scheduler histogram bucket series that are picked out of
	// the scraped metrics. Each is the scheduler subsystem prefix followed by
	// a fixed suffix.
	e2eSchedulingDurationMetricName           = model.LabelValue(schedulermetric.SchedulerSubsystem + "_e2e_scheduling_duration_seconds_bucket")
	schedulingAlgorithmDurationMetricName     = model.LabelValue(schedulermetric.SchedulerSubsystem + "_scheduling_algorithm_duration_seconds_bucket")
	frameworkExtensionPointDurationMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_framework_extension_point_duration_seconds_bucket")
	preemptionEvaluationMetricName            = model.LabelValue(schedulermetric.SchedulerSubsystem + "_scheduling_algorithm_preemption_evaluation_seconds_bucket")

	// singleRestCallTimeout bounds a single metrics request sent to the
	// scheduler through the API server.
	singleRestCallTimeout = 5 * time.Minute

	// kubeSchedulerPort is the default port for the scheduler status server.
	kubeSchedulerPort = 10259
)
    50  
var (
	// extentionsPoints lists the scheduler framework extension points for
	// which duration histograms are collected and reported. Samples whose
	// "extension_point" label is not in this list are ignored when the
	// scheduler metrics are parsed.
	extentionsPoints = []string{
		"PreFilter",
		"Filter",
		"PostFilter",
		"PreScore",
		"Score",
		"PreBind",
		"Bind",
		"PostBind",
		"Reserve",
		"Unreserve",
		"Permit",
	}
)
    66  
    67  func init() {
    68  	if err := measurement.Register(schedulerLatencyMetricName, createSchedulerLatencyMeasurement); err != nil {
    69  		klog.Fatalf("Cannot register %s: %v", schedulerLatencyMetricName, err)
    70  	}
    71  }
    72  
    73  func createSchedulerLatencyMeasurement() measurement.Measurement {
    74  	return &schedulerLatencyMeasurement{}
    75  }
    76  
// schedulerLatencyMeasurement measures scheduler latency from the scheduler's
// own metrics. initialLatency holds the histograms captured by the last
// "reset"/"start" action; "gather" subtracts it from a fresh scrape so that
// only the change since that point is reported.
type schedulerLatencyMeasurement struct {
	initialLatency schedulerLatencyMetrics
}
    80  
// schedulerLatencyMetrics holds the raw histograms built from one scrape of
// the scheduler metrics: one histogram each for end-to-end scheduling
// duration, scheduling algorithm duration and preemption evaluation, plus one
// histogram per framework extension point, keyed by extension point name.
type schedulerLatencyMetrics struct {
	e2eSchedulingDurationHist           *measurementutil.Histogram
	schedulingAlgorithmDurationHist     *measurementutil.Histogram
	preemptionEvaluationHist            *measurementutil.Histogram
	frameworkExtensionPointDurationHist map[string]*measurementutil.Histogram
}
    87  
    88  // Execute supports two actions:
    89  // - reset - Resets latency data on api scheduler side.
    90  // - gather - Gathers and prints current scheduler latency data.
    91  func (s *schedulerLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    92  	provider := config.ClusterFramework.GetClusterConfig().Provider
    93  	SSHToMasterSupported := provider.Features().SupportSSHToMaster
    94  
    95  	c := config.ClusterFramework.GetClientSets().GetClient()
    96  	nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
    97  	if err != nil {
    98  		return nil, err
    99  	}
   100  
   101  	var masterRegistered = false
   102  	for _, node := range nodes.Items {
   103  		if util.LegacyIsMasterNode(&node) || util.IsControlPlaneNode(&node) {
   104  			masterRegistered = true
   105  		}
   106  	}
   107  
   108  	if provider.Features().SchedulerInsecurePortDisabled || (!SSHToMasterSupported && !masterRegistered) {
   109  		klog.Warningf("unable to fetch scheduler metrics for provider: %s", provider.Name())
   110  		return nil, nil
   111  	}
   112  
   113  	action, err := util.GetString(config.Params, "action")
   114  	if err != nil {
   115  		return nil, err
   116  	}
   117  	masterIP, err := util.GetStringOrDefault(config.Params, "masterIP", config.ClusterFramework.GetClusterConfig().GetMasterIP())
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  	masterName, err := util.GetStringOrDefault(config.Params, "masterName", config.ClusterFramework.GetClusterConfig().MasterName)
   122  	if err != nil {
   123  		return nil, err
   124  	}
   125  
   126  	switch action {
   127  	case "reset":
   128  		klog.V(2).Infof("%s: start collecting latency initial metrics in scheduler...", s)
   129  		return nil, s.getSchedulingInitialLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered)
   130  	case "start":
   131  		klog.V(2).Infof("%s: start collecting latency metrics in scheduler...", s)
   132  		return nil, s.getSchedulingInitialLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered)
   133  	case "gather":
   134  		klog.V(2).Infof("%s: gathering latency metrics in scheduler...", s)
   135  		return s.getSchedulingLatency(config.ClusterFramework.GetClientSets().GetClient(), masterIP, provider, masterName, masterRegistered)
   136  	default:
   137  		return nil, fmt.Errorf("unknown action %v", action)
   138  	}
   139  }
   140  
// Dispose cleans up after the measurement. The body is empty: this
// measurement performs no cleanup.
func (*schedulerLatencyMeasurement) Dispose() {}
   143  
// String returns string representation of this measurement, which is the
// name it is registered under (schedulerLatencyMetricName).
func (*schedulerLatencyMeasurement) String() string {
	return schedulerLatencyMetricName
}
   148  
   149  // HistogramSub is a helper function to substract two histograms
   150  func HistogramSub(finalHist, initialHist *measurementutil.Histogram) *measurementutil.Histogram {
   151  	for k := range finalHist.Buckets {
   152  		finalHist.Buckets[k] = finalHist.Buckets[k] - initialHist.Buckets[k]
   153  	}
   154  	return finalHist
   155  }
   156  
   157  func (m *schedulerLatencyMetrics) substract(sub schedulerLatencyMetrics) {
   158  	if sub.preemptionEvaluationHist != nil {
   159  		m.preemptionEvaluationHist = HistogramSub(m.preemptionEvaluationHist, sub.preemptionEvaluationHist)
   160  	}
   161  	if sub.schedulingAlgorithmDurationHist != nil {
   162  		m.schedulingAlgorithmDurationHist = HistogramSub(m.schedulingAlgorithmDurationHist, sub.schedulingAlgorithmDurationHist)
   163  	}
   164  	if sub.e2eSchedulingDurationHist != nil {
   165  		m.e2eSchedulingDurationHist = HistogramSub(m.e2eSchedulingDurationHist, sub.e2eSchedulingDurationHist)
   166  	}
   167  	for _, ep := range extentionsPoints {
   168  		if sub.frameworkExtensionPointDurationHist[ep] != nil {
   169  			m.frameworkExtensionPointDurationHist[ep] = HistogramSub(m.frameworkExtensionPointDurationHist[ep], sub.frameworkExtensionPointDurationHist[ep])
   170  		}
   171  	}
   172  }
   173  
   174  func (s *schedulerLatencyMeasurement) setQuantiles(metrics schedulerLatencyMetrics) (schedulingMetrics, error) {
   175  	result := schedulingMetrics{
   176  		FrameworkExtensionPointDuration: make(map[string]*measurementutil.LatencyMetric),
   177  	}
   178  	for _, ePoint := range extentionsPoints {
   179  		result.FrameworkExtensionPointDuration[ePoint] = &measurementutil.LatencyMetric{}
   180  	}
   181  
   182  	if err := SetQuantileFromHistogram(&result.E2eSchedulingLatency, metrics.e2eSchedulingDurationHist); err != nil {
   183  		return result, err
   184  	}
   185  	if err := SetQuantileFromHistogram(&result.SchedulingLatency, metrics.schedulingAlgorithmDurationHist); err != nil {
   186  		return result, err
   187  	}
   188  
   189  	for _, ePoint := range extentionsPoints {
   190  		if err := SetQuantileFromHistogram(result.FrameworkExtensionPointDuration[ePoint], metrics.frameworkExtensionPointDurationHist[ePoint]); err != nil {
   191  			return result, err
   192  		}
   193  	}
   194  
   195  	if err := SetQuantileFromHistogram(&result.PreemptionEvaluationLatency, metrics.preemptionEvaluationHist); err != nil {
   196  		return result, err
   197  	}
   198  	return result, nil
   199  }
   200  
   201  // getSchedulingLatency retrieves scheduler latency metrics.
   202  func (s *schedulerLatencyMeasurement) getSchedulingLatency(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) ([]measurement.Summary, error) {
   203  	schedulerMetrics, err := s.getSchedulingMetrics(c, host, provider, masterName, masterRegistered)
   204  	if err != nil {
   205  		return nil, err
   206  	}
   207  	schedulerMetrics.substract(s.initialLatency)
   208  	result, err := s.setQuantiles(schedulerMetrics)
   209  	if err != nil {
   210  		return nil, err
   211  	}
   212  	content, err := util.PrettyPrintJSON(result)
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216  	summary := measurement.CreateSummary(schedulerLatencyMetricName, "json", content)
   217  	return []measurement.Summary{summary}, nil
   218  }
   219  
   220  // getSchedulingInitialLatency retrieves initial values of scheduler latency metrics
   221  func (s *schedulerLatencyMeasurement) getSchedulingInitialLatency(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) error {
   222  	var err error
   223  	s.initialLatency, err = s.getSchedulingMetrics(c, host, provider, masterName, masterRegistered)
   224  	if err != nil {
   225  		return err
   226  	}
   227  	return nil
   228  }
   229  
   230  // getSchedulingMetrics gets scheduler latency metrics
   231  func (s *schedulerLatencyMeasurement) getSchedulingMetrics(c clientset.Interface, host string, provider provider.Provider, masterName string, masterRegistered bool) (schedulerLatencyMetrics, error) {
   232  	e2eSchedulingDurationHist := measurementutil.NewHistogram(nil)
   233  	schedulingAlgorithmDurationHist := measurementutil.NewHistogram(nil)
   234  	preemptionEvaluationHist := measurementutil.NewHistogram(nil)
   235  	frameworkExtensionPointDurationHist := make(map[string]*measurementutil.Histogram)
   236  	latencyMetrics := schedulerLatencyMetrics{
   237  		e2eSchedulingDurationHist,
   238  		schedulingAlgorithmDurationHist,
   239  		preemptionEvaluationHist,
   240  		frameworkExtensionPointDurationHist}
   241  
   242  	for _, ePoint := range extentionsPoints {
   243  		frameworkExtensionPointDurationHist[ePoint] = measurementutil.NewHistogram(nil)
   244  	}
   245  
   246  	data, err := s.sendRequestToScheduler(c, "GET", host, provider, masterName, masterRegistered)
   247  	if err != nil {
   248  		return latencyMetrics, err
   249  	}
   250  	samples, err := measurementutil.ExtractMetricSamples(data)
   251  	if err != nil {
   252  		return latencyMetrics, err
   253  	}
   254  
   255  	for _, sample := range samples {
   256  		switch sample.Metric[model.MetricNameLabel] {
   257  		case e2eSchedulingDurationMetricName:
   258  			measurementutil.ConvertSampleToHistogram(sample, e2eSchedulingDurationHist)
   259  		case schedulingAlgorithmDurationMetricName:
   260  			measurementutil.ConvertSampleToHistogram(sample, schedulingAlgorithmDurationHist)
   261  		case frameworkExtensionPointDurationMetricName:
   262  			ePoint := string(sample.Metric["extension_point"])
   263  			if _, exists := frameworkExtensionPointDurationHist[ePoint]; exists {
   264  				measurementutil.ConvertSampleToHistogram(sample, frameworkExtensionPointDurationHist[ePoint])
   265  			}
   266  		case preemptionEvaluationMetricName:
   267  			measurementutil.ConvertSampleToHistogram(sample, preemptionEvaluationHist)
   268  		}
   269  	}
   270  	return latencyMetrics, nil
   271  }
   272  
   273  // SetQuantileFromHistogram sets quantile of LatencyMetric from Histogram
   274  func SetQuantileFromHistogram(metric *measurementutil.LatencyMetric, hist *measurementutil.Histogram) error {
   275  	quantiles := []float64{0.5, 0.9, 0.99}
   276  	for _, quantile := range quantiles {
   277  		histQuantile, err := hist.Quantile(quantile)
   278  		if err != nil {
   279  			return err
   280  		}
   281  		// NaN is returned only when there are less than two buckets.
   282  		// In which case all quantiles are NaN and all latency metrics are untouched.
   283  		if !math.IsNaN(histQuantile) {
   284  			metric.SetQuantile(quantile, time.Duration(int64(histQuantile*float64(time.Second))))
   285  		}
   286  	}
   287  
   288  	return nil
   289  }
   290  
   291  // sendRequestToScheduler sends request to kube scheduler metrics
   292  func (s *schedulerLatencyMeasurement) sendRequestToScheduler(c clientset.Interface, op, host string, provider provider.Provider, masterName string, masterRegistered bool) (string, error) {
   293  	opUpper := strings.ToUpper(op)
   294  	if opUpper != "GET" && opUpper != "DELETE" {
   295  		return "", fmt.Errorf("unknown REST request")
   296  	}
   297  
   298  	var responseText string
   299  	if masterRegistered {
   300  		ctx, cancel := context.WithTimeout(context.Background(), singleRestCallTimeout)
   301  		defer cancel()
   302  
   303  		body, err := c.CoreV1().RESTClient().Verb(opUpper).
   304  			Namespace(metav1.NamespaceSystem).
   305  			Resource("pods").
   306  			Name(fmt.Sprintf("https:kube-scheduler-%v:%v", masterName, kubeSchedulerPort)).
   307  			SubResource("proxy").
   308  			Suffix("metrics").
   309  			Do(ctx).Raw()
   310  
   311  		if err != nil {
   312  			klog.Errorf("Send request to scheduler failed with err: %v", err)
   313  			return "", err
   314  		}
   315  		responseText = string(body)
   316  	} else {
   317  		cmd := "curl -X " + opUpper + " -k https://localhost:10259/metrics"
   318  		sshResult, err := measurementutil.SSH(cmd, host+":22", provider)
   319  		if err != nil || sshResult.Code != 0 {
   320  			return "", fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
   321  		}
   322  		responseText = sshResult.Stdout
   323  	}
   324  	return responseText, nil
   325  }
   326  
// schedulingMetrics is the JSON-serialized result of the measurement. Each
// field holds the latency quantiles computed from the matching scheduler
// histogram; FrameworkExtensionPointDuration is keyed by extension point name.
type schedulingMetrics struct {
	FrameworkExtensionPointDuration map[string]*measurementutil.LatencyMetric `json:"frameworkExtensionPointDuration"`
	PreemptionEvaluationLatency     measurementutil.LatencyMetric             `json:"preemptionEvaluationLatency"`
	E2eSchedulingLatency            measurementutil.LatencyMetric             `json:"e2eSchedulingLatency"`

	// SchedulingLatency tracks scheduling latency without binding, which makes
	// it easier to present the ceiling of the scheduler throughput.
	SchedulingLatency measurementutil.LatencyMetric `json:"schedulingLatency"`
}