k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/scheduling_throughput.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"sort"
    23  	"time"
    24  
    25  	clientset "k8s.io/client-go/kubernetes"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    28  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    29  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    31  )
    32  
    33  const (
    34  	schedulingThroughputMeasurementName = "SchedulingThroughput"
    35  	defaultSchedulingThroughputInterval = 5 * time.Second
    36  )
    37  
    38  func init() {
    39  	if err := measurement.Register(schedulingThroughputMeasurementName, createSchedulingThroughputMeasurement); err != nil {
    40  		klog.Fatalf("Cannot register %s: %v", schedulingThroughputMeasurementName, err)
    41  	}
    42  }
    43  
    44  func createSchedulingThroughputMeasurement() measurement.Measurement {
    45  	return &schedulingThroughputMeasurement{}
    46  }
    47  
    48  type schedulingThroughputMeasurement struct {
    49  	schedulingThroughputs []float64
    50  	isRunning             bool
    51  	stopCh                chan struct{}
    52  }
    53  
    54  // Execute supports two actions:
    55  //   - start - starts the pods scheduling observation.
    56  //     Pods can be specified by field and/or label selectors.
    57  //     If namespace is not passed by parameter, all-namespace scope is assumed.
    58  //   - gather - creates summary for observed values.
    59  func (s *schedulingThroughputMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    60  	action, err := util.GetString(config.Params, "action")
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  	switch action {
    65  	case "start":
    66  		if s.isRunning {
    67  			klog.V(3).Infof("%s: measurement already running", s)
    68  			return nil, nil
    69  		}
    70  		selector := util.NewObjectSelector()
    71  		if err := selector.Parse(config.Params); err != nil {
    72  			return nil, err
    73  		}
    74  		measurmentInterval, err := util.GetDurationOrDefault(config.Params, "measurmentInterval", defaultSchedulingThroughputInterval)
    75  		if err != nil {
    76  			return nil, err
    77  		}
    78  		s.stopCh = make(chan struct{})
    79  		return nil, s.start(config.ClusterFramework.GetClientSets().GetClient(), selector, measurmentInterval)
    80  	case "gather":
    81  		threshold, err := util.GetFloat64OrDefault(config.Params, "threshold", 0)
    82  		if err != nil {
    83  			klog.Warningf("error while getting threshold param: %v", err)
    84  		}
    85  		enableViolations, err := util.GetBoolOrDefault(config.Params, "enableViolations", true)
    86  		if err != nil {
    87  			klog.Warningf("error while getting enableViolations param: %v", err)
    88  		}
    89  		summary, err := s.gather(threshold)
    90  		if err != nil {
    91  			if !errors.IsMetricViolationError(err) {
    92  				klog.Errorf("%s gathering error: %v", config.Identifier, err)
    93  				return nil, err
    94  			}
    95  			if !enableViolations {
    96  				err = nil
    97  			}
    98  		}
    99  		return summary, err
   100  	default:
   101  		return nil, fmt.Errorf("unknown action %v", action)
   102  	}
   103  }
   104  
   105  // Dispose cleans up after the measurement.
   106  func (s *schedulingThroughputMeasurement) Dispose() {
   107  	s.stop()
   108  }
   109  
   110  // String returns a string representation of the measurement.
   111  func (*schedulingThroughputMeasurement) String() string {
   112  	return schedulingThroughputMeasurementName
   113  }
   114  
   115  func (s *schedulingThroughputMeasurement) start(clientSet clientset.Interface, selector *util.ObjectSelector, measurmentInterval time.Duration) error {
   116  	ps, err := measurementutil.NewPodStore(clientSet, selector)
   117  	if err != nil {
   118  		return fmt.Errorf("pod store creation error: %v", err)
   119  	}
   120  	s.isRunning = true
   121  	klog.V(2).Infof("%s: starting collecting throughput data", s)
   122  
   123  	go func() {
   124  		defer ps.Stop()
   125  		lastScheduledCount := 0
   126  		for {
   127  			select {
   128  			case <-s.stopCh:
   129  				return
   130  			case <-time.After(measurmentInterval):
   131  				pods, err := ps.List()
   132  				if err != nil {
   133  					// List in NewPodStore never returns error.
   134  					// TODO(mborsz): Even if this is a case now, it doesn't need to be true in future. Refactor this.
   135  					panic(fmt.Errorf("unexpected error on PodStore.List: %w", err))
   136  				}
   137  				podsStatus := measurementutil.ComputePodsStartupStatus(pods, 0, nil /* updatePodPredicate */)
   138  				throughput := float64(podsStatus.Scheduled-lastScheduledCount) / float64(measurmentInterval/time.Second)
   139  				s.schedulingThroughputs = append(s.schedulingThroughputs, throughput)
   140  				lastScheduledCount = podsStatus.Scheduled
   141  				klog.V(3).Infof("%v: %s: %d pods scheduled", s, selector.String(), lastScheduledCount)
   142  			}
   143  		}
   144  	}()
   145  	return nil
   146  }
   147  
   148  func (s *schedulingThroughputMeasurement) gather(threshold float64) ([]measurement.Summary, error) {
   149  	if !s.isRunning {
   150  		klog.Errorf("%s: measurement is not running", s)
   151  		return nil, fmt.Errorf("measurement is not running")
   152  	}
   153  	s.stop()
   154  	klog.V(2).Infof("%s: gathering data", s)
   155  
   156  	throughputSummary := &schedulingThroughput{}
   157  	if length := len(s.schedulingThroughputs); length > 0 {
   158  		sort.Float64s(s.schedulingThroughputs)
   159  		throughputSummary.Perc50 = s.schedulingThroughputs[int(math.Ceil(float64(length*50)/100))-1]
   160  		throughputSummary.Perc90 = s.schedulingThroughputs[int(math.Ceil(float64(length*90)/100))-1]
   161  		throughputSummary.Perc99 = s.schedulingThroughputs[int(math.Ceil(float64(length*99)/100))-1]
   162  		throughputSummary.Max = s.schedulingThroughputs[length-1]
   163  	}
   164  	content, err := util.PrettyPrintJSON(throughputSummary)
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  	summary := measurement.CreateSummary(schedulingThroughputMeasurementName, "json", content)
   169  	if threshold > 0 && throughputSummary.Max < threshold {
   170  		err = errors.NewMetricViolationError(
   171  			"scheduler throughput",
   172  			fmt.Sprintf("actual throughput %f lower than threshold %f", throughputSummary.Max, threshold))
   173  	}
   174  	return []measurement.Summary{summary}, err
   175  }
   176  
   177  func (s *schedulingThroughputMeasurement) stop() {
   178  	if s.isRunning {
   179  		close(s.stopCh)
   180  		s.isRunning = false
   181  	}
   182  }
   183  
   184  type schedulingThroughput struct {
   185  	Perc50 float64 `json:"perc50"`
   186  	Perc90 float64 `json:"perc90"`
   187  	Perc99 float64 `json:"perc99"`
   188  	Max    float64 `json:"max"`
   189  }