k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/slos/pod_startup_latency.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package slos
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	corev1 "k8s.io/api/core/v1"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/fields"
    27  	"k8s.io/apimachinery/pkg/runtime"
    28  	"k8s.io/apimachinery/pkg/watch"
    29  	clientset "k8s.io/client-go/kubernetes"
    30  	"k8s.io/client-go/tools/cache"
    31  	"k8s.io/client-go/util/workqueue"
    32  	"k8s.io/klog/v2"
    33  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    34  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    35  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    36  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
    37  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    38  )
    39  
    40  const (
    41  	defaultPodStartupLatencyThreshold = 5 * time.Second
    42  	defaultSchedulerName              = corev1.DefaultSchedulerName
    43  	podStartupLatencyMeasurementName  = "PodStartupLatency"
    44  	informerSyncTimeout               = time.Minute
    45  
    46  	createPhase   = "create"
    47  	schedulePhase = "schedule"
    48  	runPhase      = "run"
    49  	watchPhase    = "watch"
    50  )
    51  
    52  func init() {
    53  	if err := measurement.Register(podStartupLatencyMeasurementName, createPodStartupLatencyMeasurement); err != nil {
    54  		klog.Fatalf("cant register service %v", err)
    55  	}
    56  }
    57  
    58  func createPodStartupLatencyMeasurement() measurement.Measurement {
    59  	return &podStartupLatencyMeasurement{
    60  		selector:          util.NewObjectSelector(),
    61  		podStartupEntries: measurementutil.NewObjectTransitionTimes(podStartupLatencyMeasurementName),
    62  		podMetadata:       measurementutil.NewPodsMetadata(podStartupLatencyMeasurementName),
    63  		eventQueue:        workqueue.New(),
    64  	}
    65  }
    66  
    67  type eventData struct {
    68  	obj      interface{}
    69  	recvTime time.Time
    70  }
    71  
    72  type podStartupLatencyMeasurement struct {
    73  	selector  *util.ObjectSelector
    74  	isRunning bool
    75  	stopCh    chan struct{}
    76  	// This queue can potentially grow indefinitely, beacause we put all changes here.
    77  	// Usually it's not recommended pattern, but we need it for measuring PodStartupLatency.
    78  	eventQueue        *workqueue.Type
    79  	podStartupEntries *measurementutil.ObjectTransitionTimes
    80  	podMetadata       *measurementutil.PodsMetadata
    81  	threshold         time.Duration
    82  }
    83  
    84  // Execute supports two actions:
    85  // - start - Starts to observe pods and pods events.
    86  // - gather - Gathers and prints current pod latency data.
    87  // Does NOT support concurrency. Multiple calls to this measurement
    88  // shouldn't be done within one step.
    89  func (p *podStartupLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    90  	action, err := util.GetString(config.Params, "action")
    91  	if err != nil {
    92  		return nil, err
    93  	}
    94  
    95  	switch action {
    96  	case "start":
    97  		if err := p.selector.Parse(config.Params); err != nil {
    98  			return nil, err
    99  		}
   100  		p.threshold, err = util.GetDurationOrDefault(config.Params, "threshold", defaultPodStartupLatencyThreshold)
   101  		if err != nil {
   102  			return nil, err
   103  		}
   104  		return nil, p.start(config.ClusterFramework.GetClientSets().GetClient())
   105  	case "gather":
   106  		schedulerName, err := util.GetStringOrDefault(config.Params, "schedulerName", defaultSchedulerName)
   107  		if err != nil {
   108  			return nil, err
   109  		}
   110  		return p.gather(config.ClusterFramework.GetClientSets().GetClient(), config.Identifier, schedulerName)
   111  	default:
   112  		return nil, fmt.Errorf("unknown action %v", action)
   113  	}
   114  
   115  }
   116  
   117  // Dispose cleans up after the measurement.
   118  func (p *podStartupLatencyMeasurement) Dispose() {
   119  	p.stop()
   120  }
   121  
   122  // String returns string representation of this measurement.
   123  func (p *podStartupLatencyMeasurement) String() string {
   124  	return podStartupLatencyMeasurementName + ": " + p.selector.String()
   125  }
   126  
   127  func (p *podStartupLatencyMeasurement) start(c clientset.Interface) error {
   128  	if p.isRunning {
   129  		klog.V(2).Infof("%s: pod startup latancy measurement already running", p)
   130  		return nil
   131  	}
   132  	klog.V(2).Infof("%s: starting pod startup latency measurement...", p)
   133  	p.isRunning = true
   134  	p.stopCh = make(chan struct{})
   135  	i := informer.NewInformer(
   136  		&cache.ListWatch{
   137  			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
   138  				p.selector.ApplySelectors(&options)
   139  				return c.CoreV1().Pods(p.selector.Namespace).List(context.TODO(), options)
   140  			},
   141  			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
   142  				p.selector.ApplySelectors(&options)
   143  				return c.CoreV1().Pods(p.selector.Namespace).Watch(context.TODO(), options)
   144  			},
   145  		},
   146  		p.addEvent,
   147  	)
   148  	go p.processEvents()
   149  	return informer.StartAndSync(i, p.stopCh, informerSyncTimeout)
   150  }
   151  
   152  func (p *podStartupLatencyMeasurement) addEvent(_, obj interface{}) {
   153  	event := &eventData{obj: obj, recvTime: time.Now()}
   154  	p.eventQueue.Add(event)
   155  }
   156  
   157  func (p *podStartupLatencyMeasurement) processEvents() {
   158  	for p.processNextWorkItem() {
   159  	}
   160  }
   161  
   162  func (p *podStartupLatencyMeasurement) processNextWorkItem() bool {
   163  	item, quit := p.eventQueue.Get()
   164  	if quit {
   165  		return false
   166  	}
   167  	defer p.eventQueue.Done(item)
   168  
   169  	event, ok := item.(*eventData)
   170  	if !ok {
   171  		klog.Warningf("Couldn't convert work item to evetData: %v", item)
   172  		return true
   173  	}
   174  	p.processEvent(event)
   175  	return true
   176  }
   177  
   178  func (p *podStartupLatencyMeasurement) stop() {
   179  	if p.isRunning {
   180  		p.isRunning = false
   181  		close(p.stopCh)
   182  		p.eventQueue.ShutDown()
   183  	}
   184  }
   185  
   186  var podStartupTransitions = map[string]measurementutil.Transition{
   187  	"create_to_schedule": {
   188  		From: createPhase,
   189  		To:   schedulePhase,
   190  	},
   191  	"schedule_to_run": {
   192  		From: schedulePhase,
   193  		To:   runPhase,
   194  	},
   195  	"run_to_watch": {
   196  		From: runPhase,
   197  		To:   watchPhase,
   198  	},
   199  	"schedule_to_watch": {
   200  		From: schedulePhase,
   201  		To:   watchPhase,
   202  	},
   203  	"pod_startup": {
   204  		From: createPhase,
   205  		To:   watchPhase,
   206  	},
   207  }
   208  
   209  func podStartupTransitionsWithThreshold(threshold time.Duration) map[string]measurementutil.Transition {
   210  	result := make(map[string]measurementutil.Transition)
   211  	for key, value := range podStartupTransitions {
   212  		result[key] = value
   213  	}
   214  	podStartupTransition := result["pod_startup"]
   215  	podStartupTransition.Threshold = threshold
   216  	result["pod_startup"] = podStartupTransition
   217  	return result
   218  }
   219  
   220  type podStartupLatencyCheck struct {
   221  	namePrefix string
   222  	filter     measurementutil.KeyFilterFunc
   223  }
   224  
   225  func (p *podStartupLatencyMeasurement) gather(c clientset.Interface, identifier string, schedulerName string) ([]measurement.Summary, error) {
   226  	klog.V(2).Infof("%s: gathering pod startup latency measurement...", p)
   227  	if !p.isRunning {
   228  		return nil, fmt.Errorf("metric %s has not been started", podStartupLatencyMeasurementName)
   229  	}
   230  
   231  	p.stop()
   232  
   233  	if err := p.gatherScheduleTimes(c, schedulerName); err != nil {
   234  		return nil, err
   235  	}
   236  
   237  	checks := []podStartupLatencyCheck{
   238  		{
   239  			namePrefix: "",
   240  			filter:     measurementutil.MatchAll,
   241  		},
   242  		{
   243  			namePrefix: "Stateless",
   244  			filter:     p.podMetadata.FilterStateless,
   245  		},
   246  		{
   247  			namePrefix: "Stateful",
   248  			filter:     p.podMetadata.FilterStateful,
   249  		},
   250  	}
   251  
   252  	var summaries []measurement.Summary
   253  	var err error
   254  	for _, check := range checks {
   255  		transitions := podStartupTransitionsWithThreshold(p.threshold)
   256  		podStartupLatency := p.podStartupEntries.CalculateTransitionsLatency(transitions, check.filter)
   257  
   258  		if slosErr := podStartupLatency["pod_startup"].VerifyThreshold(p.threshold); slosErr != nil {
   259  			err = errors.NewMetricViolationError("pod startup", slosErr.Error())
   260  			klog.Errorf("%s%s: %v", check.namePrefix, p, err)
   261  		}
   262  
   263  		content, jsonErr := util.PrettyPrintJSON(measurementutil.LatencyMapToPerfData(podStartupLatency))
   264  		if jsonErr != nil {
   265  			return nil, jsonErr
   266  		}
   267  		summaryName := fmt.Sprintf("%s%s_%s", check.namePrefix, podStartupLatencyMeasurementName, identifier)
   268  		summaries = append(summaries, measurement.CreateSummary(summaryName, "json", content))
   269  	}
   270  	return summaries, err
   271  }
   272  
   273  // TODO(#2006): gatherScheduleTimes is currently listing events at the end of the test.
   274  //
   275  //	Given that events by default have 1h TTL, for measurements across longer periods
   276  //	it just returns incomplete results.
   277  //	Given that we don't 100% accuracy, we should switch to a mechanism that is similar
   278  //	to the one that slo-monitor is using (added in #1477).
   279  func (p *podStartupLatencyMeasurement) gatherScheduleTimes(c clientset.Interface, schedulerName string) error {
   280  	selector := fields.Set{
   281  		"involvedObject.kind": "Pod",
   282  		"source":              schedulerName,
   283  	}.AsSelector().String()
   284  	options := metav1.ListOptions{FieldSelector: selector}
   285  	schedEvents, err := c.CoreV1().Events(p.selector.Namespace).List(context.TODO(), options)
   286  	if err != nil {
   287  		return err
   288  	}
   289  	for _, event := range schedEvents.Items {
   290  		key := createMetaNamespaceKey(event.InvolvedObject.Namespace, event.InvolvedObject.Name)
   291  		if _, exists := p.podStartupEntries.Get(key, createPhase); exists {
   292  			if !event.EventTime.IsZero() {
   293  				p.podStartupEntries.Set(key, schedulePhase, event.EventTime.Time)
   294  			} else {
   295  				p.podStartupEntries.Set(key, schedulePhase, event.FirstTimestamp.Time)
   296  			}
   297  		}
   298  	}
   299  	return nil
   300  }
   301  
   302  func (p *podStartupLatencyMeasurement) processEvent(event *eventData) {
   303  	obj, recvTime := event.obj, event.recvTime
   304  	if obj == nil {
   305  		return
   306  	}
   307  	pod, ok := obj.(*corev1.Pod)
   308  	if !ok {
   309  		return
   310  	}
   311  
   312  	key := createMetaNamespaceKey(pod.Namespace, pod.Name)
   313  	p.podMetadata.SetStateless(key, isPodStateless(pod))
   314  
   315  	if pod.Status.Phase == corev1.PodRunning {
   316  		if _, found := p.podStartupEntries.Get(key, createPhase); !found {
   317  			p.podStartupEntries.Set(key, watchPhase, recvTime)
   318  			p.podStartupEntries.Set(key, createPhase, pod.CreationTimestamp.Time)
   319  			var startTime metav1.Time
   320  			for _, cs := range pod.Status.ContainerStatuses {
   321  				if cs.State.Running != nil {
   322  					if startTime.Before(&cs.State.Running.StartedAt) {
   323  						startTime = cs.State.Running.StartedAt
   324  					}
   325  				}
   326  			}
   327  			if startTime != metav1.NewTime(time.Time{}) {
   328  				p.podStartupEntries.Set(key, runPhase, startTime.Time)
   329  			} else {
   330  				klog.Errorf("%s: pod %v (%v) is reported to be running, but none of its containers is", p, pod.Name, pod.Namespace)
   331  			}
   332  		}
   333  	}
   334  }
   335  
   336  func createMetaNamespaceKey(namespace, name string) string {
   337  	return namespace + "/" + name
   338  }
   339  
   340  func isPodStateless(pod *corev1.Pod) bool {
   341  	for _, volume := range pod.Spec.Volumes {
   342  		if volume.EmptyDir != nil || volume.DownwardAPI != nil || volume.ConfigMap != nil || volume.Secret != nil || volume.Projected != nil {
   343  			continue
   344  		}
   345  		klog.V(4).Infof("pod %s/%s classified as stateful", pod.Namespace, pod.Name)
   346  		return false
   347  	}
   348  	return true
   349  }