k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/job_lifecycle_latency.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"time"
    23  
    24  	batchv1 "k8s.io/api/batch/v1"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/runtime"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  	"k8s.io/apimachinery/pkg/watch"
    29  	clientset "k8s.io/client-go/kubernetes"
    30  	"k8s.io/client-go/tools/cache"
    31  	"k8s.io/client-go/util/workqueue"
    32  	"k8s.io/klog/v2"
    33  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    34  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    35  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
    36  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    37  )
    38  
    39  type eventData struct {
    40  	obj      interface{}
    41  	recvTime time.Time
    42  }
    43  
    44  const (
    45  	jobLifecycleLatencyMeasurementName = "JobLifecycleLatency"
    46  	checkCompletedJobsInterval         = time.Second
    47  	jobCreated                         = "JobCreated"
    48  	jobStarted                         = "JobStarted"
    49  	jobCompleted                       = "JobCompleted"
    50  )
    51  
    52  func init() {
    53  	if err := measurement.Register(jobLifecycleLatencyMeasurementName, createJobLifecycleLatencyMeasurement); err != nil {
    54  		klog.Fatalf("Can't register service %v", err)
    55  	}
    56  }
    57  
    58  func createJobLifecycleLatencyMeasurement() measurement.Measurement {
    59  	return &jobLifecycleLatencyMeasurement{
    60  		selector:        util.NewObjectSelector(),
    61  		jobStateEntries: measurementutil.NewObjectTransitionTimes(jobLifecycleLatencyMeasurementName),
    62  		eventQueue:      workqueue.New(),
    63  	}
    64  }
    65  
    66  type jobLifecycleLatencyMeasurement struct {
    67  	selector        *util.ObjectSelector
    68  	isRunning       bool
    69  	stopCh          chan struct{}
    70  	eventQueue      *workqueue.Type
    71  	jobStateEntries *measurementutil.ObjectTransitionTimes
    72  }
    73  
    74  // Execute supports two actions:
    75  // - start - Starts to observe jobs and their state transitions.
    76  // - gather - Gathers and prints job latency data.
    77  // heavily influenced by pod_startup_latency measurement
    78  func (p *jobLifecycleLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    79  	action, err := util.GetString(config.Params, "action")
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  	switch action {
    84  	case "start":
    85  		if err := p.selector.Parse(config.Params); err != nil {
    86  			return nil, err
    87  		}
    88  		return nil, p.start(config.ClusterFramework.GetClientSets().GetClient())
    89  	case "gather":
    90  		timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultWaitForFinishedJobsTimeout)
    91  		if err != nil {
    92  			return nil, err
    93  		}
    94  		return p.gather(config.ClusterFramework.GetClientSets().GetClient(), config.Identifier, timeout)
    95  	default:
    96  		return nil, fmt.Errorf("unknown action %v", action)
    97  	}
    98  
    99  }
   100  
// Dispose cleans up after the measurement by stopping it. It is safe to call
// when the measurement is not running, because stop does nothing in that case.
func (p *jobLifecycleLatencyMeasurement) Dispose() {
	p.stop()
}
   105  
   106  // String returns string representation of this measurement.
   107  func (p *jobLifecycleLatencyMeasurement) String() string {
   108  	return jobLifecycleLatencyMeasurementName + ": " + p.selector.String()
   109  }
   110  
   111  func (p *jobLifecycleLatencyMeasurement) start(c clientset.Interface) error {
   112  	if p.isRunning {
   113  		klog.V(2).Infof("%s: job lifecycle latency measurement already running", p)
   114  		return nil
   115  	}
   116  	klog.V(2).Infof("%s: starting job lifecycle latency measurement...", p)
   117  	p.isRunning = true
   118  	p.stopCh = make(chan struct{})
   119  	i := informer.NewInformer(
   120  		&cache.ListWatch{
   121  			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
   122  				p.selector.ApplySelectors(&options)
   123  				return c.BatchV1().Jobs(p.selector.Namespace).List(context.TODO(), options)
   124  			},
   125  			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
   126  				p.selector.ApplySelectors(&options)
   127  				return c.BatchV1().Jobs(p.selector.Namespace).Watch(context.TODO(), options)
   128  			},
   129  		},
   130  		p.addEvent,
   131  	)
   132  	go p.processEvents()
   133  	return informer.StartAndSync(i, p.stopCh, informerSyncTimeout)
   134  }
   135  
   136  func (p *jobLifecycleLatencyMeasurement) addEvent(_, obj interface{}) {
   137  	event := &eventData{obj: obj, recvTime: time.Now()}
   138  	p.eventQueue.Add(event)
   139  }
   140  
   141  func (p *jobLifecycleLatencyMeasurement) processEvents() {
   142  	for p.processNextWorkItem() {
   143  	}
   144  }
   145  
   146  func (p *jobLifecycleLatencyMeasurement) processNextWorkItem() bool {
   147  	item, quit := p.eventQueue.Get()
   148  	if quit {
   149  		return false
   150  	}
   151  	defer p.eventQueue.Done(item)
   152  	event, ok := item.(*eventData)
   153  	if !ok {
   154  		klog.Warningf("Couldn't convert work item to evetData: %v", item)
   155  		return true
   156  	}
   157  	p.processEvent(event)
   158  	return true
   159  }
   160  
   161  // processEvent processes job state change events:
   162  // uses Phase Latency utility to record job state transitions
   163  // it currently captures the following transitions:
   164  // JobCreated (job.CreationTimestamp.Time) -> JobStarted (job.Status.StartTime.Time)
   165  // JobStarted (job.Status.StartTime.Time) -> JobCompleted (job.Status.CompletionTime.Time)
   166  func (p *jobLifecycleLatencyMeasurement) processEvent(event *eventData) {
   167  	obj := event.obj
   168  	if obj == nil {
   169  		return
   170  	}
   171  	job, ok := obj.(*batchv1.Job)
   172  	if !ok {
   173  		return
   174  	}
   175  	key := createMetaNamespaceKey(job.Namespace, job.Name)
   176  	if _, found := p.jobStateEntries.Get(key, jobCreated); !found {
   177  		p.jobStateEntries.Set(key, jobCreated, job.CreationTimestamp.Time)
   178  	}
   179  	if job.Status.StartTime != nil {
   180  		if _, found := p.jobStateEntries.Get(key, jobStarted); !found {
   181  			p.jobStateEntries.Set(key, jobStarted, job.Status.StartTime.Time)
   182  		}
   183  	}
   184  	if job.Status.CompletionTime != nil {
   185  		if _, found := p.jobStateEntries.Get(key, jobCompleted); !found {
   186  			p.jobStateEntries.Set(key, jobCompleted, job.Status.CompletionTime.Time)
   187  		}
   188  	}
   189  }
   190  
   191  func (p *jobLifecycleLatencyMeasurement) stop() {
   192  	if p.isRunning {
   193  		p.isRunning = false
   194  		close(p.stopCh)
   195  		p.eventQueue.ShutDown()
   196  	}
   197  }
   198  
   199  var jobLifecycleTransitions = map[string]measurementutil.Transition{
   200  	"create_to_start": {
   201  		From: jobCreated,
   202  		To:   jobStarted,
   203  	},
   204  	"start_to_complete": {
   205  		From: jobStarted,
   206  		To:   jobCompleted,
   207  	},
   208  }
   209  
   210  // gather collects job lifecycle latency and calculates percentiles using Phase Latency utility
   211  // it waits for all jobs to be completed before collecting the metrics or times out
   212  func (p *jobLifecycleLatencyMeasurement) gather(c clientset.Interface, identifier string, timeout time.Duration) ([]measurement.Summary, error) {
   213  	klog.V(2).Infof("%s: gathering job lifecycle latency measurement...", p)
   214  	if !p.isRunning {
   215  		return nil, fmt.Errorf("metric %s has not been started", jobLifecycleLatencyMeasurementName)
   216  	}
   217  	condition := func() (bool, error) {
   218  		return p.jobStateEntries.Count(jobCreated) == p.jobStateEntries.Count(jobCompleted), nil
   219  	}
   220  	if err := wait.Poll(checkCompletedJobsInterval, timeout, condition); err != nil {
   221  		klog.V(2).Infof("Timed out waiting for all jobs to complete: %v", err)
   222  	}
   223  	p.stop()
   224  	jobLifecycleLatency := p.jobStateEntries.CalculateTransitionsLatency(jobLifecycleTransitions, measurementutil.MatchAll)
   225  	content, jsonErr := util.PrettyPrintJSON(measurementutil.LatencyMapToPerfData(jobLifecycleLatency))
   226  	if jsonErr != nil {
   227  		return nil, jsonErr
   228  	}
   229  	summaryName := fmt.Sprintf("%s_%s", jobLifecycleLatencyMeasurementName, identifier)
   230  	summaries := []measurement.Summary{measurement.CreateSummary(summaryName, "json", content)}
   231  	return summaries, nil
   232  }
   233  
   234  func createMetaNamespaceKey(namespace, name string) string {
   235  	return namespace + "/" + name
   236  }