k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/wait_for_jobs.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"sync"
    24  	"time"
    25  
    26  	batchv1 "k8s.io/api/batch/v1"
    27  	corev1 "k8s.io/api/core/v1"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/runtime"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	"k8s.io/apimachinery/pkg/util/wait"
    32  	"k8s.io/apimachinery/pkg/watch"
    33  	"k8s.io/client-go/tools/cache"
    34  	"k8s.io/klog/v2"
    35  
    36  	"k8s.io/perf-tests/clusterloader2/pkg/framework"
    37  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    38  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
    39  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/workerqueue"
    40  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    41  )
    42  
    43  const (
    44  	defaultWaitForFinishedJobsTimeout = 10 * time.Minute
    45  	waitForFinishedJobsName           = "WaitForFinishedJobs"
    46  	waitForFinishedJobsWorkers        = 1
    47  	checkFinishedJobsInterval         = time.Second
    48  )
    49  
    50  func init() {
    51  	if err := measurement.Register(waitForFinishedJobsName, createWaitForFinishedJobsMeasurement); err != nil {
    52  		klog.Fatalf("Cannot register %s: %v", waitForFinishedJobsName, err)
    53  	}
    54  }
    55  
    56  func createWaitForFinishedJobsMeasurement() measurement.Measurement {
    57  	return &waitForFinishedJobsMeasurement{
    58  		selector:     util.NewObjectSelector(),
    59  		queue:        workerqueue.NewWorkerQueue(waitForFinishedJobsWorkers),
    60  		finishedJobs: make(map[string]batchv1.JobConditionType),
    61  	}
    62  }
    63  
    64  type waitForFinishedJobsMeasurement struct {
    65  	selector *util.ObjectSelector
    66  
    67  	queue            workerqueue.Interface
    68  	isRunning        bool
    69  	clusterFramework *framework.Framework
    70  	cancel           context.CancelFunc
    71  
    72  	// lock guards finishedJobs.
    73  	lock         sync.Mutex
    74  	finishedJobs map[string]batchv1.JobConditionType
    75  }
    76  
    77  func (w *waitForFinishedJobsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    78  	w.clusterFramework = config.ClusterFramework
    79  
    80  	action, err := util.GetString(config.Params, "action")
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  
    85  	switch action {
    86  	case "start":
    87  		if err = w.selector.Parse(config.Params); err != nil {
    88  			return nil, err
    89  		}
    90  		return nil, w.start()
    91  	case "gather":
    92  		timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultWaitForFinishedJobsTimeout)
    93  		if err != nil {
    94  			return nil, err
    95  		}
    96  		return nil, w.gather(timeout)
    97  	default:
    98  		return nil, fmt.Errorf("unknown action %v", action)
    99  	}
   100  }
   101  
   102  func (w *waitForFinishedJobsMeasurement) Dispose() {
   103  	if !w.isRunning {
   104  		return
   105  	}
   106  	w.isRunning = false
   107  	w.queue.Stop()
   108  	w.cancel()
   109  }
   110  
   111  func (w *waitForFinishedJobsMeasurement) String() string {
   112  	return waitForFinishedJobsName
   113  }
   114  
   115  // start starts a job informer and queues the updates for evaluation.
   116  func (w *waitForFinishedJobsMeasurement) start() error {
   117  	if w.isRunning {
   118  		klog.V(2).Infof("%v: wait for finished jobs measurement already running", w)
   119  		return nil
   120  	}
   121  	klog.V(2).Infof("%v: starting wait for finished jobs measurement...", w)
   122  	w.isRunning = true
   123  	ctx, cancel := context.WithCancel(context.Background())
   124  	w.cancel = cancel
   125  	c := w.clusterFramework.GetClientSets().GetClient()
   126  	inf := informer.NewInformer(
   127  		&cache.ListWatch{
   128  			ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
   129  				w.selector.ApplySelectors(&options)
   130  				return c.BatchV1().Jobs(w.selector.Namespace).List(ctx, options)
   131  			},
   132  			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
   133  				w.selector.ApplySelectors(&options)
   134  				return c.BatchV1().Jobs(w.selector.Namespace).Watch(ctx, options)
   135  			},
   136  		},
   137  		func(oldObj, newObj interface{}) {
   138  			f := func() {
   139  				w.handleObject(oldObj, newObj)
   140  			}
   141  			w.queue.Add(&f)
   142  		},
   143  	)
   144  	return informer.StartAndSync(inf, ctx.Done(), informerSyncTimeout)
   145  }
   146  
   147  // gather waits for all the existing jobs to finish and reports how many
   148  // completed and how many failed.
   149  func (w *waitForFinishedJobsMeasurement) gather(timeout time.Duration) error {
   150  	if !w.isRunning {
   151  		return fmt.Errorf("%v: wait for finished jobs was not started", w)
   152  	}
   153  	klog.V(2).Infof("%v: waiting for finished jobs measurement...", w)
   154  	jobKeys, err := w.jobKeys()
   155  	if err != nil {
   156  		return err
   157  	}
   158  
   159  	cond := func() (bool, error) {
   160  		w.lock.Lock()
   161  		defer w.lock.Unlock()
   162  		finishedKeys := make(sets.String, len(w.finishedJobs))
   163  		for k := range w.finishedJobs {
   164  			finishedKeys.Insert(k)
   165  		}
   166  		return jobKeys.Equal(finishedKeys), nil
   167  	}
   168  	if err := wait.Poll(checkFinishedJobsInterval, timeout, cond); err != nil {
   169  		klog.V(2).Infof("Timed out waiting for all jobs to finish: %v", err)
   170  	}
   171  	completed := 0
   172  	failed := 0
   173  	timedOut := sets.NewString()
   174  	w.lock.Lock()
   175  	defer w.lock.Unlock()
   176  	for key := range jobKeys {
   177  		if cond, ok := w.finishedJobs[key]; !ok {
   178  			timedOut.Insert(key)
   179  		} else if cond == batchv1.JobComplete {
   180  			completed++
   181  		} else if cond == batchv1.JobFailed {
   182  			failed++
   183  		}
   184  	}
   185  	if timedOut.Len() != 0 {
   186  		return fmt.Errorf("%d Jobs timed out: %s", timedOut.Len(), strings.Join(timedOut.List(), ", "))
   187  	}
   188  	klog.V(2).Infof("%v: %d/%d Jobs finished, %d completed, %d failed", w, completed+failed, len(jobKeys), completed, failed)
   189  	return nil
   190  }
   191  
   192  // handleObject casts the objects into Jobs and records their finished status.
   193  func (w *waitForFinishedJobsMeasurement) handleObject(oldObj, newObj interface{}) {
   194  	var oldJob, newJob *batchv1.Job
   195  	var ok bool
   196  	switch cast := oldObj.(type) {
   197  	case *batchv1.Job:
   198  		oldJob = cast
   199  		ok = true
   200  	case cache.DeletedFinalStateUnknown:
   201  		oldJob, ok = cast.Obj.(*batchv1.Job)
   202  	}
   203  	if oldObj != nil && !ok {
   204  		klog.Errorf("%v: uncastable old object: %v", w, oldObj)
   205  	}
   206  	newJob, ok = newObj.(*batchv1.Job)
   207  	if newObj != nil && !ok {
   208  		klog.Errorf("%v: uncastable new object: %v", w, newObj)
   209  		return
   210  	}
   211  	handleJob := newJob
   212  	if newJob == nil {
   213  		handleJob = oldJob
   214  	}
   215  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(handleJob)
   216  	if err != nil {
   217  		klog.Errorf("Failed obtaining meta key for Job: %v", err)
   218  		return
   219  	}
   220  	completed, condition := finishedJobCondition(newJob)
   221  
   222  	w.lock.Lock()
   223  	defer w.lock.Unlock()
   224  	if completed {
   225  		w.finishedJobs[key] = condition
   226  	} else {
   227  		delete(w.finishedJobs, key)
   228  	}
   229  }
   230  
   231  // jobKeys returns the keys of all the Jobs in the client the match the selector.
   232  func (w *waitForFinishedJobsMeasurement) jobKeys() (sets.String, error) {
   233  	objs, err := w.clusterFramework.GetClientSets().GetClient().BatchV1().Jobs(w.selector.Namespace).List(context.Background(), metav1.ListOptions{
   234  		LabelSelector: w.selector.LabelSelector,
   235  		FieldSelector: w.selector.FieldSelector,
   236  	})
   237  	if err != nil {
   238  		return nil, fmt.Errorf("listing jobs: %w", err)
   239  	}
   240  	keys := sets.NewString()
   241  	for _, j := range objs.Items {
   242  		key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(&j)
   243  		if err != nil {
   244  			return nil, fmt.Errorf("getting key for Job: %w", err)
   245  		}
   246  		keys.Insert(key)
   247  	}
   248  	return keys, nil
   249  }
   250  
   251  // finishedJobCondition returns whether the job finished and with what condition.
   252  func finishedJobCondition(j *batchv1.Job) (bool, batchv1.JobConditionType) {
   253  	if j == nil {
   254  		return false, ""
   255  	}
   256  	for _, cond := range j.Status.Conditions {
   257  		if cond.Status != corev1.ConditionTrue {
   258  			continue
   259  		}
   260  
   261  		if cond.Type == batchv1.JobComplete || cond.Type == batchv1.JobFailed {
   262  			return true, cond.Type
   263  		}
   264  	}
   265  	return false, ""
   266  }