k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/wait_for_jobs.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "context" 21 "fmt" 22 "strings" 23 "sync" 24 "time" 25 26 batchv1 "k8s.io/api/batch/v1" 27 corev1 "k8s.io/api/core/v1" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/apimachinery/pkg/runtime" 30 "k8s.io/apimachinery/pkg/util/sets" 31 "k8s.io/apimachinery/pkg/util/wait" 32 "k8s.io/apimachinery/pkg/watch" 33 "k8s.io/client-go/tools/cache" 34 "k8s.io/klog/v2" 35 36 "k8s.io/perf-tests/clusterloader2/pkg/framework" 37 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 38 "k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer" 39 "k8s.io/perf-tests/clusterloader2/pkg/measurement/util/workerqueue" 40 "k8s.io/perf-tests/clusterloader2/pkg/util" 41 ) 42 43 const ( 44 defaultWaitForFinishedJobsTimeout = 10 * time.Minute 45 waitForFinishedJobsName = "WaitForFinishedJobs" 46 waitForFinishedJobsWorkers = 1 47 checkFinishedJobsInterval = time.Second 48 ) 49 50 func init() { 51 if err := measurement.Register(waitForFinishedJobsName, createWaitForFinishedJobsMeasurement); err != nil { 52 klog.Fatalf("Cannot register %s: %v", waitForFinishedJobsName, err) 53 } 54 } 55 56 func createWaitForFinishedJobsMeasurement() measurement.Measurement { 57 return &waitForFinishedJobsMeasurement{ 58 selector: util.NewObjectSelector(), 59 queue: workerqueue.NewWorkerQueue(waitForFinishedJobsWorkers), 60 finishedJobs: make(map[string]batchv1.JobConditionType), 61 } 62 } 63 64 type waitForFinishedJobsMeasurement struct { 65 selector *util.ObjectSelector 66 67 queue workerqueue.Interface 68 isRunning bool 69 clusterFramework *framework.Framework 70 cancel context.CancelFunc 71 72 // lock guards finishedJobs. 73 lock sync.Mutex 74 finishedJobs map[string]batchv1.JobConditionType 75 } 76 77 func (w *waitForFinishedJobsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 78 w.clusterFramework = config.ClusterFramework 79 80 action, err := util.GetString(config.Params, "action") 81 if err != nil { 82 return nil, err 83 } 84 85 switch action { 86 case "start": 87 if err = w.selector.Parse(config.Params); err != nil { 88 return nil, err 89 } 90 return nil, w.start() 91 case "gather": 92 timeout, err := util.GetDurationOrDefault(config.Params, "timeout", defaultWaitForFinishedJobsTimeout) 93 if err != nil { 94 return nil, err 95 } 96 return nil, w.gather(timeout) 97 default: 98 return nil, fmt.Errorf("unknown action %v", action) 99 } 100 } 101 102 func (w *waitForFinishedJobsMeasurement) Dispose() { 103 if !w.isRunning { 104 return 105 } 106 w.isRunning = false 107 w.queue.Stop() 108 w.cancel() 109 } 110 111 func (w *waitForFinishedJobsMeasurement) String() string { 112 return waitForFinishedJobsName 113 } 114 115 // start starts a job informer and queues the updates for evaluation. 116 func (w *waitForFinishedJobsMeasurement) start() error { 117 if w.isRunning { 118 klog.V(2).Infof("%v: wait for finished jobs measurement already running", w) 119 return nil 120 } 121 klog.V(2).Infof("%v: starting wait for finished jobs measurement...", w) 122 w.isRunning = true 123 ctx, cancel := context.WithCancel(context.Background()) 124 w.cancel = cancel 125 c := w.clusterFramework.GetClientSets().GetClient() 126 inf := informer.NewInformer( 127 &cache.ListWatch{ 128 ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { 129 w.selector.ApplySelectors(&options) 130 return c.BatchV1().Jobs(w.selector.Namespace).List(ctx, options) 131 }, 132 WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { 133 w.selector.ApplySelectors(&options) 134 return c.BatchV1().Jobs(w.selector.Namespace).Watch(ctx, options) 135 }, 136 }, 137 func(oldObj, newObj interface{}) { 138 f := func() { 139 w.handleObject(oldObj, newObj) 140 } 141 w.queue.Add(&f) 142 }, 143 ) 144 return informer.StartAndSync(inf, ctx.Done(), informerSyncTimeout) 145 } 146 147 // gather waits for all the existing jobs to finish and reports how many 148 // completed and how many failed. 149 func (w *waitForFinishedJobsMeasurement) gather(timeout time.Duration) error { 150 if !w.isRunning { 151 return fmt.Errorf("%v: wait for finished jobs was not started", w) 152 } 153 klog.V(2).Infof("%v: waiting for finished jobs measurement...", w) 154 jobKeys, err := w.jobKeys() 155 if err != nil { 156 return err 157 } 158 159 cond := func() (bool, error) { 160 w.lock.Lock() 161 defer w.lock.Unlock() 162 finishedKeys := make(sets.String, len(w.finishedJobs)) 163 for k := range w.finishedJobs { 164 finishedKeys.Insert(k) 165 } 166 return jobKeys.Equal(finishedKeys), nil 167 } 168 if err := wait.Poll(checkFinishedJobsInterval, timeout, cond); err != nil { 169 klog.V(2).Infof("Timed out waiting for all jobs to finish: %v", err) 170 } 171 completed := 0 172 failed := 0 173 timedOut := sets.NewString() 174 w.lock.Lock() 175 defer w.lock.Unlock() 176 for key := range jobKeys { 177 if cond, ok := w.finishedJobs[key]; !ok { 178 timedOut.Insert(key) 179 } else if cond == batchv1.JobComplete { 180 completed++ 181 } else if cond == batchv1.JobFailed { 182 failed++ 183 } 184 } 185 if timedOut.Len() != 0 { 186 return fmt.Errorf("%d Jobs timed out: %s", timedOut.Len(), strings.Join(timedOut.List(), ", ")) 187 } 188 klog.V(2).Infof("%v: %d/%d Jobs finished, %d completed, %d failed", w, completed+failed, len(jobKeys), completed, failed) 189 return nil 190 } 191 192 // handleObject casts the objects into Jobs and records their finished status. 193 func (w *waitForFinishedJobsMeasurement) handleObject(oldObj, newObj interface{}) { 194 var oldJob, newJob *batchv1.Job 195 var ok bool 196 switch cast := oldObj.(type) { 197 case *batchv1.Job: 198 oldJob = cast 199 ok = true 200 case cache.DeletedFinalStateUnknown: 201 oldJob, ok = cast.Obj.(*batchv1.Job) 202 } 203 if oldObj != nil && !ok { 204 klog.Errorf("%v: uncastable old object: %v", w, oldObj) 205 } 206 newJob, ok = newObj.(*batchv1.Job) 207 if newObj != nil && !ok { 208 klog.Errorf("%v: uncastable new object: %v", w, newObj) 209 return 210 } 211 handleJob := newJob 212 if newJob == nil { 213 handleJob = oldJob 214 } 215 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(handleJob) 216 if err != nil { 217 klog.Errorf("Failed obtaining meta key for Job: %v", err) 218 return 219 } 220 completed, condition := finishedJobCondition(newJob) 221 222 w.lock.Lock() 223 defer w.lock.Unlock() 224 if completed { 225 w.finishedJobs[key] = condition 226 } else { 227 delete(w.finishedJobs, key) 228 } 229 } 230 231 // jobKeys returns the keys of all the Jobs in the client the match the selector. 232 func (w *waitForFinishedJobsMeasurement) jobKeys() (sets.String, error) { 233 objs, err := w.clusterFramework.GetClientSets().GetClient().BatchV1().Jobs(w.selector.Namespace).List(context.Background(), metav1.ListOptions{ 234 LabelSelector: w.selector.LabelSelector, 235 FieldSelector: w.selector.FieldSelector, 236 }) 237 if err != nil { 238 return nil, fmt.Errorf("listing jobs: %w", err) 239 } 240 keys := sets.NewString() 241 for _, j := range objs.Items { 242 key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(&j) 243 if err != nil { 244 return nil, fmt.Errorf("getting key for Job: %w", err) 245 } 246 keys.Insert(key) 247 } 248 return keys, nil 249 } 250 251 // finishedJobCondition returns whether the job finished and with what condition. 252 func finishedJobCondition(j *batchv1.Job) (bool, batchv1.JobConditionType) { 253 if j == nil { 254 return false, "" 255 } 256 for _, cond := range j.Status.Conditions { 257 if cond.Status != corev1.ConditionTrue { 258 continue 259 } 260 261 if cond.Type == batchv1.JobComplete || cond.Type == batchv1.JobFailed { 262 return true, cond.Type 263 } 264 } 265 return false, "" 266 }