k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/wait_for_controlled_pods.go

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"

	"k8s.io/perf-tests/clusterloader2/pkg/errors"
	"k8s.io/perf-tests/clusterloader2/pkg/framework"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/checker"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/runtimeobjects"
	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/workerqueue"
	"k8s.io/perf-tests/clusterloader2/pkg/util"
)

const (
	defaultSyncTimeout               = 60 * time.Second
	defaultOperationTimeout          = 10 * time.Minute
	checkControlledPodsInterval      = 5 * time.Second
	informerSyncTimeout              = time.Minute
	waitForControlledPodsRunningName = "WaitForControlledPodsRunning"

	// In this measurement, we rely on the fact that handlers are being
	// processed in order - in particular, gather() is checking whether all
	// objects up to a given resource version have already been processed.
	// To guarantee processing order we can't have more than a single
	// worker. Fortunately that doesn't change much, because almost all of
	// the handler function runs under the lock anyway.
	waitForControlledPodsWorkers = 1
)

var podIndexerFactory = &sharedPodIndexerFactory{}

func init() {
	if err := measurement.Register(waitForControlledPodsRunningName, createWaitForControlledPodsRunningMeasurement); err != nil {
		klog.Fatalf("Cannot register %s: %v", waitForControlledPodsRunningName, err)
	}
}

type sharedPodIndexerFactory struct {
	podsIndexer *measurementutil.ControlledPodsIndexer
	err         error
	once        sync.Once
}

func (s *sharedPodIndexerFactory) PodsIndexer(c clientset.Interface) (*measurementutil.ControlledPodsIndexer, error) {
	s.once.Do(func() {
		s.podsIndexer, s.err = s.start(c)
	})
	return s.podsIndexer, s.err
}

func (s *sharedPodIndexerFactory) start(c clientset.Interface) (*measurementutil.ControlledPodsIndexer, error) {
	ctx := context.Background()
	informerFactory := informers.NewSharedInformerFactory(c, 0)
	podsIndexer, err := measurementutil.NewControlledPodsIndexer(
		informerFactory.Core().V1().Pods(),
		informerFactory.Apps().V1().ReplicaSets(),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize controlledPodsIndexer: %w", err)
	}
	informerFactory.Start(ctx.Done())
	if !podsIndexer.WaitForCacheSync(ctx) {
		return nil, fmt.Errorf("failed to sync informers")
	}
	return podsIndexer, nil
}

func createWaitForControlledPodsRunningMeasurement() measurement.Measurement {
	return &waitForControlledPodsRunningMeasurement{
		selector:   util.NewObjectSelector(),
		queue:      workerqueue.NewWorkerQueue(waitForControlledPodsWorkers),
		objectKeys: sets.NewString(),
		checkerMap: checker.NewMap(),
	}
}

type waitForControlledPodsRunningMeasurement struct {
	apiVersion       string
	kind             string
	selector         *util.ObjectSelector
	operationTimeout time.Duration
	// countErrorMargin tells the measurement to wait for the number of pods to
	// be within the <desired count - countErrorMargin, desired count> range
	// (e.g. with a desired count of 1000 and a countErrorMargin of 10, anything
	// from 990 to 1000 running pods is accepted).
	// When using preemptibles at large scale, the number of ready nodes is not
	// stable and reaching DesiredPodCount could take a very long time.
	countErrorMargin      int
	stopCh                chan struct{}
	isRunning             bool
	queue                 workerqueue.Interface
	handlingGroup         wait.Group
	lock                  sync.Mutex
	objectKeys            sets.String
	opResourceVersion     uint64
	gvr                   schema.GroupVersionResource
	checkerMap            checker.Map
	clusterFramework      *framework.Framework
	checkIfPodsAreUpdated bool
	// podsIndexer is an indexer, kept up to date via informers, observing
	// changes to all pods in the whole cluster.
	podsIndexer *measurementutil.ControlledPodsIndexer
}

type failedPod struct {
	Namespace    string `json:"namespace"`
	ControlledBy string `json:"controlledBy"`
	Name         string `json:"name"`
	Host         string `json:"host"`
	Status       string `json:"status"`
}

func toFailedPods(ps *measurementutil.PodsStatus, controlledBy string) []failedPod {
	failedPods := make([]failedPod, 0, len(ps.Info))
	for _, pod := range ps.Info {
		failedPods = append(failedPods, failedPod{
			Namespace:    pod.Namespace,
			ControlledBy: controlledBy,
			Name:         pod.Name,
			Host:         pod.Hostname,
			Status:       pod.Status.String(),
		})
	}
	return failedPods
}

func createSummary(fp []failedPod) []measurement.Summary {
	if len(fp) == 0 {
		return nil
	}

	ts := time.Now().Unix()
	name := fmt.Sprintf("%s_%d_failedpods", waitForControlledPodsRunningName, ts)
	ext := "json"
	content, err := util.PrettyPrintJSON(fp)
	if err != nil {
		klog.Errorf("error: %s while marshaling failed pods to json", err)
		return nil
	}
	return []measurement.Summary{
		measurement.CreateSummary(name, ext, content),
	}
}

// Execute waits until all specified controlling objects have all pods running or until a timeout happens.
// Controlling objects can be specified by field and/or label selectors.
// If the namespace is not passed as a parameter, all-namespace scope is assumed.
// The "start" action starts observation of the controlling objects, while "gather" waits until the
// specified number of controlling objects have all pods running.
func (w *waitForControlledPodsRunningMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
	w.clusterFramework = config.ClusterFramework

	action, err := util.GetString(config.Params, "action")
	if err != nil {
		return nil, err
	}

	switch action {
	case "start":
		w.apiVersion, err = util.GetString(config.Params, "apiVersion")
		if err != nil {
			return nil, err
		}
		w.kind, err = util.GetString(config.Params, "kind")
		if err != nil {
			return nil, err
		}
		if err = w.selector.Parse(config.Params); err != nil {
			return nil, err
		}
		w.operationTimeout, err = util.GetDurationOrDefault(config.Params, "operationTimeout", defaultOperationTimeout)
		if err != nil {
			return nil, err
		}
		w.checkIfPodsAreUpdated, err = util.GetBoolOrDefault(config.Params, "checkIfPodsAreUpdated", true)
		if err != nil {
			return nil, err
		}
		w.countErrorMargin, err = util.GetIntOrDefault(config.Params, "countErrorMargin", 0)
		if err != nil {
			return nil, err
		}
		return nil, w.start()
	case "gather":
		syncTimeout, err := util.GetDurationOrDefault(config.Params, "syncTimeout", defaultSyncTimeout)
		if err != nil {
			return nil, err
		}
		ps, err := w.gather(syncTimeout)
		summary := createSummary(ps)
		return summary, err
	case "stop":
		w.Dispose()
		return nil, nil
	default:
		return nil, fmt.Errorf("unknown action %v", action)
	}
}
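
// The snippet below is an illustrative (hypothetical) pair of ClusterLoader2 test-config steps
// driving this measurement; it is a sketch, not taken from a real test. The Params keys mirror
// the parameters parsed by Execute above; selector-related keys (namespace, label/field
// selectors) are whatever util.ObjectSelector.Parse accepts.
//
//	steps:
//	- name: Starting pod observation
//	  measurements:
//	  - Identifier: WaitForRunningDeployments
//	    Method: WaitForControlledPodsRunning
//	    Params:
//	      action: start
//	      apiVersion: apps/v1
//	      kind: Deployment
//	      operationTimeout: 15m
//	# ... phases creating/scaling the Deployments go here ...
//	- name: Waiting for pods to be running
//	  measurements:
//	  - Identifier: WaitForRunningDeployments
//	    Method: WaitForControlledPodsRunning
//	    Params:
//	      action: gather
//	      syncTimeout: 10m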

// Dispose cleans up after the measurement.
func (w *waitForControlledPodsRunningMeasurement) Dispose() {
	if !w.isRunning {
		return
	}
	w.isRunning = false
	close(w.stopCh)
	w.queue.Stop()
	w.lock.Lock()
	defer w.lock.Unlock()
	w.checkerMap.Dispose()
}

// String returns a string representation of the metric.
func (*waitForControlledPodsRunningMeasurement) String() string {
	return waitForControlledPodsRunningName
}

func (w *waitForControlledPodsRunningMeasurement) start() error {
	if w.isRunning {
		klog.V(2).Infof("%v: wait for controlled pods measurement already running", w)
		return nil
	}
	klog.V(2).Infof("%v: starting wait for controlled pods measurement...", w)
	gv, err := schema.ParseGroupVersion(w.apiVersion)
	if err != nil {
		return err
	}
	gvk := gv.WithKind(w.kind)
	w.gvr, _ = meta.UnsafeGuessKindToResource(gvk)

	w.isRunning = true
	w.stopCh = make(chan struct{})
	podsIndexer, err := podIndexerFactory.PodsIndexer(w.clusterFramework.GetClientSets().GetClient())
	if err != nil {
		return err
	}
	w.podsIndexer = podsIndexer

	i := informer.NewDynamicInformer(
		w.clusterFramework.GetDynamicClients().GetClient(),
		w.gvr,
		w.selector,
		func(oldObj, newObj interface{}) {
			f := func() {
				w.handleObject(oldObj, newObj)
			}
			w.queue.Add(&f)
		},
	)
	return informer.StartAndSync(i, w.stopCh, informerSyncTimeout)
}

func (w *waitForControlledPodsRunningMeasurement) gather(syncTimeout time.Duration) ([]failedPod, error) {
	klog.V(2).Infof("%v: waiting for controlled pods measurement...", w)
	if !w.isRunning {
		return nil, fmt.Errorf("metric %s has not been started", w)
	}
	objectKeys, maxResourceVersion, err := w.getObjectKeysAndMaxVersion()
	if err != nil {
		return nil, err
	}

	// Wait until checkers for all objects are registered:
	// - when an object is created/updated, it's enough to wait for its resourceVersion to
	//   be processed by our handler; thus we wait until all events up to maxResourceVersion
	//   are processed before proceeding;
	// - when an object is deleted, by definition it will not be returned by the LIST request,
	//   thus the resourceVersion of the deletion may be higher than maxResourceVersion;
	//   we solve that by waiting until the set of currently existing objects (that we propagate
	//   via our handler) is equal to the expected one.
	// NOTE: we're not resilient to situations where an object is created/deleted
	// after the LIST call happened. But given that measurements and phases don't interfere
	// with each other, it can't be clusterloader that deleted it. Thus we accept this limitation.
	// NOTE: we could try waiting for the informer state to be the same and use the
	// resourceVersion from there, but then the existence of bookmarks and the fact that our
	// informer doesn't necessarily follow all objects of a given type can break that.
	// See #1259 for more details.
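	// As a concrete (illustrative) example of the above: suppose the LIST in
	// getObjectKeysAndMaxVersion returned {default/d1, default/d2} with a maximal
	// resourceVersion of 100. Waiting for opResourceVersion >= 100 guarantees the
	// handler has registered checkers for d1 and d2, but if d2 was then deleted at
	// resourceVersion 105 the LIST no longer reflects it; the additional
	// objectKeys.Equal(w.objectKeys) condition makes us wait until the handler has
	// also processed that deletion (the numbers here are made up for illustration).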
	cond := func() (bool, error) {
		w.lock.Lock()
		defer w.lock.Unlock()
		return w.opResourceVersion >= maxResourceVersion && objectKeys.Equal(w.objectKeys), nil
	}
	if err := wait.Poll(checkControlledPodsInterval, syncTimeout, cond); err != nil {
		return nil, fmt.Errorf("timed out while waiting for controlled pods: %v", err)
	}

	w.handlingGroup.Wait()
	w.lock.Lock()
	defer w.lock.Unlock()
	var numberRunning, numberDeleted, numberTimeout, numberFailed int
	failedErrList := errors.NewErrorList()
	timedOutObjects := []string{}
	var maxDuration time.Duration
	failedPods := []failedPod{}
	for _, checker := range w.checkerMap {
		objChecker := checker.(*objectChecker)
		status, err := objChecker.getStatus()
		if objChecker.duration > maxDuration {
			maxDuration = objChecker.duration
		}
		switch status {
		case running:
			numberRunning++
		case deleted:
			numberDeleted++
		case timeout:
			timedOutObjects = append(timedOutObjects, objChecker.key)
			numberTimeout++
			failedPods = append(failedPods, toFailedPods(objChecker.failedPods, objChecker.key)...)
		case deleteTimeout:
			timedOutObjects = append(timedOutObjects, objChecker.key)
			numberTimeout++
			podsClient := w.clusterFramework.GetClientSets().GetClient().CoreV1().Pods(w.selector.Namespace)
			err := podsClient.DeleteCollection(context.Background(), forceDeleteOptions(), w.listOptions())
			failedPods = append(failedPods, toFailedPods(objChecker.failedPods, objChecker.key)...)
			if err != nil {
				klog.Errorf("Error: %s while Force Deleting Pod, %s", err, objChecker.key)
			}
		case failed:
			numberFailed++
			failedPods = append(failedPods, toFailedPods(objChecker.failedPods, objChecker.key)...)
			if err != nil {
				failedErrList.Append(err)
			}
		default:
			// Probably implementation bug.
			return nil, fmt.Errorf("got unknown status for %v: status=%v, err=%v", objChecker.key, status, err)
		}
	}
	klog.V(2).Infof("%s: running %d, deleted %d, timeout: %d, failed: %d", w, numberRunning, numberDeleted, numberTimeout, numberFailed)
	var ratio float64
	if w.operationTimeout != 0 {
		ratio = float64(maxDuration) / float64(w.operationTimeout)
	}
	klog.V(2).Infof("%s: maxDuration=%v, operationTimeout=%v, ratio=%.2f", w, maxDuration, w.operationTimeout, ratio)
	if numberTimeout > 0 {
		klog.Errorf("Timed out %ss: %s", w.kind, strings.Join(timedOutObjects, ", "))
		return failedPods, fmt.Errorf("%d objects timed out: %ss: %s", numberTimeout, w.kind, strings.Join(timedOutObjects, ", "))
	}
	if objectKeys.Len() != numberRunning {
		klog.Errorf("%s: incorrect objects number: %d/%d %ss are running with all pods", w, numberRunning, objectKeys.Len(), w.kind)
		return failedPods, fmt.Errorf("incorrect objects number: %d/%d %ss are running with all pods", numberRunning, objectKeys.Len(), w.kind)
	}
	if numberFailed > 0 {
		klog.Errorf("%s: failed status for %d %ss: %s", w, numberFailed, w.kind, failedErrList.String())
		return failedPods, fmt.Errorf("failed objects statuses: %v", failedErrList.String())
	}

	klog.V(2).Infof("%s: %d/%d %ss are running with all pods", w, numberRunning, objectKeys.Len(), w.kind)
	return nil, nil
}

func (w *waitForControlledPodsRunningMeasurement) listOptions() metav1.ListOptions {
	listOptions := metav1.ListOptions{
		LabelSelector: w.selector.LabelSelector,
		FieldSelector: w.selector.FieldSelector,
	}
	return listOptions
}

func forceDeleteOptions() metav1.DeleteOptions {
	gracePeriod := int64(0)
	propagationPolicy := metav1.DeletePropagationBackground
	forceDeletePodOptions := metav1.DeleteOptions{
		GracePeriodSeconds: &gracePeriod,
		PropagationPolicy:  &propagationPolicy,
	}
	return forceDeletePodOptions
}

// handleObject manages the checker for the given pod-controlling object.
// This function does not return errors, it only logs them. All possible errors will be caught in the gather function.
// If this function does not execute correctly, verifying the number of running pods will fail,
// causing an "incorrect objects number" error to be returned.
func (w *waitForControlledPodsRunningMeasurement) handleObject(oldObj, newObj interface{}) {
	var oldRuntimeObj runtime.Object
	var newRuntimeObj runtime.Object
	var ok bool
	oldRuntimeObj, ok = oldObj.(runtime.Object)
	if oldObj != nil && !ok {
		klog.Errorf("%s: uncastable old object: %v", w, oldObj)
		return
	}
	newRuntimeObj, ok = newObj.(runtime.Object)
	if newObj != nil && !ok {
		klog.Errorf("%s: uncastable new object: %v", w, newObj)
		return
	}

	// Acquire the lock before defining the deferred function to ensure it
	// will be called under the same lock.
	w.lock.Lock()
	defer w.lock.Unlock()

	defer func() {
		if err := w.updateCacheLocked(oldRuntimeObj, newRuntimeObj); err != nil {
			klog.Errorf("%s: error when updating cache: %v", w, err)
		}
	}()

	isEqual, err := runtimeobjects.IsEqualRuntimeObjectsSpec(oldRuntimeObj, newRuntimeObj)
	if err != nil {
		klog.Errorf("%s: comparing specs error: %v", w, err)
		return
	}
	if isEqual {
		// Skip updates without changes in the spec.
		return
	}

	if !w.isRunning {
		return
	}

	if err := w.deleteObjectLocked(oldRuntimeObj); err != nil {
		klog.Errorf("%s: delete checker error: %v", w, err)
	}
	if err := w.handleObjectLocked(oldRuntimeObj, newRuntimeObj); err != nil {
		klog.Errorf("%s: create checker error: %v", w, err)
	}
}

func (w *waitForControlledPodsRunningMeasurement) checkScaledown(oldObj, newObj runtime.Object) (bool, error) {
	oldReplicasWatcher, err := runtimeobjects.GetReplicasFromRuntimeObject(w.clusterFramework.GetClientSets().GetClient(), oldObj)
	if err != nil {
		return false, err
	}
	oldReplicas, err := runtimeobjects.GetReplicasOnce(oldReplicasWatcher)
	if err != nil {
		return false, err
	}
	newReplicasWatcher, err := runtimeobjects.GetReplicasFromRuntimeObject(w.clusterFramework.GetClientSets().GetClient(), newObj)
	if err != nil {
		return false, err
	}
	newReplicas, err := runtimeobjects.GetReplicasOnce(newReplicasWatcher)
	if err != nil {
		return false, err
	}

	return newReplicas < oldReplicas, nil
}

func (w *waitForControlledPodsRunningMeasurement) handleObjectLocked(oldObj, newObj runtime.Object) error {
	isObjDeleted := newObj == nil
	handledObj := newObj
	if isObjDeleted {
		handledObj = oldObj
	}
	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(handledObj)
	if err != nil {
		return fmt.Errorf("meta key creation error: %v", err)
	}
	checker, err := w.waitForRuntimeObject(handledObj, isObjDeleted, w.operationTimeout)
	if err != nil {
		return fmt.Errorf("waiting for %v error: %v", key, err)
	}
	w.checkerMap.Add(key, checker)
	return nil
}

func (w *waitForControlledPodsRunningMeasurement) deleteObjectLocked(obj runtime.Object) error {
	if obj == nil {
		return nil
	}
	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
	if err != nil {
		return fmt.Errorf("meta key creation error: %v", err)
	}
	w.checkerMap.DeleteAndStop(key)
	return nil
}

func (w *waitForControlledPodsRunningMeasurement) updateCacheLocked(oldObj, newObj runtime.Object) error {
	errList := errors.NewErrorList()

	if oldObj != nil {
		key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(oldObj)
		if err != nil {
			errList.Append(fmt.Errorf("%s: retrieving key error: %v", w, err))
		} else {
			w.objectKeys.Delete(key)
		}
		if err := w.updateOpResourceVersionLocked(oldObj); err != nil {
			errList.Append(err)
		}
	}
	if newObj != nil {
		key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(newObj)
		if err != nil {
			errList.Append(fmt.Errorf("%s: retrieving key error: %v", w, err))
		} else {
			w.objectKeys.Insert(key)
		}
		if err := w.updateOpResourceVersionLocked(newObj); err != nil {
			errList.Append(err)
		}
	}

	if errList.IsEmpty() {
		return nil
	}
	return fmt.Errorf(errList.Error())
}

func (w *waitForControlledPodsRunningMeasurement) updateOpResourceVersionLocked(runtimeObj runtime.Object) error {
	version, err := runtimeobjects.GetResourceVersionFromRuntimeObject(runtimeObj)
	if err != nil {
		return fmt.Errorf("retrieving resource version error: %v", err)
	}
	if version > w.opResourceVersion {
		w.opResourceVersion = version
	}
	return nil
}

// getObjectKeysAndMaxVersion returns the keys of objects that satisfy the measurement parameters
// and the maximal resource version of these objects.
func (w *waitForControlledPodsRunningMeasurement) getObjectKeysAndMaxVersion() (sets.String, uint64, error) {
	objects, err := runtimeobjects.ListRuntimeObjectsForKind(
		w.clusterFramework.GetDynamicClients().GetClient(),
		w.gvr, w.kind, w.selector.Namespace, w.selector.LabelSelector, w.selector.FieldSelector)
	if err != nil {
		return nil, 0, fmt.Errorf("listing objects error: %v", err)
	}

	objectKeys := sets.NewString()
	var maxResourceVersion uint64
	for i := range objects {
		runtimeObj, ok := objects[i].(runtime.Object)
		if !ok {
			klog.Errorf("%s: cannot cast to runtime.Object: %v", w, objects[i])
			continue
		}
		version, err := runtimeobjects.GetResourceVersionFromRuntimeObject(runtimeObj)
		if err != nil {
			klog.Errorf("%s: retrieving resource version error: %v", w, err)
			continue
		}
		key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(runtimeObj)
		if err != nil {
			klog.Errorf("%s: retrieving key error: %v", w, err)
			continue
		}
		objectKeys.Insert(key)
		if version > maxResourceVersion {
			maxResourceVersion = version
		}
	}
	return objectKeys, maxResourceVersion, nil
}

func (w *waitForControlledPodsRunningMeasurement) waitForRuntimeObject(obj runtime.Object, isDeleted bool, operationTimeout time.Duration) (*objectChecker, error) {
	ctx := context.TODO()

	runtimeObjectReplicas, err := runtimeobjects.GetReplicasFromRuntimeObject(w.clusterFramework.GetClientSets().GetClient(), obj)
	if err != nil {
		return nil, err
	}
	var isPodUpdated func(*v1.Pod) error
	if w.checkIfPodsAreUpdated {
		isPodUpdated, err = runtimeobjects.GetIsPodUpdatedPredicateFromRuntimeObject(obj)
		if err != nil {
			return nil, err
		}
	}
	if isDeleted {
		runtimeObjectReplicas = &runtimeobjects.ConstReplicas{0}
	}
	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
	if err != nil {
		return nil, fmt.Errorf("meta key creation error: %v", err)
	}

	podStore, err := measurementutil.NewOwnerReferenceBasedPodStore(w.podsIndexer, obj)
	if err != nil {
		return nil, fmt.Errorf("failed to create pod store: %w", err)
	}

	o := newObjectChecker(key)
	o.lock.Lock()
	defer o.lock.Unlock()
	ctx, cancel := context.WithCancel(ctx)
	o.cancel = cancel
	w.handlingGroup.Start(func() {
		defer cancel()
		if operationTimeout != time.Duration(0) {
			ctx, cancel = context.WithTimeout(ctx, operationTimeout)
			defer cancel()
		}
		if err := runtimeObjectReplicas.Start(ctx.Done()); err != nil {
			klog.Errorf("%s: error while starting runtimeObjectReplicas: %v", key, err)
			o.err = fmt.Errorf("failed to start runtimeObjectReplicas: %v", err)
			return
		}
		options := &measurementutil.WaitForPodOptions{
			DesiredPodCount:     runtimeObjectReplicas.Replicas,
			CountErrorMargin:    w.countErrorMargin,
			CallerName:          w.String(),
			WaitForPodsInterval: defaultWaitForPodsInterval,
			IsPodUpdated:        isPodUpdated,
		}

		// This function sets the status (and error message) for the object checker.
		// The handling of bad statuses and errors is done by the gather() function of the measurement.
		start := time.Now()
		failedPods, err := measurementutil.WaitForPods(ctx, podStore, options)
		o.lock.Lock()
		defer o.lock.Unlock()
		o.duration = time.Since(start)

		if err != nil {
			klog.Errorf("%s: error for %v: %v", w, key, err)
			o.status = failed
			o.err = fmt.Errorf("%s: %v", key, err)
			o.failedPods = failedPods

			hasTimedOut := ctx.Err() == context.DeadlineExceeded
			if hasTimedOut {
				if isDeleted {
					o.status = deleteTimeout
				} else {
					o.status = timeout
				}
				klog.Errorf("%s: %s timed out", w, key)
			}
			return
		}
		if isDeleted {
			o.status = deleted
			return
		}

		o.status = running
	})
	return o, nil
}

type objectStatus int

const (
	unknown       objectStatus = iota // WaitForPods hasn't finished yet. The result isn't determined yet.
	running                           // WaitForPods finished and a scale up/down to a scale other than 0 succeeded.
	deleted                           // WaitForPods finished and a scale down to 0 succeeded.
	failed                            // WaitForPods finished, but failed. o.err must be set.
	timeout                           // WaitForPods has been interrupted due to timeout and the target scale was other than 0.
	deleteTimeout                     // WaitForPods has been interrupted due to timeout and the target scale was 0.
)

type objectChecker struct {
	lock   sync.Mutex
	status objectStatus
	err    error
	// key of the object being checked. In the current implementation it's a namespaced name, but it
	// may change in the future.
	key        string
	cancel     context.CancelFunc
	duration   time.Duration
	failedPods *measurementutil.PodsStatus
}

func newObjectChecker(key string) *objectChecker {
	return &objectChecker{
		status: unknown,
		key:    key,
	}
}

func (o *objectChecker) SetCancel(cancel context.CancelFunc) {
	o.lock.Lock()
	defer o.lock.Unlock()
	o.cancel = cancel
}

func (o *objectChecker) Stop() {
	o.lock.Lock()
	defer o.lock.Unlock()
	o.cancel()
}

func (o *objectChecker) getStatus() (objectStatus, error) {
	o.lock.Lock()
	defer o.lock.Unlock()
	return o.status, o.err
}