sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/pod/pod_controller.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pod

import (
	"cmp"
	"context"
	"crypto/sha256"
	"encoding/json"
	"errors"
	"fmt"
	"slices"
	"sort"
	"strconv"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/validation"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/constants"
	controllerconsts "sigs.k8s.io/kueue/pkg/controller/constants"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	"sigs.k8s.io/kueue/pkg/podset"
	"sigs.k8s.io/kueue/pkg/util/kubeversion"
	"sigs.k8s.io/kueue/pkg/util/parallelize"
	utilslices "sigs.k8s.io/kueue/pkg/util/slices"
)

const (
	SchedulingGateName             = "kueue.x-k8s.io/admission"
	FrameworkName                  = "pod"
	gateNotFound                   = -1
	ConditionTypeTerminationTarget = "TerminationTarget"
	errMsgIncorrectGroupRoleCount  = "pod group can't include more than 8 roles"
	IsGroupWorkloadAnnotationKey   = "kueue.x-k8s.io/is-group-workload"
	IsGroupWorkloadAnnotationValue = "true"
)

// Event reasons used by the pod controller
const (
	ReasonExcessPodDeleted     = "ExcessPodDeleted"
	ReasonOwnerReferencesAdded = "OwnerReferencesAdded"
)

var (
	gvk                          = corev1.SchemeGroupVersion.WithKind("Pod")
	errIncorrectReconcileRequest = fmt.Errorf("event handler error: got a single pod reconcile request for a pod group")
	errPendingOps                = jobframework.UnretryableError("waiting to observe previous operations on pods")
	errPodNoSupportKubeVersion   = errors.New("pod integration only supported in Kubernetes 1.27 or newer")
)

func init() {
	utilruntime.Must(jobframework.RegisterIntegration(FrameworkName, jobframework.IntegrationCallbacks{
		SetupIndexes:          SetupIndexes,
		NewReconciler:         NewReconciler,
		SetupWebhook:          SetupWebhook,
		JobType:               &corev1.Pod{},
		CanSupportIntegration: CanSupportIntegration,
	}))
}

// +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=list;get;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups="",resources=pods/status,verbs=get;patch
// +kubebuilder:rbac:groups="",resources=pods/finalizers,verbs=get;update
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch

type Reconciler struct {
	*jobframework.JobReconciler
	expectationsStore *expectationsStore
}

func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	return r.ReconcileGenericJob(ctx, req, &Pod{excessPodExpectations: r.expectationsStore})
}

func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
	concurrency := mgr.GetControllerOptions().GroupKindConcurrency[gvk.GroupKind().String()]
	ctrl.Log.V(3).Info("Setting up Pod reconciler", "concurrency", concurrency)
	return ctrl.NewControllerManagedBy(mgr).
		Watches(&corev1.Pod{}, &podEventHandler{cleanedUpPodsExpectations: r.expectationsStore}).Named("v1_pod").
		Watches(&kueue.Workload{}, &workloadHandler{}).
		WithOptions(controller.Options{
			MaxConcurrentReconciles: concurrency,
		}).
		Complete(r)
}

func NewReconciler(c client.Client, record record.EventRecorder, opts ...jobframework.Option) jobframework.JobReconcilerInterface {
	return &Reconciler{
		JobReconciler:     jobframework.NewReconciler(c, record, opts...),
		expectationsStore: newUIDExpectations("finalizedPods"),
	}
}

type Pod struct {
	pod                   corev1.Pod
	key                   types.NamespacedName
	isFound               bool
	isGroup               bool
	unretriableGroup      *bool
	list                  corev1.PodList
	excessPodExpectations *expectationsStore
	satisfiedExcessPods   bool
}

var (
	_ jobframework.GenericJob      = (*Pod)(nil)
	_ jobframework.JobWithFinalize = (*Pod)(nil)
	_ jobframework.ComposableJob   = (*Pod)(nil)
)

func fromObject(o runtime.Object) *Pod {
	out := Pod{}
	out.pod = *o.(*corev1.Pod)
	return &out
}

// Object returns the job instance.
func (p *Pod) Object() client.Object {
	return &p.pod
}

// gateIndex returns the index of the Kueue scheduling gate for corev1.Pod.
// If the scheduling gate is not found, returns -1.
func gateIndex(p *corev1.Pod) int {
	for i := range p.Spec.SchedulingGates {
		if p.Spec.SchedulingGates[i].Name == SchedulingGateName {
			return i
		}
	}
	return gateNotFound
}

func isPodTerminated(p *corev1.Pod) bool {
	return p.Status.Phase == corev1.PodFailed || p.Status.Phase == corev1.PodSucceeded
}

func podSuspended(p *corev1.Pod) bool {
	return isPodTerminated(p) || gateIndex(p) != gateNotFound
}

func isUnretriablePod(pod corev1.Pod) bool {
	return pod.Annotations[RetriableInGroupAnnotation] == "false"
}

// isUnretriableGroup returns true if at least one pod in the group
// has a RetriableInGroupAnnotation set to 'false'.
func (p *Pod) isUnretriableGroup() bool {
	if p.unretriableGroup != nil {
		return *p.unretriableGroup
	}

	for _, pod := range p.list.Items {
		if isUnretriablePod(pod) {
			p.unretriableGroup = ptr.To(true)
			return true
		}
	}

	p.unretriableGroup = ptr.To(false)
	return false
}

// IsSuspended returns whether the job is suspended or not.
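// For a pod group, the group counts as suspended if at least one of its pods
// is still gated or already terminated.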
func (p *Pod) IsSuspended() bool {
	if !p.isGroup {
		return podSuspended(&p.pod)
	}

	for i := range p.list.Items {
		if podSuspended(&p.list.Items[i]) {
			return true
		}
	}

	return false
}

// Suspend will suspend the job.
func (p *Pod) Suspend() {
	// Not implemented because this is not called when JobWithCustomStop is implemented.
}

// ungatePod removes the kueue scheduling gate from the pod.
// Returns true if the pod has been ungated and false otherwise.
func ungatePod(pod *corev1.Pod) bool {
	idx := gateIndex(pod)
	if idx != gateNotFound {
		pod.Spec.SchedulingGates = append(pod.Spec.SchedulingGates[:idx], pod.Spec.SchedulingGates[idx+1:]...)
		return true
	}

	return false
}

// Run will inject the node affinity and podSet counts extracted from the workload into the job and unsuspend it.
func (p *Pod) Run(ctx context.Context, c client.Client, podSetsInfo []podset.PodSetInfo, recorder record.EventRecorder, msg string) error {
	log := ctrl.LoggerFrom(ctx)

	if !p.isGroup {
		if len(podSetsInfo) != 1 {
			return fmt.Errorf("%w: expecting 1 pod set got %d", podset.ErrInvalidPodsetInfo, len(podSetsInfo))
		}

		if ungated := ungatePod(&p.pod); !ungated {
			return nil
		}

		if err := podset.Merge(&p.pod.ObjectMeta, &p.pod.Spec, podSetsInfo[0]); err != nil {
			return err
		}

		err := c.Update(ctx, &p.pod)
		if err != nil {
			return err
		}
		if recorder != nil {
			recorder.Event(&p.pod, corev1.EventTypeNormal, jobframework.ReasonStarted, msg)
		}
		return nil
	}

	var podsToUngate []*corev1.Pod

	for i := range p.list.Items {
		pod := &p.list.Items[i]
		if ungated := ungatePod(pod); !ungated {
			continue
		}
		podsToUngate = append(podsToUngate, pod)
	}
	if len(podsToUngate) == 0 {
		return nil
	}

	return parallelize.Until(ctx, len(podsToUngate), func(i int) error {
		pod := podsToUngate[i]
		roleHash, err := getRoleHash(*pod)
		if err != nil {
			return err
		}

		podSetIndex := slices.IndexFunc(podSetsInfo, func(info podset.PodSetInfo) bool {
			return info.Name == roleHash
		})
		if podSetIndex == -1 {
			return fmt.Errorf("%w: podSetInfo with the name '%s' is not found", podset.ErrInvalidPodsetInfo, roleHash)
		}

		err = podset.Merge(&pod.ObjectMeta, &pod.Spec, podSetsInfo[podSetIndex])
		if err != nil {
			return err
		}

		log.V(3).Info("Starting pod in group", "podInGroup", klog.KObj(pod))
		if err := c.Update(ctx, pod); err != nil {
			return err
		}
		if recorder != nil {
			recorder.Event(pod, corev1.EventTypeNormal, jobframework.ReasonStarted, msg)
		}
		return nil
	})
}

// RunWithPodSetsInfo will inject the node affinity and podSet counts extracted from the workload into the job and unsuspend it.
func (p *Pod) RunWithPodSetsInfo(_ []podset.PodSetInfo) error {
	// Not implemented because this is not called when JobWithCustomRun is implemented.
	return fmt.Errorf("RunWithPodSetsInfo is not implemented for the Pod object")
}

// RestorePodSetsInfo will restore the original node affinity and podSet counts of the job.
func (p *Pod) RestorePodSetsInfo(_ []podset.PodSetInfo) bool {
	// Not implemented since Pods cannot be updated, they can only be terminated.
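	// Always reports false: there is nothing to restore on a Pod.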
	return false
}

// Finished reports whether the job is completed or failed; the returned
// condition represents the workload's finished condition.
func (p *Pod) Finished() (metav1.Condition, bool) {
	finished := true

	condition := metav1.Condition{
		Type:    kueue.WorkloadFinished,
		Status:  metav1.ConditionTrue,
		Reason:  "JobFinished",
		Message: "Job finished successfully",
	}

	if !p.isGroup {
		ph := p.pod.Status.Phase
		finished = ph == corev1.PodSucceeded || ph == corev1.PodFailed

		if ph == corev1.PodFailed {
			condition.Message = "Job failed"
		}

		return condition, finished
	}
	isActive := false
	succeededCount := 0

	groupTotalCount, err := p.groupTotalCount()
	if err != nil {
		ctrl.Log.V(2).Error(err, "failed to check if pod group is finished")
		return metav1.Condition{}, false
	}
	for _, pod := range p.list.Items {
		if pod.Status.Phase == corev1.PodSucceeded {
			succeededCount++
		}

		if !isPodTerminated(&pod) {
			isActive = true
		}
	}

	unretriableGroup := p.isUnretriableGroup()

	if succeededCount == groupTotalCount || (!isActive && unretriableGroup) {
		condition.Message = fmt.Sprintf("Pods succeeded: %d/%d.", succeededCount, groupTotalCount)
	} else {
		return metav1.Condition{}, false
	}

	return condition, finished
}

// PodSets will build workload podSets corresponding to the job.
func (p *Pod) PodSets() []kueue.PodSet {
	return []kueue.PodSet{
		{
			Name:  kueue.DefaultPodSetName,
			Count: 1,
			Template: corev1.PodTemplateSpec{
				Spec: *p.pod.Spec.DeepCopy(),
			},
		},
	}
}

// IsActive returns true if there are any running pods.
func (p *Pod) IsActive() bool {
	for i := range p.list.Items {
		if p.list.Items[i].Status.Phase == corev1.PodRunning {
			return true
		}
	}
	return false
}

func hasPodReadyTrue(conds []corev1.PodCondition) bool {
	for i := range conds {
		c := conds[i]
		if c.Type == corev1.PodReady {
			return c.Status == corev1.ConditionTrue
		}
	}
	return false
}

// PodsReady reports whether all pods derived from the job are ready.
func (p *Pod) PodsReady() bool {
	if !p.isGroup {
		return hasPodReadyTrue(p.pod.Status.Conditions)
	}

	for i := range p.list.Items {
		if !hasPodReadyTrue(p.list.Items[i].Status.Conditions) {
			return false
		}
	}
	return true
}

// GVK returns GVK (Group Version Kind) for the job.
func (p *Pod) GVK() schema.GroupVersionKind {
	return gvk
}

func (p *Pod) Stop(ctx context.Context, c client.Client, _ []podset.PodSetInfo, stopReason jobframework.StopReason, eventMsg string) ([]client.Object, error) {
	var podsInGroup []corev1.Pod

	if p.isGroup {
		podsInGroup = p.list.Items
	} else {
		podsInGroup = []corev1.Pod{p.pod}
	}

	stoppedNow := make([]client.Object, 0)
	for i := range podsInGroup {
		// If the workload is being deleted, delete even finished Pods.
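		// Pods that are already terminating are always skipped; gated or finished
		// pods are skipped only when the stop reason is not a workload deletion.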
		if !podsInGroup[i].DeletionTimestamp.IsZero() || (stopReason != jobframework.StopReasonWorkloadDeleted && podSuspended(&podsInGroup[i])) {
			continue
		}
		podInGroup := fromObject(&podsInGroup[i])

		// The podset info is not relevant here, since this should mark the pod's end of life
		pCopy := &corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				UID:       podInGroup.pod.UID,
				Name:      podInGroup.pod.Name,
				Namespace: podInGroup.pod.Namespace,
			},
			TypeMeta: podInGroup.pod.TypeMeta,
			Status: corev1.PodStatus{
				Conditions: []corev1.PodCondition{
					{
						Type:   ConditionTypeTerminationTarget,
						Status: corev1.ConditionTrue,
						LastTransitionTime: metav1.Time{
							Time: time.Now(),
						},
						Reason:  "StoppedByKueue",
						Message: eventMsg,
					},
				},
			},
		}
		if err := c.Status().Patch(ctx, pCopy, client.Apply, client.FieldOwner(constants.KueueName)); err != nil && !apierrors.IsNotFound(err) {
			return stoppedNow, err
		}
		if err := c.Delete(ctx, podInGroup.Object()); err != nil && !apierrors.IsNotFound(err) {
			return stoppedNow, err
		}
		stoppedNow = append(stoppedNow, podInGroup.Object())
	}

	// If the related workload is deleted, the generic reconciler will stop the pod group and finalize the workload.
	// However, it won't finalize the pods. Since the Stop method for the pod group deletes all the pods in the
	// group, the pods will be finalized here.
	if p.isGroup && stopReason == jobframework.StopReasonWorkloadDeleted {
		err := p.Finalize(ctx, c)
		if err != nil {
			return stoppedNow, err
		}
	}

	return stoppedNow, nil
}

func SetupIndexes(ctx context.Context, indexer client.FieldIndexer) error {
	if err := indexer.IndexField(ctx, &corev1.Pod{}, PodGroupNameCacheKey, IndexPodGroupName); err != nil {
		return err
	}
	if err := jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk); err != nil {
		return err
	}
	return nil
}

func CanSupportIntegration(opts ...jobframework.Option) (bool, error) {
	options := jobframework.ProcessOptions(opts...)

	v := options.KubeServerVersion.GetServerVersion()
	if v.String() == "" || v.LessThan(kubeversion.KubeVersion1_27) {
		return false, fmt.Errorf("kubernetesVersion %q: %w", v.String(), errPodNoSupportKubeVersion)
	}
	return true, nil
}

func (p *Pod) Finalize(ctx context.Context, c client.Client) error {
	groupName := podGroupName(p.pod)

	var podsInGroup corev1.PodList
	if groupName == "" {
		podsInGroup.Items = append(podsInGroup.Items, *p.Object().(*corev1.Pod))
	} else {
		if err := c.List(ctx, &podsInGroup, client.MatchingFields{
			PodGroupNameCacheKey: groupName,
		}, client.InNamespace(p.pod.Namespace)); err != nil {
			return err
		}
	}

	return parallelize.Until(ctx, len(podsInGroup.Items), func(i int) error {
		pod := &podsInGroup.Items[i]
		if controllerutil.RemoveFinalizer(pod, PodFinalizer) {
			return c.Update(ctx, pod)
		}
		return nil
	})
}

func (p *Pod) Skip() bool {
	// Skip pod reconciliation if the pod is found and its managed label is not set or has an incorrect value.
	if v, ok := p.pod.GetLabels()[ManagedLabelKey]; p.isFound && (!ok || v != ManagedLabelValue) {
		return true
	}
	return false
}

// podGroupName returns the value of the GroupNameLabel for the pod object.
// Returns an empty string if there's no such label.
func podGroupName(p corev1.Pod) string {
	return p.GetLabels()[GroupNameLabel]
}

// groupTotalCount returns the value of GroupTotalCountAnnotation for the pod being reconciled at the moment.
// It doesn't check if the whole group has the same total group count annotation value.
func (p *Pod) groupTotalCount() (int, error) {
	if podGroupName(p.pod) == "" {
		return 0, fmt.Errorf("pod doesn't have a '%s' label", GroupNameLabel)
	}

	gtcAnnotation, ok := p.Object().GetAnnotations()[GroupTotalCountAnnotation]
	if !ok {
		return 0, fmt.Errorf("failed to extract '%s' annotation",
			GroupTotalCountAnnotation)
	}

	gtc, err := strconv.Atoi(gtcAnnotation)
	if err != nil {
		return 0, err
	}

	if gtc < 1 {
		return 0, fmt.Errorf("incorrect annotation value '%s=%s': group total count should be greater than zero",
			GroupTotalCountAnnotation, gtcAnnotation)
	}

	return gtc, nil
}

// getRoleHash will filter all the fields of the pod that are relevant to admission (pod role) and return a sha256
// checksum of those fields. This is used to group the pods of the same roles when interacting with the workload.
func getRoleHash(p corev1.Pod) (string, error) {
	if roleHash, ok := p.Annotations[RoleHashAnnotation]; ok {
		return roleHash, nil
	}

	shape := map[string]interface{}{
		"spec": map[string]interface{}{
			"initContainers":            containersShape(p.Spec.InitContainers),
			"containers":                containersShape(p.Spec.Containers),
			"nodeSelector":              p.Spec.NodeSelector,
			"affinity":                  p.Spec.Affinity,
			"tolerations":               p.Spec.Tolerations,
			"runtimeClassName":          p.Spec.RuntimeClassName,
			"priority":                  p.Spec.Priority,
			"topologySpreadConstraints": p.Spec.TopologySpreadConstraints,
			"overhead":                  p.Spec.Overhead,
			"resourceClaims":            p.Spec.ResourceClaims,
		},
	}

	shapeJson, err := json.Marshal(shape)
	if err != nil {
		return "", err
	}

	// Trim hash to 8 characters and return
	return fmt.Sprintf("%x", sha256.Sum256(shapeJson))[:8], nil
}

// Load loads all pods in the group
func (p *Pod) Load(ctx context.Context, c client.Client, key *types.NamespacedName) (removeFinalizers bool, err error) {
	nsKey := strings.Split(key.Namespace, "/")

	if len(nsKey) == 1 {
		if err := c.Get(ctx, *key, &p.pod); err != nil {
			return apierrors.IsNotFound(err), err
		}
		p.isFound = true

		// If the key.Namespace doesn't contain a "group/" prefix, even though
		// the pod has a group name, there's something wrong with the event handler.
		if podGroupName(p.pod) != "" {
			return false, errIncorrectReconcileRequest
		}

		return !p.pod.DeletionTimestamp.IsZero(), nil
	}

	p.isGroup = true

	key.Namespace = nsKey[1]
	p.key = *key

	// Check the expectations before listing pods, otherwise a new pod can sneak in
	// and update the expectations after we've retrieved active pods from the store.
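	// The result is consumed later by FindMatchingWorkloads to decide whether
	// it is safe to clean up more pods.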
	p.satisfiedExcessPods = p.excessPodExpectations.Satisfied(ctrl.LoggerFrom(ctx), *key)

	if err := c.List(ctx, &p.list, client.MatchingFields{
		PodGroupNameCacheKey: key.Name,
	}, client.InNamespace(key.Namespace)); err != nil {
		return false, err
	}

	if len(p.list.Items) > 0 {
		p.isFound = true
		p.pod = p.list.Items[0]
		key.Name = p.pod.Name
	}

	// If none of the pods in group are found,
	// the respective workload should be finalized
	return !p.isFound, nil
}

func (p *Pod) constructGroupPodSets() ([]kueue.PodSet, error) {
	return constructGroupPodSets(p.list.Items)
}

func constructGroupPodSets(pods []corev1.Pod) ([]kueue.PodSet, error) {
	var resultPodSets []kueue.PodSet

	for _, podInGroup := range pods {
		if !isPodRunnableOrSucceeded(&podInGroup) {
			continue
		}

		roleHash, err := getRoleHash(podInGroup)
		if err != nil {
			return nil, fmt.Errorf("failed to calculate pod role hash: %w", err)
		}

		podRoleFound := false
		for psi := range resultPodSets {
			if resultPodSets[psi].Name == roleHash {
				podRoleFound = true
				resultPodSets[psi].Count++
			}
		}

		if !podRoleFound {
			podSet := fromObject(&podInGroup).PodSets()
			podSet[0].Name = roleHash

			resultPodSets = append(resultPodSets, podSet[0])
		}
	}

	slices.SortFunc(resultPodSets, func(a, b kueue.PodSet) int {
		return cmp.Compare(a.Name, b.Name)
	})

	return resultPodSets, nil
}

// validatePodGroupMetadata validates metadata of all members of the pod group
func (p *Pod) validatePodGroupMetadata(r record.EventRecorder, activePods []corev1.Pod) error {
	groupTotalCount, err := p.groupTotalCount()
	if err != nil {
		return err
	}
	originalQueue := jobframework.QueueName(p)

	if len(activePods) < groupTotalCount {
		errMsg := fmt.Sprintf("'%s' group has fewer runnable pods than expected", podGroupName(p.pod))
		r.Eventf(p.Object(), corev1.EventTypeWarning, jobframework.ReasonErrWorkloadCompose, errMsg)
		return jobframework.UnretryableError(errMsg)
	}

	for _, podInGroup := range p.list.Items {
		// Skip failed pods
		if podInGroup.Status.Phase == corev1.PodFailed {
			continue
		}

		if podInGroupQueue := jobframework.QueueNameForObject(&podInGroup); podInGroupQueue != originalQueue {
			return jobframework.UnretryableError(fmt.Sprintf("pods '%s' and '%s' has different queue names: %s!=%s",
				p.pod.GetName(), podInGroup.GetName(),
				originalQueue, podInGroupQueue))
		}

		tc, err := strconv.Atoi(podInGroup.GetAnnotations()[GroupTotalCountAnnotation])
		if err != nil {
			return fmt.Errorf("failed to extract '%s' annotation from the pod '%s': %w",
				GroupTotalCountAnnotation,
				podInGroup.GetName(),
				err)
		}
		if tc != groupTotalCount {
			return jobframework.UnretryableError(fmt.Sprintf("pods '%s' and '%s' has different '%s' values: %d!=%d",
				p.pod.GetName(), podInGroup.GetName(),
				GroupTotalCountAnnotation,
				groupTotalCount, tc))
		}
	}

	return nil
}

// runnableOrSucceededPods returns a slice of active pods in the group
func (p *Pod) runnableOrSucceededPods() []corev1.Pod {
	return utilslices.Pick(p.list.Items, isPodRunnableOrSucceeded)
}

// notRunnableNorSucceededPods returns a slice of inactive pods in the group
func (p *Pod) notRunnableNorSucceededPods() []corev1.Pod {
	return utilslices.Pick(p.list.Items, func(p *corev1.Pod) bool { return !isPodRunnableOrSucceeded(p) })
}

// isPodRunnableOrSucceeded returns whether the Pod can eventually run, is Running or Succeeded.
// A Pod cannot run if it's gated and has a deletionTimestamp.
func isPodRunnableOrSucceeded(p *corev1.Pod) bool {
	if p.DeletionTimestamp != nil && len(p.Spec.SchedulingGates) > 0 {
		return false
	}
	return p.Status.Phase != corev1.PodFailed
}

// lastActiveTime returns the last timestamp on which the pod was observed active:
// - the time the pod was declared Failed
// - the deletion time
func lastActiveTime(p *corev1.Pod) time.Time {
	lastTransition := metav1.Now()
	for _, c := range p.Status.Conditions {
		if c.Type == corev1.ContainersReady {
			if c.Status == corev1.ConditionFalse && c.Reason == string(corev1.PodFailed) {
				lastTransition = c.LastTransitionTime
			}
			break
		}
	}
	deletionTime := ptr.Deref(p.DeletionTimestamp, metav1.Now())
	if lastTransition.Before(&deletionTime) {
		return lastTransition.Time
	}
	return deletionTime.Time
}

// sortInactivePods sorts the provided pods slice based on:
// - finalizer state (pods with finalizers are first)
// - lastActiveTime (pods that were active last are first)
// - creation timestamp (newer pods are first)
func sortInactivePods(inactivePods []corev1.Pod) {
	sort.Slice(inactivePods, func(i, j int) bool {
		pi := &inactivePods[i]
		pj := &inactivePods[j]
		iFin := slices.Contains(pi.Finalizers, PodFinalizer)
		jFin := slices.Contains(pj.Finalizers, PodFinalizer)
		if iFin != jFin {
			return iFin
		}

		iLastActive := lastActiveTime(pi)
		jLastActive := lastActiveTime(pj)

		if iLastActive.Equal(jLastActive) {
			return pi.CreationTimestamp.Before(&pj.CreationTimestamp)
		}
		return jLastActive.Before(iLastActive)
	})
}

// sortActivePods sorts the provided pods slice based on:
// - finalizer state (pods with no finalizers are last)
// - gated state (pods that are still gated are last)
// - creation timestamp (newer pods are last)
func sortActivePods(activePods []corev1.Pod) {
	// Sort active pods by creation timestamp
	sort.Slice(activePods, func(i, j int) bool {
		pi := &activePods[i]
		pj := &activePods[j]
		iFin := slices.Contains(pi.Finalizers, PodFinalizer)
		jFin := slices.Contains(pj.Finalizers, PodFinalizer)
		// Prefer to keep pods that have a finalizer.
		if iFin != jFin {
			return iFin
		}
		iGated := gateIndex(pi) != gateNotFound
		jGated := gateIndex(pj) != gateNotFound
		// Prefer to keep pods that aren't gated.
		if iGated != jGated {
			return !iGated
		}
		return pi.CreationTimestamp.Before(&pj.CreationTimestamp)
	})
}

func (p *Pod) removeExcessPods(ctx context.Context, c client.Client, r record.EventRecorder, extraPods []corev1.Pod) error {
	if len(extraPods) == 0 {
		return nil
	}

	log := ctrl.LoggerFrom(ctx)

	// Extract all the latest created extra pods
	extraPodsUIDs := utilslices.Map(extraPods, func(p *corev1.Pod) types.UID { return p.UID })
	p.excessPodExpectations.ExpectUIDs(log, p.key, extraPodsUIDs)

	// Finalize and delete the active pods created last
	err := parallelize.Until(ctx, len(extraPods), func(i int) error {
		pod := extraPods[i]
		if controllerutil.RemoveFinalizer(&pod, PodFinalizer) {
			log.V(3).Info("Finalizing excess pod in group", "excessPod", klog.KObj(&pod))
			if err := c.Update(ctx, &pod); err != nil {
				// We won't observe this cleanup in the event handler.
				p.excessPodExpectations.ObservedUID(log, p.key, pod.UID)
				return err
			}
		}
		if pod.DeletionTimestamp.IsZero() {
			log.V(3).Info("Deleting excess pod in group", "excessPod", klog.KObj(&pod))
			if err := c.Delete(ctx, &pod); err != nil {
				// We won't observe this cleanup in the event handler.
				p.excessPodExpectations.ObservedUID(log, p.key, pod.UID)
				return err
			}
			r.Event(&pod, corev1.EventTypeNormal, ReasonExcessPodDeleted, "Excess pod deleted")
		}
		return nil
	})
	if err != nil {
		return err
	}
	return nil
}

func (p *Pod) finalizePods(ctx context.Context, c client.Client, extraPods []corev1.Pod) error {
	if len(extraPods) == 0 {
		return nil
	}

	log := ctrl.LoggerFrom(ctx)

	// Extract all the latest created extra pods
	extraPodsUIDs := utilslices.Map(extraPods, func(p *corev1.Pod) types.UID { return p.UID })
	p.excessPodExpectations.ExpectUIDs(log, p.key, extraPodsUIDs)

	err := parallelize.Until(ctx, len(extraPods), func(i int) error {
		pod := extraPods[i]
		if controllerutil.RemoveFinalizer(&pod, PodFinalizer) {
			log.V(3).Info("Finalizing pod in group", "Pod", klog.KObj(&pod))
			if err := c.Update(ctx, &pod); err != nil {
				// We won't observe this cleanup in the event handler.
				p.excessPodExpectations.ObservedUID(log, p.key, pod.UID)
				return err
			}
		} else {
			// We don't expect an event in this case.
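			// The pod had no finalizer to remove, so no update will reach the
			// event handler; mark the UID as observed right away.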
			p.excessPodExpectations.ObservedUID(log, p.key, pod.UID)
		}
		return nil
	})
	if err != nil {
		return err
	}
	return nil
}

func (p *Pod) ensureWorkloadOwnedByAllMembers(ctx context.Context, c client.Client, r record.EventRecorder, workload *kueue.Workload) error {
	oldOwnersCnt := len(workload.GetOwnerReferences())
	for _, pod := range p.list.Items {
		if err := controllerutil.SetOwnerReference(&pod, workload, c.Scheme()); err != nil {
			return err
		}
	}
	newOwnersCnt := len(workload.GetOwnerReferences())
	if addedOwnersCnt := newOwnersCnt - oldOwnersCnt; addedOwnersCnt > 0 {
		log := ctrl.LoggerFrom(ctx).WithValues("workload", klog.KObj(workload))
		log.V(4).Info("Adding owner references for workload", "count", addedOwnersCnt)
		err := c.Update(ctx, workload)
		if err == nil {
			r.Eventf(workload, corev1.EventTypeNormal, ReasonOwnerReferencesAdded, fmt.Sprintf("Added %d owner reference(s)", addedOwnersCnt))
		}
		return err
	}
	return nil
}

func (p *Pod) ConstructComposableWorkload(ctx context.Context, c client.Client, r record.EventRecorder) (*kueue.Workload, error) {
	object := p.Object()
	log := ctrl.LoggerFrom(ctx)

	wl := &kueue.Workload{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:  p.pod.GetNamespace(),
			Labels:     map[string]string{},
			Finalizers: []string{kueue.ResourceInUseFinalizerName},
		},
		Spec: kueue.WorkloadSpec{
			QueueName: jobframework.QueueName(p),
		},
	}

	// Construct workload for a single pod
	if !p.isGroup {
		wl.Spec.PodSets = p.PodSets()

		wl.Name = jobframework.GetWorkloadNameForOwnerWithGVK(p.pod.GetName(), p.GVK())
		jobUid := string(object.GetUID())
		if errs := validation.IsValidLabelValue(jobUid); len(errs) == 0 {
			wl.Labels[controllerconsts.JobUIDLabel] = jobUid
		} else {
			log.V(2).Info(
				"Validation of the owner job UID label has failed. Creating workload without the label.",
				"ValidationErrors", errs,
				"LabelValue", jobUid,
			)
		}

		// add the controller ref
		if err := controllerutil.SetControllerReference(object, wl, c.Scheme()); err != nil {
			return nil, err
		}

		return wl, nil
	}

	if err := p.finalizePods(ctx, c, p.notRunnableNorSucceededPods()); err != nil {
		return nil, err
	}

	activePods := p.runnableOrSucceededPods()

	if wl.Annotations == nil {
		wl.Annotations = make(map[string]string)
	}
	wl.Annotations[IsGroupWorkloadAnnotationKey] = IsGroupWorkloadAnnotationValue

	err := p.validatePodGroupMetadata(r, activePods)
	if err != nil {
		return nil, err
	}

	groupTotalCount, err := p.groupTotalCount()
	if err != nil {
		return nil, err
	}

	// Cleanup extra pods if there are any
	if excessPodsCount := len(activePods) - groupTotalCount; excessPodsCount > 0 {
		sortActivePods(activePods)
		err = p.removeExcessPods(ctx, c, r, activePods[len(activePods)-excessPodsCount:])
		if err != nil {
			return nil, err
		}
		p.list.Items = activePods[:len(activePods)-excessPodsCount]
	}

	// Construct workload for a pod group
	wl.Spec.PodSets, err = p.constructGroupPodSets()
	if err != nil {
		if jobframework.IsUnretryableError(err) {
			r.Eventf(object, corev1.EventTypeWarning, jobframework.ReasonErrWorkloadCompose, err.Error())
		}
		return nil, err
	}

	if len(wl.Spec.PodSets) > 8 {
		return nil, jobframework.UnretryableError(errMsgIncorrectGroupRoleCount)
	}

	wl.Name = podGroupName(p.pod)
	for _, pod := range p.list.Items {
		if err := controllerutil.SetOwnerReference(&pod, wl, c.Scheme()); err != nil {
			return nil, err
		}
	}

	return wl, nil
}

func (p *Pod) ListChildWorkloads(ctx context.Context, c client.Client, key types.NamespacedName) (*kueue.WorkloadList, error) {
	log := ctrl.LoggerFrom(ctx)

	workloads := &kueue.WorkloadList{}

	// Get related workloads for the pod group
	if p.isGroup {
		workload := &kueue.Workload{}
		if err := c.Get(ctx, types.NamespacedName{Name: key.Name, Namespace: key.Namespace}, workload); err != nil {
			if apierrors.IsNotFound(err) {
				return workloads, nil
			}
			log.Error(err, "Unable to get related workload for the pod group")
			return nil, err
		}

		workloads.Items = []kueue.Workload{*workload}
		return workloads, nil
	}

	// List related workloads for the single pod
	if err := c.List(ctx, workloads, client.InNamespace(key.Namespace),
		client.MatchingFields{jobframework.GetOwnerKey(gvk): key.Name}); err != nil {
		log.Error(err, "Unable to get related workload for the single pod")
		return nil, err
	}

	return workloads, nil
}

func (p *Pod) FindMatchingWorkloads(ctx context.Context, c client.Client, r record.EventRecorder) (*kueue.Workload, []*kueue.Workload, error) {
	log := ctrl.LoggerFrom(ctx)
	groupName := podGroupName(p.pod)

	if groupName == "" {
		return jobframework.FindMatchingWorkloads(ctx, c, p)
	}

	// Find a matching workload first if there is one.
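	// For a pod group, the workload is named after the group, so it can be
	// fetched directly by the group name.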
	workload := &kueue.Workload{}
	if err := c.Get(ctx, types.NamespacedName{Name: groupName, Namespace: p.pod.GetNamespace()}, workload); err != nil {
		if apierrors.IsNotFound(err) {
			return nil, nil, nil
		}
		log.Error(err, "Unable to get related workload")
		return nil, nil, err
	}

	// Cleanup excess pods for each workload pod set (role)
	activePods := p.runnableOrSucceededPods()
	inactivePods := p.notRunnableNorSucceededPods()

	var keptPods []corev1.Pod
	var excessActivePods []corev1.Pod
	var replacedInactivePods []corev1.Pod

	for _, ps := range workload.Spec.PodSets {
		// Find all the active and inactive pods of the role
		var roleHashErrors []error
		hasRoleFunc := func(p *corev1.Pod) bool {
			hash, err := getRoleHash(*p)
			if err != nil {
				roleHashErrors = append(roleHashErrors, err)
				return false
			}
			return hash == ps.Name
		}
		roleActivePods := utilslices.Pick(activePods, hasRoleFunc)
		roleInactivePods := utilslices.Pick(inactivePods, hasRoleFunc)
		if len(roleHashErrors) > 0 {
			return nil, nil, fmt.Errorf("failed to calculate pod role hash: %w", errors.Join(roleHashErrors...))
		}

		if excessCount := len(roleActivePods) - int(ps.Count); excessCount > 0 {
			sortActivePods(roleActivePods)
			excessActivePods = append(excessActivePods, roleActivePods[len(roleActivePods)-excessCount:]...)
			keptPods = append(keptPods, roleActivePods[:len(roleActivePods)-excessCount]...)
		} else {
			keptPods = append(keptPods, roleActivePods...)
		}

		if finalizeablePodsCount := min(len(roleInactivePods), len(roleInactivePods)+len(roleActivePods)-int(ps.Count)); finalizeablePodsCount > 0 {
			sortInactivePods(roleInactivePods)
			replacedInactivePods = append(replacedInactivePods, roleInactivePods[len(roleInactivePods)-finalizeablePodsCount:]...)
			keptPods = append(keptPods, roleInactivePods[:len(roleInactivePods)-finalizeablePodsCount]...)
		} else {
			keptPods = append(keptPods, roleInactivePods...)
		}
	}

	jobPodSets, err := constructGroupPodSets(keptPods)
	if err != nil {
		return nil, nil, err
	}

	if len(keptPods) == 0 || !p.equivalentToWorkload(workload, jobPodSets) {
		return nil, []*kueue.Workload{workload}, nil
	}

	// Do not clean up more pods until observing previous operations
	if !p.satisfiedExcessPods {
		return nil, nil, errPendingOps
	}

	p.list.Items = keptPods
	if err := p.ensureWorkloadOwnedByAllMembers(ctx, c, r, workload); err != nil {
		return nil, nil, err
	}

	if err := p.removeExcessPods(ctx, c, r, excessActivePods); err != nil {
		return nil, nil, err
	}

	if err := p.finalizePods(ctx, c, replacedInactivePods); err != nil {
		return nil, nil, err
	}
	return workload, []*kueue.Workload{}, nil
}

func (p *Pod) equivalentToWorkload(wl *kueue.Workload, jobPodSets []kueue.PodSet) bool {
	workloadFinished := apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished)

	if wl.GetName() != podGroupName(p.pod) {
		return false
	}

	if !workloadFinished && len(wl.Spec.PodSets) < len(jobPodSets) {
		return false
	}

	// Match the current state of pod sets
	// to the pod set info in the workload
	j := -1
	for i := range jobPodSets {
		for j++; j < len(wl.Spec.PodSets); j++ {
			if jobPodSets[i].Name == wl.Spec.PodSets[j].Name {
				break
			}
		}
		// If the actual pod set info has a role that the workload doesn't have,
		// consider the workload not equivalent to the pod group
		if j == len(wl.Spec.PodSets) {
			return false
		}
		// Check counts for found pod sets
		if !workloadFinished && wl.Spec.PodSets[j].Count < jobPodSets[i].Count {
			return false
		}
	}

	return true
}

func (p *Pod) ReclaimablePods() ([]kueue.ReclaimablePod, error) {
	if !p.isGroup {
		return []kueue.ReclaimablePod{}, nil
	}

	var result []kueue.ReclaimablePod
	for _, pod := range p.list.Items {
		if pod.Status.Phase == corev1.PodSucceeded {
			roleHash, err := getRoleHash(pod)
			if err != nil {
				return nil, err
			}

			roleFound := false
			for i := range result {
				if result[i].Name == roleHash {
					result[i].Count++
					roleFound = true
				}
			}

			if !roleFound {
				result = append(result, kueue.ReclaimablePod{Name: roleHash, Count: 1})
			}
		}
	}

	return result, nil
}

func IsPodOwnerManagedByKueue(p *Pod) bool {
	if owner := metav1.GetControllerOf(&p.pod); owner != nil {
		return jobframework.IsOwnerManagedByKueue(owner) || (owner.Kind == "RayCluster" && strings.HasPrefix(owner.APIVersion, "ray.io/v1alpha1"))
	}
	return false
}

func GetWorkloadNameForPod(podName string) string {
	return jobframework.GetWorkloadNameForOwnerWithGVK(podName, gvk)
}