k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/resourceclaim/controller.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package resourceclaim

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
	v1informers "k8s.io/client-go/informers/core/v1"
	resourcev1alpha2informers "k8s.io/client-go/informers/resource/v1alpha2"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	v1listers "k8s.io/client-go/listers/core/v1"
	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
	"k8s.io/utils/pointer"
)

const (
	// podResourceClaimIndex is the lookup name for the index function which indexes by pod ResourceClaim templates.
	podResourceClaimIndex = "pod-resource-claim-index"

	// podResourceClaimAnnotation is the special annotation that generated
	// ResourceClaims get. Its value is the pod.spec.resourceClaims[].name
	// for which it was generated. This is used only inside the controller
	// and not documented as part of the Kubernetes API.
	podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name"

	// claimPodOwnerIndex is used to find ResourceClaims which have
	// a specific pod as owner. Values for this index are the pod UID.
	claimPodOwnerIndex = "claim-pod-owner-index"

	// Field manager used to update the pod status.
	fieldManager = "ResourceClaimController"

	maxUIDCacheEntries = 500
)

// Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
type Controller struct {
	// kubeClient is the kube API client used to communicate with the API
	// server.
	kubeClient clientset.Interface

	// claimLister is the shared ResourceClaim lister used to fetch and store ResourceClaim
	// objects from the API server. It is shared with other controllers and
	// therefore the ResourceClaim objects in its store should be treated as immutable.
	claimLister  resourcev1alpha2listers.ResourceClaimLister
	claimsSynced cache.InformerSynced
	claimCache   cache.MutationCache

	// podLister is the shared Pod lister used to fetch Pod
	// objects from the API server.
	// It is shared with other controllers and therefore the Pod objects
	// in its store should be treated as immutable.
	podLister v1listers.PodLister
	podSynced cache.InformerSynced

	// podSchedulingLister is the shared PodSchedulingContext lister used to
	// fetch scheduling objects from the API server. It is shared with other
	// controllers and therefore the objects in its store should be treated
	// as immutable.
	podSchedulingLister resourcev1alpha2listers.PodSchedulingContextLister
	podSchedulingSynced cache.InformerSynced

	// templateLister is the shared ResourceClaimTemplate lister used to
	// fetch template objects from the API server. It is shared with other
	// controllers and therefore the objects in its store should be treated
	// as immutable.
	templateLister  resourcev1alpha2listers.ResourceClaimTemplateLister
	templatesSynced cache.InformerSynced

	// podIndexer has the common PodResourceClaim indexer installed to
	// limit iteration over pods to those of interest.
	podIndexer cache.Indexer

	// recorder is used to record events in the API server
	recorder record.EventRecorder

	queue workqueue.TypedRateLimitingInterface[string]

	// The deletedObjects cache keeps track of Pods for which we know that
	// they have existed and have been removed. For those we can be sure
	// that a ReservedFor entry needs to be removed.
	deletedObjects *uidCache
}

const (
	claimKeyPrefix = "claim:"
	podKeyPrefix   = "pod:"
)

// NewController creates a ResourceClaim controller.
func NewController(
	logger klog.Logger,
	kubeClient clientset.Interface,
	podInformer v1informers.PodInformer,
	podSchedulingInformer resourcev1alpha2informers.PodSchedulingContextInformer,
	claimInformer resourcev1alpha2informers.ResourceClaimInformer,
	templateInformer resourcev1alpha2informers.ResourceClaimTemplateInformer) (*Controller, error) {

	ec := &Controller{
		kubeClient:          kubeClient,
		podLister:           podInformer.Lister(),
		podIndexer:          podInformer.Informer().GetIndexer(),
		podSynced:           podInformer.Informer().HasSynced,
		podSchedulingLister: podSchedulingInformer.Lister(),
		podSchedulingSynced: podSchedulingInformer.Informer().HasSynced,
		claimLister:         claimInformer.Lister(),
		claimsSynced:        claimInformer.Informer().HasSynced,
		templateLister:      templateInformer.Lister(),
		templatesSynced:     templateInformer.Informer().HasSynced,
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: "resource_claim"},
		),
		deletedObjects: newUIDCache(maxUIDCacheEntries),
	}

	metrics.RegisterMetrics()

	if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, false)
		},
		UpdateFunc: func(old, updated interface{}) {
			ec.enqueuePod(logger, updated, false)
		},
		DeleteFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, true)
		},
	}); err != nil {
		return nil, err
	}
	if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			logger.V(6).Info("new claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, obj, false)
		},
		UpdateFunc: func(old, updated interface{}) {
			logger.V(6).Info("updated claim", "claimDump", updated)
			ec.enqueueResourceClaim(logger, updated, false)
		},
		DeleteFunc: func(obj interface{}) {
			logger.V(6).Info("deleted claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, obj, true)
		},
	}); err != nil {
		return nil, err
	}
	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimIndex: podResourceClaimIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}

	// The mutation cache acts as an additional layer for the informer
	// cache and, after a create made by the controller, returns that
	// object until the informer catches up. That is necessary
	// when a ResourceClaim got created, updating the pod status fails,
	// and then a retry occurs before the informer cache is updated.
	// In that scenario, the controller would create another claim
	// instead of continuing with the existing one.
	claimInformerCache := claimInformer.Informer().GetIndexer()
	if err := claimInformerCache.AddIndexers(cache.Indexers{claimPodOwnerIndex: claimPodOwnerIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}
	ec.claimCache = cache.NewIntegerResourceVersionMutationCache(claimInformerCache, claimInformerCache,
		// Very long time to live, unlikely to be needed because
		// the informer cache should get updated soon.
		time.Hour,
		// Allow storing objects not in the underlying cache - that's the point...
		// It's safe because in case of a race (claim is in mutation cache, claim
		// gets deleted, controller updates status based on mutation cache) the
		// "bad" pod status will get detected and fixed when the informer catches up.
		true,
	)

	return ec, nil
}

func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bool) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	pod, ok := obj.(*v1.Pod)
	if !ok {
		// Not a pod?!
		logger.Error(nil, "enqueuePod called for unexpected object", "type", fmt.Sprintf("%T", obj))
		return
	}

	if len(pod.Spec.ResourceClaims) == 0 {
		// Nothing to do for it at all.
		return
	}

	if deleted {
		logger.V(6).Info("pod got deleted", "pod", klog.KObj(pod))
		ec.deletedObjects.Add(pod.UID)
	}

	logger.V(6).Info("pod with resource claims changed", "pod", klog.KObj(pod), "deleted", deleted)

	// Release reservations of a deleted or completed pod?
	if needsClaims, reason := podNeedsClaims(pod, deleted); !needsClaims {
		for _, podClaim := range pod.Spec.ResourceClaims {
			claimName, _, err := resourceclaim.Name(pod, &podClaim)
			switch {
			case err != nil:
				// Either the claim was not created (nothing to do here) or
				// the API changed. The latter will also get reported elsewhere,
				// so here it's just a debug message.
				logger.V(6).Info("Nothing to do for claim during pod change", "err", err, "reason", reason)
			case claimName != nil:
				key := claimKeyPrefix + pod.Namespace + "/" + *claimName
				logger.V(6).Info("Process claim", "pod", klog.KObj(pod), "key", key, "reason", reason)
				ec.queue.Add(key)
			default:
				// Nothing to do, claim wasn't generated.
				logger.V(6).Info("Nothing to do for skipped claim during pod change", "reason", reason)
			}
		}
	}

	needsWork, reason := ec.podNeedsWork(pod)
	if needsWork {
		logger.V(6).Info("enqueuing pod", "pod", klog.KObj(pod), "reason", reason)
		ec.queue.Add(podKeyPrefix + pod.Namespace + "/" + pod.Name)
		return
	}
	logger.V(6).Info("not enqueuing pod", "pod", klog.KObj(pod), "reason", reason)
}

func podNeedsClaims(pod *v1.Pod, deleted bool) (bool, string) {
	if deleted {
		return false, "pod got removed"
	}
	if podutil.IsPodTerminal(pod) {
		return false, "pod has terminated"
	}
	if pod.DeletionTimestamp != nil && pod.Spec.NodeName == "" {
		return false, "pod got deleted before scheduling"
	}
	// Still needs claims.
	return true, "pod might run"
}

// podNeedsWork checks whether a new or modified pod needs to be processed
// further by a worker. It returns a boolean with the result and an explanation
// for it.
func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
	if pod.DeletionTimestamp != nil {
		// Nothing else to do for the pod.
		return false, "pod is deleted"
	}

	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			return true, err.Error()
		}
		// If the claimName is nil, then it has been determined before
		// that the claim is not needed.
		if claimName == nil {
			return false, "claim is not needed"
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if apierrors.IsNotFound(err) {
			if podClaim.Source.ResourceClaimTemplateName != nil {
				return true, "must create ResourceClaim from template"
			}
			// User needs to create claim.
			return false, "claim is missing and must be created by user"
		}
		if err != nil {
			// Shouldn't happen.
			return true, fmt.Sprintf("internal error while checking for claim: %v", err)
		}

		if checkOwner &&
			resourceclaim.IsForPod(pod, claim) != nil {
			// Cannot proceed with the pod unless that other claim gets deleted.
			return false, "conflicting claim needs to be removed by user"
		}

		// This check skips over the reasons below that only apply
		// when a pod has been scheduled already. We need to keep checking
		// for more claims that might need to be created.
		if pod.Spec.NodeName == "" {
			continue
		}

		// Create PodSchedulingContext if the pod got scheduled without triggering
		// delayed allocation.
		//
		// This can happen when:
		// - a user created a pod with spec.nodeName set, perhaps for testing
		// - some scheduler was used which is unaware of DRA
		// - DRA was not enabled in kube-scheduler (version skew, configuration)
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil {
			scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
			if apierrors.IsNotFound(err) {
				return true, "need to create PodSchedulingContext for scheduled pod"
			}
			if err != nil {
				// Shouldn't happen.
				return true, fmt.Sprintf("internal error while checking for PodSchedulingContext: %v", err)
			}
			if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
				// Need to update PodSchedulingContext.
				return true, "need to update PodSchedulingContext for scheduled pod"
			}
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			// Need to reserve it.
			return true, "need to reserve claim for pod"
		}
	}

	return false, "nothing to do"
}

func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return
	}

	if !deleted {
		// When starting up, we have to check all claims to find those with
		// stale pods in ReservedFor. During an update, a pod might get added
		// that already no longer exists.
		key := claimKeyPrefix + claim.Namespace + "/" + claim.Name
		logger.V(6).Info("enqueuing new or updated claim", "claim", klog.KObj(claim), "key", key)
		ec.queue.Add(key)
	} else {
		logger.V(6).Info("not enqueuing deleted claim", "claim", klog.KObj(claim))
	}

	// Also check whether this causes work for any of the currently
	// known pods which use the ResourceClaim.
	objs, err := ec.podIndexer.ByIndex(podResourceClaimIndex, fmt.Sprintf("%s/%s", claim.Namespace, claim.Name))
	if err != nil {
		logger.Error(err, "listing pods from cache")
		return
	}
	if len(objs) == 0 {
		logger.V(6).Info("claim got deleted while not needed by any pod, nothing to do", "claim", klog.KObj(claim))
		return
	}
	for _, obj := range objs {
		ec.enqueuePod(logger, obj, false)
	}
}

func (ec *Controller) Run(ctx context.Context, workers int) {
	defer runtime.HandleCrash()
	defer ec.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting resource claim controller")
	defer logger.Info("Shutting down resource claim controller")

	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	eventBroadcaster.StartLogging(klog.Infof)
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ec.kubeClient.CoreV1().Events("")})
	ec.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "resource_claim"})
	defer eventBroadcaster.Shutdown()

	if !cache.WaitForNamedCacheSync("resource_claim", ctx.Done(), ec.podSynced, ec.podSchedulingSynced, ec.claimsSynced, ec.templatesSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, ec.runWorker, time.Second)
	}

	<-ctx.Done()
}

func (ec *Controller) runWorker(ctx context.Context) {
	for ec.processNextWorkItem(ctx) {
	}
}

func (ec *Controller) processNextWorkItem(ctx context.Context) bool {
	key, shutdown := ec.queue.Get()
	if shutdown {
		return false
	}
	defer ec.queue.Done(key)

	err := ec.syncHandler(ctx, key)
	if err == nil {
		ec.queue.Forget(key)
		return true
	}

	runtime.HandleError(fmt.Errorf("%v failed with: %v", key, err))
	ec.queue.AddRateLimited(key)

	return true
}

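// Work queue keys are "<prefix><namespace>/<name>" strings, for example
// (illustrative names only):
//
//	pod:default/my-pod     -> syncPod("default", "my-pod")
//	claim:default/my-claim -> syncClaim("default", "my-claim")
//
// syncHandler below splits a key at the first ":" to pick the sync function.
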
// syncHandler is invoked for each work item which might need to be processed.
// If an error is returned from this function, the item will be requeued.
func (ec *Controller) syncHandler(ctx context.Context, key string) error {
	sep := strings.Index(key, ":")
	if sep < 0 {
		return fmt.Errorf("unexpected key: %s", key)
	}
	prefix, object := key[0:sep+1], key[sep+1:]
	namespace, name, err := cache.SplitMetaNamespaceKey(object)
	if err != nil {
		return err
	}

	switch prefix {
	case podKeyPrefix:
		return ec.syncPod(ctx, namespace, name)
	case claimKeyPrefix:
		return ec.syncClaim(ctx, namespace, name)
	default:
		return fmt.Errorf("unexpected key prefix: %s", prefix)
	}
}

func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "pod", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	pod, err := ec.podLister.Pods(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("nothing to do for pod, it is gone")
			return nil
		}
		return err
	}

	// Ignore pods which are already getting deleted.
	if pod.DeletionTimestamp != nil {
		logger.V(5).Info("nothing to do for pod, it is marked for deletion")
		return nil
	}

	var newPodClaims map[string]string
	for _, podClaim := range pod.Spec.ResourceClaims {
		if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil {
			if ec.recorder != nil {
				ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err))
			}
			return fmt.Errorf("pod %s/%s, PodResourceClaim %s: %v", namespace, name, podClaim.Name, err)
		}
	}

	if newPodClaims != nil {
		// Patch the pod status with the new information about
		// generated ResourceClaims.
		statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims))
		for podClaimName, resourceClaimName := range newPodClaims {
			statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName))
		}
		podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...))
		if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil {
			return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err)
		}
	}

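	// The server-side apply above records the generated claim names in the
	// pod status. For a hypothetical pod claim "cache" the resulting entry
	// would look like:
	//
	//	status:
	//	  resourceClaimStatuses:
	//	  - name: cache
	//	    resourceClaimName: my-pod-cache-xxxxx
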
	if pod.Spec.NodeName == "" {
		// Scheduler will handle PodSchedulingContext and reservations.
		logger.V(5).Info("nothing to do for pod, scheduler will deal with it")
		return nil
	}

	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			return err
		}
		// If nil, then it has been determined that the claim is not needed
		// and can be skipped.
		if claimName == nil {
			continue
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if apierrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			return fmt.Errorf("retrieve claim: %v", err)
		}
		if checkOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				return err
			}
		}
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil {
			logger.V(5).Info("create PodSchedulingContext because claim needs to be allocated", "resourceClaim", klog.KObj(claim))
			return ec.ensurePodSchedulingContext(ctx, pod)
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			logger.V(5).Info("reserve claim for pod", "resourceClaim", klog.KObj(claim))
			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
				return err
			}
		}
	}

	return nil
}

// handleClaim is invoked for each resource claim of a pod.
func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim, newPodClaims *map[string]string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "podClaim", podClaim.Name)
	ctx = klog.NewContext(ctx, logger)
	logger.V(5).Info("checking", "podClaim", podClaim.Name)

	// resourceclaim.Name checks for the situation that the client doesn't
	// know some future addition to the API. Therefore it gets called here
	// even if there is no template to work on, because if some new field
	// gets added, the expectation might be that the controller does
	// something for it.
	claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim)
	switch {
	case errors.Is(err, resourceclaim.ErrClaimNotFound):
		// Continue below.
	case err != nil:
		return fmt.Errorf("checking for claim before creating it: %v", err)
	case claimName == nil:
		// Nothing to do, no claim needed.
		return nil
	case *claimName != "":
		claimName := *claimName
		// The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses,
		// but perhaps it was deleted accidentally. In that case we re-create it.
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName)
		if err != nil && !apierrors.IsNotFound(err) {
			return err
		}
		if claim != nil {
			var err error
			if mustCheckOwner {
				err = resourceclaim.IsForPod(pod, claim)
			}
			if err == nil {
				// Already created, nothing more to do.
				logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName)
				return nil
			}
			logger.Error(err, "claim that was created for the pod is no longer owned by the pod, creating a new one", "podClaim", podClaim.Name, "resourceClaim", claimName)
		}
	}

	templateName := podClaim.Source.ResourceClaimTemplateName
	if templateName == nil {
		// Nothing to do.
		return nil
	}

	// Before we create a new ResourceClaim, check if there is an orphaned one.
	// This covers the case that the controller has created it, but then fails
	// before it can update the pod status.
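	// (Illustrative) For a pod "my-pod" with a pod claim named "cache",
	// findPodResourceClaim matches a claim owned by the pod's UID that either
	// carries the annotation resource.kubernetes.io/pod-claim-name: cache or,
	// for claims created by Kubernetes <= 1.27, has the deterministic name
	// "my-pod-cache".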
	claim, err := ec.findPodResourceClaim(pod, podClaim)
	if err != nil {
		return fmt.Errorf("finding ResourceClaim for claim %s in pod %s/%s failed: %v", podClaim.Name, pod.Namespace, pod.Name, err)
	}

	if claim == nil {
		template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName)
		if err != nil {
			return fmt.Errorf("resource claim template %q: %v", *templateName, err)
		}

		// Create the ResourceClaim with pod as owner, with a generated name that uses
		// <pod>-<claim name> as base.
		isTrue := true
		annotations := template.Spec.ObjectMeta.Annotations
		if annotations == nil {
			annotations = make(map[string]string)
		}
		annotations[podResourceClaimAnnotation] = podClaim.Name
		generateName := pod.Name + "-" + podClaim.Name + "-"
		maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters.
		if len(generateName) > maxBaseLen {
			// We could leave truncation to the apiserver, but as
			// it removes at the end, we would lose everything
			// from the pod claim name when the pod name is long.
			// We can do better and truncate both strings,
			// proportional to their length.
			generateName = pod.Name[0:len(pod.Name)*maxBaseLen/len(generateName)] +
				"-" +
				podClaim.Name[0:len(podClaim.Name)*maxBaseLen/len(generateName)]
		}
		claim = &resourcev1alpha2.ResourceClaim{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: generateName,
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion:         "v1",
						Kind:               "Pod",
						Name:               pod.Name,
						UID:                pod.UID,
						Controller:         &isTrue,
						BlockOwnerDeletion: &isTrue,
					},
				},
				Annotations: annotations,
				Labels:      template.Spec.ObjectMeta.Labels,
			},
			Spec: template.Spec.Spec,
		}
		metrics.ResourceClaimCreateAttempts.Inc()
		claimName := claim.Name
		claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{})
		if err != nil {
			metrics.ResourceClaimCreateFailures.Inc()
			return fmt.Errorf("create ResourceClaim %s: %v", claimName, err)
		}
		ec.claimCache.Mutation(claim)
	}

	// Remember the new ResourceClaim for a batch PodStatus update in our caller.
	if *newPodClaims == nil {
		*newPodClaims = make(map[string]string)
	}
	(*newPodClaims)[podClaim.Name] = claim.Name

	return nil
}

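// Worked example for the proportional truncation above (illustrative names):
// with a 40-character pod name and a 30-character pod claim name, the base
// "<pod>-<claim>-" is 72 characters long, so the two parts are cut to
// 40*57/72 = 31 and 30*57/72 = 23 characters. The resulting 55-character base
// stays below the 57-character limit and leaves room for the random suffix
// that the API server appends for GenerateName.
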
// findPodResourceClaim looks for an existing ResourceClaim with the right
// annotation (ties it to the pod claim) and the right ownership (ties it to
// the pod).
func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceClaim) (*resourcev1alpha2.ResourceClaim, error) {
	// Only claims owned by the pod will get returned here.
	claims, err := ec.claimCache.ByIndex(claimPodOwnerIndex, string(pod.UID))
	if err != nil {
		return nil, err
	}
	deterministicName := pod.Name + "-" + podClaim.Name // Kubernetes <= 1.27 behavior.
	for _, claimObj := range claims {
		claim, ok := claimObj.(*resourcev1alpha2.ResourceClaim)
		if !ok {
			return nil, fmt.Errorf("unexpected object of type %T returned by claim cache", claimObj)
		}
		podClaimName, ok := claim.Annotations[podResourceClaimAnnotation]
		if ok && podClaimName != podClaim.Name {
			continue
		}

		// No annotation? It might be a ResourceClaim created for
		// the pod with a previous Kubernetes release where the
		// ResourceClaim name was deterministic, in which case
		// we have to use it and update the new pod status
		// field accordingly.
		if !ok && claim.Name != deterministicName {
			continue
		}

		// Pick the first one that matches. There shouldn't be more than one. If there is,
		// then all others will be ignored until the pod gets deleted. Then they also get
		// cleaned up.
		return claim, nil
	}
	return nil, nil
}

func (ec *Controller) ensurePodSchedulingContext(ctx context.Context, pod *v1.Pod) error {
	scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
	if err != nil && !apierrors.IsNotFound(err) {
		return fmt.Errorf("retrieve PodSchedulingContext: %v", err)
	}
	if scheduling == nil {
		scheduling = &resourcev1alpha2.PodSchedulingContext{
			ObjectMeta: metav1.ObjectMeta{
				Name:      pod.Name,
				Namespace: pod.Namespace,
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion: "v1",
						Kind:       "Pod",
						Name:       pod.Name,
						UID:        pod.UID,
						Controller: pointer.Bool(true),
					},
				},
			},
			Spec: resourcev1alpha2.PodSchedulingContextSpec{
				SelectedNode: pod.Spec.NodeName,
				// There is no need for negotiation about
				// potential and suitable nodes anymore, so
				// PotentialNodes can be left empty.
			},
		}
		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Create(ctx, scheduling, metav1.CreateOptions{}); err != nil {
			return fmt.Errorf("create PodSchedulingContext: %v", err)
		}
		return nil
	}

	if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
		scheduling := scheduling.DeepCopy()
		scheduling.Spec.SelectedNode = pod.Spec.NodeName
		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Update(ctx, scheduling, metav1.UpdateOptions{}); err != nil {
			return fmt.Errorf("update spec.selectedNode in PodSchedulingContext: %v", err)
		}
	}

	return nil
}

func (ec *Controller) reserveForPod(ctx context.Context, pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) error {
	claim = claim.DeepCopy()
	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
		resourcev1alpha2.ResourceClaimConsumerReference{
			Resource: "pods",
			Name:     pod.Name,
			UID:      pod.UID,
		})
	if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
		return fmt.Errorf("reserve claim for pod: %v", err)
	}
	return nil
}

func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	claim, err := ec.claimLister.ResourceClaims(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("nothing to do for claim, it is gone")
			return nil
		}
		return err
	}

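	// The remainder of syncClaim does two things: it prunes status.reservedFor
	// entries whose pods are known to be gone or finished, and, once nothing
	// valid is reserved anymore, it may deallocate the claim and delete
	// generated claims whose owning pod no longer needs them.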
	// Check if the ReservedFor entries are all still valid.
	valid := make([]resourcev1alpha2.ResourceClaimConsumerReference, 0, len(claim.Status.ReservedFor))
	for _, reservedFor := range claim.Status.ReservedFor {
		if reservedFor.APIGroup == "" &&
			reservedFor.Resource == "pods" {
			// A pod falls into one of three categories:
			// - we have it in our cache -> don't remove it until we are told that it got removed
			// - we don't have it in our cache anymore, but we have seen it before -> it was deleted, remove it
			// - not in our cache, not seen -> double-check with API server before removal

			keepEntry := true

			// Tracking deleted pods in the LRU cache is an
			// optimization. Without this cache, the code would
			// have to do the API call below for every deleted pod
			// to ensure that the pod really doesn't exist. With
			// the cache, most of the time the pod will be recorded
			// as deleted and the API call can be avoided.
			if ec.deletedObjects.Has(reservedFor.UID) {
				// We know that the pod was deleted. This is
				// easy to check and thus is done first.
				keepEntry = false
			} else {
				pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name)
				switch {
				case err != nil && !apierrors.IsNotFound(err):
					return err
				case err != nil:
					// We might not have it in our informer cache
					// yet. Removing the pod while the scheduler is
					// scheduling it would be bad. We have to be
					// absolutely sure and thus have to check with
					// the API server.
					pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{})
					if err != nil && !apierrors.IsNotFound(err) {
						return err
					}
					if pod == nil || pod.UID != reservedFor.UID {
						logger.V(6).Info("remove reservation because pod is gone or got replaced", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
						keepEntry = false
					}
				case pod.UID != reservedFor.UID:
					logger.V(6).Info("remove reservation because pod got replaced with new instance", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				case isPodDone(pod):
					logger.V(6).Info("remove reservation because pod will not run anymore", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				}
			}

			if keepEntry {
				valid = append(valid, reservedFor)
			}
			continue
		}

		// TODO: support generic object lookup
		return fmt.Errorf("unsupported ReservedFor entry: %v", reservedFor)
	}

	builtinControllerFinalizer := slices.Index(claim.Finalizers, resourcev1alpha2.Finalizer)
	logger.V(5).Info("claim reserved for counts", "currentCount", len(claim.Status.ReservedFor), "claim", klog.KRef(namespace, name), "updatedCount", len(valid), "builtinController", builtinControllerFinalizer >= 0)
	if len(valid) < len(claim.Status.ReservedFor) {
		// This is not using a patch because we want the update to fail if anything
		// changed in the meantime.
		claim := claim.DeepCopy()
		claim.Status.ReservedFor = valid

		// When a ResourceClaim uses delayed allocation, then it makes sense to
		// deallocate the claim as soon as the last consumer stops using
		// it. This ensures that the claim can be allocated again as needed by
		// some future consumer instead of trying to schedule that consumer
		// onto the node that was chosen for the previous consumer. It also
		// releases the underlying resources for use by other claims.
		//
		// This has to be triggered by the transition from "was being used" to
		// "is not used anymore" because a DRA driver is not required to set
		// `status.reservedFor` together with `status.allocation`, i.e. a claim
		// that is "currently unused" should not get deallocated.
		//
		// This does not matter for claims that were created for a pod. For
		// those, the resource claim controller will trigger deletion when the
		// pod is done. However, it doesn't hurt to also trigger deallocation
		// for such claims and not checking for them keeps this code simpler.
		if len(valid) == 0 {
			if builtinControllerFinalizer >= 0 {
				if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
					claim.DeletionTimestamp != nil {
					// Allocated by scheduler with structured parameters. We can "deallocate"
					// by clearing the allocation.
					claim.Status.Allocation = nil
				}
			} else if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
				// DRA driver controller in the control plane
				// needs to do the deallocation.
				claim.Status.DeallocationRequested = true
			}
			// In all other cases, we keep the claim allocated, in particular for immediate allocation
			// with a control plane controller.
		}

		claim, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}

		// Now also remove the finalizer if it is not needed anymore.
		// Note that the index may have changed as a result of the UpdateStatus call.
		builtinControllerFinalizer := slices.Index(claim.Finalizers, resourcev1alpha2.Finalizer)
		if builtinControllerFinalizer >= 0 && claim.Status.Allocation == nil {
			claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
			if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {
				return err
			}
		}
	} else if builtinControllerFinalizer >= 0 && claim.DeletionTimestamp != nil && len(valid) == 0 {
		claim := claim.DeepCopy()
		if claim.Status.Allocation != nil {
			// This can happen when a claim with immediate allocation
			// stopped being used, remained allocated, and then got
			// deleted. As above we then need to clear the allocation.
			claim.Status.Allocation = nil
			var err error
			claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
			if err != nil {
				return err
			}
		}
		// Whether it was allocated or not, remove the finalizer to unblock removal.
		claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
	}

	if len(valid) == 0 {
		// Claim is not reserved. If it was generated for a pod and
		// that pod is not going to run, the claim can be
		// deleted. Normally the garbage collector does that, but the
		// pod itself might not get deleted for a while.
		podName, podUID := owningPod(claim)
		if podName != "" {
			pod, err := ec.podLister.Pods(claim.Namespace).Get(podName)
			switch {
			case err == nil:
				// Pod already replaced or not going to run?
				if pod.UID != podUID || isPodDone(pod) {
					// We are certain that the owning pod is not going to need
					// the claim and therefore remove the claim.
					logger.V(5).Info("deleting unused generated claim", "claim", klog.KObj(claim), "pod", klog.KObj(pod))
					err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Delete(ctx, claim.Name, metav1.DeleteOptions{})
					if err != nil {
						return fmt.Errorf("delete claim: %v", err)
					}
				} else {
					logger.V(6).Info("wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod)
				}
			case apierrors.IsNotFound(err):
				// We might not know the pod *yet*. Instead of doing an expensive API call,
				// let the garbage collector handle the case that the pod is truly gone.
				logger.V(5).Info("pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName))
			default:
				return fmt.Errorf("lookup pod: %v", err)
			}
		} else {
			logger.V(5).Info("claim not generated for a pod", "claim", klog.KObj(claim))
		}
	}

	return nil
}

func owningPod(claim *resourcev1alpha2.ResourceClaim) (string, types.UID) {
	for _, owner := range claim.OwnerReferences {
		if pointer.BoolDeref(owner.Controller, false) &&
			owner.APIVersion == "v1" &&
			owner.Kind == "Pod" {
			return owner.Name, owner.UID
		}
	}
	return "", ""
}

// podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
// namespace/name) for ResourceClaims or ResourceClaimTemplates in a given pod.
func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
	pod, ok := obj.(*v1.Pod)
	if !ok {
		return []string{}, nil
	}
	keys := []string{}
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &podClaim)
		if err != nil || claimName == nil {
			// Index functions are not supposed to fail, the caller will panic.
			// For both error reasons (claim not created yet, unknown API)
			// we simply don't index.
			continue
		}
		keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
	}
	return keys, nil
}

// isPodDone returns true if it is certain that none of the containers are running and never will run.
func isPodDone(pod *v1.Pod) bool {
	return podutil.IsPodPhaseTerminal(pod.Status.Phase) ||
		// Deleted and not scheduled:
		pod.DeletionTimestamp != nil && pod.Spec.NodeName == ""
}

// claimPodOwnerIndexFunc is an index function that returns the pod UIDs of
// all pods which own the resource claim. Should only be one, though.
func claimPodOwnerIndexFunc(obj interface{}) ([]string, error) {
	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return nil, nil
	}
	var keys []string
	for _, owner := range claim.OwnerReferences {
		if owner.Controller != nil &&
			*owner.Controller &&
			owner.APIVersion == "v1" &&
			owner.Kind == "Pod" {
			keys = append(keys, string(owner.UID))
		}
	}
	return keys, nil
}