k8s.io/kubernetes@v1.29.3/pkg/controller/resourceclaim/controller.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package resourceclaim

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
	v1informers "k8s.io/client-go/informers/core/v1"
	resourcev1alpha2informers "k8s.io/client-go/informers/resource/v1alpha2"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	v1listers "k8s.io/client-go/listers/core/v1"
	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
	"k8s.io/utils/pointer"
)

const (
	// podResourceClaimIndex is the lookup name for the index function which indexes by pod ResourceClaim templates.
	podResourceClaimIndex = "pod-resource-claim-index"

	// podResourceClaimAnnotation is the special annotation that generated
	// ResourceClaims get. Its value is the pod.spec.resourceClaims[].name
	// for which it was generated. This is used only inside the controller
	// and not documented as part of the Kubernetes API.
	podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name"

	// claimPodOwnerIndex is used to find ResourceClaims which have
	// a specific pod as owner. Values for this index are the pod UID.
	claimPodOwnerIndex = "claim-pod-owner-index"

	// Field manager used to update the pod status.
	fieldManager = "ResourceClaimController"

	maxUIDCacheEntries = 500
)

// Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
type Controller struct {
	// kubeClient is the kube API client used to communicate with the API
	// server.
	kubeClient clientset.Interface

	// claimLister is the shared ResourceClaim lister used to fetch and store ResourceClaim
	// objects from the API server. It is shared with other controllers and
	// therefore the ResourceClaim objects in its store should be treated as immutable.
	claimLister  resourcev1alpha2listers.ResourceClaimLister
	claimsSynced cache.InformerSynced
	claimCache   cache.MutationCache

	// podLister is the shared Pod lister used to fetch Pod
	// objects from the API server. It is shared with other controllers and
	// therefore the Pod objects in its store should be treated as immutable.
	podLister v1listers.PodLister
	podSynced cache.InformerSynced

	// podSchedulingLister is the shared PodSchedulingContext lister used to
	// fetch scheduling objects from the API server. It is shared with other
	// controllers and therefore the objects in its store should be treated
	// as immutable.
	podSchedulingLister resourcev1alpha2listers.PodSchedulingContextLister
	podSchedulingSynced cache.InformerSynced

	// templateLister is the shared ResourceClaimTemplate lister used to
	// fetch template objects from the API server. It is shared with other
	// controllers and therefore the objects in its store should be treated
	// as immutable.
	templateLister  resourcev1alpha2listers.ResourceClaimTemplateLister
	templatesSynced cache.InformerSynced

	// podIndexer has the common PodResourceClaim indexer installed to
	// limit iteration over pods to those of interest.
	podIndexer cache.Indexer

	// recorder is used to record events in the API server.
	recorder record.EventRecorder

	queue workqueue.RateLimitingInterface

	// The deletedObjects cache keeps track of Pods that are known to have
	// existed and to have been removed since. For those we can be sure
	// that a ReservedFor entry needs to be removed.
	deletedObjects *uidCache
}

const (
	claimKeyPrefix = "claim:"
	podKeyPrefix   = "pod:"
)

// NewController creates a ResourceClaim controller.
func NewController(
	logger klog.Logger,
	kubeClient clientset.Interface,
	podInformer v1informers.PodInformer,
	podSchedulingInformer resourcev1alpha2informers.PodSchedulingContextInformer,
	claimInformer resourcev1alpha2informers.ResourceClaimInformer,
	templateInformer resourcev1alpha2informers.ResourceClaimTemplateInformer) (*Controller, error) {

	ec := &Controller{
		kubeClient:          kubeClient,
		podLister:           podInformer.Lister(),
		podIndexer:          podInformer.Informer().GetIndexer(),
		podSynced:           podInformer.Informer().HasSynced,
		podSchedulingLister: podSchedulingInformer.Lister(),
		podSchedulingSynced: podSchedulingInformer.Informer().HasSynced,
		claimLister:         claimInformer.Lister(),
		claimsSynced:        claimInformer.Informer().HasSynced,
		templateLister:      templateInformer.Lister(),
		templatesSynced:     templateInformer.Informer().HasSynced,
		queue:               workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "resource_claim"),
		deletedObjects:      newUIDCache(maxUIDCacheEntries),
	}

	metrics.RegisterMetrics()

	if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, false)
		},
		UpdateFunc: func(old, updated interface{}) {
			ec.enqueuePod(logger, updated, false)
		},
		DeleteFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, true)
		},
	}); err != nil {
		return nil, err
	}
	if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			logger.V(6).Info("new claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, obj, false)
		},
		UpdateFunc: func(old, updated interface{}) {
			logger.V(6).Info("updated claim", "claimDump", updated)
			ec.enqueueResourceClaim(logger, updated, false)
		},
		DeleteFunc: func(obj interface{}) {
			logger.V(6).Info("deleted claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, obj, true)
		},
	}); err != nil {
		return nil, err
	}
	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimIndex: podResourceClaimIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}

	// The mutation cache acts as an additional layer on top of the informer
	// cache and, after a create made by the controller, returns that
	// object until the informer catches up. That is necessary
	// when a ResourceClaim got created, updating the pod status fails,
	// and then a retry occurs before the informer cache is updated.
	// In that scenario, the controller would create another claim
	// instead of continuing with the existing one.
	claimInformerCache := claimInformer.Informer().GetIndexer()
	if err := claimInformerCache.AddIndexers(cache.Indexers{claimPodOwnerIndex: claimPodOwnerIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}
	ec.claimCache = cache.NewIntegerResourceVersionMutationCache(claimInformerCache, claimInformerCache,
		// Very long time to live, unlikely to be needed because
		// the informer cache should get updated soon.
		time.Hour,
		// Allow storing objects not in the underlying cache - that's the point...
		// It's safe because in case of a race (claim is in mutation cache, claim
		// gets deleted, controller updates status based on mutation cache) the
		// "bad" pod status will get detected and fixed when the informer catches up.
		true,
	)

	return ec, nil
}

func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bool) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	pod, ok := obj.(*v1.Pod)
	if !ok {
		// Not a pod?!
		logger.Error(nil, "enqueuePod called for unexpected object", "type", fmt.Sprintf("%T", obj))
		return
	}

	if len(pod.Spec.ResourceClaims) == 0 {
		// Nothing to do for it at all.
		return
	}

	if deleted {
		logger.V(6).Info("pod got deleted", "pod", klog.KObj(pod))
		ec.deletedObjects.Add(pod.UID)
	}

	logger.V(6).Info("pod with resource claims changed", "pod", klog.KObj(pod), "deleted", deleted)

	// Release reservations of a deleted or completed pod?
	if needsClaims, reason := podNeedsClaims(pod, deleted); !needsClaims {
		for _, podClaim := range pod.Spec.ResourceClaims {
			claimName, _, err := resourceclaim.Name(pod, &podClaim)
			switch {
			case err != nil:
				// Either the claim was not created (nothing to do here) or
				// the API changed. The latter will also get reported elsewhere,
				// so here it's just a debug message.
				logger.V(6).Info("Nothing to do for claim during pod change", "err", err, "reason", reason)
			case claimName != nil:
				key := claimKeyPrefix + pod.Namespace + "/" + *claimName
				logger.V(6).Info("Process claim", "pod", klog.KObj(pod), "key", key, "reason", reason)
				ec.queue.Add(key)
			default:
				// Nothing to do, claim wasn't generated.
				logger.V(6).Info("Nothing to do for skipped claim during pod change", "reason", reason)
			}
		}
	}

	needsWork, reason := ec.podNeedsWork(pod)
	if needsWork {
		logger.V(6).Info("enqueuing pod", "pod", klog.KObj(pod), "reason", reason)
		ec.queue.Add(podKeyPrefix + pod.Namespace + "/" + pod.Name)
		return
	}
	logger.V(6).Info("not enqueuing pod", "pod", klog.KObj(pod), "reason", reason)
}

func podNeedsClaims(pod *v1.Pod, deleted bool) (bool, string) {
	if deleted {
		return false, "pod got removed"
	}
	if podutil.IsPodTerminal(pod) {
		return false, "pod has terminated"
	}
	if pod.DeletionTimestamp != nil && pod.Spec.NodeName == "" {
		return false, "pod got deleted before scheduling"
	}
	// Still needs claims.
	return true, "pod might run"
}

// podNeedsWork checks whether a new or modified pod needs to be processed
// further by a worker. It returns a boolean with the result and an explanation
// for it.
func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
	if pod.DeletionTimestamp != nil {
		// Nothing else to do for the pod.
		return false, "pod is deleted"
	}

	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			return true, err.Error()
		}
		// If the claimName is nil, then it has been determined before
		// that the claim is not needed.
		if claimName == nil {
			return false, "claim is not needed"
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if apierrors.IsNotFound(err) {
			if podClaim.Source.ResourceClaimTemplateName != nil {
				return true, "must create ResourceClaim from template"
			}
			// User needs to create claim.
			return false, "claim is missing and must be created by user"
		}
		if err != nil {
			// Shouldn't happen.
			return true, fmt.Sprintf("internal error while checking for claim: %v", err)
		}

		if checkOwner &&
			resourceclaim.IsForPod(pod, claim) != nil {
			// Cannot proceed with the pod unless that other claim gets deleted.
			return false, "conflicting claim needs to be removed by user"
		}

		// This check skips over the reasons below that only apply
		// when a pod has been scheduled already. We need to keep checking
		// for more claims that might need to be created.
		if pod.Spec.NodeName == "" {
			continue
		}

		// Create PodSchedulingContext if the pod got scheduled without triggering
		// delayed allocation.
		//
		// This can happen when:
		// - a user created a pod with spec.nodeName set, perhaps for testing
		// - some scheduler was used which is unaware of DRA
		// - DRA was not enabled in kube-scheduler (version skew, configuration)
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil {
			scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
			if apierrors.IsNotFound(err) {
				return true, "need to create PodSchedulingContext for scheduled pod"
			}
			if err != nil {
				// Shouldn't happen.
				return true, fmt.Sprintf("internal error while checking for PodSchedulingContext: %v", err)
			}
			if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
				// Need to update PodSchedulingContext.
				return true, "need to update PodSchedulingContext for scheduled pod"
			}
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			// Need to reserve it.
			return true, "need to reserve claim for pod"
		}
	}

	return false, "nothing to do"
}

func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return
	}

	if !deleted {
		// When starting up, we have to check all claims to find those with
		// stale pods in ReservedFor. During an update, a pod might get added
		// that already no longer exists.
		key := claimKeyPrefix + claim.Namespace + "/" + claim.Name
		logger.V(6).Info("enqueuing new or updated claim", "claim", klog.KObj(claim), "key", key)
		ec.queue.Add(key)
	} else {
		logger.V(6).Info("not enqueuing deleted claim", "claim", klog.KObj(claim))
	}

	// Also check whether this causes work for any of the currently
	// known pods which use the ResourceClaim.
	objs, err := ec.podIndexer.ByIndex(podResourceClaimIndex, fmt.Sprintf("%s/%s", claim.Namespace, claim.Name))
	if err != nil {
		logger.Error(err, "listing pods from cache")
		return
	}
	if len(objs) == 0 {
		logger.V(6).Info("claim got deleted while not needed by any pod, nothing to do", "claim", klog.KObj(claim))
		return
	}
	for _, obj := range objs {
		ec.enqueuePod(logger, obj, false)
	}
}

func (ec *Controller) Run(ctx context.Context, workers int) {
	defer runtime.HandleCrash()
	defer ec.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting resource claim controller")
	defer logger.Info("Shutting down resource claim controller")

	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(klog.Infof)
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ec.kubeClient.CoreV1().Events("")})
	ec.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "resource_claim"})
	defer eventBroadcaster.Shutdown()

	if !cache.WaitForNamedCacheSync("resource_claim", ctx.Done(), ec.podSynced, ec.claimsSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, ec.runWorker, time.Second)
	}

	<-ctx.Done()
}

func (ec *Controller) runWorker(ctx context.Context) {
	for ec.processNextWorkItem(ctx) {
	}
}

func (ec *Controller) processNextWorkItem(ctx context.Context) bool {
	key, shutdown := ec.queue.Get()
	if shutdown {
		return false
	}
	defer ec.queue.Done(key)

	err := ec.syncHandler(ctx, key.(string))
	if err == nil {
		ec.queue.Forget(key)
		return true
	}

	runtime.HandleError(fmt.Errorf("%v failed with: %v", key, err))
	ec.queue.AddRateLimited(key)

	return true
}

// syncHandler is invoked for each work item which might need to be processed.
// If an error is returned from this function, the item will be requeued.
func (ec *Controller) syncHandler(ctx context.Context, key string) error {
	sep := strings.Index(key, ":")
	if sep < 0 {
		return fmt.Errorf("unexpected key: %s", key)
	}
	prefix, object := key[0:sep+1], key[sep+1:]
	namespace, name, err := cache.SplitMetaNamespaceKey(object)
	if err != nil {
		return err
	}

	switch prefix {
	case podKeyPrefix:
		return ec.syncPod(ctx, namespace, name)
	case claimKeyPrefix:
		return ec.syncClaim(ctx, namespace, name)
	default:
		return fmt.Errorf("unexpected key prefix: %s", prefix)
	}

}

func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "pod", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	pod, err := ec.podLister.Pods(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("nothing to do for pod, it is gone")
			return nil
		}
		return err
	}

	// Ignore pods which are already getting deleted.
	if pod.DeletionTimestamp != nil {
		logger.V(5).Info("nothing to do for pod, it is marked for deletion")
		return nil
	}

	var newPodClaims map[string]string
	for _, podClaim := range pod.Spec.ResourceClaims {
		if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil {
			if ec.recorder != nil {
				ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err))
			}
			return fmt.Errorf("pod %s/%s, PodResourceClaim %s: %v", namespace, name, podClaim.Name, err)
		}
	}

	if newPodClaims != nil {
		// Patch the pod status with the new information about
		// generated ResourceClaims.
		statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims))
		for podClaimName, resourceClaimName := range newPodClaims {
			statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName))
		}
		podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...))
		if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil {
			return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err)
		}
	}

	if pod.Spec.NodeName == "" {
		// Scheduler will handle PodSchedulingContext and reservations.
		logger.V(5).Info("nothing to do for pod, scheduler will deal with it")
		return nil
	}

	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			return err
		}
		// If nil, then it has been determined that the claim is not needed
		// and can be skipped.
		if claimName == nil {
			continue
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if apierrors.IsNotFound(err) {
			return nil
		}
		if err != nil {
			return fmt.Errorf("retrieve claim: %v", err)
		}
		if checkOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				return err
			}
		}
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil {
			logger.V(5).Info("create PodSchedulingContext because claim needs to be allocated", "resourceClaim", klog.KObj(claim))
			return ec.ensurePodSchedulingContext(ctx, pod)
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			logger.V(5).Info("reserve claim for pod", "resourceClaim", klog.KObj(claim))
			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
				return err
			}
		}
	}

	return nil
}

// handleClaim is invoked for each resource claim of a pod.
func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim, newPodClaims *map[string]string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "podClaim", podClaim.Name)
	ctx = klog.NewContext(ctx, logger)
	logger.V(5).Info("checking", "podClaim", podClaim.Name)

	// resourceclaim.Name checks for the situation that the client doesn't
	// know some future addition to the API. Therefore it gets called here
	// even if there is no template to work on, because if some new field
	// gets added, the expectation might be that the controller does
	// something for it.
	claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim)
	switch {
	case errors.Is(err, resourceclaim.ErrClaimNotFound):
		// Continue below.
	case err != nil:
		return fmt.Errorf("checking for claim before creating it: %v", err)
	case claimName == nil:
		// Nothing to do, no claim needed.
		return nil
	case *claimName != "":
		claimName := *claimName
		// The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses,
		// but perhaps it was deleted accidentally. In that case we re-create it.
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName)
		if err != nil && !apierrors.IsNotFound(err) {
			return err
		}
		if claim != nil {
			var err error
			if mustCheckOwner {
				err = resourceclaim.IsForPod(pod, claim)
			}
			if err == nil {
				// Already created, nothing more to do.
				logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName)
				return nil
			}
			logger.Error(err, "claim that was created for the pod is no longer owned by the pod, creating a new one", "podClaim", podClaim.Name, "resourceClaim", claimName)
		}
	}

	templateName := podClaim.Source.ResourceClaimTemplateName
	if templateName == nil {
		// Nothing to do.
		return nil
	}

	// Before we create a new ResourceClaim, check if there is an orphaned one.
	// This covers the case that the controller has created it, but then fails
	// before it can update the pod status.
	claim, err := ec.findPodResourceClaim(pod, podClaim)
	if err != nil {
		return fmt.Errorf("finding ResourceClaim for claim %s in pod %s/%s failed: %v", podClaim.Name, pod.Namespace, pod.Name, err)
	}

	if claim == nil {
		template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName)
		if err != nil {
			return fmt.Errorf("resource claim template %q: %v", *templateName, err)
		}

		// Create the ResourceClaim with pod as owner, with a generated name that uses
		// <pod>-<claim name> as base.
		isTrue := true
		annotations := template.Spec.ObjectMeta.Annotations
		if annotations == nil {
			annotations = make(map[string]string)
		}
		annotations[podResourceClaimAnnotation] = podClaim.Name
		generateName := pod.Name + "-" + podClaim.Name + "-"
		maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters.
		if len(generateName) > maxBaseLen {
			// We could leave truncation to the apiserver, but as
			// it removes at the end, we would lose everything
			// from the pod claim name when the pod name is long.
			// We can do better and truncate both strings,
			// proportional to their length.
			generateName = pod.Name[0:len(pod.Name)*maxBaseLen/len(generateName)] +
				"-" +
				podClaim.Name[0:len(podClaim.Name)*maxBaseLen/len(generateName)]
		}
		claim = &resourcev1alpha2.ResourceClaim{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: generateName,
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion:         "v1",
						Kind:               "Pod",
						Name:               pod.Name,
						UID:                pod.UID,
						Controller:         &isTrue,
						BlockOwnerDeletion: &isTrue,
					},
				},
				Annotations: annotations,
				Labels:      template.Spec.ObjectMeta.Labels,
			},
			Spec: template.Spec.Spec,
		}
		metrics.ResourceClaimCreateAttempts.Inc()
		claimName := claim.Name
		claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{})
		if err != nil {
			metrics.ResourceClaimCreateFailures.Inc()
			return fmt.Errorf("create ResourceClaim %s: %v", claimName, err)
		}
		ec.claimCache.Mutation(claim)
	}

	// Remember the new ResourceClaim for a batch PodStatus update in our caller.
	if *newPodClaims == nil {
		*newPodClaims = make(map[string]string)
	}
	(*newPodClaims)[podClaim.Name] = claim.Name

	return nil
}

// findPodResourceClaim looks for an existing ResourceClaim with the right
// annotation (ties it to the pod claim) and the right ownership (ties it to
// the pod).
func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceClaim) (*resourcev1alpha2.ResourceClaim, error) {
	// Only claims owned by the pod will get returned here.
	claims, err := ec.claimCache.ByIndex(claimPodOwnerIndex, string(pod.UID))
	if err != nil {
		return nil, err
	}
	deterministicName := pod.Name + "-" + podClaim.Name // Kubernetes <= 1.27 behavior.
	for _, claimObj := range claims {
		claim, ok := claimObj.(*resourcev1alpha2.ResourceClaim)
		if !ok {
			return nil, fmt.Errorf("unexpected object of type %T returned by claim cache", claimObj)
		}
		podClaimName, ok := claim.Annotations[podResourceClaimAnnotation]
		if ok && podClaimName != podClaim.Name {
			continue
		}

		// No annotation? It might be a ResourceClaim created for
		// the pod by a previous Kubernetes release where the
		// ResourceClaim name was deterministic, in which case
		// we have to use it and update the new pod status
		// field accordingly.
		if !ok && claim.Name != deterministicName {
			continue
		}

		// Pick the first one that matches. There shouldn't be more than one. If there is,
		// then all others will be ignored until the pod gets deleted. Then they also get
		// cleaned up.
		return claim, nil
	}
	return nil, nil
}

func (ec *Controller) ensurePodSchedulingContext(ctx context.Context, pod *v1.Pod) error {
	scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
	if err != nil && !apierrors.IsNotFound(err) {
		return fmt.Errorf("retrieve PodSchedulingContext: %v", err)
	}
	if scheduling == nil {
		scheduling = &resourcev1alpha2.PodSchedulingContext{
			ObjectMeta: metav1.ObjectMeta{
				Name:      pod.Name,
				Namespace: pod.Namespace,
				OwnerReferences: []metav1.OwnerReference{
					{
						APIVersion: "v1",
						Kind:       "Pod",
						Name:       pod.Name,
						UID:        pod.UID,
						Controller: pointer.Bool(true),
					},
				},
			},
			Spec: resourcev1alpha2.PodSchedulingContextSpec{
				SelectedNode: pod.Spec.NodeName,
				// There is no need for negotiation about
				// potential and suitable nodes anymore, so
				// PotentialNodes can be left empty.
			},
		}
		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Create(ctx, scheduling, metav1.CreateOptions{}); err != nil {
			return fmt.Errorf("create PodSchedulingContext: %v", err)
		}
		return nil
	}

	if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
		scheduling := scheduling.DeepCopy()
		scheduling.Spec.SelectedNode = pod.Spec.NodeName
		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Update(ctx, scheduling, metav1.UpdateOptions{}); err != nil {
			return fmt.Errorf("update spec.selectedNode in PodSchedulingContext: %v", err)
		}
	}

	return nil
}

func (ec *Controller) reserveForPod(ctx context.Context, pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) error {
	claim = claim.DeepCopy()
	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
		resourcev1alpha2.ResourceClaimConsumerReference{
			Resource: "pods",
			Name:     pod.Name,
			UID:      pod.UID,
		})
	if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
		return fmt.Errorf("reserve claim for pod: %v", err)
	}
	return nil
}

func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	claim, err := ec.claimLister.ResourceClaims(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("nothing to do for claim, it is gone")
			return nil
		}
		return err
	}

	// Check if the ReservedFor entries are all still valid.
	valid := make([]resourcev1alpha2.ResourceClaimConsumerReference, 0, len(claim.Status.ReservedFor))
	for _, reservedFor := range claim.Status.ReservedFor {
		if reservedFor.APIGroup == "" &&
			reservedFor.Resource == "pods" {
			// A pod falls into one of three categories:
			// - we have it in our cache -> don't remove it until we are told that it got removed
			// - we don't have it in our cache anymore, but we have seen it before -> it was deleted, remove it
			// - not in our cache, not seen -> double-check with API server before removal

			keepEntry := true

			// Tracking deleted pods in the LRU cache is an
			// optimization. Without this cache, the code would
			// have to do the API call below for every deleted pod
			// to ensure that the pod really doesn't exist. With
			// the cache, most of the time the pod will be recorded
			// as deleted and the API call can be avoided.
			if ec.deletedObjects.Has(reservedFor.UID) {
				// We know that the pod was deleted. This is
				// easy to check and thus is done first.
				keepEntry = false
			} else {
				pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name)
				switch {
				case err != nil && !apierrors.IsNotFound(err):
					return err
				case err != nil:
					// We might not have it in our informer cache
					// yet. Removing the pod while the scheduler is
					// scheduling it would be bad. We have to be
					// absolutely sure and thus have to check with
					// the API server.
					pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{})
					if err != nil && !apierrors.IsNotFound(err) {
						return err
					}
					if pod == nil || pod.UID != reservedFor.UID {
						logger.V(6).Info("remove reservation because pod is gone or got replaced", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
						keepEntry = false
					}
				case pod.UID != reservedFor.UID:
					logger.V(6).Info("remove reservation because pod got replaced with new instance", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				case isPodDone(pod):
					logger.V(6).Info("remove reservation because pod will not run anymore", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				}
			}

			if keepEntry {
				valid = append(valid, reservedFor)
			}
			continue
		}

		// TODO: support generic object lookup
		return fmt.Errorf("unsupported ReservedFor entry: %v", reservedFor)
	}

	logger.V(5).Info("claim reserved for counts", "currentCount", len(claim.Status.ReservedFor), "claim", klog.KRef(namespace, name), "updatedCount", len(valid))
	if len(valid) < len(claim.Status.ReservedFor) {
		// TODO (#113700): patch
		claim := claim.DeepCopy()
		claim.Status.ReservedFor = valid

		// When a ResourceClaim uses delayed allocation, then it makes sense to
		// deallocate the claim as soon as the last consumer stops using
		// it. This ensures that the claim can be allocated again as needed by
		// some future consumer instead of trying to schedule that consumer
		// onto the node that was chosen for the previous consumer. It also
		// releases the underlying resources for use by other claims.
		//
		// This has to be triggered by the transition from "was being used" to
		// "is not used anymore" because a DRA driver is not required to set
		// `status.reservedFor` together with `status.allocation`, i.e. a claim
		// that is "currently unused" should not get deallocated.
		//
		// This does not matter for claims that were created for a pod. For
		// those, the resource claim controller will trigger deletion when the
		// pod is done. However, it doesn't hurt to also trigger deallocation
		// for such claims and not checking for them keeps this code simpler.
		if len(valid) == 0 &&
			claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
			claim.Status.DeallocationRequested = true
		}

		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
	}

	if len(valid) == 0 {
		// Claim is not reserved. If it was generated for a pod and
		// that pod is not going to run, the claim can be
		// deleted. Normally the garbage collector does that, but the
		// pod itself might not get deleted for a while.
		podName, podUID := owningPod(claim)
		if podName != "" {
			pod, err := ec.podLister.Pods(claim.Namespace).Get(podName)
			switch {
			case err == nil:
				// Pod already replaced or not going to run?
				if pod.UID != podUID || isPodDone(pod) {
					// We are certain that the owning pod is not going to need
					// the claim and therefore remove the claim.
					logger.V(5).Info("deleting unused generated claim", "claim", klog.KObj(claim), "pod", klog.KObj(pod))
					err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Delete(ctx, claim.Name, metav1.DeleteOptions{})
					if err != nil {
						return fmt.Errorf("delete claim: %v", err)
					}
				} else {
					logger.V(6).Info("wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod)
				}
			case apierrors.IsNotFound(err):
				// We might not know the pod *yet*. Instead of doing an expensive API call,
				// let the garbage collector handle the case that the pod is truly gone.
				logger.V(5).Info("pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName))
			default:
				return fmt.Errorf("lookup pod: %v", err)
			}
		} else {
			logger.V(5).Info("claim not generated for a pod", "claim", klog.KObj(claim))
		}
	}

	return nil
}

func owningPod(claim *resourcev1alpha2.ResourceClaim) (string, types.UID) {
	for _, owner := range claim.OwnerReferences {
		if pointer.BoolDeref(owner.Controller, false) &&
			owner.APIVersion == "v1" &&
			owner.Kind == "Pod" {
			return owner.Name, owner.UID
		}
	}
	return "", ""
}

// podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
// namespace/name) for the ResourceClaims or ResourceClaimTemplates referenced by a given pod.
func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
	pod, ok := obj.(*v1.Pod)
	if !ok {
		return []string{}, nil
	}
	keys := []string{}
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &podClaim)
		if err != nil || claimName == nil {
			// Index functions are not supposed to fail, the caller will panic.
			// For both error reasons (claim not created yet, unknown API)
			// we simply don't index.
			continue
		}
		keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
	}
	return keys, nil
}

// isPodDone returns true if it is certain that none of the containers are running and never will run.
func isPodDone(pod *v1.Pod) bool {
	return podutil.IsPodPhaseTerminal(pod.Status.Phase) ||
		// Deleted and not scheduled:
		pod.DeletionTimestamp != nil && pod.Spec.NodeName == ""
}

// claimPodOwnerIndexFunc is an index function that returns the pod UIDs of
// all pods which own the resource claim. Should only be one, though.
func claimPodOwnerIndexFunc(obj interface{}) ([]string, error) {
	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
	if !ok {
		return nil, nil
	}
	var keys []string
	for _, owner := range claim.OwnerReferences {
		if owner.Controller != nil &&
			*owner.Controller &&
			owner.APIVersion == "v1" &&
			owner.Kind == "Pod" {
			keys = append(keys, string(owner.UID))
		}
	}
	return keys, nil
}
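
Editor's note, not part of the upstream file: the sketch below shows one way the NewController/Run pair above could be wired up in a standalone binary. It assumes a client-go v0.29 shared informer factory, that the cluster serves the resource.k8s.io/v1alpha2 API (DynamicResourceAllocation enabled), and that the k8s.io/kubernetes/pkg/controller/resourceclaim package is importable in the build. Names such as restConfig and the worker count of 10 are illustrative placeholders, not values taken from kube-controller-manager.

package main

import (
	"context"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"

	"k8s.io/kubernetes/pkg/controller/resourceclaim"
)

func main() {
	logger := klog.Background()

	// Assumption: the binary runs inside a cluster; out of cluster a
	// kubeconfig-based rest.Config would be used instead.
	restConfig, err := rest.InClusterConfig()
	if err != nil {
		klog.Fatalf("building REST config: %v", err)
	}
	client := kubernetes.NewForConfigOrDie(restConfig)

	// One shared informer factory provides all four informers that
	// NewController expects; periodic resync is disabled here.
	factory := informers.NewSharedInformerFactory(client, 0)

	ec, err := resourceclaim.NewController(
		logger,
		client,
		factory.Core().V1().Pods(),
		factory.Resource().V1alpha2().PodSchedulingContexts(),
		factory.Resource().V1alpha2().ResourceClaims(),
		factory.Resource().V1alpha2().ResourceClaimTemplates(),
	)
	if err != nil {
		klog.Fatalf("creating ResourceClaim controller: %v", err)
	}

	ctx := context.Background()
	// Start the informers, then run the controller; Run blocks until the
	// context is canceled. The worker count of 10 is an arbitrary example.
	factory.Start(ctx.Done())
	ec.Run(ctx, 10)
}

Note that NewController registers its event handlers and extra indexers before the factory is started, which is why the informers are handed over first and factory.Start is only called afterwards.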