k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dynamicresources

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"slices"
	"sort"
	"sync"

	"github.com/google/go-cmp/cmp"

	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	resourcev1alpha2apply "k8s.io/client-go/applyconfigurations/resource/v1alpha2"
	"k8s.io/client-go/kubernetes"
	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
	"k8s.io/utils/ptr"
)

const (
	// Name is the name of the plugin used in Registry and configurations.
	Name = names.DynamicResources

	stateKey framework.StateKey = Name
)

// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, in the later phases we don't need to call the Write method
// to update the value.
type stateData struct {
	// preScored is true if PreScore was invoked.
	preScored bool

	// A copy of all claims for the Pod (i.e. 1:1 match with
	// pod.Spec.ResourceClaims), initially with the status from the start
	// of the scheduling cycle. Each claim instance is read-only because it
	// might come from the informer cache. The instances get replaced when
	// the plugin itself successfully does an Update.
	//
	// Empty if the Pod has no claims.
	claims []*resourcev1alpha2.ResourceClaim

	// podSchedulingState keeps track of the PodSchedulingContext
	// (if one exists) and the changes made to it.
	podSchedulingState podSchedulingState

	// resources contains the information about available and allocated resources when using
	// structured parameters and the pod needs this information.
	resources resources

	// mutex must be locked while accessing any of the fields below.
	mutex sync.Mutex

	// The indices of all claims that:
	// - are allocated
	// - use delayed allocation or the builtin controller
	// - were not available on at least one node
	//
	// Set in parallel during Filter, so write access there must be
	// protected by the mutex. Used by PostFilter.
	unavailableClaims sets.Set[int]

	informationsForClaim []informationForClaim
}

func (d *stateData) Clone() framework.StateData {
	return d
}

type informationForClaim struct {
	// The availableOnNode node filter of the claim converted from the
	// v1 API to nodeaffinity.NodeSelector by PreFilter for repeated
	// evaluation in Filter. Nil for claims which don't have it.
	availableOnNode *nodeaffinity.NodeSelector

	// The status of the claim got from the
	// schedulingCtx by PreFilter for repeated
	// evaluation in Filter. Nil for claims which don't have it.
	status *resourcev1alpha2.ResourceClaimSchedulingStatus

	// structuredParameters is true if the claim is handled via the builtin
	// controller.
	structuredParameters bool
	controller           *claimController

	// Set by Reserve, published by PreBind.
	allocation           *resourcev1alpha2.AllocationResult
	allocationDriverName string
}

type podSchedulingState struct {
	// A pointer to the PodSchedulingContext object for the pod, if one exists
	// in the API server.
	//
	// Conceptually, this object belongs in the scheduler framework
	// where it might get shared by different plugins. But in practice,
	// it is currently only used by dynamic provisioning and thus
	// managed entirely here.
	schedulingCtx *resourcev1alpha2.PodSchedulingContext

	// selectedNode is set if (and only if) a node has been selected.
	selectedNode *string

	// potentialNodes is set if (and only if) the potential nodes field
	// needs to be updated or set.
	potentialNodes *[]string
}

func (p *podSchedulingState) isDirty() bool {
	return p.selectedNode != nil ||
		p.potentialNodes != nil
}

// init checks whether there is already a PodSchedulingContext object.
// Must not be called concurrently.
func (p *podSchedulingState) init(ctx context.Context, pod *v1.Pod, podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister) error {
	schedulingCtx, err := podSchedulingContextLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
	switch {
	case apierrors.IsNotFound(err):
		return nil
	case err != nil:
		return err
	default:
		// We have an object, but it might be obsolete.
		if !metav1.IsControlledBy(schedulingCtx, pod) {
			return fmt.Errorf("PodSchedulingContext object with UID %s is not owned by Pod %s/%s", schedulingCtx.UID, pod.Namespace, pod.Name)
		}
	}
	p.schedulingCtx = schedulingCtx
	return nil
}

// publish creates or updates the PodSchedulingContext object, if necessary.
// Must not be called concurrently.
func (p *podSchedulingState) publish(ctx context.Context, pod *v1.Pod, clientset kubernetes.Interface) error {
	if !p.isDirty() {
		return nil
	}

	var err error
	logger := klog.FromContext(ctx)
	if p.schedulingCtx != nil {
		// Update it.
		schedulingCtx := p.schedulingCtx.DeepCopy()
		if p.selectedNode != nil {
			schedulingCtx.Spec.SelectedNode = *p.selectedNode
		}
		if p.potentialNodes != nil {
			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
		}
		if loggerV := logger.V(6); loggerV.Enabled() {
			// At a high enough log level, dump the entire object.
			loggerV.Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
		} else {
			logger.V(5).Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
		}
		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
		if apierrors.IsConflict(err) {
			// We don't use SSA by default for performance reasons
			// (https://github.com/kubernetes/kubernetes/issues/113700#issuecomment-1698563918)
			// because most of the time an Update doesn't encounter
			// a conflict and is faster.
			//
			// We could return an error here and rely on
			// backoff+retry, but scheduling attempts are expensive
			// and the backoff delay would cause a (small)
			// slowdown. Therefore we fall back to SSA here if needed.
			//
			// Using SSA instead of Get+Update has the advantage that
			// there is no delay for the Get. SSA is safe because only
			// the scheduler updates these fields.
			spec := resourcev1alpha2apply.PodSchedulingContextSpec()
			spec.SelectedNode = p.selectedNode
			if p.potentialNodes != nil {
				spec.PotentialNodes = *p.potentialNodes
			} else {
				// Unchanged. Has to be set because the object that we send
				// must represent the "fully specified intent". Not sending
				// the list would clear it.
				spec.PotentialNodes = p.schedulingCtx.Spec.PotentialNodes
			}
			schedulingCtxApply := resourcev1alpha2apply.PodSchedulingContext(pod.Name, pod.Namespace).WithSpec(spec)

			if loggerV := logger.V(6); loggerV.Enabled() {
				// At a high enough log level, dump the entire object.
				loggerV.Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod), "podSchedulingCtxApply", klog.Format(schedulingCtxApply))
			} else {
				logger.V(5).Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod))
			}
			_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Apply(ctx, schedulingCtxApply, metav1.ApplyOptions{FieldManager: "kube-scheduler", Force: true})
		}

	} else {
		// Create it.
		schedulingCtx := &resourcev1alpha2.PodSchedulingContext{
			ObjectMeta: metav1.ObjectMeta{
				Name:            pod.Name,
				Namespace:       pod.Namespace,
				OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(pod, schema.GroupVersionKind{Version: "v1", Kind: "Pod"})},
			},
		}
		if p.selectedNode != nil {
			schedulingCtx.Spec.SelectedNode = *p.selectedNode
		}
		if p.potentialNodes != nil {
			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
		}
		if loggerV := logger.V(6); loggerV.Enabled() {
			// At a high enough log level, dump the entire object.
			loggerV.Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
		} else {
			logger.V(5).Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
		}
		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Create(ctx, schedulingCtx, metav1.CreateOptions{})
	}
	if err != nil {
		return err
	}
	p.potentialNodes = nil
	p.selectedNode = nil
	return nil
}

func statusForClaim(schedulingCtx *resourcev1alpha2.PodSchedulingContext, podClaimName string) *resourcev1alpha2.ResourceClaimSchedulingStatus {
	if schedulingCtx == nil {
		return nil
	}
	for _, status := range schedulingCtx.Status.ResourceClaims {
		if status.Name == podClaimName {
			return &status
		}
	}
	return nil
}

// dynamicResources is a plugin that ensures that ResourceClaims are allocated.
type dynamicResources struct {
	enabled                    bool
	fh                         framework.Handle
	clientset                  kubernetes.Interface
	claimLister                resourcev1alpha2listers.ResourceClaimLister
	classLister                resourcev1alpha2listers.ResourceClassLister
	podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister
	claimParametersLister      resourcev1alpha2listers.ResourceClaimParametersLister
	classParametersLister      resourcev1alpha2listers.ResourceClassParametersLister
	resourceSliceLister        resourcev1alpha2listers.ResourceSliceLister
	claimNameLookup            *resourceclaim.Lookup

	// claimAssumeCache enables temporarily storing a newer claim object
	// while the scheduler has allocated it and the corresponding object
	// update from the apiserver has not been processed by the claim
	// informer callbacks. Claims get added here in PreBind and removed by
	// the informer callback (based on the "newer than" comparison in the
	// assume cache).
	//
	// It uses cache.MetaNamespaceKeyFunc to generate object names, which
	// therefore are "<namespace>/<name>".
	//
	// This is necessary to ensure that reconstructing the resource usage
	// at the start of a pod scheduling cycle doesn't reuse the resources
	// assigned to such a claim. Alternatively, claim allocation state
	// could also get tracked across pod scheduling cycles, but that
	// - adds complexity (need to carefully sync state with informer events
	//   for claims and ResourceSlices)
	// - would make integration with cluster autoscaler harder because it would need
	//   to trigger informer callbacks.
	//
	// When implementing cluster autoscaler support, this assume cache or
	// something like it (see https://github.com/kubernetes/kubernetes/pull/112202)
	// might have to be managed by the cluster autoscaler.
	claimAssumeCache *assumecache.AssumeCache

	// inFlightAllocations is a map from claim UUIDs to claim objects for those claims
	// for which allocation was triggered during a scheduling cycle and the
	// corresponding claim status update call in PreBind has not been done
	// yet. If another pod needs the claim, the pod is treated as "not
	// schedulable yet". The cluster event for the claim status update will
	// make it schedulable.
	//
	// This mechanism avoids the following problem:
	// - Pod A triggers allocation for claim X.
	// - Pod B shares access to that claim and gets scheduled because
	//   the claim is assumed to be allocated.
	// - PreBind for pod B is called first, tries to update reservedFor and
	//   fails because the claim is not really allocated yet.
	//
	// We could avoid the ordering problem by allowing either pod A or pod B
	// to set the allocation. But that is more complicated and leads to another
	// problem:
	// - Pod A and B get scheduled as above.
	// - PreBind for pod A gets called first, then fails with a temporary API error.
	//   It removes the updated claim from the assume cache because of that.
	// - PreBind for pod B gets called next and succeeds with adding the
	//   allocation and its own reservedFor entry.
	// - The assume cache is now not reflecting that the claim is allocated,
	//   which could lead to reusing the same resource for some other claim.
	//
	// A sync.Map is used because in practice sharing of a claim between
	// pods is expected to be rare compared to per-pod claims, so we end up
	// hitting the "multiple goroutines read, write, and overwrite entries
	// for disjoint sets of keys" case that sync.Map is optimized for.
	inFlightAllocations sync.Map
}

// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
	if !fts.EnableDynamicResourceAllocation {
		// Disabled, won't do anything.
		return &dynamicResources{}, nil
	}

	logger := klog.FromContext(ctx)
	pl := &dynamicResources{
		enabled:                    true,
		fh:                         fh,
		clientset:                  fh.ClientSet(),
		claimLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Lister(),
		classLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClasses().Lister(),
		podSchedulingContextLister: fh.SharedInformerFactory().Resource().V1alpha2().PodSchedulingContexts().Lister(),
		claimParametersLister:      fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaimParameters().Lister(),
		classParametersLister:      fh.SharedInformerFactory().Resource().V1alpha2().ResourceClassParameters().Lister(),
		resourceSliceLister:        fh.SharedInformerFactory().Resource().V1alpha2().ResourceSlices().Lister(),
		claimNameLookup:            resourceclaim.NewNameLookup(fh.ClientSet()),
		claimAssumeCache:           assumecache.NewAssumeCache(logger, fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Informer(), "claim", "", nil),
	}

	return pl, nil
}

var _ framework.PreEnqueuePlugin = &dynamicResources{}
var _ framework.PreFilterPlugin = &dynamicResources{}
var _ framework.FilterPlugin = &dynamicResources{}
var _ framework.PostFilterPlugin = &dynamicResources{}
var _ framework.PreScorePlugin = &dynamicResources{}
var _ framework.ReservePlugin = &dynamicResources{}
var _ framework.EnqueueExtensions = &dynamicResources{}
var _ framework.PreBindPlugin = &dynamicResources{}
var _ framework.PostBindPlugin = &dynamicResources{}

// Name returns name of the plugin. It is used in logs, etc.
func (pl *dynamicResources) Name() string {
	return Name
}

// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint {
	if !pl.enabled {
		return nil
	}

	events := []framework.ClusterEventWithHint{
		// Adding or updating claim or class parameters may make pods schedulable
		// which depend on claims using those parameters.
		{Event: framework.ClusterEvent{Resource: framework.ResourceClaimParameters, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimParametersChange},
		{Event: framework.ClusterEvent{Resource: framework.ResourceClassParameters, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClassParametersChange},

		// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
		{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
		// When a driver has provided additional information, a pod waiting for that information
		// may be schedulable.
		{Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPodSchedulingContextChange},
		// A resource might depend on node labels for topology filtering.
		// A new or updated node may make pods schedulable.
		//
		// A note about UpdateNodeTaint event:
		// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
		// As a common problematic scenario,
		// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
		// In such cases, this plugin may miss some events that actually make pods schedulable.
		// As a workaround, we add UpdateNodeTaint event to catch the case.
		// We can remove UpdateNodeTaint when we remove the preCheck feature.
		// See: https://github.com/kubernetes/kubernetes/issues/110175
		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
		// A pod might be waiting for a class to get created or modified.
		{Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
	}
	return events
}

// PreEnqueue checks if there are known reasons why a pod currently cannot be
// scheduled. When this fails, one of the registered events can trigger another
// attempt.
func (pl *dynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
	if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
		return statusUnschedulable(klog.FromContext(ctx), err.Error())
	}
	return nil
}

// isSchedulableAfterClaimParametersChange is invoked for add and update claim parameters events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen. The delete claim event will not invoke it, so newObj will never be nil.
func (pl *dynamicResources) isSchedulableAfterClaimParametersChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	originalParameters, modifiedParameters, err := schedutil.As[*resourcev1alpha2.ResourceClaimParameters](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimParametersChange: %w", err)
	}

	usesParameters := false
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		ref := claim.Spec.ParametersRef
		if ref == nil {
			return
		}

		// Using in-tree parameters directly?
		if ref.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
			ref.Kind == "ResourceClaimParameters" {
			if modifiedParameters.Name == ref.Name {
				usesParameters = true
			}
			return
		}

		// Need to look for translated parameters.
		generatedFrom := modifiedParameters.GeneratedFrom
		if generatedFrom == nil {
			return
		}
		if generatedFrom.APIGroup == ref.APIGroup &&
			generatedFrom.Kind == ref.Kind &&
			generatedFrom.Name == ref.Name {
			usesParameters = true
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedParameters), "reason", err.Error())
		return framework.QueueSkip, nil
	}

	if !usesParameters {
		// These were not the parameters the pod was waiting for.
		logger.V(6).Info("unrelated claim parameters got modified", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
		return framework.QueueSkip, nil
	}

	if originalParameters == nil {
		logger.V(4).Info("claim parameters for pod got created", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
		return framework.Queue, nil
	}

	// Modifications may or may not be relevant. If the entire
	// requests are as before, then something else must have changed
	// and we don't care.
	if apiequality.Semantic.DeepEqual(&originalParameters.DriverRequests, &modifiedParameters.DriverRequests) {
		logger.V(6).Info("claim parameters for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
		return framework.QueueSkip, nil
	}

	logger.V(4).Info("requests in claim parameters for pod got updated", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
	return framework.Queue, nil
}

// isSchedulableAfterClassParametersChange is invoked for add and update class parameters events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen. The delete class event will not invoke it, so newObj will never be nil.
func (pl *dynamicResources) isSchedulableAfterClassParametersChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	originalParameters, modifiedParameters, err := schedutil.As[*resourcev1alpha2.ResourceClassParameters](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClassParametersChange: %w", err)
	}

	usesParameters := false
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		class, err := pl.classLister.Get(claim.Spec.ResourceClassName)
		if err != nil {
			if !apierrors.IsNotFound(err) {
				logger.Error(err, "look up resource class")
			}
			return
		}
		ref := class.ParametersRef
		if ref == nil {
			return
		}

		// Using in-tree parameters directly?
		if ref.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
			ref.Kind == "ResourceClassParameters" {
			if modifiedParameters.Name == ref.Name {
				usesParameters = true
			}
			return
		}

		// Need to look for translated parameters.
		generatedFrom := modifiedParameters.GeneratedFrom
		if generatedFrom == nil {
			return
		}
		if generatedFrom.APIGroup == ref.APIGroup &&
			generatedFrom.Kind == ref.Kind &&
			generatedFrom.Name == ref.Name {
			usesParameters = true
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters), "reason", err.Error())
		return framework.QueueSkip, nil
	}

	if !usesParameters {
		// These were not the parameters the pod was waiting for.
		logger.V(6).Info("unrelated class parameters got modified", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
		return framework.QueueSkip, nil
	}

	if originalParameters == nil {
		logger.V(4).Info("class parameters for pod got created", "pod", klog.KObj(pod), "class", klog.KObj(modifiedParameters))
		return framework.Queue, nil
	}

	// Modifications may or may not be relevant. If the entire
	// filters are as before, then something else must have changed
	// and we don't care.
	if apiequality.Semantic.DeepEqual(&originalParameters.Filters, &modifiedParameters.Filters) {
		logger.V(6).Info("class parameters for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
		return framework.QueueSkip, nil
	}

	logger.V(4).Info("filters in class parameters for pod got updated", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
	return framework.Queue, nil
}

// isSchedulableAfterClaimChange is invoked for add and update claim events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen. The delete claim event will not invoke it, so newObj will never be nil.
func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	originalClaim, modifiedClaim, err := schedutil.As[*resourcev1alpha2.ResourceClaim](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
	}

	usesClaim := false
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		if claim.UID == modifiedClaim.UID {
			usesClaim = true
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "reason", err.Error())
		return framework.QueueSkip, nil
	}

	if originalClaim != nil &&
		resourceclaim.IsAllocatedWithStructuredParameters(originalClaim) &&
		modifiedClaim.Status.Allocation == nil {
		// A claim with structured parameters was deallocated. This might have made
		// resources available for other pods.
		//
		// TODO (https://github.com/kubernetes/kubernetes/issues/123697):
		// check that the pending claims depend on structured parameters (depends on refactoring foreachPodResourceClaim, see other TODO).
		//
		// There is a small race here:
		// - The dynamicresources plugin allocates claim A and updates the assume cache.
		// - A second pod gets marked as unschedulable based on that assume cache.
		// - Before the informer cache here catches up, the pod runs, terminates and
		//   the claim gets deallocated without ever sending the claim status with
		//   allocation to the scheduler.
		// - The comparison below is for a *very* old claim with no allocation and the
		//   new claim where the allocation is already removed again, so no
		//   RemovedClaimAllocation event gets emitted.
		//
		// This is extremely unlikely and thus a fix is not needed for alpha in Kubernetes 1.30.
		// TODO (https://github.com/kubernetes/kubernetes/issues/123698): The solution is to somehow integrate the assume cache
		// into the event mechanism. This can be tackled together with adding autoscaler
		// support, which also needs to do something with the assume cache.
		logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		return framework.Queue, nil
	}

	if !usesClaim {
		// This was not the claim the pod was waiting for.
		logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		return framework.QueueSkip, nil
	}

	if originalClaim == nil {
		logger.V(4).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		return framework.Queue, nil
	}

	// Modifications may or may not be relevant. If the entire
	// status is as before, then something else must have changed
	// and we don't care. What happens in practice is that the
	// resource driver adds the finalizer.
	if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
		if loggerV := logger.V(7); loggerV.Enabled() {
			// Log more information.
			loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
		} else {
			logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
		}
		return framework.QueueSkip, nil
	}

	logger.V(4).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
	return framework.Queue, nil
}

// isSchedulableAfterPodSchedulingContextChange is invoked for all
// PodSchedulingContext events reported by an informer. It checks whether that
// change made a previously unschedulable pod schedulable (updated) or a new
// attempt is needed to re-create the object (deleted). It errs on the side of
// letting a pod scheduling attempt happen.
func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	// Deleted? That can happen because we ourselves delete the PodSchedulingContext while
	// working on the pod. This can be ignored.
	if oldObj != nil && newObj == nil {
		logger.V(4).Info("PodSchedulingContext got deleted")
		return framework.QueueSkip, nil
	}

	oldPodScheduling, newPodScheduling, err := schedutil.As[*resourcev1alpha2.PodSchedulingContext](oldObj, newObj)
	if err != nil {
		// Shouldn't happen.
		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodSchedulingContextChange: %w", err)
	}
	podScheduling := newPodScheduling // Never nil because deletes are handled above.

	if podScheduling.Name != pod.Name || podScheduling.Namespace != pod.Namespace {
		logger.V(7).Info("PodSchedulingContext for unrelated pod got modified", "pod", klog.KObj(pod), "podScheduling", klog.KObj(podScheduling))
		return framework.QueueSkip, nil
	}

	// If the drivers have provided information about all
	// unallocated claims with delayed allocation, then the next
	// scheduling attempt is able to pick a node, so we let it run
	// immediately if this occurred for the first time, otherwise
	// we allow backoff.
	pendingDelayedClaims := 0
	if err := pl.foreachPodResourceClaim(pod, func(podResourceName string, claim *resourcev1alpha2.ResourceClaim) {
		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
			claim.Status.Allocation == nil &&
			!podSchedulingHasClaimInfo(podScheduling, podResourceName) {
			pendingDelayedClaims++
		}
	}); err != nil {
		// This is not an unexpected error: we know that
		// foreachPodResourceClaim only returns errors for "not
		// schedulable".
		logger.V(4).Info("pod is not schedulable, keep waiting", "pod", klog.KObj(pod), "reason", err.Error())
		return framework.QueueSkip, nil
	}

	// Some driver responses missing?
	if pendingDelayedClaims > 0 {
		// We could start a pod scheduling attempt to refresh the
		// potential nodes list. But pod scheduling attempts are
		// expensive and doing them too often causes the pod to enter
		// backoff. Let's wait instead for all drivers to reply.
		if loggerV := logger.V(6); loggerV.Enabled() {
			loggerV.Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
		} else {
			logger.V(5).Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod))
		}
		return framework.QueueSkip, nil
	}

	if oldPodScheduling == nil /* create */ ||
		len(oldPodScheduling.Status.ResourceClaims) < len(podScheduling.Status.ResourceClaims) /* new information and not incomplete (checked above) */ {
		// This definitely is new information for the scheduler. Try again immediately.
		logger.V(4).Info("PodSchedulingContext for pod has all required information, schedule immediately", "pod", klog.KObj(pod))
		return framework.Queue, nil
	}

	// The other situation where the scheduler needs to do
	// something immediately is when the selected node doesn't
	// work: waiting in the backoff queue only helps if resources
	// on the selected node eventually become available again. It's
	// much more likely, in particular when trying to fill up the
	// cluster, that the choice simply didn't work out. The risk
	// here is that in a situation where the cluster really is
	// full, backoff won't be used because the scheduler keeps
	// trying different nodes. This should not happen when it has
	// full knowledge about resource availability (=
	// PodSchedulingContext.*.UnsuitableNodes is complete) but may happen
	// when it doesn't (= PodSchedulingContext.*.UnsuitableNodes had to be
	// truncated).
	//
	// Truncation only happens for very large clusters and then may slow
	// down scheduling, but should not break it completely. This is
	// acceptable while DRA is alpha and will be investigated further
	// before moving DRA to beta.
	if podScheduling.Spec.SelectedNode != "" {
		for _, claimStatus := range podScheduling.Status.ResourceClaims {
			if slices.Contains(claimStatus.UnsuitableNodes, podScheduling.Spec.SelectedNode) {
				logger.V(5).Info("PodSchedulingContext has unsuitable selected node, schedule immediately", "pod", klog.KObj(pod), "selectedNode", podScheduling.Spec.SelectedNode, "podResourceName", claimStatus.Name)
				return framework.Queue, nil
			}
		}
	}

	// Update with only the spec modified?
	if oldPodScheduling != nil &&
		!apiequality.Semantic.DeepEqual(&oldPodScheduling.Spec, &podScheduling.Spec) &&
		apiequality.Semantic.DeepEqual(&oldPodScheduling.Status, &podScheduling.Status) {
		logger.V(5).Info("PodSchedulingContext has only the scheduler spec changes, ignore the update", "pod", klog.KObj(pod))
		return framework.QueueSkip, nil
	}

	// Once we get here, all changes which are known to require special responses
	// have been checked for. Whatever the change was, we don't know exactly how
	// to handle it and thus return Queue. This will cause the
	// scheduler to treat the event as if no event hint callback had been provided.
	// Developers who want to investigate this can enable a diff at log level 6.
	if loggerV := logger.V(6); loggerV.Enabled() {
		loggerV.Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
	} else {
		logger.V(5).Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod))
	}
	return framework.Queue, nil

}

func podSchedulingHasClaimInfo(podScheduling *resourcev1alpha2.PodSchedulingContext, podResourceName string) bool {
	for _, claimStatus := range podScheduling.Status.ResourceClaims {
		if claimStatus.Name == podResourceName {
			return true
		}
	}
	return false
}

// podResourceClaims returns the ResourceClaims for all pod.Spec.ResourceClaims.
func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.ResourceClaim, error) {
	claims := make([]*resourcev1alpha2.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
		// We store the pointer as returned by the lister. The
		// assumption is that if a claim gets modified while our code
		// runs, the cache will store a new pointer, not mutate the
		// existing object that we point to here.
		claims = append(claims, claim)
	}); err != nil {
		return nil, err
	}
	return claims, nil
}

// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
// It calls an optional handler for those claims that it finds.
func (pl *dynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourcev1alpha2.ResourceClaim)) error {
	for _, resource := range pod.Spec.ResourceClaims {
		claimName, mustCheckOwner, err := pl.claimNameLookup.Name(pod, &resource)
		if err != nil {
			return err
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if err != nil {
			return err
		}

		if claim.DeletionTimestamp != nil {
			return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
		}

		if mustCheckOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				return err
			}
		}
		if cb != nil {
			cb(resource.Name, claim)
		}
	}
	return nil
}

// PreFilter is invoked at the prefilter extension point to check if the pod has all
// immediate claims bound. UnschedulableAndUnresolvable is returned if
// the pod cannot be scheduled at the moment on any node.
func (pl *dynamicResources) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
	if !pl.enabled {
		return nil, framework.NewStatus(framework.Skip)
	}
	logger := klog.FromContext(ctx)

	// If the pod does not reference any claim, we don't need to do
	// anything for it. We just initialize an empty state to record that
	// observation for the other functions. This gets updated below
	// if we get that far.
	s := &stateData{}
	state.Write(stateKey, s)

	claims, err := pl.podResourceClaims(pod)
	if err != nil {
		return nil, statusUnschedulable(logger, err.Error())
	}
	logger.V(5).Info("pod resource claims", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(claims))

	// If the pod does not reference any claim,
	// DynamicResources Filter has nothing to do with the Pod.
	if len(claims) == 0 {
		return nil, framework.NewStatus(framework.Skip)
	}

	// Fetch PodSchedulingContext, it's going to be needed when checking claims.
	if err := s.podSchedulingState.init(ctx, pod, pl.podSchedulingContextLister); err != nil {
		return nil, statusError(logger, err)
	}

	s.informationsForClaim = make([]informationForClaim, len(claims))
	needResourceInformation := false
	for index, claim := range claims {
		if claim.Status.DeallocationRequested {
			// This will get resolved by the resource driver.
			return nil, statusUnschedulable(logger, "resourceclaim must be reallocated", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.CanBeReserved(claim) &&
			!resourceclaim.IsReservedForPod(pod, claim) {
			// Resource is in use. The pod has to wait.
			return nil, statusUnschedulable(logger, "resourceclaim in use", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
		}

		if claim.Status.Allocation != nil {
			if claim.Status.Allocation.AvailableOnNodes != nil {
				nodeSelector, err := nodeaffinity.NewNodeSelector(claim.Status.Allocation.AvailableOnNodes)
				if err != nil {
					return nil, statusError(logger, err)
				}
				s.informationsForClaim[index].availableOnNode = nodeSelector
			}

			// The claim was allocated by the scheduler if it has the finalizer that is
			// reserved for Kubernetes.
			s.informationsForClaim[index].structuredParameters = slices.Contains(claim.Finalizers, resourcev1alpha2.Finalizer)
		} else {
			// The ResourceClass might have a node filter. This is
			// useful for trimming the initial set of potential
			// nodes before we ask the driver(s) for information
			// about the specific pod.
			class, err := pl.classLister.Get(claim.Spec.ResourceClassName)
			if err != nil {
				// If the class cannot be retrieved, allocation cannot proceed.
				if apierrors.IsNotFound(err) {
					// Here we mark the pod as "unschedulable", so it'll sleep in
					// the unschedulable queue until a ResourceClass event occurs.
					return nil, statusUnschedulable(logger, fmt.Sprintf("resource class %s does not exist", claim.Spec.ResourceClassName))
				}
				// Other error, retry with backoff.
				return nil, statusError(logger, fmt.Errorf("look up resource class: %v", err))
			}
			if class.SuitableNodes != nil {
				selector, err := nodeaffinity.NewNodeSelector(class.SuitableNodes)
				if err != nil {
					return nil, statusError(logger, err)
				}
				s.informationsForClaim[index].availableOnNode = selector
			}
			s.informationsForClaim[index].status = statusForClaim(s.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name)

			if class.StructuredParameters != nil && *class.StructuredParameters {
				s.informationsForClaim[index].structuredParameters = true

				// Allocation in flight? Better wait for that
				// to finish, see inFlightAllocations
				// documentation for details.
				if _, found := pl.inFlightAllocations.Load(claim.UID); found {
					return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s is in the process of being allocated", klog.KObj(claim)))
				}

				// We need the claim and class parameters. If
				// they don't exist yet, the pod has to wait.
				//
				// TODO (https://github.com/kubernetes/kubernetes/issues/123697):
				// check this already in foreachPodResourceClaim, together with setting up informationsForClaim.
				// Then PreEnqueue will also check for existence of parameters.
				classParameters, claimParameters, status := pl.lookupParameters(logger, class, claim)
				if status != nil {
					return nil, status
				}
				controller, err := newClaimController(logger, class, classParameters, claimParameters)
				if err != nil {
					return nil, statusError(logger, err)
				}
				s.informationsForClaim[index].controller = controller
				needResourceInformation = true
			} else if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeImmediate {
				// This will get resolved by the resource driver.
				return nil, statusUnschedulable(logger, "unallocated immediate resourceclaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
			}
		}
	}

	if needResourceInformation {
		// Doing this over and over again for each pod could be avoided
		// by parsing once when creating the plugin and then updating
		// that state in informer callbacks. But that would cause
		// problems for using the plugin in the Cluster Autoscaler. If
		// this step here turns out to be expensive, we may have to
		// maintain and update state more persistently.
		//
		// Claims are treated as "allocated" if they are in the assume cache
		// or currently their allocation is in-flight.
		resources, err := newResourceModel(logger, pl.resourceSliceLister, pl.claimAssumeCache, &pl.inFlightAllocations)
		logger.V(5).Info("Resource usage", "resources", klog.Format(resources))
		if err != nil {
			return nil, statusError(logger, err)
		}
		s.resources = resources
	}

	s.claims = claims
	return nil, nil
}

func (pl *dynamicResources) lookupParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass, claim *resourcev1alpha2.ResourceClaim) (classParameters *resourcev1alpha2.ResourceClassParameters, claimParameters *resourcev1alpha2.ResourceClaimParameters, status *framework.Status) {
	classParameters, status = pl.lookupClassParameters(logger, class)
	if status != nil {
		return
	}
	claimParameters, status = pl.lookupClaimParameters(logger, class, claim)
	return
}

func (pl *dynamicResources) lookupClassParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass) (*resourcev1alpha2.ResourceClassParameters, *framework.Status) {
	defaultClassParameters := resourcev1alpha2.ResourceClassParameters{}

	if class.ParametersRef == nil {
		return &defaultClassParameters, nil
	}

	if class.ParametersRef.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
		class.ParametersRef.Kind == "ResourceClassParameters" {
		// Use the parameters which were referenced directly.
		parameters, err := pl.classParametersLister.ResourceClassParameters(class.ParametersRef.Namespace).Get(class.ParametersRef.Name)
		if err != nil {
			if apierrors.IsNotFound(err) {
				return nil, statusUnschedulable(logger, fmt.Sprintf("class parameters %s not found", klog.KRef(class.ParametersRef.Namespace, class.ParametersRef.Name)))
			}
			return nil, statusError(logger, fmt.Errorf("get class parameters %s: %v", klog.KRef(class.Namespace, class.ParametersRef.Name), err))
		}
		return parameters, nil
	}

	// TODO (https://github.com/kubernetes/kubernetes/issues/123731): use an indexer
	allParameters, err := pl.classParametersLister.ResourceClassParameters(class.Namespace).List(labels.Everything())
	if err != nil {
		return nil, statusError(logger, fmt.Errorf("listing class parameters failed: %v", err))
	}
	for _, parameters := range allParameters {
		if parameters.GeneratedFrom == nil {
			continue
		}
		if parameters.GeneratedFrom.APIGroup == class.ParametersRef.APIGroup &&
			parameters.GeneratedFrom.Kind == class.ParametersRef.Kind &&
			parameters.GeneratedFrom.Name == class.ParametersRef.Name &&
			parameters.GeneratedFrom.Namespace == class.ParametersRef.Namespace {
			return parameters, nil
		}
	}
	return nil, statusUnschedulable(logger, fmt.Sprintf("generated class parameters for %s.%s %s not found", class.ParametersRef.Kind, class.ParametersRef.APIGroup, klog.KRef(class.Namespace, class.ParametersRef.Name)))
}

func (pl *dynamicResources) lookupClaimParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass, claim *resourcev1alpha2.ResourceClaim) (*resourcev1alpha2.ResourceClaimParameters, *framework.Status) {
	defaultClaimParameters := resourcev1alpha2.ResourceClaimParameters{
		Shareable: true,
		DriverRequests: []resourcev1alpha2.DriverRequests{
			{
				DriverName: class.DriverName,
				Requests: []resourcev1alpha2.ResourceRequest{
					{
						ResourceRequestModel: resourcev1alpha2.ResourceRequestModel{
							// TODO: This only works because NamedResources is
							// the only model currently implemented. We need to
							// match the default to how the resources of this
							// class are being advertised in a ResourceSlice.
							NamedResources: &resourcev1alpha2.NamedResourcesRequest{
								Selector: "true",
							},
						},
					},
				},
			},
		},
	}

	if claim.Spec.ParametersRef == nil {
		return &defaultClaimParameters, nil
	}
	if claim.Spec.ParametersRef.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
		claim.Spec.ParametersRef.Kind == "ResourceClaimParameters" {
		// Use the parameters which were referenced directly.
		parameters, err := pl.claimParametersLister.ResourceClaimParameters(claim.Namespace).Get(claim.Spec.ParametersRef.Name)
		if err != nil {
			if apierrors.IsNotFound(err) {
				return nil, statusUnschedulable(logger, fmt.Sprintf("claim parameters %s not found", klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name)))
			}
			return nil, statusError(logger, fmt.Errorf("get claim parameters %s: %v", klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name), err))
		}
		return parameters, nil
	}

	// TODO (https://github.com/kubernetes/kubernetes/issues/123731): use an indexer
	allParameters, err := pl.claimParametersLister.ResourceClaimParameters(claim.Namespace).List(labels.Everything())
	if err != nil {
		return nil, statusError(logger, fmt.Errorf("listing claim parameters failed: %v", err))
	}
	for _, parameters := range allParameters {
		if parameters.GeneratedFrom == nil {
			continue
		}
		if parameters.GeneratedFrom.APIGroup == claim.Spec.ParametersRef.APIGroup &&
			parameters.GeneratedFrom.Kind == claim.Spec.ParametersRef.Kind &&
			parameters.GeneratedFrom.Name == claim.Spec.ParametersRef.Name {
			return parameters, nil
		}
	}
	return nil, statusUnschedulable(logger, fmt.Sprintf("generated claim parameters for %s.%s %s not found", claim.Spec.ParametersRef.Kind, claim.Spec.ParametersRef.APIGroup, klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name)))
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *dynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
	return nil
}

func getStateData(cs *framework.CycleState) (*stateData, error) {
	state, err := cs.Read(stateKey)
	if err != nil {
		return nil, err
	}
	s, ok := state.(*stateData)
	if !ok {
		return nil, errors.New("unable to convert state into stateData")
	}
	return s, nil
}

// Filter is invoked at the filter extension point.
// It evaluates if a pod can fit due to the resources it requests,
// for both allocated and unallocated claims.
//
// For claims that are bound, it checks that the node affinity is
// satisfied by the given node.
//
// For claims that are unbound, it checks whether the claim might get allocated
// for the node.
func (pl *dynamicResources) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	if !pl.enabled {
		return nil
	}
	state, err := getStateData(cs)
	if err != nil {
		return statusError(klog.FromContext(ctx), err)
	}
	if len(state.claims) == 0 {
		return nil
	}

	logger := klog.FromContext(ctx)
	node := nodeInfo.Node()

	var unavailableClaims []int
	for index, claim := range state.claims {
		logger.V(10).Info("filtering based on resource claims of the pod", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
		switch {
		case claim.Status.Allocation != nil:
			if nodeSelector := state.informationsForClaim[index].availableOnNode; nodeSelector != nil {
				if !nodeSelector.Match(node) {
					logger.V(5).Info("AvailableOnNodes does not match", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
					unavailableClaims = append(unavailableClaims, index)
				}
			}
		case claim.Status.DeallocationRequested:
			// We shouldn't get here. PreFilter already checked this.
			return statusUnschedulable(logger, "resourceclaim must be reallocated", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
		case claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
			state.informationsForClaim[index].structuredParameters:
			if selector := state.informationsForClaim[index].availableOnNode; selector != nil {
				if matches := selector.Match(node); !matches {
					return statusUnschedulable(logger, "excluded by resource class node filter", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclassName", claim.Spec.ResourceClassName)
				}
			}
			// Can the builtin controller tell us whether the node is suitable?
			if state.informationsForClaim[index].structuredParameters {
				suitable, err := state.informationsForClaim[index].controller.nodeIsSuitable(ctx, node.Name, state.resources)
				if err != nil {
					// An error indicates that something wasn't configured correctly, for example
					// writing a CEL expression which doesn't handle a map lookup error. Normally
					// this should never fail. We could return an error here, but then the pod
					// would get retried. Instead we ignore the node.
					return statusUnschedulable(logger, fmt.Sprintf("checking structured parameters failed: %v", err), "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
				}
				if !suitable {
					return statusUnschedulable(logger, "resourceclaim cannot be allocated for the node (unsuitable)", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
				}
			} else {
				if status := state.informationsForClaim[index].status; status != nil {
					for _, unsuitableNode := range status.UnsuitableNodes {
						if node.Name == unsuitableNode {
							return statusUnschedulable(logger, "resourceclaim cannot be allocated for the node (unsuitable)", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim), "unsuitablenodes", status.UnsuitableNodes)
						}
					}
				}
			}
		default:
			// This claim should have been handled above.
			// Immediate allocation with control plane controller
			// was already checked for in PreFilter.
			return statusError(logger, fmt.Errorf("internal error, unexpected allocation mode %v", claim.Spec.AllocationMode))
		}
	}

	if len(unavailableClaims) > 0 {
		state.mutex.Lock()
		defer state.mutex.Unlock()
		if state.unavailableClaims == nil {
			state.unavailableClaims = sets.New[int]()
		}

		for _, index := range unavailableClaims {
			claim := state.claims[index]
			// Deallocation makes more sense for claims with
			// delayed allocation. Claims with immediate allocation
			// would just get allocated again for a random node,
			// which is unlikely to help the pod.
			//
			// Claims with builtin controller are handled like
			// claims with delayed allocation.
			if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
				state.informationsForClaim[index].controller != nil {
				state.unavailableClaims.Insert(index)
			}
		}
		return statusUnschedulable(logger, "resourceclaim not available on the node", "pod", klog.KObj(pod))
	}

	return nil
}

// PostFilter checks whether there are allocated claims that could get
// deallocated to help get the Pod schedulable. If yes, it picks one and
// requests its deallocation. This only gets called when filtering found no
// suitable node.
func (pl *dynamicResources) PostFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, filteredNodeStatusMap framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
	if !pl.enabled {
		return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
	}
	logger := klog.FromContext(ctx)
	state, err := getStateData(cs)
	if err != nil {
		return nil, statusError(logger, err)
	}
	if len(state.claims) == 0 {
		return nil, framework.NewStatus(framework.Unschedulable, "no new claims to deallocate")
	}

	// Iterating over a map is random. This is intentional here: we want to
	// pick one claim randomly because there is no better heuristic.
	for index := range state.unavailableClaims {
		claim := state.claims[index]
		if len(claim.Status.ReservedFor) == 0 ||
			len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
			// Is the claim handled by the builtin controller?
			// Then we can simply clear the allocation. Once the
			// claim informer catches up, the controllers will
			// be notified about this change.
			clearAllocation := state.informationsForClaim[index].structuredParameters

			// Before we tell a driver to deallocate a claim, we
			// have to stop telling it to allocate. Otherwise,
			// depending on timing, it will deallocate the claim,
			// see a PodSchedulingContext with selected node, and
			// allocate again for that same node.
			if !clearAllocation &&
				state.podSchedulingState.schedulingCtx != nil &&
				state.podSchedulingState.schedulingCtx.Spec.SelectedNode != "" {
				state.podSchedulingState.selectedNode = ptr.To("")
				if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
					return nil, statusError(logger, err)
				}
			}

			claim := claim.DeepCopy()
			claim.Status.ReservedFor = nil
			if clearAllocation {
				claim.Status.DriverName = ""
				claim.Status.Allocation = nil
			} else {
				claim.Status.DeallocationRequested = true
			}
			logger.V(5).Info("Requesting deallocation of ResourceClaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
			if _, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
				return nil, statusError(logger, err)
			}
			return nil, framework.NewStatus(framework.Unschedulable, "deallocation of ResourceClaim completed")
		}
	}
	return nil, framework.NewStatus(framework.Unschedulable, "still not schedulable")
}

// PreScore is passed a list of all nodes that would fit the pod. Not all
// claims are necessarily allocated yet, so here we can set the SuitableNodes
// field for those which are pending.
1249 func (pl *dynamicResources) PreScore(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status { 1250 if !pl.enabled { 1251 return nil 1252 } 1253 state, err := getStateData(cs) 1254 if err != nil { 1255 return statusError(klog.FromContext(ctx), err) 1256 } 1257 defer func() { 1258 state.preScored = true 1259 }() 1260 if len(state.claims) == 0 { 1261 return nil 1262 } 1263 1264 logger := klog.FromContext(ctx) 1265 pending := false 1266 for index, claim := range state.claims { 1267 if claim.Status.Allocation == nil && 1268 state.informationsForClaim[index].controller == nil { 1269 pending = true 1270 break 1271 } 1272 } 1273 if !pending { 1274 logger.V(5).Info("no pending claims with control plane controller", "pod", klog.KObj(pod)) 1275 return nil 1276 } 1277 1278 if haveAllPotentialNodes(state.podSchedulingState.schedulingCtx, nodes) { 1279 logger.V(5).Info("all potential nodes already set", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes)) 1280 return nil 1281 } 1282 1283 // Remember the potential nodes. The object will get created or 1284 // updated in Reserve. This is both an optimization and 1285 // covers the case that PreScore doesn't get called when there 1286 // is only a single node. 1287 logger.V(5).Info("remembering potential nodes", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes)) 1288 numNodes := len(nodes) 1289 if numNodes > resourcev1alpha2.PodSchedulingNodeListMaxSize { 1290 numNodes = resourcev1alpha2.PodSchedulingNodeListMaxSize 1291 } 1292 potentialNodes := make([]string, 0, numNodes) 1293 if numNodes == len(nodes) { 1294 // Copy all node names. 1295 for _, node := range nodes { 1296 potentialNodes = append(potentialNodes, node.Node().Name) 1297 } 1298 } else { 1299 // Select a random subset of the nodes to comply with 1300 // the PotentialNodes length limit. Randomization is 1301 // done for us by Go which iterates over map entries 1302 // randomly. 1303 nodeNames := map[string]struct{}{} 1304 for _, node := range nodes { 1305 nodeNames[node.Node().Name] = struct{}{} 1306 } 1307 for nodeName := range nodeNames { 1308 if len(potentialNodes) >= resourcev1alpha2.PodSchedulingNodeListMaxSize { 1309 break 1310 } 1311 potentialNodes = append(potentialNodes, nodeName) 1312 } 1313 } 1314 sort.Strings(potentialNodes) 1315 state.podSchedulingState.potentialNodes = &potentialNodes 1316 return nil 1317 } 1318 1319 func haveAllPotentialNodes(schedulingCtx *resourcev1alpha2.PodSchedulingContext, nodes []*framework.NodeInfo) bool { 1320 if schedulingCtx == nil { 1321 return false 1322 } 1323 for _, node := range nodes { 1324 if !slices.Contains(schedulingCtx.Spec.PotentialNodes, node.Node().Name) { 1325 return false 1326 } 1327 } 1328 return true 1329 } 1330 1331 // Reserve reserves claims for the pod. 1332 func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (status *framework.Status) { 1333 if !pl.enabled { 1334 return nil 1335 } 1336 state, err := getStateData(cs) 1337 if err != nil { 1338 return statusError(klog.FromContext(ctx), err) 1339 } 1340 if len(state.claims) == 0 { 1341 return nil 1342 } 1343 1344 numDelayedAllocationPending := 0 1345 numClaimsWithStatusInfo := 0 1346 claimsWithBuiltinController := make([]int, 0, len(state.claims)) 1347 logger := klog.FromContext(ctx) 1348 for index, claim := range state.claims { 1349 if claim.Status.Allocation != nil { 1350 // Allocated, but perhaps not reserved yet. 
We checked in PreFilter that 1351 // the pod could reserve the claim. Instead of reserving here by 1352 // updating the ResourceClaim status, we assume that reserving 1353 // will work and only do it for real during binding. If it fails at 1354 // that time, some other pod was faster and we have to try again. 1355 continue 1356 } 1357 1358 // Do we have the builtin controller? 1359 if state.informationsForClaim[index].controller != nil { 1360 claimsWithBuiltinController = append(claimsWithBuiltinController, index) 1361 continue 1362 } 1363 1364 // Must be delayed allocation with control plane controller. 1365 numDelayedAllocationPending++ 1366 1367 // Did the driver provide information that steered node 1368 // selection towards a node that it can support? 1369 if statusForClaim(state.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name) != nil { 1370 numClaimsWithStatusInfo++ 1371 } 1372 } 1373 1374 if numDelayedAllocationPending == 0 && len(claimsWithBuiltinController) == 0 { 1375 // Nothing left to do. 1376 return nil 1377 } 1378 1379 if !state.preScored && numDelayedAllocationPending > 0 { 1380 // There was only one candidate that passed the Filters and 1381 // therefore PreScore was not called. 1382 // 1383 // We need to ask whether that node is suitable, otherwise the 1384 // scheduler will pick it forever even when it cannot satisfy 1385 // the claim. 1386 if state.podSchedulingState.schedulingCtx == nil || 1387 !slices.Contains(state.podSchedulingState.schedulingCtx.Spec.PotentialNodes, nodeName) { 1388 potentialNodes := []string{nodeName} 1389 state.podSchedulingState.potentialNodes = &potentialNodes 1390 logger.V(5).Info("asking for information about single potential node", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}) 1391 } 1392 } 1393 1394 // Prepare allocation of claims handled by the scheduler. 1395 for _, index := range claimsWithBuiltinController { 1396 claim := state.claims[index] 1397 driverName, allocation, err := state.informationsForClaim[index].controller.allocate(ctx, nodeName, state.resources) 1398 if err != nil { 1399 // We checked before that the node is suitable. This shouldn't have failed, 1400 // so treat this as an error. 1401 return statusError(logger, fmt.Errorf("claim allocation failed unexpectedly: %v", err)) 1402 } 1403 state.informationsForClaim[index].allocation = allocation 1404 state.informationsForClaim[index].allocationDriverName = driverName 1405 // Strictly speaking, we don't need to store the full modified object. 1406 // The allocation would be enough. The full object is useful for 1407 // debugging and testing, so let's make it realistic. 1408 claim = claim.DeepCopy() 1409 claim.Finalizers = append(claim.Finalizers, resourcev1alpha2.Finalizer) 1410 claim.Status.DriverName = driverName 1411 claim.Status.Allocation = allocation 1412 pl.inFlightAllocations.Store(claim.UID, claim) 1413 logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "driver", driverName, "allocation", klog.Format(allocation)) 1414 } 1415 1416 // When there is only one pending resource, we can go ahead with 1417 // requesting allocation even when we don't have the information from 1418 // the driver yet. Otherwise we wait for information before blindly 1419 // making a decision that might have to be reversed later. 1420 // 1421 // If all pending claims are handled with the builtin controller, 1422 // there is no need for a PodSchedulingContext change.
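//
// A worked example for the condition below, assuming two claims that are
// pending with a control plane controller: if the driver has provided status
// for both of them, numDelayedAllocationPending == 2, numClaimsWithStatusInfo == 2
// and claimsWithBuiltinController is empty, so the node gets set as the
// selected node (unless it already is) and allocation can start. If status is
// available for only one of the two claims, the condition is false and, unless
// a PodSchedulingContext update still needs to be published, Reserve ends up
// returning the "waiting for resource driver to provide information" status
// further down.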
1423 if numDelayedAllocationPending == 1 && len(claimsWithBuiltinController) == 0 || 1424 numClaimsWithStatusInfo+len(claimsWithBuiltinController) == numDelayedAllocationPending && len(claimsWithBuiltinController) < numDelayedAllocationPending { 1425 // TODO: can we increase the chance that the scheduler picks 1426 // the same node as before when allocation is on-going, 1427 // assuming that that node still fits the pod? Picking a 1428 // different node may lead to some claims being allocated for 1429 // one node and others for another, which then would have to be 1430 // resolved with deallocation. 1431 if state.podSchedulingState.schedulingCtx == nil || 1432 state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName { 1433 state.podSchedulingState.selectedNode = &nodeName 1434 logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}) 1435 // The actual publish happens in PreBind or Unreserve. 1436 return nil 1437 } 1438 } 1439 1440 // May have been modified earlier in PreScore or above. 1441 if state.podSchedulingState.isDirty() { 1442 // The actual publish happens in PreBind or Unreserve. 1443 return nil 1444 } 1445 1446 // If all pending claims are handled with the builtin controller, then 1447 // we can allow the pod to proceed. Allocating and reserving the claims 1448 // will be done in PreBind. 1449 if numDelayedAllocationPending == 0 { 1450 return nil 1451 } 1452 1453 // More than one pending claim and not enough information about all of them. 1454 // 1455 // TODO: can or should we ensure that schedulingCtx gets aborted while 1456 // waiting for resources *before* triggering delayed volume 1457 // provisioning? On the one hand, volume provisioning is currently 1458 // irreversible, so it better should come last. On the other hand, 1459 // triggering both in parallel might be faster. 1460 return statusPending(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod)) 1461 } 1462 1463 // Unreserve clears the ReservedFor field for all claims. 1464 // It's idempotent, and does nothing if no state found for the given pod. 1465 func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) { 1466 if !pl.enabled { 1467 return 1468 } 1469 state, err := getStateData(cs) 1470 if err != nil { 1471 return 1472 } 1473 if len(state.claims) == 0 { 1474 return 1475 } 1476 1477 logger := klog.FromContext(ctx) 1478 1479 // Was publishing delayed? If yes, do it now. 1480 // 1481 // The most common scenario is that a different set of potential nodes 1482 // was identified. This revised set needs to be published to enable DRA 1483 // drivers to provide better guidance for future scheduling attempts. 1484 if state.podSchedulingState.isDirty() { 1485 if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil { 1486 logger.Error(err, "publish PodSchedulingContext") 1487 } 1488 } 1489 1490 for index, claim := range state.claims { 1491 // If allocation was in-flight, then it's not anymore and we need to revert the 1492 // claim object in the assume cache to what it was before. 1493 if state.informationsForClaim[index].controller != nil { 1494 if _, found := pl.inFlightAllocations.LoadAndDelete(state.claims[index].UID); found { 1495 pl.claimAssumeCache.Restore(claim.Namespace + "/" + claim.Name) 1496 } 1497 } 1498 1499 if claim.Status.Allocation != nil && 1500 resourceclaim.IsReservedForPod(pod, claim) { 1501 // Remove pod from ReservedFor. 
A strategic-merge-patch is used 1502 // because that allows removing an individual entry without having 1503 // the latest slice. 1504 patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`, 1505 claim.UID, 1506 pod.UID, 1507 ) 1508 logger.V(5).Info("unreserve", "resourceclaim", klog.KObj(claim), "pod", klog.KObj(pod)) 1509 claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status") 1510 if err != nil { 1511 // We will get here again when pod scheduling is retried. 1512 logger.Error(err, "unreserve", "resourceclaim", klog.KObj(claim)) 1513 } 1514 } 1515 } 1516 } 1517 1518 // PreBind gets called in a separate goroutine after it has been determined 1519 // that the pod should get bound to this node. Because Reserve did not actually 1520 // reserve claims, we need to do it now. For claims with the builtin controller, 1521 // we also handle the allocation. 1522 // 1523 // If anything fails, we return an error and 1524 // the pod will have to go into the backoff queue. The scheduler will call 1525 // Unreserve as part of the error handling. 1526 func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { 1527 if !pl.enabled { 1528 return nil 1529 } 1530 state, err := getStateData(cs) 1531 if err != nil { 1532 return statusError(klog.FromContext(ctx), err) 1533 } 1534 if len(state.claims) == 0 { 1535 return nil 1536 } 1537 1538 logger := klog.FromContext(ctx) 1539 1540 // Was publishing delayed? If yes, do it now and then cause binding to stop. 1541 // This will not happen if all claims get handled by builtin controllers. 1542 if state.podSchedulingState.isDirty() { 1543 if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil { 1544 return statusError(logger, err) 1545 } 1546 return statusPending(logger, "waiting for resource driver", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}) 1547 } 1548 1549 for index, claim := range state.claims { 1550 if !resourceclaim.IsReservedForPod(pod, claim) { 1551 claim, err := pl.bindClaim(ctx, state, index, pod, nodeName) 1552 if err != nil { 1553 return statusError(logger, err) 1554 } 1555 state.claims[index] = claim 1556 } 1557 } 1558 // If we get here, we know that reserving the claim for 1559 // the pod worked and we can proceed with binding it. 1560 return nil 1561 } 1562 1563 // bindClaim gets called by PreBind for claim which is not reserved for the pod yet. 1564 // It might not even be allocated. bindClaim then ensures that the allocation 1565 // and reservation are recorded. This finishes the work started in Reserve. 1566 func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) { 1567 logger := klog.FromContext(ctx) 1568 claim := state.claims[index] 1569 allocationPatch := "" 1570 1571 allocation := state.informationsForClaim[index].allocation 1572 logger.V(5).Info("preparing claim status patch", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation)) 1573 1574 // Do we need to store an allocation result from Reserve? 
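//
// A sketch of the status patch that gets assembled below when the scheduler
// allocated the claim itself; the driver name and UIDs are placeholders:
//
//	{"metadata": {"uid": "<claim UID>"},
//	 "status": {"driverName": "gpu.example.com",
//	            "allocation": { /* marshaled AllocationResult */ },
//	            "reservedFor": [{"resource": "pods", "name": "<pod name>", "uid": "<pod UID>"}]}}
//
// For a claim that was already allocated by someone else, allocationPatch
// stays empty and only the reservedFor entry is sent.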
1575 if allocation != nil { 1576 buffer, err := json.Marshal(allocation) 1577 if err != nil { 1578 return nil, fmt.Errorf("marshaling AllocationResult failed: %v", err) 1579 } 1580 allocationPatch = fmt.Sprintf(`"driverName": %q, "allocation": %s, `, state.informationsForClaim[index].allocationDriverName, string(buffer)) 1581 1582 // The finalizer needs to be added in a normal update. Using a simple update is fine 1583 // because we don't expect concurrent modifications while the claim is not allocated 1584 // yet. If there are any, we want to fail. 1585 // 1586 // If we were interrupted in the past, it might already be set and we simply continue. 1587 if !slices.Contains(claim.Finalizers, resourcev1alpha2.Finalizer) { 1588 claim := state.claims[index].DeepCopy() 1589 claim.Finalizers = append(claim.Finalizers, resourcev1alpha2.Finalizer) 1590 if _, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil { 1591 return nil, fmt.Errorf("add finalizer: %v", err) 1592 } 1593 } 1594 } 1595 1596 // The claim might be stale, for example because the claim can get shared and some 1597 // other goroutine has updated it in the meantime. We therefore cannot use 1598 // SSA here to add the pod because then we would have to send the entire slice 1599 // or use different field manager strings for each entry. 1600 // 1601 // With a strategic-merge-patch, we can simply send one new entry. The apiserver 1602 // validation will catch if two goroutines try to do that at the same time and 1603 // the claim cannot be shared. 1604 // 1605 // Note that this also works when the allocation result gets added twice because 1606 // two pods both started using a shared claim: the first pod to get here adds the 1607 // allocation result. The second pod then only adds itself to reservedFor. 1608 patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": {%s "reservedFor": [ {"resource": "pods", "name": %q, "uid": %q} ] }}`, 1609 claim.UID, 1610 allocationPatch, 1611 pod.Name, 1612 pod.UID, 1613 ) 1614 if loggerV := logger.V(6); loggerV.Enabled() { 1615 logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim), "patch", patch) 1616 } else { 1617 logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim)) 1618 } 1619 claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status") 1620 logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim), "err", err) 1621 if allocationPatch != "" { 1622 // The scheduler was handling allocation. Now that has 1623 // completed, either successfully or with a failure. 1624 if err == nil { 1625 // This can fail, but only for reasons that are okay (concurrent delete or update). 1626 // Shouldn't happen in this case. 1627 if err := pl.claimAssumeCache.Assume(claim); err != nil { 1628 logger.V(5).Info("Claim not stored in assume cache", "err", err) 1629 } 1630 } 1631 pl.inFlightAllocations.Delete(claim.UID) 1632 } 1633 return claim, err 1634 } 1635 1636 // PostBind is called after a pod is successfully bound to a node. Now we are 1637 // sure that a PodSchedulingContext object, if it exists, is definitely not going to 1638 // be needed anymore and can delete it. 
This is a one-shot thing, there won't 1639 // be any retries. This is okay because it should usually work and in those 1640 // cases where it doesn't, the garbage collector will eventually clean up. 1641 func (pl *dynamicResources) PostBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) { 1642 if !pl.enabled { 1643 return 1644 } 1645 state, err := getStateData(cs) 1646 if err != nil { 1647 return 1648 } 1649 if len(state.claims) == 0 { 1650 return 1651 } 1652 1653 // We cannot know for sure whether the PodSchedulingContext object exists. We 1654 // might have created it in the previous pod scheduling cycle and not 1655 // have it in our informer cache yet. Let's try to delete, just to be 1656 // on the safe side. 1657 logger := klog.FromContext(ctx) 1658 err = pl.clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) 1659 switch { 1660 case apierrors.IsNotFound(err): 1661 logger.V(5).Info("no PodSchedulingContext object to delete") 1662 case err != nil: 1663 logger.Error(err, "delete PodSchedulingContext") 1664 default: 1665 logger.V(5).Info("PodSchedulingContext object deleted") 1666 } 1667 } 1668 1669 // statusUnschedulable ensures that there is a log message associated with the 1670 // line where the status originated. 1671 func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status { 1672 if loggerV := logger.V(5); loggerV.Enabled() { 1673 helper, loggerV := loggerV.WithCallStackHelper() 1674 helper() 1675 kv = append(kv, "reason", reason) 1676 // nolint: logcheck // warns because it cannot check key/values 1677 loggerV.Info("pod unschedulable", kv...) 1678 } 1679 return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason) 1680 } 1681 1682 // statusPending ensures that there is a log message associated with the 1683 // line where the status originated. 1684 func statusPending(logger klog.Logger, reason string, kv ...interface{}) *framework.Status { 1685 if loggerV := logger.V(5); loggerV.Enabled() { 1686 helper, loggerV := loggerV.WithCallStackHelper() 1687 helper() 1688 kv = append(kv, "reason", reason) 1689 // nolint: logcheck // warns because it cannot check key/values 1690 loggerV.Info("pod waiting for external component", kv...) 1691 } 1692 1693 // When we return Pending, we want to block the Pod at the same time. 1694 return framework.NewStatus(framework.Pending, reason) 1695 } 1696 1697 // statusError ensures that there is a log message associated with the 1698 // line where the error originated. 1699 func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status { 1700 if loggerV := logger.V(5); loggerV.Enabled() { 1701 helper, loggerV := loggerV.WithCallStackHelper() 1702 helper() 1703 // nolint: logcheck // warns because it cannot check key/values 1704 loggerV.Error(err, "dynamic resource plugin failed", kv...) 1705 } 1706 return framework.AsStatus(err) 1707 }
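
// A minimal sketch of the strategic-merge-patch used by Unreserve above to
// drop a single reservedFor entry without knowing the rest of the slice; the
// clientset, claim and pod variables are assumed to be in scope and the UIDs
// are placeholders:
//
//	patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": {"reservedFor": [{"$patch": "delete", "uid": %q}]}}`,
//	        claim.UID, pod.UID)
//	_, err := clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx,
//	        claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")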