k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/cm/dra/manager.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dra

import (
	"context"
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	resourceapi "k8s.io/api/resource/v1alpha2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
	dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	"k8s.io/kubernetes/pkg/kubelet/config"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// draManagerStateFileName is the file name where dra manager stores its state
const draManagerStateFileName = "dra_manager_state"

// defaultReconcilePeriod is the default reconciliation period to keep all claim info state in sync.
const defaultReconcilePeriod = 60 * time.Second

// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod

// ManagerImpl is the structure in charge of managing DRA resource Plugins.
type ManagerImpl struct {
	// cache contains cached claim info
	cache *claimInfoCache

	// reconcilePeriod is the duration between calls to reconcileLoop.
	reconcilePeriod time.Duration

	// activePods is a method for listing active pods on the node
	// so all claim info state can be updated in the reconciliation loop.
	activePods ActivePodsFunc

	// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
	// We use it to determine when we can treat pods as inactive and react appropriately.
	sourcesReady config.SourcesReady

	// KubeClient reference
	kubeClient clientset.Interface
}

// NewManagerImpl creates a new manager.
func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, nodeName types.NodeName) (*ManagerImpl, error) {
	klog.V(2).InfoS("Creating DRA manager")

	claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
	if err != nil {
		return nil, fmt.Errorf("failed to create claimInfo cache: %+v", err)
	}

	// TODO: for now the reconcile period is not configurable.
	// We should consider making it configurable in the future.
	reconcilePeriod := defaultReconcilePeriod

	manager := &ManagerImpl{
		cache:           claimInfoCache,
		kubeClient:      kubeClient,
		reconcilePeriod: reconcilePeriod,
		activePods:      nil,
		sourcesReady:    nil,
	}

	return manager, nil
}

// Start starts the reconcile loop of the manager.
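// It records the activePods and sourcesReady callbacks and then launches a
// goroutine that periodically reconciles the claim info cache against the
// current set of active pods.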
func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
	m.activePods = activePods
	m.sourcesReady = sourcesReady
	go wait.Until(func() { m.reconcileLoop() }, m.reconcilePeriod, wait.NeverStop)
	return nil
}

// reconcileLoop ensures that any stale state in the manager's claimInfoCache gets periodically reconciled.
func (m *ManagerImpl) reconcileLoop() {
	// Only once all sources are ready do we attempt to reconcile.
	// This ensures that the call to m.activePods() below will succeed with
	// the actual active pods list.
	if m.sourcesReady == nil || !m.sourcesReady.AllReady() {
		return
	}

	// Get the full list of active pods.
	activePods := sets.New[string]()
	for _, p := range m.activePods() {
		activePods.Insert(string(p.UID))
	}

	// Get the list of inactive pods still referenced by any claimInfos.
	type podClaims struct {
		uid        types.UID
		namespace  string
		claimNames []string
	}
	inactivePodClaims := make(map[string]*podClaims)
	m.cache.RLock()
	for _, claimInfo := range m.cache.claimInfo {
		for podUID := range claimInfo.PodUIDs {
			if activePods.Has(podUID) {
				continue
			}
			if inactivePodClaims[podUID] == nil {
				inactivePodClaims[podUID] = &podClaims{
					uid:        types.UID(podUID),
					namespace:  claimInfo.Namespace,
					claimNames: []string{},
				}
			}
			inactivePodClaims[podUID].claimNames = append(inactivePodClaims[podUID].claimNames, claimInfo.ClaimName)
		}
	}
	m.cache.RUnlock()

	// Loop through all inactive pods and call UnprepareResources on them.
	for _, podClaims := range inactivePodClaims {
		if err := m.unprepareResources(podClaims.uid, podClaims.namespace, podClaims.claimNames); err != nil {
			klog.ErrorS(err, "Unpreparing pod resources in reconcile loop", "podUID", podClaims.uid)
		}
	}
}

// PrepareResources attempts to prepare all of the required resource
// plugin resources for the input pod, issues NodePrepareResources RPC requests
// for each new resource requirement, processes their responses, and updates
// the cached claim info on success.
func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
	batches := make(map[string][]*drapb.Claim)
	resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
	for i := range pod.Spec.ResourceClaims {
		podClaim := &pod.Spec.ResourceClaims[i]
		klog.V(3).InfoS("Processing resource", "podClaim", podClaim.Name, "pod", pod.Name)
		claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
		if err != nil {
			return fmt.Errorf("prepare resource claim: %v", err)
		}

		if claimName == nil {
			// Nothing to do.
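			// The claim name is nil if no underlying resource claim
			// was generated for the referenced claim. There are valid
			// use cases when this might happen, so we simply skip it.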
			continue
		}
		// Query claim object from the API server
		resourceClaim, err := m.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Get(
			context.TODO(),
			*claimName,
			metav1.GetOptions{})
		if err != nil {
			return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", *claimName, pod.Name, err)
		}

		if mustCheckOwner {
			if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil {
				return err
			}
		}

		// Check if pod is in the ReservedFor for the claim
		if !resourceclaim.IsReservedForPod(pod, resourceClaim) {
			return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)",
				pod.Name, pod.UID, *claimName, resourceClaim.UID)
		}

		// If no container actually uses the claim, then we don't need
		// to prepare it.
		if !claimIsUsedByPod(podClaim, pod) {
			klog.V(5).InfoS("Skipping unused resource", "claim", claimName, "pod", pod.Name)
			continue
		}

		// Atomically perform some operations on the claimInfo cache.
		err = m.cache.withLock(func() error {
			// Get a reference to the claim info for this claim from the cache.
			// If there isn't one yet, then add it to the cache.
			claimInfo, exists := m.cache.get(resourceClaim.Name, resourceClaim.Namespace)
			if !exists {
				claimInfo = m.cache.add(newClaimInfoFromClaim(resourceClaim))
			}

			// Add a reference to the current pod in the claim info.
			claimInfo.addPodReference(pod.UID)

			// Checkpoint to ensure all claims we plan to prepare are tracked.
			// If something goes wrong and the newly referenced pod gets
			// deleted without a successful prepare call, we will catch
			// that in the reconcile loop and take the appropriate action.
			if err := m.cache.syncToCheckpoint(); err != nil {
				return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
			}

			// If this claim is already prepared, there is no need to prepare it again.
			if claimInfo.isPrepared() {
				return nil
			}

			// This saved claim will be used to update ClaimInfo cache
			// after NodePrepareResources GRPC succeeds
			resourceClaims[claimInfo.ClaimUID] = resourceClaim

			// Loop through all plugins and prepare for calling NodePrepareResources.
			for _, resourceHandle := range claimInfo.ResourceHandles {
				claim := &drapb.Claim{
					Namespace:      claimInfo.Namespace,
					Uid:            string(claimInfo.ClaimUID),
					Name:           claimInfo.ClaimName,
					ResourceHandle: resourceHandle.Data,
				}
				if resourceHandle.StructuredData != nil {
					claim.StructuredResourceHandle = []*resourceapi.StructuredResourceHandle{resourceHandle.StructuredData}
				}
				pluginName := resourceHandle.DriverName
				batches[pluginName] = append(batches[pluginName], claim)
			}

			return nil
		})
		if err != nil {
			return fmt.Errorf("locked cache operation: %w", err)
		}
	}

	// Call NodePrepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call NodePrepareResources RPC for all resource handles.
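		// Each plugin receives a single batched gRPC call covering all of
		// the claims whose resource handles name it as the driver.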
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodePrepareResources(context.Background(), &drapb.NodePrepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodePrepareResources failed: %v", err)
		}
		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.GetError() != "" {
				return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}

			claim := resourceClaims[types.UID(claimUID)]

			// Add the prepared CDI devices to the claim info
			err := m.cache.withLock(func() error {
				info, exists := m.cache.get(claim.Name, claim.Namespace)
				if !exists {
					return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
				}
				if err := info.setCDIDevices(pluginName, result.GetCDIDevices()); err != nil {
					return fmt.Errorf("unable to add CDI devices for plugin %s of claim %s in namespace %s", pluginName, claim.Name, claim.Namespace)
				}
				return nil
			})
			if err != nil {
				return fmt.Errorf("locked cache operation: %w", err)
			}
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
		}
	}

	// Atomically perform some operations on the claimInfo cache.
	err := m.cache.withLock(func() error {
		// Mark all pod claims as prepared.
		for _, claim := range resourceClaims {
			info, exists := m.cache.get(claim.Name, claim.Namespace)
			if !exists {
				return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
			}
			info.setPrepared()
		}

		// Checkpoint to ensure all prepared claims are tracked with their list
		// of CDI devices attached.
		if err := m.cache.syncToCheckpoint(); err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
		}

		return nil
	})
	if err != nil {
		return fmt.Errorf("locked cache operation: %w", err)
	}

	return nil
}

func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
	for _, claim := range claims {
		if claim.Uid == claimUID {
			return claim
		}
	}
	return nil
}

func claimIsUsedByPod(podClaim *v1.PodResourceClaim, pod *v1.Pod) bool {
	if claimIsUsedByContainers(podClaim, pod.Spec.InitContainers) {
		return true
	}
	if claimIsUsedByContainers(podClaim, pod.Spec.Containers) {
		return true
	}
	return false
}

func claimIsUsedByContainers(podClaim *v1.PodResourceClaim, containers []v1.Container) bool {
	for i := range containers {
		if claimIsUsedByContainer(podClaim, &containers[i]) {
			return true
		}
	}
	return false
}

func claimIsUsedByContainer(podClaim *v1.PodResourceClaim, container *v1.Container) bool {
	for _, c := range container.Resources.Claims {
		if c.Name == podClaim.Name {
			return true
		}
	}
	return false
}

// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error) {
	annotations := []kubecontainer.Annotation{}
	cdiDevices := []kubecontainer.CDIDevice{}

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("list resource claims: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}

			err := m.cache.withRLock(func() error {
				claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
				if !exists {
					return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
				}

				claimAnnotations := claimInfo.annotationsAsList()
				klog.V(3).InfoS("Add resource annotations", "claim", *claimName, "annotations", claimAnnotations)
				annotations = append(annotations, claimAnnotations...)

				devices := claimInfo.cdiDevicesAsList()
				klog.V(3).InfoS("Add CDI devices", "claim", *claimName, "CDI devices", devices)
				cdiDevices = append(cdiDevices, devices...)

				return nil
			})
			if err != nil {
				return nil, fmt.Errorf("locked cache operation: %w", err)
			}
		}
	}

	return &ContainerInfo{Annotations: annotations, CDIDevices: cdiDevices}, nil
}

// UnprepareResources calls a plugin's NodeUnprepareResource API for each resource claim owned by a pod.
// This function is idempotent and may be called multiple times against the same pod.
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
	var claimNames []string
	for i := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return fmt.Errorf("unprepare resource claim: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
		claimNames = append(claimNames, *claimName)
	}
	return m.unprepareResources(pod.UID, pod.Namespace, claimNames)
}

func (m *ManagerImpl) unprepareResources(podUID types.UID, namespace string, claimNames []string) error {
	batches := make(map[string][]*drapb.Claim)
	claimNamesMap := make(map[types.UID]string)
	for _, claimName := range claimNames {
		// Atomically perform some operations on the claimInfo cache.
		err := m.cache.withLock(func() error {
			// Get the claim info from the cache
			claimInfo, exists := m.cache.get(claimName, namespace)

			// Skip calling NodeUnprepareResource if claim info is not cached
			if !exists {
				return nil
			}

			// Skip calling NodeUnprepareResource if other pods are still referencing it
			if len(claimInfo.PodUIDs) > 1 {
				// We delay checkpointing of this change until
				// UnprepareResources returns successfully. It is OK to do
				// this because we will only return successfully from this call
				// if the checkpoint has succeeded. That means if the kubelet
				// is ever restarted before this checkpoint succeeds, we will
				// simply call into this (idempotent) function again.
				claimInfo.deletePodReference(podUID)
				return nil
			}

			// This claimInfo name will be used to update ClaimInfo cache
			// after NodeUnprepareResources GRPC succeeds
			claimNamesMap[claimInfo.ClaimUID] = claimInfo.ClaimName

			// Loop through all plugins and prepare for calling NodeUnprepareResources.
			for _, resourceHandle := range claimInfo.ResourceHandles {
				claim := &drapb.Claim{
					Namespace:      claimInfo.Namespace,
					Uid:            string(claimInfo.ClaimUID),
					Name:           claimInfo.ClaimName,
					ResourceHandle: resourceHandle.Data,
				}
				if resourceHandle.StructuredData != nil {
					claim.StructuredResourceHandle = []*resourceapi.StructuredResourceHandle{resourceHandle.StructuredData}
				}
				pluginName := resourceHandle.DriverName
				batches[pluginName] = append(batches[pluginName], claim)
			}

			return nil
		})
		if err != nil {
			return fmt.Errorf("locked cache operation: %w", err)
		}
	}

	// Call NodeUnprepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call NodeUnprepareResources RPC for all resource handles.
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodeUnprepareResources(context.Background(), &drapb.NodeUnprepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodeUnprepareResources failed: %v", err)
		}

		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.GetError() != "" {
				return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
		}
	}

	// Atomically perform some operations on the claimInfo cache.
	err := m.cache.withLock(func() error {
		// Delete all claimInfos from the cache that have just been unprepared.
		for _, claimName := range claimNamesMap {
			m.cache.delete(claimName, namespace)
		}

		// Atomically sync the cache back to the checkpoint.
		if err := m.cache.syncToCheckpoint(); err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("locked cache operation: %w", err)
	}

	return nil
}

// PodMightNeedToUnprepareResources returns true if the pod might need to
// unprepare resources.
func (m *ManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
	m.cache.Lock()
	defer m.cache.Unlock()
	return m.cache.hasPodReference(UID)
}

// GetContainerClaimInfos returns the ClaimInfo objects for the resource claims used by the given container.
func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error) {
	claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims))

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("determine resource claim information: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}

		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}

			err := m.cache.withRLock(func() error {
				claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
				if !exists {
					return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
				}
				claimInfos = append(claimInfos, claimInfo.DeepCopy())
				return nil
			})
			if err != nil {
				return nil, fmt.Errorf("locked cache operation: %w", err)
			}
		}
	}
	return claimInfos, nil
}
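
// The sketch below is illustrative only and is not part of the upstream
// kubelet code: it shows, under stated assumptions, how a caller (such as
// the kubelet's container manager) might drive the manager through a pod's
// lifecycle. The function name and all arguments are hypothetical and
// assumed to be supplied by that caller.
func exampleManagerLifecycle(
	kubeClient clientset.Interface,
	stateDir string,
	nodeName types.NodeName,
	activePods ActivePodsFunc,
	sourcesReady config.SourcesReady,
	pod *v1.Pod,
) error {
	// Create the manager; this also sets up the claim info cache backed by
	// the checkpoint file under stateDir.
	manager, err := NewManagerImpl(kubeClient, stateDir, nodeName)
	if err != nil {
		return err
	}

	// Start the reconcile loop that unprepares claims of pods which are no
	// longer active.
	if err := manager.Start(activePods, sourcesReady); err != nil {
		return err
	}

	// Prepare all resource claims referenced by the pod before its
	// containers are started.
	if err := manager.PrepareResources(pod); err != nil {
		return err
	}

	// Fetch the CDI devices and annotations that belong in each container's
	// runtime configuration.
	for i := range pod.Spec.Containers {
		if _, err := manager.GetResources(pod, &pod.Spec.Containers[i]); err != nil {
			return err
		}
	}

	// Once the pod has terminated, unprepare its claims again.
	return manager.UnprepareResources(pod)
}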