k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/dra/manager.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dra

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
	dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// draManagerStateFileName is the file name where the DRA manager stores its state.
const draManagerStateFileName = "dra_manager_state"

// ManagerImpl is the structure in charge of managing DRA resource Plugins.
type ManagerImpl struct {
	// cache contains cached claim info
	cache *claimInfoCache

	// kubeClient is the kube API client used to query ResourceClaim objects.
	kubeClient clientset.Interface
}

// NewManagerImpl creates a new manager.
func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string) (*ManagerImpl, error) {
	klog.V(2).InfoS("Creating DRA manager")

	claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
	if err != nil {
		return nil, fmt.Errorf("failed to create claimInfo cache: %+v", err)
	}

	manager := &ManagerImpl{
		cache:      claimInfoCache,
		kubeClient: kubeClient,
	}

	return manager, nil
}
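
// A minimal sketch of how a caller wires the manager up (hypothetical caller
// and state directory; not part of this file):
//
//	draManager, err := NewManagerImpl(kubeClient, "/var/lib/kubelet/dra")
//	if err != nil {
//		return err
//	}
//	// Before starting a pod's containers:
//	if err := draManager.PrepareResources(pod); err != nil {
//		return err
//	}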

// PrepareResources attempts to prepare all of the required resources
// for the input pod, issues NodePrepareResources RPC requests for each
// new resource requirement, processes their responses, and updates the
// cached containerResources on success.
func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
	batches := make(map[string][]*drapb.Claim)
	claimInfos := make(map[types.UID]*ClaimInfo)
	for i := range pod.Spec.ResourceClaims {
		podClaim := &pod.Spec.ResourceClaims[i]
		klog.V(3).InfoS("Processing resource", "podClaim", podClaim.Name, "pod", pod.Name)
		claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
		if err != nil {
			return fmt.Errorf("prepare resource claim: %v", err)
		}

		if claimName == nil {
			// Nothing to do.
			continue
		}

		// Query the claim object from the API server.
		resourceClaim, err := m.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Get(
			context.TODO(),
			*claimName,
			metav1.GetOptions{})
		if err != nil {
			return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", *claimName, pod.Name, err)
		}

		if mustCheckOwner {
			if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil {
				return err
			}
		}

		// Check whether the pod is in the claim's ReservedFor list.
		if !resourceclaim.IsReservedForPod(pod, resourceClaim) {
			return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)",
				pod.Name, pod.UID, *claimName, resourceClaim.UID)
		}

		// If no container actually uses the claim, then we don't need
		// to prepare it.
		if !claimIsUsedByPod(podClaim, pod) {
			klog.V(5).InfoS("Skipping unused resource", "claim", claimName, "pod", pod.Name)
			continue
		}

		claimInfo := m.cache.get(*claimName, pod.Namespace)
		if claimInfo == nil {
			// The claim does not exist in the cache; create a new claimInfo
			// object to be processed later.
			claimInfo = newClaimInfoFromResourceClaim(resourceClaim)
		}

		// We delay checkpointing of this change until this call
		// returns successfully. It is OK to do this because we
		// will only return successfully from this call if the
		// checkpoint has succeeded. That means if the kubelet is
		// ever restarted before this checkpoint succeeds, the pod
		// whose resources are being prepared would never have
		// started, so it's OK (actually correct) to not include it
		// in the cache.
		claimInfo.addPodReference(pod.UID)

		if claimInfo.prepared {
			// Already prepared this claim, no need to prepare it again.
			continue
		}

		// Loop through all plugins and prepare for calling NodePrepareResources.
		for _, resourceHandle := range claimInfo.ResourceHandles {
			// If no DriverName is provided in the resourceHandle, we
			// use the DriverName from the status.
			pluginName := resourceHandle.DriverName
			if pluginName == "" {
				pluginName = resourceClaim.Status.DriverName
			}
			claim := &drapb.Claim{
				Namespace:      resourceClaim.Namespace,
				Uid:            string(resourceClaim.UID),
				Name:           resourceClaim.Name,
				ResourceHandle: resourceHandle.Data,
			}
			batches[pluginName] = append(batches[pluginName], claim)
		}
		claimInfos[resourceClaim.UID] = claimInfo
	}
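
	// At this point, batches maps each DRA plugin (driver) name to the claims
	// it must prepare. Illustrative contents only (hypothetical driver name
	// and values; not produced by this file):
	//
	//	batches["gpu.example.com"] = []*drapb.Claim{{
	//		Namespace:      "default",
	//		Uid:            "<claim UID>",
	//		Name:           "my-claim",
	//		ResourceHandle: "<opaque driver data>",
	//	}}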

	// Call NodePrepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call the NodePrepareResources RPC for all resource handles.
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodePrepareResources(context.Background(), &drapb.NodePrepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodePrepareResources failed: %v", err)
		}
		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.Error != "" {
				return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}

			claimInfo := claimInfos[types.UID(claimUID)]

			// Add the CDI devices returned by NodePrepareResources to
			// the claimInfo object.
			err = claimInfo.addCDIDevices(pluginName, result.CDIDevices)
			if err != nil {
				return fmt.Errorf("failed to add CDIDevices to claimInfo %+v: %+v", claimInfo, err)
			}
			// Mark the claim as (successfully) prepared by the manager, so
			// that we don't prepare it again next time.
			claimInfo.prepared = true

			// TODO: We (re)add the claimInfo object to the cache and
			// sync it to the checkpoint *after* the
			// NodePrepareResources call has completed. This will cause
			// issues if the kubelet gets restarted between
			// NodePrepareResources and syncToCheckpoint. It will result
			// in not calling NodeUnprepareResources for this claim
			// because no claimInfo will be synced back to the cache
			// for it after the restart. We need to resolve this issue
			// before moving to beta.
			m.cache.add(claimInfo)
		}

		// Checkpoint to reduce redundant calls to
		// NodePrepareResources after a kubelet restart.
		err = m.cache.syncToCheckpoint()
		if err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
		}
	}

	// Checkpoint to capture all of the previous addPodReference() calls.
	err := m.cache.syncToCheckpoint()
	if err != nil {
		return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
	}
	return nil
}

func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
	for _, claim := range claims {
		if claim.Uid == claimUID {
			return claim
		}
	}
	return nil
}

func claimIsUsedByPod(podClaim *v1.PodResourceClaim, pod *v1.Pod) bool {
	if claimIsUsedByContainers(podClaim, pod.Spec.InitContainers) {
		return true
	}
	if claimIsUsedByContainers(podClaim, pod.Spec.Containers) {
		return true
	}
	return false
}

func claimIsUsedByContainers(podClaim *v1.PodResourceClaim, containers []v1.Container) bool {
	for i := range containers {
		if claimIsUsedByContainer(podClaim, &containers[i]) {
			return true
		}
	}
	return false
}

func claimIsUsedByContainer(podClaim *v1.PodResourceClaim, container *v1.Container) bool {
	for _, c := range container.Resources.Claims {
		if c.Name == podClaim.Name {
			return true
		}
	}
	return false
}
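
// For reference, the name linkage these helpers check, as a minimal
// illustrative pod (hypothetical values; not part of this file):
//
//	pod := &v1.Pod{Spec: v1.PodSpec{
//		ResourceClaims: []v1.PodResourceClaim{{Name: "gpu"}},
//		Containers: []v1.Container{{
//			Name:      "ctr",
//			Resources: v1.ResourceRequirements{Claims: []v1.ResourceClaim{{Name: "gpu"}}},
//		}},
//	}}
//	used := claimIsUsedByPod(&pod.Spec.ResourceClaims[0], pod) // true: "ctr" references "gpu"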

// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error) {
	annotations := []kubecontainer.Annotation{}
	cdiDevices := []kubecontainer.CDIDevice{}

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("list resource claims: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}

			claimInfo := m.cache.get(*claimName, pod.Namespace)
			if claimInfo == nil {
				return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName)
			}

			claimInfo.RLock()
			claimAnnotations := claimInfo.annotationsAsList()
			klog.V(3).InfoS("Add resource annotations", "claim", *claimName, "annotations", claimAnnotations)
			annotations = append(annotations, claimAnnotations...)
			for _, devices := range claimInfo.CDIDevices {
				for _, device := range devices {
					cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: device})
				}
			}
			claimInfo.RUnlock()
		}
	}

	return &ContainerInfo{Annotations: annotations, CDIDevices: cdiDevices}, nil
}
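
// A minimal sketch of how a caller consumes GetResources (hypothetical
// names; not part of this file):
//
//	info, err := draManager.GetResources(pod, container)
//	if err != nil {
//		return err
//	}
//	// info.Annotations and info.CDIDevices are then merged into the
//	// container's runtime configuration.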

// UnprepareResources calls a plugin's NodeUnprepareResource API for each resource claim owned by a pod.
// This function is idempotent and may be called multiple times against the same pod.
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
	batches := make(map[string][]*drapb.Claim)
	claimInfos := make(map[types.UID]*ClaimInfo)
	for i := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return fmt.Errorf("unprepare resource claim: %v", err)
		}

		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}

		claimInfo := m.cache.get(*claimName, pod.Namespace)

		// Skip calling NodeUnprepareResource if the claim info is not cached.
		if claimInfo == nil {
			continue
		}

		// Skip calling NodeUnprepareResource if other pods are still referencing it.
		if len(claimInfo.PodUIDs) > 1 {
			// We delay checkpointing of this change until this call returns successfully.
			// It is OK to do this because we will only return successfully from this call if
			// the checkpoint has succeeded. That means if the kubelet is ever restarted
			// before this checkpoint succeeds, we will simply call into this (idempotent)
			// function again.
			claimInfo.deletePodReference(pod.UID)
			continue
		}

		// Loop through all plugins and prepare for calling NodeUnprepareResources.
		for _, resourceHandle := range claimInfo.ResourceHandles {
			// If no DriverName is provided in the resourceHandle, we
			// use the DriverName from the status.
			pluginName := resourceHandle.DriverName
			if pluginName == "" {
				pluginName = claimInfo.DriverName
			}

			claim := &drapb.Claim{
				Namespace:      claimInfo.Namespace,
				Uid:            string(claimInfo.ClaimUID),
				Name:           claimInfo.ClaimName,
				ResourceHandle: resourceHandle.Data,
			}
			batches[pluginName] = append(batches[pluginName], claim)
		}
		claimInfos[claimInfo.ClaimUID] = claimInfo
	}

	// Call NodeUnprepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call the NodeUnprepareResources RPC for all resource handles.
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodeUnprepareResources(context.Background(), &drapb.NodeUnprepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodeUnprepareResources failed: %v", err)
		}

		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.Error != "" {
				return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}

			// Delete the last pod UID only if unprepare succeeds.
			// This ensures that the status manager doesn't enter termination status
			// for the pod. This logic is implemented in
			// m.PodMightNeedToUnprepareResources and claimInfo.hasPodReference.
			claimInfo := claimInfos[types.UID(claimUID)]
			claimInfo.deletePodReference(pod.UID)
			m.cache.delete(claimInfo.ClaimName, pod.Namespace)
		}

		// Checkpoint to reduce redundant calls to NodeUnprepareResources after a kubelet restart.
		err = m.cache.syncToCheckpoint()
		if err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
		}
	}

	// Checkpoint to capture all of the previous deletePodReference() calls.
	err := m.cache.syncToCheckpoint()
	if err != nil {
		return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
	}
	return nil
}
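
// A minimal sketch of the intended cleanup pattern during pod termination
// (hypothetical caller; not part of this file):
//
//	if draManager.PodMightNeedToUnprepareResources(pod.UID) {
//		if err := draManager.UnprepareResources(pod); err != nil {
//			return err
//		}
//	}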

// PodMightNeedToUnprepareResources returns true if the pod might need to
// unprepare resources.
func (m *ManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
	return m.cache.hasPodReference(UID)
}

// GetContainerClaimInfos gets a container's ClaimInfo objects.
func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error) {
	claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims))

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("determine resource claim information: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim; skip those entries
		// rather than dereferencing a nil claimName below.
		if claimName == nil {
			continue
		}

		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}
			claimInfo := m.cache.get(*claimName, pod.Namespace)
			if claimInfo == nil {
				return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName)
			}
			claimInfos = append(claimInfos, claimInfo)
		}
	}
	return claimInfos, nil
}
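
// A minimal usage sketch for GetContainerClaimInfos (hypothetical caller;
// not part of this file):
//
//	claimInfos, err := draManager.GetContainerClaimInfos(pod, container)
//	if err != nil {
//		return err
//	}
//	for _, ci := range claimInfos {
//		ci.RLock()
//		// e.g. report the claim's CDI devices for this container.
//		ci.RUnlock()
//	}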