k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/dra/manager.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dra

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/dynamic-resource-allocation/resourceclaim"
	"k8s.io/klog/v2"
	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
	dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// draManagerStateFileName is the file name where dra manager stores its state
const draManagerStateFileName = "dra_manager_state"

// ManagerImpl is the structure in charge of managing DRA resource Plugins.
type ManagerImpl struct {
	// cache contains cached claim info
	cache *claimInfoCache

	// KubeClient reference
	kubeClient clientset.Interface
}

// NewManagerImpl creates a new manager.
func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string) (*ManagerImpl, error) {
	klog.V(2).InfoS("Creating DRA manager")

	claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
	if err != nil {
		return nil, fmt.Errorf("failed to create claimInfo cache: %+v", err)
	}

	manager := &ManagerImpl{
		cache:      claimInfoCache,
		kubeClient: kubeClient,
	}

	return manager, nil
}

// PrepareResources attempts to prepare all of the required resource
// plugin resources for the input pod, issues NodePrepareResources RPC requests
// for each new resource requirement, processes their responses, and updates the
// cached containerResources on success.
func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error {
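	// Group the claims that need preparing by plugin name so that a single
	// NodePrepareResources call can be issued per plugin, and keep the
	// corresponding claimInfo objects by claim UID for processing the responses.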
	batches := make(map[string][]*drapb.Claim)
	claimInfos := make(map[types.UID]*ClaimInfo)
	for i := range pod.Spec.ResourceClaims {
		podClaim := &pod.Spec.ResourceClaims[i]
		klog.V(3).InfoS("Processing resource", "podClaim", podClaim.Name, "pod", pod.Name)
		claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
		if err != nil {
			return fmt.Errorf("prepare resource claim: %v", err)
		}

		if claimName == nil {
			// Nothing to do.
			continue
		}
		// Query claim object from the API server
		resourceClaim, err := m.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Get(
			context.TODO(),
			*claimName,
			metav1.GetOptions{})
		if err != nil {
			return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", *claimName, pod.Name, err)
		}

		if mustCheckOwner {
			if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil {
				return err
			}
		}

		// Check if pod is in the ReservedFor for the claim
		if !resourceclaim.IsReservedForPod(pod, resourceClaim) {
			return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)",
				pod.Name, pod.UID, *claimName, resourceClaim.UID)
		}

		// If no container actually uses the claim, then we don't need
		// to prepare it.
		if !claimIsUsedByPod(podClaim, pod) {
			klog.V(5).InfoS("Skipping unused resource", "claim", claimName, "pod", pod.Name)
			continue
		}

		claimInfo := m.cache.get(*claimName, pod.Namespace)
		if claimInfo == nil {
			// claim does not exist in cache, create new claimInfo object
			// to be processed later.
			claimInfo = newClaimInfoFromResourceClaim(resourceClaim)
		}

		// We delay checkpointing of this change until this call
		// returns successfully. It is OK to do this because we
		// will only return successfully from this call if the
		// checkpoint has succeeded. That means if the kubelet is
		// ever restarted before this checkpoint succeeds, the pod
		// whose resources are being prepared would never have
		// started, so it's OK (actually correct) to not include it
		// in the cache.
		claimInfo.addPodReference(pod.UID)

		if claimInfo.prepared {
			// Already prepared this claim, no need to prepare it again
			continue
		}

		// Loop through all plugins and prepare for calling NodePrepareResources.
		for _, resourceHandle := range claimInfo.ResourceHandles {
			// If no DriverName is provided in the resourceHandle, we
			// use the DriverName from the status
			pluginName := resourceHandle.DriverName
			if pluginName == "" {
				pluginName = resourceClaim.Status.DriverName
			}
			claim := &drapb.Claim{
				Namespace:      resourceClaim.Namespace,
				Uid:            string(resourceClaim.UID),
				Name:           resourceClaim.Name,
				ResourceHandle: resourceHandle.Data,
			}
			batches[pluginName] = append(batches[pluginName], claim)
		}
		claimInfos[resourceClaim.UID] = claimInfo
	}

	// Call NodePrepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call NodePrepareResources RPC for all resource handles.
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodePrepareResources(context.Background(), &drapb.NodePrepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodePrepareResources failed: %v", err)
		}
		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.Error != "" {
				return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}

			claimInfo := claimInfos[types.UID(claimUID)]

			// Add the CDI Devices returned by NodePrepareResources to
			// the claimInfo object.
			err = claimInfo.addCDIDevices(pluginName, result.CDIDevices)
			if err != nil {
				return fmt.Errorf("failed to add CDIDevices to claimInfo %+v: %+v", claimInfo, err)
			}
			// mark claim as (successfully) prepared by manager, so next time we don't prepare it.
			claimInfo.prepared = true

			// TODO: We (re)add the claimInfo object to the cache and
			// sync it to the checkpoint *after* the
			// NodePrepareResources call has completed. This will cause
			// issues if the kubelet gets restarted between
			// NodePrepareResources and syncToCheckpoint. It will result
			// in not calling NodeUnprepareResources for this claim
			// because no claimInfo will be synced back to the cache
			// for it after the restart. We need to resolve this issue
			// before moving to beta.
			m.cache.add(claimInfo)
		}

		// Checkpoint to reduce redundant calls to
		// NodePrepareResources after a kubelet restart.
		err = m.cache.syncToCheckpoint()
		if err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
		}
	}
	// Checkpoint to capture all of the previous addPodReference() calls.
	err := m.cache.syncToCheckpoint()
	if err != nil {
		return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
	}
	return nil
}

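// lookupClaimRequest returns the claim with the given UID from the list of
// claims sent to a plugin, or nil if no such claim was requested.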
func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
	for _, claim := range claims {
		if claim.Uid == claimUID {
			return claim
		}
	}
	return nil
}

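// claimIsUsedByPod checks whether the given claim is referenced by any of the
// pod's init containers or regular containers.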
func claimIsUsedByPod(podClaim *v1.PodResourceClaim, pod *v1.Pod) bool {
	if claimIsUsedByContainers(podClaim, pod.Spec.InitContainers) {
		return true
	}
	if claimIsUsedByContainers(podClaim, pod.Spec.Containers) {
		return true
	}
	return false
}

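// claimIsUsedByContainers checks whether the given claim is referenced by any
// container in the given list.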
func claimIsUsedByContainers(podClaim *v1.PodResourceClaim, containers []v1.Container) bool {
	for i := range containers {
		if claimIsUsedByContainer(podClaim, &containers[i]) {
			return true
		}
	}
	return false
}

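// claimIsUsedByContainer checks whether the given claim is listed in the
// container's resource claims.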
func claimIsUsedByContainer(podClaim *v1.PodResourceClaim, container *v1.Container) bool {
	for _, c := range container.Resources.Claims {
		if c.Name == podClaim.Name {
			return true
		}
	}
	return false
}

// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error) {
	annotations := []kubecontainer.Annotation{}
	cdiDevices := []kubecontainer.CDIDevice{}

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("list resource claims: %v", err)
		}
		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}
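		// Only collect annotations and CDI devices for claims that this
		// container actually references.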
		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}

			claimInfo := m.cache.get(*claimName, pod.Namespace)
			if claimInfo == nil {
				return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName)
			}

			claimInfo.RLock()
			claimAnnotations := claimInfo.annotationsAsList()
			klog.V(3).InfoS("Add resource annotations", "claim", *claimName, "annotations", claimAnnotations)
			annotations = append(annotations, claimAnnotations...)
			for _, devices := range claimInfo.CDIDevices {
				for _, device := range devices {
					cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: device})
				}
			}
			claimInfo.RUnlock()
		}
	}

	return &ContainerInfo{Annotations: annotations, CDIDevices: cdiDevices}, nil
}

// UnprepareResources calls a plugin's NodeUnprepareResources API for each resource claim owned by a pod.
// This function is idempotent and may be called multiple times against the same pod.
// As such, calls to the underlying NodeUnprepareResources API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error {
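	// Group the claims that need unpreparing by plugin name so that a single
	// NodeUnprepareResources call can be issued per plugin, and keep the
	// corresponding claimInfo objects by claim UID for processing the responses.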
	batches := make(map[string][]*drapb.Claim)
	claimInfos := make(map[types.UID]*ClaimInfo)
	for i := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return fmt.Errorf("unprepare resource claim: %v", err)
		}

		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}

		claimInfo := m.cache.get(*claimName, pod.Namespace)

		// Skip calling NodeUnprepareResource if claim info is not cached
		if claimInfo == nil {
			continue
		}

		// Skip calling NodeUnprepareResource if other pods are still referencing it
		if len(claimInfo.PodUIDs) > 1 {
			// We delay checkpointing of this change until this call returns successfully.
			// It is OK to do this because we will only return successfully from this call if
			// the checkpoint has succeeded. That means if the kubelet is ever restarted
			// before this checkpoint succeeds, we will simply call into this (idempotent)
			// function again.
			claimInfo.deletePodReference(pod.UID)
			continue
		}

		// Loop through all plugins and prepare for calling NodeUnprepareResources.
		for _, resourceHandle := range claimInfo.ResourceHandles {
			// If no DriverName is provided in the resourceHandle, we
			// use the DriverName from the status
			pluginName := resourceHandle.DriverName
			if pluginName == "" {
				pluginName = claimInfo.DriverName
			}

			claim := &drapb.Claim{
				Namespace:      claimInfo.Namespace,
				Uid:            string(claimInfo.ClaimUID),
				Name:           claimInfo.ClaimName,
				ResourceHandle: resourceHandle.Data,
			}
			batches[pluginName] = append(batches[pluginName], claim)
		}
		claimInfos[claimInfo.ClaimUID] = claimInfo
	}

	// Call NodeUnprepareResources for all claims in each batch.
	// If there is any error, processing gets aborted.
	// We could try to continue, but that would make the code more complex.
	for pluginName, claims := range batches {
		// Call NodeUnprepareResources RPC for all resource handles.
		client, err := dra.NewDRAPluginClient(pluginName)
		if err != nil {
			return fmt.Errorf("failed to get DRA Plugin client for plugin name %s: %v", pluginName, err)
		}
		response, err := client.NodeUnprepareResources(context.Background(), &drapb.NodeUnprepareResourcesRequest{Claims: claims})
		if err != nil {
			// General error unrelated to any particular claim.
			return fmt.Errorf("NodeUnprepareResources failed: %v", err)
		}

		for claimUID, result := range response.Claims {
			reqClaim := lookupClaimRequest(claims, claimUID)
			if reqClaim == nil {
				return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
			}
			if result.Error != "" {
				return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
			}

			// Delete last pod UID only if unprepare succeeds.
			// This ensures that the status manager doesn't enter termination status
			// for the pod. This logic is implemented in
			// m.PodMightNeedToUnprepareResources and claimInfo.hasPodReference.
			claimInfo := claimInfos[types.UID(claimUID)]
			claimInfo.deletePodReference(pod.UID)
			m.cache.delete(claimInfo.ClaimName, pod.Namespace)
		}

		// Checkpoint to reduce redundant calls to NodeUnprepareResources after a kubelet restart.
		err = m.cache.syncToCheckpoint()
		if err != nil {
			return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
		}

		unfinished := len(claims) - len(response.Claims)
		if unfinished != 0 {
			return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
		}
	}

	// Checkpoint to capture all of the previous deletePodReference() calls.
	err := m.cache.syncToCheckpoint()
	if err != nil {
		return fmt.Errorf("failed to checkpoint claimInfo state, err: %+v", err)
	}
	return nil
}

// PodMightNeedToUnprepareResources returns true if the pod might need to
// unprepare resources
func (m *ManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
	return m.cache.hasPodReference(UID)
}

// GetContainerClaimInfos gets a container's ClaimInfo objects.
func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error) {
	claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims))

	for i, podResourceClaim := range pod.Spec.ResourceClaims {
		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
		if err != nil {
			return nil, fmt.Errorf("determine resource claim information: %v", err)
		}

		// The claim name might be nil if no underlying resource claim
		// was generated for the referenced claim. There are valid use
		// cases when this might happen, so we simply skip it.
		if claimName == nil {
			continue
		}

		for _, claim := range container.Resources.Claims {
			if podResourceClaim.Name != claim.Name {
				continue
			}
			claimInfo := m.cache.get(*claimName, pod.Namespace)
			if claimInfo == nil {
				return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName)
			}
			claimInfos = append(claimInfos, claimInfo)
		}
	}
	return claimInfos, nil
}