k8s.io/kubernetes@v1.29.3/test/e2e_node/dra_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
E2E Node test for DRA (Dynamic Resource Allocation).
This test covers node-specific aspects of DRA.
The test can be run locally on Linux this way:
  make test-e2e-node FOCUS='\[NodeAlphaFeature:DynamicResourceAllocation\]' SKIP='\[Flaky\]' PARALLELISM=1 \
       TEST_ARGS='--feature-gates="DynamicResourceAllocation=true" --service-feature-gates="DynamicResourceAllocation=true" --runtime-config=api/all=true'
*/

package e2enode

import (
	"context"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"

	"k8s.io/dynamic-resource-allocation/kubeletplugin"
	testdriver "k8s.io/kubernetes/test/e2e/dra/test-driver/app"
)

const (
	driverName                = "test-driver.cdi.k8s.io"
	cdiDir                    = "/var/run/cdi"
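	// endpoint is the socket on which the test kubelet plugin serves the DRA gRPC service.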
	endpoint                  = "/var/lib/kubelet/plugins/test-driver/dra.sock"
	pluginRegistrationPath    = "/var/lib/kubelet/plugins_registry"
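	// draAddress is the same socket path; it is the address the plugin announces to the kubelet during registration.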
	draAddress                = "/var/lib/kubelet/plugins/test-driver/dra.sock"
	pluginRegistrationTimeout = time.Second * 60 // how long to wait for a node plugin to be registered
	podInPendingStateTimeout  = time.Second * 60 // how long to wait for a pod to stay in pending state
)

var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, "[NodeAlphaFeature:DynamicResourceAllocation]", func() {
	f := framework.NewDefaultFramework("dra-node")
	f.NamespacePodSecurityLevel = admissionapi.LevelBaseline

	var kubeletPlugin *testdriver.ExamplePlugin

	f.Context("Resource Kubelet Plugin", f.WithSerial(), func() {
		ginkgo.BeforeEach(func(ctx context.Context) {
			kubeletPlugin = newKubeletPlugin(getNodeName(ctx, f))
		})

		ginkgo.It("must register after Kubelet restart", func(ctx context.Context) {
			oldCalls := kubeletPlugin.GetGRPCCalls()
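			// getNewCalls ignores the calls recorded before the restart, so that
			// only the re-registration triggered below is checked.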
			getNewCalls := func() []testdriver.GRPCCall {
				calls := kubeletPlugin.GetGRPCCalls()
				return calls[len(oldCalls):]
			}

			ginkgo.By("restarting Kubelet")
			restartKubelet(true)

			ginkgo.By("wait for Kubelet plugin re-registration")
			gomega.Eventually(getNewCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)
		})

		ginkgo.It("must register after plugin restart", func(ctx context.Context) {
			ginkgo.By("restarting the Kubelet plugin")
			kubeletPlugin.Stop()
			kubeletPlugin = newKubeletPlugin(getNodeName(ctx, f))

			ginkgo.By("wait for Kubelet plugin re-registration")
			gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)
		})

		ginkgo.It("must process pod created when kubelet is not running", func(ctx context.Context) {
			// Stop Kubelet
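			// stopKubelet returns a function that can be called later to start it again.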
			startKubelet := stopKubelet()
			pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod")
			// Pod must be in pending state
			err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) {
				return pod.Status.Phase == v1.PodPending, nil
			})
			framework.ExpectNoError(err)
			// Start Kubelet
			startKubelet()
			// Pod should succeed
			err = e2epod.WaitForPodSuccessInNamespaceTimeout(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartShortTimeout)
			framework.ExpectNoError(err)
		})

		ginkgo.It("must keep pod in pending state if NodePrepareResources times out", func(ctx context.Context) {
			ginkgo.By("blocking the NodePrepareResources call")
			kubeletPlugin.Block()
			pod := createTestObjects(ctx, f.ClientSet, getNodeName(ctx, f), f.Namespace.Name, "draclass", "external-claim", "drapod")

			ginkgo.By("wait for pod to be in Pending state")
			err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Pending", framework.PodStartShortTimeout, func(pod *v1.Pod) (bool, error) {
				return pod.Status.Phase == v1.PodPending, nil
			})
			framework.ExpectNoError(err)

			ginkgo.By("wait for NodePrepareResources call")
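			// The call is blocked in the plugin, so allow up to twice the kubelet's
			// per-call plugin timeout for it to be recorded.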
			gomega.Eventually(kubeletPlugin.GetGRPCCalls).WithTimeout(dra.PluginClientTimeout * 2).Should(testdriver.NodePrepareResourcesCalled)

			// TODO: Check condition or event when implemented
			// see https://github.com/kubernetes/kubernetes/issues/118468 for details
			ginkgo.By("check that pod is consistently in Pending state")
			gomega.Consistently(ctx, e2epod.Get(f.ClientSet, pod)).WithTimeout(podInPendingStateTimeout).Should(e2epod.BeInPhase(v1.PodPending),
				"Pod should stay in Pending state because resource preparation timed out")
		})
	})
})

// newKubeletPlugin starts a Kubelet plugin and waits until it is registered.
func newKubeletPlugin(nodeName string) *testdriver.ExamplePlugin {
	ginkgo.By("start Kubelet plugin")
	logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", nodeName)

	// Ensure that directories exist, creating them if necessary. We want
	// to know early if there is a setup problem that would prevent
	// creating those directories.
	err := os.MkdirAll(cdiDir, os.FileMode(0750))
	framework.ExpectNoError(err, "create CDI directory")
	err = os.MkdirAll(filepath.Dir(endpoint), 0750)
	framework.ExpectNoError(err, "create socket directory")

	plugin, err := testdriver.StartPlugin(
		logger,
		cdiDir,
		driverName,
		"",
		testdriver.FileOperations{},
		kubeletplugin.PluginSocketPath(endpoint),
		kubeletplugin.RegistrarSocketPath(path.Join(pluginRegistrationPath, driverName+"-reg.sock")),
		kubeletplugin.KubeletPluginSocketPath(draAddress),
	)
	framework.ExpectNoError(err)

	gomega.Eventually(plugin.GetGRPCCalls).WithTimeout(pluginRegistrationTimeout).Should(testdriver.BeRegistered)

	ginkgo.DeferCleanup(plugin.Stop)

	return plugin
}

// createTestObjects creates the objects required by the test.
// NOTE: as the scheduler and the controller manager are not running in the Node e2e suite,
// the objects must contain all the data needed for the API server to accept them
// and for the pod to land on the node without involving the scheduler or the DRA controller.
func createTestObjects(ctx context.Context, clientSet kubernetes.Interface, nodename, namespace, className, claimName, podName string) *v1.Pod {
	// ResourceClass
	class := &resourcev1alpha2.ResourceClass{
		ObjectMeta: metav1.ObjectMeta{
			Name: className,
		},
		DriverName: driverName,
	}
	_, err := clientSet.ResourceV1alpha2().ResourceClasses().Create(ctx, class, metav1.CreateOptions{})
	framework.ExpectNoError(err)

	ginkgo.DeferCleanup(clientSet.ResourceV1alpha2().ResourceClasses().Delete, className, metav1.DeleteOptions{})

	// ResourceClaim
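	// podClaimName is the name under which the pod spec references the claim;
	// the ResourceClaim object itself is created as claimName.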
	podClaimName := "resource-claim"
	claim := &resourcev1alpha2.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{
			Name: claimName,
		},
		Spec: resourcev1alpha2.ResourceClaimSpec{
			ResourceClassName: className,
		},
	}
	createdClaim, err := clientSet.ResourceV1alpha2().ResourceClaims(namespace).Create(ctx, claim, metav1.CreateOptions{})
	framework.ExpectNoError(err)

	ginkgo.DeferCleanup(clientSet.ResourceV1alpha2().ResourceClaims(namespace).Delete, claimName, metav1.DeleteOptions{})

	// Pod
	containerName := "testcontainer"
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      podName,
			Namespace: namespace,
		},
		Spec: v1.PodSpec{
			NodeName: nodename, // Assign the node as the scheduler is not running
			ResourceClaims: []v1.PodResourceClaim{
				{
					Name: podClaimName,
					Source: v1.ClaimSource{
						ResourceClaimName: &claimName,
					},
				},
			},
			Containers: []v1.Container{
				{
					Name:  containerName,
					Image: e2epod.GetDefaultTestImage(),
					Resources: v1.ResourceRequirements{
						Claims: []v1.ResourceClaim{{Name: podClaimName}},
					},
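					// The container succeeds only if the driver injected the
					// expected variable into its environment (see the claim status below).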
					Command: []string{"/bin/sh", "-c", "env | grep DRA_PARAM1=PARAM1_VALUE"},
				},
			},
			RestartPolicy: v1.RestartPolicyNever,
		},
	}
	createdPod, err := clientSet.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
	framework.ExpectNoError(err)

	ginkgo.DeferCleanup(clientSet.CoreV1().Pods(namespace).Delete, podName, metav1.DeleteOptions{})

	// Update claim status: set ReservedFor and AllocationResult
	// NOTE: This is usually done by the DRA controller
	createdClaim.Status = resourcev1alpha2.ResourceClaimStatus{
		DriverName: driverName,
		ReservedFor: []resourcev1alpha2.ResourceClaimConsumerReference{
			{Resource: "pods", Name: podName, UID: createdPod.UID},
		},
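		// The resource handle data is passed verbatim to the plugin's
		// NodePrepareResources call; the test driver publishes the EnvVars
		// to the container via CDI.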
		Allocation: &resourcev1alpha2.AllocationResult{
			ResourceHandles: []resourcev1alpha2.ResourceHandle{
				{
					DriverName: driverName,
					Data:       `{"EnvVars":{"DRA_PARAM1":"PARAM1_VALUE"},"NodeName":""}`,
				},
			},
		},
	}
	_, err = clientSet.ResourceV1alpha2().ResourceClaims(namespace).UpdateStatus(ctx, createdClaim, metav1.UpdateOptions{})
	framework.ExpectNoError(err)

	// Return the object as created by the API server so that callers see populated metadata.
	return createdPod
}