k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/dra/deploy.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package dra
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"net"
    25  	"path"
    26  	"sort"
    27  	"strings"
    28  	"sync"
    29  	"time"
    30  
    31  	"github.com/google/go-cmp/cmp"
    32  	"github.com/onsi/ginkgo/v2"
    33  	"github.com/onsi/gomega"
    34  	"github.com/onsi/gomega/format"
    35  	"google.golang.org/grpc"
    36  
    37  	appsv1 "k8s.io/api/apps/v1"
    38  	v1 "k8s.io/api/core/v1"
    39  	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
    40  	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
    41  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    42  	"k8s.io/apimachinery/pkg/labels"
    43  	"k8s.io/apimachinery/pkg/selection"
    44  	resourceapiinformer "k8s.io/client-go/informers/resource/v1alpha2"
    45  	"k8s.io/client-go/tools/cache"
    46  	"k8s.io/dynamic-resource-allocation/kubeletplugin"
    47  	"k8s.io/klog/v2"
    48  	"k8s.io/kubernetes/test/e2e/dra/test-driver/app"
    49  	"k8s.io/kubernetes/test/e2e/framework"
    50  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    51  	e2ereplicaset "k8s.io/kubernetes/test/e2e/framework/replicaset"
    52  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    53  	"k8s.io/kubernetes/test/e2e/storage/drivers/proxy"
    54  	"k8s.io/kubernetes/test/e2e/storage/utils"
    55  )
    56  
// Full gRPC method names as they appear in the FullMethod field of the
// server info passed to the interceptors. They are meant to be used as
// MethodInstance.FullMethod when injecting errors with Driver.Fail or
// when checking call counts with Driver.CallCount.
const (
	NodePrepareResourcesMethod      = "/v1alpha3.Node/NodePrepareResources"
	NodeUnprepareResourcesMethod    = "/v1alpha3.Node/NodeUnprepareResources"
	NodeListAndWatchResourcesMethod = "/v1alpha3.Node/NodeListAndWatchResources"
)
    62  
// Nodes holds the names of the nodes which were selected for a test.
// NodeNames gets (re)populated before each test by the BeforeEach
// callback that NewNodes registers.
type Nodes struct {
	NodeNames []string
}
    66  
    67  // NewNodes selects nodes to run the test on.
    68  func NewNodes(f *framework.Framework, minNodes, maxNodes int) *Nodes {
    69  	nodes := &Nodes{}
    70  	ginkgo.BeforeEach(func(ctx context.Context) {
    71  		ginkgo.By("selecting nodes")
    72  		// The kubelet plugin is harder. We deploy the builtin manifest
    73  		// after patching in the driver name and all nodes on which we
    74  		// want the plugin to run.
    75  		//
    76  		// Only a subset of the nodes are picked to avoid causing
    77  		// unnecessary load on a big cluster.
    78  		nodeList, err := e2enode.GetBoundedReadySchedulableNodes(ctx, f.ClientSet, maxNodes)
    79  		framework.ExpectNoError(err, "get nodes")
    80  		numNodes := int32(len(nodeList.Items))
    81  		if int(numNodes) < minNodes {
    82  			e2eskipper.Skipf("%d ready nodes required, only have %d", minNodes, numNodes)
    83  		}
    84  		nodes.NodeNames = nil
    85  		for _, node := range nodeList.Items {
    86  			nodes.NodeNames = append(nodes.NodeNames, node.Name)
    87  		}
    88  		framework.Logf("testing on nodes %v", nodes.NodeNames)
    89  
    90  		// Watch claims in the namespace. This is useful for monitoring a test
    91  		// and enables additional sanity checks.
    92  		claimInformer := resourceapiinformer.NewResourceClaimInformer(f.ClientSet, f.Namespace.Name, 100*time.Hour /* resync */, nil)
    93  		cancelCtx, cancel := context.WithCancelCause(context.Background())
    94  		var wg sync.WaitGroup
    95  		ginkgo.DeferCleanup(func() {
    96  			cancel(errors.New("test has completed"))
    97  			wg.Wait()
    98  		})
    99  		_, err = claimInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
   100  			AddFunc: func(obj any) {
   101  				defer ginkgo.GinkgoRecover()
   102  				claim := obj.(*resourcev1alpha2.ResourceClaim)
   103  				framework.Logf("New claim:\n%s", format.Object(claim, 1))
   104  				validateClaim(claim)
   105  			},
   106  			UpdateFunc: func(oldObj, newObj any) {
   107  				defer ginkgo.GinkgoRecover()
   108  				oldClaim := oldObj.(*resourcev1alpha2.ResourceClaim)
   109  				newClaim := newObj.(*resourcev1alpha2.ResourceClaim)
   110  				framework.Logf("Updated claim:\n%s\nDiff:\n%s", format.Object(newClaim, 1), cmp.Diff(oldClaim, newClaim))
   111  				validateClaim(newClaim)
   112  			},
   113  			DeleteFunc: func(obj any) {
   114  				defer ginkgo.GinkgoRecover()
   115  				claim := obj.(*resourcev1alpha2.ResourceClaim)
   116  				framework.Logf("Deleted claim:\n%s", format.Object(claim, 1))
   117  			},
   118  		})
   119  		framework.ExpectNoError(err, "AddEventHandler")
   120  		wg.Add(1)
   121  		go func() {
   122  			defer wg.Done()
   123  			claimInformer.Run(cancelCtx.Done())
   124  		}()
   125  	})
   126  	return nodes
   127  }
   128  
   129  func validateClaim(claim *resourcev1alpha2.ResourceClaim) {
   130  	// The apiserver doesn't enforce that a claim always has a finalizer
   131  	// while being allocated. This is a convention that whoever allocates a
   132  	// claim has to follow to prevent using a claim that is at risk of
   133  	// being deleted.
   134  	if claim.Status.Allocation != nil && len(claim.Finalizers) == 0 {
   135  		framework.Failf("Invalid claim: allocated without any finalizer:\n%s", format.Object(claim, 1))
   136  	}
   137  }
   138  
   139  // NewDriver sets up controller (as client of the cluster) and
   140  // kubelet plugin (via proxy) before the test runs. It cleans
   141  // up after the test.
   142  func NewDriver(f *framework.Framework, nodes *Nodes, configureResources func() app.Resources) *Driver {
   143  	d := &Driver{
   144  		f:            f,
   145  		fail:         map[MethodInstance]bool{},
   146  		callCounts:   map[MethodInstance]int64{},
   147  		NodeV1alpha3: true,
   148  	}
   149  
   150  	ginkgo.BeforeEach(func() {
   151  		resources := configureResources()
   152  		if len(resources.Nodes) == 0 {
   153  			// This always has to be set because the driver might
   154  			// not run on all nodes.
   155  			resources.Nodes = nodes.NodeNames
   156  		}
   157  		ginkgo.DeferCleanup(d.IsGone) // Register first so it gets called last.
   158  		d.SetUp(nodes, resources)
   159  		ginkgo.DeferCleanup(d.TearDown)
   160  	})
   161  	return d
   162  }
   163  
// MethodInstance identifies one gRPC method of the plugin instance for
// one particular node. It is the key for error injection (Driver.Fail)
// and for the per-method call counters (Driver.CallCount).
type MethodInstance struct {
	// Nodename is the name of the node for which the plugin was started.
	Nodename string
	// FullMethod is the full gRPC method name, for example
	// NodePrepareResourcesMethod.
	FullMethod string
}
   168  
// Driver manages one instance of the DRA test driver: an optional
// control plane controller plus one kubelet plugin per node.
// Instances are created with NewDriver, deployed with SetUp and
// removed with TearDown.
type Driver struct {
	f *framework.Framework
	// ctx is created by SetUp and canceled by TearDown (through the
	// first cleanup callback).
	ctx     context.Context
	cleanup []func() // executed first-in-first-out
	// wg tracks the controller goroutine started by SetUp.
	wg sync.WaitGroup

	// NameSuffix, if non-empty, gets appended to the driver name, the
	// ReplicaSet name and the logger name. It has to be set before SetUp.
	NameSuffix string
	// Controller is only set by SetUp when ConfigMap parameters are used.
	Controller *app.ExampleController
	// Name is the driver name, computed by SetUp.
	Name string
	// Nodes maps node names to the plugin instance started for that node.
	Nodes map[string]*app.ExamplePlugin

	// parameterMode selects how parameters are provided. The empty
	// value is replaced with parameterModeConfigMap by SetUp. The other
	// parameter* fields are derived from it in SetUp.
	parameterMode         parameterMode
	parameterAPIGroup     string
	parameterAPIVersion   string
	claimParameterAPIKind string
	classParameterAPIKind string

	// NodeV1alpha3 is passed to kubeletplugin.NodeV1alpha3 when starting
	// the plugins. NewDriver sets it to true.
	NodeV1alpha3 bool

	// mutex protects fail and callCounts.
	mutex      sync.Mutex
	fail       map[MethodInstance]bool
	callCounts map[MethodInstance]int64
}
   192  
// parameterMode determines which kind of parameter objects the test
// driver works with and, as a consequence, whether SetUp starts the
// control plane controller.
type parameterMode string

// Supported parameter modes. An empty parameterMode is treated like
// parameterModeConfigMap by SetUp.
const (
	parameterModeConfigMap  parameterMode = "configmap"  // ConfigMap parameters, control plane controller.
	parameterModeStructured parameterMode = "structured" // No ConfigMaps, directly create and reference in-tree parameter objects.
	parameterModeTranslated parameterMode = "translated" // Reference ConfigMaps in claim and class, generate in-tree parameter objects.
)
   200  
// SetUp deploys the driver for the given nodes and blocks until the
// kubelet plugin has been registered on each of them.
//
// It computes the driver name, starts the control plane controller when
// ConfigMap parameters are used, creates the proxy ReplicaSet from the
// test manifest and then starts one kubelet plugin per proxy pod inside
// the test process. Everything that needs to be stopped again is added to
// d.cleanup, which is executed by TearDown. Errors fail the current test.
func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) {
	ginkgo.By(fmt.Sprintf("deploying driver on nodes %v", nodes.NodeNames))
	d.Nodes = map[string]*app.ExamplePlugin{}
	d.Name = d.f.UniqueName + d.NameSuffix + ".k8s.io"
	resources.DriverName = d.Name

	// The context is canceled by TearDown. cancel is the first cleanup
	// entry, which the plugin cleanup below depends on.
	ctx, cancel := context.WithCancel(context.Background())
	if d.NameSuffix != "" {
		logger := klog.FromContext(ctx)
		logger = klog.LoggerWithName(logger, "instance"+d.NameSuffix)
		ctx = klog.NewContext(ctx, logger)
	}
	d.ctx = ctx
	d.cleanup = append(d.cleanup, cancel)

	// The controller is only needed for ConfigMap parameters. The empty
	// mode is checked here explicitly because the default is only
	// applied further down.
	switch d.parameterMode {
	case "", parameterModeConfigMap:
		// The controller is easy: we simply connect to the API server.
		d.Controller = app.NewController(d.f.ClientSet, resources)
		d.wg.Add(1)
		go func() {
			defer d.wg.Done()
			d.Controller.Run(d.ctx, 5 /* workers */)
		}()
	}

	manifests := []string{
		// The code below matches the content of this manifest (ports,
		// container names, etc.).
		"test/e2e/testing-manifests/dra/dra-test-driver-proxy.yaml",
	}
	if d.parameterMode == "" {
		d.parameterMode = parameterModeConfigMap
	}
	// Resource instances are only passed to the plugin when not using
	// ConfigMap parameters.
	var numResourceInstances = -1 // disabled
	if d.parameterMode != parameterModeConfigMap {
		numResourceInstances = resources.MaxAllocations
	}
	// Determine the API group, version and kinds of the parameter objects.
	switch d.parameterMode {
	case parameterModeConfigMap, parameterModeTranslated:
		d.parameterAPIGroup = ""
		d.parameterAPIVersion = "v1"
		d.claimParameterAPIKind = "ConfigMap"
		d.classParameterAPIKind = "ConfigMap"
	case parameterModeStructured:
		d.parameterAPIGroup = "resource.k8s.io"
		d.parameterAPIVersion = "v1alpha2"
		d.claimParameterAPIKind = "ResourceClaimParameters"
		d.classParameterAPIKind = "ResourceClassParameters"
	default:
		framework.Failf("unknown test driver parameter mode: %s", d.parameterMode)
	}

	instanceKey := "app.kubernetes.io/instance"
	rsName := ""
	draAddr := path.Join(framework.TestContext.KubeletRootDir, "plugins", d.Name+".sock")
	numNodes := int32(len(nodes.NodeNames))
	// Create the objects from the manifest. The callback patches each
	// object before it gets created.
	err := utils.CreateFromManifests(ctx, d.f, d.f.Namespace, func(item interface{}) error {
		switch item := item.(type) {
		case *appsv1.ReplicaSet:
			// One replica per selected node. The instance label makes the
			// selector and the pod anti-affinity specific to this driver
			// instance, the node affinity restricts the pods to the
			// selected nodes.
			item.Name += d.NameSuffix
			rsName = item.Name
			item.Spec.Replicas = &numNodes
			item.Spec.Selector.MatchLabels[instanceKey] = d.Name
			item.Spec.Template.Labels[instanceKey] = d.Name
			item.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution[0].LabelSelector.MatchLabels[instanceKey] = d.Name
			item.Spec.Template.Spec.Affinity.NodeAffinity = &v1.NodeAffinity{
				RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
					NodeSelectorTerms: []v1.NodeSelectorTerm{
						{
							MatchExpressions: []v1.NodeSelectorRequirement{
								{
									Key:      "kubernetes.io/hostname",
									Operator: v1.NodeSelectorOpIn,
									Values:   nodes.NodeNames,
								},
							},
						},
					},
				},
			}
			// The volume and container indices depend on the order of
			// the entries in the manifest.
			item.Spec.Template.Spec.Volumes[0].HostPath.Path = path.Join(framework.TestContext.KubeletRootDir, "plugins")
			item.Spec.Template.Spec.Volumes[2].HostPath.Path = path.Join(framework.TestContext.KubeletRootDir, "plugins_registry")
			item.Spec.Template.Spec.Containers[0].Args = append(item.Spec.Template.Spec.Containers[0].Args, "--endpoint=/plugins_registry/"+d.Name+"-reg.sock")
			item.Spec.Template.Spec.Containers[1].Args = append(item.Spec.Template.Spec.Containers[1].Args, "--endpoint=/dra/"+d.Name+".sock")
		case *apiextensionsv1.CustomResourceDefinition:
			item.Name = strings.ReplaceAll(item.Name, "dra.e2e.example.com", d.parameterAPIGroup)
			item.Spec.Group = d.parameterAPIGroup

		}
		return nil
	}, manifests...)
	framework.ExpectNoError(err, "deploy kubelet plugin replicaset")

	rs, err := d.f.ClientSet.AppsV1().ReplicaSets(d.f.Namespace.Name).Get(ctx, rsName, metav1.GetOptions{})
	framework.ExpectNoError(err, "get replicaset")

	// Wait for all pods to be running.
	if err := e2ereplicaset.WaitForReplicaSetTargetAvailableReplicas(ctx, d.f.ClientSet, rs, numNodes); err != nil {
		framework.ExpectNoError(err, "all kubelet plugin proxies running")
	}
	// Find the proxy pods of this driver instance via the instance label.
	requirement, err := labels.NewRequirement(instanceKey, selection.Equals, []string{d.Name})
	framework.ExpectNoError(err, "create label selector requirement")
	selector := labels.NewSelector().Add(*requirement)
	pods, err := d.f.ClientSet.CoreV1().Pods(d.f.Namespace.Name).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
	framework.ExpectNoError(err, "list proxy pods")
	gomega.Expect(numNodes).To(gomega.Equal(int32(len(pods.Items))), "number of proxy pods")

	// Run registrar and plugin for each of the pods.
	for _, pod := range pods.Items {
		// Need a local variable, not the loop variable, for the anonymous
		// callback functions below.
		pod := pod
		nodename := pod.Spec.NodeName
		logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", pod.Spec.NodeName, "pod", klog.KObj(&pod))
		loggerCtx := klog.NewContext(ctx, logger)
		// The plugin runs in this process. CDI files are created and
		// removed inside the proxy pod, gRPC connections are accepted
		// through listeners provided by the proxy package.
		plugin, err := app.StartPlugin(loggerCtx, "/cdi", d.Name, nodename,
			app.FileOperations{
				Create: func(name string, content []byte) error {
					klog.Background().Info("creating CDI file", "node", nodename, "filename", name, "content", string(content))
					return d.createFile(&pod, name, content)
				},
				Remove: func(name string) error {
					klog.Background().Info("deleting CDI file", "node", nodename, "filename", name)
					return d.removeFile(&pod, name)
				},
				NumResourceInstances: numResourceInstances,
			},
			kubeletplugin.GRPCVerbosity(0),
			// The interceptors count calls and inject errors per node.
			kubeletplugin.GRPCInterceptor(func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
				return d.interceptor(nodename, ctx, req, info, handler)
			}),
			kubeletplugin.GRPCStreamInterceptor(func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) (err error) {
				return d.streamInterceptor(nodename, srv, ss, info, handler)
			}),
			kubeletplugin.PluginListener(listen(ctx, d.f, pod.Name, "plugin", 9001)),
			kubeletplugin.RegistrarListener(listen(ctx, d.f, pod.Name, "registrar", 9000)),
			kubeletplugin.KubeletPluginSocketPath(draAddr),
			kubeletplugin.NodeV1alpha3(d.NodeV1alpha3),
		)
		framework.ExpectNoError(err, "start kubelet plugin for node %s", pod.Spec.NodeName)
		d.cleanup = append(d.cleanup, func() {
			// Depends on cancel being called first.
			plugin.Stop()
		})
		d.Nodes[nodename] = plugin
	}

	// Wait for registration.
	ginkgo.By("wait for plugin registration")
	// The polled function returns the recorded gRPC calls of all plugins
	// which are not registered yet, so that they show up in the failure
	// message when the timeout is reached.
	gomega.Eventually(func() map[string][]app.GRPCCall {
		notRegistered := make(map[string][]app.GRPCCall)
		for nodename, plugin := range d.Nodes {
			calls := plugin.GetGRPCCalls()
			if contains, err := app.BeRegistered.Match(calls); err != nil || !contains {
				notRegistered[nodename] = calls
			}
		}
		return notRegistered
	}).WithTimeout(time.Minute).Should(gomega.BeEmpty(), "hosts where the plugin has not been registered yet")
}
   362  
   363  func (d *Driver) createFile(pod *v1.Pod, name string, content []byte) error {
   364  	buffer := bytes.NewBuffer(content)
   365  	// Writing the content can be slow. Better create a temporary file and
   366  	// move it to the final destination once it is complete.
   367  	tmpName := name + ".tmp"
   368  	if err := d.podIO(pod).CreateFile(tmpName, buffer); err != nil {
   369  		_ = d.podIO(pod).RemoveAll(tmpName)
   370  		return err
   371  	}
   372  	return d.podIO(pod).Rename(tmpName, name)
   373  }
   374  
   375  func (d *Driver) removeFile(pod *v1.Pod, name string) error {
   376  	return d.podIO(pod).RemoveAll(name)
   377  }
   378  
   379  func (d *Driver) podIO(pod *v1.Pod) proxy.PodDirIO {
   380  	logger := klog.Background()
   381  	return proxy.PodDirIO{
   382  		F:             d.f,
   383  		Namespace:     pod.Namespace,
   384  		PodName:       pod.Name,
   385  		ContainerName: "plugin",
   386  		Logger:        &logger,
   387  	}
   388  }
   389  
   390  func listen(ctx context.Context, f *framework.Framework, podName, containerName string, port int) net.Listener {
   391  	addr := proxy.Addr{
   392  		Namespace:     f.Namespace.Name,
   393  		PodName:       podName,
   394  		ContainerName: containerName,
   395  		Port:          port,
   396  	}
   397  	listener, err := proxy.Listen(ctx, f.ClientSet, f.ClientConfig(), addr)
   398  	framework.ExpectNoError(err, "listen for connections from %+v", addr)
   399  	return listener
   400  }
   401  
   402  func (d *Driver) TearDown() {
   403  	for _, c := range d.cleanup {
   404  		c()
   405  	}
   406  	d.cleanup = nil
   407  	d.wg.Wait()
   408  }
   409  
   410  func (d *Driver) IsGone(ctx context.Context) {
   411  	gomega.Eventually(ctx, func(ctx context.Context) ([]resourcev1alpha2.ResourceSlice, error) {
   412  		slices, err := d.f.ClientSet.ResourceV1alpha2().ResourceSlices().List(ctx, metav1.ListOptions{FieldSelector: "driverName=" + d.Name})
   413  		if err != nil {
   414  			return nil, err
   415  		}
   416  		return slices.Items, err
   417  	}).Should(gomega.BeEmpty())
   418  }
   419  
   420  func (d *Driver) interceptor(nodename string, ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
   421  	d.mutex.Lock()
   422  	defer d.mutex.Unlock()
   423  
   424  	m := MethodInstance{nodename, info.FullMethod}
   425  	d.callCounts[m]++
   426  	if d.fail[m] {
   427  		return nil, errors.New("injected error")
   428  	}
   429  
   430  	return handler(ctx, req)
   431  }
   432  
   433  func (d *Driver) streamInterceptor(nodename string, srv interface{}, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
   434  	// Stream calls block for a long time. We must not hold the lock while
   435  	// they are running.
   436  	d.mutex.Lock()
   437  	m := MethodInstance{nodename, info.FullMethod}
   438  	d.callCounts[m]++
   439  	fail := d.fail[m]
   440  	d.mutex.Unlock()
   441  
   442  	if fail {
   443  		return errors.New("injected error")
   444  	}
   445  
   446  	return handler(srv, stream)
   447  }
   448  
   449  func (d *Driver) Fail(m MethodInstance, injectError bool) {
   450  	d.mutex.Lock()
   451  	defer d.mutex.Unlock()
   452  
   453  	d.fail[m] = injectError
   454  }
   455  
   456  func (d *Driver) CallCount(m MethodInstance) int64 {
   457  	d.mutex.Lock()
   458  	defer d.mutex.Unlock()
   459  
   460  	return d.callCounts[m]
   461  }
   462  
   463  func (d *Driver) Nodenames() (nodenames []string) {
   464  	for nodename := range d.Nodes {
   465  		nodenames = append(nodenames, nodename)
   466  	}
   467  	sort.Strings(nodenames)
   468  	return
   469  }