// k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/dra/deploy.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dra

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"net"
	"path"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"github.com/onsi/gomega/format"
	"google.golang.org/grpc"

	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/selection"
	resourceapiinformer "k8s.io/client-go/informers/resource/v1alpha2"
	"k8s.io/client-go/tools/cache"
	"k8s.io/dynamic-resource-allocation/kubeletplugin"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/test/e2e/dra/test-driver/app"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2ereplicaset "k8s.io/kubernetes/test/e2e/framework/replicaset"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/storage/drivers/proxy"
	"k8s.io/kubernetes/test/e2e/storage/utils"
)

// Full gRPC method names of the kubelet plugin's v1alpha3 Node service,
// suitable as MethodInstance.FullMethod.
const (
	NodePrepareResourcesMethod      = "/v1alpha3.Node/NodePrepareResources"
	NodeUnprepareResourcesMethod    = "/v1alpha3.Node/NodeUnprepareResources"
	NodeListAndWatchResourcesMethod = "/v1alpha3.Node/NodeListAndWatchResources"
)

// Nodes holds the names of the nodes selected by NewNodes.
type Nodes struct {
	NodeNames []string
}

// NewNodes selects nodes to run the test on.
func NewNodes(f *framework.Framework, minNodes, maxNodes int) *Nodes {
	nodes := &Nodes{}
	ginkgo.BeforeEach(func(ctx context.Context) {
		ginkgo.By("selecting nodes")
		// The kubelet plugin is harder. We deploy the builtin manifest
		// after patching in the driver name and all nodes on which we
		// want the plugin to run.
		//
		// Only a subset of the nodes are picked to avoid causing
		// unnecessary load on a big cluster.
		nodeList, err := e2enode.GetBoundedReadySchedulableNodes(ctx, f.ClientSet, maxNodes)
		framework.ExpectNoError(err, "get nodes")
		numNodes := int32(len(nodeList.Items))
		if int(numNodes) < minNodes {
			e2eskipper.Skipf("%d ready nodes required, only have %d", minNodes, numNodes)
		}
		nodes.NodeNames = nil
		for _, node := range nodeList.Items {
			nodes.NodeNames = append(nodes.NodeNames, node.Name)
		}
		framework.Logf("testing on nodes %v", nodes.NodeNames)

		// Watch claims in the namespace. This is useful for monitoring a test
		// and enables additional sanity checks.
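		// The very long resync period effectively disables periodic
		// resyncs; the handlers below only need to see actual changes.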
		claimInformer := resourceapiinformer.NewResourceClaimInformer(f.ClientSet, f.Namespace.Name, 100*time.Hour /* resync */, nil)
		cancelCtx, cancel := context.WithCancelCause(context.Background())
		var wg sync.WaitGroup
		ginkgo.DeferCleanup(func() {
			cancel(errors.New("test has completed"))
			wg.Wait()
		})
		_, err = claimInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
			AddFunc: func(obj any) {
				defer ginkgo.GinkgoRecover()
				claim := obj.(*resourcev1alpha2.ResourceClaim)
				framework.Logf("New claim:\n%s", format.Object(claim, 1))
				validateClaim(claim)
			},
			UpdateFunc: func(oldObj, newObj any) {
				defer ginkgo.GinkgoRecover()
				oldClaim := oldObj.(*resourcev1alpha2.ResourceClaim)
				newClaim := newObj.(*resourcev1alpha2.ResourceClaim)
				framework.Logf("Updated claim:\n%s\nDiff:\n%s", format.Object(newClaim, 1), cmp.Diff(oldClaim, newClaim))
				validateClaim(newClaim)
			},
			DeleteFunc: func(obj any) {
				defer ginkgo.GinkgoRecover()
				claim := obj.(*resourcev1alpha2.ResourceClaim)
				framework.Logf("Deleted claim:\n%s", format.Object(claim, 1))
			},
		})
		framework.ExpectNoError(err, "AddEventHandler")
		wg.Add(1)
		go func() {
			defer wg.Done()
			claimInformer.Run(cancelCtx.Done())
		}()
	})
	return nodes
}

func validateClaim(claim *resourcev1alpha2.ResourceClaim) {
	// The apiserver doesn't enforce that a claim always has a finalizer
	// while being allocated. This is a convention that whoever allocates a
	// claim has to follow to prevent using a claim that is at risk of
	// being deleted.
	if claim.Status.Allocation != nil && len(claim.Finalizers) == 0 {
		framework.Failf("Invalid claim: allocated without any finalizer:\n%s", format.Object(claim, 1))
	}
}

// NewDriver sets up a controller (as a client of the cluster) and a
// kubelet plugin (via proxy) before the test runs. It cleans up
// after the test.
func NewDriver(f *framework.Framework, nodes *Nodes, configureResources func() app.Resources) *Driver {
	d := &Driver{
		f:            f,
		fail:         map[MethodInstance]bool{},
		callCounts:   map[MethodInstance]int64{},
		NodeV1alpha3: true,
	}

	ginkgo.BeforeEach(func() {
		resources := configureResources()
		if len(resources.Nodes) == 0 {
			// This always has to be set because the driver might
			// not run on all nodes.
			resources.Nodes = nodes.NodeNames
		}
		ginkgo.DeferCleanup(d.IsGone) // Register first so it gets called last.
		d.SetUp(nodes, resources)
		ginkgo.DeferCleanup(d.TearDown)
	})
	return d
}

// MethodInstance identifies one gRPC method on one node's plugin instance.
type MethodInstance struct {
	Nodename   string
	FullMethod string
}

// Driver is the combination of a control plane controller (optional,
// depending on the parameter mode) and one kubelet plugin per selected
// node, with helpers for error injection and call counting.
type Driver struct {
	f       *framework.Framework
	ctx     context.Context
	cleanup []func() // executed first-in-first-out
	wg      sync.WaitGroup

	NameSuffix string
	Controller *app.ExampleController
	Name       string
	Nodes      map[string]*app.ExamplePlugin

	parameterMode         parameterMode
	parameterAPIGroup     string
	parameterAPIVersion   string
	claimParameterAPIKind string
	classParameterAPIKind string

	NodeV1alpha3 bool

	mutex      sync.Mutex
	fail       map[MethodInstance]bool
	callCounts map[MethodInstance]int64
}

type parameterMode string

const (
	parameterModeConfigMap parameterMode = "configmap" // ConfigMap parameters, control plane controller.
	parameterModeStructured parameterMode = "structured" // No ConfigMaps, directly create and reference in-tree parameter objects.
	parameterModeTranslated parameterMode = "translated" // Reference ConfigMaps in claim and class, generate in-tree parameter objects.
)

// SetUp deploys the test driver: the control plane controller (in ConfigMap
// mode), one proxy pod per selected node, and one kubelet plugin instance
// per proxy pod. It waits until all plugins are registered with the kubelet.
func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) {
	ginkgo.By(fmt.Sprintf("deploying driver on nodes %v", nodes.NodeNames))
	d.Nodes = map[string]*app.ExamplePlugin{}
	d.Name = d.f.UniqueName + d.NameSuffix + ".k8s.io"
	resources.DriverName = d.Name

	ctx, cancel := context.WithCancel(context.Background())
	if d.NameSuffix != "" {
		logger := klog.FromContext(ctx)
		logger = klog.LoggerWithName(logger, "instance"+d.NameSuffix)
		ctx = klog.NewContext(ctx, logger)
	}
	d.ctx = ctx
	d.cleanup = append(d.cleanup, cancel)

	switch d.parameterMode {
	case "", parameterModeConfigMap:
		// The controller is easy: we simply connect to the API server.
		d.Controller = app.NewController(d.f.ClientSet, resources)
		d.wg.Add(1)
		go func() {
			defer d.wg.Done()
			d.Controller.Run(d.ctx, 5 /* workers */)
		}()
	}

	manifests := []string{
		// The code below matches the content of this manifest (ports,
		// container names, etc.).
		"test/e2e/testing-manifests/dra/dra-test-driver-proxy.yaml",
	}
	if d.parameterMode == "" {
		d.parameterMode = parameterModeConfigMap
	}
	var numResourceInstances = -1 // disabled
	if d.parameterMode != parameterModeConfigMap {
		numResourceInstances = resources.MaxAllocations
	}
	switch d.parameterMode {
	case parameterModeConfigMap, parameterModeTranslated:
		d.parameterAPIGroup = ""
		d.parameterAPIVersion = "v1"
		d.claimParameterAPIKind = "ConfigMap"
		d.classParameterAPIKind = "ConfigMap"
	case parameterModeStructured:
		d.parameterAPIGroup = "resource.k8s.io"
		d.parameterAPIVersion = "v1alpha2"
		d.claimParameterAPIKind = "ResourceClaimParameters"
		d.classParameterAPIKind = "ResourceClassParameters"
	default:
		framework.Failf("unknown test driver parameter mode: %s", d.parameterMode)
	}

	instanceKey := "app.kubernetes.io/instance"
	rsName := ""
	draAddr := path.Join(framework.TestContext.KubeletRootDir, "plugins", d.Name+".sock")
	numNodes := int32(len(nodes.NodeNames))
	err := utils.CreateFromManifests(ctx, d.f, d.f.Namespace, func(item interface{}) error {
		switch item := item.(type) {
		case *appsv1.ReplicaSet:
			item.Name += d.NameSuffix
			rsName = item.Name
			item.Spec.Replicas = &numNodes
			item.Spec.Selector.MatchLabels[instanceKey] = d.Name
			item.Spec.Template.Labels[instanceKey] = d.Name
			item.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution[0].LabelSelector.MatchLabels[instanceKey] = d.Name
			item.Spec.Template.Spec.Affinity.NodeAffinity = &v1.NodeAffinity{
				RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
					NodeSelectorTerms: []v1.NodeSelectorTerm{
						{
							MatchExpressions: []v1.NodeSelectorRequirement{
								{
									Key:      "kubernetes.io/hostname",
									Operator: v1.NodeSelectorOpIn,
									Values:   nodes.NodeNames,
								},
							},
						},
					},
				},
			}
			item.Spec.Template.Spec.Volumes[0].HostPath.Path = path.Join(framework.TestContext.KubeletRootDir, "plugins")
			item.Spec.Template.Spec.Volumes[2].HostPath.Path = path.Join(framework.TestContext.KubeletRootDir, "plugins_registry")
			item.Spec.Template.Spec.Containers[0].Args = append(item.Spec.Template.Spec.Containers[0].Args, "--endpoint=/plugins_registry/"+d.Name+"-reg.sock")
			item.Spec.Template.Spec.Containers[1].Args = append(item.Spec.Template.Spec.Containers[1].Args, "--endpoint=/dra/"+d.Name+".sock")
		case *apiextensionsv1.CustomResourceDefinition:
			item.Name = strings.ReplaceAll(item.Name, "dra.e2e.example.com", d.parameterAPIGroup)
			item.Spec.Group = d.parameterAPIGroup
		}
		return nil
	}, manifests...)
	framework.ExpectNoError(err, "deploy kubelet plugin replicaset")

	rs, err := d.f.ClientSet.AppsV1().ReplicaSets(d.f.Namespace.Name).Get(ctx, rsName, metav1.GetOptions{})
	framework.ExpectNoError(err, "get replicaset")

	// Wait for all pods to be running.
	err = e2ereplicaset.WaitForReplicaSetTargetAvailableReplicas(ctx, d.f.ClientSet, rs, numNodes)
	framework.ExpectNoError(err, "all kubelet plugin proxies running")

	requirement, err := labels.NewRequirement(instanceKey, selection.Equals, []string{d.Name})
	framework.ExpectNoError(err, "create label selector requirement")
	selector := labels.NewSelector().Add(*requirement)
	pods, err := d.f.ClientSet.CoreV1().Pods(d.f.Namespace.Name).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
	framework.ExpectNoError(err, "list proxy pods")
	gomega.Expect(pods.Items).To(gomega.HaveLen(int(numNodes)), "number of proxy pods")

	// Run a registrar and a plugin for each of the pods.
	for _, pod := range pods.Items {
		// Need a local variable, not the loop variable, for the anonymous
		// callback functions below.
		pod := pod
		nodename := pod.Spec.NodeName
		logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", pod.Spec.NodeName, "pod", klog.KObj(&pod))
		loggerCtx := klog.NewContext(ctx, logger)
		plugin, err := app.StartPlugin(loggerCtx, "/cdi", d.Name, nodename,
			app.FileOperations{
				Create: func(name string, content []byte) error {
					klog.Background().Info("creating CDI file", "node", nodename, "filename", name, "content", string(content))
					return d.createFile(&pod, name, content)
				},
				Remove: func(name string) error {
					klog.Background().Info("deleting CDI file", "node", nodename, "filename", name)
					return d.removeFile(&pod, name)
				},
				NumResourceInstances: numResourceInstances,
			},
			kubeletplugin.GRPCVerbosity(0),
			kubeletplugin.GRPCInterceptor(func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
				return d.interceptor(nodename, ctx, req, info, handler)
			}),
			kubeletplugin.GRPCStreamInterceptor(func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) (err error) {
				return d.streamInterceptor(nodename, srv, ss, info, handler)
			}),
			kubeletplugin.PluginListener(listen(ctx, d.f, pod.Name, "plugin", 9001)),
			kubeletplugin.RegistrarListener(listen(ctx, d.f, pod.Name, "registrar", 9000)),
			kubeletplugin.KubeletPluginSocketPath(draAddr),
			kubeletplugin.NodeV1alpha3(d.NodeV1alpha3),
		)
		framework.ExpectNoError(err, "start kubelet plugin for node %s", pod.Spec.NodeName)
		d.cleanup = append(d.cleanup, func() {
			// Depends on cancel being called first.
			plugin.Stop()
		})
		d.Nodes[nodename] = plugin
	}

	// Wait for registration.
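	// Each plugin instance records the gRPC calls it receives. The
	// app.BeRegistered matcher succeeds once the kubelet has completed
	// the registration handshake with a plugin's registrar.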
	ginkgo.By("wait for plugin registration")
	gomega.Eventually(func() map[string][]app.GRPCCall {
		notRegistered := make(map[string][]app.GRPCCall)
		for nodename, plugin := range d.Nodes {
			calls := plugin.GetGRPCCalls()
			if contains, err := app.BeRegistered.Match(calls); err != nil || !contains {
				notRegistered[nodename] = calls
			}
		}
		return notRegistered
	}).WithTimeout(time.Minute).Should(gomega.BeEmpty(), "hosts where the plugin has not been registered yet")
}

func (d *Driver) createFile(pod *v1.Pod, name string, content []byte) error {
	buffer := bytes.NewBuffer(content)
	// Writing the content can be slow. Better create a temporary file and
	// move it to the final destination once it is complete.
	tmpName := name + ".tmp"
	if err := d.podIO(pod).CreateFile(tmpName, buffer); err != nil {
		_ = d.podIO(pod).RemoveAll(tmpName)
		return err
	}
	return d.podIO(pod).Rename(tmpName, name)
}

func (d *Driver) removeFile(pod *v1.Pod, name string) error {
	return d.podIO(pod).RemoveAll(name)
}

func (d *Driver) podIO(pod *v1.Pod) proxy.PodDirIO {
	logger := klog.Background()
	return proxy.PodDirIO{
		F:             d.f,
		Namespace:     pod.Namespace,
		PodName:       pod.Name,
		ContainerName: "plugin",
		Logger:        &logger,
	}
}

// listen returns a listener for connections from the given port of the
// pod's container, proxied via the e2e storage proxy package.
func listen(ctx context.Context, f *framework.Framework, podName, containerName string, port int) net.Listener {
	addr := proxy.Addr{
		Namespace:     f.Namespace.Name,
		PodName:       podName,
		ContainerName: containerName,
		Port:          port,
	}
	listener, err := proxy.Listen(ctx, f.ClientSet, f.ClientConfig(), addr)
	framework.ExpectNoError(err, "listen for connections from %+v", addr)
	return listener
}

// TearDown invokes the registered cleanup callbacks in order and waits
// for background goroutines to finish.
func (d *Driver) TearDown() {
	for _, c := range d.cleanup {
		c()
	}
	d.cleanup = nil
	d.wg.Wait()
}

// IsGone waits until all ResourceSlices published by the driver are gone.
func (d *Driver) IsGone(ctx context.Context) {
	gomega.Eventually(ctx, func(ctx context.Context) ([]resourcev1alpha2.ResourceSlice, error) {
		slices, err := d.f.ClientSet.ResourceV1alpha2().ResourceSlices().List(ctx, metav1.ListOptions{FieldSelector: "driverName=" + d.Name})
		if err != nil {
			return nil, err
		}
		return slices.Items, nil
	}).Should(gomega.BeEmpty())
}

func (d *Driver) interceptor(nodename string, ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
	d.mutex.Lock()
	defer d.mutex.Unlock()

	m := MethodInstance{nodename, info.FullMethod}
	d.callCounts[m]++
	if d.fail[m] {
		return nil, errors.New("injected error")
	}

	return handler(ctx, req)
}

func (d *Driver) streamInterceptor(nodename string, srv interface{}, stream grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
	// Stream calls block for a long time. We must not hold the lock while
	// they are running.
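	// Therefore the lock is held only while updating the call count and
	// reading the failure flag, and released before invoking the handler.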
	d.mutex.Lock()
	m := MethodInstance{nodename, info.FullMethod}
	d.callCounts[m]++
	fail := d.fail[m]
	d.mutex.Unlock()

	if fail {
		return errors.New("injected error")
	}

	return handler(srv, stream)
}

// Fail enables or disables error injection for the given method on the
// given node.
func (d *Driver) Fail(m MethodInstance, injectError bool) {
	d.mutex.Lock()
	defer d.mutex.Unlock()

	d.fail[m] = injectError
}

// CallCount returns how often the given method was called on the given node.
func (d *Driver) CallCount(m MethodInstance) int64 {
	d.mutex.Lock()
	defer d.mutex.Unlock()

	return d.callCounts[m]
}

// Nodenames returns the sorted names of all nodes on which the plugin runs.
func (d *Driver) Nodenames() (nodenames []string) {
	for nodename := range d.Nodes {
		nodenames = append(nodenames, nodename)
	}
	sort.Strings(nodenames)
	return
}
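// A minimal usage sketch (not part of the original file): how a test could
// wire up NewNodes, NewDriver, Fail, and CallCount. The Describe text,
// framework name, and resource settings are illustrative placeholders.
//
//	var _ = ginkgo.Describe("kubelet plugin", func() {
//		f := framework.NewDefaultFramework("dra-example")
//		nodes := NewNodes(f, 1, 4)
//		driver := NewDriver(f, nodes, func() app.Resources {
//			return app.Resources{MaxAllocations: 1}
//		})
//
//		ginkgo.It("survives injected plugin errors", func(ctx context.Context) {
//			m := MethodInstance{nodes.NodeNames[0], NodePrepareResourcesMethod}
//			driver.Fail(m, true) // NodePrepareResources now returns "injected error"
//			// ... create a claim and a pod using it; the pod cannot start ...
//			driver.Fail(m, false) // stop injecting errors, the kubelet retries
//			gomega.Expect(driver.CallCount(m)).To(gomega.BeNumerically(">", 0))
//		})
//	})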