sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/azure_gpu.go

//go:build e2e
// +build e2e

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"context"
	"fmt"
	"os"
	"strings"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"sigs.k8s.io/cluster-api/test/framework"
)

// AzureGPUSpecInput is the input for AzureGPUSpec.
type AzureGPUSpecInput struct {
	BootstrapClusterProxy framework.ClusterProxy
	Namespace             *corev1.Namespace
	ClusterName           string
	SkipCleanup           bool
}

// AzureGPUSpec implements a test that verifies a GPU-enabled application runs on an
// "nvidia-gpu"-flavored CAPZ cluster.
func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
	var (
		specName    = "azure-gpu"
		input       AzureGPUSpecInput
		machineType = os.Getenv("AZURE_GPU_NODE_MACHINE_TYPE")
	)

	input = inputGetter()
	Expect(input.Namespace).NotTo(BeNil(), "Invalid argument. input.Namespace can't be nil when calling %s spec", specName)
	Expect(input.ClusterName).NotTo(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling %s spec", specName)
	if machineType != "" {
		Expect(machineType).To(HavePrefix("Standard_N"), "AZURE_GPU_NODE_MACHINE_TYPE is \"%s\" which isn't a GPU SKU in %s spec", machineType, specName)
	}

	By("creating a Kubernetes client to the workload cluster")
	clusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.Namespace.Name, input.ClusterName)
	Expect(clusterProxy).NotTo(BeNil())
	clientset := clusterProxy.GetClientSet()
	Expect(clientset).NotTo(BeNil())

	By("Waiting for a node to have an \"nvidia.com/gpu\" allocatable resource")
	Eventually(func() bool {
		nodeList, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		Expect(err).NotTo(HaveOccurred())
		for _, node := range nodeList.Items {
			for k, v := range node.Status.Allocatable {
				if k == "nvidia.com/gpu" && v.Value() > 0 {
					return true
				}
			}
		}
		return false
	}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue(), func() string {
		return getGPUOperatorPodLogs(ctx, clientset)
	})

	By("running a CUDA vector calculation job")
	jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
	jobName := "cuda-vector-add"
	gpuJob := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobName,
			Namespace: corev1.NamespaceDefault,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyOnFailure,
					Containers: []corev1.Container{
						{
							Name:  jobName,
							Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.2.1",
							Resources: corev1.ResourceRequirements{
								Limits: corev1.ResourceList{
									"nvidia.com/gpu": resource.MustParse("1"),
								},
							},
						},
					},
				},
			},
		},
	}
	Log("starting to create CUDA vector calculation job")
	_, err := jobsClient.Create(ctx, gpuJob, metav1.CreateOptions{})
	Expect(err).NotTo(HaveOccurred())
	gpuJobInput := WaitForJobCompleteInput{
		Getter:    jobsClientAdapter{client: jobsClient},
		Job:       gpuJob,
		Clientset: clientset,
	}
	WaitForJobComplete(ctx, gpuJobInput, e2eConfig.GetIntervals(specName, "wait-job")...)
}
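// A minimal sketch of how a Ginkgo suite might invoke this spec once the
// workload cluster has been created with the "nvidia-gpu" flavor. The
// surrounding suite variables (bootstrapClusterProxy, namespace, clusterName,
// skipCleanup) are assumptions supplied by suite setup, not defined here:
//
//	It("verifies a GPU workload on the cluster", func() {
//		AzureGPUSpec(ctx, func() AzureGPUSpecInput {
//			return AzureGPUSpecInput{
//				BootstrapClusterProxy: bootstrapClusterProxy,
//				Namespace:             namespace,
//				ClusterName:           clusterName,
//				SkipCleanup:           skipCleanup,
//			}
//		})
//	})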
// getGPUOperatorPodLogs returns the logs of the Nvidia GPU operator pods.
func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
	podsClient := clientset.CoreV1().Pods(corev1.NamespaceAll)
	var pods *corev1.PodList
	var err error
	Eventually(func(g Gomega) {
		pods, err = podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
		if err != nil {
			LogWarning(err.Error())
		}
		g.Expect(err).NotTo(HaveOccurred())
	}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
	b := strings.Builder{}
	for _, pod := range pods.Items {
		b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", pod.Name))
		b.WriteString(getPodLogs(ctx, clientset, pod))
	}
	return b.String()
}
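// getPodLogs above is a helper defined elsewhere in this e2e package. As a
// rough sketch only (an assumption, not the upstream implementation, and
// getPodLogsSketch is a hypothetical name), it could read a pod's log stream
// through the standard client-go GetLogs/Stream API:
//
//	func getPodLogsSketch(ctx context.Context, clientset *kubernetes.Clientset, pod corev1.Pod) string {
//		// Request the logs for the pod's default container.
//		req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{})
//		stream, err := req.Stream(ctx)
//		if err != nil {
//			return fmt.Sprintf("error streaming logs for pod %s: %v", pod.Name, err)
//		}
//		defer stream.Close()
//		// Copy the whole stream into a string for the failure report.
//		var b strings.Builder
//		if _, err := io.Copy(&b, stream); err != nil {
//			return fmt.Sprintf("error reading logs for pod %s: %v", pod.Name, err)
//		}
//		return b.String()
//	}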