sigs.k8s.io/cluster-api-provider-aws@v1.5.5/test/e2e/shared/gpu.go (about) 1 //go:build e2e 2 // +build e2e 3 4 /* 5 Copyright 2021 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 package shared 21 22 import ( 23 "context" 24 "encoding/json" 25 "fmt" 26 "strings" 27 "text/tabwriter" 28 29 "github.com/onsi/ginkgo" 30 . "github.com/onsi/gomega" 31 batchv1 "k8s.io/api/batch/v1" 32 corev1 "k8s.io/api/core/v1" 33 "k8s.io/apimachinery/pkg/api/resource" 34 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 35 "k8s.io/client-go/kubernetes" 36 typedbatchv1 "k8s.io/client-go/kubernetes/typed/batch/v1" 37 crclient "sigs.k8s.io/controller-runtime/pkg/client" 38 39 "sigs.k8s.io/cluster-api/test/framework" 40 ) 41 42 // AWSGPUSpecInput is the input for AWSGPUSpec. 43 type AWSGPUSpecInput struct { 44 BootstrapClusterProxy framework.ClusterProxy 45 NamespaceName string 46 ClusterName string 47 SkipCleanup bool 48 } 49 50 // AWSGPUSpec implements a test that verifies a GPU-enabled application runs on an "nvidia-gpu"-flavored CAPA cluster. 51 func AWSGPUSpec(ctx context.Context, e2eCtx *E2EContext, input AWSGPUSpecInput) { 52 specName := "aws-gpu" 53 54 Expect(input.NamespaceName).NotTo(BeNil(), "Invalid argument. input.Namespace can't be nil when calling %s spec", specName) 55 Expect(input.ClusterName).NotTo(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling %s spec", specName) 56 57 ginkgo.By("creating a Kubernetes client to the workload cluster") 58 clusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.NamespaceName, input.ClusterName) 59 Expect(clusterProxy).NotTo(BeNil()) 60 clientset := clusterProxy.GetClientSet() 61 Expect(clientset).NotTo(BeNil()) 62 63 ginkgo.By("running a CUDA vector calculation job") 64 jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault) 65 jobName := "cuda-vector-add" 66 gpuJob := &batchv1.Job{ 67 ObjectMeta: metav1.ObjectMeta{ 68 Name: jobName, 69 Namespace: corev1.NamespaceDefault, 70 }, 71 Spec: batchv1.JobSpec{ 72 Template: corev1.PodTemplateSpec{ 73 Spec: corev1.PodSpec{ 74 RestartPolicy: corev1.RestartPolicyOnFailure, 75 Containers: []corev1.Container{ 76 { 77 Name: jobName, 78 Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.1-ubuntu18.04", 79 Resources: corev1.ResourceRequirements{ 80 Limits: corev1.ResourceList{ 81 "nvidia.com/gpu": resource.MustParse("1"), 82 }, 83 }, 84 }, 85 }, 86 }, 87 }, 88 }, 89 } 90 _, err := jobsClient.Create(ctx, gpuJob, metav1.CreateOptions{}) 91 Expect(err).NotTo(HaveOccurred()) 92 gpuJobInput := WaitForJobCompleteInput{ 93 Getter: jobsClientAdapter{client: jobsClient}, 94 Job: gpuJob, 95 Clientset: clientset, 96 } 97 WaitForJobComplete(ctx, gpuJobInput, e2eCtx.E2EConfig.GetIntervals(specName, "wait-job")...) 98 } 99 100 // jobsClientAdapter adapts a Job to work with WaitForJobAvailable. 101 type jobsClientAdapter struct { 102 client typedbatchv1.JobInterface 103 } 104 105 // Get fetches the job named by the key and updates the provided object. 106 func (c jobsClientAdapter) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object) error { 107 job, err := c.client.Get(ctx, key.Name, metav1.GetOptions{}) 108 if jobObj, ok := obj.(*batchv1.Job); ok { 109 job.DeepCopyInto(jobObj) 110 } 111 return err 112 } 113 114 // WaitForJobCompleteInput is the input for WaitForJobComplete. 115 type WaitForJobCompleteInput struct { 116 Getter framework.Getter 117 Job *batchv1.Job 118 Clientset *kubernetes.Clientset 119 } 120 121 // WaitForJobComplete waits until the Job completes with at least one success. 122 func WaitForJobComplete(ctx context.Context, input WaitForJobCompleteInput, intervals ...interface{}) { 123 namespace, name := input.Job.GetNamespace(), input.Job.GetName() 124 Eventually(func() bool { 125 key := crclient.ObjectKey{Namespace: namespace, Name: name} 126 if err := input.Getter.Get(ctx, key, input.Job); err == nil { 127 for _, c := range input.Job.Status.Conditions { 128 if c.Type == batchv1.JobComplete && c.Status == corev1.ConditionTrue { 129 return input.Job.Status.Succeeded > 0 130 } 131 } 132 } 133 return false 134 }, intervals...).Should(BeTrue(), func() string { return DescribeFailedJob(ctx, input) }) 135 } 136 137 // DescribeFailedJob returns a string with information to help debug a failed job. 138 func DescribeFailedJob(ctx context.Context, input WaitForJobCompleteInput) string { 139 namespace, name := input.Job.GetNamespace(), input.Job.GetName() 140 b := strings.Builder{} 141 b.WriteString(fmt.Sprintf("Job %s/%s failed", 142 namespace, name)) 143 b.WriteString(fmt.Sprintf("\nJob:\n%s\n", prettyPrint(input.Job))) 144 b.WriteString(describeEvents(ctx, input.Clientset, namespace, name)) 145 return b.String() 146 } 147 148 // prettyPrint returns a formatted JSON version of the object given. 149 func prettyPrint(v interface{}) string { 150 b, err := json.MarshalIndent(v, "", " ") 151 if err != nil { 152 return err.Error() 153 } 154 return string(b) 155 } 156 157 // describeEvents returns a string summarizing recent events involving the named object(s). 158 func describeEvents(ctx context.Context, clientset *kubernetes.Clientset, namespace, name string) string { 159 b := strings.Builder{} 160 if clientset == nil { 161 b.WriteString("clientset is nil, so skipping output of relevant events") 162 } else { 163 opts := metav1.ListOptions{ 164 FieldSelector: fmt.Sprintf("involvedObject.name=%s", name), 165 Limit: 20, 166 } 167 evts, err := clientset.CoreV1().Events(namespace).List(ctx, opts) 168 if err != nil { 169 b.WriteString(err.Error()) 170 } else { 171 w := tabwriter.NewWriter(&b, 0, 4, 2, ' ', tabwriter.FilterHTML) 172 fmt.Fprintln(w, "LAST SEEN\tTYPE\tREASON\tOBJECT\tMESSAGE") 173 for _, e := range evts.Items { 174 fmt.Fprintf(w, "%s\t%s\t%s\t%s/%s\t%s\n", e.LastTimestamp, e.Type, e.Reason, 175 strings.ToLower(e.InvolvedObject.Kind), e.InvolvedObject.Name, e.Message) 176 } 177 w.Flush() 178 } 179 } 180 return b.String() 181 }