sigs.k8s.io/cluster-api-provider-aws@v1.5.5/test/e2e/shared/gpu.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2021 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11  	http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package shared
    21  
    22  import (
    23  	"context"
    24  	"encoding/json"
    25  	"fmt"
    26  	"strings"
    27  	"text/tabwriter"
    28  
    29  	"github.com/onsi/ginkgo"
    30  	. "github.com/onsi/gomega"
    31  	batchv1 "k8s.io/api/batch/v1"
    32  	corev1 "k8s.io/api/core/v1"
    33  	"k8s.io/apimachinery/pkg/api/resource"
    34  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    35  	"k8s.io/client-go/kubernetes"
    36  	typedbatchv1 "k8s.io/client-go/kubernetes/typed/batch/v1"
    37  	crclient "sigs.k8s.io/controller-runtime/pkg/client"
    38  
    39  	"sigs.k8s.io/cluster-api/test/framework"
    40  )
    41  
// AWSGPUSpecInput is the input for AWSGPUSpec.
//
// Fields, as used by AWSGPUSpec:
//   - BootstrapClusterProxy: proxy whose GetWorkloadCluster method is called to
//     reach the workload cluster under test.
//   - NamespaceName: namespace passed to GetWorkloadCluster to locate the cluster.
//   - ClusterName: cluster name passed to GetWorkloadCluster; must not be empty.
//   - SkipCleanup: not read by AWSGPUSpec in this file.
//     NOTE(review): presumably consumed by callers for teardown — confirm.
type AWSGPUSpecInput struct {
	BootstrapClusterProxy framework.ClusterProxy
	NamespaceName         string
	ClusterName           string
	SkipCleanup           bool
}
    49  
    50  // AWSGPUSpec implements a test that verifies a GPU-enabled application runs on an "nvidia-gpu"-flavored CAPA cluster.
    51  func AWSGPUSpec(ctx context.Context, e2eCtx *E2EContext, input AWSGPUSpecInput) {
    52  	specName := "aws-gpu"
    53  
    54  	Expect(input.NamespaceName).NotTo(BeNil(), "Invalid argument. input.Namespace can't be nil when calling %s spec", specName)
    55  	Expect(input.ClusterName).NotTo(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling %s spec", specName)
    56  
    57  	ginkgo.By("creating a Kubernetes client to the workload cluster")
    58  	clusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.NamespaceName, input.ClusterName)
    59  	Expect(clusterProxy).NotTo(BeNil())
    60  	clientset := clusterProxy.GetClientSet()
    61  	Expect(clientset).NotTo(BeNil())
    62  
    63  	ginkgo.By("running a CUDA vector calculation job")
    64  	jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
    65  	jobName := "cuda-vector-add"
    66  	gpuJob := &batchv1.Job{
    67  		ObjectMeta: metav1.ObjectMeta{
    68  			Name:      jobName,
    69  			Namespace: corev1.NamespaceDefault,
    70  		},
    71  		Spec: batchv1.JobSpec{
    72  			Template: corev1.PodTemplateSpec{
    73  				Spec: corev1.PodSpec{
    74  					RestartPolicy: corev1.RestartPolicyOnFailure,
    75  					Containers: []corev1.Container{
    76  						{
    77  							Name:  jobName,
    78  							Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.1-ubuntu18.04",
    79  							Resources: corev1.ResourceRequirements{
    80  								Limits: corev1.ResourceList{
    81  									"nvidia.com/gpu": resource.MustParse("1"),
    82  								},
    83  							},
    84  						},
    85  					},
    86  				},
    87  			},
    88  		},
    89  	}
    90  	_, err := jobsClient.Create(ctx, gpuJob, metav1.CreateOptions{})
    91  	Expect(err).NotTo(HaveOccurred())
    92  	gpuJobInput := WaitForJobCompleteInput{
    93  		Getter:    jobsClientAdapter{client: jobsClient},
    94  		Job:       gpuJob,
    95  		Clientset: clientset,
    96  	}
    97  	WaitForJobComplete(ctx, gpuJobInput, e2eCtx.E2EConfig.GetIntervals(specName, "wait-job")...)
    98  }
    99  
// jobsClientAdapter wraps a typed batch/v1 JobInterface so that it can be
// supplied as the Getter in WaitForJobCompleteInput.
type jobsClientAdapter struct {
	// client is the jobs client that Get delegates to.
	client typedbatchv1.JobInterface
}
   104  
   105  // Get fetches the job named by the key and updates the provided object.
   106  func (c jobsClientAdapter) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object) error {
   107  	job, err := c.client.Get(ctx, key.Name, metav1.GetOptions{})
   108  	if jobObj, ok := obj.(*batchv1.Job); ok {
   109  		job.DeepCopyInto(jobObj)
   110  	}
   111  	return err
   112  }
   113  
// WaitForJobCompleteInput is the input for WaitForJobComplete.
//
// Fields, as used in this file:
//   - Getter: called on every poll to fetch the current state of the Job.
//   - Job: identifies the Job to wait for (namespace and name) and is the
//     object handed to Getter.Get to receive the fetched state.
//   - Clientset: used by DescribeFailedJob to list events for the Job; when it
//     is nil the event listing is skipped.
type WaitForJobCompleteInput struct {
	Getter    framework.Getter
	Job       *batchv1.Job
	Clientset *kubernetes.Clientset
}
   120  
   121  // WaitForJobComplete waits until the Job completes with at least one success.
   122  func WaitForJobComplete(ctx context.Context, input WaitForJobCompleteInput, intervals ...interface{}) {
   123  	namespace, name := input.Job.GetNamespace(), input.Job.GetName()
   124  	Eventually(func() bool {
   125  		key := crclient.ObjectKey{Namespace: namespace, Name: name}
   126  		if err := input.Getter.Get(ctx, key, input.Job); err == nil {
   127  			for _, c := range input.Job.Status.Conditions {
   128  				if c.Type == batchv1.JobComplete && c.Status == corev1.ConditionTrue {
   129  					return input.Job.Status.Succeeded > 0
   130  				}
   131  			}
   132  		}
   133  		return false
   134  	}, intervals...).Should(BeTrue(), func() string { return DescribeFailedJob(ctx, input) })
   135  }
   136  
   137  // DescribeFailedJob returns a string with information to help debug a failed job.
   138  func DescribeFailedJob(ctx context.Context, input WaitForJobCompleteInput) string {
   139  	namespace, name := input.Job.GetNamespace(), input.Job.GetName()
   140  	b := strings.Builder{}
   141  	b.WriteString(fmt.Sprintf("Job %s/%s failed",
   142  		namespace, name))
   143  	b.WriteString(fmt.Sprintf("\nJob:\n%s\n", prettyPrint(input.Job)))
   144  	b.WriteString(describeEvents(ctx, input.Clientset, namespace, name))
   145  	return b.String()
   146  }
   147  
   148  // prettyPrint returns a formatted JSON version of the object given.
   149  func prettyPrint(v interface{}) string {
   150  	b, err := json.MarshalIndent(v, "", "  ")
   151  	if err != nil {
   152  		return err.Error()
   153  	}
   154  	return string(b)
   155  }
   156  
   157  // describeEvents returns a string summarizing recent events involving the named object(s).
   158  func describeEvents(ctx context.Context, clientset *kubernetes.Clientset, namespace, name string) string {
   159  	b := strings.Builder{}
   160  	if clientset == nil {
   161  		b.WriteString("clientset is nil, so skipping output of relevant events")
   162  	} else {
   163  		opts := metav1.ListOptions{
   164  			FieldSelector: fmt.Sprintf("involvedObject.name=%s", name),
   165  			Limit:         20,
   166  		}
   167  		evts, err := clientset.CoreV1().Events(namespace).List(ctx, opts)
   168  		if err != nil {
   169  			b.WriteString(err.Error())
   170  		} else {
   171  			w := tabwriter.NewWriter(&b, 0, 4, 2, ' ', tabwriter.FilterHTML)
   172  			fmt.Fprintln(w, "LAST SEEN\tTYPE\tREASON\tOBJECT\tMESSAGE")
   173  			for _, e := range evts.Items {
   174  				fmt.Fprintf(w, "%s\t%s\t%s\t%s/%s\t%s\n", e.LastTimestamp, e.Type, e.Reason,
   175  					strings.ToLower(e.InvolvedObject.Kind), e.InvolvedObject.Name, e.Message)
   176  			}
   177  			w.Flush()
   178  		}
   179  	}
   180  	return b.String()
   181  }