sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/azure_gpu.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2020 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"os"
    26  	"strings"
    27  
    28  	. "github.com/onsi/ginkgo/v2"
    29  	. "github.com/onsi/gomega"
    30  	batchv1 "k8s.io/api/batch/v1"
    31  	corev1 "k8s.io/api/core/v1"
    32  	"k8s.io/apimachinery/pkg/api/resource"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/client-go/kubernetes"
    35  	"sigs.k8s.io/cluster-api/test/framework"
    36  )
    37  
// AzureGPUSpecInput is the input for AzureGPUSpec.
type AzureGPUSpecInput struct {
	// BootstrapClusterProxy is the management-cluster proxy used to resolve the workload cluster.
	BootstrapClusterProxy framework.ClusterProxy
	// Namespace is the namespace containing the workload cluster; must be non-nil.
	Namespace             *corev1.Namespace
	// ClusterName is the name of the workload cluster under test; must be non-empty.
	ClusterName           string
	// SkipCleanup, when true, leaves test resources in place after the spec finishes.
	SkipCleanup           bool
}
    45  
    46  // AzureGPUSpec implements a test that verifies a GPU-enabled application runs on an
    47  // "nvidia-gpu"-flavored CAPZ cluster.
    48  func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
    49  	var (
    50  		specName    = "azure-gpu"
    51  		input       AzureGPUSpecInput
    52  		machineType = os.Getenv("AZURE_GPU_NODE_MACHINE_TYPE")
    53  	)
    54  
    55  	input = inputGetter()
    56  	Expect(input.Namespace).NotTo(BeNil(), "Invalid argument. input.Namespace can't be nil when calling %s spec", specName)
    57  	Expect(input.ClusterName).NotTo(BeEmpty(), "Invalid argument. input.ClusterName can't be empty when calling %s spec", specName)
    58  	if machineType != "" {
    59  		Expect(machineType).To(HavePrefix("Standard_N"), "AZURE_GPU_NODE_MACHINE_TYPE is \"%s\" which isn't a GPU SKU in %s spec", machineType, specName)
    60  	}
    61  
    62  	By("creating a Kubernetes client to the workload cluster")
    63  	clusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.Namespace.Name, input.ClusterName)
    64  	Expect(clusterProxy).NotTo(BeNil())
    65  	clientset := clusterProxy.GetClientSet()
    66  	Expect(clientset).NotTo(BeNil())
    67  
    68  	By("Waiting for a node to have an \"nvidia.com/gpu\" allocatable resource")
    69  	Eventually(func() bool {
    70  		nodeList, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
    71  		Expect(err).NotTo(HaveOccurred())
    72  		for _, node := range nodeList.Items {
    73  			for k, v := range node.Status.Allocatable {
    74  				if k == "nvidia.com/gpu" && v.Value() > 0 {
    75  					return true
    76  				}
    77  			}
    78  		}
    79  		return false
    80  	}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue(), func() string {
    81  		return getGPUOperatorPodLogs(ctx, clientset)
    82  	})
    83  
    84  	By("running a CUDA vector calculation job")
    85  	jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
    86  	jobName := "cuda-vector-add"
    87  	gpuJob := &batchv1.Job{
    88  		ObjectMeta: metav1.ObjectMeta{
    89  			Name:      jobName,
    90  			Namespace: corev1.NamespaceDefault,
    91  		},
    92  		Spec: batchv1.JobSpec{
    93  			Template: corev1.PodTemplateSpec{
    94  				Spec: corev1.PodSpec{
    95  					RestartPolicy: corev1.RestartPolicyOnFailure,
    96  					Containers: []corev1.Container{
    97  						{
    98  							Name:  jobName,
    99  							Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.2.1",
   100  							Resources: corev1.ResourceRequirements{
   101  								Limits: corev1.ResourceList{
   102  									"nvidia.com/gpu": resource.MustParse("1"),
   103  								},
   104  							},
   105  						},
   106  					},
   107  				},
   108  			},
   109  		},
   110  	}
   111  	Log("starting to create CUDA vector calculation job")
   112  	_, err := jobsClient.Create(ctx, gpuJob, metav1.CreateOptions{})
   113  	Expect(err).NotTo(HaveOccurred())
   114  	gpuJobInput := WaitForJobCompleteInput{
   115  		Getter:    jobsClientAdapter{client: jobsClient},
   116  		Job:       gpuJob,
   117  		Clientset: clientset,
   118  	}
   119  	WaitForJobComplete(ctx, gpuJobInput, e2eConfig.GetIntervals(specName, "wait-job")...)
   120  }
   121  
   122  // getGPUOperatorPodLogs returns the logs of the Nvidia GPU operator pods.
   123  func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
   124  	podsClient := clientset.CoreV1().Pods(corev1.NamespaceAll)
   125  	var pods *corev1.PodList
   126  	var err error
   127  	Eventually(func(g Gomega) {
   128  		pods, err = podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
   129  		if err != nil {
   130  			LogWarning(err.Error())
   131  		}
   132  		g.Expect(err).NotTo(HaveOccurred())
   133  	}, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed())
   134  	b := strings.Builder{}
   135  	for _, pod := range pods.Items {
   136  		b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", pod.Name))
   137  		b.WriteString(getPodLogs(ctx, clientset, pod))
   138  	}
   139  	return b.String()
   140  }