k8s.io/kubernetes@v1.29.3/test/e2e/instrumentation/monitoring/accelerator.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package monitoring
    18  
    19  import (
    20  	"context"
    21  	"os"
    22  	"time"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/resource"
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  	"k8s.io/kubernetes/test/e2e/feature"
    29  	"k8s.io/kubernetes/test/e2e/framework"
    30  	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
    31  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    32  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    33  	instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common"
    34  	"k8s.io/kubernetes/test/e2e/scheduling"
    35  	"k8s.io/kubernetes/test/utils/image"
    36  	admissionapi "k8s.io/pod-security-admission/api"
    37  
    38  	"github.com/onsi/ginkgo/v2"
    39  	"golang.org/x/oauth2/google"
    40  	gcm "google.golang.org/api/monitoring/v3"
    41  	"google.golang.org/api/option"
    42  )
    43  
    44  // Stackdriver container accelerator metrics, as described here:
    45  // https://cloud.google.com/monitoring/api/metrics_gcp#gcp-container
    46  var acceleratorMetrics = []string{
    47  	"accelerator/duty_cycle",
    48  	"accelerator/memory_total",
    49  	"accelerator/memory_used",
    50  }
    51  
    52  var _ = instrumentation.SIGDescribe("Stackdriver Monitoring", func() {
    53  	ginkgo.BeforeEach(func() {
    54  		e2eskipper.SkipUnlessProviderIs("gce", "gke")
    55  	})
    56  
    57  	f := framework.NewDefaultFramework("stackdriver-monitoring")
    58  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
    59  
    60  	f.It("should have accelerator metrics", feature.StackdriverAcceleratorMonitoring, func(ctx context.Context) {
    61  		testStackdriverAcceleratorMonitoring(ctx, f)
    62  	})
    63  
    64  })
    65  
    66  func testStackdriverAcceleratorMonitoring(ctx context.Context, f *framework.Framework) {
    67  	projectID := framework.TestContext.CloudConfig.ProjectID
    68  
    69  	client, err := google.DefaultClient(ctx, gcm.CloudPlatformScope)
    70  	framework.ExpectNoError(err)
    71  
    72  	gcmService, err := gcm.NewService(ctx, option.WithHTTPClient(client))
    73  
    74  	framework.ExpectNoError(err)
    75  
    76  	// set this env var if accessing Stackdriver test endpoint (default is prod):
    77  	// $ export STACKDRIVER_API_ENDPOINT_OVERRIDE=https://test-monitoring.sandbox.googleapis.com/
    78  	basePathOverride := os.Getenv("STACKDRIVER_API_ENDPOINT_OVERRIDE")
    79  	if basePathOverride != "" {
    80  		gcmService.BasePath = basePathOverride
    81  	}
    82  
    83  	scheduling.SetupNVIDIAGPUNode(ctx, f, false)
    84  
    85  	e2epod.NewPodClient(f).Create(ctx, &v1.Pod{
    86  		ObjectMeta: metav1.ObjectMeta{
    87  			Name: rcName,
    88  		},
    89  		Spec: v1.PodSpec{
    90  			RestartPolicy: v1.RestartPolicyNever,
    91  			Containers: []v1.Container{
    92  				{
    93  					Name:    rcName,
    94  					Image:   image.GetE2EImage(image.CudaVectorAdd),
    95  					Command: []string{"/bin/sh", "-c"},
    96  					Args:    []string{"nvidia-smi && sleep infinity"},
    97  					Resources: v1.ResourceRequirements{
    98  						Limits: v1.ResourceList{
    99  							e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
   100  						},
   101  					},
   102  				},
   103  			},
   104  		},
   105  	})
   106  
   107  	metricsMap := map[string]bool{}
   108  	pollingFunction := checkForAcceleratorMetrics(projectID, gcmService, time.Now(), metricsMap)
   109  	err = wait.Poll(pollFrequency, pollTimeout, pollingFunction)
   110  	if err != nil {
   111  		framework.Logf("Missing metrics: %+v", metricsMap)
   112  	}
   113  	framework.ExpectNoError(err)
   114  }
   115  
   116  func checkForAcceleratorMetrics(projectID string, gcmService *gcm.Service, start time.Time, metricsMap map[string]bool) func() (bool, error) {
   117  	return func() (bool, error) {
   118  		counter := 0
   119  		for _, metric := range acceleratorMetrics {
   120  			metricsMap[metric] = false
   121  		}
   122  		for _, metric := range acceleratorMetrics {
   123  			// TODO: check only for metrics from this cluster
   124  			ts, err := fetchTimeSeries(projectID, gcmService, metric, start, time.Now())
   125  			framework.ExpectNoError(err)
   126  			if len(ts) > 0 {
   127  				counter = counter + 1
   128  				metricsMap[metric] = true
   129  				framework.Logf("Received %v timeseries for metric %v", len(ts), metric)
   130  			} else {
   131  				framework.Logf("No timeseries for metric %v", metric)
   132  			}
   133  		}
   134  		if counter < 3 {
   135  			return false, nil
   136  		}
   137  		return true, nil
   138  	}
   139  }