k8s.io/kubernetes@v1.29.3/test/e2e/instrumentation/monitoring/accelerator.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package monitoring 18 19 import ( 20 "context" 21 "os" 22 "time" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/resource" 26 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 27 "k8s.io/apimachinery/pkg/util/wait" 28 "k8s.io/kubernetes/test/e2e/feature" 29 "k8s.io/kubernetes/test/e2e/framework" 30 e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu" 31 e2epod "k8s.io/kubernetes/test/e2e/framework/pod" 32 e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" 33 instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common" 34 "k8s.io/kubernetes/test/e2e/scheduling" 35 "k8s.io/kubernetes/test/utils/image" 36 admissionapi "k8s.io/pod-security-admission/api" 37 38 "github.com/onsi/ginkgo/v2" 39 "golang.org/x/oauth2/google" 40 gcm "google.golang.org/api/monitoring/v3" 41 "google.golang.org/api/option" 42 ) 43 44 // Stackdriver container accelerator metrics, as described here: 45 // https://cloud.google.com/monitoring/api/metrics_gcp#gcp-container 46 var acceleratorMetrics = []string{ 47 "accelerator/duty_cycle", 48 "accelerator/memory_total", 49 "accelerator/memory_used", 50 } 51 52 var _ = instrumentation.SIGDescribe("Stackdriver Monitoring", func() { 53 ginkgo.BeforeEach(func() { 54 e2eskipper.SkipUnlessProviderIs("gce", "gke") 55 }) 56 57 f := framework.NewDefaultFramework("stackdriver-monitoring") 58 f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged 59 60 f.It("should have accelerator metrics", feature.StackdriverAcceleratorMonitoring, func(ctx context.Context) { 61 testStackdriverAcceleratorMonitoring(ctx, f) 62 }) 63 64 }) 65 66 func testStackdriverAcceleratorMonitoring(ctx context.Context, f *framework.Framework) { 67 projectID := framework.TestContext.CloudConfig.ProjectID 68 69 client, err := google.DefaultClient(ctx, gcm.CloudPlatformScope) 70 framework.ExpectNoError(err) 71 72 gcmService, err := gcm.NewService(ctx, option.WithHTTPClient(client)) 73 74 framework.ExpectNoError(err) 75 76 // set this env var if accessing Stackdriver test endpoint (default is prod): 77 // $ export STACKDRIVER_API_ENDPOINT_OVERRIDE=https://test-monitoring.sandbox.googleapis.com/ 78 basePathOverride := os.Getenv("STACKDRIVER_API_ENDPOINT_OVERRIDE") 79 if basePathOverride != "" { 80 gcmService.BasePath = basePathOverride 81 } 82 83 scheduling.SetupNVIDIAGPUNode(ctx, f, false) 84 85 e2epod.NewPodClient(f).Create(ctx, &v1.Pod{ 86 ObjectMeta: metav1.ObjectMeta{ 87 Name: rcName, 88 }, 89 Spec: v1.PodSpec{ 90 RestartPolicy: v1.RestartPolicyNever, 91 Containers: []v1.Container{ 92 { 93 Name: rcName, 94 Image: image.GetE2EImage(image.CudaVectorAdd), 95 Command: []string{"/bin/sh", "-c"}, 96 Args: []string{"nvidia-smi && sleep infinity"}, 97 Resources: v1.ResourceRequirements{ 98 Limits: v1.ResourceList{ 99 e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI), 100 }, 101 }, 102 }, 103 }, 104 }, 105 }) 106 107 metricsMap := map[string]bool{} 108 pollingFunction := checkForAcceleratorMetrics(projectID, gcmService, time.Now(), metricsMap) 109 err = wait.Poll(pollFrequency, pollTimeout, pollingFunction) 110 if err != nil { 111 framework.Logf("Missing metrics: %+v", metricsMap) 112 } 113 framework.ExpectNoError(err) 114 } 115 116 func checkForAcceleratorMetrics(projectID string, gcmService *gcm.Service, start time.Time, metricsMap map[string]bool) func() (bool, error) { 117 return func() (bool, error) { 118 counter := 0 119 for _, metric := range acceleratorMetrics { 120 metricsMap[metric] = false 121 } 122 for _, metric := range acceleratorMetrics { 123 // TODO: check only for metrics from this cluster 124 ts, err := fetchTimeSeries(projectID, gcmService, metric, start, time.Now()) 125 framework.ExpectNoError(err) 126 if len(ts) > 0 { 127 counter = counter + 1 128 metricsMap[metric] = true 129 framework.Logf("Received %v timeseries for metric %v", len(ts), metric) 130 } else { 131 framework.Logf("No timeseries for metric %v", metric) 132 } 133 } 134 if counter < 3 { 135 return false, nil 136 } 137 return true, nil 138 } 139 }