k8s.io/kubernetes@v1.29.3/test/e2e/instrumentation/monitoring/stackdriver.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package monitoring 18 19 import ( 20 "context" 21 "fmt" 22 "math" 23 "os" 24 "time" 25 26 "k8s.io/apimachinery/pkg/util/wait" 27 "k8s.io/kubernetes/test/e2e/feature" 28 "k8s.io/kubernetes/test/e2e/framework" 29 e2eautoscaling "k8s.io/kubernetes/test/e2e/framework/autoscaling" 30 e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" 31 instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common" 32 admissionapi "k8s.io/pod-security-admission/api" 33 34 "github.com/onsi/ginkgo/v2" 35 "golang.org/x/oauth2/google" 36 gcm "google.golang.org/api/monitoring/v3" 37 "google.golang.org/api/option" 38 ) 39 40 var ( 41 // Stackdriver container metrics, as described here: 42 // https://cloud.google.com/monitoring/api/metrics#gcp-container 43 stackdriverMetrics = []string{ 44 "uptime", 45 "memory/bytes_total", 46 "memory/bytes_used", 47 "cpu/reserved_cores", 48 "cpu/usage_time", 49 "memory/page_fault_count", 50 "disk/bytes_used", 51 "disk/bytes_total", 52 "cpu/utilization", 53 } 54 55 pollFrequency = time.Second * 5 56 pollTimeout = time.Minute * 7 57 58 rcName = "resource-consumer" 59 memoryUsed = 64 60 memoryLimit int64 = 200 61 tolerance = 0.25 62 ) 63 64 var _ = instrumentation.SIGDescribe("Stackdriver Monitoring", func() { 65 ginkgo.BeforeEach(func() { 66 e2eskipper.SkipUnlessProviderIs("gce", "gke") 67 }) 68 69 f := framework.NewDefaultFramework("stackdriver-monitoring") 70 f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged 71 72 f.It("should have cluster metrics", feature.StackdriverMonitoring, func(ctx context.Context) { 73 testStackdriverMonitoring(ctx, f, 1, 100, 200) 74 }) 75 76 }) 77 78 func testStackdriverMonitoring(ctx context.Context, f *framework.Framework, pods, allPodsCPU int, perPodCPU int64) { 79 projectID := framework.TestContext.CloudConfig.ProjectID 80 81 client, err := google.DefaultClient(ctx, gcm.CloudPlatformScope) 82 framework.ExpectNoError(err) 83 84 // Hack for running tests locally 85 // If this is your use case, create application default credentials: 86 // $ gcloud auth application-default login 87 // and uncomment following lines (comment out the two lines above): (DON'T set the env var below) 88 /* 89 ts, err := google.DefaultTokenSource(oauth2.NoContext) 90 framework.Logf("Couldn't get application default credentials, %v", err) 91 if err != nil { 92 framework.Failf("Error accessing application default credentials, %v", err) 93 } 94 client := oauth2.NewClient(oauth2.NoContext, ts) 95 */ 96 97 gcmService, err := gcm.NewService(ctx, option.WithHTTPClient(client)) 98 99 // set this env var if accessing Stackdriver test endpoint (default is prod): 100 // $ export STACKDRIVER_API_ENDPOINT_OVERRIDE=https://test-monitoring.sandbox.googleapis.com/ 101 basePathOverride := os.Getenv("STACKDRIVER_API_ENDPOINT_OVERRIDE") 102 if basePathOverride != "" { 103 gcmService.BasePath = basePathOverride 104 } 105 106 framework.ExpectNoError(err) 107 108 rc := e2eautoscaling.NewDynamicResourceConsumer(ctx, rcName, f.Namespace.Name, e2eautoscaling.KindDeployment, pods, allPodsCPU, memoryUsed, 0, perPodCPU, memoryLimit, f.ClientSet, f.ScalesGetter, e2eautoscaling.Disable, e2eautoscaling.Idle) 109 ginkgo.DeferCleanup(rc.CleanUp) 110 111 rc.WaitForReplicas(ctx, pods, 15*time.Minute) 112 113 metricsMap := map[string]bool{} 114 pollingFunction := checkForMetrics(projectID, gcmService, time.Now(), metricsMap, allPodsCPU, perPodCPU) 115 err = wait.Poll(pollFrequency, pollTimeout, pollingFunction) 116 if err != nil { 117 framework.Logf("Missing metrics: %+v\n", metricsMap) 118 } 119 framework.ExpectNoError(err) 120 } 121 122 func checkForMetrics(projectID string, gcmService *gcm.Service, start time.Time, metricsMap map[string]bool, cpuUsed int, cpuLimit int64) func() (bool, error) { 123 return func() (bool, error) { 124 counter := 0 125 correctUtilization := false 126 for _, metric := range stackdriverMetrics { 127 metricsMap[metric] = false 128 } 129 for _, metric := range stackdriverMetrics { 130 // TODO: check only for metrics from this cluster 131 ts, err := fetchTimeSeries(projectID, gcmService, metric, start, time.Now()) 132 framework.ExpectNoError(err) 133 if len(ts) > 0 { 134 counter = counter + 1 135 metricsMap[metric] = true 136 framework.Logf("Received %v timeseries for metric %v\n", len(ts), metric) 137 } else { 138 framework.Logf("No timeseries for metric %v\n", metric) 139 } 140 141 var sum float64 142 switch metric { 143 case "cpu/utilization": 144 for _, t := range ts { 145 max := t.Points[0] 146 maxEnd, _ := time.Parse(time.RFC3339, max.Interval.EndTime) 147 for _, p := range t.Points { 148 pEnd, _ := time.Parse(time.RFC3339, p.Interval.EndTime) 149 if pEnd.After(maxEnd) { 150 max = p 151 maxEnd, _ = time.Parse(time.RFC3339, max.Interval.EndTime) 152 } 153 } 154 sum = sum + *max.Value.DoubleValue 155 framework.Logf("Received %v points for metric %v\n", 156 len(t.Points), metric) 157 } 158 framework.Logf("Most recent cpu/utilization sum*cpu/limit: %v\n", sum*float64(cpuLimit)) 159 if math.Abs(sum*float64(cpuLimit)-float64(cpuUsed)) > tolerance*float64(cpuUsed) { 160 return false, nil 161 } 162 correctUtilization = true 163 } 164 } 165 if counter < 9 || !correctUtilization { 166 return false, nil 167 } 168 return true, nil 169 } 170 } 171 172 func createMetricFilter(metric string, containerName string) string { 173 return fmt.Sprintf(`metric.type="container.googleapis.com/container/%s" AND 174 resource.label.container_name="%s"`, metric, containerName) 175 } 176 177 func fetchTimeSeries(projectID string, gcmService *gcm.Service, metric string, start time.Time, end time.Time) ([]*gcm.TimeSeries, error) { 178 response, err := gcmService.Projects.TimeSeries. 179 List(fullProjectName(projectID)). 180 Filter(createMetricFilter(metric, rcName)). 181 IntervalStartTime(start.Format(time.RFC3339)). 182 IntervalEndTime(end.Format(time.RFC3339)). 183 Do() 184 if err != nil { 185 return nil, err 186 } 187 return response.TimeSeries, nil 188 } 189 190 func fullProjectName(name string) string { 191 return fmt.Sprintf("projects/%s", name) 192 }