github.com/hernad/nomad@v1.6.112/e2e/metrics/metrics.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package metrics

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/testutil"
	"github.com/prometheus/common/model"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type MetricsTest struct {
	framework.TC
	jobIDs       []string
	prometheusID string
	fabioID      string
	fabioAddress string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(MetricsTest),
		},
	})
}

// BeforeAll stands up Prometheus to collect metrics from all clients and
// allocs, with fabio as a system job in front of it so that we don't need to
// have prometheus use host networking.
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()
	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
	err := tc.setUpPrometheus(f)
	require.Nil(t, err)
}

// AfterEach cleans up the target jobs after each test case, but keeps
// fabio/prometheus for reuse between the two test cases (Windows vs Linux).
func (tc *MetricsTest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	for _, jobID := range tc.jobIDs {
		tc.Nomad().Jobs().Deregister(jobID, true, nil)
	}
	tc.jobIDs = []string{}
	tc.Nomad().System().GarbageCollect()
}

// AfterAll cleans up fabio/prometheus.
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	tc.tearDownPrometheus(f)
}

// TestMetricsLinux runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Linux clients")
	}

	workloads := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"memstress":  "nomad_client_allocs_memory_usage",
		"simpleweb":  "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// run workloads and wait for allocations
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for jobName := range workloads {
		uuid := uuid.Generate()
		jobID := "metrics-" + jobName + "-" + uuid[0:8]
		tc.jobIDs = append(tc.jobIDs, jobID)
		file := "metrics/input/" + jobName + ".nomad"
		allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "")
		require.NotZerof(t, allocs, "failed to register %s", jobID)
	}
}

// query prometheus to verify that metrics are being collected
// from clients
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
	metrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	retries := int64(60)

	for _, metric := range metrics {

		var results model.Vector
		var err error

		testutil.WaitForResultRetries(retries, func() (bool, error) {
			defer time.Sleep(time.Second)

			results, err = tc.promQuery(metric)
			if err != nil {
				return false, err
			}

			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					return false, fmt.Errorf("expected metric '%s' for all clients. got:\n%v", metric, results)
				}
			}
			return true, nil
		}, func(err error) {
			require.NoError(t, err)
		})

		// shorten the retries after the first metric is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the metrics
		retries = 15
	}
}

// query prometheus to verify that metrics are being collected
// from allocations
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second
	for jobName, metric := range workloads {
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// make sure we got results at all, and that we didn't just
			// collect a bunch of zero metrics
			if len(results) == 0 {
				err = fmt.Errorf("expected metrics for %q, got none", jobName)
				return false
			}
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}
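
// The setUpPrometheus, tearDownPrometheus, and promQuery helpers used above
// are defined elsewhere in this package rather than in this file. As a rough,
// non-authoritative sketch of what a PromQL instant query against the
// fabio-fronted Prometheus endpoint could look like with the official
// client library: the function name, address handling, timeout, and the
// promv1 import alias below are illustrative assumptions, not the package's
// actual helper.
/*
// examplePromQuery is a hypothetical helper that issues a PromQL instant
// query and returns the result as a model.Vector. It assumes the imports
//   "context"
//   "github.com/prometheus/client_golang/api"
//   promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
// in addition to the fmt, time, and model packages already imported above.
func examplePromQuery(address, query string) (model.Vector, error) {
	client, err := api.NewClient(api.Config{Address: "http://" + address})
	if err != nil {
		return nil, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// run the instant query, e.g. "nomad_client_uptime"
	result, _, err := promv1.NewAPI(client).Query(ctx, query, time.Now())
	if err != nil {
		return nil, err
	}
	vector, ok := result.(model.Vector)
	if !ok {
		return nil, fmt.Errorf("unexpected result type %T", result)
	}
	return vector, nil
}
*/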