github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/metrics/metrics.go (about) 1 package metrics 2 3 import ( 4 "fmt" 5 "os" 6 "testing" 7 "time" 8 9 "github.com/hashicorp/nomad/e2e/e2eutil" 10 "github.com/hashicorp/nomad/e2e/framework" 11 "github.com/hashicorp/nomad/helper/uuid" 12 "github.com/hashicorp/nomad/testutil" 13 "github.com/prometheus/common/model" 14 15 "github.com/stretchr/testify/assert" 16 "github.com/stretchr/testify/require" 17 ) 18 19 type MetricsTest struct { 20 framework.TC 21 jobIDs []string 22 prometheusID string 23 fabioID string 24 fabioAddress string 25 } 26 27 func init() { 28 framework.AddSuites(&framework.TestSuite{ 29 Component: "Metrics", 30 CanRunLocal: true, 31 Cases: []framework.TestCase{ 32 new(MetricsTest), 33 }, 34 }) 35 } 36 37 // BeforeAll stands up Prometheus to collect metrics from all clients and 38 // allocs, with fabio as a system job in front of it so that we don't need to 39 // have prometheus use host networking. 40 func (tc *MetricsTest) BeforeAll(f *framework.F) { 41 t := f.T() 42 e2eutil.WaitForLeader(t, tc.Nomad()) 43 e2eutil.WaitForNodesReady(t, tc.Nomad(), 1) 44 err := tc.setUpPrometheus(f) 45 require.Nil(t, err) 46 } 47 48 // AfterEach CleanS up the target jobs after each test case, but keep 49 // fabio/prometheus for reuse between the two test cases (Windows vs Linux). 50 func (tc *MetricsTest) AfterEach(f *framework.F) { 51 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 52 return 53 } 54 for _, jobID := range tc.jobIDs { 55 tc.Nomad().Jobs().Deregister(jobID, true, nil) 56 } 57 tc.jobIDs = []string{} 58 tc.Nomad().System().GarbageCollect() 59 } 60 61 // AfterAll cleans up fabio/prometheus. 62 func (tc *MetricsTest) AfterAll(f *framework.F) { 63 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 64 return 65 } 66 tc.tearDownPrometheus(f) 67 } 68 69 // TestMetricsLinux runs a collection of jobs that exercise alloc metrics. 
70 // Then we query prometheus to verify we're collecting client and alloc metrics 71 // and correctly presenting them to the prometheus scraper. 72 func (tc *MetricsTest) TestMetricsLinux(f *framework.F) { 73 t := f.T() 74 clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad()) 75 require.Nil(t, err) 76 if len(clientNodes) == 0 { 77 t.Skip("no Linux clients") 78 } 79 80 workloads := map[string]string{ 81 "cpustress": "nomad_client_allocs_cpu_user", 82 "diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats? 83 "helloworld": "nomad_client_allocs_cpu_allocated", 84 "memstress": "nomad_client_allocs_memory_usage", 85 "simpleweb": "nomad_client_allocs_memory_rss", 86 } 87 88 tc.runWorkloads(t, workloads) 89 tc.queryClientMetrics(t, clientNodes) 90 tc.queryAllocMetrics(t, workloads) 91 } 92 93 // TestMetricsWindows runs a collection of jobs that exercise alloc metrics. 94 // Then we query prometheus to verify we're collecting client and alloc metrics 95 // and correctly presenting them to the prometheus scraper. 
96 func (tc *MetricsTest) TestMetricsWindows(f *framework.F) { 97 t := f.T() 98 clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad()) 99 require.Nil(t, err) 100 if len(clientNodes) == 0 { 101 t.Skip("no Windows clients") 102 } 103 104 workloads := map[string]string{ 105 "factorial_windows": "nomad_client_allocs_cpu_user", 106 "mem_windows": "nomad_client_allocs_memory_rss", 107 } 108 109 tc.runWorkloads(t, workloads) 110 tc.queryClientMetrics(t, clientNodes) 111 tc.queryAllocMetrics(t, workloads) 112 } 113 114 // run workloads and wait for allocations 115 func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) { 116 for jobName := range workloads { 117 uuid := uuid.Generate() 118 jobID := "metrics-" + jobName + "-" + uuid[0:8] 119 tc.jobIDs = append(tc.jobIDs, jobID) 120 file := "metrics/input/" + jobName + ".nomad" 121 allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "") 122 require.NotZerof(t, allocs, "failed to register %s", jobID) 123 } 124 } 125 126 // query prometheus to verify that metrics are being collected 127 // from clients 128 func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) { 129 metrics := []string{ 130 "nomad_client_allocated_memory", 131 "nomad_client_host_cpu_user", 132 "nomad_client_host_disk_available", 133 "nomad_client_host_memory_used", 134 "nomad_client_uptime", 135 } 136 // we start with a very long timeout here because it takes a while for 137 // prometheus to be live and for jobs to initially register metrics. 
	retries := int64(60)

	for _, metric := range metrics {

		var results model.Vector
		var err error

		testutil.WaitForResultRetries(retries, func() (bool, error) {
			// throttle to at most one query per second against prometheus
			defer time.Sleep(time.Second)

			results, err = tc.promQuery(metric)
			if err != nil {
				return false, err
			}

			// collect the set of node IDs that reported this metric
			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					return false, fmt.Errorf("expected metric '%s' for all clients. got:\n%v", metric, results)
				}
			}
			return true, nil
		}, func(err error) {
			require.NoError(t, err)
		})

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		retries = 15
	}
}

// queryAllocMetrics queries prometheus to verify that metrics are being
// collected from allocations.
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
182 timeout := 60 * time.Second 183 for jobName, metric := range workloads { 184 query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName) 185 var results model.Vector 186 var err error 187 ok := assert.Eventually(t, func() bool { 188 results, err = tc.promQuery(query) 189 if err != nil { 190 return false 191 } 192 193 // make sure we didn't just collect a bunch of zero metrics 194 lastResult := results[len(results)-1] 195 if !(float64(lastResult.Value) > 0.0) { 196 err = fmt.Errorf("expected non-zero metrics, got: %v", results) 197 return false 198 } 199 return true 200 }, timeout, 1*time.Second) 201 require.Truef(t, ok, "prometheus query failed (%s): %v", query, err) 202 203 // shorten the timeout after the first workload is successfully 204 // queried so that we don't hang the whole test run if something's 205 // wrong with only one of the jobs 206 timeout = 15 * time.Second 207 } 208 }