github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/e2e/metrics/metrics.go

package metrics

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/prometheus/common/model"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type MetricsTest struct {
	framework.TC
	jobIDs       []string
	prometheusID string
	fabioID      string
	fabioAddress string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(MetricsTest),
		},
	})
}

// Stand up prometheus to collect metrics from all clients and allocs,
// with fabio as a system job in front of it so that we don't need to
// have prometheus use host networking
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()
	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
	err := tc.setUpPrometheus(f)
	require.Nil(t, err)
}

// Clean up the target jobs after each test case, but keep fabio/prometheus
// for reuse between the two test cases (Windows vs Linux)
func (tc *MetricsTest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	for _, jobID := range tc.jobIDs {
		tc.Nomad().Jobs().Deregister(jobID, true, nil)
	}
	tc.jobIDs = []string{}
	tc.Nomad().System().GarbageCollect()
}

// Clean up fabio/prometheus
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	tc.tearDownPrometheus(f)
}
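// setUpPrometheus and tearDownPrometheus are defined elsewhere in this
// package, not in this file. As a rough illustration (not the actual
// implementation), the teardown half likely only needs to deregister the
// two long-lived jobs and kick off a garbage collection, along these lines:
//
//	func (tc *MetricsTest) tearDownPrometheus(f *framework.F) {
//		tc.Nomad().Jobs().Deregister(tc.prometheusID, true, nil)
//		tc.Nomad().Jobs().Deregister(tc.fabioID, true, nil)
//		tc.Nomad().System().GarbageCollect()
//	}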
// TestMetricsLinux runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Linux clients")
	}

	workloads := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"memstress":  "nomad_client_allocs_memory_usage",
		"simpleweb":  "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// run workloads and wait for allocations
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for jobName := range workloads {
		uuid := uuid.Generate()
		jobID := "metrics-" + jobName + "-" + uuid[0:8]
		tc.jobIDs = append(tc.jobIDs, jobID)
		file := "metrics/input/" + jobName + ".nomad"
		allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "")
		if len(allocs) == 0 {
			t.Fatalf("failed to register %s", jobID)
		}
	}
}

// query prometheus to verify that metrics are being collected
// from clients
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
	metrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second

	for _, metric := range metrics {
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(metric)
			if err != nil {
				return false
			}
			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					err = fmt.Errorf("expected metric '%s' for all clients. got:\n%v",
						metric, results)
					return false
				}
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", metric, err)

		// shorten the timeout after the first metric is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of them
		timeout = 15 * time.Second
	}
}
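// promQuery is also defined elsewhere in this package. A minimal sketch of
// an instant query against the prometheus HTTP API, assuming tc.fabioAddress
// is a full http URL and that github.com/prometheus/client_golang/api and
// its prometheus/v1 subpackage (plus context) are imported, might look like
// this (illustrative only, not the actual implementation):
//
//	func (tc *MetricsTest) promQuery(query string) (model.Vector, error) {
//		client, err := api.NewClient(api.Config{Address: tc.fabioAddress})
//		if err != nil {
//			return nil, err
//		}
//		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
//		defer cancel()
//		result, warnings, err := v1.NewAPI(client).Query(ctx, query, time.Now())
//		if err != nil {
//			return nil, err
//		}
//		if len(warnings) > 0 {
//			return nil, fmt.Errorf("prometheus query warnings: %v", warnings)
//		}
//		vector, ok := result.(model.Vector)
//		if !ok {
//			return nil, fmt.Errorf("unexpected result type %q", result.Type())
//		}
//		return vector, nil
//	}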
// query prometheus to verify that metrics are being collected
// from allocations
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second
	for jobName, metric := range workloads {
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// guard against an empty result set before indexing into it
			if len(results) == 0 {
				err = fmt.Errorf("no metrics for query %q", query)
				return false
			}

			// make sure we didn't just collect a bunch of zero metrics
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}
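// setUpPrometheus, called from BeforeAll above, stands up the scraping
// stack described in the comment on BeforeAll: fabio as a system job
// fronting a prometheus job. A rough sketch under those assumptions, where
// the job file names and the address lookup are hypothetical:
//
//	func (tc *MetricsTest) setUpPrometheus(f *framework.F) error {
//		t := f.T()
//		id := uuid.Generate()[0:8]
//		tc.fabioID = "fabio-" + id
//		if len(e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(),
//			"metrics/input/fabio.nomad", tc.fabioID, "")) == 0 {
//			return fmt.Errorf("fabio failed to start")
//		}
//		// ...derive tc.fabioAddress from a client node's advertised IP...
//		tc.prometheusID = "prometheus-" + id
//		if len(e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(),
//			"metrics/input/prometheus.nomad", tc.prometheusID, "")) == 0 {
//			return fmt.Errorf("prometheus failed to start")
//		}
//		return nil
//	}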