github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/metrics/metrics.go

package metrics

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
	"github.com/prometheus/common/model"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

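// MetricsTest stands up prometheus (fronted by fabio) and verifies that Nomad
// client and allocation metrics are scraped correctly on both Linux and
// Windows clients.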
type MetricsTest struct {
	framework.TC
	jobIDs       []string
	prometheusID string
	fabioID      string
	fabioAddress string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(MetricsTest),
		},
	})
}

// BeforeAll stands up Prometheus to collect metrics from all clients and
// allocs, with fabio as a system job in front of it so that we don't need to
// have prometheus use host networking.
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()
	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
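	// setUpPrometheus is defined elsewhere in this package and is expected
	// to register the fabio and prometheus jobs; there's no point running
	// the test cases if that setup fails, so fail hard here.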
	err := tc.setUpPrometheus(f)
	require.Nil(t, err)
}

// AfterEach cleans up the target jobs after each test case, but keeps
// fabio/prometheus running for reuse between the two test cases (Windows and
// Linux).
func (tc *MetricsTest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	for _, jobID := range tc.jobIDs {
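		// cleanup is best-effort, so errors from deregistering are
		// intentionally ignored here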
		tc.Nomad().Jobs().Deregister(jobID, true, nil)
	}
	tc.jobIDs = []string{}
	tc.Nomad().System().GarbageCollect()
}

// AfterAll cleans up fabio/prometheus.
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	tc.tearDownPrometheus(f)
}

// TestMetricsLinux runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Linux clients")
	}

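	// each workload job is paired with an allocation metric we expect that
	// job to produce once it's running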
	workloads := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"memstress":  "nomad_client_allocs_memory_usage",
		"simpleweb":  "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// runWorkloads registers a job for each workload and waits for its
// allocations to be placed
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for jobName := range workloads {
		uuid := uuid.Generate()
		jobID := "metrics-" + jobName + "-" + uuid[0:8]
		tc.jobIDs = append(tc.jobIDs, jobID)
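		// each workload name corresponds to a job spec file under
		// metrics/input/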
		file := "metrics/input/" + jobName + ".nomad"
		allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "")
		require.NotZerof(t, allocs, "failed to register %s", jobID)
	}
}

// queryClientMetrics queries prometheus to verify that client metrics are
// being collected from every one of the given client nodes
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
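	// a sample of host-level metrics every client is expected to publish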
	metrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}
	// we start with a generous number of retries here (at roughly one
	// attempt per second, via the sleep below) because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	retries := int64(60)

	for _, metric := range metrics {

		var results model.Vector
		var err error

		testutil.WaitForResultRetries(retries, func() (bool, error) {
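			// the deferred sleep spaces out attempts so we poll
			// prometheus about once per second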
			defer time.Sleep(time.Second)

			results, err = tc.promQuery(metric)
			if err != nil {
				return false, err
			}

			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					return false, fmt.Errorf("expected metric '%s' for all clients. got:\n%v", metric, results)
				}
			}
			return true, nil
		}, func(err error) {
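			// WaitForResultRetries calls this only after exhausting its
			// retries; surface the last error and fail the test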
			require.NoError(t, err)
		})

		// shorten the retry budget after the first metric is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of them
		retries = 15
	}
}

// queryAllocMetrics queries prometheus to verify that metrics are being
// collected from the allocations of each workload
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second
	for jobName, metric := range workloads {
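		// filter by the exported_job label so we only look at series for
		// this workload's job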
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// guard against an empty result vector, which would panic below
			if len(results) == 0 {
				err = fmt.Errorf("no results for query %q", query)
				return false
			}
			// make sure we didn't just collect a bunch of zero metrics
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}