github.com/hernad/nomad@v1.6.112/e2e/metrics/metrics.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package metrics

import (
	"fmt"
	"os"
	"testing"
	"time"

	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/testutil"
	"github.com/prometheus/common/model"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type MetricsTest struct {
	framework.TC
	jobIDs       []string
	prometheusID string
	fabioID      string
	fabioAddress string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "Metrics",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(MetricsTest),
		},
	})
}

// BeforeAll stands up Prometheus to collect metrics from all clients and
// allocs, with fabio as a system job in front of it so that we don't need to
// have prometheus use host networking.
func (tc *MetricsTest) BeforeAll(f *framework.F) {
	t := f.T()
	e2eutil.WaitForLeader(t, tc.Nomad())
	e2eutil.WaitForNodesReady(t, tc.Nomad(), 1)
	err := tc.setUpPrometheus(f)
	require.Nil(t, err)
}

// AfterEach cleans up the target jobs after each test case, but keeps
// fabio/prometheus for reuse between the two test cases (Windows vs Linux).
func (tc *MetricsTest) AfterEach(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	for _, jobID := range tc.jobIDs {
		tc.Nomad().Jobs().Deregister(jobID, true, nil)
	}
	tc.jobIDs = []string{}
	tc.Nomad().System().GarbageCollect()
}

// AfterAll cleans up fabio/prometheus.
func (tc *MetricsTest) AfterAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	tc.tearDownPrometheus(f)
}

// TestMetricsLinux runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsLinux(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListLinuxClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Linux clients")
	}

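	// map each job file under metrics/input/ to an allocation metric we
	// expect that workload to report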
	workloads := map[string]string{
		"cpustress":  "nomad_client_allocs_cpu_user",
		"diskstress": "nomad_client_allocs_memory_rss", // TODO(tgross): do we have disk stats?
		"helloworld": "nomad_client_allocs_cpu_allocated",
		"memstress":  "nomad_client_allocs_memory_usage",
		"simpleweb":  "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// TestMetricsWindows runs a collection of jobs that exercise alloc metrics.
// Then we query prometheus to verify we're collecting client and alloc metrics
// and correctly presenting them to the prometheus scraper.
func (tc *MetricsTest) TestMetricsWindows(f *framework.F) {
	t := f.T()
	clientNodes, err := e2eutil.ListWindowsClientNodes(tc.Nomad())
	require.Nil(t, err)
	if len(clientNodes) == 0 {
		t.Skip("no Windows clients")
	}

	workloads := map[string]string{
		"factorial_windows": "nomad_client_allocs_cpu_user",
		"mem_windows":       "nomad_client_allocs_memory_rss",
	}

	tc.runWorkloads(t, workloads)
	tc.queryClientMetrics(t, clientNodes)
	tc.queryAllocMetrics(t, workloads)
}

// run workloads and wait for allocations
func (tc *MetricsTest) runWorkloads(t *testing.T, workloads map[string]string) {
	for jobName := range workloads {
		uuid := uuid.Generate()
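		// give each run a unique job ID, e.g. "metrics-cpustress-1a2b3c4d"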
		jobID := "metrics-" + jobName + "-" + uuid[0:8]
		tc.jobIDs = append(tc.jobIDs, jobID)
		file := "metrics/input/" + jobName + ".nomad"
		allocs := e2eutil.RegisterAndWaitForAllocs(t, tc.Nomad(), file, jobID, "")
		require.NotZerof(t, allocs, "failed to register %s", jobID)
	}
}

// query prometheus to verify that metrics are being collected
// from clients
func (tc *MetricsTest) queryClientMetrics(t *testing.T, clientNodes []string) {
	metrics := []string{
		"nomad_client_allocated_memory",
		"nomad_client_host_cpu_user",
		"nomad_client_host_disk_available",
		"nomad_client_host_memory_used",
		"nomad_client_uptime",
	}
	// we start with a generous retry budget here (roughly one query per
	// second) because it takes a while for prometheus to be live and for
	// jobs to initially register metrics.
	retries := int64(60)

	for _, metric := range metrics {

		var results model.Vector
		var err error

		testutil.WaitForResultRetries(retries, func() (bool, error) {
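			// sleep between attempts so we poll prometheus roughly once per second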
			defer time.Sleep(time.Second)

			results, err = tc.promQuery(metric)
			if err != nil {
				return false, err
			}

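			// collect the set of client node IDs that reported this metric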
			instances := make(map[string]struct{})
			for _, result := range results {
				instances[string(result.Metric["node_id"])] = struct{}{}
			}
			// we're testing only clients for a specific OS, so we
			// want to make sure we're checking for specific node_ids
			// and not just equal lengths
			for _, clientNode := range clientNodes {
				if _, ok := instances[clientNode]; !ok {
					return false, fmt.Errorf("expected metric '%s' for all clients. got:\n%v", metric, results)
				}
			}
			return true, nil
		}, func(err error) {
			require.NoError(t, err)
		})

		// shorten the retry budget after the first metric is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of them
		retries = 15
	}
}

// query prometheus to verify that metrics are being collected
// from allocations
func (tc *MetricsTest) queryAllocMetrics(t *testing.T, workloads map[string]string) {
	// we start with a very long timeout here because it takes a while for
	// prometheus to be live and for jobs to initially register metrics.
	timeout := 60 * time.Second
	for jobName, metric := range workloads {
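		// e.g. nomad_client_allocs_cpu_user{exported_job="cpustress"}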
		query := fmt.Sprintf("%s{exported_job=\"%s\"}", metric, jobName)
		var results model.Vector
		var err error
		ok := assert.Eventually(t, func() bool {
			results, err = tc.promQuery(query)
			if err != nil {
				return false
			}

			// guard against an empty result set before indexing into it
			if len(results) == 0 {
				err = fmt.Errorf("expected metrics for query %q, got none", query)
				return false
			}

			// make sure we didn't just collect a bunch of zero metrics
			lastResult := results[len(results)-1]
			if !(float64(lastResult.Value) > 0.0) {
				err = fmt.Errorf("expected non-zero metrics, got: %v", results)
				return false
			}
			return true
		}, timeout, 1*time.Second)
		require.Truef(t, ok, "prometheus query failed (%s): %v", query, err)

		// shorten the timeout after the first workload is successfully
		// queried so that we don't hang the whole test run if something's
		// wrong with only one of the jobs
		timeout = 15 * time.Second
	}
}