github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/manager_metrics_test.go (about)

     1  package ruler
     2  
     3  import (
     4  	"bytes"
     5  	"testing"
     6  
     7  	"github.com/prometheus/client_golang/prometheus"
     8  	"github.com/prometheus/client_golang/prometheus/promauto"
     9  	"github.com/prometheus/client_golang/prometheus/testutil"
    10  	dto "github.com/prometheus/client_model/go"
    11  	"github.com/stretchr/testify/assert"
    12  	"github.com/stretchr/testify/require"
    13  )
    14  
    15  func TestManagerMetrics(t *testing.T) {
    16  	mainReg := prometheus.NewPedanticRegistry()
    17  
    18  	managerMetrics := NewManagerMetrics()
    19  	mainReg.MustRegister(managerMetrics)
    20  	managerMetrics.AddUserRegistry("user1", populateManager(1))
    21  	managerMetrics.AddUserRegistry("user2", populateManager(10))
    22  	managerMetrics.AddUserRegistry("user3", populateManager(100))
    23  
    24  	managerMetrics.AddUserRegistry("user4", populateManager(1000))
    25  	managerMetrics.RemoveUserRegistry("user4")
    26  
    27  	//noinspection ALL
    28  	err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
    29  # HELP cortex_prometheus_last_evaluation_samples The number of samples returned during the last rule group evaluation.
    30  # TYPE cortex_prometheus_last_evaluation_samples gauge
    31  cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user1"} 1000
    32  cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user2"} 10000
    33  cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 100000
    34  cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
    35  cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
    36  cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
    37  # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
    38  # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
    39  cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
    40  cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.9"} 1
    41  cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.99"} 1
    42  cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user1"} 1
    43  cortex_prometheus_rule_evaluation_duration_seconds_count{user="user1"} 1
    44  cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.5"} 10
    45  cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.9"} 10
    46  cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.99"} 10
    47  cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user2"} 10
    48  cortex_prometheus_rule_evaluation_duration_seconds_count{user="user2"} 1
    49  cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.5"} 100
    50  cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.9"} 100
    51  cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.99"} 100
    52  cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user3"} 100
    53  cortex_prometheus_rule_evaluation_duration_seconds_count{user="user3"} 1
    54  # HELP cortex_prometheus_rule_evaluation_failures_total The total number of rule evaluation failures.
    55  # TYPE cortex_prometheus_rule_evaluation_failures_total counter
    56  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user1"} 1
    57  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user2"} 10
    58  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user3"} 100
    59  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user1"} 1
    60  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user2"} 10
    61  cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user3"} 100
    62  # HELP cortex_prometheus_rule_evaluations_total The total number of rule evaluations.
    63  # TYPE cortex_prometheus_rule_evaluations_total counter
    64  cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user1"} 1
    65  cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user2"} 10
    66  cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user3"} 100
    67  cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user1"} 1
    68  cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user2"} 10
    69  cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user3"} 100
    70  # HELP cortex_prometheus_rule_group_duration_seconds The duration of rule group evaluations.
    71  # TYPE cortex_prometheus_rule_group_duration_seconds summary
    72  cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.01"} 1
    73  cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.05"} 1
    74  cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.5"} 1
    75  cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.9"} 1
    76  cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.99"} 1
    77  cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 1
    78  cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 1
    79  cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.01"} 10
    80  cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.05"} 10
    81  cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.5"} 10
    82  cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.9"} 10
    83  cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.99"} 10
    84  cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 10
    85  cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 1
    86  cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.01"} 100
    87  cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.05"} 100
    88  cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.5"} 100
    89  cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.9"} 100
    90  cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.99"} 100
    91  cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 100
    92  cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 1
    93  # HELP cortex_prometheus_rule_group_iterations_missed_total The total number of rule group evaluations missed due to slow rule group evaluation.
    94  # TYPE cortex_prometheus_rule_group_iterations_missed_total counter
    95  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user1"} 1
    96  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user2"} 10
    97  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user3"} 100
    98  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user1"} 1
    99  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user2"} 10
   100  cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user3"} 100
   101  # HELP cortex_prometheus_rule_group_iterations_total The total number of scheduled rule group evaluations, whether executed or missed.
   102  # TYPE cortex_prometheus_rule_group_iterations_total counter
   103  cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user1"} 1
   104  cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user2"} 10
   105  cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user3"} 100
   106  cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user1"} 1
   107  cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user2"} 10
   108  cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user3"} 100
   109  # HELP cortex_prometheus_rule_group_last_duration_seconds The duration of the last rule group evaluation.
   110  # TYPE cortex_prometheus_rule_group_last_duration_seconds gauge
   111  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user1"} 1000
   112  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user2"} 10000
   113  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user3"} 100000
   114  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user1"} 1000
   115  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user2"} 10000
   116  cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user3"} 100000
   117  # HELP cortex_prometheus_rule_group_last_evaluation_timestamp_seconds The timestamp of the last rule group evaluation in seconds.
   118  # TYPE cortex_prometheus_rule_group_last_evaluation_timestamp_seconds gauge
   119  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user1"} 1000
   120  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user2"} 10000
   121  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user3"} 100000
   122  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user1"} 1000
   123  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user2"} 10000
   124  cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user3"} 100000
   125  # HELP cortex_prometheus_rule_group_rules The number of rules.
   126  # TYPE cortex_prometheus_rule_group_rules gauge
   127  cortex_prometheus_rule_group_rules{rule_group="group_one",user="user1"} 1000
   128  cortex_prometheus_rule_group_rules{rule_group="group_one",user="user2"} 10000
   129  cortex_prometheus_rule_group_rules{rule_group="group_one",user="user3"} 100000
   130  cortex_prometheus_rule_group_rules{rule_group="group_two",user="user1"} 1000
   131  cortex_prometheus_rule_group_rules{rule_group="group_two",user="user2"} 10000
   132  cortex_prometheus_rule_group_rules{rule_group="group_two",user="user3"} 100000
   133  `))
   134  	require.NoError(t, err)
   135  }
   136  
   137  func populateManager(base float64) *prometheus.Registry {
   138  	r := prometheus.NewRegistry()
   139  
   140  	metrics := newGroupMetrics(r)
   141  
   142  	metrics.evalDuration.Observe(base)
   143  	metrics.iterationDuration.Observe(base)
   144  
   145  	metrics.iterationsScheduled.WithLabelValues("group_one").Add(base)
   146  	metrics.iterationsScheduled.WithLabelValues("group_two").Add(base)
   147  	metrics.iterationsMissed.WithLabelValues("group_one").Add(base)
   148  	metrics.iterationsMissed.WithLabelValues("group_two").Add(base)
   149  	metrics.evalTotal.WithLabelValues("group_one").Add(base)
   150  	metrics.evalTotal.WithLabelValues("group_two").Add(base)
   151  	metrics.evalFailures.WithLabelValues("group_one").Add(base)
   152  	metrics.evalFailures.WithLabelValues("group_two").Add(base)
   153  
   154  	metrics.groupLastEvalTime.WithLabelValues("group_one").Add(base * 1000)
   155  	metrics.groupLastEvalTime.WithLabelValues("group_two").Add(base * 1000)
   156  
   157  	metrics.groupLastDuration.WithLabelValues("group_one").Add(base * 1000)
   158  	metrics.groupLastDuration.WithLabelValues("group_two").Add(base * 1000)
   159  
   160  	metrics.groupRules.WithLabelValues("group_one").Add(base * 1000)
   161  	metrics.groupRules.WithLabelValues("group_two").Add(base * 1000)
   162  
   163  	metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000)
   164  	metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000)
   165  
   166  	return r
   167  }
   168  
   169  // Copied from github.com/prometheus/rules/manager.go
   170  type groupMetrics struct {
   171  	evalDuration         prometheus.Summary
   172  	iterationDuration    prometheus.Summary
   173  	iterationsMissed     *prometheus.CounterVec
   174  	iterationsScheduled  *prometheus.CounterVec
   175  	evalTotal            *prometheus.CounterVec
   176  	evalFailures         *prometheus.CounterVec
   177  	groupInterval        *prometheus.GaugeVec
   178  	groupLastEvalTime    *prometheus.GaugeVec
   179  	groupLastDuration    *prometheus.GaugeVec
   180  	groupRules           *prometheus.GaugeVec
   181  	groupLastEvalSamples *prometheus.GaugeVec
   182  }
   183  
   184  func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
   185  	m := &groupMetrics{
   186  		evalDuration: promauto.With(r).NewSummary(
   187  			prometheus.SummaryOpts{
   188  				Name:       "prometheus_rule_evaluation_duration_seconds",
   189  				Help:       "The duration for a rule to execute.",
   190  				Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
   191  			}),
   192  		iterationDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
   193  			Name:       "prometheus_rule_group_duration_seconds",
   194  			Help:       "The duration of rule group evaluations.",
   195  			Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
   196  		}),
   197  		iterationsMissed: promauto.With(r).NewCounterVec(
   198  			prometheus.CounterOpts{
   199  				Name: "prometheus_rule_group_iterations_missed_total",
   200  				Help: "The total number of rule group evaluations missed due to slow rule group evaluation.",
   201  			},
   202  			[]string{"rule_group"},
   203  		),
   204  		iterationsScheduled: promauto.With(r).NewCounterVec(
   205  			prometheus.CounterOpts{
   206  				Name: "prometheus_rule_group_iterations_total",
   207  				Help: "The total number of scheduled rule group evaluations, whether executed or missed.",
   208  			},
   209  			[]string{"rule_group"},
   210  		),
   211  		evalTotal: promauto.With(r).NewCounterVec(
   212  			prometheus.CounterOpts{
   213  				Name: "prometheus_rule_evaluations_total",
   214  				Help: "The total number of rule evaluations.",
   215  			},
   216  			[]string{"rule_group"},
   217  		),
   218  		evalFailures: promauto.With(r).NewCounterVec(
   219  			prometheus.CounterOpts{
   220  				Name: "prometheus_rule_evaluation_failures_total",
   221  				Help: "The total number of rule evaluation failures.",
   222  			},
   223  			[]string{"rule_group"},
   224  		),
   225  		groupInterval: promauto.With(r).NewGaugeVec(
   226  			prometheus.GaugeOpts{
   227  				Name: "prometheus_rule_group_interval_seconds",
   228  				Help: "The interval of a rule group.",
   229  			},
   230  			[]string{"rule_group"},
   231  		),
   232  		groupLastEvalTime: promauto.With(r).NewGaugeVec(
   233  			prometheus.GaugeOpts{
   234  				Name: "prometheus_rule_group_last_evaluation_timestamp_seconds",
   235  				Help: "The timestamp of the last rule group evaluation in seconds.",
   236  			},
   237  			[]string{"rule_group"},
   238  		),
   239  		groupLastDuration: promauto.With(r).NewGaugeVec(
   240  			prometheus.GaugeOpts{
   241  				Name: "prometheus_rule_group_last_duration_seconds",
   242  				Help: "The duration of the last rule group evaluation.",
   243  			},
   244  			[]string{"rule_group"},
   245  		),
   246  		groupRules: promauto.With(r).NewGaugeVec(
   247  			prometheus.GaugeOpts{
   248  				Name: "prometheus_rule_group_rules",
   249  				Help: "The number of rules.",
   250  			},
   251  			[]string{"rule_group"},
   252  		),
   253  		groupLastEvalSamples: promauto.With(r).NewGaugeVec(
   254  			prometheus.GaugeOpts{
   255  				Name: "prometheus_rule_group_last_evaluation_samples",
   256  				Help: "The number of samples returned during the last rule group evaluation.",
   257  			},
   258  			[]string{"rule_group"},
   259  		),
   260  	}
   261  
   262  	return m
   263  }
   264  
   265  func TestMetricsArePerUser(t *testing.T) {
   266  	mainReg := prometheus.NewPedanticRegistry()
   267  
   268  	managerMetrics := NewManagerMetrics()
   269  	mainReg.MustRegister(managerMetrics)
   270  	managerMetrics.AddUserRegistry("user1", populateManager(1))
   271  	managerMetrics.AddUserRegistry("user2", populateManager(10))
   272  	managerMetrics.AddUserRegistry("user3", populateManager(100))
   273  
   274  	ch := make(chan prometheus.Metric)
   275  
   276  	defer func() {
   277  		// drain the channel, so that collecting gouroutine can stop.
   278  		// This is useful if test fails.
   279  		for range ch {
   280  		}
   281  	}()
   282  
   283  	go func() {
   284  		managerMetrics.Collect(ch)
   285  		close(ch)
   286  	}()
   287  
   288  	for m := range ch {
   289  		desc := m.Desc()
   290  
   291  		dtoM := &dto.Metric{}
   292  		err := m.Write(dtoM)
   293  
   294  		require.NoError(t, err)
   295  
   296  		foundUserLabel := false
   297  		for _, l := range dtoM.Label {
   298  			if l.GetName() == "user" {
   299  				foundUserLabel = true
   300  				break
   301  			}
   302  		}
   303  
   304  		assert.True(t, foundUserLabel, "user label not found for metric %s", desc.String())
   305  	}
   306  }