github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/manager_metrics_test.go (about) 1 package ruler 2 3 import ( 4 "bytes" 5 "testing" 6 7 "github.com/prometheus/client_golang/prometheus" 8 "github.com/prometheus/client_golang/prometheus/promauto" 9 "github.com/prometheus/client_golang/prometheus/testutil" 10 dto "github.com/prometheus/client_model/go" 11 "github.com/stretchr/testify/assert" 12 "github.com/stretchr/testify/require" 13 ) 14 15 func TestManagerMetrics(t *testing.T) { 16 mainReg := prometheus.NewPedanticRegistry() 17 18 managerMetrics := NewManagerMetrics() 19 mainReg.MustRegister(managerMetrics) 20 managerMetrics.AddUserRegistry("user1", populateManager(1)) 21 managerMetrics.AddUserRegistry("user2", populateManager(10)) 22 managerMetrics.AddUserRegistry("user3", populateManager(100)) 23 24 managerMetrics.AddUserRegistry("user4", populateManager(1000)) 25 managerMetrics.RemoveUserRegistry("user4") 26 27 //noinspection ALL 28 err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` 29 # HELP cortex_prometheus_last_evaluation_samples The number of samples returned during the last rule group evaluation. 30 # TYPE cortex_prometheus_last_evaluation_samples gauge 31 cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user1"} 1000 32 cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user2"} 10000 33 cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 100000 34 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000 35 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000 36 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000 37 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. 38 # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary 39 cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 40 cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.9"} 1 41 cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.99"} 1 42 cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user1"} 1 43 cortex_prometheus_rule_evaluation_duration_seconds_count{user="user1"} 1 44 cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.5"} 10 45 cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.9"} 10 46 cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.99"} 10 47 cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user2"} 10 48 cortex_prometheus_rule_evaluation_duration_seconds_count{user="user2"} 1 49 cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.5"} 100 50 cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.9"} 100 51 cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.99"} 100 52 cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user3"} 100 53 cortex_prometheus_rule_evaluation_duration_seconds_count{user="user3"} 1 54 # HELP cortex_prometheus_rule_evaluation_failures_total The total number of rule evaluation failures. 55 # TYPE cortex_prometheus_rule_evaluation_failures_total counter 56 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user1"} 1 57 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user2"} 10 58 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_one",user="user3"} 100 59 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user1"} 1 60 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user2"} 10 61 cortex_prometheus_rule_evaluation_failures_total{rule_group="group_two",user="user3"} 100 62 # HELP cortex_prometheus_rule_evaluations_total The total number of rule evaluations. 63 # TYPE cortex_prometheus_rule_evaluations_total counter 64 cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user1"} 1 65 cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user2"} 10 66 cortex_prometheus_rule_evaluations_total{rule_group="group_one",user="user3"} 100 67 cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user1"} 1 68 cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user2"} 10 69 cortex_prometheus_rule_evaluations_total{rule_group="group_two",user="user3"} 100 70 # HELP cortex_prometheus_rule_group_duration_seconds The duration of rule group evaluations. 71 # TYPE cortex_prometheus_rule_group_duration_seconds summary 72 cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.01"} 1 73 cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.05"} 1 74 cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.5"} 1 75 cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.9"} 1 76 cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.99"} 1 77 cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 1 78 cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 1 79 cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.01"} 10 80 cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.05"} 10 81 cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.5"} 10 82 cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.9"} 10 83 cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.99"} 10 84 cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 10 85 cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 1 86 cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.01"} 100 87 cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.05"} 100 88 cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.5"} 100 89 cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.9"} 100 90 cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.99"} 100 91 cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 100 92 cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 1 93 # HELP cortex_prometheus_rule_group_iterations_missed_total The total number of rule group evaluations missed due to slow rule group evaluation. 94 # TYPE cortex_prometheus_rule_group_iterations_missed_total counter 95 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user1"} 1 96 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user2"} 10 97 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_one",user="user3"} 100 98 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user1"} 1 99 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user2"} 10 100 cortex_prometheus_rule_group_iterations_missed_total{rule_group="group_two",user="user3"} 100 101 # HELP cortex_prometheus_rule_group_iterations_total The total number of scheduled rule group evaluations, whether executed or missed. 102 # TYPE cortex_prometheus_rule_group_iterations_total counter 103 cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user1"} 1 104 cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user2"} 10 105 cortex_prometheus_rule_group_iterations_total{rule_group="group_one",user="user3"} 100 106 cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user1"} 1 107 cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user2"} 10 108 cortex_prometheus_rule_group_iterations_total{rule_group="group_two",user="user3"} 100 109 # HELP cortex_prometheus_rule_group_last_duration_seconds The duration of the last rule group evaluation. 110 # TYPE cortex_prometheus_rule_group_last_duration_seconds gauge 111 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user1"} 1000 112 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user2"} 10000 113 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_one",user="user3"} 100000 114 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user1"} 1000 115 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user2"} 10000 116 cortex_prometheus_rule_group_last_duration_seconds{rule_group="group_two",user="user3"} 100000 117 # HELP cortex_prometheus_rule_group_last_evaluation_timestamp_seconds The timestamp of the last rule group evaluation in seconds. 118 # TYPE cortex_prometheus_rule_group_last_evaluation_timestamp_seconds gauge 119 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user1"} 1000 120 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user2"} 10000 121 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_one",user="user3"} 100000 122 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user1"} 1000 123 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user2"} 10000 124 cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{rule_group="group_two",user="user3"} 100000 125 # HELP cortex_prometheus_rule_group_rules The number of rules. 126 # TYPE cortex_prometheus_rule_group_rules gauge 127 cortex_prometheus_rule_group_rules{rule_group="group_one",user="user1"} 1000 128 cortex_prometheus_rule_group_rules{rule_group="group_one",user="user2"} 10000 129 cortex_prometheus_rule_group_rules{rule_group="group_one",user="user3"} 100000 130 cortex_prometheus_rule_group_rules{rule_group="group_two",user="user1"} 1000 131 cortex_prometheus_rule_group_rules{rule_group="group_two",user="user2"} 10000 132 cortex_prometheus_rule_group_rules{rule_group="group_two",user="user3"} 100000 133 `)) 134 require.NoError(t, err) 135 } 136 137 func populateManager(base float64) *prometheus.Registry { 138 r := prometheus.NewRegistry() 139 140 metrics := newGroupMetrics(r) 141 142 metrics.evalDuration.Observe(base) 143 metrics.iterationDuration.Observe(base) 144 145 metrics.iterationsScheduled.WithLabelValues("group_one").Add(base) 146 metrics.iterationsScheduled.WithLabelValues("group_two").Add(base) 147 metrics.iterationsMissed.WithLabelValues("group_one").Add(base) 148 metrics.iterationsMissed.WithLabelValues("group_two").Add(base) 149 metrics.evalTotal.WithLabelValues("group_one").Add(base) 150 metrics.evalTotal.WithLabelValues("group_two").Add(base) 151 metrics.evalFailures.WithLabelValues("group_one").Add(base) 152 metrics.evalFailures.WithLabelValues("group_two").Add(base) 153 154 metrics.groupLastEvalTime.WithLabelValues("group_one").Add(base * 1000) 155 metrics.groupLastEvalTime.WithLabelValues("group_two").Add(base * 1000) 156 157 metrics.groupLastDuration.WithLabelValues("group_one").Add(base * 1000) 158 metrics.groupLastDuration.WithLabelValues("group_two").Add(base * 1000) 159 160 metrics.groupRules.WithLabelValues("group_one").Add(base * 1000) 161 metrics.groupRules.WithLabelValues("group_two").Add(base * 1000) 162 163 metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000) 164 metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000) 165 166 return r 167 } 168 169 // Copied from github.com/prometheus/rules/manager.go 170 type groupMetrics struct { 171 evalDuration prometheus.Summary 172 iterationDuration prometheus.Summary 173 iterationsMissed *prometheus.CounterVec 174 iterationsScheduled *prometheus.CounterVec 175 evalTotal *prometheus.CounterVec 176 evalFailures *prometheus.CounterVec 177 groupInterval *prometheus.GaugeVec 178 groupLastEvalTime *prometheus.GaugeVec 179 groupLastDuration *prometheus.GaugeVec 180 groupRules *prometheus.GaugeVec 181 groupLastEvalSamples *prometheus.GaugeVec 182 } 183 184 func newGroupMetrics(r prometheus.Registerer) *groupMetrics { 185 m := &groupMetrics{ 186 evalDuration: promauto.With(r).NewSummary( 187 prometheus.SummaryOpts{ 188 Name: "prometheus_rule_evaluation_duration_seconds", 189 Help: "The duration for a rule to execute.", 190 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 191 }), 192 iterationDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{ 193 Name: "prometheus_rule_group_duration_seconds", 194 Help: "The duration of rule group evaluations.", 195 Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, 196 }), 197 iterationsMissed: promauto.With(r).NewCounterVec( 198 prometheus.CounterOpts{ 199 Name: "prometheus_rule_group_iterations_missed_total", 200 Help: "The total number of rule group evaluations missed due to slow rule group evaluation.", 201 }, 202 []string{"rule_group"}, 203 ), 204 iterationsScheduled: promauto.With(r).NewCounterVec( 205 prometheus.CounterOpts{ 206 Name: "prometheus_rule_group_iterations_total", 207 Help: "The total number of scheduled rule group evaluations, whether executed or missed.", 208 }, 209 []string{"rule_group"}, 210 ), 211 evalTotal: promauto.With(r).NewCounterVec( 212 prometheus.CounterOpts{ 213 Name: "prometheus_rule_evaluations_total", 214 Help: "The total number of rule evaluations.", 215 }, 216 []string{"rule_group"}, 217 ), 218 evalFailures: promauto.With(r).NewCounterVec( 219 prometheus.CounterOpts{ 220 Name: "prometheus_rule_evaluation_failures_total", 221 Help: "The total number of rule evaluation failures.", 222 }, 223 []string{"rule_group"}, 224 ), 225 groupInterval: promauto.With(r).NewGaugeVec( 226 prometheus.GaugeOpts{ 227 Name: "prometheus_rule_group_interval_seconds", 228 Help: "The interval of a rule group.", 229 }, 230 []string{"rule_group"}, 231 ), 232 groupLastEvalTime: promauto.With(r).NewGaugeVec( 233 prometheus.GaugeOpts{ 234 Name: "prometheus_rule_group_last_evaluation_timestamp_seconds", 235 Help: "The timestamp of the last rule group evaluation in seconds.", 236 }, 237 []string{"rule_group"}, 238 ), 239 groupLastDuration: promauto.With(r).NewGaugeVec( 240 prometheus.GaugeOpts{ 241 Name: "prometheus_rule_group_last_duration_seconds", 242 Help: "The duration of the last rule group evaluation.", 243 }, 244 []string{"rule_group"}, 245 ), 246 groupRules: promauto.With(r).NewGaugeVec( 247 prometheus.GaugeOpts{ 248 Name: "prometheus_rule_group_rules", 249 Help: "The number of rules.", 250 }, 251 []string{"rule_group"}, 252 ), 253 groupLastEvalSamples: promauto.With(r).NewGaugeVec( 254 prometheus.GaugeOpts{ 255 Name: "prometheus_rule_group_last_evaluation_samples", 256 Help: "The number of samples returned during the last rule group evaluation.", 257 }, 258 []string{"rule_group"}, 259 ), 260 } 261 262 return m 263 } 264 265 func TestMetricsArePerUser(t *testing.T) { 266 mainReg := prometheus.NewPedanticRegistry() 267 268 managerMetrics := NewManagerMetrics() 269 mainReg.MustRegister(managerMetrics) 270 managerMetrics.AddUserRegistry("user1", populateManager(1)) 271 managerMetrics.AddUserRegistry("user2", populateManager(10)) 272 managerMetrics.AddUserRegistry("user3", populateManager(100)) 273 274 ch := make(chan prometheus.Metric) 275 276 defer func() { 277 // drain the channel, so that collecting gouroutine can stop. 278 // This is useful if test fails. 279 for range ch { 280 } 281 }() 282 283 go func() { 284 managerMetrics.Collect(ch) 285 close(ch) 286 }() 287 288 for m := range ch { 289 desc := m.Desc() 290 291 dtoM := &dto.Metric{} 292 err := m.Write(dtoM) 293 294 require.NoError(t, err) 295 296 foundUserLabel := false 297 for _, l := range dtoM.Label { 298 if l.GetName() == "user" { 299 foundUserLabel = true 300 break 301 } 302 } 303 304 assert.True(t, foundUserLabel, "user label not found for metric %s", desc.String()) 305 } 306 }