github.com/thanos-io/thanos@v0.32.5/test/e2e/rule_test.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package e2e_test

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/efficientgo/e2e"
	e2emon "github.com/efficientgo/e2e/monitoring"
	common_cfg "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/discovery/targetgroup"
	"github.com/thanos-io/thanos/pkg/errors"
	"gopkg.in/yaml.v2"

	"github.com/efficientgo/core/testutil"
	"github.com/thanos-io/thanos/pkg/alert"
	"github.com/thanos-io/thanos/pkg/httpconfig"
	"github.com/thanos-io/thanos/pkg/promclient"
	"github.com/thanos-io/thanos/pkg/rules/rulespb"
	"github.com/thanos-io/thanos/pkg/runutil"
	"github.com/thanos-io/thanos/test/e2e/e2ethanos"
)

const (
	testAlertRuleAbortOnPartialResponse = `
groups:
- name: example_abort
  interval: 1s
  # Abort should be the default: partial_response_strategy: "ABORT"
  rules:
  - alert: TestAlert_AbortOnPartialResponse
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    labels:
      severity: page
    annotations:
      summary: "I always complain, but I don't allow partial response in query."
`
	testAlertRuleWarnOnPartialResponse = `
groups:
- name: example_warn
  interval: 1s
  partial_response_strategy: "WARN"
  rules:
  - alert: TestAlert_WarnOnPartialResponse
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    labels:
      severity: page
    annotations:
      summary: "I always complain and allow partial response in query."
`
	testAlertRuleAddedLaterWebHandler = `
groups:
- name: example
  interval: 1s
  partial_response_strategy: "WARN"
  rules:
  - alert: TestAlert_HasBeenLoadedViaWebHandler
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    labels:
      severity: page
    annotations:
      summary: "I always complain and I have been loaded via /-/reload."
`
	testAlertRuleAddedLaterSignal = `
groups:
- name: example
  interval: 1s
  partial_response_strategy: "WARN"
  rules:
  - alert: TestAlert_HasBeenLoadedViaWebHandler
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    labels:
      severity: page
    annotations:
      summary: "I always complain and I have been loaded via sighup signal."
- name: example2
  interval: 1s
  partial_response_strategy: "WARN"
  rules:
  - alert: TestAlert_HasBeenLoadedViaWebHandler
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    labels:
      severity: page
    annotations:
      summary: "I always complain and I have been loaded via sighup signal."
`
	testAlertRuleWithLimit = `
groups:
- name: example_with_limit
  interval: 1s
  partial_response_strategy: "WARN"
  limit: 1
  rules:
  - alert: TestAlert_WithLimit
    expr: 'promhttp_metric_handler_requests_total' # It has more than one label.
    labels:
      severity: page
    annotations:
      summary: "with limit"
`

	testRuleRecordAbsentMetric = `
groups:
- name: example_record_rules
  interval: 1s
  rules:
  - record: test_absent_metric
    expr: absent(nonexistent{job='thanos-receive'})
`

	testAlertRuleHoldDuration = `
groups:
- name: example_rule_hold_duration
  interval: 1s
  rules:
  - alert: TestAlert_RuleHoldDuration
    # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    expr: absent(some_metric)
    for: 2s
    labels:
      severity: page
    annotations:
      summary: "I always complain and allow partial response in query."
`

	amTimeout = model.Duration(10 * time.Second)
)

type rulesResp struct {
	Status string
	Data   *rulespb.RuleGroups
}

func createRuleFile(t *testing.T, path, content string) {
	t.Helper()
	err := os.WriteFile(path, []byte(content), 0666)
	testutil.Ok(t, err)
}

func createRuleFiles(t *testing.T, dir string) {
	t.Helper()

	for i, rule := range []string{testAlertRuleAbortOnPartialResponse, testAlertRuleWarnOnPartialResponse} {
		createRuleFile(t, filepath.Join(dir, fmt.Sprintf("rules-%d.yaml", i)), rule)
	}
}

func reloadRulesHTTP(t *testing.T, ctx context.Context, endpoint string) {
	req, err := http.NewRequestWithContext(ctx, "POST", "http://"+endpoint+"/-/reload", io.NopCloser(bytes.NewReader(nil)))
	testutil.Ok(t, err)
	resp, err := http.DefaultClient.Do(req)
	testutil.Ok(t, err)
	defer resp.Body.Close()
	testutil.Equals(t, 200, resp.StatusCode)
}

func reloadRulesSignal(t *testing.T, r *e2emon.InstrumentedRunnable) {
	// Send SIGHUP (signal 1) to PID 1, i.e. the Thanos process running inside the container.
	c := e2e.NewCommand("kill", "-1", "1")
	testutil.Ok(t, r.Exec(c))
}

func checkReloadSuccessful(t *testing.T, ctx context.Context, endpoint string, expectedRulegroupCount int) {
	data := rulesResp{}
	errCount := 0

	testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() error {
		req, err := http.NewRequestWithContext(ctx, "GET", "http://"+endpoint+"/api/v1/rules", io.NopCloser(bytes.NewReader(nil)))
		if err != nil {
			errCount++
			return err
		}

		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			errCount++
			return err
		}

		if resp.StatusCode != 200 {
			errCount++
			return errors.Newf("statuscode is not 200, got %d", resp.StatusCode)
		}

		body, err := io.ReadAll(resp.Body)
		if err != nil {
			errCount++
			return errors.Wrapf(err, "error reading body")
		}

		if err := resp.Body.Close(); err != nil {
			errCount++
			return err
		}

		if err := json.Unmarshal(body, &data); err != nil {
			errCount++
			return errors.Wrapf(err, "error unmarshaling body")
		}

		if data.Status != "success" {
			errCount++
			return errors.Newf("response status is not success, got %s", data.Status)
		}

		if len(data.Data.Groups) == expectedRulegroupCount {
			return nil
		}

		errCount++
		return errors.Newf("different number of rulegroups: expected %d, got %d", expectedRulegroupCount, len(data.Data.Groups))
	}))

	testutil.Assert(t, len(data.Data.Groups) == expectedRulegroupCount, fmt.Sprintf("expected there to be %d rule groups but got %d. encountered %d errors", expectedRulegroupCount, len(data.Data.Groups), errCount))
}
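
// For orientation, the /api/v1/rules payload that rulesResp decodes above follows the
// Prometheus rules API shape mirrored by Thanos Ruler. A trimmed illustration (not a
// literal fixture captured from this environment; only "status" and "data.groups" are
// actually consumed by the helpers in this file) looks roughly like:
//
//	{
//	  "status": "success",
//	  "data": {
//	    "groups": [
//	      {
//	        "name": "example_warn",
//	        "file": ".../rules-1.yaml",
//	        "rules": [...],
//	        "evaluationTime": 0.001,
//	        "lastEvaluation": "2023-01-01T00:00:00Z"
//	      }
//	    ]
//	  }
//	}
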
func rulegroupCorrectData(t *testing.T, ctx context.Context, endpoint string) {
	req, err := http.NewRequestWithContext(ctx, "GET", "http://"+endpoint+"/api/v1/rules", io.NopCloser(bytes.NewReader(nil)))
	testutil.Ok(t, err)
	resp, err := http.DefaultClient.Do(req)
	testutil.Ok(t, err)
	testutil.Equals(t, 200, resp.StatusCode)
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	testutil.Ok(t, err)

	var data = rulesResp{}

	testutil.Ok(t, json.Unmarshal(body, &data))
	testutil.Equals(t, "success", data.Status)

	testutil.Assert(t, len(data.Data.Groups) > 0, "expected there to be some rule groups")

	for _, g := range data.Data.Groups {
		testutil.Assert(t, g.EvaluationDurationSeconds > 0, "expected it to take more than zero seconds to evaluate")
		testutil.Assert(t, !g.LastEvaluation.IsZero(), "expected the rule group to be evaluated at least once")
	}
}

func writeTargets(t *testing.T, path string, addrs ...string) {
	t.Helper()

	var tgs []model.LabelSet
	for _, a := range addrs {
		tgs = append(
			tgs,
			model.LabelSet{
				model.AddressLabel: model.LabelValue(a),
			},
		)
	}
	b, err := yaml.Marshal([]*targetgroup.Group{{Targets: tgs}})
	testutil.Ok(t, err)

	// Write to a temporary file first and rename, so file-SD never observes a partially written file.
	testutil.Ok(t, os.WriteFile(path+".tmp", b, 0660))
	testutil.Ok(t, os.Rename(path+".tmp", path))
}
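
// For illustration, a call such as
//
//	writeTargets(t, filepath.Join(dir, "targets.yaml"), "alertmanager-1:8080")
//
// (with a hypothetical address) is expected to render file-SD content of roughly this
// shape; the exact layout is whatever targetgroup.Group's YAML marshalling emits:
//
//	- targets:
//	  - alertmanager-1:8080
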
func TestRule(t *testing.T) {
	t.Parallel()

	e, err := e2e.NewDockerEnvironment("e2e-test-rule")
	testutil.Ok(t, err)
	t.Cleanup(e2ethanos.CleanScenario(t, e))

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	t.Cleanup(cancel)

	am1 := e2ethanos.NewAlertmanager(e, "1")
	am2 := e2ethanos.NewAlertmanager(e, "2")
	testutil.Ok(t, e2e.StartAndWaitReady(am1, am2))

	rFuture := e2ethanos.NewRulerBuilder(e, "1")

	amTargetsSubDir := filepath.Join("rules_am_targets")
	testutil.Ok(t, os.MkdirAll(filepath.Join(rFuture.Dir(), amTargetsSubDir), os.ModePerm))
	queryTargetsSubDir := filepath.Join("rules_query_targets")
	testutil.Ok(t, os.MkdirAll(filepath.Join(rFuture.Dir(), queryTargetsSubDir), os.ModePerm))

	rulesSubDir := filepath.Join("rules")
	rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
	testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
	createRuleFiles(t, rulesPath)

	r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				FileSDConfigs: []httpconfig.FileSDConfig{
					{
						// FileSD which will be used to dynamically discover and register am1.
						Files:           []string{filepath.Join(rFuture.InternalDir(), amTargetsSubDir, "*.yaml")},
						RefreshInterval: model.Duration(time.Second),
					},
				},
				StaticAddresses: []string{
					am2.InternalEndpoint("http"),
				},
				Scheme: "http",
			},
			Timeout:    amTimeout,
			APIVersion: alert.APIv1,
		},
	}).InitTSDB(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				// We test statically addressed query endpoints in other tests. Focus on FileSD here.
				FileSDConfigs: []httpconfig.FileSDConfig{
					{
						// FileSD which will be used to dynamically discover and register the querier q.
						Files:           []string{filepath.Join(rFuture.InternalDir(), queryTargetsSubDir, "*.yaml")},
						RefreshInterval: model.Duration(time.Second),
					},
				},
				Scheme: "http",
			},
		},
	})
	testutil.Ok(t, e2e.StartAndWaitReady(r))

	q := e2ethanos.NewQuerierBuilder(e, "1", r.InternalEndpoint("grpc")).Init()
	testutil.Ok(t, e2e.StartAndWaitReady(q))

	t.Run("no query configured", func(t *testing.T) {
		// Check for a few evaluations; with no query API configured, all of them should fail.
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(10), "prometheus_rule_evaluations_total"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.EqualsAmongTwo, "prometheus_rule_evaluations_total", "prometheus_rule_evaluation_failures_total"))

		// No alerts sent.
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(0), "thanos_alert_sender_alerts_dropped_total"))
	})

	var currentFailures float64
	t.Run("attach query", func(t *testing.T) {
		// Attach querier to target files.
		writeTargets(t, filepath.Join(rFuture.Dir(), queryTargetsSubDir, "targets.yaml"), q.InternalEndpoint("http"))

		testutil.Ok(t, r.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{"thanos_rule_query_apis_dns_provider_results"}, e2emon.WaitMissingMetrics()))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))

		var currentVal float64
		testutil.Ok(t, r.WaitSumMetrics(func(sums ...float64) bool {
			currentVal = sums[0]
			currentFailures = sums[1]
			return true
		}, "prometheus_rule_evaluations_total", "prometheus_rule_evaluation_failures_total"))

		// Check for a few more evaluations; with the querier attached, none of them should fail.
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(currentVal+4), "prometheus_rule_evaluations_total"))
		// No new failures.
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))

		// Alerts sent.
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(0), "thanos_alert_sender_alerts_dropped_total"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(4), "thanos_alert_sender_alerts_sent_total"))

		// Alerts received.
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(4), "alertmanager_alerts_received_total"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))

		// am1 is not connected, so it should not receive anything.
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts"))
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_received_total"))
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
	})
	t.Run("attach am1", func(t *testing.T) {
		// Attach am1 to target files.
		writeTargets(t, filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml"), am1.InternalEndpoint("http"))

		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(2), "thanos_rule_alertmanagers_dns_provider_results"))

		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))

		var currentVal float64
		testutil.Ok(t, am2.WaitSumMetrics(func(sums ...float64) bool {
			currentVal = sums[0]
			return true
		}, "alertmanager_alerts_received_total"))

		// Alerts received by both am1 and am2.
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(currentVal+4), "alertmanager_alerts_received_total"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))

		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Greater(4), "alertmanager_alerts_received_total"))
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
	})

	t.Run("am1 drops again", func(t *testing.T) {
		testutil.Ok(t, os.RemoveAll(filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml")))

		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))

		var currentValAm1 float64
		testutil.Ok(t, am1.WaitSumMetrics(func(sums ...float64) bool {
			currentValAm1 = sums[0]
			return true
		}, "alertmanager_alerts_received_total"))

		var currentValAm2 float64
		testutil.Ok(t, am2.WaitSumMetrics(func(sums ...float64) bool {
			currentValAm2 = sums[0]
			return true
		}, "alertmanager_alerts_received_total"))

		// am2 keeps receiving alerts.
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(currentValAm2+4), "alertmanager_alerts_received_total"))
		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))

		// am1 should not receive any more alerts.
		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(currentValAm1), "alertmanager_alerts_received_total"))
	})

	t.Run("duplicate am", func(t *testing.T) {
		// am2 is already registered in static addresses.
		writeTargets(t, filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml"), am2.InternalEndpoint("http"))

		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))
	})

	t.Run("rule groups have last evaluation and evaluation duration set", func(t *testing.T) {
		rulegroupCorrectData(t, ctx, r.Endpoint("http"))
	})

	t.Run("signal reload works", func(t *testing.T) {
		// Add a new rule and reload via SIGHUP.
		createRuleFile(t, filepath.Join(rulesPath, "newrule.yaml"), testAlertRuleAddedLaterSignal)
		reloadRulesSignal(t, r)
		checkReloadSuccessful(t, ctx, r.Endpoint("http"), 4)
	})

	t.Run("http reload works", func(t *testing.T) {
		// Add a new rule via /-/reload.
		createRuleFile(t, filepath.Join(rulesPath, "newrule.yaml"), testAlertRuleAddedLaterWebHandler)
		reloadRulesHTTP(t, ctx, r.Endpoint("http"))
		checkReloadSuccessful(t, ctx, r.Endpoint("http"), 3)
	})

	t.Run("query alerts", func(t *testing.T) {
		queryAndAssertSeries(t, ctx, q.Endpoint("http"), func() string { return "ALERTS" }, time.Now, promclient.QueryOptions{
			Deduplicate: false,
		}, []model.Metric{
			{
				"__name__":   "ALERTS",
				"severity":   "page",
				"alertname":  "TestAlert_AbortOnPartialResponse",
				"alertstate": "firing",
				"replica":    "1",
			},
			{
				"__name__":   "ALERTS",
				"severity":   "page",
				"alertname":  "TestAlert_HasBeenLoadedViaWebHandler",
				"alertstate": "firing",
				"replica":    "1",
			},
			{
				"__name__":   "ALERTS",
				"severity":   "page",
				"alertname":  "TestAlert_WarnOnPartialResponse",
				"alertstate": "firing",
				"replica":    "1",
			},
		})

		expAlertLabels := []model.LabelSet{
			{
				"severity":  "page",
				"alertname": "TestAlert_AbortOnPartialResponse",
				"replica":   "1",
			},
			{
				"severity":  "page",
				"alertname": "TestAlert_HasBeenLoadedViaWebHandler",
				"replica":   "1",
			},
			{
				"severity":  "page",
				"alertname": "TestAlert_WarnOnPartialResponse",
				"replica":   "1",
			},
		}

		alrts, err := promclient.NewDefaultClient().AlertmanagerAlerts(ctx, urlParse(t, "http://"+am2.Endpoint("http")))
		testutil.Ok(t, err)

		testutil.Equals(t, len(expAlertLabels), len(alrts))
		for i, a := range alrts {
			testutil.Assert(t, a.Labels.Equal(expAlertLabels[i]), "unexpected labels %s", a.Labels)
		}
	})
}

// TestRule_CanRemoteWriteData checks that Thanos Ruler can be run in stateless mode
// where it remote_writes rule evaluations to a Prometheus remote-write endpoint (typically
// a Thanos Receiver).
func TestRule_CanRemoteWriteData(t *testing.T) {
	t.Parallel()

	e, err := e2e.NewDockerEnvironment("rule-rw")
	testutil.Ok(t, err)
	t.Cleanup(e2ethanos.CleanScenario(t, e))

	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	t.Cleanup(cancel)

	rFuture := e2ethanos.NewRulerBuilder(e, "1")
	rulesSubDir := "rules"
	rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
	testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))

	for i, rule := range []string{testRuleRecordAbsentMetric, testAlertRuleWarnOnPartialResponse} {
		createRuleFile(t, filepath.Join(rulesPath, fmt.Sprintf("rules-%d.yaml", i)), rule)
	}

	am := e2ethanos.NewAlertmanager(e, "1")
	testutil.Ok(t, e2e.StartAndWaitReady(am))

	receiver := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init()
	testutil.Ok(t, e2e.StartAndWaitReady(receiver))
	rwURL := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver.InternalEndpoint("remote-write")))

	receiver2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled().Init()
	testutil.Ok(t, e2e.StartAndWaitReady(receiver2))
	rwURL2 := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver2.InternalEndpoint("remote-write")))

	q := e2ethanos.NewQuerierBuilder(e, "1", receiver.InternalEndpoint("grpc"), receiver2.InternalEndpoint("grpc")).Init()
	testutil.Ok(t, e2e.StartAndWaitReady(q))

	r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				StaticAddresses: []string{
					am.InternalEndpoint("http"),
				},
				Scheme: "http",
			},
			Timeout:    amTimeout,
			APIVersion: alert.APIv1,
		},
	}).InitStateless(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				StaticAddresses: []string{
					q.InternalEndpoint("http"),
				},
				Scheme: "http",
			},
		},
	}, []*config.RemoteWriteConfig{
		{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
		{URL: &common_cfg.URL{URL: rwURL2}, Name: "thanos-receiver2"},
	})
	testutil.Ok(t, e2e.StartAndWaitReady(r))

	// Wait until remote write samples are written to receivers successfully.
	testutil.Ok(t, r.WaitSumMetricsWithOptions(e2emon.GreaterOrEqual(1), []string{"prometheus_remote_storage_samples_total"}, e2emon.WaitMissingMetrics()))

	t.Run("can fetch remote-written samples from receiver", func(t *testing.T) {
		testRecordedSamples := func() string { return "test_absent_metric" }
		queryAndAssertSeries(t, ctx, q.Endpoint("http"), testRecordedSamples, time.Now, promclient.QueryOptions{
			Deduplicate: false,
		}, []model.Metric{
			{
				"__name__":  "test_absent_metric",
				"job":       "thanos-receive",
				"receive":   model.LabelValue(receiver.Name()),
				"replica":   "1",
				"tenant_id": "default-tenant",
			},
			{
				"__name__":  "test_absent_metric",
				"job":       "thanos-receive",
				"receive":   model.LabelValue(receiver2.Name()),
				"replica":   "1",
				"tenant_id": "default-tenant",
			},
		})
	})
}

func TestStatelessRulerAlertStateRestore(t *testing.T) {
	t.Parallel()

	e, err := e2e.NewDockerEnvironment("stateless-state")
	testutil.Ok(t, err)
	t.Cleanup(e2ethanos.CleanScenario(t, e))

	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
	t.Cleanup(cancel)

	am := e2ethanos.NewAlertmanager(e, "1")
	testutil.Ok(t, e2e.StartAndWaitReady(am))

	receiver := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init()
	testutil.Ok(t, e2e.StartAndWaitReady(receiver))
	rwURL := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver.InternalEndpoint("remote-write")))

	q := e2ethanos.NewQuerierBuilder(e, "1", receiver.InternalEndpoint("grpc")).
		WithReplicaLabels("replica", "receive").Init()
	testutil.Ok(t, e2e.StartAndWaitReady(q))
	rulesSubDir := "rules"
	var rulers []*e2emon.InstrumentedRunnable
	for i := 1; i <= 2; i++ {
		rFuture := e2ethanos.NewRulerBuilder(e, fmt.Sprintf("%d", i))
		rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
		testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
		for i, rule := range []string{testAlertRuleHoldDuration} {
			createRuleFile(t, filepath.Join(rulesPath, fmt.Sprintf("rules-%d.yaml", i)), rule)
		}
		r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
			{
				EndpointsConfig: httpconfig.EndpointsConfig{
					StaticAddresses: []string{
						am.InternalEndpoint("http"),
					},
					Scheme: "http",
				},
				Timeout:    amTimeout,
				APIVersion: alert.APIv1,
			},
		}).WithForGracePeriod("500ms").
			WithRestoreIgnoredLabels("tenant_id").
			InitStateless(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
				{
					EndpointsConfig: httpconfig.EndpointsConfig{
						StaticAddresses: []string{
							q.InternalEndpoint("http"),
						},
						Scheme: "http",
					},
				},
			}, []*config.RemoteWriteConfig{
				{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
			})
		rulers = append(rulers, r)
	}

	// Start ruler 1 first.
	testutil.Ok(t, e2e.StartAndWaitReady(rulers[0]))

	// Wait until the alert is firing and the ALERTS_FOR_STATE
	// series has been written to the receiver successfully.
	queryAndAssertSeries(t, ctx, q.Endpoint("http"), func() string {
		return "ALERTS_FOR_STATE"
	}, time.Now, promclient.QueryOptions{
		Deduplicate: true,
	}, []model.Metric{
		{
			"__name__":  "ALERTS_FOR_STATE",
			"alertname": "TestAlert_RuleHoldDuration",
			"severity":  "page",
			"tenant_id": "default-tenant",
		},
	})

	var alerts []*rulespb.AlertInstance
	client := promclient.NewDefaultClient()
	err = runutil.Retry(time.Second*1, ctx.Done(), func() error {
		alerts, err = client.AlertsInGRPC(ctx, urlParse(t, "http://"+rulers[0].Endpoint("http")))
		testutil.Ok(t, err)
		if len(alerts) > 0 {
			if alerts[0].State == rulespb.AlertState_FIRING {
				return nil
			}
		}
		return fmt.Errorf("alert is not firing")
	})
	testutil.Ok(t, err)
	// Record the alert active time.
	alertActiveAt := alerts[0].ActiveAt
	testutil.Ok(t, rulers[0].Stop())

	// Start ruler 2 now; it should be able to restore the firing alert state.
	testutil.Ok(t, e2e.StartAndWaitReady(rulers[1]))

	// Wait for 4 rule evaluation iterations to make sure the alert state is restored.
	testutil.Ok(t, rulers[1].WaitSumMetricsWithOptions(e2emon.GreaterOrEqual(4), []string{"prometheus_rule_group_iterations_total"}, e2emon.WaitMissingMetrics()))

	// Wait until the alert is firing on the second ruler.
	err = runutil.Retry(time.Second*1, ctx.Done(), func() error {
		alerts, err = client.AlertsInGRPC(ctx, urlParse(t, "http://"+rulers[1].Endpoint("http")))
		testutil.Ok(t, err)
		if len(alerts) > 0 {
			if alerts[0].State == rulespb.AlertState_FIRING {
				// The second ruler's alert has the same active-at time as the previous one,
				// which means the alert state was restored successfully.
				if alertActiveAt.Unix() == alerts[0].ActiveAt.Unix() {
					return nil
				}
				return fmt.Errorf("alert active time is not restored")
			}
		}
		return fmt.Errorf("alert is not firing")
	})
	testutil.Ok(t, err)
}

// TestRule_CanPersistWALData checks that in stateless mode, Thanos Ruler can persist rule evaluations
// which couldn't be sent to the remote write endpoint (e.g. because the receiver isn't available).
func TestRule_CanPersistWALData(t *testing.T) {
	// TODO: Implement test with an unavailable remote-write endpoint (receiver).
}

// TestRulePartialResponse tests Ruler behavior on different storepb.PartialResponseStrategy
// values when getting a partial response from a single `failingStoreAPI`.
func TestRulePartialResponse(t *testing.T) {
	t.Skip("TODO: Allow HTTP ports from binaries running on host to be accessible.")

	// TODO: Implement with failing store.
}
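
// testRuleWALPersistenceSketch is a rough, hypothetical sketch (not registered in the suite)
// of how TestRule_CanPersistWALData above could be implemented, reusing only builders and
// helpers already used elsewhere in this file. It is a sketch under assumptions, not a
// definitive implementation: it assumes a runnable's internal endpoints are resolvable before
// it is started, that the stateless ruler becomes ready while its remote-write target is still
// down, and that evaluations buffered in the WAL are flushed once the receiver comes up.
func testRuleWALPersistenceSketch(t *testing.T) {
	e, err := e2e.NewDockerEnvironment("rule-wal")
	testutil.Ok(t, err)
	t.Cleanup(e2ethanos.CleanScenario(t, e))

	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
	t.Cleanup(cancel)

	am := e2ethanos.NewAlertmanager(e, "1")
	testutil.Ok(t, e2e.StartAndWaitReady(am))

	// Create the receiver so its remote-write endpoint is known, but do NOT start it yet:
	// rule evaluations produced now can only be buffered in the ruler's WAL.
	receiver := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init()
	rwURL := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver.InternalEndpoint("remote-write")))

	rFuture := e2ethanos.NewRulerBuilder(e, "1")
	rulesPath := filepath.Join(rFuture.Dir(), "rules")
	testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
	createRuleFile(t, filepath.Join(rulesPath, "rules-0.yaml"), testRuleRecordAbsentMetric)

	q := e2ethanos.NewQuerierBuilder(e, "1", receiver.InternalEndpoint("grpc")).Init()

	r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				StaticAddresses: []string{am.InternalEndpoint("http")},
				Scheme:          "http",
			},
			Timeout:    amTimeout,
			APIVersion: alert.APIv1,
		},
	}).InitStateless(filepath.Join(rFuture.InternalDir(), "rules"), []httpconfig.Config{
		{
			EndpointsConfig: httpconfig.EndpointsConfig{
				StaticAddresses: []string{q.InternalEndpoint("http")},
				Scheme:          "http",
			},
		},
	}, []*config.RemoteWriteConfig{
		{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
	})
	testutil.Ok(t, e2e.StartAndWaitReady(q, r))

	// Let several evaluations happen while the receiver is unreachable. A stricter variant could
	// additionally watch a remote-write queue metric (e.g. prometheus_remote_storage_samples_pending,
	// if exposed by the ruler's embedded remote-write agent) to confirm samples are piling up.
	testutil.Ok(t, r.WaitSumMetricsWithOptions(e2emon.GreaterOrEqual(4), []string{"prometheus_rule_group_iterations_total"}, e2emon.WaitMissingMetrics()))

	// Bring the receiver up and verify the buffered evaluations are eventually delivered.
	testutil.Ok(t, e2e.StartAndWaitReady(receiver))
	queryAndAssertSeries(t, ctx, q.Endpoint("http"), func() string { return "test_absent_metric" }, time.Now, promclient.QueryOptions{
		Deduplicate: false,
	}, []model.Metric{
		{
			"__name__":  "test_absent_metric",
			"job":       "thanos-receive",
			"receive":   model.LabelValue(receiver.Name()),
			"replica":   "1",
			"tenant_id": "default-tenant",
		},
	})
}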