github.com/thanos-io/thanos@v0.32.5/test/e2e/rule_test.go

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package e2e_test
     5  
     6  import (
     7  	"bytes"
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"io"
    12  	"net/http"
    13  	"os"
    14  	"path/filepath"
    15  	"testing"
    16  	"time"
    17  
    18  	"github.com/efficientgo/e2e"
    19  	e2emon "github.com/efficientgo/e2e/monitoring"
    20  	common_cfg "github.com/prometheus/common/config"
    21  	"github.com/prometheus/common/model"
    22  	"github.com/prometheus/prometheus/config"
    23  	"github.com/prometheus/prometheus/discovery/targetgroup"
    24  	"github.com/thanos-io/thanos/pkg/errors"
    25  	"gopkg.in/yaml.v2"
    26  
    27  	"github.com/efficientgo/core/testutil"
    28  	"github.com/thanos-io/thanos/pkg/alert"
    29  	"github.com/thanos-io/thanos/pkg/httpconfig"
    30  	"github.com/thanos-io/thanos/pkg/promclient"
    31  	"github.com/thanos-io/thanos/pkg/rules/rulespb"
    32  	"github.com/thanos-io/thanos/pkg/runutil"
    33  	"github.com/thanos-io/thanos/test/e2e/e2ethanos"
    34  )
    35  
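         // Rule and alert definitions that the tests below write into the ruler's rule directory.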
    36  const (
    37  	testAlertRuleAbortOnPartialResponse = `
    38  groups:
    39  - name: example_abort
    40    interval: 1s
     41    # ABORT is the default, equivalent to setting: partial_response_strategy: "ABORT"
    42    rules:
    43    - alert: TestAlert_AbortOnPartialResponse
     44      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    45      expr: absent(some_metric)
    46      labels:
    47        severity: page
    48      annotations:
    49        summary: "I always complain, but I don't allow partial response in query."
    50  `
    51  	testAlertRuleWarnOnPartialResponse = `
    52  groups:
    53  - name: example_warn
    54    interval: 1s
    55    partial_response_strategy: "WARN"
    56    rules:
    57    - alert: TestAlert_WarnOnPartialResponse
     58      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    59      expr: absent(some_metric)
    60      labels:
    61        severity: page
    62      annotations:
    63        summary: "I always complain and allow partial response in query."
    64  `
    65  	testAlertRuleAddedLaterWebHandler = `
    66  groups:
    67  - name: example
    68    interval: 1s
    69    partial_response_strategy: "WARN"
    70    rules:
    71    - alert: TestAlert_HasBeenLoadedViaWebHandler
     72      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    73      expr: absent(some_metric)
    74      labels:
    75        severity: page
    76      annotations:
    77        summary: "I always complain and I have been loaded via /-/reload."
    78  `
    79  	testAlertRuleAddedLaterSignal = `
    80  groups:
    81  - name: example
    82    interval: 1s
    83    partial_response_strategy: "WARN"
    84    rules:
    85    - alert: TestAlert_HasBeenLoadedViaWebHandler
     86      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    87      expr: absent(some_metric)
    88      labels:
    89        severity: page
    90      annotations:
    91        summary: "I always complain and I have been loaded via sighup signal."
    92  - name: example2
    93    interval: 1s
    94    partial_response_strategy: "WARN"
    95    rules:
    96    - alert: TestAlert_HasBeenLoadedViaWebHandler
     97      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
    98      expr: absent(some_metric)
    99      labels:
   100        severity: page
   101      annotations:
   102        summary: "I always complain and I have been loaded via sighup signal."
   103  `
   104  	testAlertRuleWithLimit = `
   105  groups:
   106  - name: example_with_limit
   107    interval: 1s
   108    partial_response_strategy: "WARN"
   109    limit: 1
   110    rules:
   111    - alert: TestAlert_WithLimit
    112      expr: 'promhttp_metric_handler_requests_total' # It returns more than one series, so the group's limit of 1 is exceeded.
   113      labels:
   114        severity: page
   115      annotations:
   116        summary: "with limit"
   117  `
   118  
   119  	testRuleRecordAbsentMetric = `
   120  groups:
   121  - name: example_record_rules
   122    interval: 1s
   123    rules:
   124    - record: test_absent_metric
   125      expr: absent(nonexistent{job='thanos-receive'})
   126  `
   127  
   128  	testAlertRuleHoldDuration = `
   129  groups:
   130  - name: example_rule_hold_duration
   131    interval: 1s
   132    rules:
   133    - alert: TestAlert_RuleHoldDuration
    134      # It must be based on an actual metric, otherwise the call to StoreAPI would not be involved.
   135      expr: absent(some_metric)
   136      for: 2s
   137      labels:
   138        severity: page
   139      annotations:
   140        summary: "I always complain and allow partial response in query."
   141  `
   142  
   143  	amTimeout = model.Duration(10 * time.Second)
   144  )
   145  
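         // rulesResp models the subset of the ruler's /api/v1/rules response body that these tests decode.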
   146  type rulesResp struct {
   147  	Status string
   148  	Data   *rulespb.RuleGroups
   149  }
   150  
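         // createRuleFile writes a single rule file with the given content.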
   151  func createRuleFile(t *testing.T, path, content string) {
   152  	t.Helper()
   153  	err := os.WriteFile(path, []byte(content), 0666)
   154  	testutil.Ok(t, err)
   155  }
   156  
   157  func createRuleFiles(t *testing.T, dir string) {
   158  	t.Helper()
   159  
   160  	for i, rule := range []string{testAlertRuleAbortOnPartialResponse, testAlertRuleWarnOnPartialResponse} {
   161  		createRuleFile(t, filepath.Join(dir, fmt.Sprintf("rules-%d.yaml", i)), rule)
   162  	}
   163  }
   164  
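         // reloadRulesHTTP triggers a rule reload by POSTing to the ruler's /-/reload endpoint.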
   165  func reloadRulesHTTP(t *testing.T, ctx context.Context, endpoint string) {
   166  	req, err := http.NewRequestWithContext(ctx, "POST", "http://"+endpoint+"/-/reload", io.NopCloser(bytes.NewReader(nil)))
   167  	testutil.Ok(t, err)
   168  	resp, err := http.DefaultClient.Do(req)
   169  	testutil.Ok(t, err)
   170  	defer resp.Body.Close()
   171  	testutil.Equals(t, 200, resp.StatusCode)
   172  }
   173  
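         // reloadRulesSignal triggers a rule reload by sending SIGHUP (kill -1) to PID 1 inside the ruler container.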
   174  func reloadRulesSignal(t *testing.T, r *e2emon.InstrumentedRunnable) {
   175  	c := e2e.NewCommand("kill", "-1", "1")
   176  	testutil.Ok(t, r.Exec(c))
   177  }
   178  
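         // checkReloadSuccessful polls the ruler's /api/v1/rules endpoint until it reports the expected number of rule groups (or the test context expires).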
   179  func checkReloadSuccessful(t *testing.T, ctx context.Context, endpoint string, expectedRulegroupCount int) {
   180  	data := rulesResp{}
   181  	errCount := 0
   182  
   183  	testutil.Ok(t, runutil.Retry(5*time.Second, ctx.Done(), func() error {
   184  		req, err := http.NewRequestWithContext(ctx, "GET", "http://"+endpoint+"/api/v1/rules", io.NopCloser(bytes.NewReader(nil)))
   185  		if err != nil {
   186  			errCount++
   187  			return err
   188  		}
   189  
   190  		resp, err := http.DefaultClient.Do(req)
   191  		if err != nil {
   192  			errCount++
   193  			return err
   194  		}
   195  
   196  		if resp.StatusCode != 200 {
   197  			errCount++
   198  			return errors.Newf("statuscode is not 200, got %d", resp.StatusCode)
   199  		}
   200  
   201  		body, err := io.ReadAll(resp.Body)
   202  		if err != nil {
   203  			errCount++
   204  			return errors.Wrapf(err, "error reading body")
   205  		}
   206  
   207  		if err := resp.Body.Close(); err != nil {
   208  			errCount++
   209  			return err
   210  		}
   211  
   212  		if err := json.Unmarshal(body, &data); err != nil {
   213  			errCount++
   214  			return errors.Wrapf(err, "error unmarshaling body")
   215  		}
   216  
   217  		if data.Status != "success" {
   218  			errCount++
   219  			return errors.Newf("response status is not success, got %s", data.Status)
   220  		}
   221  
   222  		if len(data.Data.Groups) == expectedRulegroupCount {
   223  			return nil
   224  		}
   225  
   226  		errCount++
   227  		return errors.Newf("different number of rulegroups: expected %d, got %d", expectedRulegroupCount, len(data.Data.Groups))
   228  	}))
   229  
   230  	testutil.Assert(t, len(data.Data.Groups) == expectedRulegroupCount, fmt.Sprintf("expected there to be %d rule groups but got %d. encountered %d errors", expectedRulegroupCount, len(data.Data.Groups), errCount))
   231  }
   232  
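         // rulegroupCorrectData fetches /api/v1/rules once and asserts that every rule group reports a positive evaluation duration and a non-zero last-evaluation time.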
   233  func rulegroupCorrectData(t *testing.T, ctx context.Context, endpoint string) {
   234  	req, err := http.NewRequestWithContext(ctx, "GET", "http://"+endpoint+"/api/v1/rules", io.NopCloser(bytes.NewReader(nil)))
   235  	testutil.Ok(t, err)
   236  	resp, err := http.DefaultClient.Do(req)
   237  	testutil.Ok(t, err)
   238  	testutil.Equals(t, 200, resp.StatusCode)
   239  	defer resp.Body.Close()
   240  
   241  	body, err := io.ReadAll(resp.Body)
   242  	testutil.Ok(t, err)
   243  
   244  	var data = rulesResp{}
   245  
   246  	testutil.Ok(t, json.Unmarshal(body, &data))
   247  	testutil.Equals(t, "success", data.Status)
   248  
   249  	testutil.Assert(t, len(data.Data.Groups) > 0, "expected there to be some rule groups")
   250  
   251  	for _, g := range data.Data.Groups {
   252  		testutil.Assert(t, g.EvaluationDurationSeconds > 0, "expected it to take more than zero seconds to evaluate")
   253  		testutil.Assert(t, !g.LastEvaluation.IsZero(), "expected the rule group to be evaluated at least once")
   254  	}
   255  }
   256  
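         // writeTargets writes a file-SD target group file for the given addresses, writing to a temporary file and renaming it so the watcher never sees a partial file.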
   257  func writeTargets(t *testing.T, path string, addrs ...string) {
   258  	t.Helper()
   259  
   260  	var tgs []model.LabelSet
   261  	for _, a := range addrs {
   262  		tgs = append(
   263  			tgs,
   264  			model.LabelSet{
   265  				model.AddressLabel: model.LabelValue(a),
   266  			},
   267  		)
   268  	}
   269  	b, err := yaml.Marshal([]*targetgroup.Group{{Targets: tgs}})
   270  	testutil.Ok(t, err)
   271  
   272  	testutil.Ok(t, os.WriteFile(path+".tmp", b, 0660))
   273  	testutil.Ok(t, os.Rename(path+".tmp", path))
   274  }
   275  
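         // TestRule runs a ruler whose query and Alertmanager endpoints are discovered via file-SD,
         // attaches and detaches targets at runtime, reloads rules via SIGHUP and /-/reload, and
         // verifies that alerts reach the expected Alertmanagers.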
   276  func TestRule(t *testing.T) {
   277  	t.Parallel()
   278  
   279  	e, err := e2e.NewDockerEnvironment("e2e-test-rule")
   280  	testutil.Ok(t, err)
   281  	t.Cleanup(e2ethanos.CleanScenario(t, e))
   282  
   283  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
   284  	t.Cleanup(cancel)
   285  
   286  	am1 := e2ethanos.NewAlertmanager(e, "1")
   287  	am2 := e2ethanos.NewAlertmanager(e, "2")
   288  	testutil.Ok(t, e2e.StartAndWaitReady(am1, am2))
   289  
   290  	rFuture := e2ethanos.NewRulerBuilder(e, "1")
   291  
   292  	amTargetsSubDir := filepath.Join("rules_am_targets")
   293  	testutil.Ok(t, os.MkdirAll(filepath.Join(rFuture.Dir(), amTargetsSubDir), os.ModePerm))
   294  	queryTargetsSubDir := filepath.Join("rules_query_targets")
   295  	testutil.Ok(t, os.MkdirAll(filepath.Join(rFuture.Dir(), queryTargetsSubDir), os.ModePerm))
   296  
   297  	rulesSubDir := filepath.Join("rules")
   298  	rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
   299  	testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
   300  	createRuleFiles(t, rulesPath)
   301  
   302  	r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
   303  		{
   304  			EndpointsConfig: httpconfig.EndpointsConfig{
   305  				FileSDConfigs: []httpconfig.FileSDConfig{
   306  					{
    307  						// FileSD which will be used to dynamically discover am1.
   308  						Files:           []string{filepath.Join(rFuture.InternalDir(), amTargetsSubDir, "*.yaml")},
   309  						RefreshInterval: model.Duration(time.Second),
   310  					},
   311  				},
   312  				StaticAddresses: []string{
   313  					am2.InternalEndpoint("http"),
   314  				},
   315  				Scheme: "http",
   316  			},
   317  			Timeout:    amTimeout,
   318  			APIVersion: alert.APIv1,
   319  		},
   320  	}).InitTSDB(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
   321  		{
   322  			EndpointsConfig: httpconfig.EndpointsConfig{
   323  				// We test Statically Addressed queries in other tests. Focus on FileSD here.
   324  				FileSDConfigs: []httpconfig.FileSDConfig{
   325  					{
    326  						// FileSD which will be used to dynamically discover q.
   327  						Files:           []string{filepath.Join(rFuture.InternalDir(), queryTargetsSubDir, "*.yaml")},
   328  						RefreshInterval: model.Duration(time.Second),
   329  					},
   330  				},
   331  				Scheme: "http",
   332  			},
   333  		},
   334  	})
   335  	testutil.Ok(t, e2e.StartAndWaitReady(r))
   336  
   337  	q := e2ethanos.NewQuerierBuilder(e, "1", r.InternalEndpoint("grpc")).Init()
   338  	testutil.Ok(t, e2e.StartAndWaitReady(q))
   339  
   340  	t.Run("no query configured", func(t *testing.T) {
    341  		// Wait for a few evaluations and check that all of them failed.
   342  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(10), "prometheus_rule_evaluations_total"))
   343  		testutil.Ok(t, r.WaitSumMetrics(e2emon.EqualsAmongTwo, "prometheus_rule_evaluations_total", "prometheus_rule_evaluation_failures_total"))
   344  
   345  		// No alerts sent.
   346  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(0), "thanos_alert_sender_alerts_dropped_total"))
   347  	})
   348  
   349  	var currentFailures float64
   350  	t.Run("attach query", func(t *testing.T) {
   351  		// Attach querier to target files.
   352  		writeTargets(t, filepath.Join(rFuture.Dir(), queryTargetsSubDir, "targets.yaml"), q.InternalEndpoint("http"))
   353  
   354  		testutil.Ok(t, r.WaitSumMetricsWithOptions(e2emon.Equals(1), []string{"thanos_rule_query_apis_dns_provider_results"}, e2emon.WaitMissingMetrics()))
   355  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))
   356  
   357  		var currentVal float64
   358  		testutil.Ok(t, r.WaitSumMetrics(func(sums ...float64) bool {
   359  			currentVal = sums[0]
   360  			currentFailures = sums[1]
   361  			return true
   362  		}, "prometheus_rule_evaluations_total", "prometheus_rule_evaluation_failures_total"))
   363  
    364  		// Wait for a few more evaluations.
   365  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(currentVal+4), "prometheus_rule_evaluations_total"))
    366  		// No new failures.
   367  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))
   368  
   369  		// Alerts sent.
   370  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(0), "thanos_alert_sender_alerts_dropped_total"))
   371  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Greater(4), "thanos_alert_sender_alerts_sent_total"))
   372  
   373  		// Alerts received.
   374  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
   375  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(4), "alertmanager_alerts_received_total"))
   376  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
   377  
    378  		// am1 is not connected, so it should not receive anything.
   379  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts"))
   380  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_received_total"))
   381  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
   382  	})
   383  	t.Run("attach am1", func(t *testing.T) {
   384  		// Attach am1 to target files.
   385  		writeTargets(t, filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml"), am1.InternalEndpoint("http"))
   386  
   387  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
   388  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(2), "thanos_rule_alertmanagers_dns_provider_results"))
   389  
   390  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))
   391  
   392  		var currentVal float64
   393  		testutil.Ok(t, am2.WaitSumMetrics(func(sums ...float64) bool {
   394  			currentVal = sums[0]
   395  			return true
   396  		}, "alertmanager_alerts_received_total"))
   397  
   398  		// Alerts received by both am1 and am2.
   399  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
   400  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(currentVal+4), "alertmanager_alerts_received_total"))
   401  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
   402  
   403  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
   404  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Greater(4), "alertmanager_alerts_received_total"))
   405  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
   406  	})
   407  
   408  	t.Run("am1 drops again", func(t *testing.T) {
   409  		testutil.Ok(t, os.RemoveAll(filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml")))
   410  
   411  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
   412  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))
   413  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(currentFailures), "prometheus_rule_evaluation_failures_total"))
   414  
   415  		var currentValAm1 float64
   416  		testutil.Ok(t, am1.WaitSumMetrics(func(sums ...float64) bool {
   417  			currentValAm1 = sums[0]
   418  			return true
   419  		}, "alertmanager_alerts_received_total"))
   420  
   421  		var currentValAm2 float64
   422  		testutil.Ok(t, am2.WaitSumMetrics(func(sums ...float64) bool {
   423  			currentValAm2 = sums[0]
   424  			return true
   425  		}, "alertmanager_alerts_received_total"))
   426  
   427  		// Alerts received by both am1 and am2.
   428  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(2), "alertmanager_alerts"))
   429  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Greater(currentValAm2+4), "alertmanager_alerts_received_total"))
   430  		testutil.Ok(t, am2.WaitSumMetrics(e2emon.Equals(0), "alertmanager_alerts_invalid_total"))
   431  
    432  		// am1 should not receive any more alerts.
   433  		testutil.Ok(t, am1.WaitSumMetrics(e2emon.Equals(currentValAm1), "alertmanager_alerts_received_total"))
   434  	})
   435  
   436  	t.Run("duplicate am", func(t *testing.T) {
   437  		// am2 is already registered in static addresses.
   438  		writeTargets(t, filepath.Join(rFuture.Dir(), amTargetsSubDir, "targets.yaml"), am2.InternalEndpoint("http"))
   439  
   440  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_query_apis_dns_provider_results"))
   441  		testutil.Ok(t, r.WaitSumMetrics(e2emon.Equals(1), "thanos_rule_alertmanagers_dns_provider_results"))
   442  	})
   443  
   444  	t.Run("rule groups have last evaluation and evaluation duration set", func(t *testing.T) {
   445  		rulegroupCorrectData(t, ctx, r.Endpoint("http"))
   446  	})
   447  
   448  	t.Run("signal reload works", func(t *testing.T) {
    449  		// Add a new rule via sending SIGHUP.
   450  		createRuleFile(t, filepath.Join(rulesPath, "newrule.yaml"), testAlertRuleAddedLaterSignal)
   451  		reloadRulesSignal(t, r)
   452  		checkReloadSuccessful(t, ctx, r.Endpoint("http"), 4)
   453  	})
   454  
   455  	t.Run("http reload works", func(t *testing.T) {
   456  		// Add a new rule via /-/reload.
   457  		createRuleFile(t, filepath.Join(rulesPath, "newrule.yaml"), testAlertRuleAddedLaterWebHandler)
   458  		reloadRulesHTTP(t, ctx, r.Endpoint("http"))
   459  		checkReloadSuccessful(t, ctx, r.Endpoint("http"), 3)
   460  	})
   461  
   462  	t.Run("query alerts", func(t *testing.T) {
   463  		queryAndAssertSeries(t, ctx, q.Endpoint("http"), func() string { return "ALERTS" }, time.Now, promclient.QueryOptions{
   464  			Deduplicate: false,
   465  		}, []model.Metric{
   466  			{
   467  				"__name__":   "ALERTS",
   468  				"severity":   "page",
   469  				"alertname":  "TestAlert_AbortOnPartialResponse",
   470  				"alertstate": "firing",
   471  				"replica":    "1",
   472  			},
   473  			{
   474  				"__name__":   "ALERTS",
   475  				"severity":   "page",
   476  				"alertname":  "TestAlert_HasBeenLoadedViaWebHandler",
   477  				"alertstate": "firing",
   478  				"replica":    "1",
   479  			},
   480  			{
   481  				"__name__":   "ALERTS",
   482  				"severity":   "page",
   483  				"alertname":  "TestAlert_WarnOnPartialResponse",
   484  				"alertstate": "firing",
   485  				"replica":    "1",
   486  			},
   487  		})
   488  
   489  		expAlertLabels := []model.LabelSet{
   490  			{
   491  				"severity":  "page",
   492  				"alertname": "TestAlert_AbortOnPartialResponse",
   493  				"replica":   "1",
   494  			},
   495  			{
   496  				"severity":  "page",
   497  				"alertname": "TestAlert_HasBeenLoadedViaWebHandler",
   498  				"replica":   "1",
   499  			},
   500  			{
   501  				"severity":  "page",
   502  				"alertname": "TestAlert_WarnOnPartialResponse",
   503  				"replica":   "1",
   504  			},
   505  		}
   506  
   507  		alrts, err := promclient.NewDefaultClient().AlertmanagerAlerts(ctx, urlParse(t, "http://"+am2.Endpoint("http")))
   508  		testutil.Ok(t, err)
   509  
   510  		testutil.Equals(t, len(expAlertLabels), len(alrts))
   511  		for i, a := range alrts {
   512  			testutil.Assert(t, a.Labels.Equal(expAlertLabels[i]), "unexpected labels %s", a.Labels)
   513  		}
   514  	})
   515  }
   516  
   517  // TestRule_CanRemoteWriteData checks that Thanos Ruler can be run in stateless mode
   518  // where it remote_writes rule evaluations to a Prometheus remote-write endpoint (typically
   519  // a Thanos Receiver).
   520  func TestRule_CanRemoteWriteData(t *testing.T) {
   521  	t.Parallel()
   522  
   523  	e, err := e2e.NewDockerEnvironment("rule-rw")
   524  	testutil.Ok(t, err)
   525  	t.Cleanup(e2ethanos.CleanScenario(t, e))
   526  
   527  	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
   528  	t.Cleanup(cancel)
   529  
   530  	rFuture := e2ethanos.NewRulerBuilder(e, "1")
   531  	rulesSubDir := "rules"
   532  	rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
   533  	testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
   534  
   535  	for i, rule := range []string{testRuleRecordAbsentMetric, testAlertRuleWarnOnPartialResponse} {
   536  		createRuleFile(t, filepath.Join(rulesPath, fmt.Sprintf("rules-%d.yaml", i)), rule)
   537  	}
   538  
   539  	am := e2ethanos.NewAlertmanager(e, "1")
   540  	testutil.Ok(t, e2e.StartAndWaitReady(am))
   541  
   542  	receiver := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init()
   543  	testutil.Ok(t, e2e.StartAndWaitReady(receiver))
   544  	rwURL := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver.InternalEndpoint("remote-write")))
   545  
   546  	receiver2 := e2ethanos.NewReceiveBuilder(e, "2").WithIngestionEnabled().Init()
   547  	testutil.Ok(t, e2e.StartAndWaitReady(receiver2))
   548  	rwURL2 := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver2.InternalEndpoint("remote-write")))
   549  
   550  	q := e2ethanos.NewQuerierBuilder(e, "1", receiver.InternalEndpoint("grpc"), receiver2.InternalEndpoint("grpc")).Init()
   551  	testutil.Ok(t, e2e.StartAndWaitReady(q))
   552  
   553  	r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
   554  		{
   555  			EndpointsConfig: httpconfig.EndpointsConfig{
   556  				StaticAddresses: []string{
   557  					am.InternalEndpoint("http"),
   558  				},
   559  				Scheme: "http",
   560  			},
   561  			Timeout:    amTimeout,
   562  			APIVersion: alert.APIv1,
   563  		},
   564  	}).InitStateless(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
   565  		{
   566  			EndpointsConfig: httpconfig.EndpointsConfig{
   567  				StaticAddresses: []string{
   568  					q.InternalEndpoint("http"),
   569  				},
   570  				Scheme: "http",
   571  			},
   572  		},
   573  	}, []*config.RemoteWriteConfig{
   574  		{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
   575  		{URL: &common_cfg.URL{URL: rwURL2}, Name: "thanos-receiver2"},
   576  	})
   577  	testutil.Ok(t, e2e.StartAndWaitReady(r))
   578  
   579  	// Wait until remote write samples are written to receivers successfully.
   580  	testutil.Ok(t, r.WaitSumMetricsWithOptions(e2emon.GreaterOrEqual(1), []string{"prometheus_remote_storage_samples_total"}, e2emon.WaitMissingMetrics()))
   581  
   582  	t.Run("can fetch remote-written samples from receiver", func(t *testing.T) {
   583  		testRecordedSamples := func() string { return "test_absent_metric" }
   584  		queryAndAssertSeries(t, ctx, q.Endpoint("http"), testRecordedSamples, time.Now, promclient.QueryOptions{
   585  			Deduplicate: false,
   586  		}, []model.Metric{
   587  			{
   588  				"__name__":  "test_absent_metric",
   589  				"job":       "thanos-receive",
   590  				"receive":   model.LabelValue(receiver.Name()),
   591  				"replica":   "1",
   592  				"tenant_id": "default-tenant",
   593  			},
   594  			{
   595  				"__name__":  "test_absent_metric",
   596  				"job":       "thanos-receive",
   597  				"receive":   model.LabelValue(receiver2.Name()),
   598  				"replica":   "1",
   599  				"tenant_id": "default-tenant",
   600  			},
   601  		})
   602  	})
   603  }
   604  
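         // TestStatelessRulerAlertStateRestore verifies that a stateless ruler restores alert state from
         // remote-written ALERTS_FOR_STATE series: ruler 1 remote-writes the series, and after it stops,
         // ruler 2 reports the same alert firing with the same activeAt timestamp.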
   605  func TestStatelessRulerAlertStateRestore(t *testing.T) {
   606  	t.Parallel()
   607  
   608  	e, err := e2e.NewDockerEnvironment("stateless-state")
   609  	testutil.Ok(t, err)
   610  	t.Cleanup(e2ethanos.CleanScenario(t, e))
   611  
   612  	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
   613  	t.Cleanup(cancel)
   614  
   615  	am := e2ethanos.NewAlertmanager(e, "1")
   616  	testutil.Ok(t, e2e.StartAndWaitReady(am))
   617  
   618  	receiver := e2ethanos.NewReceiveBuilder(e, "1").WithIngestionEnabled().Init()
   619  	testutil.Ok(t, e2e.StartAndWaitReady(receiver))
   620  	rwURL := urlParse(t, e2ethanos.RemoteWriteEndpoint(receiver.InternalEndpoint("remote-write")))
   621  
   622  	q := e2ethanos.NewQuerierBuilder(e, "1", receiver.InternalEndpoint("grpc")).
   623  		WithReplicaLabels("replica", "receive").Init()
   624  	testutil.Ok(t, e2e.StartAndWaitReady(q))
   625  	rulesSubDir := "rules"
   626  	var rulers []*e2emon.InstrumentedRunnable
   627  	for i := 1; i <= 2; i++ {
   628  		rFuture := e2ethanos.NewRulerBuilder(e, fmt.Sprintf("%d", i))
   629  		rulesPath := filepath.Join(rFuture.Dir(), rulesSubDir)
   630  		testutil.Ok(t, os.MkdirAll(rulesPath, os.ModePerm))
   631  		for i, rule := range []string{testAlertRuleHoldDuration} {
   632  			createRuleFile(t, filepath.Join(rulesPath, fmt.Sprintf("rules-%d.yaml", i)), rule)
   633  		}
   634  		r := rFuture.WithAlertManagerConfig([]alert.AlertmanagerConfig{
   635  			{
   636  				EndpointsConfig: httpconfig.EndpointsConfig{
   637  					StaticAddresses: []string{
   638  						am.InternalEndpoint("http"),
   639  					},
   640  					Scheme: "http",
   641  				},
   642  				Timeout:    amTimeout,
   643  				APIVersion: alert.APIv1,
   644  			},
   645  		}).WithForGracePeriod("500ms").
   646  			WithRestoreIgnoredLabels("tenant_id").
   647  			InitStateless(filepath.Join(rFuture.InternalDir(), rulesSubDir), []httpconfig.Config{
   648  				{
   649  					EndpointsConfig: httpconfig.EndpointsConfig{
   650  						StaticAddresses: []string{
   651  							q.InternalEndpoint("http"),
   652  						},
   653  						Scheme: "http",
   654  					},
   655  				},
   656  			}, []*config.RemoteWriteConfig{
   657  				{URL: &common_cfg.URL{URL: rwURL}, Name: "thanos-receiver"},
   658  			})
   659  		rulers = append(rulers, r)
   660  	}
   661  
    662  	// Start ruler 1 first.
   663  	testutil.Ok(t, e2e.StartAndWaitReady(rulers[0]))
   664  
    665  	// Wait until the alert is firing and the ALERTS_FOR_STATE
    666  	// series has been written to the receiver successfully.
   667  	queryAndAssertSeries(t, ctx, q.Endpoint("http"), func() string {
   668  		return "ALERTS_FOR_STATE"
   669  	}, time.Now, promclient.QueryOptions{
   670  		Deduplicate: true,
   671  	}, []model.Metric{
   672  		{
   673  			"__name__":  "ALERTS_FOR_STATE",
   674  			"alertname": "TestAlert_RuleHoldDuration",
   675  			"severity":  "page",
   676  			"tenant_id": "default-tenant",
   677  		},
   678  	})
   679  
   680  	var alerts []*rulespb.AlertInstance
   681  	client := promclient.NewDefaultClient()
   682  	err = runutil.Retry(time.Second*1, ctx.Done(), func() error {
   683  		alerts, err = client.AlertsInGRPC(ctx, urlParse(t, "http://"+rulers[0].Endpoint("http")))
   684  		testutil.Ok(t, err)
   685  		if len(alerts) > 0 {
   686  			if alerts[0].State == rulespb.AlertState_FIRING {
   687  				return nil
   688  			}
   689  		}
   690  		return fmt.Errorf("alert is not firing")
   691  	})
   692  	testutil.Ok(t, err)
   693  	// Record the alert active time.
   694  	alertActiveAt := alerts[0].ActiveAt
   695  	testutil.Ok(t, rulers[0].Stop())
   696  
    697  	// Start ruler 2 now; it should be able
    698  	// to restore the firing alert state.
   699  	testutil.Ok(t, e2e.StartAndWaitReady(rulers[1]))
   700  
   701  	// Wait for 4 rule evaluation iterations to make sure the alert state is restored.
   702  	testutil.Ok(t, rulers[1].WaitSumMetricsWithOptions(e2emon.GreaterOrEqual(4), []string{"prometheus_rule_group_iterations_total"}, e2emon.WaitMissingMetrics()))
   703  
   704  	// Wait until the alert is firing on the second ruler.
   705  	err = runutil.Retry(time.Second*1, ctx.Done(), func() error {
   706  		alerts, err = client.AlertsInGRPC(ctx, urlParse(t, "http://"+rulers[1].Endpoint("http")))
   707  		testutil.Ok(t, err)
   708  		if len(alerts) > 0 {
   709  			if alerts[0].State == rulespb.AlertState_FIRING {
    710  				// The second ruler's alert has the same activeAt time as the previous one,
    711  				// which means the alert state was restored successfully.
   712  				if alertActiveAt.Unix() == alerts[0].ActiveAt.Unix() {
   713  					return nil
   714  				} else {
   715  					return fmt.Errorf("alert active time is not restored")
   716  				}
   717  			}
   718  		}
   719  		return fmt.Errorf("alert is not firing")
   720  	})
   721  	testutil.Ok(t, err)
   722  }
   723  
   724  // TestRule_CanPersistWALData checks that in stateless mode, Thanos Ruler can persist rule evaluations
    725  // which couldn't be sent to the remote-write endpoint (e.g. because the receiver isn't available).
   726  func TestRule_CanPersistWALData(t *testing.T) {
    727  	// TODO: Implement a test with an unavailable remote-write endpoint (receiver).
   728  }
   729  
    730  // Test Ruler behavior with different storepb.PartialResponseStrategy values when receiving a partial response from a single `failingStoreAPI`.
   731  func TestRulePartialResponse(t *testing.T) {
   732  	t.Skip("TODO: Allow HTTP ports from binaries running on host to be accessible.")
   733  
   734  	// TODO: Implement with failing store.
   735  }