github.com/thanos-io/thanos@v0.32.5/pkg/rules/manager_test.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package rules
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"os"
    10  	"path/filepath"
    11  	"sort"
    12  	"strings"
    13  	"sync"
    14  	"testing"
    15  	"time"
    16  
    17  	"github.com/go-kit/log"
    18  	"github.com/pkg/errors"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  	"github.com/prometheus/prometheus/model/exemplar"
    21  	"github.com/prometheus/prometheus/model/histogram"
    22  	"github.com/prometheus/prometheus/model/labels"
    23  	"github.com/prometheus/prometheus/model/metadata"
    24  	"github.com/prometheus/prometheus/promql"
    25  	"github.com/prometheus/prometheus/rules"
    26  	"github.com/prometheus/prometheus/storage"
    27  	"gopkg.in/yaml.v3"
    28  
    29  	"github.com/efficientgo/core/testutil"
    30  
    31  	"github.com/thanos-io/thanos/pkg/extprom"
    32  	"github.com/thanos-io/thanos/pkg/runutil"
    33  	"github.com/thanos-io/thanos/pkg/store/storepb"
    34  )
    35  
    36  type nopAppendable struct{}
    37  
    38  func (n nopAppendable) Appender(_ context.Context) storage.Appender { return nopAppender{} }
    39  
    40  type nopAppender struct{}
    41  
    42  func (n nopAppender) Append(storage.SeriesRef, labels.Labels, int64, float64) (storage.SeriesRef, error) {
    43  	return 0, nil
    44  }
    45  func (n nopAppender) AppendExemplar(storage.SeriesRef, labels.Labels, exemplar.Exemplar) (storage.SeriesRef, error) {
    46  	return 0, nil
    47  }
    48  
    49  func (n nopAppender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
    50  	return 0, nil
    51  }
    52  
    53  func (n nopAppender) Commit() error                                        { return nil }
    54  func (n nopAppender) Rollback() error                                      { return nil }
    55  func (n nopAppender) Appender(_ context.Context) (storage.Appender, error) { return n, nil }
    56  func (n nopAppender) UpdateMetadata(storage.SeriesRef, labels.Labels, metadata.Metadata) (storage.SeriesRef, error) {
    57  	return 0, nil
    58  }
    59  
    60  type nopQueryable struct{}
    61  
    62  func (n nopQueryable) Querier(_ context.Context, _, _ int64) (storage.Querier, error) {
    63  	return storage.NoopQuerier(), nil
    64  }
    65  
    66  // Regression test against https://github.com/thanos-io/thanos/issues/1779.
    67  func TestRun_Subqueries(t *testing.T) {
    68  	dir := t.TempDir()
    69  
    70  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "rule.yaml"), []byte(`
    71  groups:
    72  - name: "rule with subquery"
    73    partial_response_strategy: "warn"
    74    rules:
    75    - record: "test"
    76      expr: "rate(some_metric[1h:5m] offset 1d)"
    77  `), os.ModePerm))
    78  
    79  	var (
    80  		queryDone = make(chan struct{})
    81  		queryOnce sync.Once
    82  		query     string
    83  	)
    84  	thanosRuleMgr := NewManager(
    85  		context.Background(),
    86  		nil,
    87  		dir,
    88  		rules.ManagerOptions{
    89  			Logger:     log.NewLogfmtLogger(os.Stderr),
    90  			Context:    context.Background(),
    91  			Appendable: nopAppendable{},
    92  			Queryable:  nopQueryable{},
    93  		},
    94  		func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
    95  			return func(ctx context.Context, q string, t time.Time) (vectors promql.Vector, e error) {
    96  				queryOnce.Do(func() {
    97  					query = q
    98  					close(queryDone)
    99  				})
   100  				return promql.Vector{}, nil
   101  			}
   102  		},
   103  		labels.FromStrings("replica", "1"),
   104  		"http://localhost",
   105  	)
   106  	testutil.Ok(t, thanosRuleMgr.Update(1*time.Second, []string{filepath.Join(dir, "rule.yaml")}))
   107  
   108  	thanosRuleMgr.Run()
   109  	defer thanosRuleMgr.Stop()
   110  
   111  	select {
   112  	case <-time.After(1 * time.Minute):
   113  		t.Fatal("timeout while waiting on rule manager query evaluation")
   114  	case <-queryDone:
   115  	}
   116  	testutil.Equals(t, "rate(some_metric[1h:5m] offset 1d)", query)
   117  }
   118  
   119  func TestUpdate_Error_UpdatePartial(t *testing.T) {
   120  	dir := t.TempDir()
   121  	dataDir := t.TempDir()
   122  
   123  	err := os.MkdirAll(filepath.Join(dir, "subdir"), 0775)
   124  	testutil.Ok(t, err)
   125  
   126  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "no_strategy.yaml"), []byte(`
   127  groups:
   128  - name: "something1"
   129    rules:
   130    - alert: "some"
   131      expr: "up"
   132  `), os.ModePerm))
   133  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "abort.yaml"), []byte(`
   134  groups:
   135  - name: "something2"
   136    partial_response_strategy: "abort"
   137    rules:
   138    - alert: "some"
   139      expr: "up"
   140  `), os.ModePerm))
   141  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "warn.yaml"), []byte(`
   142  groups:
   143  - name: "something3"
   144    partial_response_strategy: "warn"
   145    rules:
   146    - alert: "some"
   147      expr: "up"
   148  `), os.ModePerm))
   149  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "wrong.yaml"), []byte(`
   150  groups:
   151  - name: "something4"
   152    partial_response_strategy: "afafsdgsdgs" # Err 1
   153    rules:
   154    - alert: "some"
   155      expr: "up"
   156  `), os.ModePerm))
   157  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "combined.yaml"), []byte(`
   158  groups:
   159  - name: "something5"
   160    partial_response_strategy: "warn"
   161    rules:
   162    - alert: "some"
   163      expr: "up"
   164  - name: "something6"
   165    partial_response_strategy: "abort"
   166    rules:
   167    - alert: "some"
   168      expr: "up"
   169  - name: "something7"
   170    rules:
   171    - alert: "some"
   172      expr: "up"
   173  `), os.ModePerm))
   174  	// Same filename as the first rule file but different path.
   175  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "subdir", "no_strategy.yaml"), []byte(`
   176  groups:
   177  - name: "something8"
   178    rules:
   179    - alert: "some"
   180      expr: "up"
   181  `), os.ModePerm))
   182  	reg := prometheus.NewRegistry()
   183  
   184  	thanosRuleMgr := NewManager(
   185  		context.Background(),
   186  		reg,
   187  		dataDir,
   188  		rules.ManagerOptions{
   189  			Logger:    log.NewLogfmtLogger(os.Stderr),
   190  			Queryable: nopQueryable{},
   191  		},
   192  		func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   193  			return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
   194  				return nil, nil
   195  			}
   196  		},
   197  		labels.FromStrings("replica", "1"),
   198  		"http://localhost",
   199  	)
   200  	err = thanosRuleMgr.Update(10*time.Second, []string{
   201  		filepath.Join(dir, "no_strategy.yaml"),
   202  		filepath.Join(dir, "abort.yaml"),
   203  		filepath.Join(dir, "warn.yaml"),
   204  		filepath.Join(dir, "wrong.yaml"),
   205  		filepath.Join(dir, "combined.yaml"),
   206  		filepath.Join(dir, "non_existing.yaml"),
   207  		filepath.Join(dir, "subdir", "no_strategy.yaml"),
   208  	})
   209  	testutil.NotOk(t, err)
   210  	testutil.Assert(t, strings.Contains(err.Error(), "wrong.yaml: failed to unmarshal \"afafsdgsdgs\" as 'partial_response_strategy'"), err.Error())
   211  	testutil.Assert(t, strings.Contains(err.Error(), "non_existing.yaml: no such file or directory"), err.Error())
   212  
   213  	// Still failed update should load at least partially correct rules.
   214  	// Also, check metrics: Regression test: https://github.com/thanos-io/thanos/issues/3083
   215  	testutil.Equals(t,
   216  		map[string]float64{
   217  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/abort.yaml;something2,strategy=abort}", dataDir, dir):              1,
   218  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/subdir/no_strategy.yaml;something8,strategy=abort}", dataDir, dir): 1,
   219  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/combined.yaml;something6,strategy=abort}", dataDir, dir):           1,
   220  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/combined.yaml;something7,strategy=abort}", dataDir, dir):           1,
   221  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/no_strategy.yaml;something1,strategy=abort}", dataDir, dir):        1,
   222  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/WARN%s/combined.yaml;something5,strategy=warn}", dataDir, dir):             1,
   223  			fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/WARN%s/warn.yaml;something3,strategy=warn}", dataDir, dir):                 1,
   224  		},
   225  		extprom.CurrentGaugeValuesFor(t, reg, "prometheus_rule_group_rules"),
   226  	)
   227  
   228  	g := thanosRuleMgr.RuleGroups()
   229  	sort.Slice(g, func(i, j int) bool {
   230  		return g[i].Name() < g[j].Name()
   231  	})
   232  
   233  	exp := []struct {
   234  		name     string
   235  		file     string
   236  		strategy storepb.PartialResponseStrategy
   237  	}{
   238  		{
   239  			name:     "something1",
   240  			file:     filepath.Join(dir, "no_strategy.yaml"),
   241  			strategy: storepb.PartialResponseStrategy_ABORT,
   242  		},
   243  		{
   244  			name:     "something2",
   245  			file:     filepath.Join(dir, "abort.yaml"),
   246  			strategy: storepb.PartialResponseStrategy_ABORT,
   247  		},
   248  		{
   249  			name:     "something3",
   250  			file:     filepath.Join(dir, "warn.yaml"),
   251  			strategy: storepb.PartialResponseStrategy_WARN,
   252  		},
   253  		{
   254  			name:     "something5",
   255  			file:     filepath.Join(dir, "combined.yaml"),
   256  			strategy: storepb.PartialResponseStrategy_WARN,
   257  		},
   258  		{
   259  			name:     "something6",
   260  			file:     filepath.Join(dir, "combined.yaml"),
   261  			strategy: storepb.PartialResponseStrategy_ABORT,
   262  		},
   263  		{
   264  			name:     "something7",
   265  			file:     filepath.Join(dir, "combined.yaml"),
   266  			strategy: storepb.PartialResponseStrategy_ABORT,
   267  		},
   268  		{
   269  			name:     "something8",
   270  			file:     filepath.Join(dir, "subdir", "no_strategy.yaml"),
   271  			strategy: storepb.PartialResponseStrategy_ABORT,
   272  		},
   273  	}
   274  	testutil.Equals(t, len(exp), len(g))
   275  
   276  	for i := range exp {
   277  		t.Run(exp[i].name, func(t *testing.T) {
   278  			testutil.Equals(t, exp[i].strategy, g[i].PartialResponseStrategy)
   279  			testutil.Equals(t, exp[i].name, g[i].Name())
   280  
   281  			p := g[i].toProto()
   282  			testutil.Equals(t, exp[i].strategy, p.PartialResponseStrategy)
   283  			testutil.Equals(t, exp[i].name, p.Name)
   284  			testutil.Equals(t, exp[i].file, p.File)
   285  		})
   286  	}
   287  	defer func() {
   288  		// Update creates go routines. We don't need rules mngrs to run, just to parse things, but let it start and stop
   289  		// at the end to correctly test leaked go routines.
   290  		thanosRuleMgr.Run()
   291  		thanosRuleMgr.Stop()
   292  	}()
   293  }
   294  
   295  func TestConfigRuleAdapterUnmarshalMarshalYAML(t *testing.T) {
   296  	c := configGroups{}
   297  	testutil.Ok(t, yaml.Unmarshal([]byte(`groups:
   298  - name: something1
   299    rules:
   300    - alert: some
   301      expr: up
   302    partial_response_strategy: ABORT
   303    limit: 10
   304  - name: something2
   305    rules:
   306    - alert: some
   307      expr: rate(some_metric[1h:5m] offset 1d)
   308    partial_response_strategy: WARN
   309  `), &c))
   310  	b, err := yaml.Marshal(c)
   311  	testutil.Ok(t, err)
   312  	testutil.Equals(t, `groups:
   313      - limit: 10
   314        name: something1
   315        rules:
   316          - alert: some
   317            expr: up
   318      - name: something2
   319        rules:
   320          - alert: some
   321            expr: rate(some_metric[1h:5m] offset 1d)
   322  `, string(b))
   323  }
   324  
   325  func TestManager_Rules(t *testing.T) {
   326  	dir := t.TempDir()
   327  
   328  	curr, err := os.Getwd()
   329  	testutil.Ok(t, err)
   330  
   331  	thanosRuleMgr := NewManager(
   332  		context.Background(),
   333  		nil,
   334  		dir,
   335  		rules.ManagerOptions{
   336  			Logger:    log.NewLogfmtLogger(os.Stderr),
   337  			Queryable: nopQueryable{},
   338  		},
   339  		func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   340  			return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
   341  				return nil, nil
   342  			}
   343  		},
   344  		labels.FromStrings("replica", "test1"),
   345  		"http://localhost",
   346  	)
   347  	testutil.Ok(t, thanosRuleMgr.Update(60*time.Second, []string{
   348  		filepath.Join(curr, "../../examples/alerts/alerts.yaml"),
   349  		filepath.Join(curr, "../../examples/alerts/rules.yaml"),
   350  	}))
   351  	defer func() {
   352  		// Update creates go routines. We don't need rules mngrs to run, just to parse things, but let it start and stop
   353  		// at the end to correctly test leaked go routines.
   354  		thanosRuleMgr.Run()
   355  		thanosRuleMgr.Stop()
   356  	}()
   357  	testRulesAgainstExamples(t, filepath.Join(curr, "../../examples/alerts"), thanosRuleMgr)
   358  }
   359  
   360  func TestManagerUpdateWithNoRules(t *testing.T) {
   361  	dir := t.TempDir()
   362  
   363  	testutil.Ok(t, os.WriteFile(filepath.Join(dir, "no_strategy.yaml"), []byte(`
   364  groups:
   365  - name: "something1"
   366    rules:
   367    - alert: "some"
   368      expr: "up"
   369  `), os.ModePerm))
   370  
   371  	thanosRuleMgr := NewManager(
   372  		context.Background(),
   373  		nil,
   374  		dir,
   375  		rules.ManagerOptions{
   376  			Logger:    log.NewLogfmtLogger(os.Stderr),
   377  			Queryable: nopQueryable{},
   378  		},
   379  		func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   380  			return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
   381  				return nil, nil
   382  			}
   383  		},
   384  		nil,
   385  		"http://localhost",
   386  	)
   387  
   388  	// We need to run the underlying rule managers to update them more than
   389  	// once (otherwise there's a deadlock).
   390  	thanosRuleMgr.Run()
   391  	t.Cleanup(thanosRuleMgr.Stop)
   392  
   393  	err := thanosRuleMgr.Update(1*time.Second, []string{
   394  		filepath.Join(dir, "no_strategy.yaml"),
   395  	})
   396  	testutil.Ok(t, err)
   397  	testutil.Equals(t, 1, len(thanosRuleMgr.RuleGroups()))
   398  
   399  	err = thanosRuleMgr.Update(1*time.Second, []string{})
   400  	testutil.Ok(t, err)
   401  	testutil.Equals(t, 0, len(thanosRuleMgr.RuleGroups()))
   402  }
   403  
   404  func TestManagerRunRulesWithRuleGroupLimit(t *testing.T) {
   405  	dir := t.TempDir()
   406  	filename := filepath.Join(dir, "with_limit.yaml")
   407  	testutil.Ok(t, os.WriteFile(filename, []byte(`
   408  groups:
   409  - name: "something1"
   410    interval: 1ms
   411    limit: 1
   412    rules:
   413    - alert: "some"
   414      expr: "up>0"
   415      for: 0s
   416  `), os.ModePerm))
   417  
   418  	thanosRuleMgr := NewManager(
   419  		context.Background(),
   420  		nil,
   421  		dir,
   422  		rules.ManagerOptions{
   423  			Logger:    log.NewLogfmtLogger(os.Stderr),
   424  			Queryable: nopQueryable{},
   425  		},
   426  		func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   427  			return func(ctx context.Context, q string, ts time.Time) (promql.Vector, error) {
   428  				return []promql.Sample{
   429  					{
   430  						T:      0,
   431  						F:      1,
   432  						Metric: labels.FromStrings("foo", "bar"),
   433  					},
   434  					{
   435  						T:      0,
   436  						F:      1,
   437  						Metric: labels.FromStrings("foo1", "bar1"),
   438  					},
   439  				}, nil
   440  			}
   441  		},
   442  		nil,
   443  		"http://localhost",
   444  	)
   445  	thanosRuleMgr.Run()
   446  	t.Cleanup(thanosRuleMgr.Stop)
   447  	testutil.Ok(t, thanosRuleMgr.Update(time.Millisecond, []string{filename}))
   448  	testutil.Equals(t, 1, len(thanosRuleMgr.protoRuleGroups()))
   449  	testutil.Equals(t, 1, len(thanosRuleMgr.protoRuleGroups()[0].Rules))
   450  	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   451  	defer cancel()
   452  	testutil.Ok(t, runutil.Retry(time.Millisecond, ctx.Done(), func() error {
   453  		if thanosRuleMgr.protoRuleGroups()[0].Rules[0].GetAlert().Health != string(rules.HealthBad) {
   454  			return errors.New("expect HealthBad")
   455  		}
   456  		return nil
   457  	}))
   458  	testutil.Equals(t, "exceeded limit of 1 with 2 alerts", thanosRuleMgr.protoRuleGroups()[0].Rules[0].GetAlert().LastError)
   459  }