github.com/thanos-io/thanos@v0.32.5/pkg/rules/manager_test.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package rules 5 6 import ( 7 "context" 8 "fmt" 9 "os" 10 "path/filepath" 11 "sort" 12 "strings" 13 "sync" 14 "testing" 15 "time" 16 17 "github.com/go-kit/log" 18 "github.com/pkg/errors" 19 "github.com/prometheus/client_golang/prometheus" 20 "github.com/prometheus/prometheus/model/exemplar" 21 "github.com/prometheus/prometheus/model/histogram" 22 "github.com/prometheus/prometheus/model/labels" 23 "github.com/prometheus/prometheus/model/metadata" 24 "github.com/prometheus/prometheus/promql" 25 "github.com/prometheus/prometheus/rules" 26 "github.com/prometheus/prometheus/storage" 27 "gopkg.in/yaml.v3" 28 29 "github.com/efficientgo/core/testutil" 30 31 "github.com/thanos-io/thanos/pkg/extprom" 32 "github.com/thanos-io/thanos/pkg/runutil" 33 "github.com/thanos-io/thanos/pkg/store/storepb" 34 ) 35 36 type nopAppendable struct{} 37 38 func (n nopAppendable) Appender(_ context.Context) storage.Appender { return nopAppender{} } 39 40 type nopAppender struct{} 41 42 func (n nopAppender) Append(storage.SeriesRef, labels.Labels, int64, float64) (storage.SeriesRef, error) { 43 return 0, nil 44 } 45 func (n nopAppender) AppendExemplar(storage.SeriesRef, labels.Labels, exemplar.Exemplar) (storage.SeriesRef, error) { 46 return 0, nil 47 } 48 49 func (n nopAppender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) { 50 return 0, nil 51 } 52 53 func (n nopAppender) Commit() error { return nil } 54 func (n nopAppender) Rollback() error { return nil } 55 func (n nopAppender) Appender(_ context.Context) (storage.Appender, error) { return n, nil } 56 func (n nopAppender) UpdateMetadata(storage.SeriesRef, labels.Labels, metadata.Metadata) (storage.SeriesRef, error) { 57 return 0, nil 58 } 59 60 type nopQueryable struct{} 61 62 func (n nopQueryable) Querier(_ context.Context, _, _ int64) (storage.Querier, error) { 63 return storage.NoopQuerier(), nil 64 } 65 66 // Regression test against https://github.com/thanos-io/thanos/issues/1779. 67 func TestRun_Subqueries(t *testing.T) { 68 dir := t.TempDir() 69 70 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "rule.yaml"), []byte(` 71 groups: 72 - name: "rule with subquery" 73 partial_response_strategy: "warn" 74 rules: 75 - record: "test" 76 expr: "rate(some_metric[1h:5m] offset 1d)" 77 `), os.ModePerm)) 78 79 var ( 80 queryDone = make(chan struct{}) 81 queryOnce sync.Once 82 query string 83 ) 84 thanosRuleMgr := NewManager( 85 context.Background(), 86 nil, 87 dir, 88 rules.ManagerOptions{ 89 Logger: log.NewLogfmtLogger(os.Stderr), 90 Context: context.Background(), 91 Appendable: nopAppendable{}, 92 Queryable: nopQueryable{}, 93 }, 94 func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc { 95 return func(ctx context.Context, q string, t time.Time) (vectors promql.Vector, e error) { 96 queryOnce.Do(func() { 97 query = q 98 close(queryDone) 99 }) 100 return promql.Vector{}, nil 101 } 102 }, 103 labels.FromStrings("replica", "1"), 104 "http://localhost", 105 ) 106 testutil.Ok(t, thanosRuleMgr.Update(1*time.Second, []string{filepath.Join(dir, "rule.yaml")})) 107 108 thanosRuleMgr.Run() 109 defer thanosRuleMgr.Stop() 110 111 select { 112 case <-time.After(1 * time.Minute): 113 t.Fatal("timeout while waiting on rule manager query evaluation") 114 case <-queryDone: 115 } 116 testutil.Equals(t, "rate(some_metric[1h:5m] offset 1d)", query) 117 } 118 119 func TestUpdate_Error_UpdatePartial(t *testing.T) { 120 dir := t.TempDir() 121 dataDir := t.TempDir() 122 123 err := os.MkdirAll(filepath.Join(dir, "subdir"), 0775) 124 testutil.Ok(t, err) 125 126 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "no_strategy.yaml"), []byte(` 127 groups: 128 - name: "something1" 129 rules: 130 - alert: "some" 131 expr: "up" 132 `), os.ModePerm)) 133 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "abort.yaml"), []byte(` 134 groups: 135 - name: "something2" 136 partial_response_strategy: "abort" 137 rules: 138 - alert: "some" 139 expr: "up" 140 `), os.ModePerm)) 141 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "warn.yaml"), []byte(` 142 groups: 143 - name: "something3" 144 partial_response_strategy: "warn" 145 rules: 146 - alert: "some" 147 expr: "up" 148 `), os.ModePerm)) 149 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "wrong.yaml"), []byte(` 150 groups: 151 - name: "something4" 152 partial_response_strategy: "afafsdgsdgs" # Err 1 153 rules: 154 - alert: "some" 155 expr: "up" 156 `), os.ModePerm)) 157 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "combined.yaml"), []byte(` 158 groups: 159 - name: "something5" 160 partial_response_strategy: "warn" 161 rules: 162 - alert: "some" 163 expr: "up" 164 - name: "something6" 165 partial_response_strategy: "abort" 166 rules: 167 - alert: "some" 168 expr: "up" 169 - name: "something7" 170 rules: 171 - alert: "some" 172 expr: "up" 173 `), os.ModePerm)) 174 // Same filename as the first rule file but different path. 175 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "subdir", "no_strategy.yaml"), []byte(` 176 groups: 177 - name: "something8" 178 rules: 179 - alert: "some" 180 expr: "up" 181 `), os.ModePerm)) 182 reg := prometheus.NewRegistry() 183 184 thanosRuleMgr := NewManager( 185 context.Background(), 186 reg, 187 dataDir, 188 rules.ManagerOptions{ 189 Logger: log.NewLogfmtLogger(os.Stderr), 190 Queryable: nopQueryable{}, 191 }, 192 func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc { 193 return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { 194 return nil, nil 195 } 196 }, 197 labels.FromStrings("replica", "1"), 198 "http://localhost", 199 ) 200 err = thanosRuleMgr.Update(10*time.Second, []string{ 201 filepath.Join(dir, "no_strategy.yaml"), 202 filepath.Join(dir, "abort.yaml"), 203 filepath.Join(dir, "warn.yaml"), 204 filepath.Join(dir, "wrong.yaml"), 205 filepath.Join(dir, "combined.yaml"), 206 filepath.Join(dir, "non_existing.yaml"), 207 filepath.Join(dir, "subdir", "no_strategy.yaml"), 208 }) 209 testutil.NotOk(t, err) 210 testutil.Assert(t, strings.Contains(err.Error(), "wrong.yaml: failed to unmarshal \"afafsdgsdgs\" as 'partial_response_strategy'"), err.Error()) 211 testutil.Assert(t, strings.Contains(err.Error(), "non_existing.yaml: no such file or directory"), err.Error()) 212 213 // Still failed update should load at least partially correct rules. 214 // Also, check metrics: Regression test: https://github.com/thanos-io/thanos/issues/3083 215 testutil.Equals(t, 216 map[string]float64{ 217 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/abort.yaml;something2,strategy=abort}", dataDir, dir): 1, 218 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/subdir/no_strategy.yaml;something8,strategy=abort}", dataDir, dir): 1, 219 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/combined.yaml;something6,strategy=abort}", dataDir, dir): 1, 220 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/combined.yaml;something7,strategy=abort}", dataDir, dir): 1, 221 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/ABORT%s/no_strategy.yaml;something1,strategy=abort}", dataDir, dir): 1, 222 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/WARN%s/combined.yaml;something5,strategy=warn}", dataDir, dir): 1, 223 fmt.Sprintf("prometheus_rule_group_rules{rule_group=%s/.tmp-rules/WARN%s/warn.yaml;something3,strategy=warn}", dataDir, dir): 1, 224 }, 225 extprom.CurrentGaugeValuesFor(t, reg, "prometheus_rule_group_rules"), 226 ) 227 228 g := thanosRuleMgr.RuleGroups() 229 sort.Slice(g, func(i, j int) bool { 230 return g[i].Name() < g[j].Name() 231 }) 232 233 exp := []struct { 234 name string 235 file string 236 strategy storepb.PartialResponseStrategy 237 }{ 238 { 239 name: "something1", 240 file: filepath.Join(dir, "no_strategy.yaml"), 241 strategy: storepb.PartialResponseStrategy_ABORT, 242 }, 243 { 244 name: "something2", 245 file: filepath.Join(dir, "abort.yaml"), 246 strategy: storepb.PartialResponseStrategy_ABORT, 247 }, 248 { 249 name: "something3", 250 file: filepath.Join(dir, "warn.yaml"), 251 strategy: storepb.PartialResponseStrategy_WARN, 252 }, 253 { 254 name: "something5", 255 file: filepath.Join(dir, "combined.yaml"), 256 strategy: storepb.PartialResponseStrategy_WARN, 257 }, 258 { 259 name: "something6", 260 file: filepath.Join(dir, "combined.yaml"), 261 strategy: storepb.PartialResponseStrategy_ABORT, 262 }, 263 { 264 name: "something7", 265 file: filepath.Join(dir, "combined.yaml"), 266 strategy: storepb.PartialResponseStrategy_ABORT, 267 }, 268 { 269 name: "something8", 270 file: filepath.Join(dir, "subdir", "no_strategy.yaml"), 271 strategy: storepb.PartialResponseStrategy_ABORT, 272 }, 273 } 274 testutil.Equals(t, len(exp), len(g)) 275 276 for i := range exp { 277 t.Run(exp[i].name, func(t *testing.T) { 278 testutil.Equals(t, exp[i].strategy, g[i].PartialResponseStrategy) 279 testutil.Equals(t, exp[i].name, g[i].Name()) 280 281 p := g[i].toProto() 282 testutil.Equals(t, exp[i].strategy, p.PartialResponseStrategy) 283 testutil.Equals(t, exp[i].name, p.Name) 284 testutil.Equals(t, exp[i].file, p.File) 285 }) 286 } 287 defer func() { 288 // Update creates go routines. We don't need rules mngrs to run, just to parse things, but let it start and stop 289 // at the end to correctly test leaked go routines. 290 thanosRuleMgr.Run() 291 thanosRuleMgr.Stop() 292 }() 293 } 294 295 func TestConfigRuleAdapterUnmarshalMarshalYAML(t *testing.T) { 296 c := configGroups{} 297 testutil.Ok(t, yaml.Unmarshal([]byte(`groups: 298 - name: something1 299 rules: 300 - alert: some 301 expr: up 302 partial_response_strategy: ABORT 303 limit: 10 304 - name: something2 305 rules: 306 - alert: some 307 expr: rate(some_metric[1h:5m] offset 1d) 308 partial_response_strategy: WARN 309 `), &c)) 310 b, err := yaml.Marshal(c) 311 testutil.Ok(t, err) 312 testutil.Equals(t, `groups: 313 - limit: 10 314 name: something1 315 rules: 316 - alert: some 317 expr: up 318 - name: something2 319 rules: 320 - alert: some 321 expr: rate(some_metric[1h:5m] offset 1d) 322 `, string(b)) 323 } 324 325 func TestManager_Rules(t *testing.T) { 326 dir := t.TempDir() 327 328 curr, err := os.Getwd() 329 testutil.Ok(t, err) 330 331 thanosRuleMgr := NewManager( 332 context.Background(), 333 nil, 334 dir, 335 rules.ManagerOptions{ 336 Logger: log.NewLogfmtLogger(os.Stderr), 337 Queryable: nopQueryable{}, 338 }, 339 func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc { 340 return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { 341 return nil, nil 342 } 343 }, 344 labels.FromStrings("replica", "test1"), 345 "http://localhost", 346 ) 347 testutil.Ok(t, thanosRuleMgr.Update(60*time.Second, []string{ 348 filepath.Join(curr, "../../examples/alerts/alerts.yaml"), 349 filepath.Join(curr, "../../examples/alerts/rules.yaml"), 350 })) 351 defer func() { 352 // Update creates go routines. We don't need rules mngrs to run, just to parse things, but let it start and stop 353 // at the end to correctly test leaked go routines. 354 thanosRuleMgr.Run() 355 thanosRuleMgr.Stop() 356 }() 357 testRulesAgainstExamples(t, filepath.Join(curr, "../../examples/alerts"), thanosRuleMgr) 358 } 359 360 func TestManagerUpdateWithNoRules(t *testing.T) { 361 dir := t.TempDir() 362 363 testutil.Ok(t, os.WriteFile(filepath.Join(dir, "no_strategy.yaml"), []byte(` 364 groups: 365 - name: "something1" 366 rules: 367 - alert: "some" 368 expr: "up" 369 `), os.ModePerm)) 370 371 thanosRuleMgr := NewManager( 372 context.Background(), 373 nil, 374 dir, 375 rules.ManagerOptions{ 376 Logger: log.NewLogfmtLogger(os.Stderr), 377 Queryable: nopQueryable{}, 378 }, 379 func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc { 380 return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { 381 return nil, nil 382 } 383 }, 384 nil, 385 "http://localhost", 386 ) 387 388 // We need to run the underlying rule managers to update them more than 389 // once (otherwise there's a deadlock). 390 thanosRuleMgr.Run() 391 t.Cleanup(thanosRuleMgr.Stop) 392 393 err := thanosRuleMgr.Update(1*time.Second, []string{ 394 filepath.Join(dir, "no_strategy.yaml"), 395 }) 396 testutil.Ok(t, err) 397 testutil.Equals(t, 1, len(thanosRuleMgr.RuleGroups())) 398 399 err = thanosRuleMgr.Update(1*time.Second, []string{}) 400 testutil.Ok(t, err) 401 testutil.Equals(t, 0, len(thanosRuleMgr.RuleGroups())) 402 } 403 404 func TestManagerRunRulesWithRuleGroupLimit(t *testing.T) { 405 dir := t.TempDir() 406 filename := filepath.Join(dir, "with_limit.yaml") 407 testutil.Ok(t, os.WriteFile(filename, []byte(` 408 groups: 409 - name: "something1" 410 interval: 1ms 411 limit: 1 412 rules: 413 - alert: "some" 414 expr: "up>0" 415 for: 0s 416 `), os.ModePerm)) 417 418 thanosRuleMgr := NewManager( 419 context.Background(), 420 nil, 421 dir, 422 rules.ManagerOptions{ 423 Logger: log.NewLogfmtLogger(os.Stderr), 424 Queryable: nopQueryable{}, 425 }, 426 func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc { 427 return func(ctx context.Context, q string, ts time.Time) (promql.Vector, error) { 428 return []promql.Sample{ 429 { 430 T: 0, 431 F: 1, 432 Metric: labels.FromStrings("foo", "bar"), 433 }, 434 { 435 T: 0, 436 F: 1, 437 Metric: labels.FromStrings("foo1", "bar1"), 438 }, 439 }, nil 440 } 441 }, 442 nil, 443 "http://localhost", 444 ) 445 thanosRuleMgr.Run() 446 t.Cleanup(thanosRuleMgr.Stop) 447 testutil.Ok(t, thanosRuleMgr.Update(time.Millisecond, []string{filename})) 448 testutil.Equals(t, 1, len(thanosRuleMgr.protoRuleGroups())) 449 testutil.Equals(t, 1, len(thanosRuleMgr.protoRuleGroups()[0].Rules)) 450 ctx, cancel := context.WithTimeout(context.Background(), time.Second) 451 defer cancel() 452 testutil.Ok(t, runutil.Retry(time.Millisecond, ctx.Done(), func() error { 453 if thanosRuleMgr.protoRuleGroups()[0].Rules[0].GetAlert().Health != string(rules.HealthBad) { 454 return errors.New("expect HealthBad") 455 } 456 return nil 457 })) 458 testutil.Equals(t, "exceeded limit of 1 with 2 alerts", thanosRuleMgr.protoRuleGroups()[0].Rules[0].GetAlert().LastError) 459 }