github.com/thanos-io/thanos@v0.32.5/cmd/thanos/rule.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package main

import (
	"context"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	extflag "github.com/efficientgo/tools/extkingpin"
	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging"
	"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags"
	"github.com/oklog/run"
	"github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/route"
	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/relabel"
	"github.com/prometheus/prometheus/notifier"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/storage/remote"
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/agent"
	"github.com/prometheus/prometheus/tsdb/wlog"
	"github.com/prometheus/prometheus/util/strutil"

	"github.com/thanos-io/objstore"
	"github.com/thanos-io/objstore/client"
	objstoretracing "github.com/thanos-io/objstore/tracing/opentracing"
	"gopkg.in/yaml.v2"

	"github.com/thanos-io/thanos/pkg/alert"
	v1 "github.com/thanos-io/thanos/pkg/api/rule"
	"github.com/thanos-io/thanos/pkg/block/metadata"
	"github.com/thanos-io/thanos/pkg/component"
	"github.com/thanos-io/thanos/pkg/discovery/dns"
	"github.com/thanos-io/thanos/pkg/errutil"
	"github.com/thanos-io/thanos/pkg/extkingpin"
	"github.com/thanos-io/thanos/pkg/extprom"
	extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http"
	"github.com/thanos-io/thanos/pkg/httpconfig"
	"github.com/thanos-io/thanos/pkg/info"
	"github.com/thanos-io/thanos/pkg/info/infopb"
	"github.com/thanos-io/thanos/pkg/logging"
	"github.com/thanos-io/thanos/pkg/prober"
	"github.com/thanos-io/thanos/pkg/promclient"
	thanosrules "github.com/thanos-io/thanos/pkg/rules"
	"github.com/thanos-io/thanos/pkg/runutil"
	grpcserver "github.com/thanos-io/thanos/pkg/server/grpc"
	httpserver "github.com/thanos-io/thanos/pkg/server/http"
	"github.com/thanos-io/thanos/pkg/shipper"
	"github.com/thanos-io/thanos/pkg/store"
	"github.com/thanos-io/thanos/pkg/store/labelpb"
	"github.com/thanos-io/thanos/pkg/store/storepb"
	"github.com/thanos-io/thanos/pkg/tls"
	"github.com/thanos-io/thanos/pkg/tracing"
	"github.com/thanos-io/thanos/pkg/ui"
)

type ruleConfig struct {
	http    httpConfig
	grpc    grpcConfig
	web     webConfig
	shipper shipperConfig

	query           queryConfig
	queryConfigYAML []byte

	alertmgr               alertMgrConfig
	alertmgrsConfigYAML    []byte
	alertQueryURL          *url.URL
	alertRelabelConfigYAML []byte

	rwConfig *extflag.PathOrContent

	resendDelay       time.Duration
	evalInterval      time.Duration
	outageTolerance   time.Duration
	forGracePeriod    time.Duration
	ruleFiles         []string
	objStoreConfig    *extflag.PathOrContent
	dataDir           string
	lset              labels.Labels
	ignoredLabelNames []string
	storeRateLimits   store.SeriesSelectLimits
}

func (rc *ruleConfig) registerFlag(cmd extkingpin.FlagClause) {
	rc.http.registerFlag(cmd)
	rc.grpc.registerFlag(cmd)
	rc.web.registerFlag(cmd)
	rc.shipper.registerFlag(cmd)
	rc.query.registerFlag(cmd)
	rc.alertmgr.registerFlag(cmd)
	rc.storeRateLimits.RegisterFlags(cmd)
}

// registerRule registers a rule command.
func registerRule(app *extkingpin.App) {
	comp := component.Rule
	cmd := app.Command(comp.String(), "Ruler evaluating Prometheus rules against given Query nodes, exposing Store API and storing old blocks in bucket.")

	conf := &ruleConfig{}
	conf.registerFlag(cmd)

	labelStrs := cmd.Flag("label", "Labels to be applied to all generated metrics (repeated). Similar to external labels for Prometheus, used to identify ruler and its blocks as unique source.").
		PlaceHolder("<name>=\"<value>\"").Strings()
	tsdbBlockDuration := extkingpin.ModelDuration(cmd.Flag("tsdb.block-duration", "Block duration for TSDB block.").
		Default("2h"))
	tsdbRetention := extkingpin.ModelDuration(cmd.Flag("tsdb.retention", "Block retention time on local disk.").
		Default("48h"))
	noLockFile := cmd.Flag("tsdb.no-lockfile", "Do not create lockfile in TSDB data directory. In any case, the lockfiles will be deleted on next startup.").Default("false").Bool()
	walCompression := cmd.Flag("tsdb.wal-compression", "Compress the tsdb WAL.").Default("true").Bool()

	cmd.Flag("data-dir", "data directory").Default("data/").StringVar(&conf.dataDir)
	cmd.Flag("rule-file", "Rule files that should be used by rule manager. Can be in glob format (repeated). Note that rules are not automatically detected, use SIGHUP or do HTTP POST /-/reload to re-read them.").
		Default("rules/").StringsVar(&conf.ruleFiles)
	cmd.Flag("resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager.").
		Default("1m").DurationVar(&conf.resendDelay)
	cmd.Flag("eval-interval", "The default evaluation interval to use.").
		Default("1m").DurationVar(&conf.evalInterval)
	cmd.Flag("for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert.").
		Default("1h").DurationVar(&conf.outageTolerance)
	cmd.Flag("for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period.").
		Default("10m").DurationVar(&conf.forGracePeriod)
	cmd.Flag("restore-ignored-label", "Label names to be ignored when restoring alerts from the remote storage. This is only used in stateless mode.").
		StringsVar(&conf.ignoredLabelNames)

	conf.rwConfig = extflag.RegisterPathOrContent(cmd, "remote-write.config", "YAML config for the remote-write configurations, that specify servers where samples should be sent to (see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This automatically enables stateless mode for ruler and no series will be stored in the ruler's TSDB. If an empty config (or file) is provided, the flag is ignored and ruler is run with its own TSDB.", extflag.WithEnvSubstitution())
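
	// Illustrative example (not part of the original source): the content passed via
	// --remote-write.config or --remote-write.config-file is a standard Prometheus
	// remote_write section. A minimal config that switches the ruler into stateless
	// mode could look like the following, where the URL and name are placeholders:
	//
	//   remote_write:
	//   - url: http://receive.example.svc:19291/api/v1/receive
	//     name: receiver-0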
	reqLogDecision := cmd.Flag("log.request.decision", "Deprecation Warning - This flag would be soon deprecated, and replaced with `request.logging-config`. Request Logging for logging the start and end of requests. By default this flag is disabled. LogFinishCall: Logs the finish call of the requests. LogStartAndFinishCall: Logs the start and finish call of the requests. NoLogCall: Disable request logging.").Default("").Enum("NoLogCall", "LogFinishCall", "LogStartAndFinishCall", "")

	conf.objStoreConfig = extkingpin.RegisterCommonObjStoreFlags(cmd, "", false)

	reqLogConfig := extkingpin.RegisterRequestLoggingFlags(cmd)

	var err error
	cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
		conf.lset, err = parseFlagLabels(*labelStrs)
		if err != nil {
			return errors.Wrap(err, "parse labels")
		}

		conf.alertQueryURL, err = url.Parse(*conf.alertmgr.alertQueryURL)
		if err != nil {
			return errors.Wrap(err, "parse alert query url")
		}

		tsdbOpts := &tsdb.Options{
			MinBlockDuration:  int64(time.Duration(*tsdbBlockDuration) / time.Millisecond),
			MaxBlockDuration:  int64(time.Duration(*tsdbBlockDuration) / time.Millisecond),
			RetentionDuration: int64(time.Duration(*tsdbRetention) / time.Millisecond),
			NoLockfile:        *noLockFile,
			WALCompression:    wlog.ParseCompressionType(*walCompression, string(wlog.CompressionSnappy)),
		}

		agentOpts := &agent.Options{
			WALCompression: wlog.ParseCompressionType(*walCompression, string(wlog.CompressionSnappy)),
			NoLockfile:     *noLockFile,
		}

		// Parse and check query configuration.
		lookupQueries := map[string]struct{}{}
		for _, q := range conf.query.addrs {
			if _, ok := lookupQueries[q]; ok {
				return errors.Errorf("Address %s is duplicated for --query flag.", q)
			}

			lookupQueries[q] = struct{}{}
		}

		conf.queryConfigYAML, err = conf.query.configPath.Content()
		if err != nil {
			return err
		}
		if len(conf.query.sdFiles) == 0 && len(conf.query.addrs) == 0 && len(conf.queryConfigYAML) == 0 {
			return errors.New("no --query parameter was given")
		}
		if (len(conf.query.sdFiles) != 0 || len(conf.query.addrs) != 0) && len(conf.queryConfigYAML) != 0 {
			return errors.New("--query/--query.sd-files and --query.config* parameters cannot be defined at the same time")
		}

		// Parse and check alerting configuration.
		conf.alertmgrsConfigYAML, err = conf.alertmgr.configPath.Content()
		if err != nil {
			return err
		}
		if len(conf.alertmgrsConfigYAML) != 0 && len(conf.alertmgr.alertmgrURLs) != 0 {
			return errors.New("--alertmanagers.url and --alertmanagers.config* parameters cannot be defined at the same time")
		}

		conf.alertRelabelConfigYAML, err = conf.alertmgr.alertRelabelConfigPath.Content()
		if err != nil {
			return err
		}

		httpLogOpts, err := logging.ParseHTTPOptions(*reqLogDecision, reqLogConfig)
		if err != nil {
			return errors.Wrap(err, "error while parsing config for request logging")
		}

		tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions(*reqLogDecision, reqLogConfig)
		if err != nil {
			return errors.Wrap(err, "error while parsing config for request logging")
		}

		return runRule(g,
			logger,
			reg,
			tracer,
			comp,
			*conf,
			reload,
			getFlagsMap(cmd.Flags()),
			httpLogOpts,
			grpcLogOpts,
			tagOpts,
			tsdbOpts,
			agentOpts,
		)
	})
}
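
// Illustrative only (not part of the original source): a typical invocation that
// exercises the flags registered above might look like the following, with all
// host names, ports, and paths being placeholders:
//
//	thanos rule \
//	  --data-dir=/var/thanos/rule \
//	  --rule-file=/etc/thanos/rules/*.yaml \
//	  --eval-interval=30s \
//	  --label='replica="rule-0"' \
//	  --query=query.example.svc:10902 \
//	  --alertmanagers.url=http://alertmanager.example.svc:9093 \
//	  --objstore.config-file=/etc/thanos/bucket.yaml
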
// RuleMetrics defines Thanos Ruler metrics.
type RuleMetrics struct {
	configSuccess     prometheus.Gauge
	configSuccessTime prometheus.Gauge
	duplicatedQuery   prometheus.Counter
	rulesLoaded       *prometheus.GaugeVec
	ruleEvalWarnings  *prometheus.CounterVec
}

func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics {
	m := new(RuleMetrics)

	factory := promauto.With(reg)
	m.configSuccess = factory.NewGauge(prometheus.GaugeOpts{
		Name: "thanos_rule_config_last_reload_successful",
		Help: "Whether the last configuration reload attempt was successful.",
	})
	m.configSuccessTime = factory.NewGauge(prometheus.GaugeOpts{
		Name: "thanos_rule_config_last_reload_success_timestamp_seconds",
		Help: "Timestamp of the last successful configuration reload.",
	})
	m.duplicatedQuery = factory.NewCounter(prometheus.CounterOpts{
		Name: "thanos_rule_duplicated_query_addresses_total",
		Help: "The number of times a duplicated query addresses is detected from the different configs in rule.",
	})
	m.rulesLoaded = factory.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "thanos_rule_loaded_rules",
			Help: "Loaded rules partitioned by file and group.",
		},
		[]string{"strategy", "file", "group"},
	)
	m.ruleEvalWarnings = factory.NewCounterVec(
		prometheus.CounterOpts{
			Name: "thanos_rule_evaluation_with_warnings_total",
			Help: "The total number of rule evaluation that were successful but had warnings which can indicate partial error.",
		}, []string{"strategy"},
	)
	m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String()))
	m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String()))

	return m
}

// runRule runs a rule evaluation component that continuously evaluates alerting and recording
// rules. It sends alert notifications and writes TSDB data for results like a regular Prometheus server.
func runRule(
	g *run.Group,
	logger log.Logger,
	reg *prometheus.Registry,
	tracer opentracing.Tracer,
	comp component.Component,
	conf ruleConfig,
	reloadSignal <-chan struct{},
	flagsMap map[string]string,
	httpLogOpts []logging.Option,
	grpcLogOpts []grpc_logging.Option,
	tagOpts []tags.Option,
	tsdbOpts *tsdb.Options,
	agentOpts *agent.Options,
) error {
	metrics := newRuleMetrics(reg)

	var queryCfg []httpconfig.Config
	var err error
	if len(conf.queryConfigYAML) > 0 {
		queryCfg, err = httpconfig.LoadConfigs(conf.queryConfigYAML)
		if err != nil {
			return err
		}
	} else {
		queryCfg, err = httpconfig.BuildConfig(conf.query.addrs)
		if err != nil {
			return errors.Wrap(err, "query configuration")
		}

		// Build the query configuration from the legacy query flags.
		var fileSDConfigs []httpconfig.FileSDConfig
		if len(conf.query.sdFiles) > 0 {
			fileSDConfigs = append(fileSDConfigs, httpconfig.FileSDConfig{
				Files:           conf.query.sdFiles,
				RefreshInterval: model.Duration(conf.query.sdInterval),
			})
			queryCfg = append(queryCfg,
				httpconfig.Config{
					EndpointsConfig: httpconfig.EndpointsConfig{
						Scheme:        "http",
						FileSDConfigs: fileSDConfigs,
					},
				},
			)
		}
	}

	queryProvider := dns.NewProvider(
		logger,
		extprom.WrapRegistererWithPrefix("thanos_rule_query_apis_", reg),
		dns.ResolverType(conf.query.dnsSDResolver),
	)
	var (
		queryClients []*httpconfig.Client
		promClients  []*promclient.Client
	)
	queryClientMetrics := extpromhttp.NewClientMetrics(extprom.WrapRegistererWith(prometheus.Labels{"client": "query"}, reg))
	for _, cfg := range queryCfg {
		cfg.HTTPClientConfig.ClientMetrics = queryClientMetrics
		c, err := httpconfig.NewHTTPClient(cfg.HTTPClientConfig, "query")
		if err != nil {
			return err
		}
		c.Transport = tracing.HTTPTripperware(logger, c.Transport)
		queryClient, err := httpconfig.NewClient(logger, cfg.EndpointsConfig, c, queryProvider.Clone())
		if err != nil {
			return err
		}
		queryClients = append(queryClients, queryClient)
		promClients = append(promClients, promclient.NewClient(queryClient, logger, "thanos-rule"))
		// Discover and resolve query addresses.
		addDiscoveryGroups(g, queryClient, conf.query.dnsSDInterval)
	}
	var (
		appendable storage.Appendable
		queryable  storage.Queryable
		tsdbDB     *tsdb.DB
		agentDB    *agent.DB
	)

	rwCfgYAML, err := conf.rwConfig.Content()
	if err != nil {
		return err
	}

	if len(rwCfgYAML) > 0 {
		var rwCfg struct {
			RemoteWriteConfigs []*config.RemoteWriteConfig `yaml:"remote_write,omitempty"`
		}
		if err := yaml.Unmarshal(rwCfgYAML, &rwCfg); err != nil {
			return errors.Wrapf(err, "failed to parse remote write config %v", string(rwCfgYAML))
		}

		// flushDeadline is set to 1m, but it is for metadata watcher only so not used here.
		remoteStore := remote.NewStorage(logger, reg, func() (int64, error) {
			return 0, nil
		}, conf.dataDir, 1*time.Minute, nil)
		if err := remoteStore.ApplyConfig(&config.Config{
			GlobalConfig: config.GlobalConfig{
				ExternalLabels: labelsTSDBToProm(conf.lset),
			},
			RemoteWriteConfigs: rwCfg.RemoteWriteConfigs,
		}); err != nil {
			return errors.Wrap(err, "applying config to remote storage")
		}

		agentDB, err = agent.Open(logger, reg, remoteStore, conf.dataDir, agentOpts)
		if err != nil {
			return errors.Wrap(err, "start remote write agent db")
		}
		fanoutStore := storage.NewFanout(logger, agentDB, remoteStore)
		appendable = fanoutStore
		// Use a separate queryable to restore the ALERTS firing states.
		// We cannot use remoteStore directly because it uses remote read for
		// query. However, remote read is not implemented in Thanos Receiver.
		queryable = thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, conf.query.httpMethod, conf.query.step, conf.ignoredLabelNames)
	} else {
		tsdbDB, err = tsdb.Open(conf.dataDir, log.With(logger, "component", "tsdb"), reg, tsdbOpts, nil)
		if err != nil {
			return errors.Wrap(err, "open TSDB")
		}

		level.Debug(logger).Log("msg", "removing storage lock file if any")
		if err := removeLockfileIfAny(logger, conf.dataDir); err != nil {
			return errors.Wrap(err, "remove storage lock files")
		}

		{
			done := make(chan struct{})
			g.Add(func() error {
				<-done
				return tsdbDB.Close()
			}, func(error) {
				close(done)
			})
		}
		appendable = tsdbDB
		queryable = tsdbDB
	}

	// Build the Alertmanager clients.
	var alertingCfg alert.AlertingConfig
	if len(conf.alertmgrsConfigYAML) > 0 {
		alertingCfg, err = alert.LoadAlertingConfig(conf.alertmgrsConfigYAML)
		if err != nil {
			return err
		}
	} else {
		// Build the Alertmanager configuration from the legacy flags.
		for _, addr := range conf.alertmgr.alertmgrURLs {
			cfg, err := alert.BuildAlertmanagerConfig(addr, conf.alertmgr.alertmgrsTimeout)
			if err != nil {
				return err
			}
			alertingCfg.Alertmanagers = append(alertingCfg.Alertmanagers, cfg)
		}
	}
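
	// Illustrative only (not part of the original source): the YAML accepted by
	// --alertmanagers.config / --alertmanagers.config-file is defined by
	// alert.AlertingConfig in pkg/alert. A minimal static configuration could look
	// roughly like this, with the address being a placeholder:
	//
	//   alertmanagers:
	//   - static_configs: ["alertmanager.example.svc:9093"]
	//     scheme: http
	//     timeout: 10s
	//     api_version: v2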

	if len(alertingCfg.Alertmanagers) == 0 {
		level.Warn(logger).Log("msg", "no alertmanager configured")
	}

	var alertRelabelConfigs []*relabel.Config
	if len(conf.alertRelabelConfigYAML) > 0 {
		alertRelabelConfigs, err = alert.LoadRelabelConfigs(conf.alertRelabelConfigYAML)
		if err != nil {
			return err
		}
	}

	amProvider := dns.NewProvider(
		logger,
		extprom.WrapRegistererWithPrefix("thanos_rule_alertmanagers_", reg),
		dns.ResolverType(conf.query.dnsSDResolver),
	)
	var alertmgrs []*alert.Alertmanager
	amClientMetrics := extpromhttp.NewClientMetrics(
		extprom.WrapRegistererWith(prometheus.Labels{"client": "alertmanager"}, reg),
	)
	for _, cfg := range alertingCfg.Alertmanagers {
		cfg.HTTPClientConfig.ClientMetrics = amClientMetrics
		c, err := httpconfig.NewHTTPClient(cfg.HTTPClientConfig, "alertmanager")
		if err != nil {
			return err
		}
		c.Transport = tracing.HTTPTripperware(logger, c.Transport)
		// Each Alertmanager client has a different list of targets thus each needs its own DNS provider.
		amClient, err := httpconfig.NewClient(logger, cfg.EndpointsConfig, c, amProvider.Clone())
		if err != nil {
			return err
		}
		// Discover and resolve Alertmanager addresses.
		addDiscoveryGroups(g, amClient, conf.alertmgr.alertmgrsDNSSDInterval)

		alertmgrs = append(alertmgrs, alert.NewAlertmanager(logger, amClient, time.Duration(cfg.Timeout), cfg.APIVersion))
	}

	var (
		ruleMgr *thanosrules.Manager
		alertQ  = alert.NewQueue(logger, reg, 10000, 100, labelsTSDBToProm(conf.lset), conf.alertmgr.alertExcludeLabels, alertRelabelConfigs)
	)
	{
		// Run rule evaluation and alert notifications.
		notifyFunc := func(ctx context.Context, expr string, alerts ...*rules.Alert) {
			res := make([]*notifier.Alert, 0, len(alerts))
			for _, alrt := range alerts {
				// Only send actually firing alerts.
				if alrt.State == rules.StatePending {
					continue
				}
				a := &notifier.Alert{
					StartsAt:     alrt.FiredAt,
					Labels:       alrt.Labels,
					Annotations:  alrt.Annotations,
					GeneratorURL: conf.alertQueryURL.String() + strutil.TableLinkForExpression(expr),
				}
				if !alrt.ResolvedAt.IsZero() {
					a.EndsAt = alrt.ResolvedAt
				} else {
					a.EndsAt = alrt.ValidUntil
				}
				res = append(res, a)
			}
			alertQ.Push(res)
		}

		ctx, cancel := context.WithCancel(context.Background())
		logger = log.With(logger, "component", "rules")
		ruleMgr = thanosrules.NewManager(
			tracing.ContextWithTracer(ctx, tracer),
			reg,
			conf.dataDir,
			rules.ManagerOptions{
				NotifyFunc:      notifyFunc,
				Logger:          logger,
				Appendable:      appendable,
				ExternalURL:     nil,
				Queryable:       queryable,
				ResendDelay:     conf.resendDelay,
				OutageTolerance: conf.outageTolerance,
				ForGracePeriod:  conf.forGracePeriod,
			},
			queryFuncCreator(logger, queryClients, promClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod, conf.query.doNotAddThanosParams),
			conf.lset,
			// In our case the querying URL is the external URL because in Prometheus
			// --web.external-url points to it i.e. it points at something where the user
			// could execute the alert or recording rule's expression and get results.
			conf.alertQueryURL.String(),
		)

		// Schedule rule manager that evaluates rules.
		g.Add(func() error {
			ruleMgr.Run()
			<-ctx.Done()

			return nil
		}, func(err error) {
			cancel()
			ruleMgr.Stop()
		})
	}
	// Run the alert sender.
	{
		sdr := alert.NewSender(logger, reg, alertmgrs)
		ctx, cancel := context.WithCancel(context.Background())
		ctx = tracing.ContextWithTracer(ctx, tracer)

		g.Add(func() error {
			for {
				tracing.DoInSpan(ctx, "/send_alerts", func(ctx context.Context) {
					sdr.Send(ctx, alertQ.Pop(ctx.Done()))
				})

				select {
				case <-ctx.Done():
					return ctx.Err()
				default:
				}
			}
		}, func(error) {
			cancel()
		})
	}

	// Handle reload and termination interrupts.
	reloadWebhandler := make(chan chan error)
	{
		ctx, cancel := context.WithCancel(context.Background())
		g.Add(func() error {
			// Initialize rules.
			if err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics); err != nil {
				level.Error(logger).Log("msg", "initialize rules failed", "err", err)
				return err
			}
			for {
				select {
				case <-reloadSignal:
					if err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics); err != nil {
						level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err)
					}
				case reloadMsg := <-reloadWebhandler:
					err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics)
					if err != nil {
						level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err)
					}
					reloadMsg <- err
				case <-ctx.Done():
					return ctx.Err()
				}
			}
		}, func(error) {
			cancel()
		})
	}

	grpcProbe := prober.NewGRPC()
	httpProbe := prober.NewHTTP()
	statusProber := prober.Combine(
		httpProbe,
		grpcProbe,
		prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)),
	)

	// Start gRPC server.
	tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"), conf.grpc.tlsSrvCert, conf.grpc.tlsSrvKey, conf.grpc.tlsSrvClientCA)
	if err != nil {
		return errors.Wrap(err, "setup gRPC server")
	}

	options := []grpcserver.Option{
		grpcserver.WithServer(thanosrules.RegisterRulesServer(ruleMgr)),
		grpcserver.WithListen(conf.grpc.bindAddress),
		grpcserver.WithGracePeriod(conf.grpc.gracePeriod),
		grpcserver.WithGracePeriod(conf.grpc.maxConnectionAge),
		grpcserver.WithTLSConfig(tlsCfg),
	}
	infoOptions := []info.ServerOptionFunc{info.WithRulesInfoFunc()}
	if tsdbDB != nil {
		tsdbStore := store.NewTSDBStore(logger, tsdbDB, component.Rule, conf.lset)
		infoOptions = append(
			infoOptions,
			info.WithLabelSetFunc(func() []labelpb.ZLabelSet {
				return tsdbStore.LabelSet()
			}),
			info.WithStoreInfoFunc(func() *infopb.StoreInfo {
				if httpProbe.IsReady() {
					mint, maxt := tsdbStore.TimeRange()
					return &infopb.StoreInfo{
						MinTime:                      mint,
						MaxTime:                      maxt,
						SupportsSharding:             true,
						SupportsWithoutReplicaLabels: true,
						TsdbInfos:                    tsdbStore.TSDBInfos(),
					}
				}
				return nil
			}),
		)
		storeServer := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, tsdbStore), reg, conf.storeRateLimits)
		options = append(options, grpcserver.WithServer(store.RegisterStoreServer(storeServer, logger)))
	}

	options = append(options, grpcserver.WithServer(
		info.RegisterInfoServer(info.NewInfoServer(component.Rule.String(), infoOptions...)),
	))
	s := grpcserver.New(logger, reg, tracer, grpcLogOpts, tagOpts, comp, grpcProbe, options...)

	g.Add(func() error {
		statusProber.Ready()
		return s.ListenAndServe()
	}, func(err error) {
		statusProber.NotReady(err)
		s.Shutdown(err)
	})

	// Start UI & metrics HTTP server.
	{
		router := route.New()

		// RoutePrefix must always start with '/'.
		conf.web.routePrefix = "/" + strings.Trim(conf.web.routePrefix, "/")

		// Redirect from / to /webRoutePrefix.
		if conf.web.routePrefix != "/" {
			router.Get("/", func(w http.ResponseWriter, r *http.Request) {
				http.Redirect(w, r, conf.web.routePrefix, http.StatusFound)
			})
			router = router.WithPrefix(conf.web.routePrefix)
		}

		router.Post("/-/reload", func(w http.ResponseWriter, r *http.Request) {
			reloadMsg := make(chan error)
			reloadWebhandler <- reloadMsg
			if err := <-reloadMsg; err != nil {
				http.Error(w, err.Error(), http.StatusInternalServerError)
			}
		})
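
		// Illustrative note (not part of the original source): together with SIGHUP,
		// this endpoint is how rule files are re-read at runtime. Assuming the
		// configured HTTP listen address, a reload can be triggered with something like:
		//
		//   curl -X POST http://<rule-http-address>/-/reload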

		ins := extpromhttp.NewInstrumentationMiddleware(reg, nil)

		// Configure Request Logging for HTTP calls.
		logMiddleware := logging.NewHTTPServerMiddleware(logger, httpLogOpts...)

		// TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting.
		ui.NewRuleUI(logger, reg, ruleMgr, conf.alertQueryURL.String(), conf.web.externalPrefix, conf.web.prefixHeaderName).Register(router, ins)

		api := v1.NewRuleAPI(logger, reg, thanosrules.NewGRPCClient(ruleMgr), ruleMgr, conf.web.disableCORS, flagsMap)
		api.Register(router.WithPrefix("/api/v1"), tracer, logger, ins, logMiddleware)

		srv := httpserver.New(logger, reg, comp, httpProbe,
			httpserver.WithListen(conf.http.bindAddress),
			httpserver.WithGracePeriod(time.Duration(conf.http.gracePeriod)),
			httpserver.WithTLSConfig(conf.http.tlsConfig),
		)
		srv.Handle("/", router)

		g.Add(func() error {
			statusProber.Healthy()

			return srv.ListenAndServe()
		}, func(err error) {
			statusProber.NotReady(err)
			defer statusProber.NotHealthy(err)

			srv.Shutdown(err)
		})
	}

	confContentYaml, err := conf.objStoreConfig.Content()
	if err != nil {
		return err
	}

	if len(confContentYaml) > 0 {
		// The background shipper continuously scans the data directory and uploads
		// new blocks to Google Cloud Storage or an S3-compatible storage service.
		bkt, err := client.NewBucket(logger, confContentYaml, component.Rule.String())
		if err != nil {
			return err
		}
		bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))

		// Ensure we close up everything properly.
		defer func() {
			if err != nil {
				runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
			}
		}()

		s := shipper.New(logger, reg, conf.dataDir, bkt, func() labels.Labels { return conf.lset }, metadata.RulerSource, nil, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc))

		ctx, cancel := context.WithCancel(context.Background())

		g.Add(func() error {
			defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

			return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
				if _, err := s.Sync(ctx); err != nil {
					level.Warn(logger).Log("err", err)
				}
				return nil
			})
		}, func(error) {
			cancel()
		})
	} else {
		level.Info(logger).Log("msg", "no supported bucket was configured, uploads will be disabled")
	}

	level.Info(logger).Log("msg", "starting rule node")
	return nil
}

func removeLockfileIfAny(logger log.Logger, dataDir string) error {
	absdir, err := filepath.Abs(dataDir)
	if err != nil {
		return err
	}
	if err := os.Remove(filepath.Join(absdir, "lock")); err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	level.Info(logger).Log("msg", "a leftover lockfile found and removed")
	return nil
}

func parseFlagLabels(s []string) (labels.Labels, error) {
	var lset labels.Labels
	for _, l := range s {
		parts := strings.SplitN(l, "=", 2)
		if len(parts) != 2 {
			return nil, errors.Errorf("unrecognized label %q", l)
		}
		if !model.LabelName.IsValid(model.LabelName(parts[0])) {
			return nil, errors.Errorf("unsupported format for label %s", l)
		}
		val, err := strconv.Unquote(parts[1])
		if err != nil {
			return nil, errors.Wrap(err, "unquote label value")
		}
		lset = append(lset, labels.Label{Name: parts[0], Value: val})
	}
	sort.Sort(lset)
	return lset, nil
}
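
// For illustration (not part of the original source): parseFlagLabels expects the
// value part of each --label flag to be a double-quoted Go string, because it is
// passed through strconv.Unquote. For example, the flag value `replica="rule-0"`
// parses to labels.Label{Name: "replica", Value: "rule-0"}, while an unquoted
// value such as `replica=rule-0` is rejected with an "unquote label value" error.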

func labelsTSDBToProm(lset labels.Labels) (res labels.Labels) {
	for _, l := range lset {
		res = append(res, labels.Label{
			Name:  l.Name,
			Value: l.Value,
		})
	}
	return res
}

func queryFuncCreator(
	logger log.Logger,
	queriers []*httpconfig.Client,
	promClients []*promclient.Client,
	duplicatedQuery prometheus.Counter,
	ruleEvalWarnings *prometheus.CounterVec,
	httpMethod string,
	doNotAddThanosParams bool,
) func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {

	// queryFunc returns query function that hits the HTTP query API of query peers in randomized order until we get a result
	// back or the context get canceled.
	return func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
		var spanID string

		switch partialResponseStrategy {
		case storepb.PartialResponseStrategy_WARN:
			spanID = "/rule_instant_query HTTP[client]"
		case storepb.PartialResponseStrategy_ABORT:
			spanID = "/rule_instant_query_part_resp_abort HTTP[client]"
		default:
			// Programming error will be caught by tests.
			panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error())
		}

		return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
			for _, i := range rand.Perm(len(queriers)) {
				promClient := promClients[i]
				endpoints := thanosrules.RemoveDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
				for _, i := range rand.Perm(len(endpoints)) {
					span, ctx := tracing.StartSpan(ctx, spanID)
					v, warns, err := promClient.PromqlQueryInstant(ctx, endpoints[i], q, t, promclient.QueryOptions{
						Deduplicate:             true,
						PartialResponseStrategy: partialResponseStrategy,
						Method:                  httpMethod,
						DoNotAddThanosParams:    doNotAddThanosParams,
					})
					span.Finish()

					if err != nil {
						level.Error(logger).Log("err", err, "query", q)
						continue
					}
					if len(warns) > 0 {
						ruleEvalWarnings.WithLabelValues(strings.ToLower(partialResponseStrategy.String())).Inc()
						// TODO(bwplotka): Propagate those to UI, probably requires changing rule manager code ):
						level.Warn(logger).Log("warnings", strings.Join(warns, ", "), "query", q)
					}
					return v, nil
				}
			}
			return nil, errors.Errorf("no query API server reachable")
		}
	}
}

func addDiscoveryGroups(g *run.Group, c *httpconfig.Client, interval time.Duration) {
	ctx, cancel := context.WithCancel(context.Background())
	g.Add(func() error {
		c.Discover(ctx)
		return nil
	}, func(error) {
		cancel()
	})

	g.Add(func() error {
		return runutil.Repeat(interval, ctx.Done(), func() error {
			return c.Resolve(ctx)
		})
	}, func(error) {
		cancel()
	})
}

func reloadRules(logger log.Logger,
	ruleFiles []string,
	ruleMgr *thanosrules.Manager,
	evalInterval time.Duration,
	metrics *RuleMetrics) error {
	level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ","))
	var (
		errs      errutil.MultiError
		files     []string
		seenFiles = make(map[string]struct{})
	)
	for _, pat := range ruleFiles {
		fs, err := filepath.Glob(pat)
		if err != nil {
			// The only error can be a bad pattern.
			errs.Add(errors.Wrapf(err, "retrieving rule files failed. Ignoring file. pattern %s", pat))
			continue
		}

		for _, fp := range fs {
			if _, ok := seenFiles[fp]; ok {
				continue
			}
			files = append(files, fp)
			seenFiles[fp] = struct{}{}
		}
	}

	level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files))

	if err := ruleMgr.Update(evalInterval, files); err != nil {
		metrics.configSuccess.Set(0)
		errs.Add(errors.Wrap(err, "reloading rules failed"))
		return errs.Err()
	}

	metrics.configSuccess.Set(1)
	metrics.configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9)

	metrics.rulesLoaded.Reset()
	for _, group := range ruleMgr.RuleGroups() {
		metrics.rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.OriginalFile, group.Name()).Set(float64(len(group.Rules())))
	}
	return errs.Err()
}
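
// For reference (illustrative, not part of the original source): the files matched by
// --rule-file use the standard Prometheus rule file format, optionally extended with a
// per-group partial_response_strategy field understood by the Thanos rule manager
// (see the PartialResponseStrategy handling above). A minimal example, with the group
// name, alert name, and expression being placeholders:
//
//   groups:
//   - name: example
//     partial_response_strategy: "warn"
//     rules:
//     - alert: HighErrorRate
//       expr: rate(http_requests_total{code="500"}[5m]) > 0.1
//       for: 10m
//       labels:
//         severity: page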