github.com/thanos-io/thanos@v0.32.5/cmd/thanos/rule.go

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package main
     5  
     6  import (
     7  	"context"
     8  	"math/rand"
     9  	"net/http"
    10  	"net/url"
    11  	"os"
    12  	"path/filepath"
    13  	"sort"
    14  	"strconv"
    15  	"strings"
    16  	"time"
    17  
    18  	extflag "github.com/efficientgo/tools/extkingpin"
    19  	"github.com/go-kit/log"
    20  	"github.com/go-kit/log/level"
    21  	grpc_logging "github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/logging"
    22  	"github.com/grpc-ecosystem/go-grpc-middleware/v2/interceptors/tags"
    23  	"github.com/oklog/run"
    24  	"github.com/opentracing/opentracing-go"
    25  	"github.com/pkg/errors"
    26  	"github.com/prometheus/client_golang/prometheus"
    27  	"github.com/prometheus/client_golang/prometheus/promauto"
    28  	"github.com/prometheus/common/model"
    29  	"github.com/prometheus/common/route"
    30  	"github.com/prometheus/prometheus/config"
    31  	"github.com/prometheus/prometheus/model/labels"
    32  	"github.com/prometheus/prometheus/model/relabel"
    33  	"github.com/prometheus/prometheus/notifier"
    34  	"github.com/prometheus/prometheus/promql"
    35  	"github.com/prometheus/prometheus/rules"
    36  	"github.com/prometheus/prometheus/storage"
    37  	"github.com/prometheus/prometheus/storage/remote"
    38  	"github.com/prometheus/prometheus/tsdb"
    39  	"github.com/prometheus/prometheus/tsdb/agent"
    40  	"github.com/prometheus/prometheus/tsdb/wlog"
    41  	"github.com/prometheus/prometheus/util/strutil"
    42  
    43  	"github.com/thanos-io/objstore"
    44  	"github.com/thanos-io/objstore/client"
    45  	objstoretracing "github.com/thanos-io/objstore/tracing/opentracing"
    46  	"gopkg.in/yaml.v2"
    47  
    48  	"github.com/thanos-io/thanos/pkg/alert"
    49  	v1 "github.com/thanos-io/thanos/pkg/api/rule"
    50  	"github.com/thanos-io/thanos/pkg/block/metadata"
    51  	"github.com/thanos-io/thanos/pkg/component"
    52  	"github.com/thanos-io/thanos/pkg/discovery/dns"
    53  	"github.com/thanos-io/thanos/pkg/errutil"
    54  	"github.com/thanos-io/thanos/pkg/extkingpin"
    55  	"github.com/thanos-io/thanos/pkg/extprom"
    56  	extpromhttp "github.com/thanos-io/thanos/pkg/extprom/http"
    57  	"github.com/thanos-io/thanos/pkg/httpconfig"
    58  	"github.com/thanos-io/thanos/pkg/info"
    59  	"github.com/thanos-io/thanos/pkg/info/infopb"
    60  	"github.com/thanos-io/thanos/pkg/logging"
    61  	"github.com/thanos-io/thanos/pkg/prober"
    62  	"github.com/thanos-io/thanos/pkg/promclient"
    63  	thanosrules "github.com/thanos-io/thanos/pkg/rules"
    64  	"github.com/thanos-io/thanos/pkg/runutil"
    65  	grpcserver "github.com/thanos-io/thanos/pkg/server/grpc"
    66  	httpserver "github.com/thanos-io/thanos/pkg/server/http"
    67  	"github.com/thanos-io/thanos/pkg/shipper"
    68  	"github.com/thanos-io/thanos/pkg/store"
    69  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    70  	"github.com/thanos-io/thanos/pkg/store/storepb"
    71  	"github.com/thanos-io/thanos/pkg/tls"
    72  	"github.com/thanos-io/thanos/pkg/tracing"
    73  	"github.com/thanos-io/thanos/pkg/ui"
    74  )
    75  
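         // ruleConfig bundles everything the Ruler needs from the command line: HTTP, gRPC and
         // web server settings, shipper and object storage options, query and Alertmanager
         // targets, and the rule evaluation parameters.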
    76  type ruleConfig struct {
    77  	http    httpConfig
    78  	grpc    grpcConfig
    79  	web     webConfig
    80  	shipper shipperConfig
    81  
    82  	query           queryConfig
    83  	queryConfigYAML []byte
    84  
    85  	alertmgr               alertMgrConfig
    86  	alertmgrsConfigYAML    []byte
    87  	alertQueryURL          *url.URL
    88  	alertRelabelConfigYAML []byte
    89  
    90  	rwConfig *extflag.PathOrContent
    91  
    92  	resendDelay       time.Duration
    93  	evalInterval      time.Duration
    94  	outageTolerance   time.Duration
    95  	forGracePeriod    time.Duration
    96  	ruleFiles         []string
    97  	objStoreConfig    *extflag.PathOrContent
    98  	dataDir           string
    99  	lset              labels.Labels
   100  	ignoredLabelNames []string
   101  	storeRateLimits   store.SeriesSelectLimits
   102  }
   103  
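         // registerFlag wires the flags of all sub-configurations onto the given command so a
         // single call registers everything the Ruler shares with other Thanos components.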
   104  func (rc *ruleConfig) registerFlag(cmd extkingpin.FlagClause) {
   105  	rc.http.registerFlag(cmd)
   106  	rc.grpc.registerFlag(cmd)
   107  	rc.web.registerFlag(cmd)
   108  	rc.shipper.registerFlag(cmd)
   109  	rc.query.registerFlag(cmd)
   110  	rc.alertmgr.registerFlag(cmd)
   111  	rc.storeRateLimits.RegisterFlags(cmd)
   112  }
   113  
   114  // registerRule registers a rule command.
   115  func registerRule(app *extkingpin.App) {
   116  	comp := component.Rule
    117  	cmd := app.Command(comp.String(), "Ruler evaluating Prometheus rules against given Query nodes, exposing the Store API and storing old blocks in a bucket.")
   118  
   119  	conf := &ruleConfig{}
   120  	conf.registerFlag(cmd)
   121  
    122  	labelStrs := cmd.Flag("label", "Labels to be applied to all generated metrics (repeated). Similar to external labels for Prometheus, used to identify the ruler and its blocks as a unique source.").
   123  		PlaceHolder("<name>=\"<value>\"").Strings()
   124  	tsdbBlockDuration := extkingpin.ModelDuration(cmd.Flag("tsdb.block-duration", "Block duration for TSDB block.").
   125  		Default("2h"))
   126  	tsdbRetention := extkingpin.ModelDuration(cmd.Flag("tsdb.retention", "Block retention time on local disk.").
   127  		Default("48h"))
    128  	noLockFile := cmd.Flag("tsdb.no-lockfile", "Do not create a lockfile in the TSDB data directory. In any case, the lockfile will be deleted on the next startup.").Default("false").Bool()
    129  	walCompression := cmd.Flag("tsdb.wal-compression", "Compress the TSDB WAL.").Default("true").Bool()
   130  
    131  	cmd.Flag("data-dir", "Data directory.").Default("data/").StringVar(&conf.dataDir)
    132  	cmd.Flag("rule-file", "Rule files that should be used by the rule manager. Can be in glob format (repeated). Note that rules are not automatically detected; use SIGHUP or an HTTP POST to /-/reload to re-read them.").
   133  		Default("rules/").StringsVar(&conf.ruleFiles)
   134  	cmd.Flag("resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager.").
   135  		Default("1m").DurationVar(&conf.resendDelay)
   136  	cmd.Flag("eval-interval", "The default evaluation interval to use.").
   137  		Default("1m").DurationVar(&conf.evalInterval)
    138  	cmd.Flag("for-outage-tolerance", "Max time to tolerate a Prometheus outage for restoring the \"for\" state of an alert.").
    139  		Default("1h").DurationVar(&conf.outageTolerance)
    140  	cmd.Flag("for-grace-period", "Minimum duration between an alert and its restored \"for\" state. This is maintained only for alerts with a configured \"for\" time greater than the grace period.").
   141  		Default("10m").DurationVar(&conf.forGracePeriod)
   142  	cmd.Flag("restore-ignored-label", "Label names to be ignored when restoring alerts from the remote storage. This is only used in stateless mode.").
   143  		StringsVar(&conf.ignoredLabelNames)
   144  
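         	// The remote-write config reuses the upstream Prometheus remote_write format. An
         	// illustrative (not authoritative) payload could look like the following; the URL
         	// below is a placeholder, not a real endpoint:
         	//
         	//   remote_write:
         	//   - url: http://receive.example.com/api/v1/receive
         	//     name: receiver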
    145  	conf.rwConfig = extflag.RegisterPathOrContent(cmd, "remote-write.config", "YAML config for the remote-write configurations that specify servers to which samples should be sent (see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write). This automatically enables stateless mode for the ruler and no series will be stored in the ruler's TSDB. If an empty config (or file) is provided, the flag is ignored and the ruler runs with its own TSDB.", extflag.WithEnvSubstitution())
   146  
    147  	reqLogDecision := cmd.Flag("log.request.decision", "Deprecation Warning - This flag will soon be deprecated and replaced with `request.logging-config`. Request logging for logging the start and end of requests. By default this flag is disabled. LogFinishCall: Logs the finish call of the requests. LogStartAndFinishCall: Logs the start and finish call of the requests. NoLogCall: Disable request logging.").Default("").Enum("NoLogCall", "LogFinishCall", "LogStartAndFinishCall", "")
   148  
   149  	conf.objStoreConfig = extkingpin.RegisterCommonObjStoreFlags(cmd, "", false)
   150  
   151  	reqLogConfig := extkingpin.RegisterRequestLoggingFlags(cmd)
   152  
   153  	var err error
   154  	cmd.Setup(func(g *run.Group, logger log.Logger, reg *prometheus.Registry, tracer opentracing.Tracer, reload <-chan struct{}, _ bool) error {
   155  		conf.lset, err = parseFlagLabels(*labelStrs)
   156  		if err != nil {
   157  			return errors.Wrap(err, "parse labels")
   158  		}
   159  
   160  		conf.alertQueryURL, err = url.Parse(*conf.alertmgr.alertQueryURL)
   161  		if err != nil {
   162  			return errors.Wrap(err, "parse alert query url")
   163  		}
   164  
   165  		tsdbOpts := &tsdb.Options{
   166  			MinBlockDuration:  int64(time.Duration(*tsdbBlockDuration) / time.Millisecond),
   167  			MaxBlockDuration:  int64(time.Duration(*tsdbBlockDuration) / time.Millisecond),
   168  			RetentionDuration: int64(time.Duration(*tsdbRetention) / time.Millisecond),
   169  			NoLockfile:        *noLockFile,
   170  			WALCompression:    wlog.ParseCompressionType(*walCompression, string(wlog.CompressionSnappy)),
   171  		}
   172  
   173  		agentOpts := &agent.Options{
   174  			WALCompression: wlog.ParseCompressionType(*walCompression, string(wlog.CompressionSnappy)),
   175  			NoLockfile:     *noLockFile,
   176  		}
   177  
   178  		// Parse and check query configuration.
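         		// Either --query/--query.sd-files or --query.config* may be given, not both.
         		// For reference, a --query.config file is a YAML list of httpconfig.Config
         		// entries; an illustrative (not authoritative) static entry, with placeholder
         		// hostnames, could look like:
         		//
         		//   - static_configs: ["query-a:9090", "query-b:9090"]
         		//     scheme: http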
   179  		lookupQueries := map[string]struct{}{}
   180  		for _, q := range conf.query.addrs {
   181  			if _, ok := lookupQueries[q]; ok {
   182  				return errors.Errorf("Address %s is duplicated for --query flag.", q)
   183  			}
   184  
   185  			lookupQueries[q] = struct{}{}
   186  		}
   187  
   188  		conf.queryConfigYAML, err = conf.query.configPath.Content()
   189  		if err != nil {
   190  			return err
   191  		}
   192  		if len(conf.query.sdFiles) == 0 && len(conf.query.addrs) == 0 && len(conf.queryConfigYAML) == 0 {
   193  			return errors.New("no --query parameter was given")
   194  		}
   195  		if (len(conf.query.sdFiles) != 0 || len(conf.query.addrs) != 0) && len(conf.queryConfigYAML) != 0 {
   196  			return errors.New("--query/--query.sd-files and --query.config* parameters cannot be defined at the same time")
   197  		}
   198  
   199  		// Parse and check alerting configuration.
   200  		conf.alertmgrsConfigYAML, err = conf.alertmgr.configPath.Content()
   201  		if err != nil {
   202  			return err
   203  		}
   204  		if len(conf.alertmgrsConfigYAML) != 0 && len(conf.alertmgr.alertmgrURLs) != 0 {
   205  			return errors.New("--alertmanagers.url and --alertmanagers.config* parameters cannot be defined at the same time")
   206  		}
   207  
   208  		conf.alertRelabelConfigYAML, err = conf.alertmgr.alertRelabelConfigPath.Content()
   209  		if err != nil {
   210  			return err
   211  		}
   212  
   213  		httpLogOpts, err := logging.ParseHTTPOptions(*reqLogDecision, reqLogConfig)
   214  		if err != nil {
   215  			return errors.Wrap(err, "error while parsing config for request logging")
   216  		}
   217  
   218  		tagOpts, grpcLogOpts, err := logging.ParsegRPCOptions(*reqLogDecision, reqLogConfig)
   219  		if err != nil {
   220  			return errors.Wrap(err, "error while parsing config for request logging")
   221  		}
   222  
   223  		return runRule(g,
   224  			logger,
   225  			reg,
   226  			tracer,
   227  			comp,
   228  			*conf,
   229  			reload,
   230  			getFlagsMap(cmd.Flags()),
   231  			httpLogOpts,
   232  			grpcLogOpts,
   233  			tagOpts,
   234  			tsdbOpts,
   235  			agentOpts,
   236  		)
   237  	})
   238  }
   239  
   240  // RuleMetrics defines Thanos Ruler metrics.
   241  type RuleMetrics struct {
   242  	configSuccess     prometheus.Gauge
   243  	configSuccessTime prometheus.Gauge
   244  	duplicatedQuery   prometheus.Counter
   245  	rulesLoaded       *prometheus.GaugeVec
   246  	ruleEvalWarnings  *prometheus.CounterVec
   247  }
   248  
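         // newRuleMetrics registers the Ruler's self-monitoring metrics with the given registry
         // and pre-populates the per-strategy warning counter so both series exist from startup.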
   249  func newRuleMetrics(reg *prometheus.Registry) *RuleMetrics {
   250  	m := new(RuleMetrics)
   251  
   252  	factory := promauto.With(reg)
   253  	m.configSuccess = factory.NewGauge(prometheus.GaugeOpts{
   254  		Name: "thanos_rule_config_last_reload_successful",
   255  		Help: "Whether the last configuration reload attempt was successful.",
   256  	})
   257  	m.configSuccessTime = factory.NewGauge(prometheus.GaugeOpts{
   258  		Name: "thanos_rule_config_last_reload_success_timestamp_seconds",
   259  		Help: "Timestamp of the last successful configuration reload.",
   260  	})
   261  	m.duplicatedQuery = factory.NewCounter(prometheus.CounterOpts{
   262  		Name: "thanos_rule_duplicated_query_addresses_total",
    263  		Help: "The number of times a duplicated query address was detected from the different configs in the ruler.",
   264  	})
   265  	m.rulesLoaded = factory.NewGaugeVec(
   266  		prometheus.GaugeOpts{
   267  			Name: "thanos_rule_loaded_rules",
   268  			Help: "Loaded rules partitioned by file and group.",
   269  		},
   270  		[]string{"strategy", "file", "group"},
   271  	)
   272  	m.ruleEvalWarnings = factory.NewCounterVec(
   273  		prometheus.CounterOpts{
   274  			Name: "thanos_rule_evaluation_with_warnings_total",
    275  			Help: "The total number of rule evaluations that were successful but had warnings, which can indicate a partial error.",
   276  		}, []string{"strategy"},
   277  	)
   278  	m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_ABORT.String()))
   279  	m.ruleEvalWarnings.WithLabelValues(strings.ToLower(storepb.PartialResponseStrategy_WARN.String()))
   280  
   281  	return m
   282  }
   283  
   284  // runRule runs a rule evaluation component that continuously evaluates alerting and recording
    285  // rules. It sends alert notifications and writes rule evaluation results to storage, like a regular Prometheus server.
   286  func runRule(
   287  	g *run.Group,
   288  	logger log.Logger,
   289  	reg *prometheus.Registry,
   290  	tracer opentracing.Tracer,
   291  	comp component.Component,
   292  	conf ruleConfig,
   293  	reloadSignal <-chan struct{},
   294  	flagsMap map[string]string,
   295  	httpLogOpts []logging.Option,
   296  	grpcLogOpts []grpc_logging.Option,
   297  	tagOpts []tags.Option,
   298  	tsdbOpts *tsdb.Options,
   299  	agentOpts *agent.Options,
   300  ) error {
   301  	metrics := newRuleMetrics(reg)
   302  
   303  	var queryCfg []httpconfig.Config
   304  	var err error
   305  	if len(conf.queryConfigYAML) > 0 {
   306  		queryCfg, err = httpconfig.LoadConfigs(conf.queryConfigYAML)
   307  		if err != nil {
   308  			return err
   309  		}
   310  	} else {
   311  		queryCfg, err = httpconfig.BuildConfig(conf.query.addrs)
   312  		if err != nil {
   313  			return errors.Wrap(err, "query configuration")
   314  		}
   315  
   316  		// Build the query configuration from the legacy query flags.
   317  		var fileSDConfigs []httpconfig.FileSDConfig
   318  		if len(conf.query.sdFiles) > 0 {
   319  			fileSDConfigs = append(fileSDConfigs, httpconfig.FileSDConfig{
   320  				Files:           conf.query.sdFiles,
   321  				RefreshInterval: model.Duration(conf.query.sdInterval),
   322  			})
   323  			queryCfg = append(queryCfg,
   324  				httpconfig.Config{
   325  					EndpointsConfig: httpconfig.EndpointsConfig{
   326  						Scheme:        "http",
   327  						FileSDConfigs: fileSDConfigs,
   328  					},
   329  				},
   330  			)
   331  		}
   332  	}
   333  
   334  	queryProvider := dns.NewProvider(
   335  		logger,
   336  		extprom.WrapRegistererWithPrefix("thanos_rule_query_apis_", reg),
   337  		dns.ResolverType(conf.query.dnsSDResolver),
   338  	)
   339  	var (
   340  		queryClients []*httpconfig.Client
   341  		promClients  []*promclient.Client
   342  	)
   343  	queryClientMetrics := extpromhttp.NewClientMetrics(extprom.WrapRegistererWith(prometheus.Labels{"client": "query"}, reg))
   344  	for _, cfg := range queryCfg {
   345  		cfg.HTTPClientConfig.ClientMetrics = queryClientMetrics
   346  		c, err := httpconfig.NewHTTPClient(cfg.HTTPClientConfig, "query")
   347  		if err != nil {
   348  			return err
   349  		}
   350  		c.Transport = tracing.HTTPTripperware(logger, c.Transport)
   351  		queryClient, err := httpconfig.NewClient(logger, cfg.EndpointsConfig, c, queryProvider.Clone())
   352  		if err != nil {
   353  			return err
   354  		}
   355  		queryClients = append(queryClients, queryClient)
   356  		promClients = append(promClients, promclient.NewClient(queryClient, logger, "thanos-rule"))
   357  		// Discover and resolve query addresses.
   358  		addDiscoveryGroups(g, queryClient, conf.query.dnsSDInterval)
   359  	}
   360  	var (
   361  		appendable storage.Appendable
   362  		queryable  storage.Queryable
   363  		tsdbDB     *tsdb.DB
   364  		agentDB    *agent.DB
   365  	)
   366  
   367  	rwCfgYAML, err := conf.rwConfig.Content()
   368  	if err != nil {
   369  		return err
   370  	}
   371  
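         	// With a non-empty remote-write config the Ruler runs stateless: samples are appended
         	// to a WAL-only agent database and forwarded via remote write, while the query clients
         	// serve as the queryable for restoring alert state. Otherwise a local TSDB is opened
         	// and used for both appending and querying.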
   372  	if len(rwCfgYAML) > 0 {
   373  		var rwCfg struct {
   374  			RemoteWriteConfigs []*config.RemoteWriteConfig `yaml:"remote_write,omitempty"`
   375  		}
   376  		if err := yaml.Unmarshal(rwCfgYAML, &rwCfg); err != nil {
   377  			return errors.Wrapf(err, "failed to parse remote write config %v", string(rwCfgYAML))
   378  		}
   379  
    380  		// flushDeadline is set to 1m, but it is used only by the metadata watcher, so it has no effect here.
   381  		remoteStore := remote.NewStorage(logger, reg, func() (int64, error) {
   382  			return 0, nil
   383  		}, conf.dataDir, 1*time.Minute, nil)
   384  		if err := remoteStore.ApplyConfig(&config.Config{
   385  			GlobalConfig: config.GlobalConfig{
   386  				ExternalLabels: labelsTSDBToProm(conf.lset),
   387  			},
   388  			RemoteWriteConfigs: rwCfg.RemoteWriteConfigs,
   389  		}); err != nil {
   390  			return errors.Wrap(err, "applying config to remote storage")
   391  		}
   392  
   393  		agentDB, err = agent.Open(logger, reg, remoteStore, conf.dataDir, agentOpts)
   394  		if err != nil {
   395  			return errors.Wrap(err, "start remote write agent db")
   396  		}
   397  		fanoutStore := storage.NewFanout(logger, agentDB, remoteStore)
   398  		appendable = fanoutStore
   399  		// Use a separate queryable to restore the ALERTS firing states.
   400  		// We cannot use remoteStore directly because it uses remote read for
    401  		// queries, and remote read is not implemented in Thanos Receiver.
   402  		queryable = thanosrules.NewPromClientsQueryable(logger, queryClients, promClients, conf.query.httpMethod, conf.query.step, conf.ignoredLabelNames)
   403  	} else {
   404  		tsdbDB, err = tsdb.Open(conf.dataDir, log.With(logger, "component", "tsdb"), reg, tsdbOpts, nil)
   405  		if err != nil {
   406  			return errors.Wrap(err, "open TSDB")
   407  		}
   408  
   409  		level.Debug(logger).Log("msg", "removing storage lock file if any")
   410  		if err := removeLockfileIfAny(logger, conf.dataDir); err != nil {
   411  			return errors.Wrap(err, "remove storage lock files")
   412  		}
   413  
   414  		{
   415  			done := make(chan struct{})
   416  			g.Add(func() error {
   417  				<-done
   418  				return tsdbDB.Close()
   419  			}, func(error) {
   420  				close(done)
   421  			})
   422  		}
   423  		appendable = tsdbDB
   424  		queryable = tsdbDB
   425  	}
   426  
   427  	// Build the Alertmanager clients.
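         	// Static --alertmanagers.url flags and --alertmanagers.config* are mutually exclusive
         	// (validated earlier in flag handling). An illustrative (not authoritative) config file
         	// entry, with a placeholder hostname, could look like:
         	//
         	//   alertmanagers:
         	//   - static_configs: ["alertmanager.example.com:9093"]
         	//     scheme: http
         	//     timeout: 10s
         	//     api_version: v2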
   428  	var alertingCfg alert.AlertingConfig
   429  	if len(conf.alertmgrsConfigYAML) > 0 {
   430  		alertingCfg, err = alert.LoadAlertingConfig(conf.alertmgrsConfigYAML)
   431  		if err != nil {
   432  			return err
   433  		}
   434  	} else {
   435  		// Build the Alertmanager configuration from the legacy flags.
   436  		for _, addr := range conf.alertmgr.alertmgrURLs {
   437  			cfg, err := alert.BuildAlertmanagerConfig(addr, conf.alertmgr.alertmgrsTimeout)
   438  			if err != nil {
   439  				return err
   440  			}
   441  			alertingCfg.Alertmanagers = append(alertingCfg.Alertmanagers, cfg)
   442  		}
   443  	}
   444  
   445  	if len(alertingCfg.Alertmanagers) == 0 {
   446  		level.Warn(logger).Log("msg", "no alertmanager configured")
   447  	}
   448  
   449  	var alertRelabelConfigs []*relabel.Config
   450  	if len(conf.alertRelabelConfigYAML) > 0 {
   451  		alertRelabelConfigs, err = alert.LoadRelabelConfigs(conf.alertRelabelConfigYAML)
   452  		if err != nil {
   453  			return err
   454  		}
   455  	}
   456  
   457  	amProvider := dns.NewProvider(
   458  		logger,
   459  		extprom.WrapRegistererWithPrefix("thanos_rule_alertmanagers_", reg),
   460  		dns.ResolverType(conf.query.dnsSDResolver),
   461  	)
   462  	var alertmgrs []*alert.Alertmanager
   463  	amClientMetrics := extpromhttp.NewClientMetrics(
   464  		extprom.WrapRegistererWith(prometheus.Labels{"client": "alertmanager"}, reg),
   465  	)
   466  	for _, cfg := range alertingCfg.Alertmanagers {
   467  		cfg.HTTPClientConfig.ClientMetrics = amClientMetrics
   468  		c, err := httpconfig.NewHTTPClient(cfg.HTTPClientConfig, "alertmanager")
   469  		if err != nil {
   470  			return err
   471  		}
   472  		c.Transport = tracing.HTTPTripperware(logger, c.Transport)
    473  		// Each Alertmanager client has a different list of targets, thus each needs its own DNS provider.
   474  		amClient, err := httpconfig.NewClient(logger, cfg.EndpointsConfig, c, amProvider.Clone())
   475  		if err != nil {
   476  			return err
   477  		}
   478  		// Discover and resolve Alertmanager addresses.
   479  		addDiscoveryGroups(g, amClient, conf.alertmgr.alertmgrsDNSSDInterval)
   480  
   481  		alertmgrs = append(alertmgrs, alert.NewAlertmanager(logger, amClient, time.Duration(cfg.Timeout), cfg.APIVersion))
   482  	}
   483  
   484  	var (
   485  		ruleMgr *thanosrules.Manager
   486  		alertQ  = alert.NewQueue(logger, reg, 10000, 100, labelsTSDBToProm(conf.lset), conf.alertmgr.alertExcludeLabels, alertRelabelConfigs)
   487  	)
   488  	{
   489  		// Run rule evaluation and alert notifications.
   490  		notifyFunc := func(ctx context.Context, expr string, alerts ...*rules.Alert) {
   491  			res := make([]*notifier.Alert, 0, len(alerts))
   492  			for _, alrt := range alerts {
   493  				// Only send actually firing alerts.
   494  				if alrt.State == rules.StatePending {
   495  					continue
   496  				}
   497  				a := &notifier.Alert{
   498  					StartsAt:     alrt.FiredAt,
   499  					Labels:       alrt.Labels,
   500  					Annotations:  alrt.Annotations,
   501  					GeneratorURL: conf.alertQueryURL.String() + strutil.TableLinkForExpression(expr),
   502  				}
   503  				if !alrt.ResolvedAt.IsZero() {
   504  					a.EndsAt = alrt.ResolvedAt
   505  				} else {
   506  					a.EndsAt = alrt.ValidUntil
   507  				}
   508  				res = append(res, a)
   509  			}
   510  			alertQ.Push(res)
   511  		}
   512  
   513  		ctx, cancel := context.WithCancel(context.Background())
   514  		logger = log.With(logger, "component", "rules")
   515  		ruleMgr = thanosrules.NewManager(
   516  			tracing.ContextWithTracer(ctx, tracer),
   517  			reg,
   518  			conf.dataDir,
   519  			rules.ManagerOptions{
   520  				NotifyFunc:      notifyFunc,
   521  				Logger:          logger,
   522  				Appendable:      appendable,
   523  				ExternalURL:     nil,
   524  				Queryable:       queryable,
   525  				ResendDelay:     conf.resendDelay,
   526  				OutageTolerance: conf.outageTolerance,
   527  				ForGracePeriod:  conf.forGracePeriod,
   528  			},
   529  			queryFuncCreator(logger, queryClients, promClients, metrics.duplicatedQuery, metrics.ruleEvalWarnings, conf.query.httpMethod, conf.query.doNotAddThanosParams),
   530  			conf.lset,
    531  			// In our case the querying URL is the external URL, because in Prometheus
    532  			// --web.external-url points to the place where the user could execute the
    533  			// alert or recording rule's expression and get results.
   534  			conf.alertQueryURL.String(),
   535  		)
   536  
   537  		// Schedule rule manager that evaluates rules.
   538  		g.Add(func() error {
   539  			ruleMgr.Run()
   540  			<-ctx.Done()
   541  
   542  			return nil
   543  		}, func(err error) {
   544  			cancel()
   545  			ruleMgr.Stop()
   546  		})
   547  	}
   548  	// Run the alert sender.
   549  	{
   550  		sdr := alert.NewSender(logger, reg, alertmgrs)
   551  		ctx, cancel := context.WithCancel(context.Background())
   552  		ctx = tracing.ContextWithTracer(ctx, tracer)
   553  
   554  		g.Add(func() error {
   555  			for {
   556  				tracing.DoInSpan(ctx, "/send_alerts", func(ctx context.Context) {
   557  					sdr.Send(ctx, alertQ.Pop(ctx.Done()))
   558  				})
   559  
   560  				select {
   561  				case <-ctx.Done():
   562  					return ctx.Err()
   563  				default:
   564  				}
   565  			}
   566  		}, func(error) {
   567  			cancel()
   568  		})
   569  	}
   570  
   571  	// Handle reload and termination interrupts.
   572  	reloadWebhandler := make(chan chan error)
   573  	{
   574  		ctx, cancel := context.WithCancel(context.Background())
   575  		g.Add(func() error {
   576  			// Initialize rules.
   577  			if err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics); err != nil {
   578  				level.Error(logger).Log("msg", "initialize rules failed", "err", err)
   579  				return err
   580  			}
   581  			for {
   582  				select {
   583  				case <-reloadSignal:
   584  					if err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics); err != nil {
   585  						level.Error(logger).Log("msg", "reload rules by sighup failed", "err", err)
   586  					}
   587  				case reloadMsg := <-reloadWebhandler:
   588  					err := reloadRules(logger, conf.ruleFiles, ruleMgr, conf.evalInterval, metrics)
   589  					if err != nil {
   590  						level.Error(logger).Log("msg", "reload rules by webhandler failed", "err", err)
   591  					}
   592  					reloadMsg <- err
   593  				case <-ctx.Done():
   594  					return ctx.Err()
   595  				}
   596  			}
   597  		}, func(error) {
   598  			cancel()
   599  		})
   600  	}
   601  
   602  	grpcProbe := prober.NewGRPC()
   603  	httpProbe := prober.NewHTTP()
   604  	statusProber := prober.Combine(
   605  		httpProbe,
   606  		grpcProbe,
   607  		prober.NewInstrumentation(comp, logger, extprom.WrapRegistererWithPrefix("thanos_", reg)),
   608  	)
   609  
   610  	// Start gRPC server.
   611  	tlsCfg, err := tls.NewServerConfig(log.With(logger, "protocol", "gRPC"), conf.grpc.tlsSrvCert, conf.grpc.tlsSrvKey, conf.grpc.tlsSrvClientCA)
   612  	if err != nil {
   613  		return errors.Wrap(err, "setup gRPC server")
   614  	}
   615  
   616  	options := []grpcserver.Option{
   617  		grpcserver.WithServer(thanosrules.RegisterRulesServer(ruleMgr)),
   618  		grpcserver.WithListen(conf.grpc.bindAddress),
   619  		grpcserver.WithGracePeriod(conf.grpc.gracePeriod),
    620  		grpcserver.WithMaxConnAge(conf.grpc.maxConnectionAge),
   621  		grpcserver.WithTLSConfig(tlsCfg),
   622  	}
   623  	infoOptions := []info.ServerOptionFunc{info.WithRulesInfoFunc()}
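         	// The Store API is only exposed when a local TSDB is in use; in stateless
         	// (remote-write) mode tsdbDB is nil and no StoreServer is registered.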
   624  	if tsdbDB != nil {
   625  		tsdbStore := store.NewTSDBStore(logger, tsdbDB, component.Rule, conf.lset)
   626  		infoOptions = append(
   627  			infoOptions,
   628  			info.WithLabelSetFunc(func() []labelpb.ZLabelSet {
   629  				return tsdbStore.LabelSet()
   630  			}),
   631  			info.WithStoreInfoFunc(func() *infopb.StoreInfo {
   632  				if httpProbe.IsReady() {
   633  					mint, maxt := tsdbStore.TimeRange()
   634  					return &infopb.StoreInfo{
   635  						MinTime:                      mint,
   636  						MaxTime:                      maxt,
   637  						SupportsSharding:             true,
   638  						SupportsWithoutReplicaLabels: true,
   639  						TsdbInfos:                    tsdbStore.TSDBInfos(),
   640  					}
   641  				}
   642  				return nil
   643  			}),
   644  		)
   645  		storeServer := store.NewLimitedStoreServer(store.NewInstrumentedStoreServer(reg, tsdbStore), reg, conf.storeRateLimits)
   646  		options = append(options, grpcserver.WithServer(store.RegisterStoreServer(storeServer, logger)))
   647  	}
   648  
   649  	options = append(options, grpcserver.WithServer(
   650  		info.RegisterInfoServer(info.NewInfoServer(component.Rule.String(), infoOptions...)),
   651  	))
   652  	s := grpcserver.New(logger, reg, tracer, grpcLogOpts, tagOpts, comp, grpcProbe, options...)
   653  
   654  	g.Add(func() error {
   655  		statusProber.Ready()
   656  		return s.ListenAndServe()
   657  	}, func(err error) {
   658  		statusProber.NotReady(err)
   659  		s.Shutdown(err)
   660  	})
   661  
   662  	// Start UI & metrics HTTP server.
   663  	{
   664  		router := route.New()
   665  
   666  		// RoutePrefix must always start with '/'.
   667  		conf.web.routePrefix = "/" + strings.Trim(conf.web.routePrefix, "/")
   668  
    669  		// Redirect from / to the configured route prefix.
   670  		if conf.web.routePrefix != "/" {
   671  			router.Get("/", func(w http.ResponseWriter, r *http.Request) {
   672  				http.Redirect(w, r, conf.web.routePrefix, http.StatusFound)
   673  			})
   674  			router = router.WithPrefix(conf.web.routePrefix)
   675  		}
   676  
   677  		router.Post("/-/reload", func(w http.ResponseWriter, r *http.Request) {
   678  			reloadMsg := make(chan error)
   679  			reloadWebhandler <- reloadMsg
   680  			if err := <-reloadMsg; err != nil {
   681  				http.Error(w, err.Error(), http.StatusInternalServerError)
   682  			}
   683  		})
   684  
   685  		ins := extpromhttp.NewInstrumentationMiddleware(reg, nil)
   686  
   687  		// Configure Request Logging for HTTP calls.
   688  		logMiddleware := logging.NewHTTPServerMiddleware(logger, httpLogOpts...)
   689  
   690  		// TODO(bplotka in PR #513 review): pass all flags, not only the flags needed by prefix rewriting.
   691  		ui.NewRuleUI(logger, reg, ruleMgr, conf.alertQueryURL.String(), conf.web.externalPrefix, conf.web.prefixHeaderName).Register(router, ins)
   692  
   693  		api := v1.NewRuleAPI(logger, reg, thanosrules.NewGRPCClient(ruleMgr), ruleMgr, conf.web.disableCORS, flagsMap)
   694  		api.Register(router.WithPrefix("/api/v1"), tracer, logger, ins, logMiddleware)
   695  
   696  		srv := httpserver.New(logger, reg, comp, httpProbe,
   697  			httpserver.WithListen(conf.http.bindAddress),
   698  			httpserver.WithGracePeriod(time.Duration(conf.http.gracePeriod)),
   699  			httpserver.WithTLSConfig(conf.http.tlsConfig),
   700  		)
   701  		srv.Handle("/", router)
   702  
   703  		g.Add(func() error {
   704  			statusProber.Healthy()
   705  
   706  			return srv.ListenAndServe()
   707  		}, func(err error) {
   708  			statusProber.NotReady(err)
   709  			defer statusProber.NotHealthy(err)
   710  
   711  			srv.Shutdown(err)
   712  		})
   713  	}
   714  
   715  	confContentYaml, err := conf.objStoreConfig.Content()
   716  	if err != nil {
   717  		return err
   718  	}
   719  
   720  	if len(confContentYaml) > 0 {
   721  		// The background shipper continuously scans the data directory and uploads
    722  		// new blocks to the configured object storage service.
   723  		bkt, err := client.NewBucket(logger, confContentYaml, component.Rule.String())
   724  		if err != nil {
   725  			return err
   726  		}
   727  		bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))
   728  
   729  		// Ensure we close up everything properly.
   730  		defer func() {
   731  			if err != nil {
   732  				runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
   733  			}
   734  		}()
   735  
   736  		s := shipper.New(logger, reg, conf.dataDir, bkt, func() labels.Labels { return conf.lset }, metadata.RulerSource, nil, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc))
   737  
   738  		ctx, cancel := context.WithCancel(context.Background())
   739  
   740  		g.Add(func() error {
   741  			defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
   742  
   743  			return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
   744  				if _, err := s.Sync(ctx); err != nil {
   745  					level.Warn(logger).Log("err", err)
   746  				}
   747  				return nil
   748  			})
   749  		}, func(error) {
   750  			cancel()
   751  		})
   752  	} else {
   753  		level.Info(logger).Log("msg", "no supported bucket was configured, uploads will be disabled")
   754  	}
   755  
   756  	level.Info(logger).Log("msg", "starting rule node")
   757  	return nil
   758  }
   759  
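         // removeLockfileIfAny deletes a stale TSDB "lock" file left behind by a previous run.
         // A missing lockfile is not an error.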
   760  func removeLockfileIfAny(logger log.Logger, dataDir string) error {
   761  	absdir, err := filepath.Abs(dataDir)
   762  	if err != nil {
   763  		return err
   764  	}
   765  	if err := os.Remove(filepath.Join(absdir, "lock")); err != nil {
   766  		if os.IsNotExist(err) {
   767  			return nil
   768  		}
   769  		return err
   770  	}
    771  	level.Info(logger).Log("msg", "a leftover lockfile was found and removed")
   772  	return nil
   773  }
   774  
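         // parseFlagLabels parses repeated --label flags of the form <name>="<value>".
         // Values must be double-quoted, e.g. --label 'replica="A"' yields the label
         // replica="A"; the quotes are stripped via strconv.Unquote.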
   775  func parseFlagLabels(s []string) (labels.Labels, error) {
   776  	var lset labels.Labels
   777  	for _, l := range s {
   778  		parts := strings.SplitN(l, "=", 2)
   779  		if len(parts) != 2 {
   780  			return nil, errors.Errorf("unrecognized label %q", l)
   781  		}
   782  		if !model.LabelName.IsValid(model.LabelName(parts[0])) {
   783  			return nil, errors.Errorf("unsupported format for label %s", l)
   784  		}
   785  		val, err := strconv.Unquote(parts[1])
   786  		if err != nil {
   787  			return nil, errors.Wrap(err, "unquote label value")
   788  		}
   789  		lset = append(lset, labels.Label{Name: parts[0], Value: val})
   790  	}
   791  	sort.Sort(lset)
   792  	return lset, nil
   793  }
   794  
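         // labelsTSDBToProm returns a copy of the given label set. Within this file both sides
         // use the same labels.Labels type, so this is effectively a defensive copy.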
   795  func labelsTSDBToProm(lset labels.Labels) (res labels.Labels) {
   796  	for _, l := range lset {
   797  		res = append(res, labels.Label{
   798  			Name:  l.Name,
   799  			Value: l.Value,
   800  		})
   801  	}
   802  	return res
   803  }
   804  
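         // queryFuncCreator returns a factory that builds a rules.QueryFunc for a given partial
         // response strategy. The produced function tries the configured query clients, and their
         // resolved endpoints, in random order and returns the first successful result.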
   805  func queryFuncCreator(
   806  	logger log.Logger,
   807  	queriers []*httpconfig.Client,
   808  	promClients []*promclient.Client,
   809  	duplicatedQuery prometheus.Counter,
   810  	ruleEvalWarnings *prometheus.CounterVec,
   811  	httpMethod string,
   812  	doNotAddThanosParams bool,
   813  ) func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   814  
    815  	// queryFunc returns a query function that hits the HTTP query API of query peers in randomized order until we get a result
    816  	// back or the context gets canceled.
   817  	return func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc {
   818  		var spanID string
   819  
   820  		switch partialResponseStrategy {
   821  		case storepb.PartialResponseStrategy_WARN:
   822  			spanID = "/rule_instant_query HTTP[client]"
   823  		case storepb.PartialResponseStrategy_ABORT:
   824  			spanID = "/rule_instant_query_part_resp_abort HTTP[client]"
   825  		default:
   826  			// Programming error will be caught by tests.
   827  			panic(errors.Errorf("unknown partial response strategy %v", partialResponseStrategy).Error())
   828  		}
   829  
   830  		return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
   831  			for _, i := range rand.Perm(len(queriers)) {
   832  				promClient := promClients[i]
   833  				endpoints := thanosrules.RemoveDuplicateQueryEndpoints(logger, duplicatedQuery, queriers[i].Endpoints())
   834  				for _, i := range rand.Perm(len(endpoints)) {
   835  					span, ctx := tracing.StartSpan(ctx, spanID)
   836  					v, warns, err := promClient.PromqlQueryInstant(ctx, endpoints[i], q, t, promclient.QueryOptions{
   837  						Deduplicate:             true,
   838  						PartialResponseStrategy: partialResponseStrategy,
   839  						Method:                  httpMethod,
   840  						DoNotAddThanosParams:    doNotAddThanosParams,
   841  					})
   842  					span.Finish()
   843  
   844  					if err != nil {
   845  						level.Error(logger).Log("err", err, "query", q)
   846  						continue
   847  					}
   848  					if len(warns) > 0 {
   849  						ruleEvalWarnings.WithLabelValues(strings.ToLower(partialResponseStrategy.String())).Inc()
   850  						// TODO(bwplotka): Propagate those to UI, probably requires changing rule manager code ):
   851  						level.Warn(logger).Log("warnings", strings.Join(warns, ", "), "query", q)
   852  					}
   853  					return v, nil
   854  				}
   855  			}
   856  			return nil, errors.Errorf("no query API server reachable")
   857  		}
   858  	}
   859  }
   860  
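         // addDiscoveryGroups registers two actors on the run group: one that runs the client's
         // service discovery until the context is canceled, and one that re-resolves the
         // discovered addresses every interval.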
   861  func addDiscoveryGroups(g *run.Group, c *httpconfig.Client, interval time.Duration) {
   862  	ctx, cancel := context.WithCancel(context.Background())
   863  	g.Add(func() error {
   864  		c.Discover(ctx)
   865  		return nil
   866  	}, func(error) {
   867  		cancel()
   868  	})
   869  
   870  	g.Add(func() error {
   871  		return runutil.Repeat(interval, ctx.Done(), func() error {
   872  			return c.Resolve(ctx)
   873  		})
   874  	}, func(error) {
   875  		cancel()
   876  	})
   877  }
   878  
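         // reloadRules expands the configured rule file globs, de-duplicates the matched files,
         // updates the rule manager with the new set, and refreshes the config and rules metrics.
         // Errors from individual glob patterns are collected and returned together.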
   879  func reloadRules(logger log.Logger,
   880  	ruleFiles []string,
   881  	ruleMgr *thanosrules.Manager,
   882  	evalInterval time.Duration,
   883  	metrics *RuleMetrics) error {
   884  	level.Debug(logger).Log("msg", "configured rule files", "files", strings.Join(ruleFiles, ","))
   885  	var (
   886  		errs      errutil.MultiError
   887  		files     []string
   888  		seenFiles = make(map[string]struct{})
   889  	)
   890  	for _, pat := range ruleFiles {
   891  		fs, err := filepath.Glob(pat)
   892  		if err != nil {
    893  			// The only possible error is a bad pattern.
    894  			errs.Add(errors.Wrapf(err, "retrieving rule files failed, ignoring pattern %s", pat))
   895  			continue
   896  		}
   897  
   898  		for _, fp := range fs {
   899  			if _, ok := seenFiles[fp]; ok {
   900  				continue
   901  			}
   902  			files = append(files, fp)
   903  			seenFiles[fp] = struct{}{}
   904  		}
   905  	}
   906  
   907  	level.Info(logger).Log("msg", "reload rule files", "numFiles", len(files))
   908  
   909  	if err := ruleMgr.Update(evalInterval, files); err != nil {
   910  		metrics.configSuccess.Set(0)
   911  		errs.Add(errors.Wrap(err, "reloading rules failed"))
   912  		return errs.Err()
   913  	}
   914  
   915  	metrics.configSuccess.Set(1)
   916  	metrics.configSuccessTime.Set(float64(time.Now().UnixNano()) / 1e9)
   917  
   918  	metrics.rulesLoaded.Reset()
   919  	for _, group := range ruleMgr.RuleGroups() {
   920  		metrics.rulesLoaded.WithLabelValues(group.PartialResponseStrategy.String(), group.OriginalFile, group.Name()).Set(float64(len(group.Rules())))
   921  	}
   922  	return errs.Err()
   923  }